[Pkg-ceph-commits] [ceph] 01/05: re-patch with latest Firefly HEAD.
Dmitry Smirnov
onlyjob at moszumanska.debian.org
Mon Sep 15 19:45:53 UTC 2014
This is an automated email from the git hooks/post-receive script.
onlyjob pushed a commit to branch master
in repository ceph.
commit ce32a12
Author: Dmitry Smirnov <onlyjob at member.fsf.org>
Date: Mon Sep 15 17:35:22 2014
re-patch with latest Firefly HEAD.
---
debian/patches/backfill-prio.patch | 68 +-
debian/patches/bug-8342.patch | 8 +-
debian/patches/bug-8821.patch | 28 +-
debian/patches/ceph-ao-require-cas.patch | 16 -
debian/patches/client-sleep1.patch | 4 +-
debian/patches/client-sleep2.patch | 8 +-
debian/patches/client-sleep3.patch | 4 +-
debian/patches/firefly-latest.patch | 11172 +++++++++++++++++++++
debian/patches/fix-blkdev-BLKGETSIZE-check.patch | 35 -
debian/patches/series | 3 +-
debian/patches/sleep-recover.patch | 4 +-
11 files changed, 11265 insertions(+), 85 deletions(-)
diff --git a/debian/patches/backfill-prio.patch b/debian/patches/backfill-prio.patch
index 8ac72ee..ae3669e 100644
--- a/debian/patches/backfill-prio.patch
+++ b/debian/patches/backfill-prio.patch
@@ -11,7 +11,8 @@ Date: Tue Jun 24 02:09:49 2014
--- a/src/common/AsyncReserver.h
+++ b/src/common/AsyncReserver.h
-@@ -33,6 +33,7 @@ template <typename T>
+@@ -32,8 +32,9 @@
+ template <typename T>
class AsyncReserver {
Finisher *f;
unsigned max_allowed;
@@ -19,7 +20,9 @@ Date: Tue Jun 24 02:09:49 2014
Mutex lock;
map<unsigned, list<pair<T, Context*> > > queues;
-@@ -42,7 +43,9 @@ class AsyncReserver {
+ map<T, pair<unsigned, typename list<pair<T, Context*> >::iterator > > queue_pointers;
+@@ -41,9 +42,11 @@
+
void do_queues() {
typename map<unsigned, list<pair<T, Context*> > >::reverse_iterator it;
for (it = queues.rbegin();
@@ -30,7 +33,9 @@ Date: Tue Jun 24 02:09:49 2014
++it) {
while (in_progress.size() < max_allowed &&
!it->second.empty()) {
-@@ -57,8 +60,12 @@ class AsyncReserver {
+ pair<T, Context*> p = it->second.front();
+@@ -56,17 +59,27 @@
+ }
public:
AsyncReserver(
Finisher *f,
@@ -45,7 +50,7 @@ Date: Tue Jun 24 02:09:49 2014
void set_max(unsigned max) {
Mutex::Locker l(lock);
-@@ -66,6 +73,12 @@ public:
+ max_allowed = max;
do_queues();
}
@@ -58,9 +63,11 @@ Date: Tue Jun 24 02:09:49 2014
/**
* Requests a reservation
*
+ * Note, on_reserved may be called following cancel_reservation. Thus,
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
-@@ -389,6 +389,9 @@ OPTION(osd_compact_leveldb_on_mount, OPT
+@@ -389,8 +389,11 @@
+
// Maximum number of backfills to or from a single osd
OPTION(osd_max_backfills, OPT_U64, 10)
@@ -70,9 +77,11 @@ Date: Tue Jun 24 02:09:49 2014
// Refuse backfills when OSD full ratio is above this value
OPTION(osd_backfill_full_ratio, OPT_FLOAT, 0.85)
+ // Seconds to wait before retrying refused backfills
--- a/src/messages/MBackfillReserve.h
+++ b/src/messages/MBackfillReserve.h
-@@ -28,8 +28,8 @@ public:
+@@ -27,10 +27,10 @@
+ REQUEST = 0,
GRANT = 1,
REJECT = 2,
};
@@ -83,9 +92,11 @@ Date: Tue Jun 24 02:09:49 2014
MBackfillReserve()
: Message(MSG_OSD_BACKFILL_RESERVE, HEAD_VERSION, COMPAT_VERSION),
+ query_epoch(0), type(-1), priority(-1) {}
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
-@@ -217,8 +217,10 @@ OSDService::OSDService(OSD *osd) :
+@@ -218,10 +218,12 @@
+ backfill_request_timer(cct, backfill_request_lock, false),
last_tid(0),
tid_lock("OSDService::tid_lock"),
reserver_finisher(cct),
@@ -98,15 +109,19 @@ Date: Tue Jun 24 02:09:49 2014
pg_temp_lock("OSDService::pg_temp_lock"),
map_cache_lock("OSDService::map_lock"),
map_cache(cct->_conf->osd_map_cache_size),
-@@ -7806,6 +7808,7 @@ const char** OSD::get_tracked_conf_keys(
+ map_bl_cache(cct->_conf->osd_map_cache_size),
+@@ -7870,8 +7872,9 @@
+ const char** OSD::get_tracked_conf_keys() const
{
static const char* KEYS[] = {
"osd_max_backfills",
+ "osd_min_recovery_priority",
"osd_op_complaint_time", "osd_op_log_threshold",
"osd_op_history_size", "osd_op_history_duration",
- NULL
-@@ -7820,6 +7823,10 @@ void OSD::handle_conf_change(const struc
+ "osd_map_cache_size",
+ "osd_map_max_advance",
+@@ -7889,8 +7892,12 @@
+ if (changed.count("osd_max_backfills")) {
service.local_reserver.set_max(cct->_conf->osd_max_backfills);
service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
}
@@ -117,9 +132,11 @@ Date: Tue Jun 24 02:09:49 2014
if (changed.count("osd_op_complaint_time") ||
changed.count("osd_op_log_threshold")) {
op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
+ cct->_conf->osd_op_log_threshold);
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
-@@ -594,11 +594,6 @@ public:
+@@ -629,13 +629,8 @@
+ return t;
}
// -- backfill_reservation --
@@ -131,9 +148,11 @@ Date: Tue Jun 24 02:09:49 2014
Finisher reserver_finisher;
AsyncReserver<spg_t> local_reserver;
AsyncReserver<spg_t> remote_reserver;
+
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
-@@ -1874,6 +1874,26 @@ void PG::mark_clean()
+@@ -1873,8 +1873,28 @@
+
dirty_info = true;
}
@@ -160,7 +179,9 @@ Date: Tue Jun 24 02:09:49 2014
void PG::finish_recovery(list<Context*>& tfin)
{
dout(10) << "finish_recovery" << dendl;
-@@ -5735,13 +5755,12 @@ PG::RecoveryState::WaitRemoteBackfillRes
+ assert(info.last_complete == info.last_update);
+@@ -5839,15 +5859,14 @@
+ ConnectionRef con = pg->osd->get_con_osd_cluster(
backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
if (con) {
if (con->has_feature(CEPH_FEATURE_BACKFILL_RESERVATION)) {
@@ -176,7 +197,9 @@ Date: Tue Jun 24 02:09:49 2014
con.get());
} else {
post_event(RemoteBackfillReserved());
-@@ -5810,8 +5829,8 @@ PG::RecoveryState::WaitLocalBackfillRese
+ }
+@@ -5914,10 +5933,10 @@
+ pg->osd->local_reserver.request_reservation(
pg->info.pgid,
new QueuePeeringEvt<LocalBackfillReserved>(
pg, pg->get_osdmap()->get_epoch(),
@@ -187,7 +210,9 @@ Date: Tue Jun 24 02:09:49 2014
}
void PG::RecoveryState::WaitLocalBackfillReserved::exit()
-@@ -5866,7 +5885,8 @@ PG::RecoveryState::RepWaitRecoveryReserv
+ {
+@@ -5982,9 +6001,10 @@
+ pg->osd->remote_reserver.request_reservation(
pg->info.pgid,
new QueuePeeringEvt<RemoteRecoveryReserved>(
pg, pg->get_osdmap()->get_epoch(),
@@ -197,7 +222,9 @@ Date: Tue Jun 24 02:09:49 2014
}
boost::statechart::result
-@@ -6007,7 +6027,8 @@ PG::RecoveryState::WaitLocalRecoveryRese
+ PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
+@@ -6123,9 +6143,10 @@
+ pg->osd->local_reserver.request_reservation(
pg->info.pgid,
new QueuePeeringEvt<LocalRecoveryReserved>(
pg, pg->get_osdmap()->get_epoch(),
@@ -207,9 +234,11 @@ Date: Tue Jun 24 02:09:49 2014
}
void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
+ {
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
-@@ -710,6 +710,11 @@ public:
+@@ -776,8 +776,13 @@
+
bool needs_recovery() const;
bool needs_backfill() const;
@@ -221,9 +250,11 @@ Date: Tue Jun 24 02:09:49 2014
void mark_clean(); ///< mark an active pg clean
bool _calc_past_interval_range(epoch_t *start, epoch_t *end);
+ void generate_past_intervals();
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
-@@ -56,6 +56,10 @@
+@@ -55,8 +55,12 @@
+ #define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
@@ -234,3 +265,4 @@ Date: Tue Jun 24 02:09:49 2014
typedef hobject_t collection_list_handle_t;
typedef uint8_t shard_id_t;
+
diff --git a/debian/patches/bug-8342.patch b/debian/patches/bug-8342.patch
index 138626c..0de003b 100644
--- a/debian/patches/bug-8342.patch
+++ b/debian/patches/bug-8342.patch
@@ -11,7 +11,8 @@ Description: [Fixes:#8342]
--- a/src/init-ceph.in
+++ b/src/init-ceph.in
-@@ -339,7 +339,11 @@ for name in $what; do
+@@ -361,9 +361,13 @@
+ osd_location=`$osd_location_hook --cluster $cluster --id $id --type osd`
get_conf osd_weight "" "osd crush initial weight"
defaultweight="$(df -P -k $osd_data/. | tail -1 | awk '{ print sprintf("%.2f",$2/1073741824) }')"
get_conf osd_keyring "$osd_data/keyring" "keyring"
@@ -24,7 +25,9 @@ Description: [Fixes:#8342]
fi
fi
-@@ -353,6 +357,7 @@ for name in $what; do
+ echo Starting Ceph $name on $host...
+@@ -375,8 +379,9 @@
+ [ -n "$pre_start" ] && do_cmd "$pre_start"
do_cmd_okfail "$cmd" $runarg
if [ "$ERR" != "0" ]; then
EXIT_STATUS=$ERR
@@ -32,3 +35,4 @@ Description: [Fixes:#8342]
fi
if [ "$type" = "mon" ]; then
+ # this will only work if we are using default paths
diff --git a/debian/patches/bug-8821.patch b/debian/patches/bug-8821.patch
index 9e80569..fe8b99c 100644
--- a/debian/patches/bug-8821.patch
+++ b/debian/patches/bug-8821.patch
@@ -30,7 +30,8 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
-@@ -738,8 +738,8 @@ OPTION(rbd_localize_parent_reads, OPT_BO
+@@ -748,10 +748,10 @@
+ * affected by rbd_default_order.
*/
OPTION(rbd_default_format, OPT_INT, 1)
OPTION(rbd_default_order, OPT_INT, 22)
@@ -41,9 +42,11 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
OPTION(rbd_default_features, OPT_INT, 3) // 1 for layering, 3 for layering+stripingv2. only applies to format 2 images
OPTION(nss_db_path, OPT_STR, "") // path to nss db
+
--- a/src/rbd.cc
+++ b/src/rbd.cc
-@@ -2345,7 +2345,8 @@ int main(int argc, const char **argv)
+@@ -2344,9 +2344,10 @@
+
const char *poolname = NULL;
uint64_t size = 0; // in bytes
int order = 0;
@@ -53,7 +56,9 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
int format = 1;
uint64_t features = RBD_FEATURE_LAYERING;
const char *imgname = NULL, *snapname = NULL, *destname = NULL,
-@@ -2359,7 +2360,7 @@ int main(int argc, const char **argv)
+ *dest_poolname = NULL, *dest_snapname = NULL, *path = NULL,
+@@ -2358,9 +2359,9 @@
+ long long stripe_unit = 0, stripe_count = 0;
long long bench_io_size = 4096, bench_io_threads = 16, bench_bytes = 1 << 30;
string bench_pattern = "seq";
@@ -62,7 +67,9 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
std::ostringstream err;
long long sizell = 0;
std::vector<const char*>::iterator i;
-@@ -2375,13 +2376,15 @@ int main(int argc, const char **argv)
+ for (i = args.begin(); i != args.end(); ) {
+@@ -2374,15 +2375,17 @@
+ return 0;
} else if (ceph_argparse_flag(args, i, "--new-format", (char*)NULL)) {
format = 2;
format_specified = true;
@@ -81,7 +88,9 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
} else if (ceph_argparse_witharg(args, i, &val, "-p", "--pool", (char*)NULL)) {
poolname = strdup(val.c_str());
} else if (ceph_argparse_witharg(args, i, &val, "--dest-pool", (char*)NULL)) {
-@@ -2416,7 +2419,6 @@ int main(int argc, const char **argv)
+ dest_poolname = strdup(val.c_str());
+@@ -2415,9 +2418,8 @@
+ } else if (ceph_argparse_withlonglong(args, i, &bench_io_size, &err, "--io-size", (char*)NULL)) {
} else if (ceph_argparse_withlonglong(args, i, &bench_io_threads, &err, "--io-threads", (char*)NULL)) {
} else if (ceph_argparse_withlonglong(args, i, &bench_bytes, &err, "--io-total", (char*)NULL)) {
} else if (ceph_argparse_witharg(args, i, &bench_pattern, &err, "--io-pattern", (char*)NULL)) {
@@ -89,7 +98,9 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
} else if (ceph_argparse_witharg(args, i, &val, "--path", (char*)NULL)) {
path = strdup(val.c_str());
} else if (ceph_argparse_witharg(args, i, &val, "--dest", (char*)NULL)) {
-@@ -2441,9 +2443,9 @@ int main(int argc, const char **argv)
+ destname = strdup(val.c_str());
+@@ -2440,11 +2442,11 @@
+ progress = false;
} else if (ceph_argparse_flag(args, i , "--allow-shrink", (char *)NULL)) {
resize_allow_shrink = true;
} else if (ceph_argparse_witharg(args, i, &val, "--format", (char *) NULL)) {
@@ -102,7 +113,9 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
format = ret;
format_specified = true;
cerr << "rbd: using --format for specifying the rbd image format is"
-@@ -2557,6 +2559,17 @@ if (!set_conf_param(v, p1, p2, p3)) { \
+ << " deprecated, use --image-format instead"
+@@ -2556,8 +2558,19 @@
+ break;
}
}
@@ -120,6 +133,7 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
if (format_specified && opt_cmd != OPT_IMPORT && opt_cmd != OPT_CREATE) {
cerr << "rbd: image format can only be set when "
<< "creating or importing an image" << std::endl;
+ return EXIT_FAILURE;
--- /dev/null
+++ b/src/test/cli-integration/rbd/defaults.t
@@ -0,0 +1,214 @@
diff --git a/debian/patches/ceph-ao-require-cas.patch b/debian/patches/ceph-ao-require-cas.patch
deleted file mode 100644
index 0a893a3..0000000
--- a/debian/patches/ceph-ao-require-cas.patch
+++ /dev/null
@@ -1,16 +0,0 @@
-Last-Update: 2014-05-21
-Forwarded: https://github.com/ceph/ceph/pull/1844
-Bug-Debian: http://bugs.debian.org/748571
-Author: John David Anglin <dave.anglin at bell.net>
-Description: Define AO_REQUIRE_CAS to fix FTBFS on 'hppa'.
-
---- a/src/include/atomic.h
-+++ b/src/include/atomic.h
-@@ -25,6 +25,7 @@
- #ifndef NO_ATOMIC_OPS
-
- // libatomic_ops implementation
-+#define AO_REQUIRE_CAS
- #include <atomic_ops.h>
-
- // reinclude our assert to clobber the system one
diff --git a/debian/patches/client-sleep1.patch b/debian/patches/client-sleep1.patch
index 7334ee6..f3b2367 100644
--- a/debian/patches/client-sleep1.patch
+++ b/debian/patches/client-sleep1.patch
@@ -11,7 +11,8 @@ Signed-off-by: Yan, Zheng <zheng.z.yan at intel.com>
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
-@@ -3075,12 +3075,27 @@ void Client::remove_all_caps(Inode *in)
+@@ -3108,14 +3108,29 @@
+ while (!in->caps.empty())
remove_cap(in->caps.begin()->second, true);
}
@@ -42,3 +43,4 @@ Signed-off-by: Yan, Zheng <zheng.z.yan at intel.com>
}
void Client::trim_caps(MetaSession *s, int max)
+ {
diff --git a/debian/patches/client-sleep2.patch b/debian/patches/client-sleep2.patch
index 0faea5e..67ce24a 100644
--- a/debian/patches/client-sleep2.patch
+++ b/debian/patches/client-sleep2.patch
@@ -10,7 +10,8 @@ Signed-off-by: Yan, Zheng <zheng.z.yan at intel.com>
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
-@@ -2352,6 +2352,9 @@ void Client::put_cap_ref(Inode *in, int
+@@ -2385,8 +2385,11 @@
+
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
while (1) {
@@ -20,7 +21,9 @@ Signed-off-by: Yan, Zheng <zheng.z.yan at intel.com>
if (endoff > 0 &&
(endoff >= (loff_t)in->max_size ||
endoff > (loff_t)(in->size << 1)) &&
-@@ -3083,9 +3086,13 @@ void Client::remove_session_caps(MetaSes
+ endoff > (loff_t)in->wanted_max_size) {
+@@ -3116,11 +3119,15 @@
+ while (s->caps.size()) {
Cap *cap = *s->caps.begin();
Inode *in = cap->inode;
int dirty_caps = 0;
@@ -35,3 +38,4 @@ Signed-off-by: Yan, Zheng <zheng.z.yan at intel.com>
if (dirty_caps) {
lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
if (in->flushing_caps)
+ num_flushing_caps--;
diff --git a/debian/patches/client-sleep3.patch b/debian/patches/client-sleep3.patch
index 8dabc7b..edf1a36 100644
--- a/debian/patches/client-sleep3.patch
+++ b/debian/patches/client-sleep3.patch
@@ -10,7 +10,8 @@ Signed-off-by: Yan, Zheng <zheng.z.yan at intel.com>
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
-@@ -2090,15 +2090,21 @@ void Client::kick_requests_closed(MetaSe
+@@ -2123,17 +2123,23 @@
+ void Client::kick_requests_closed(MetaSession *session)
{
ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
@@ -40,3 +41,4 @@ Signed-off-by: Yan, Zheng <zheng.z.yan at intel.com>
}
}
assert(session->requests.empty());
+ assert(session->unsafe_requests.empty());
diff --git a/debian/patches/firefly-latest.patch b/debian/patches/firefly-latest.patch
new file mode 100644
index 0000000..8414fc0
--- /dev/null
+++ b/debian/patches/firefly-latest.patch
@@ -0,0 +1,11172 @@
+Last-Update: 2014-09-16
+Forwarded: not-needed
+Origin: upstream
+Author: Dmitry Smirnov <onlyjob at member.fsf.org>
+Description: fixes from "firefly" branch since 0.80.5 release
+
+--- a/configure.ac
++++ b/configure.ac
+@@ -471,11 +471,16 @@
+ [AC_MSG_FAILURE(
+ [no libatomic-ops found (use --without-libatomic-ops to disable)])
+ ])])
+ AS_IF([test "$HAVE_ATOMIC_OPS" = "1"],
+- [],
++ [
++ AC_CHECK_SIZEOF(AO_t, [], [
++ #include <atomic_ops.h>
++ ])
++ ],
+ [AC_DEFINE([NO_ATOMIC_OPS], [1], [Defined if you do not have atomic_ops])])
+
++
+ AM_CONDITIONAL(WITH_LIBATOMIC, [test "$HAVE_ATOMIC_OPS" = "1"])
+
+ # newsyn? requires mpi.
+ #AC_ARG_WITH([newsyn],
+--- /dev/null
++++ b/doc/_templates/layout.html
+@@ -0,0 +1,5 @@
++{% extends "!layout.html" %}
++
++{%- block extrahead %}
++ <script type="text/javascript" src="http://ayni.ceph.com/public/js/ceph.js"></script>
++{% endblock %}
+--- a/src/ceph-disk
++++ b/src/ceph-disk
+@@ -118,8 +118,11 @@
+ STATEDIR = '/var/lib/ceph'
+
+ SYSCONFDIR = '/etc/ceph'
+
++# only warn once about some things
++warned_about = {}
++
+ # Nuke the TERM variable to avoid confusing any subprocesses we call.
+ # For example, libreadline will print weird control sequences for some
+ # TERM values.
+ if 'TERM' in os.environ:
+@@ -130,10 +133,8 @@
+ LOG_NAME = os.path.basename(sys.argv[0])
+ LOG = logging.getLogger(LOG_NAME)
+
+
+-
+-
+ ###### lock ########
+
+ class filelock(object):
+ def __init__(self, fn):
+@@ -149,10 +150,12 @@
+ assert self.fd
+ fcntl.lockf(self.fd, fcntl.LOCK_UN)
+ self.fd = None
+
++
+ ###### exceptions ########
+
++
+ class Error(Exception):
+ """
+ Error
+ """
+@@ -160,51 +163,60 @@
+ def __str__(self):
+ doc = self.__doc__.strip()
+ return ': '.join([doc] + [str(a) for a in self.args])
+
++
+ class MountError(Error):
+ """
+ Mounting filesystem failed
+ """
+
++
+ class UnmountError(Error):
+ """
+ Unmounting filesystem failed
+ """
+
++
+ class BadMagicError(Error):
+ """
+ Does not look like a Ceph OSD, or incompatible version
+ """
+
++
+ class TruncatedLineError(Error):
+ """
+ Line is truncated
+ """
+
++
+ class TooManyLinesError(Error):
+ """
+ Too many lines
+ """
+
++
+ class FilesystemTypeError(Error):
+ """
+ Cannot discover filesystem type
+ """
+
++
+ class CephDiskException(Exception):
+ """
+ A base exception for ceph-disk to provide custom (ad-hoc) messages that
+ will be caught and dealt with when main() is executed
+ """
+ pass
+
++
+ class ExecutableNotFound(CephDiskException):
+ """
+ Exception to report on executables not available in PATH
+ """
+ pass
+
++
+ ####### utils
+
+
+ def maybe_mkdir(*a, **kw):
+@@ -299,9 +311,9 @@
+ of making sure that executables *will* be found and will error nicely
+ otherwise.
+ """
+ arguments = _get_command_executable(arguments)
+- LOG.info('Running command: %s' % ' '.join(arguments))
++ LOG.info('Running command: %s', ' '.join(arguments))
+ return subprocess.check_call(arguments)
+
+
+ def platform_distro():
+@@ -339,35 +351,67 @@
+ str(codename).strip()
+ )
+
+
+-# a device "name" is something like
+-# sdb
+-# cciss!c0d1
+ def get_dev_name(path):
+ """
+- get device name from path. e.g., /dev/sda -> sdas, /dev/cciss/c0d1 -> cciss!c0d1
++ get device name from path. e.g.::
++
++ /dev/sda -> sdas, /dev/cciss/c0d1 -> cciss!c0d1
++
++ a device "name" is something like::
++
++ sdb
++ cciss!c0d1
++
+ """
+ assert path.startswith('/dev/')
+ base = path[5:]
+ return base.replace('/', '!')
+
+-# a device "path" is something like
+-# /dev/sdb
+-# /dev/cciss/c0d1
++
+ def get_dev_path(name):
+ """
+ get a path (/dev/...) from a name (cciss!c0d1)
++ a device "path" is something like::
++
++ /dev/sdb
++ /dev/cciss/c0d1
++
+ """
+ return '/dev/' + name.replace('!', '/')
+
++
+ def get_dev_relpath(name):
+ """
+ get a relative path to /dev from a name (cciss!c0d1)
+ """
+ return name.replace('!', '/')
+
+
++def get_dev_size(dev, size='megabytes'):
++ """
++ Attempt to get the size of a device so that we can prevent errors
++ from actions to devices that are smaller, and improve error reporting.
++
++ Because we want to avoid breakage in case this approach is not robust, we
++ will issue a warning if we failed to get the size.
++
++ :param size: bytes or megabytes
++ :param dev: the device to calculate the size
++ """
++ fd = os.open(dev, os.O_RDONLY)
++ dividers = {'bytes': 1, 'megabytes': 1024*1024}
++ try:
++ device_size = os.lseek(fd, 0, os.SEEK_END)
++ divider = dividers.get(size, 1024*1024) # default to megabytes
++ return device_size/divider
++ except Exception as error:
++ LOG.warning('failed to get size of %s: %s' % (dev, str(error)))
++ finally:
++ os.close(fd)
++
++
+ def get_partition_dev(dev, pnum):
+ """
+ get the device name for a partition
+
+@@ -388,8 +432,9 @@
+ return get_dev_path(partname)
+ else:
+ raise Error('partition %d for %s does not appear to exist' % (pnum, dev))
+
++
+ def list_all_partitions():
+ """
+ Return a list of devices and partitions
+ """
+@@ -402,8 +447,9 @@
+ continue
+ dev_part_list[name] = list_partitions(name)
+ return dev_part_list
+
++
+ def list_partitions(basename):
+ """
+ Return a list of partitions on the given device name
+ """
+@@ -412,8 +458,25 @@
+ if name.startswith(basename):
+ partitions.append(name)
+ return partitions
+
++def get_partition_base(dev):
++ """
++ Get the base device for a partition
++ """
++ dev = os.path.realpath(dev)
++ if not stat.S_ISBLK(os.lstat(dev).st_mode):
++ raise Error('not a block device', dev)
++
++ name = get_dev_name(dev)
++ if os.path.exists(os.path.join('/sys/block', name)):
++ raise Error('not a partition', dev)
++
++ # find the base
++ for basename in os.listdir('/sys/block'):
++ if os.path.exists(os.path.join('/sys/block', basename, name)):
++ return '/dev/' + basename
++ raise Error('no parent device for partition', dev)
+
+ def is_partition(dev):
+ """
+ Check whether a given device path is a partition or a full disk.
+@@ -475,23 +538,23 @@
+ base = base[:-1]
+ return []
+
+
+-def verify_not_in_use(dev):
++def verify_not_in_use(dev, check_partitions=False):
+ """
+ Verify if a given device (path) is in use (e.g. mounted or
+ in use by device-mapper).
+
+ :raises: Error if device is in use.
+ """
+ assert os.path.exists(dev)
+- if is_partition(dev):
+- if is_mounted(dev):
+- raise Error('Device is mounted', dev)
+- holders = is_held(dev)
+- if holders:
+- raise Error('Device is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders))
+- else:
++ if is_mounted(dev):
++ raise Error('Device is mounted', dev)
++ holders = is_held(dev)
++ if holders:
++ raise Error('Device is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders))
++
++ if check_partitions and not is_partition(dev):
+ basename = get_dev_name(os.path.realpath(dev))
+ for partname in list_partitions(basename):
+ partition = get_dev_path(partname)
+ if is_mounted(partition):
+@@ -535,12 +598,14 @@
+
+ try:
+ line = must_be_one_line(line)
+ except (TruncatedLineError, TooManyLinesError) as e:
+- raise Error('File is corrupt: {path}: {msg}'.format(
++ raise Error(
++ 'File is corrupt: {path}: {msg}'.format(
+ path=path,
+ msg=e,
+- ))
++ )
++ )
+ return line
+
+
+ def write_one_line(parent, name, text):
+@@ -745,9 +810,9 @@
+ Maps a device to a dmcrypt device.
+
+ :return: Path to the dmcrypt device.
+ """
+- dev = '/dev/mapper/'+ _uuid
++ dev = '/dev/mapper/' + _uuid
+ args = [
+ 'cryptsetup',
+ '--key-file',
+ keypath,
+@@ -791,8 +856,14 @@
+ """
+ Mounts a device with given filessystem type and
+ mount options to a tempfile path under /var/lib/ceph/tmp.
+ """
++ # sanity check: none of the arguments are None
++ if dev is None:
++ raise ValueError('dev may not be None')
++ if fstype is None:
++ raise ValueError('fstype may not be None')
++
+ # pick best-of-breed mount options based on fs type
+ if options is None:
+ options = MOUNT_OPTIONS.get(fstype, '')
+
+@@ -966,8 +1037,17 @@
+ size=journal_size,
+ )
+ LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
+
++ dev_size = get_dev_size(journal)
++
++ if journal_size > dev_size:
++ LOG.error('refusing to create journal on %s' % journal)
++ LOG.error('journal size (%sM) is bigger than device (%sM)' % (journal_size, dev_size))
++ raise Error(
++ '%s device size (%sM) is not big enough for journal' % (journal, dev_size)
++ )
++
+ try:
+ LOG.debug('Creating journal partition num %d size %d on %s', num, journal_size, journal)
+ command_check_call(
+ [
+@@ -1043,9 +1123,9 @@
+ journal):
+
+ if not os.path.exists(journal):
+ LOG.debug('Creating journal file %s with size 0 (ceph-osd will resize and allocate)', journal)
+- with file(journal, 'wb') as journal_file:
++ with file(journal, 'wb') as journal_file: # noqa
+ pass
+
+ LOG.debug('Journal is file %s', journal)
+ LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
+@@ -1109,15 +1189,16 @@
+ os.symlink(target, path)
+ except:
+ raise Error('unable to create symlink %s -> %s' % (path, target))
+
++
+ def prepare_dir(
+ path,
+ journal,
+ cluster_uuid,
+ osd_uuid,
+ journal_uuid,
+- journal_dmcrypt = None,
++ journal_dmcrypt=None,
+ ):
+
+ if os.path.exists(os.path.join(path, 'magic')):
+ LOG.debug('Data dir %s already exists', path)
+@@ -1182,11 +1263,8 @@
+ if is_partition(data):
+ LOG.debug('OSD data device %s is a partition', data)
+ rawdev = data
+ else:
+- if journal_dmcrypt is not None:
+- dmcrypt_unmap(journal)
+-
+ LOG.debug('Creating osd partition on %s', data)
+ try:
+ command_check_call(
+ [
+@@ -1237,11 +1315,11 @@
+ args.extend(['-f']) # always force
+ else:
+ args.extend(MKFS_ARGS.get(fstype, []))
+ args.extend([
+- '--',
+- dev,
+- ])
++ '--',
++ dev,
++ ])
+ try:
+ LOG.debug('Creating %s fs on %s', fstype, dev)
+ command_check_call(args)
+ except subprocess.CalledProcessError as e:
+@@ -1266,10 +1344,8 @@
+ unmount(path)
+ finally:
+ if rawdev != dev:
+ dmcrypt_unmap(osd_uuid)
+- if journal_dmcrypt is not None:
+- dmcrypt_unmap(journal)
+
+ if not is_partition(data):
+ try:
+ command_check_call(
+@@ -1288,9 +1364,9 @@
+ journal_dm_keypath = None
+ osd_dm_keypath = None
+
+ try:
+- prepare_lock.acquire()
++ prepare_lock.acquire() # noqa
+ if not os.path.exists(args.data):
+ if args.data_dev:
+ raise Error('data path does not exist', args.data)
+ else:
+@@ -1298,14 +1374,14 @@
+
+ # in use?
+ dmode = os.stat(args.data).st_mode
+ if stat.S_ISBLK(dmode):
+- verify_not_in_use(args.data)
++ verify_not_in_use(args.data, True)
+
+ if args.journal and os.path.exists(args.journal):
+ jmode = os.stat(args.journal).st_mode
+ if stat.S_ISBLK(jmode):
+- verify_not_in_use(args.journal)
++ verify_not_in_use(args.journal, False)
+
+ if args.zap_disk is not None:
+ if stat.S_ISBLK(dmode) and not is_partition(args.data):
+ zap(args.data)
+@@ -1420,9 +1496,9 @@
+ osd_dm_keypath=osd_dm_keypath,
+ )
+ else:
+ raise Error('not a dir or block device', args.data)
+- prepare_lock.release()
++ prepare_lock.release() # noqa
+
+ if stat.S_ISBLK(dmode):
+ # try to make sure the kernel refreshes the table. note
+ # that if this gets ebusy, we are probably racing with
+@@ -1456,9 +1532,9 @@
+ if journal_dm_keypath:
+ os.unlink(journal_dm_keypath)
+ if osd_dm_keypath:
+ os.unlink(osd_dm_keypath)
+- prepare_lock.release()
++ prepare_lock.release() # noqa
+ raise e
+
+
+ ###########################
+@@ -1622,20 +1698,23 @@
+ command_check_call(
+ [
+ svc,
+ 'ceph',
++ '--cluster',
++ '{cluster}'.format(cluster=cluster),
+ 'start',
+ 'osd.{osd_id}'.format(osd_id=osd_id),
+ ],
+ )
+ else:
+ raise Error('{cluster} osd.{osd_id} is not tagged with an init system'.format(
+- cluster=cluster,
+- osd_id=osd_id,
+- ))
++ cluster=cluster,
++ osd_id=osd_id,
++ ))
+ except subprocess.CalledProcessError as e:
+ raise Error('ceph osd start failed', e)
+
++
+ def detect_fstype(
+ dev,
+ ):
+ fstype = _check_output(
+@@ -1703,10 +1782,10 @@
+ other = False
+ src_dev = os.stat(path).st_dev
+ try:
+ dst_dev = os.stat((STATEDIR + '/osd/{cluster}-{osd_id}').format(
+- cluster=cluster,
+- osd_id=osd_id)).st_dev
++ cluster=cluster,
++ osd_id=osd_id)).st_dev
+ if src_dev == dst_dev:
+ active = True
+ else:
+ parent_dev = os.stat(STATEDIR + '/osd').st_dev
+@@ -1759,9 +1838,9 @@
+ )
+
+ (osd_id, cluster) = activate(path, activate_key_template, init)
+
+- if init not in ( None, 'none' ):
++ if init not in (None, 'none' ):
+ canonical = (STATEDIR + '/osd/{cluster}-{osd_id}').format(
+ cluster=cluster,
+ osd_id=osd_id)
+ if path != canonical:
+@@ -1814,8 +1893,9 @@
+ LOG.warning('No fsid defined in ' + SYSCONFDIR + '/ceph.conf; using anyway')
+ return 'ceph'
+ return None
+
++
+ def activate(
+ path,
+ activate_key_template,
+ init,
+@@ -1860,9 +1940,9 @@
+ fsid=fsid,
+ keyring=keyring,
+ )
+
+- if init not in ( None, 'none' ):
++ if init not in (None, 'none' ):
+ if init == 'auto':
+ conf_val = get_conf(
+ cluster=cluster,
+ variable='init'
+@@ -1911,9 +1991,9 @@
+ if is_suppressed(args.path):
+ LOG.info('suppressed activate request on %s', args.path)
+ return
+
+- activate_lock.acquire()
++ activate_lock.acquire() # noqa
+ try:
+ mode = os.stat(args.path).st_mode
+ if stat.S_ISBLK(mode):
+ (cluster, osd_id) = mount_activate(
+@@ -1931,9 +2011,9 @@
+
+ if args.mark_init == 'none':
+ command_check_call(
+ [
+- 'ceph-osd',
++ 'ceph-osd',
+ '--cluster={cluster}'.format(cluster=cluster),
+ '--id={osd_id}'.format(osd_id=osd_id),
+ '--osd-data={path}'.format(path=args.path),
+ '--osd-journal={path}/journal'.format(path=args.path),
+@@ -1942,17 +2022,17 @@
+
+ else:
+ raise Error('%s is not a directory or block device' % args.path)
+
+- if args.mark_init not in ( None, 'none' ):
++ if args.mark_init not in (None, 'none' ):
+
+ start_daemon(
+ cluster=cluster,
+ osd_id=osd_id,
+ )
+
+ finally:
+- activate_lock.release()
++ activate_lock.release() # noqa
+
+
+ ###########################
+
+@@ -1983,16 +2063,17 @@
+ value = str(out).split('\n', 1)[0]
+ LOG.debug('Journal %s has OSD UUID %s', path, value)
+ return value
+
++
+ def main_activate_journal(args):
+ if not os.path.exists(args.dev):
+ raise Error('%s does not exist' % args.dev)
+
+ cluster = None
+ osd_id = None
+ osd_uuid = None
+- activate_lock.acquire()
++ activate_lock.acquire() # noqa
+ try:
+ osd_uuid = get_journal_osd_uuid(args.dev)
+ path = os.path.join('/dev/disk/by-partuuid/', osd_uuid.lower())
+
+@@ -2007,12 +2088,14 @@
+ osd_id=osd_id,
+ )
+
+ finally:
+- activate_lock.release()
++ activate_lock.release() # noqa
++
+
+ ###########################
+
++
+ def main_activate_all(args):
+ dir = '/dev/disk/by-parttypeuuid'
+ LOG.debug('Scanning %s', dir)
+ if not os.path.exists(dir):
+@@ -2021,12 +2104,18 @@
+ for name in os.listdir(dir):
+ if name.find('.') < 0:
+ continue
+ (tag, uuid) = name.split('.')
+- if tag == OSD_UUID:
+- path = os.path.join(dir, name)
++
++ if tag == OSD_UUID or tag == DMCRYPT_OSD_UUID:
++
++ if tag == DMCRYPT_OSD_UUID:
++ path = os.path.join('/dev/mapper', uuid)
++ else:
++ path = os.path.join(dir, name)
++
+ LOG.info('Activating %s', path)
+- activate_lock.acquire()
++ activate_lock.acquire() # noqa
+ try:
+ (cluster, osd_id) = mount_activate(
+ dev=path,
+ activate_key_template=args.activate_key_template,
+@@ -2044,9 +2133,9 @@
+ )
+ err = True
+
+ finally:
+- activate_lock.release()
++ activate_lock.release() # noqa
+ if err:
+ raise Error('One or more partitions failed to activate')
+
+
+@@ -2065,15 +2154,17 @@
+ if swaps_dev == dev:
+ return True
+ return False
+
++
+ def get_oneliner(base, name):
+ path = os.path.join(base, name)
+ if os.path.isfile(path):
+ with open(path, 'r') as _file:
+ return _file.readline().rstrip()
+ return None
+
++
+ def get_dev_fs(dev):
+ fscheck, _ = command(
+ [
+ 'blkid',
+@@ -2087,9 +2178,58 @@
+ return fstype
+ else:
+ return None
+
++
+ def get_partition_type(part):
++ """
++ Get the GPT partition type UUID. If we have an old blkid and can't
++ get it that way, use sgdisk and use the description instead (and hope
++ dmcrypt isn't being used).
++ """
++ blkid, _ = command(
++ [
++ 'blkid',
++ '-p',
++ '-o', 'udev',
++ part,
++ ]
++ )
++ saw_part_entry = False
++ for line in blkid.splitlines():
++ (key, value) = line.split('=')
++ if key == 'ID_PART_ENTRY_TYPE':
++ return value
++ if key == 'ID_PART_ENTRY_SCHEME':
++ table_type = value
++ if key.startswith('ID_PART_ENTRY_'):
++ saw_part_entry = True
++
++ # hmm, is it in fact GPT?
++ table_type = None
++ base = get_partition_base(part)
++ blkid, _ = command(
++ [
++ 'blkid',
++ '-p',
++ '-o', 'udev',
++ base
++ ]
++ )
++ for line in blkid.splitlines():
++ (key, value) = line.split('=')
++ if key == 'ID_PART_TABLE_TYPE':
++ table_type = value
++ if table_type != 'gpt':
++ return None # not even GPT
++
++ if saw_part_entry:
++ return None # GPT, and blkid appears to be new, so we're done.
++
++ # bah, fall back to sgdisk.
++ if 'blkid' not in warned_about:
++ LOG.warning('Old blkid does not support ID_PART_ENTRY_* fields, trying sgdisk; may not correctly identify ceph volumes with dmcrypt')
++ warned_about['blkid'] = True
+ (base, partnum) = re.match('(\D+)(\d+)', part).group(1, 2)
+ sgdisk, _ = command(
+ [
+ 'sgdisk',
+@@ -2103,11 +2243,18 @@
+ if m is not None:
+ num = m.group(1)
+ if num != partnum:
+ continue
+- return m.group(2)
++ desc = m.group(2)
++ # assume unencrypted ... blkid has failed us :(
++ if desc == 'ceph data':
++ return OSD_UUID
++ if desc == 'ceph journal':
++ return JOURNAL_UUID
++
+ return None
+
++
+ def get_partition_uuid(dev):
+ (base, partnum) = re.match('(\D+)(\d+)', dev).group(1, 2)
+ out, _ = command(['sgdisk', '-i', partnum, base])
+ for line in out.splitlines():
+@@ -2115,8 +2262,9 @@
+ if m:
+ return m.group(1).lower()
+ return None
+
++
+ def more_osd_info(path, uuid_map):
+ desc = []
+ ceph_fsid = get_oneliner(path, 'ceph_fsid')
+ if ceph_fsid:
+@@ -2137,46 +2285,71 @@
+ desc.append('journal %s' % uuid_map[journal_uuid])
+
+ return desc
+
++def list_dev_osd(dev, uuid_map):
++ path = is_mounted(dev)
++ fs_type = get_dev_fs(dev)
++ desc = []
++ if path:
++ desc.append('active')
++ desc.extend(more_osd_info(path, uuid_map))
++ elif fs_type:
++ try:
++ tpath = mount(dev=dev, fstype=fs_type, options='')
++ if tpath:
++ try:
++ magic = get_oneliner(tpath, 'magic')
++ if magic is not None:
++ desc.append('prepared')
++ desc.extend(more_osd_info(tpath, uuid_map))
++ finally:
++ unmount(tpath)
++ except MountError:
++ pass
++ return desc
+
+ def list_dev(dev, uuid_map, journal_map):
+ ptype = 'unknown'
+ prefix = ''
+ if is_partition(dev):
+ ptype = get_partition_type(dev)
+ prefix = ' '
+- fs_type = get_dev_fs(dev)
+- path = is_mounted(dev)
+
+ desc = []
+- if ptype == 'ceph data':
+- if path:
+- desc.append('active')
+- desc.extend(more_osd_info(path, uuid_map))
+- elif fs_type:
+- try:
+- tpath = mount(dev=dev, fstype=fs_type, options='')
+- if tpath:
+- try:
+- magic = get_oneliner(tpath, 'magic')
+- if magic is not None:
+- desc.append('prepared')
+- desc.extend(more_osd_info(tpath, uuid_map))
+- finally:
+- unmount(tpath)
+- except MountError:
+- pass
++ if ptype == OSD_UUID:
++ desc = list_dev_osd(dev, uuid_map)
+ if desc:
+ desc = ['ceph data'] + desc
+ else:
+ desc = ['ceph data', 'unprepared']
+- elif ptype == 'ceph journal':
++ elif ptype == DMCRYPT_OSD_UUID:
++ holders = is_held(dev)
++ if not holders:
++ desc = ['ceph data (dmcrypt)', 'not currently mapped']
++ elif len(holders) == 1:
++ holder = '/dev/' + holders[0]
++ fs_desc = list_dev_osd(holder, uuid_map)
++ desc = ['ceph data (dmcrypt %s)' % holder] + fs_desc
++ else:
++ desc = ['ceph data (dmcrypt)', 'holders: ' + ','.join(holders)]
++ elif ptype == JOURNAL_UUID:
+ desc.append('ceph journal')
+ part_uuid = get_partition_uuid(dev)
+ if part_uuid and part_uuid in journal_map:
+ desc.append('for %s' % journal_map[part_uuid])
++ elif ptype == DMCRYPT_JOURNAL_UUID:
++ holders = is_held(dev)
++ if len(holders) == 1:
++ desc = ['ceph journal (dmcrypt /dev/%s)' % holders[0]]
++ else:
++ desc = ['ceph journal (dmcrypt)']
++ part_uuid = get_partition_uuid(dev)
++ if part_uuid and part_uuid in journal_map:
++ desc.append('for %s' % journal_map[part_uuid])
+ else:
++ path = is_mounted(dev)
++ fs_type = get_dev_fs(dev)
+ if is_swap(dev):
+ desc.append('swap')
+ else:
+ desc.append('other')
+@@ -2189,9 +2362,8 @@
+
+ print '%s%s %s' % (prefix, dev, ', '.join(desc))
+
+
+-
+ def main_list(args):
+ partmap = list_all_partitions()
+
+ uuid_map = {}
+@@ -2202,20 +2374,37 @@
+ part_uuid = get_partition_uuid(dev)
+ if part_uuid:
+ uuid_map[part_uuid] = dev
+ ptype = get_partition_type(dev)
+- if ptype == 'ceph data':
++ if ptype == OSD_UUID:
+ fs_type = get_dev_fs(dev)
+- try:
+- tpath = mount(dev=dev, fstype=fs_type, options='')
++ if fs_type is not None:
+ try:
+- journal_uuid = get_oneliner(tpath, 'journal_uuid')
+- if journal_uuid:
+- journal_map[journal_uuid.lower()] = dev
+- finally:
+- unmount(tpath)
+- except MountError:
+- pass
++ tpath = mount(dev=dev, fstype=fs_type, options='')
++ try:
++ journal_uuid = get_oneliner(tpath, 'journal_uuid')
++ if journal_uuid:
++ journal_map[journal_uuid.lower()] = dev
++ finally:
++ unmount(tpath)
++ except MountError:
++ pass
++ if ptype == DMCRYPT_OSD_UUID:
++ holders = is_held(dev)
++ if len(holders) == 1:
++ holder = '/dev/' + holders[0]
++ fs_type = get_dev_fs(holder)
++ if fs_type is not None:
++ try:
++ tpath = mount(dev=holder, fstype=fs_type, options='')
++ try:
++ journal_uuid = get_oneliner(tpath, 'journal_uuid')
++ if journal_uuid:
++ journal_map[journal_uuid.lower()] = dev
++ finally:
++ unmount(tpath)
++ except MountError:
++ pass
+
+ for base, parts in sorted(partmap.iteritems()):
+ if parts:
+ print '%s :' % get_dev_path(base)
+@@ -2243,26 +2432,28 @@
+ if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(path).st_mode):
+ return False
+ base = get_dev_name(disk)
+ while len(base):
+- if os.path.exists(SUPPRESS_PREFIX + base):
++ if os.path.exists(SUPPRESS_PREFIX + base): # noqa
+ return True
+ base = base[:-1]
+ except:
+ return False
+
++
+ def set_suppress(path):
+ disk = os.path.realpath(path)
+ if not os.path.exists(disk):
+ raise Error('does not exist', path)
+ if not stat.S_ISBLK(os.lstat(path).st_mode):
+ raise Error('not a block device', path)
+ base = get_dev_name(disk)
+
+- with file(SUPPRESS_PREFIX + base, 'w') as f:
++ with file(SUPPRESS_PREFIX + base, 'w') as f: # noqa
+ pass
+ LOG.info('set suppress flag on %s', base)
+
++
+ def unset_suppress(path):
+ disk = os.path.realpath(path)
+ if not os.path.exists(disk):
+ raise Error('does not exist', path)
+@@ -2270,9 +2461,9 @@
+ raise Error('not a block device', path)
+ assert disk.startswith('/dev/')
+ base = get_dev_name(disk)
+
+- fn = SUPPRESS_PREFIX + base
++ fn = SUPPRESS_PREFIX + base # noqa
+ if not os.path.exists(fn):
+ raise Error('not marked as suppressed', path)
+
+ try:
+@@ -2284,18 +2475,24 @@
+
+ def main_suppress(args):
+ set_suppress(args.path)
+
++
+ def main_unsuppress(args):
+ unset_suppress(args.path)
+
++
+ def main_zap(args):
+ for dev in args.dev:
+ zap(dev)
+
+ ###########################
+
++
+ def setup_statedir(dir):
++ # XXX The following use of globals makes linting
++ # really hard. Global state in Python is iffy and
++ # should be avoided.
+ global STATEDIR
+ STATEDIR = dir
+
+ if not os.path.exists(STATEDIR):
+@@ -2311,12 +2508,14 @@
+
+ global SUPPRESS_PREFIX
+ SUPPRESS_PREFIX = STATEDIR + '/tmp/suppress-activate.'
+
++
+ def setup_sysconfdir(dir):
+ global SYSCONFDIR
+ SYSCONFDIR = dir
+
++
+ def parse_args():
+ parser = argparse.ArgumentParser(
+ 'ceph-disk',
+ )
+@@ -2588,4 +2787,5 @@
+
+
+ if __name__ == '__main__':
+ main()
++ warned_about = {}
+--- a/src/ceph.in
++++ b/src/ceph.in
+@@ -105,8 +105,16 @@
+ for mdsdict in infodict.values():
+ l.append(mdsdict['name'])
+ return l
+
++# these args must be passed to all child programs
++GLOBAL_ARGS = {
++ 'client_id': '--id',
++ 'client_name': '--name',
++ 'cluster': '--cluster',
++ 'cephconf': '--conf',
++}
++
+ def parse_cmdargs(args=None, target=''):
+ # alias: let the line-wrapping be sane
+ AP = argparse.ArgumentParser
+
+@@ -338,17 +346,25 @@
+
+ return ret
+
+
+-def ceph_conf(field, name):
++def ceph_conf(parsed_args, field, name):
++ args=['ceph-conf']
++
++ if name:
++ args.extend(['--name', name])
++
++ # add any args in GLOBAL_ARGS
++ for key, val in GLOBAL_ARGS.iteritems():
++ # ignore name in favor of argument name, if any
++ if name and key == 'client_name':
++ continue
++ if getattr(parsed_args, key):
++ args.extend([val, getattr(parsed_args, key)])
++
++ args.extend(['--show-config-value', field])
+ p = subprocess.Popen(
+- args=[
+- 'ceph-conf',
+- '--show-config-value',
+- field,
+- '-n',
+- name,
+- ],
++ args,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ outdata, errdata = p.communicate()
+ if (len(errdata)):
+@@ -537,9 +553,10 @@
+ sockpath = childargs[1]
+ else:
+ # try resolve daemon name
+ try:
+- sockpath = ceph_conf('admin_socket', childargs[1])
++ sockpath = ceph_conf(parsed_args, 'admin_socket',
++ childargs[1])
+ except Exception as e:
+ print >> sys.stderr, \
+ 'Can\'t get admin socket path: ' + str(e)
+ return errno.EINVAL
+--- a/src/ceph_common.sh
++++ b/src/ceph_common.sh
+@@ -49,14 +49,15 @@
+ get_conf user "" "user"
+
+ #echo host for $name is $host, i am $hostname
+
+- if [ -e "/var/lib/ceph/$type/ceph-$id/upstart" ]; then
++ cluster=$1
++ if [ -e "/var/lib/ceph/$type/$cluster-$id/upstart" ]; then
+ return 1
+ fi
+
+ # sysvinit managed instance in standard location?
+- if [ -e "/var/lib/ceph/$type/ceph-$id/sysvinit" ]; then
++ if [ -e "/var/lib/ceph/$type/$cluster-$id/sysvinit" ]; then
+ host="$hostname"
+ echo "=== $type.$id === "
+ return 0
+ fi
+--- a/src/ceph_mon.cc
++++ b/src/ceph_mon.cc
+@@ -42,8 +42,10 @@
+ #include "global/signal_handler.h"
+
+ #include "include/assert.h"
+
++#include "erasure-code/ErasureCodePlugin.h"
++
+ #define dout_subsys ceph_subsys_mon
+
+ Monitor *mon = NULL;
+
+@@ -183,8 +185,23 @@
+ cerr << " where the mon store and keyring are located\n";
+ generic_server_usage();
+ }
+
++int preload_erasure_code()
++{
++ string directory = g_conf->osd_pool_default_erasure_code_directory;
++ string plugins = g_conf->osd_erasure_code_plugins;
++ stringstream ss;
++ int r = ErasureCodePluginRegistry::instance().preload(plugins,
++ directory,
++ ss);
++ if (r)
++ derr << ss.str() << dendl;
++ else
++ dout(10) << ss.str() << dendl;
++ return r;
++}
++
+ int main(int argc, const char **argv)
+ {
+ int err;
+
+@@ -415,8 +432,10 @@
+ global_init_postfork_start(g_ceph_context);
+ }
+ common_init_finish(g_ceph_context);
+ global_init_chdir(g_ceph_context);
++ if (preload_erasure_code() < -1)
++ prefork.exit(1);
+ }
+
+ MonitorDBStore *store = new MonitorDBStore(g_conf->mon_data);
+
+--- a/src/ceph_osd.cc
++++ b/src/ceph_osd.cc
+@@ -47,8 +47,10 @@
+ #include "perfglue/heap_profiler.h"
+
+ #include "include/assert.h"
+
++#include "erasure-code/ErasureCodePlugin.h"
++
+ #define dout_subsys ceph_subsys_osd
+
+ OSD *osd = NULL;
+
+@@ -65,8 +67,23 @@
+ derr << " --debug_osd N set debug level (e.g. 10)" << dendl;
+ generic_server_usage();
+ }
+
++int preload_erasure_code()
++{
++ string directory = g_conf->osd_pool_default_erasure_code_directory;
++ string plugins = g_conf->osd_erasure_code_plugins;
++ stringstream ss;
++ int r = ErasureCodePluginRegistry::instance().preload(plugins,
++ directory,
++ ss);
++ if (r)
++ derr << ss.str() << dendl;
++ else
++ dout(10) << ss.str() << dendl;
++ return r;
++}
++
+ int main(int argc, const char **argv)
+ {
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+@@ -450,8 +467,11 @@
+ if (mc.build_initial_monmap() < 0)
+ return -1;
+ global_init_chdir(g_ceph_context);
+
++ if (preload_erasure_code() < -1)
++ return -1;
++
+ osd = new OSD(g_ceph_context,
+ store,
+ whoami,
+ ms_cluster,
+--- a/src/cls/rgw/cls_rgw.cc
++++ b/src/cls/rgw/cls_rgw.cc
+@@ -669,9 +669,9 @@
+ CLS_LOG(0, "rgw_bucket_complete_op(): entry.name=%s entry.meta.category=%d\n", remove_entry.name.c_str(), remove_entry.meta.category);
+ unaccount_entry(header, remove_entry);
+
+ if (op.log_op) {
+- rc = log_index_operation(hctx, op.name, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime,
++ rc = log_index_operation(hctx, remove_oid_name, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime,
+ remove_entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker);
+ if (rc < 0)
+ continue;
+ }
+--- a/src/common/Finisher.h
++++ b/src/common/Finisher.h
+@@ -76,8 +76,17 @@
+ ls.clear();
+ if (logger)
+ logger->inc(l_finisher_queue_len);
+ }
++ void queue(list<Context*>& ls) {
++ finisher_lock.Lock();
++ finisher_queue.insert(finisher_queue.end(), ls.begin(), ls.end());
++ finisher_cond.Signal();
++ finisher_lock.Unlock();
++ ls.clear();
++ if (logger)
++ logger->inc(l_finisher_queue_len);
++ }
+
+ void start();
+ void stop();
+
+--- a/src/common/LogClient.cc
++++ b/src/common/LogClient.cc
+@@ -123,8 +123,9 @@
+ }
+
+ Message *LogClient::_get_mon_log_message()
+ {
++ assert(log_lock.is_locked());
+ if (log_queue.empty())
+ return NULL;
+
+ // only send entries that haven't been sent yet during this mon
+@@ -148,9 +149,9 @@
+ << " sending " << num_send << dendl;
+ assert(num_unsent <= log_queue.size());
+ std::deque<LogEntry>::iterator p = log_queue.begin();
+ std::deque<LogEntry> o;
+- while (p->seq < last_log_sent) {
++ while (p->seq <= last_log_sent) {
+ ++p;
+ assert(p != log_queue.end());
+ }
+ while (num_send--) {
+--- a/src/common/Makefile.am
++++ b/src/common/Makefile.am
+@@ -12,8 +12,9 @@
+ common/admin_socket.cc \
+ common/admin_socket_client.cc \
+ common/cmdparse.cc \
+ common/escape.c \
++ common/io_priority.cc \
+ common/Clock.cc \
+ common/Throttle.cc \
+ common/Timer.cc \
+ common/Finisher.cc \
+@@ -155,8 +156,9 @@
+ common/perf_counters.h \
+ common/OutputDataSocket.h \
+ common/admin_socket.h \
+ common/admin_socket_client.h \
++ common/random_cache.hpp \
+ common/shared_cache.hpp \
+ common/tracked_int_ptr.hpp \
+ common/simple_cache.hpp \
+ common/sharedptr_registry.hpp \
+@@ -174,8 +176,9 @@
+ common/TrackedOp.h \
+ common/arch.h \
+ common/armor.h \
+ common/common_init.h \
++ common/io_priority.h \
+ common/pipe.h \
+ common/code_environment.h \
+ common/signal.h \
+ common/simple_spin.h \
+--- a/src/common/Thread.cc
++++ b/src/common/Thread.cc
+@@ -15,8 +15,9 @@
+ #include "common/Thread.h"
+ #include "common/code_environment.h"
+ #include "common/debug.h"
+ #include "common/signal.h"
++#include "common/io_priority.h"
+
+ #include <dirent.h>
+ #include <errno.h>
+ #include <iostream>
+@@ -28,21 +29,38 @@
+ #include <sys/types.h>
+
+
+ Thread::Thread()
+- : thread_id(0)
++ : thread_id(0),
++ pid(0),
++ ioprio_class(-1),
++ ioprio_priority(-1)
+ {
+ }
+
+ Thread::~Thread()
+ {
+ }
+
+ void *Thread::_entry_func(void *arg) {
+- void *r = ((Thread*)arg)->entry();
++ void *r = ((Thread*)arg)->entry_wrapper();
+ return r;
+ }
+
++void *Thread::entry_wrapper()
++{
++ int p = ceph_gettid(); // may return -ENOSYS on other platforms
++ if (p > 0)
++ pid = p;
++ if (ioprio_class >= 0 &&
++ ioprio_priority >= 0) {
++ ceph_ioprio_set(IOPRIO_WHO_PROCESS,
++ pid,
++ IOPRIO_PRIO_VALUE(ioprio_class, ioprio_priority));
++ }
++ return entry();
++}
++
+ const pthread_t &Thread::get_thread_id()
+ {
+ return thread_id;
+ }
+@@ -127,4 +145,16 @@
+ int Thread::detach()
+ {
+ return pthread_detach(thread_id);
+ }
++
++int Thread::set_ioprio(int cls, int prio)
++{
++ // fixme, maybe: this can race with create()
++ ioprio_class = cls;
++ ioprio_priority = prio;
++ if (pid && cls >= 0 && prio >= 0)
++ return ceph_ioprio_set(IOPRIO_WHO_PROCESS,
++ pid,
++ IOPRIO_PRIO_VALUE(cls, prio));
++ return 0;
++}
+--- a/src/common/Thread.h
++++ b/src/common/Thread.h
+@@ -20,8 +20,12 @@
+
+ class Thread {
+ private:
+ pthread_t thread_id;
++ pid_t pid;
++ int ioprio_class, ioprio_priority;
++
++ void *entry_wrapper();
+
+ public:
+ Thread(const Thread& other);
+ const Thread& operator=(const Thread& other);
+@@ -43,7 +47,8 @@
+ int try_create(size_t stacksize);
+ void create(size_t stacksize = 0);
+ int join(void **prval = 0);
+ int detach();
++ int set_ioprio(int cls, int prio);
+ };
+
+ #endif
+--- a/src/common/WorkQueue.cc
++++ b/src/common/WorkQueue.cc
+@@ -15,8 +15,9 @@
+ #include <sstream>
+
+ #include "include/types.h"
+ #include "include/utime.h"
++#include "common/errno.h"
+ #include "WorkQueue.h"
+
+ #include "common/config.h"
+ #include "common/HeartbeatMap.h"
+@@ -32,8 +33,10 @@
+ _lock(lockname.c_str()), // this should be safe due to declaration order
+ _stop(false),
+ _pause(0),
+ _draining(0),
++ ioprio_class(-1),
++ ioprio_priority(-1),
+ _num_threads(n),
+ last_work_queue(0),
+ processing(0)
+ {
+@@ -155,8 +158,13 @@
+ while (_threads.size() < _num_threads) {
+ WorkThread *wt = new WorkThread(this);
+ ldout(cct, 10) << "start_threads creating and starting " << wt << dendl;
+ _threads.insert(wt);
++
++ int r = wt->set_ioprio(ioprio_class, ioprio_priority);
++ if (r < 0)
++ lderr(cct) << " set_ioprio got " << cpp_strerror(r) << dendl;
++
+ wt->create();
+ }
+ }
+
+@@ -254,4 +262,17 @@
+ _draining--;
+ _lock.Unlock();
+ }
+
++void ThreadPool::set_ioprio(int cls, int priority)
++{
++ Mutex::Locker l(_lock);
++ ioprio_class = cls;
++ ioprio_priority = priority;
++ for (set<WorkThread*>::iterator p = _threads.begin();
++ p != _threads.end();
++ ++p) {
++ int r = (*p)->set_ioprio(cls, priority);
++ if (r < 0)
++ lderr(cct) << " set_ioprio got " << cpp_strerror(r) << dendl;
++ }
++}
+--- a/src/common/WorkQueue.h
++++ b/src/common/WorkQueue.h
+@@ -32,8 +32,9 @@
+ bool _stop;
+ int _pause;
+ int _draining;
+ Cond _wait_cond;
++ int ioprio_class, ioprio_priority;
+
+ public:
+ class TPHandle {
+ friend class ThreadPool;
+@@ -387,8 +388,11 @@
+ /// resume work in thread pool. must match each pause() call 1:1 to resume.
+ void unpause();
+ /// wait for all work to complete
+ void drain(WorkQueue_* wq = 0);
++
++ /// set io priority
++ void set_ioprio(int cls, int priority);
+ };
+
+ class GenContextWQ :
+ public ThreadPool::WorkQueueVal<GenContext<ThreadPool::TPHandle&>*> {
+--- a/src/common/blkdev.cc
++++ b/src/common/blkdev.cc
+@@ -9,9 +9,9 @@
+ int get_block_device_size(int fd, int64_t *psize)
+ {
+ #ifdef BLKGETSIZE64
+ int ret = ::ioctl(fd, BLKGETSIZE64, psize);
+-#elif BLKGETSIZE
++#elif defined(BLKGETSIZE)
+ unsigned long sectors = 0;
+ int ret = ::ioctl(fd, BLKGETSIZE, &sectors);
+ *psize = sectors * 512ULL;
+ #else
+--- a/src/common/config.cc
++++ b/src/common/config.cc
+@@ -878,17 +878,17 @@
+ assert(lock.is_locked());
+ switch (opt->type) {
+ case OPT_INT: {
+ std::string err;
+- int f = strict_strtol(val, 10, &err);
++ int f = strict_sistrtoll(val, &err);
+ if (!err.empty())
+ return -EINVAL;
+ *(int*)opt->conf_ptr(this) = f;
+ return 0;
+ }
+ case OPT_LONGLONG: {
+ std::string err;
+- long long f = strict_strtoll(val, 10, &err);
++ long long f = strict_sistrtoll(val, &err);
+ if (!err.empty())
+ return -EINVAL;
+ *(long long*)opt->conf_ptr(this) = f;
+ return 0;
+@@ -916,17 +916,17 @@
+ }
+ return 0;
+ case OPT_U32: {
+ std::string err;
+- int f = strict_strtol(val, 10, &err);
++ int f = strict_sistrtoll(val, &err);
+ if (!err.empty())
+ return -EINVAL;
+ *(uint32_t*)opt->conf_ptr(this) = f;
+ return 0;
+ }
+ case OPT_U64: {
+ std::string err;
+- long long f = strict_strtoll(val, 10, &err);
++ long long f = strict_sistrtoll(val, &err);
+ if (!err.empty())
+ return -EINVAL;
+ *(uint64_t*)opt->conf_ptr(this) = f;
+ return 0;
+--- a/src/common/config_opts.h
++++ b/src/common/config_opts.h
+@@ -176,8 +176,9 @@
+ OPTION(mon_force_standby_active, OPT_BOOL, true) // should mons force standby-replay mds to be active
+ OPTION(mon_warn_on_old_mons, OPT_BOOL, true) // should mons set health to WARN if part of quorum is old?
+ OPTION(mon_warn_on_legacy_crush_tunables, OPT_BOOL, true) // warn if crush tunables are not optimal
+ OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0'
++OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL, true)
+ OPTION(mon_min_osdmap_epochs, OPT_INT, 500)
+ OPTION(mon_max_pgmap_epochs, OPT_INT, 500)
+ OPTION(mon_max_log_epochs, OPT_INT, 500)
+ OPTION(mon_max_mdsmap_epochs, OPT_INT, 500)
+@@ -433,8 +434,9 @@
+ "technique=reed_sol_van "
+ "k=2 "
+ "m=1 "
+ ) // default properties of osd pool create
++OPTION(osd_erasure_code_plugins, OPT_STR, "jerasure") // list of erasure code plugins
+ OPTION(osd_pool_default_flags, OPT_INT, 0) // default flags for new pools
+ OPTION(osd_pool_default_flag_hashpspool, OPT_BOOL, true) // use new pg hashing to prevent pool/pg overlap
+ OPTION(osd_pool_default_hit_set_bloom_fpp, OPT_FLOAT, .05)
+ OPTION(osd_pool_default_cache_target_dirty_ratio, OPT_FLOAT, .4)
+@@ -449,16 +451,19 @@
+ OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200)
+ OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom")
+
+ OPTION(osd_map_dedup, OPT_BOOL, true)
++OPTION(osd_map_max_advance, OPT_INT, 200) // make this < cache_size!
+ OPTION(osd_map_cache_size, OPT_INT, 500)
+ OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message
+ OPTION(osd_map_share_max_epochs, OPT_INT, 100) // cap on # of inc maps we send to peers, clients
+ OPTION(osd_op_threads, OPT_INT, 2) // 0 == no threading
+ OPTION(osd_peering_wq_batch_size, OPT_U64, 20)
+ OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64, 4194304)
+ OPTION(osd_op_pq_min_cost, OPT_U64, 65536)
+ OPTION(osd_disk_threads, OPT_INT, 1)
++OPTION(osd_disk_thread_ioprio_class, OPT_STR, "") // rt realtime be besteffort best effort idle
++OPTION(osd_disk_thread_ioprio_priority, OPT_INT, -1) // 0-7
+ OPTION(osd_recovery_threads, OPT_INT, 1)
+ OPTION(osd_recover_clone_overlap, OPT_BOOL, true) // preserve clone_overlap during recovery/migration
+
+ // Only use clone_overlap for recovery if there are fewer than
+@@ -472,8 +477,9 @@
+ OPTION(osd_snap_trim_thread_timeout, OPT_INT, 60*60*1)
+ OPTION(osd_snap_trim_sleep, OPT_FLOAT, 0)
+ OPTION(osd_scrub_thread_timeout, OPT_INT, 60)
+ OPTION(osd_scrub_finalize_thread_timeout, OPT_INT, 60*10)
++OPTION(osd_scrub_invalid_stats, OPT_BOOL, true)
+ OPTION(osd_remove_thread_timeout, OPT_INT, 60*60)
+ OPTION(osd_command_thread_timeout, OPT_INT, 10*60)
+ OPTION(osd_age, OPT_FLOAT, .8)
+ OPTION(osd_age_time, OPT_INT, 0)
+@@ -508,8 +514,9 @@
+ OPTION(osd_scrub_min_interval, OPT_FLOAT, 60*60*24) // if load is low
+ OPTION(osd_scrub_max_interval, OPT_FLOAT, 7*60*60*24) // regardless of load
+ OPTION(osd_scrub_chunk_min, OPT_INT, 5)
+ OPTION(osd_scrub_chunk_max, OPT_INT, 25)
++OPTION(osd_scrub_sleep, OPT_FLOAT, 0) // sleep between [deep]scrub ops
+ OPTION(osd_deep_scrub_interval, OPT_FLOAT, 60*60*24*7) // once a week
+ OPTION(osd_deep_scrub_stride, OPT_INT, 524288)
+ OPTION(osd_scan_list_ping_tp_interval, OPT_U64, 100)
+ OPTION(osd_auto_weight, OPT_BOOL, false)
+@@ -689,8 +696,11 @@
+ OPTION(keyvaluestore_debug_check_backend, OPT_BOOL, 0) // Expensive debugging check on sync
+ OPTION(keyvaluestore_op_threads, OPT_INT, 2)
+ OPTION(keyvaluestore_op_thread_timeout, OPT_INT, 60)
+ OPTION(keyvaluestore_op_thread_suicide_timeout, OPT_INT, 180)
++OPTION(keyvaluestore_default_strip_size, OPT_INT, 4096) // Only affect new object
++OPTION(keyvaluestore_max_expected_write_size, OPT_U64, 1ULL << 24) // bytes
++OPTION(keyvaluestore_header_cache_size, OPT_INT, 4096) // Header cache size
+
+ // max bytes to search ahead in journal searching for corruption
+ OPTION(journal_max_corrupt_search, OPT_U64, 10<<20)
+ OPTION(journal_block_align, OPT_BOOL, true)
+@@ -712,8 +722,9 @@
+ OPTION(rbd_cache_size, OPT_LONGLONG, 32<<20) // cache size in bytes
+ OPTION(rbd_cache_max_dirty, OPT_LONGLONG, 24<<20) // dirty limit in bytes - set to 0 for write-through caching
+ OPTION(rbd_cache_target_dirty, OPT_LONGLONG, 16<<20) // target dirty limit in bytes
+ OPTION(rbd_cache_max_dirty_age, OPT_FLOAT, 1.0) // seconds in cache before writeback starts
++OPTION(rbd_cache_max_dirty_object, OPT_INT, 0) // dirty limit for objects - set to 0 for auto calculate from rbd_cache_size
+ OPTION(rbd_cache_block_writes_upfront, OPT_BOOL, false) // whether to block writes to the cache before the aio_write call completes (true), or block before the aio completion is called (false)
+ OPTION(rbd_concurrent_management_ops, OPT_INT, 10) // how many operations can be in flight for a management operation like deleting or resizing an image
+ OPTION(rbd_balance_snap_reads, OPT_BOOL, false)
+ OPTION(rbd_localize_snap_reads, OPT_BOOL, false)
+--- /dev/null
++++ b/src/common/io_priority.cc
+@@ -0,0 +1,54 @@
++// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
++// vim: ts=8 sw=2 smarttab
++/*
++ * Ceph - scalable distributed file system
++ *
++ * Copyright (C) 2012 Red Hat
++ *
++ * This is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License version 2.1, as published by the Free Software
++ * Foundation. See file COPYING.
++ *
++ */
++
++#include <sys/types.h>
++#include <unistd.h>
++#include <sys/syscall.h> /* For SYS_xxx definitions */
++#include <algorithm>
++#include <errno.h>
++
++#include "common/errno.h"
++#include "io_priority.h"
++
++pid_t ceph_gettid(void)
++{
++#ifdef __linux__
++ return syscall(SYS_gettid);
++#else
++ return -ENOSYS;
++#endif
++}
++
++int ceph_ioprio_set(int whence, int who, int ioprio)
++{
++#ifdef __linux__
++ return syscall(SYS_ioprio_set, whence, who, ioprio);
++#else
++ return -ENOSYS;
++#endif
++}
++
++int ceph_ioprio_string_to_class(const std::string& s)
++{
++ std::string l;
++ std::transform(s.begin(), s.end(), l.begin(), ::tolower);
++
++ if (l == "idle")
++ return IOPRIO_CLASS_IDLE;
++ if (l == "be" || l == "besteffort" || l == "best effort")
++ return IOPRIO_CLASS_BE;
++ if (l == "rt" || l == "realtime" || l == "real time")
++ return IOPRIO_CLASS_RT;
++ return -EINVAL;
++}
+--- /dev/null
++++ b/src/common/io_priority.h
+@@ -0,0 +1,44 @@
++// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
++// vim: ts=8 sw=2 smarttab
++/*
++ * Ceph - scalable distributed file system
++ *
++ * Copyright (C) 2012 Red Hat
++ *
++ * This is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License version 2.1, as published by the Free Software
++ * Foundation. See file COPYING.
++ *
++ */
++
++#ifndef CEPH_COMMON_IO_PRIORITY_H
++#define CEPH_COMMON_IO_PRIORITY_H
++
++#include <string>
++
++extern pid_t ceph_gettid();
++
++#ifndef IOPRIO_WHO_PROCESS
++# define IOPRIO_WHO_PROCESS 1
++#endif
++#ifndef IOPRIO_PRIO_VALUE
++# define IOPRIO_CLASS_SHIFT 13
++# define IOPRIO_PRIO_VALUE(class, data) \
++ (((class) << IOPRIO_CLASS_SHIFT) | (data))
++#endif
++#ifndef IOPRIO_CLASS_RT
++# define IOPRIO_CLASS_RT 1
++#endif
++#ifndef IOPRIO_CLASS_BE
++# define IOPRIO_CLASS_BE 2
++#endif
++#ifndef IOPRIO_CLASS_IDLE
++# define IOPRIO_CLASS_IDLE 3
++#endif
++
++extern int ceph_ioprio_set(int whence, int who, int ioprio);
++
++extern int ceph_ioprio_string_to_class(const std::string& s);
++
++#endif
+--- /dev/null
++++ b/src/common/random_cache.hpp
+@@ -0,0 +1,111 @@
++// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
++// vim: ts=8 sw=2 smarttab
++/*
++ * Ceph - scalable distributed file system
++ *
++ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
++ *
++ * Author: Haomai Wang <haomaiwang@gmail.com>
++ *
++ * This is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License version 2.1, as published by the Free Software
++ * Foundation. See file COPYING.
++ *
++ */
++
++#ifndef CEPH_RANDOMCACHE_H
++#define CEPH_RANDOMCACHE_H
++
++#include "common/Mutex.h"
++#include "include/compat.h"
++#include "include/unordered_map.h"
++
++
++// Although This is a ramdom cache implementation, here still consider to make
++// the trim progress more reasonable. Each item owns its lookup frequency,
++// when the cache is full it will randomly pick up several items and compare the
++// frequency associated with. The least frequency of items will be evicted.
++template <class K, class V>
++class RandomCache {
++ // The first element of pair is the frequency of item, it's used to evict item
++ ceph::unordered_map<K, pair<uint64_t, V> > contents;
++ Mutex lock;
++ uint64_t max_size;
++ K last_trim_key;
++
++ // When cache reach full, consider to evict a certain number of items
++ static const uint64_t EVICT_COUNT = 5;
++ // Avoid too much overhead on comparing items's frequency, the number of
++ // compare items is expected to small.
++ static const uint64_t COMPARE_COUNT = 3;
++
++ // In order to make evict cache progress more lightweight and effective,
++ // several items are expected to evicted in one call
++ void trim_cache(uint64_t evict_count) {
++ typename ceph::unordered_map<K, pair<uint64_t, V> >::iterator it = contents.find(last_trim_key);
++ uint64_t total_compare = evict_count * COMPARE_COUNT;
++ map<uint64_t, K> candidates;
++
++ while (total_compare--) {
++ if (it == contents.end()) {
++ it = contents.begin();
++ }
++
++ candidates[it->second.first] = it->first;
++ it++;
++ }
++ if (it != contents.end())
++ last_trim_key = it->first;
++ else
++ last_trim_key = contents.begin()->first;
++
++ for (typename map<uint64_t, K>::iterator j = candidates.begin(); j != candidates.end(); j++) {
++ contents.erase(j->second);
++ evict_count--;
++ if (!evict_count)
++ break;
++ }
++ }
++
++ public:
++ RandomCache(size_t max_size=20) : lock("RandomCache::lock"),
++ max_size(max_size) {}
++ ~RandomCache() {
++ contents.clear();
++ }
++
++ void clear(K key) {
++ Mutex::Locker l(lock);
++ contents.erase(key);
++ }
++
++ void set_size(size_t new_size) {
++ Mutex::Locker l(lock);
++ max_size = new_size;
++ if (max_size <= contents.size()) {
++ trim_cache(contents.size() - max_size);
++ }
++ }
++
++ bool lookup(K key, V *out) {
++ Mutex::Locker l(lock);
++ typename ceph::unordered_map<K, pair<uint64_t, V> >::iterator it = contents.find(key);
++ if (it != contents.end()) {
++ it->second.first++;
++ *out = it->second.second;
++ return true;
++ }
++ return false;
++ }
++
++ void add(K key, V value) {
++ Mutex::Locker l(lock);
++ if (max_size <= contents.size()) {
++ trim_cache(EVICT_COUNT);
++ }
++ contents[key] = make_pair(1, value);
++ }
++};
++
++#endif
+--- a/src/common/str_map.cc
++++ b/src/common/str_map.cc
+@@ -23,9 +23,9 @@
+
+ using namespace std;
+
+ int get_str_map(const string &str,
+- stringstream &ss,
++ ostream &ss,
+ map<string,string> *str_map)
+ {
+ json_spirit::mValue json;
+ try {
+--- a/src/common/strtol.cc
++++ b/src/common/strtol.cc
+@@ -16,8 +16,11 @@
+ #include <limits.h>
+ #include <sstream>
+ #include <stdlib.h>
+ #include <string>
++extern "C" {
++#include <stdint.h>
++}
+
+ using std::ostringstream;
+
+ long long strict_strtoll(const char *str, int base, std::string *err)
+@@ -123,4 +126,44 @@
+ }
+ *err = "";
+ return ret;
+ }
++
++uint64_t strict_sistrtoll(const char *str, std::string *err)
++{
++ std::string s(str);
++ if (s.size() == 0) {
++ ostringstream oss;
++ oss << "strict_sistrtoll: value not specified";
++ *err = oss.str();
++ return 0;
++ }
++ const char &u = s.at(s.size()-1); //str[std::strlen(str)-1];
++ int m = 0;
++ if (u == 'B')
++ m = 0;
++ else if (u == 'K')
++ m = 10;
++ else if (u == 'M')
++ m = 20;
++ else if (u == 'G')
++ m = 30;
++ else if (u == 'T')
++ m = 40;
++ else if (u == 'P')
++ m = 50;
++ else if (u == 'E')
++ m = 60;
++ else
++ m = -1;
++
++ const char *v = NULL;
++ if (m >= 0)
++ s = std::string(str, s.size()-1);
++ v = s.c_str();
++
++ uint64_t r = strict_strtoll(v, 10, err);
++ if (err->empty() && m > 0) {
++ r = (r << m);
++ }
++ return r;
++}
+--- a/src/common/strtol.h
++++ b/src/common/strtol.h
+@@ -15,8 +15,11 @@
+ #ifndef CEPH_COMMON_STRTOL_H
+ #define CEPH_COMMON_STRTOL_H
+
+ #include <string>
++extern "C" {
++#include <stdint.h>
++}
+
+ long long strict_strtoll(const char *str, int base, std::string *err);
+
+ int strict_strtol(const char *str, int base, std::string *err);
+@@ -24,5 +27,7 @@
+ double strict_strtod(const char *str, std::string *err);
+
+ float strict_strtof(const char *str, std::string *err);
+
++uint64_t strict_sistrtoll(const char *str, std::string *err);
++
+ #endif
+--- a/src/crush/CrushWrapper.cc
++++ b/src/crush/CrushWrapper.cc
+@@ -9,34 +9,56 @@
+ #define dout_subsys ceph_subsys_crush
+
+ bool CrushWrapper::has_v2_rules() const
+ {
+- // check rules for use of indep or new SET_* rule steps
+ for (unsigned i=0; i<crush->max_rules; i++) {
+- crush_rule *r = crush->rules[i];
+- if (!r)
+- continue;
+- for (unsigned j=0; j<r->len; j++) {
+- if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
+- r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
+- r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
+- r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES)
+- return true;
++ if (is_v2_rule(i)) {
++ return true;
++ }
++ }
++ return false;
++}
++
++bool CrushWrapper::is_v2_rule(unsigned ruleid) const
++{
++ // check rule for use of indep or new SET_* rule steps
++ if (ruleid >= crush->max_rules)
++ return false;
++ crush_rule *r = crush->rules[ruleid];
++ if (!r)
++ return false;
++ for (unsigned j=0; j<r->len; j++) {
++ if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
++ r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
++ r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
++ r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES) {
++ return true;
+ }
+ }
+ return false;
+ }
+
+ bool CrushWrapper::has_v3_rules() const
+ {
+- // check rules for use of SET_CHOOSELEAF_VARY_R step
+ for (unsigned i=0; i<crush->max_rules; i++) {
+- crush_rule *r = crush->rules[i];
+- if (!r)
+- continue;
+- for (unsigned j=0; j<r->len; j++) {
+- if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R)
+- return true;
++ if (is_v3_rule(i)) {
++ return true;
++ }
++ }
++ return false;
++}
++
++bool CrushWrapper::is_v3_rule(unsigned ruleid) const
++{
++ // check rule for use of SET_CHOOSELEAF_VARY_R step
++ if (ruleid >= crush->max_rules)
++ return false;
++ crush_rule *r = crush->rules[ruleid];
++ if (!r)
++ return false;
++ for (unsigned j=0; j<r->len; j++) {
++ if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R) {
++ return true;
+ }
+ }
+ return false;
+ }
+@@ -793,8 +815,61 @@
+ have_rmaps = false;
+ return rno;
+ }
+
++int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
++{
++ if (ruleno >= crush->max_rules)
++ return -ENOENT;
++ if (crush->rules[ruleno] == NULL)
++ return -ENOENT;
++ crush_rule *rule = crush->rules[ruleno];
++
++ // build a weight map for each TAKE in the rule, and then merge them
++ for (unsigned i=0; i<rule->len; ++i) {
++ map<int,float> m;
++ float sum = 0;
++ if (rule->steps[i].op == CRUSH_RULE_TAKE) {
++ int n = rule->steps[i].arg1;
++ if (n >= 0) {
++ m[n] = 1.0;
++ sum = 1.0;
++ } else {
++ list<int> q;
++ q.push_back(n);
++ //breadth first iterate the OSD tree
++ while (!q.empty()) {
++ int bno = q.front();
++ q.pop_front();
++ crush_bucket *b = crush->buckets[-1-bno];
++ assert(b);
++ for (unsigned j=0; j<b->size; ++j) {
++ int item_id = b->items[j];
++ if (item_id >= 0) //it's an OSD
++ {
++ float w = crush_get_bucket_item_weight(b, j);
++ m[item_id] = w;
++ sum += w;
++ }
++ else //not an OSD, expand the child later
++ q.push_back(item_id);
++ }
++ }
++ }
++ }
++ for (map<int,float>::iterator p = m.begin(); p != m.end(); ++p) {
++ map<int,float>::iterator q = pmap->find(p->first);
++ if (q == pmap->end()) {
++ (*pmap)[p->first] = p->second / sum;
++ } else {
++ q->second += p->second / sum;
++ }
++ }
++ }
++
++ return 0;
++}
++
+ int CrushWrapper::remove_rule(int ruleno)
+ {
+ if (ruleno >= (int)crush->max_rules)
+ return -ENOENT;
+--- a/src/crush/CrushWrapper.h
++++ b/src/crush/CrushWrapper.h
+@@ -215,8 +215,10 @@
+ }
+ bool has_v2_rules() const;
+ bool has_v3_rules() const;
+
++ bool is_v2_rule(unsigned ruleid) const;
++ bool is_v3_rule(unsigned ruleid) const;
+
+ // bucket types
+ int get_num_type_names() const {
+ return type_map.size();
+@@ -630,8 +632,20 @@
+ if (IS_ERR(s)) return PTR_ERR(s);
+ return s->arg2;
+ }
+
++ /**
++ * calculate a map of osds to weights for a given rule
++ *
++ * Generate a map of which OSDs get how much relative weight for a
++ * given rule.
++ *
++ * @param ruleno [in] rule id
++ * @param pmap [out] map of osd to weight
++ * @return 0 for success, or negative error code
++ */
++ int get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap);
++
+ /* modifiers */
+ int add_rule(int len, int ruleset, int type, int minsize, int maxsize, int ruleno) {
+ if (!crush) return -ENOENT;
+ crush_rule *n = crush_make_rule(len, ruleset, type, minsize, maxsize);
+--- a/src/erasure-code/ErasureCodeInterface.h
++++ b/src/erasure-code/ErasureCodeInterface.h
+@@ -166,9 +166,9 @@
+ *
+ * @param [in] name of the ruleset to create
+ * @param [in] crush crushmap in which the ruleset is created
+ * @param [out] ss contains informative messages when an error occurs
+- * @return **0** on success or a negative errno on error.
++ * @return a ruleset on success or a negative errno on error.
+ */
+ virtual int create_ruleset(const string &name,
+ CrushWrapper &crush,
+ ostream *ss) const = 0;
+--- a/src/erasure-code/ErasureCodePlugin.cc
++++ b/src/erasure-code/ErasureCodePlugin.cc
+@@ -3,8 +3,9 @@
+ /*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing at cloudwatt.com>
++ * Copyright (C) 2014 Red Hat <contact at redhat.com>
+ *
+ * Author: Loic Dachary <loic at dachary.org>
+ *
+ * This library is free software; you can redistribute it and/or
+@@ -18,8 +19,9 @@
+ #include <dlfcn.h>
+
+ #include "ErasureCodePlugin.h"
+ #include "common/errno.h"
++#include "include/str_list.h"
+
+ #define PLUGIN_PREFIX "libec_"
+ #define PLUGIN_SUFFIX ".so"
+ #define PLUGIN_INIT_FUNCTION "__erasure_code_init"
+@@ -129,7 +131,33 @@
+ }
+
+ (*plugin)->library = library;
+
++ ss << __func__ << ": " << plugin_name << " ";
++
+ return 0;
+ }
+
++int ErasureCodePluginRegistry::preload(const std::string &plugins,
++ const std::string &directory,
++ ostream &ss)
++{
++ map<string,string> profile;
++ profile["directory"] = directory;
++ list<string> plugins_list;
++ get_str_list(plugins, plugins_list);
++ for (list<string>::iterator i = plugins_list.begin();
++ i != plugins_list.end();
++ i++) {
++ ErasureCodePlugin *plugin;
++ int r = load(*i, profile, &plugin, ss);
++ if (r)
++ return r;
++
++ ErasureCodeInterfaceRef erasure_code;
++ profile["technique"] = "reed_sol_van";
++ r = plugin->factory(profile, &erasure_code);
++ if (r)
++ return r;
++ }
++ return 0;
++}
+--- a/src/erasure-code/ErasureCodePlugin.h
++++ b/src/erasure-code/ErasureCodePlugin.h
+@@ -66,8 +66,11 @@
+ const map<std::string,std::string> ¶meters,
+ ErasureCodePlugin **plugin,
+ ostream &ss);
+
++ int preload(const std::string &plugins,
++ const std::string &directory,
++ ostream &ss);
+ };
+ }
+
+ #endif
+--- a/src/erasure-code/jerasure/ErasureCodeJerasure.cc
++++ b/src/erasure-code/jerasure/ErasureCodeJerasure.cc
+@@ -43,10 +43,14 @@
+ int ErasureCodeJerasure::create_ruleset(const string &name,
+ CrushWrapper &crush,
+ ostream *ss) const
+ {
+- return crush.add_simple_ruleset(name, ruleset_root, ruleset_failure_domain,
+- "indep", pg_pool_t::TYPE_ERASURE, ss);
++ int ruleid = crush.add_simple_ruleset(name, ruleset_root, ruleset_failure_domain,
++ "indep", pg_pool_t::TYPE_ERASURE, ss);
++ if (ruleid < 0)
++ return ruleid;
++ else
++ return crush.get_rule_mask_ruleset(ruleid);
+ }
+
+ void ErasureCodeJerasure::init(const map<string,string> ¶meters)
+ {
+--- a/src/include/atomic.h
++++ b/src/include/atomic.h
+@@ -20,12 +20,68 @@
+ # include "acconfig.h"
+ #endif
+
+ #include <stdlib.h>
++#include "include/Spinlock.h"
++
++namespace ceph {
++ template <class T>
++ class atomic_spinlock_t {
++ mutable ceph_spinlock_t lock;
++ T val;
++ public:
++ atomic_spinlock_t(T i=0)
++ : val(i) {
++ ceph_spin_init(&lock);
++ }
++ ~atomic_spinlock_t() {
++ ceph_spin_destroy(&lock);
++ }
++ void set(T v) {
++ ceph_spin_lock(&lock);
++ val = v;
++ ceph_spin_unlock(&lock);
++ }
++ T inc() {
++ ceph_spin_lock(&lock);
++ T r = ++val;
++ ceph_spin_unlock(&lock);
++ return r;
++ }
++ T dec() {
++ ceph_spin_lock(&lock);
++ T r = --val;
++ ceph_spin_unlock(&lock);
++ return r;
++ }
++ void add(T d) {
++ ceph_spin_lock(&lock);
++ val += d;
++ ceph_spin_unlock(&lock);
++ }
++ void sub(T d) {
++ ceph_spin_lock(&lock);
++ val -= d;
++ ceph_spin_unlock(&lock);
++ }
++ T read() const {
++ T ret;
++ ceph_spin_lock(&lock);
++ ret = val;
++ ceph_spin_unlock(&lock);
++ return ret;
++ }
++ private:
++ // forbid copying
++ atomic_spinlock_t(const atomic_spinlock_t<T> &other);
++ atomic_spinlock_t &operator=(const atomic_spinlock_t<T> &rhs);
++ };
++}
+
+ #ifndef NO_ATOMIC_OPS
+
+ // libatomic_ops implementation
++#define AO_REQUIRE_CAS
+ #include <atomic_ops.h>
+
+ // reinclude our assert to clobber the system one
+ #include "include/assert.h"
+@@ -34,9 +90,9 @@
+ class atomic_t {
+ AO_t val;
+ public:
+ atomic_t(AO_t i=0) : val(i) {}
+- void set(size_t v) {
++ void set(AO_t v) {
+ AO_store(&val, v);
+ }
+ AO_t inc() {
+ return AO_fetch_and_add1(&val) + 1;
+@@ -46,10 +102,10 @@
+ }
+ void add(AO_t add_me) {
+ AO_fetch_and_add(&val, add_me);
+ }
+- void sub(int sub_me) {
+- int negsub = 0 - sub_me;
++ void sub(AO_t sub_me) {
++ AO_t negsub = 0 - sub_me;
+ AO_fetch_and_add_write(&val, (AO_t)negsub);
+ }
+ AO_t read() const {
+ // cast away const on the pointer. this is only needed to build
+@@ -61,65 +117,26 @@
+ // forbid copying
+ atomic_t(const atomic_t &other);
+ atomic_t &operator=(const atomic_t &rhs);
+ };
++
++#if SIZEOF_AO_T == 8
++ typedef atomic_t atomic64_t;
++#else
++ typedef atomic_spinlock_t<unsigned long long> atomic64_t;
++#endif
++
+ }
++
+ #else
+ /*
+ * crappy slow implementation that uses a pthreads spinlock.
+ */
+ #include "include/Spinlock.h"
+
+ namespace ceph {
+- class atomic_t {
+- mutable ceph_spinlock_t lock;
+- signed long val;
+- public:
+- atomic_t(int i=0)
+- : val(i) {
+- ceph_spin_init(&lock);
+- }
+- ~atomic_t() {
+- ceph_spin_destroy(&lock);
+- }
+- void set(size_t v) {
+- ceph_spin_lock(&lock);
+- val = v;
+- ceph_spin_unlock(&lock);
+- }
+- int inc() {
+- ceph_spin_lock(&lock);
+- int r = ++val;
+- ceph_spin_unlock(&lock);
+- return r;
+- }
+- int dec() {
+- ceph_spin_lock(&lock);
+- int r = --val;
+- ceph_spin_unlock(&lock);
+- return r;
+- }
+- void add(int d) {
+- ceph_spin_lock(&lock);
+- val += d;
+- ceph_spin_unlock(&lock);
+- }
+- void sub(int d) {
+- ceph_spin_lock(&lock);
+- val -= d;
+- ceph_spin_unlock(&lock);
+- }
+- int read() const {
+- signed long ret;
+- ceph_spin_lock(&lock);
+- ret = val;
+- ceph_spin_unlock(&lock);
+- return ret;
+- }
+- private:
+- // forbid copying
+- atomic_t(const atomic_t &other);
+- atomic_t &operator=(const atomic_t &rhs);
+- };
++ typedef atomic_spinlock_t<unsigned> atomic_t;
++ typedef atomic_spinlock_t<unsigned long long> atomic64_t;
+ }
++
+ #endif
+ #endif
+--- a/src/include/intarith.h
++++ b/src/include/intarith.h
+@@ -27,9 +27,9 @@
+ # define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+ #endif
+
+ #ifndef ROUND_UP_TO
+-# define ROUND_UP_TO(n, d) (((n)+(d)-1) & ~((d)-1))
++# define ROUND_UP_TO(n, d) ((n)%(d) ? ((n)+(d)-(n)%(d)) : (n))
+ #endif
+
+ #ifndef SHIFT_ROUND_UP
+ # define SHIFT_ROUND_UP(x,y) (((x)+(1<<(y))-1) >> (y))
+--- a/src/include/rbd/librbd.h
++++ b/src/include/rbd/librbd.h
+@@ -38,8 +38,9 @@
+ #define LIBRBD_VERSION_CODE LIBRBD_VERSION(LIBRBD_VER_MAJOR, LIBRBD_VER_MINOR, LIBRBD_VER_EXTRA)
+
+ #define LIBRBD_SUPPORTS_WATCH 0
+ #define LIBRBD_SUPPORTS_AIO_FLUSH 1
++#define LIBRBD_SUPPORTS_INVALIDATE 1
+
+ typedef void *rbd_snap_t;
+ typedef void *rbd_image_t;
+
+@@ -375,8 +376,16 @@
+ * @returns 0 on success, negative error code on failure
+ */
+ int rbd_aio_flush(rbd_image_t image, rbd_completion_t c);
+
++/**
++ * Drop any cached data for an image
++ *
++ * @param image the image to invalidate cached data for
++ * @returns 0 on success, negative error code on failure
++ */
++int rbd_invalidate_cache(rbd_image_t image);
++
+ #ifdef __cplusplus
+ }
+ #endif
+
+--- a/src/include/rbd/librbd.hpp
++++ b/src/include/rbd/librbd.hpp
+@@ -215,8 +215,16 @@
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush(RBD::AioCompletion *c);
+
++ /**
++ * Drop any cached data for an image
++ *
++ * @param image the image to invalidate cached data for
++ * @returns 0 on success, negative error code on failure
++ */
++ int invalidate_cache();
++
+ private:
+ friend class RBD;
+
+ Image(const Image& rhs);
+--- a/src/include/str_map.h
++++ b/src/include/str_map.h
+@@ -52,8 +52,8 @@
+ * @param [out] str_map key/value pairs read from str
+ * @return **0** on success or a -EINVAL on error.
+ */
+ extern int get_str_map(const std::string &str,
+- std::stringstream &ss,
++ std::ostream &ss,
+ std::map<std::string,std::string> *str_map);
+
+ #endif
+--- a/src/init-ceph.in
++++ b/src/init-ceph.in
+@@ -30,8 +30,9 @@
+
+ usage_exit() {
+ echo "usage: $0 [options] {start|stop|restart|condrestart} [mon|osd|mds]..."
+ printf "\t-c ceph.conf\n"
++ printf "\t--cluster [cluster name]\tdefine the cluster name\n"
+ printf "\t--valgrind\trun via valgrind\n"
+ printf "\t--hostname [hostname]\toverride hostname lookup\n"
+ exit
+ }
+@@ -112,8 +113,10 @@
+ monaddr=
+ dofsmount=1
+ dofsumount=0
+ verbose=0
++use_default_conf=1
++
+
+ while echo $1 | grep -q '^-'; do # FIXME: why not '^-'?
+ case $1 in
+ -v | --verbose)
+@@ -152,10 +155,17 @@
+ --conf | -c)
+ [ -z "$2" ] && usage_exit
+ options="$options $1"
+ shift
++ use_default_conf=0
+ conf=$1
+ ;;
++ --cluster )
++ [ -z "$2" ] && usage_exit
++ options="$options $1"
++ shift
++ cluster=$1
++ ;;
+ --hostname )
+ [ -z "$2" ] && usage_exit
+ options="$options $1"
+ shift
+@@ -169,8 +179,22 @@
+ options="$options $1"
+ shift
+ done
+
++
++# if `--cluster` was not passed in, fallback to looking at the config name
++if [ -z "$cluster" ]; then
++ cluster=`echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1`
++else
++ # if we were told to use a given cluster name then $conf needs to be updated
++ # but just define it if `--conf` was not specified, otherwise we would be silently
++ # overriding $conf even if it was defined with `--conf`
++ if [ $use_default_conf -eq 1 ]; then
++ conf="/etc/ceph/$cluster.conf"
++ fi
++fi
++
++
+ verify_conf
+
+ command=$1
+ [ -n "$*" ] && shift
+@@ -188,13 +212,12 @@
+
+ for name in $what; do
+ type=`echo $name | cut -c 1-3` # e.g. 'mon', if $item is 'mon1'
+ id=`echo $name | cut -c 4- | sed 's/^\\.//'`
+- cluster=`echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1`
+ num=$id
+ name="$type.$id"
+
+- check_host || continue
++ check_host $cluster || continue
+
+ binary="$BINDIR/ceph-$type"
+ cmd="$binary -i $id"
+
+@@ -234,9 +257,9 @@
+ # conf file
+ cmd="$cmd -c $conf"
+
+ if echo $name | grep -q ^osd; then
+- get_conf osd_data "/var/lib/ceph/osd/ceph-$id" "osd data"
++ get_conf osd_data "/var/lib/ceph/osd/$cluster-$id" "osd data"
+ get_conf fs_path "$osd_data" "fs path" # mount point defaults so osd data
+ get_conf fs_devs "" "devs"
+ if [ -z "$fs_devs" ]; then
+ # try to fallback to old keys
+@@ -334,9 +357,9 @@
+ get_conf update_crush "" "osd crush update on start"
+ if [ "${update_crush:-1}" = "1" -o "${update_crush:-1}" = "true" ]; then
+ # update location in crush
+ get_conf osd_location_hook "$BINDIR/ceph-crush-location" "osd crush location hook"
+- osd_location=`$osd_location_hook --cluster ceph --id $id --type osd`
++ osd_location=`$osd_location_hook --cluster $cluster --id $id --type osd`
+ get_conf osd_weight "" "osd crush initial weight"
+ defaultweight="$(df -P -k $osd_data/. | tail -1 | awk '{ print sprintf("%.2f",$2/1073741824) }')"
+ get_conf osd_keyring "$osd_data/keyring" "keyring"
+ do_cmd "timeout 30 $BINDIR/ceph -c $conf --name=osd.$id --keyring=$osd_keyring osd crush create-or-move -- $id ${osd_weight:-${defaultweight:-1}} $osd_location"
+@@ -365,9 +388,9 @@
+ # in creating these keys.
+ get_conf mon_data "/var/lib/ceph/mon/ceph-$id" "mon data"
+ if [ "$mon_data" = "/var/lib/ceph/mon/ceph-$id" -a "$asok" = "/var/run/ceph/ceph-mon.$id.asok" ]; then
+ echo Starting ceph-create-keys on $host...
+- cmd2="$SBINDIR/ceph-create-keys -i $id 2> /dev/null &"
++ cmd2="$SBINDIR/ceph-create-keys --cluster $cluster -i $id 2> /dev/null &"
+ do_cmd "$cmd2"
+ fi
+ fi
+
+--- a/src/init-radosgw.sysv
++++ b/src/init-radosgw.sysv
+@@ -14,8 +14,9 @@
+ . /etc/rc.d/init.d/functions
+
+ daemon_is_running() {
+ daemon=$1
++ sleep 1
+ if pidof $daemon >/dev/null; then
+ echo "$daemon is running."
+ exit 0
+ else
+@@ -43,8 +44,12 @@
+ [ $VERBOSE -eq 1 ] && echo "$RADOSGW could not start, it is not executable."
+ exit 1
+ fi
+
++# detect systemd
++SYSTEMD=0
++grep -qs systemd /proc/1/comm && SYSTEMD=1
++
+ case "$1" in
+ start)
+ echo "Starting radosgw instance(s)..."
+ for name in `ceph-conf --list-sections $PREFIX`;
+@@ -78,10 +83,14 @@
+ touch "$log_file"
+ chown $user $log_file
+ fi
+
+- #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
+- daemon --user="$user" "ulimit -n 32768; $RADOSGW -n $name"
++ if [ $SYSTEMD -eq 1 ]; then
++ systemd-run -r bash -c "ulimit -n 32768; $RADOSGW -n $name"
++ else
++ #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
++ daemon --user="$user" "ulimit -n 32768; $RADOSGW -n $name"
++ fi
+ echo "Starting $name..."
+ done
+ daemon_is_running $RADOSGW
+ ;;
+--- a/src/librados/RadosClient.cc
++++ b/src/librados/RadosClient.cc
+@@ -102,10 +102,12 @@
+
+ lock.Lock();
+
+ int r = wait_for_osdmap();
+- if (r < 0)
++ if (r < 0) {
++ lock.Unlock();
+ return r;
++ }
+ int64_t ret = osdmap.lookup_pg_pool_name(name);
+ pool_cache_rwl.get_write();
+ lock.Unlock();
+ if (ret < 0) {
+@@ -581,10 +583,12 @@
+ int librados::RadosClient::pool_delete(const char *name)
+ {
+ lock.Lock();
+ int r = wait_for_osdmap();
+- if (r < 0)
++ if (r < 0) {
++ lock.Unlock();
+ return r;
++ }
+ int tmp_pool_id = osdmap.lookup_pg_pool_name(name);
+ if (tmp_pool_id < 0) {
+ lock.Unlock();
+ return -ENOENT;
+--- a/src/librbd/ImageCtx.cc
++++ b/src/librbd/ImageCtx.cc
+@@ -184,12 +184,16 @@
+ }
+
+ // size object cache appropriately
+ if (object_cacher) {
+- uint64_t obj = cct->_conf->rbd_cache_size / (1ull << order);
++ uint64_t obj = cct->_conf->rbd_cache_max_dirty_object;
++ if (!obj) {
++ obj = cct->_conf->rbd_cache_size / (1ull << order);
++ obj = obj * 4 + 10;
++ }
+ ldout(cct, 10) << " cache bytes " << cct->_conf->rbd_cache_size << " order " << (int)order
+ << " -> about " << obj << " objects" << dendl;
+- object_cacher->set_max_objects(obj * 4 + 10);
++ object_cacher->set_max_objects(obj);
+ }
+
+ ldout(cct, 10) << "init_layout stripe_unit " << stripe_unit
+ << " stripe_count " << stripe_count
+@@ -572,11 +576,11 @@
+ md_lock.put_write();
+ object_cacher->stop();
+ }
+
+- void ImageCtx::invalidate_cache() {
++ int ImageCtx::invalidate_cache() {
+ if (!object_cacher)
+- return;
++ return 0;
+ cache_lock.Lock();
+ object_cacher->release_set(object_set);
+ cache_lock.Unlock();
+ int r = flush_cache();
+@@ -584,10 +588,14 @@
+ lderr(cct) << "flush_cache returned " << r << dendl;
+ cache_lock.Lock();
+ bool unclean = object_cacher->release_set(object_set);
+ cache_lock.Unlock();
+- if (unclean)
+- lderr(cct) << "could not release all objects from cache" << dendl;
++ if (unclean) {
++ lderr(cct) << "could not release all objects from cache: "
++ << unclean << " bytes remain" << dendl;
++ return -EBUSY;
++ }
++ return r;
+ }
+
+ void ImageCtx::clear_nonexistence_cache() {
+ if (!object_cacher)
+--- a/src/librbd/ImageCtx.h
++++ b/src/librbd/ImageCtx.h
+@@ -138,9 +138,9 @@
+ void user_flushed();
+ void flush_cache_aio(Context *onfinish);
+ int flush_cache();
+ void shutdown_cache();
+- void invalidate_cache();
++ int invalidate_cache();
+ void clear_nonexistence_cache();
+ int register_watch();
+ void unregister_watch();
+ size_t parent_io_len(uint64_t offset, size_t length,
+--- a/src/librbd/internal.cc
++++ b/src/librbd/internal.cc
+@@ -831,8 +831,11 @@
+ int create(IoCtx& io_ctx, const char *imgname, uint64_t size,
+ bool old_format, uint64_t features, int *order,
+ uint64_t stripe_unit, uint64_t stripe_count)
+ {
++ if (!order)
++ return -EINVAL;
++
+ CephContext *cct = (CephContext *)io_ctx.cct();
+ ldout(cct, 20) << "create " << &io_ctx << " name = " << imgname
+ << " size = " << size << " old_format = " << old_format
+ << " features = " << features << " order = " << *order
+@@ -856,11 +859,8 @@
+ lderr(cct) << "rbd image " << imgname << " already exists" << dendl;
+ return -EEXIST;
+ }
+
+- if (!order)
+- return -EINVAL;
+-
+ if (!*order)
+ *order = cct->_conf->rbd_default_order;
+ if (!*order)
+ *order = RBD_DEFAULT_OBJ_ORDER;
+@@ -1503,9 +1503,11 @@
+ RWLock::WLocker l(ictx->md_lock);
+ if (size < ictx->size && ictx->object_cacher) {
+ // need to invalidate since we're deleting objects, and
+ // ObjectCacher doesn't track non-existent objects
+- ictx->invalidate_cache();
++ r = ictx->invalidate_cache();
++ if (r < 0)
++ return r;
+ }
+ resize_helper(ictx, size, prog_ctx);
+
+ ldout(cct, 2) << "done." << dendl;
+@@ -1846,9 +1848,11 @@
+
+ // need to flush any pending writes before resizing and rolling back -
+ // writes might create new snapshots. Rolling back will replace
+ // the current version, so we have to invalidate that too.
+- ictx->invalidate_cache();
++ r = ictx->invalidate_cache();
++ if (r < 0)
++ return r;
+
+ ldout(cct, 2) << "resizing to snapshot size..." << dendl;
+ NoOpProgressContext no_op;
+ r = resize_helper(ictx, new_size, no_op);
+@@ -2070,9 +2074,9 @@
+ << "' snap_name = '"
+ << ictx->snap_name << "'" << dendl;
+ int r = ictx->init();
+ if (r < 0)
+- return r;
++ goto err_close;
+
+ if (!ictx->read_only) {
+ r = ictx->register_watch();
+ if (r < 0) {
+@@ -2876,8 +2880,21 @@
+
+ return r;
+ }
+
++ int invalidate_cache(ImageCtx *ictx)
++ {
++ CephContext *cct = ictx->cct;
++ ldout(cct, 20) << "invalidate_cache " << ictx << dendl;
++
++ int r = ictx_check(ictx);
++ if (r < 0)
++ return r;
++
++ RWLock::WLocker l(ictx->md_lock);
++ return ictx->invalidate_cache();
++ }
++
+ int aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
+ AioCompletion *c)
+ {
+ CephContext *cct = ictx->cct;
+--- a/src/librbd/internal.h
++++ b/src/librbd/internal.h
+@@ -187,8 +187,9 @@
+ char *buf, bufferlist *pbl, AioCompletion *c);
+ int aio_flush(ImageCtx *ictx, AioCompletion *c);
+ int flush(ImageCtx *ictx);
+ int _flush(ImageCtx *ictx);
++ int invalidate_cache(ImageCtx *ictx);
+
+ ssize_t handle_sparse_read(CephContext *cct,
+ ceph::bufferlist data_bl,
+ uint64_t block_ofs,
+--- a/src/librbd/librbd.cc
++++ b/src/librbd/librbd.cc
+@@ -513,8 +513,14 @@
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::aio_flush(ictx, (librbd::AioCompletion *)c->pc);
+ }
+
++ int Image::invalidate_cache()
++ {
++ ImageCtx *ictx = (ImageCtx *)ctx;
++ return librbd::invalidate_cache(ictx);
++ }
++
+ } // namespace librbd
+
+ extern "C" void rbd_version(int *major, int *minor, int *extra)
+ {
+@@ -1129,8 +1135,14 @@
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ return librbd::aio_flush(ictx, (librbd::AioCompletion *)comp->pc);
+ }
+
++extern "C" int rbd_invalidate_cache(rbd_image_t image)
++{
++ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
++ return librbd::invalidate_cache(ictx);
++}
++
+ extern "C" int rbd_aio_is_complete(rbd_completion_t c)
+ {
+ librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+ return comp->is_complete();
+--- a/src/mds/Locker.cc
++++ b/src/mds/Locker.cc
+@@ -2061,9 +2061,15 @@
+
+ void Locker::calc_new_client_ranges(CInode *in, uint64_t size, map<client_t,client_writeable_range_t>& new_ranges)
+ {
+ inode_t *latest = in->get_projected_inode();
+- uint64_t ms = ROUND_UP_TO((size+1)<<1, latest->get_layout_size_increment());
++ uint64_t ms;
++ if(latest->has_layout()) {
++ ms = ROUND_UP_TO((size+1)<<1, latest->get_layout_size_increment());
++ } else {
++ // Layout-less directories like ~mds0/, have zero size
++ ms = 0;
++ }
+
+ // increase ranges as appropriate.
+ // shrink to 0 if no WR|BUFFER caps issued.
+ for (map<client_t,Capability*>::iterator p = in->client_caps.begin();
+--- a/src/mds/MDCache.cc
++++ b/src/mds/MDCache.cc
+@@ -348,8 +348,9 @@
+ rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;
+
+ root->inode.dirstat = rootdir->fnode.fragstat;
+ root->inode.rstat = rootdir->fnode.rstat;
++ ++root->inode.rstat.rsubdirs;
+ root->inode.accounted_rstat = root->inode.rstat;
+
+ rootdir->mark_complete();
+ rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
+@@ -398,8 +399,9 @@
+ mydir->fnode.accounted_rstat = mydir->fnode.rstat;
+
+ myin->inode.dirstat = mydir->fnode.fragstat;
+ myin->inode.rstat = mydir->fnode.rstat;
++ ++myin->inode.rstat.rsubdirs;
+ myin->inode.accounted_rstat = myin->inode.rstat;
+
+
+ mydir->mark_complete();
+--- a/src/messages/MOSDSubOp.h
++++ b/src/messages/MOSDSubOp.h
+@@ -24,9 +24,9 @@
+ */
+
+ class MOSDSubOp : public Message {
+
+- static const int HEAD_VERSION = 10;
++ static const int HEAD_VERSION = 11;
+ static const int COMPAT_VERSION = 1;
+
+ public:
+ epoch_t map_epoch;
+@@ -62,8 +62,10 @@
+ eversion_t version;
+
+ // piggybacked osd/og state
+ eversion_t pg_trim_to; // primary->replica: trim to here
++ eversion_t pg_trim_rollback_to; // primary->replica: trim rollback
++ // info to here
+ osd_peer_stat_t peer_stat;
+
+ map<string,bufferlist> attrset;
+
+@@ -174,8 +176,13 @@
+ }
+ if (header.version >= 10) {
+ ::decode(updated_hit_set_history, p);
+ }
++ if (header.version >= 11) {
++ ::decode(pg_trim_rollback_to, p);
++ } else {
++ pg_trim_rollback_to = pg_trim_to;
++ }
+ }
+
+ virtual void encode_payload(uint64_t features) {
+ ::encode(map_epoch, payload);
+@@ -223,8 +230,9 @@
+ ::encode(discard_temp_oid, payload);
+ ::encode(from, payload);
+ ::encode(pgid.shard, payload);
+ ::encode(updated_hit_set_history, payload);
++ ::encode(pg_trim_rollback_to, payload);
+ }
+
+ MOSDSubOp()
+ : Message(MSG_OSD_SUBOP, HEAD_VERSION, COMPAT_VERSION) { }
+--- a/src/mon/DataHealthService.cc
++++ b/src/mon/DataHealthService.cc
+@@ -227,9 +227,9 @@
+ if (ours.latest_avail_percent <= g_conf->mon_data_avail_warn) {
+ if (ours.latest_avail_percent != last_warned_percent)
+ mon->clog.warn()
+ << "reached concerning levels of available space on local monitor storage"
+- << " (" << ours.latest_avail_percent << "\% free)\n";
++ << " (" << ours.latest_avail_percent << "% free)\n";
+ last_warned_percent = ours.latest_avail_percent;
+ } else {
+ last_warned_percent = 0;
+ }
+--- a/src/mon/MonCommands.h
++++ b/src/mon/MonCommands.h
+@@ -551,9 +551,9 @@
+ "name=destpool,type=CephPoolname", \
+ "rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
+ COMMAND("osd pool get " \
+ "name=pool,type=CephPoolname " \
+- "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid", \
++ "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile", \
+ "get pool parameter <var>", "osd", "r", "cli,rest")
+ COMMAND("osd pool set " \
+ "name=pool,type=CephPoolname " \
+ "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid " \
+@@ -567,8 +567,12 @@
+ "name=pool,type=CephPoolname " \
+ "name=field,type=CephChoices,strings=max_objects|max_bytes " \
+ "name=val,type=CephString",
+ "set object or byte limit on pool", "osd", "rw", "cli,rest")
++COMMAND("osd pool get-quota " \
++ "name=pool,type=CephPoolname ",
++ "obtain object or byte limits for pool",
++ "osd", "r", "cli,rest")
+ COMMAND("osd pool stats " \
+ "name=name,type=CephString,req=false",
+ "obtain stats from all pools, or from specified pool",
+ "osd", "r", "cli,rest")
+--- a/src/mon/Monitor.cc
++++ b/src/mon/Monitor.cc
+@@ -620,8 +620,23 @@
+
+ void Monitor::refresh_from_paxos(bool *need_bootstrap)
+ {
+ dout(10) << __func__ << dendl;
++
++ bufferlist bl;
++ int r = store->get(MONITOR_NAME, "cluster_fingerprint", bl);
++ if (r >= 0) {
++ try {
++ bufferlist::iterator p = bl.begin();
++ ::decode(fingerprint, p);
++ }
++ catch (buffer::error& e) {
++ dout(10) << __func__ << " failed to decode cluster_fingerprint" << dendl;
++ }
++ } else {
++ dout(10) << __func__ << " no cluster_fingerprint" << dendl;
++ }
++
+ for (int i = 0; i < PAXOS_NUM; ++i) {
+ paxos_service[i]->refresh(need_bootstrap);
+ }
+ for (int i = 0; i < PAXOS_NUM; ++i) {
+@@ -2392,8 +2407,9 @@
+ // this must be formatted, in its current form
+ if (!f)
+ f.reset(new_formatter("json-pretty"));
+ f->open_object_section("report");
++ f->dump_stream("cluster_fingerprint") << fingerprint;
+ f->dump_string("version", ceph_version_to_str());
+ f->dump_string("commit", git_version_to_str());
+ f->dump_stream("timestamp") << ceph_clock_now(NULL);
+
+@@ -2865,10 +2881,11 @@
+ // let it go through and be dispatched immediately!
+ return dispatch(s, m, false);
+ }
+ dout(1) << __func__ << " dropping stray message " << *m
+- << " from " << m->get_source_inst() << dendl;
+- return false;
++ << " from " << m->get_source_inst() << dendl;
++ m->put();
++ return true;
+ }
+
+ if (!exited_quorum.is_zero() && !src_is_mon) {
+ waitlist_or_zap_client(m);
+@@ -3846,11 +3863,31 @@
+ if (!maybe_wait_for_quorum.empty()) {
+ finish_contexts(g_ceph_context, maybe_wait_for_quorum);
+ }
+
++ if (is_leader() && paxos->is_active() && fingerprint.is_zero()) {
++ // this is only necessary on upgraded clusters.
++ MonitorDBStore::Transaction t;
++ prepare_new_fingerprint(&t);
++ bufferlist tbl;
++ t.encode(tbl);
++ paxos->propose_new_value(tbl, new C_NoopContext);
++ }
++
+ new_tick();
+ }
+
++void Monitor::prepare_new_fingerprint(MonitorDBStore::Transaction *t)
++{
++ uuid_d nf;
++ nf.generate_random();
++ dout(10) << __func__ << " proposing cluster_fingerprint " << nf << dendl;
++
++ bufferlist bl;
++ ::encode(nf, bl);
++ t->put(MONITOR_NAME, "cluster_fingerprint", bl);
++}
++
+ int Monitor::check_fsid()
+ {
+ if (!store->exists(MONITOR_NAME, "cluster_uuid"))
+ return -ENOENT;
+--- a/src/mon/Monitor.h
++++ b/src/mon/Monitor.h
+@@ -127,8 +127,9 @@
+ void register_cluster_logger();
+ void unregister_cluster_logger();
+
+ MonMap *monmap;
++ uuid_d fingerprint;
+
+ set<entity_addr_t> extra_probe_peers;
+
+ LogClient clog;
+@@ -189,8 +190,10 @@
+ bool is_peon() const { return state == STATE_PEON; }
+
+ const utime_t &get_leader_since() const;
+
++ void prepare_new_fingerprint(MonitorDBStore::Transaction *t);
++
+ // -- elector --
+ private:
+ Paxos *paxos;
+ Elector elector;
+--- a/src/mon/MonmapMonitor.cc
++++ b/src/mon/MonmapMonitor.cc
+@@ -96,8 +96,13 @@
+ pending_map.encode(bl, mon->get_quorum_features());
+
+ put_version(t, pending_map.epoch, bl);
+ put_last_committed(t, pending_map.epoch);
++
++ // generate a cluster fingerprint, too?
++ if (pending_map.epoch == 1) {
++ mon->prepare_new_fingerprint(t);
++ }
+ }
+
+ void MonmapMonitor::on_active()
+ {
+--- a/src/mon/OSDMonitor.cc
++++ b/src/mon/OSDMonitor.cc
+@@ -2066,8 +2066,34 @@
+ }
+ }
+ }
+
++ // hit_set-less cache_mode?
++ if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
++ int problem_cache_pools = 0;
++ for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
++ p != osdmap.pools.end();
++ ++p) {
++ const pg_pool_t& info = p->second;
++ if (info.cache_mode_requires_hit_set() &&
++ info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
++ ++problem_cache_pools;
++ if (detail) {
++ ostringstream ss;
++ ss << "pool '" << osdmap.get_pool_name(p->first)
++ << "' with cache_mode " << info.get_cache_mode_name()
++ << " needs hit_set_type to be set but it is not";
++ detail->push_back(make_pair(HEALTH_WARN, ss.str()));
++ }
++ }
++ }
++ if (problem_cache_pools) {
++ ostringstream ss;
++ ss << problem_cache_pools << " cache pools are missing hit_sets";
++ summary.push_back(make_pair(HEALTH_WARN, ss.str()));
++ }
++ }
++
+ // Warn if 'mon_osd_down_out_interval' is set to zero.
+ // Having this option set to zero on the leader acts much like the
+ // 'noout' flag. It's hard to figure out what's going wrong with clusters
+ // without the 'noout' flag set but acting like that just the same, so
+@@ -2452,8 +2478,28 @@
+ const pg_pool_t *p = osdmap.get_pg_pool(pool);
+ string var;
+ cmd_getval(g_ceph_context, cmdmap, "var", var);
+
++ if (!p->is_tier() &&
++ (var == "hit_set_type" || var == "hit_set_period" ||
++ var == "hit_set_count" || var == "hit_set_fpp" ||
++ var == "target_max_objects" || var == "target_max_bytes" ||
++ var == "cache_target_full_ratio" ||
++ var == "cache_target_dirty_ratio" ||
++ var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
++ ss << "pool '" << poolstr
++ << "' is not a tier pool: variable not applicable";
++ r = -EACCES;
++ goto reply;
++ }
++
++ if (!p->is_erasure() && var == "erasure_code_profile") {
++ ss << "pool '" << poolstr
++ << "' is not a erasure pool: variable not applicable";
++ r = -EACCES;
++ goto reply;
++ }
++
+ if (f) {
+ f->open_object_section("pool");
+ f->dump_string("pool", poolstr);
+ f->dump_int("pool_id", pool);
+@@ -2487,8 +2533,28 @@
+ } else {
+ BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
+ f->dump_float("hit_set_fpp", bloomp->get_fpp());
+ }
++ } else if (var == "target_max_objects") {
++ f->dump_unsigned("target_max_objects", p->target_max_objects);
++ } else if (var == "target_max_bytes") {
++ f->dump_unsigned("target_max_bytes", p->target_max_bytes);
++ } else if (var == "cache_target_dirty_ratio") {
++ f->dump_unsigned("cache_target_dirty_ratio_micro",
++ p->cache_target_dirty_ratio_micro);
++ f->dump_float("cache_target_dirty_ratio",
++ ((float)p->cache_target_dirty_ratio_micro/1000000));
++ } else if (var == "cache_target_full_ratio") {
++ f->dump_unsigned("cache_target_full_ratio_micro",
++ p->cache_target_full_ratio_micro);
++ f->dump_float("cache_target_full_ratio",
++ ((float)p->cache_target_full_ratio_micro/1000000));
++ } else if (var == "cache_min_flush_age") {
++ f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
++ } else if (var == "cache_min_evict_age") {
++ f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
++ } else if (var == "erasure_code_profile") {
++ f->dump_string("erasure_code_profile", p->erasure_code_profile);
+ }
+
+ f->close_section();
+ f->flush(rdata);
+@@ -2520,9 +2586,26 @@
+ goto reply;
+ }
+ BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
+ ss << "hit_set_fpp: " << bloomp->get_fpp();
++ } else if (var == "target_max_objects") {
++ ss << "target_max_objects: " << p->target_max_objects;
++ } else if (var == "target_max_bytes") {
++ ss << "target_max_bytes: " << p->target_max_bytes;
++ } else if (var == "cache_target_dirty_ratio") {
++ ss << "cache_target_dirty_ratio: "
++ << ((float)p->cache_target_dirty_ratio_micro/1000000);
++ } else if (var == "cache_target_full_ratio") {
++ ss << "cache_target_full_ratio: "
++ << ((float)p->cache_target_full_ratio_micro/1000000);
++ } else if (var == "cache_min_flush_age") {
++ ss << "cache_min_flush_age: " << p->cache_min_flush_age;
++ } else if (var == "cache_min_evict_age") {
++ ss << "cache_min_evict_age: " << p->cache_min_evict_age;
++ } else if (var == "erasure_code_profile") {
++ ss << "erasure_code_profile: " << p->erasure_code_profile;
+ }
++
+ rdata.append(ss);
+ ss.str("");
+ }
+ r = 0;
+@@ -2625,8 +2708,47 @@
+ }
+ rdata.append("\n");
+ r = 0;
+
++ } else if (prefix == "osd pool get-quota") {
++ string pool_name;
++ cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
++
++ int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
++ if (poolid < 0) {
++ assert(poolid == -ENOENT);
++ ss << "unrecognized pool '" << pool_name << "'";
++ r = -ENOENT;
++ goto reply;
++ }
++ const pg_pool_t *p = osdmap.get_pg_pool(poolid);
++
++ if (f) {
++ f->open_object_section("pool_quotas");
++ f->dump_string("pool_name", pool_name);
++ f->dump_unsigned("pool_id", poolid);
++ f->dump_unsigned("quota_max_objects", p->quota_max_objects);
++ f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
++ f->close_section();
++ f->flush(rdata);
++ } else {
++ stringstream rs;
++ rs << "quotas for pool '" << pool_name << "':\n"
++ << " max objects: ";
++ if (p->quota_max_objects == 0)
++ rs << "N/A";
++ else
++ rs << si_t(p->quota_max_objects) << " objects";
++ rs << "\n"
++ << " max bytes : ";
++ if (p->quota_max_bytes == 0)
++ rs << "N/A";
++ else
++ rs << si_t(p->quota_max_bytes) << "B";
++ rdata.append(rs.str());
++ }
++ rdata.append("\n");
++ r = 0;
+ } else if (prefix == "osd crush rule list" ||
+ prefix == "osd crush rule ls") {
+ string format;
+ cmd_getval(g_ceph_context, cmdmap, "format", format, string("json-pretty"));
+@@ -2924,17 +3046,20 @@
+ const string &profile,
+ int *ruleset,
+ stringstream &ss)
+ {
+- *ruleset = osdmap.crush->get_rule_id(name);
+- if (*ruleset != -ENOENT)
++ int ruleid = osdmap.crush->get_rule_id(name);
++ if (ruleid != -ENOENT) {
++ *ruleset = osdmap.crush->get_rule_mask_ruleset(ruleid);
+ return -EEXIST;
++ }
+
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+
+- *ruleset = newcrush.get_rule_id(name);
+- if (*ruleset != -ENOENT) {
++ ruleid = newcrush.get_rule_id(name);
++ if (ruleid != -ENOENT) {
++ *ruleset = newcrush.get_rule_mask_ruleset(ruleid);
+ return -EALREADY;
+ } else {
+ ErasureCodeInterfaceRef erasure_code;
+ int err = get_erasure_code(profile, &erasure_code, ss);
+@@ -3088,22 +3213,25 @@
+ }
+
+ int OSDMonitor::prepare_pool_size(const unsigned pool_type,
+ const string &erasure_code_profile,
+- unsigned *size,
++ unsigned *size, unsigned *min_size,
+ stringstream &ss)
+ {
+ int err = 0;
+ switch (pool_type) {
+ case pg_pool_t::TYPE_REPLICATED:
+ *size = g_conf->osd_pool_default_size;
++ *min_size = g_conf->get_osd_pool_default_min_size();
+ break;
+ case pg_pool_t::TYPE_ERASURE:
+ {
+ ErasureCodeInterfaceRef erasure_code;
+ err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
+- if (err == 0)
++ if (err == 0) {
+ *size = erasure_code->get_chunk_count();
++ *min_size = erasure_code->get_data_chunk_count();
++ }
+ }
+ break;
+ default:
+ ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
+@@ -3218,10 +3346,10 @@
+ r = prepare_pool_crush_ruleset(pool_type, erasure_code_profile,
+ crush_ruleset_name, &crush_ruleset, ss);
+ if (r)
+ return r;
+- unsigned size;
+- r = prepare_pool_size(pool_type, erasure_code_profile, &size, ss);
++ unsigned size, min_size;
++ r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
+ if (r)
+ return r;
+ uint32_t stripe_width = 0;
+ r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
+@@ -3245,9 +3373,9 @@
+ if (g_conf->osd_pool_default_flag_hashpspool)
+ pi->flags |= pg_pool_t::FLAG_HASHPSPOOL;
+
+ pi->size = size;
+- pi->min_size = g_conf->get_osd_pool_default_min_size();
++ pi->min_size = min_size;
+ pi->crush_ruleset = crush_ruleset;
+ pi->object_hash = CEPH_STR_HASH_RJENKINS;
+ pi->set_pg_num(pg_num ? pg_num : g_conf->osd_pool_default_pg_num);
+ pi->set_pgp_num(pgp_num ? pgp_num : g_conf->osd_pool_default_pgp_num);
+@@ -3335,8 +3463,9 @@
+ string val;
+ string interr, floaterr;
+ int64_t n = 0;
+ double f = 0;
++ int64_t uf = 0; // micro-f
+ if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
+ // wasn't a string; maybe an older mon forwarded json with an int?
+ if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
+ return -EINVAL; // no value!
+@@ -3344,8 +3473,19 @@
+ // we got a string. see if it contains an int.
+ n = strict_strtoll(val.c_str(), 10, &interr);
+ // or a float
+ f = strict_strtod(val.c_str(), &floaterr);
++ uf = llrintl(f * (double)1000000.0);
++ }
++
++ if (!p.is_tier() &&
++ (var == "hit_set_type" || var == "hit_set_period" ||
++ var == "hit_set_count" || var == "hit_set_fpp" ||
++ var == "target_max_objects" || var == "target_max_bytes" ||
++ var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
++ var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
++ ss << "pool '" << poolstr << "' is not a tier pool: variable not applicable";
++ return -EACCES;
+ }
+
+ if (var == "size") {
+ if (p.type == pg_pool_t::TYPE_ERASURE) {
+@@ -3398,9 +3538,9 @@
+ force != "--yes-i-really-mean-it") {
+ ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
+ return -EPERM;
+ }
+- int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
++ int expected_osds = MAX(1, MIN(p.get_pg_num(), osdmap.get_num_osds()));
+ int64_t new_pgs = n - p.get_pg_num();
+ int64_t pgs_per_osd = new_pgs / expected_osds;
+ if (pgs_per_osd > g_conf->mon_osd_max_split_count) {
+ ss << "specified pg_num " << n << " is too large (creating "
+@@ -3486,8 +3626,9 @@
+ return -EINVAL;
+ }
+ p.hit_set_period = n;
+ } else if (var == "hit_set_count") {
++
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+@@ -3527,9 +3668,9 @@
+ if (f < 0 || f > 1.0) {
+ ss << "value must be in the range 0..1";
+ return -ERANGE;
+ }
+- p.cache_target_dirty_ratio_micro = f * 1000000;
++ p.cache_target_dirty_ratio_micro = uf;
+ } else if (var == "cache_target_full_ratio") {
+ if (floaterr.length()) {
+ ss << "error parsing float '" << val << "': " << floaterr;
+ return -EINVAL;
+@@ -3537,9 +3678,9 @@
+ if (f < 0 || f > 1.0) {
+ ss << "value must be in the range 0..1";
+ return -ERANGE;
+ }
+- p.cache_target_full_ratio_micro = f * 1000000;
++ p.cache_target_full_ratio_micro = uf;
+ } else if (var == "cache_min_flush_age") {
+ if (interr.length()) {
+ ss << "error parsing int '" << val << "': " << interr;
+ return -EINVAL;
+@@ -4171,8 +4312,26 @@
+ string profile;
+ cmd_getval(g_ceph_context, cmdmap, "profile", profile);
+ if (profile == "")
+ profile = "default";
++ if (profile == "default") {
++ if (!osdmap.has_erasure_code_profile(profile)) {
++ if (pending_inc.has_erasure_code_profile(profile)) {
++ dout(20) << "erasure code profile " << profile << " already pending" << dendl;
++ goto wait;
++ }
++
++ map<string,string> profile_map;
++ err = osdmap.get_erasure_code_profile_default(g_ceph_context,
++ profile_map,
++ &ss);
++ if (err)
++ goto reply;
++ dout(20) << "erasure code profile " << profile << " set" << dendl;
++ pending_inc.set_erasure_code_profile(profile, profile_map);
++ goto wait;
++ }
++ }
+
+ int ruleset;
+ err = crush_ruleset_create_erasure(name, profile, &ruleset, ss);
+ if (err < 0) {
+@@ -4846,8 +5005,27 @@
+ string erasure_code_profile;
+ cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
+ if (erasure_code_profile == "")
+ erasure_code_profile = "default";
++ if (erasure_code_profile == "default") {
++ if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
++ if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
++ dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
++ goto wait;
++ }
++
++ map<string,string> profile_map;
++ err = osdmap.get_erasure_code_profile_default(g_ceph_context,
++ profile_map,
++ &ss);
++ if (err)
++ goto reply;
++ dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
++ pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
++ goto wait;
++ }
++ }
++
+ if (ruleset_name == "") {
+ if (erasure_code_profile == "default") {
+ ruleset_name = "erasure-code";
+ } else {
+@@ -5053,9 +5231,12 @@
+ err = 0;
+ goto reply;
+ }
+ if (tp->tier_of != pool_id) {
+- ss << "tier pool '" << tierpoolstr << "' is a tier of '" << tp->tier_of << "'";
++ ss << "tier pool '" << tierpoolstr << "' is a tier of '"
++ << osdmap.get_pool_name(tp->tier_of) << "': "
++ // be scary about it; this is an inconsistency and bells must go off
++ << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (p->read_tier == tierpool_id) {
+@@ -5181,10 +5362,69 @@
+ ss << "'" << modestr << "' is not a valid cache mode";
+ err = -EINVAL;
+ goto reply;
+ }
++
++ // pool already has this cache-mode set and there are no pending changes
++ if (p->cache_mode == mode &&
++ (pending_inc.new_pools.count(pool_id) == 0 ||
++ pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
++ ss << "set cache-mode for pool '" << poolstr << "'"
++ << " to " << pg_pool_t::get_cache_mode_name(mode);
++ err = 0;
++ goto reply;
++ }
++
++ /* Mode description:
++ *
++ * none: No cache-mode defined
++ * forward: Forward all reads and writes to base pool
++ * writeback: Cache writes, promote reads from base pool
++ * readonly: Forward writes to base pool
++ *
++ * Hence, these are the allowed transitions:
++ *
++ * none -> any
++ * forward -> writeback || any IF num_objects_dirty == 0
++ * writeback -> forward
++ * readonly -> any
++ */
++
++ // We check if the transition is valid against the current pool mode, as
++ // it is the only committed state thus far. We will blantly squash
++ // whatever mode is on the pending state.
++
++ if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
++ mode != pg_pool_t::CACHEMODE_FORWARD) {
++ ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
++ << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
++ << "' pool; only '"
++ << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
++ << "' allowed.";
++ err = -EINVAL;
++ goto reply;
++ }
++ if (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
++ mode != pg_pool_t::CACHEMODE_WRITEBACK) {
++
++ const pool_stat_t& tier_stats =
++ mon->pgmon()->pg_map.get_pg_pool_sum_stat(pool_id);
++
++ if (tier_stats.stats.sum.num_objects_dirty > 0) {
++ ss << "unable to set cache-mode '"
++ << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
++ << "': dirty objects found";
++ err = -EBUSY;
++ goto reply;
++ }
++ }
++
+ // go
+- pending_inc.get_new_pool(pool_id, p)->cache_mode = mode;
++ pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
++ np->cache_mode = mode;
++ // set this both when moving to and from cache_mode NONE. this is to
++ // capture legacy pools that were set up before this flag existed.
++ np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
+ ss << "set cache-mode for pool '" << poolstr
+ << "' to " << pg_pool_t::get_cache_mode_name(mode);
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
+ get_last_committed() + 1));
+@@ -5622,10 +5862,14 @@
+ << osdmap.get_pool_name(p->tier_of) << "'";
+ return -EBUSY;
+ }
+ if (!p->tiers.empty()) {
+- *ss << "pool '" << poolstr << "' includes tiers "
+- << p->tiers;
++ *ss << "pool '" << poolstr << "' has tiers";
++ for(std::set<uint64_t>::iterator i = p->tiers.begin(); i != p->tiers.end(); ++i) {
++ const char *name = osdmap.get_pool_name(*i);
++ assert(name != NULL);
++ *ss << " " << name;
++ }
+ return -EBUSY;
+ }
+ *ss << "pool '" << poolstr << "' removed";
+ return 0;
+--- a/src/mon/OSDMonitor.h
++++ b/src/mon/OSDMonitor.h
+@@ -271,9 +271,9 @@
+ map<string,string> *erasure_code_profile_map,
+ stringstream &ss);
+ int prepare_pool_size(const unsigned pool_type,
+ const string &erasure_code_profile,
+- unsigned *size,
++ unsigned *size, unsigned *min_size,
+ stringstream &ss);
+ int prepare_pool_stripe_width(const unsigned pool_type,
+ const string &erasure_code_profile,
+ unsigned *stripe_width,
+--- a/src/mon/PGMonitor.cc
++++ b/src/mon/PGMonitor.cc
+@@ -1214,13 +1214,15 @@
+ }
+
+ //void PGMonitor::dump_object_stat_sum(stringstream& ss, Formatter *f,
+ void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f,
+- object_stat_sum_t &sum, bool verbose)
++ object_stat_sum_t &sum, uint64_t avail,
++ bool verbose)
+ {
+ if (f) {
+ f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
+ f->dump_int("bytes_used", sum.num_bytes);
++ f->dump_unsigned("max_avail", avail);
+ f->dump_int("objects", sum.num_objects);
+ if (verbose) {
+ f->dump_int("dirty", sum.num_objects_dirty);
+ f->dump_int("rd", sum.num_rd);
+@@ -1231,8 +1233,9 @@
+ } else {
+ tbl << stringify(si_t(sum.num_bytes));
+ int64_t kb_used = SHIFT_ROUND_UP(sum.num_bytes, 10);
+ tbl << percentify(((float)kb_used / pg_map.osd_sum.kb)*100);
++ tbl << si_t(avail);
+ tbl << sum.num_objects;
+ if (verbose) {
+ tbl << stringify(si_t(sum.num_objects_dirty))
+ << stringify(si_t(sum.num_rd))
+@@ -1240,8 +1243,26 @@
+ }
+ }
+ }
+
++int64_t PGMonitor::get_rule_avail(OSDMap& osdmap, int ruleno)
++{
++ map<int,float> wm;
++ int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
++ if (r < 0)
++ return r;
++ if(wm.size() == 0)
++ return 0;
++ int64_t min = -1;
++ for (map<int,float>::iterator p = wm.begin(); p != wm.end(); ++p) {
++ int64_t proj = (float)(pg_map.osd_stat[p->first].kb_avail * 1024ull) /
++ (double)p->second;
++ if (min < 0 || proj < min)
++ min = proj;
++ }
++ return min;
++}
++
+ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
+ {
+ TextTable tbl;
+
+@@ -1251,18 +1272,20 @@
+ tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
+ if (verbose)
+ tbl.define_column("CATEGORY", TextTable::LEFT, TextTable::LEFT);
+- tbl.define_column("USED", TextTable::LEFT, TextTable::LEFT);
+- tbl.define_column("\%USED", TextTable::LEFT, TextTable::LEFT);
+- tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::LEFT);
+- if (verbose) {
+- tbl.define_column("DIRTY", TextTable::LEFT, TextTable::LEFT);
+- tbl.define_column("READ", TextTable::LEFT, TextTable::LEFT);
+- tbl.define_column("WRITE", TextTable::LEFT, TextTable::LEFT);
++ tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
++ tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
++ tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
++ tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
++ if (verbose) {
++ tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
++ tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
++ tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
+ }
+ }
+
++ map<int,uint64_t> avail_by_rule;
+ OSDMap &osdmap = mon->osdmon()->osdmap;
+ for (map<int64_t,pg_pool_t>::const_iterator p = osdmap.get_pools().begin();
+ p != osdmap.get_pools().end(); ++p) {
+ int64_t pool_id = p->first;
+@@ -1270,8 +1293,40 @@
+ continue;
+ string pool_name = osdmap.get_pool_name(pool_id);
+ pool_stat_t &stat = pg_map.pg_pool_sum[pool_id];
+
++ const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
++ int ruleno = osdmap.crush->find_rule(pool->get_crush_ruleset(),
++ pool->get_type(),
++ pool->get_size());
++ uint64_t avail;
++ if (avail_by_rule.count(ruleno) == 0) {
++ avail = get_rule_avail(osdmap, ruleno);
++ avail_by_rule[ruleno] = avail;
++ } else {
++ avail = avail_by_rule[ruleno];
++ }
++ switch (pool->get_type()) {
++ case pg_pool_t::TYPE_REPLICATED:
++ avail /= pool->get_size();
++ break;
++ case pg_pool_t::TYPE_ERASURE:
++ {
++ const map<string,string>& ecp =
++ osdmap.get_erasure_code_profile(pool->erasure_code_profile);
++ map<string,string>::const_iterator pm = ecp.find("m");
++ map<string,string>::const_iterator pk = ecp.find("k");
++ if (pm != ecp.end() && pk != ecp.end()) {
++ int k = atoi(pk->second.c_str());
++ int m = atoi(pm->second.c_str());
++ avail = avail * k / (m + k);
++ }
++ }
++ break;
++ default:
++ assert(0 == "unrecognized pool type");
++ }
++
+ if (f) {
+ f->open_object_section("pool");
+ f->dump_string("name", pool_name);
+ f->dump_int("id", pool_id);
+@@ -1281,9 +1336,9 @@
+ << pool_id;
+ if (verbose)
+ tbl << "-";
+ }
+- dump_object_stat_sum(tbl, f, stat.stats.sum, verbose);
++ dump_object_stat_sum(tbl, f, stat.stats.sum, avail, verbose);
+ if (f)
+ f->close_section(); // stats
+ else
+ tbl << TextTable::endrow;
+@@ -1300,9 +1355,9 @@
+ tbl << ""
+ << ""
+ << it->first;
+ }
+- dump_object_stat_sum(tbl, f, it->second, verbose);
++ dump_object_stat_sum(tbl, f, it->second, avail, verbose);
+ if (f)
+ f->close_section(); // category name
+ else
+ tbl << TextTable::endrow;
+@@ -1334,14 +1389,14 @@
+ }
+ f->close_section();
+ } else {
+ TextTable tbl;
+- tbl.define_column("SIZE", TextTable::LEFT, TextTable::LEFT);
+- tbl.define_column("AVAIL", TextTable::LEFT, TextTable::LEFT);
+- tbl.define_column("RAW USED", TextTable::LEFT, TextTable::LEFT);
+- tbl.define_column("\%RAW USED", TextTable::LEFT, TextTable::LEFT);
++ tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
++ tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
++ tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
++ tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
+ if (verbose) {
+- tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::LEFT);
++ tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
+ }
+ tbl << stringify(si_t(pg_map.osd_sum.kb*1024))
+ << stringify(si_t(pg_map.osd_sum.kb_avail*1024))
+ << stringify(si_t(pg_map.osd_sum.kb_used*1024));
+--- a/src/mon/PGMonitor.h
++++ b/src/mon/PGMonitor.h
+@@ -145,9 +145,13 @@
+ int threshold,
+ vector<string>& args) const;
+
+ void dump_object_stat_sum(TextTable &tbl, Formatter *f,
+- object_stat_sum_t &sum, bool verbose);
++ object_stat_sum_t &sum,
++ uint64_t avail,
++ bool verbose);
++
++ int64_t get_rule_avail(OSDMap& osdmap, int ruleno);
+
+ public:
+ PGMonitor(Monitor *mn, Paxos *p, const string& service_name)
+ : PaxosService(mn, p, service_name),
+--- a/src/mon/Paxos.cc
++++ b/src/mon/Paxos.cc
+@@ -1263,9 +1263,10 @@
+ // -- READ --
+
+ bool Paxos::is_readable(version_t v)
+ {
+- dout(1) << "is_readable now=" << ceph_clock_now(g_ceph_context) << " lease_expire=" << lease_expire
++ dout(5) << "is_readable now=" << ceph_clock_now(g_ceph_context)
++ << " lease_expire=" << lease_expire
+ << " has v" << v << " lc " << last_committed << dendl;
+ if (v > last_committed)
+ return false;
+ return
+--- a/src/msg/SimpleMessenger.cc
++++ b/src/msg/SimpleMessenger.cc
+@@ -85,8 +85,11 @@
+ {
+ ldout(cct,10) << "shutdown " << get_myaddr() << dendl;
+ mark_down_all();
+ dispatch_queue.shutdown();
++
++ // break ref cycles on the loopback connection
++ local_connection->set_priv(NULL);
+ return 0;
+ }
+
+ int SimpleMessenger::_send_message(Message *m, const entity_inst_t& dest,
+--- a/src/os/FileJournal.cc
++++ b/src/os/FileJournal.cc
+@@ -1757,9 +1757,14 @@
+
+ // ok!
+ if (seq)
+ *seq = h->seq;
+- journalq.push_back(pair<uint64_t,off64_t>(h->seq, pos));
++
++ // works around an apparent GCC 4.8(?) compiler bug about unaligned
++ // bind by reference to (packed) h->seq
++ journalq.push_back(
++ pair<uint64_t,off64_t>(static_cast<uint64_t>(h->seq),
++ static_cast<off64_t>(pos)));
+
+ if (next_pos)
+ *next_pos = pos;
+
+--- a/src/os/FileStore.cc
++++ b/src/os/FileStore.cc
+@@ -125,9 +125,9 @@
+ PerfCounters &logger)
+ {
+ os_commit_latency.consume_next(
+ logger.get_tavg_ms(
+- l_os_commit_lat));
++ l_os_j_lat));
+ os_apply_latency.consume_next(
+ logger.get_tavg_ms(
+ l_os_apply_lat));
+ }
+@@ -1557,8 +1557,10 @@
+ delete backend;
+ backend = generic_backend;
+ }
+
++ force_sync = false;
++
+ object_map.reset();
+
+ {
+ Mutex::Locker l(sync_entry_timeo_lock);
+@@ -1710,9 +1712,10 @@
+ }
+
+ void FileStore::_finish_op(OpSequencer *osr)
+ {
+- Op *o = osr->dequeue();
++ list<Context*> to_queue;
++ Op *o = osr->dequeue(&to_queue);
+
+ dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << dendl;
+ osr->apply_lock.Unlock(); // locked in _do_op
+
+@@ -1728,8 +1731,9 @@
+ }
+ if (o->onreadable) {
+ op_finisher.queue(o->onreadable);
+ }
++ op_finisher.queue(to_queue);
+ delete o;
+ }
+
+
+@@ -1843,16 +1847,18 @@
+
+ // this should queue in order because the journal does it's completions in order.
+ queue_op(osr, o);
+
+- osr->dequeue_journal();
++ list<Context*> to_queue;
++ osr->dequeue_journal(&to_queue);
+
+ // do ondisk completions async, to prevent any onreadable_sync completions
+ // getting blocked behind an ondisk completion.
+ if (ondisk) {
+ dout(10) << " queueing ondisk " << ondisk << dendl;
+ ondisk_finisher.queue(ondisk);
+ }
++ ondisk_finisher.queue(to_queue);
+ }
+
+ int FileStore::_do_transactions(
+ list<Transaction*> &tls,
+@@ -2544,13 +2550,14 @@
+ t.dump(&f);
+ f.close_section();
+ f.flush(*_dout);
+ *_dout << dendl;
+- assert(0 == "unexpected error");
+
+ if (r == -EMFILE) {
+ dump_open_fds(g_ceph_context);
+ }
++
++ assert(0 == "unexpected error");
+ }
+ }
+
+ spos.op++;
+--- a/src/os/FileStore.h
++++ b/src/os/FileStore.h
+@@ -192,21 +192,72 @@
+ class OpSequencer : public Sequencer_impl {
+ Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
+ list<Op*> q;
+ list<uint64_t> jq;
++ list<pair<uint64_t, Context*> > flush_commit_waiters;
+ Cond cond;
+ public:
+ Sequencer *parent;
+ Mutex apply_lock; // for apply mutual exclusion
+
++ /// get_max_uncompleted
++ bool _get_max_uncompleted(
++ uint64_t *seq ///< [out] max uncompleted seq
++ ) {
++ assert(qlock.is_locked());
++ assert(seq);
++ *seq = 0;
++ if (q.empty() && jq.empty())
++ return true;
++
++ if (!q.empty())
++ *seq = q.back()->op;
++ if (!jq.empty() && jq.back() > *seq)
++ *seq = jq.back();
++
++ return false;
++ } /// @returns true if both queues are empty
++
++ /// get_min_uncompleted
++ bool _get_min_uncompleted(
++ uint64_t *seq ///< [out] min uncompleted seq
++ ) {
++ assert(qlock.is_locked());
++ assert(seq);
++ *seq = 0;
++ if (q.empty() && jq.empty())
++ return true;
++
++ if (!q.empty())
++ *seq = q.front()->op;
++ if (!jq.empty() && jq.front() < *seq)
++ *seq = jq.front();
++
++ return false;
++ } /// @returns true if both queues are empty
++
++ void _wake_flush_waiters(list<Context*> *to_queue) {
++ uint64_t seq;
++ if (_get_min_uncompleted(&seq))
++ seq = -1;
++
++ for (list<pair<uint64_t, Context*> >::iterator i =
++ flush_commit_waiters.begin();
++ i != flush_commit_waiters.end() && i->first < seq;
++ flush_commit_waiters.erase(i++)) {
++ to_queue->push_back(i->second);
++ }
++ }
++
+ void queue_journal(uint64_t s) {
+ Mutex::Locker l(qlock);
+ jq.push_back(s);
+ }
+- void dequeue_journal() {
++ void dequeue_journal(list<Context*> *to_queue) {
+ Mutex::Locker l(qlock);
+ jq.pop_front();
+ cond.Signal();
++ _wake_flush_waiters(to_queue);
+ }
+ void queue(Op *o) {
+ Mutex::Locker l(qlock);
+ q.push_back(o);
+@@ -214,22 +265,28 @@
+ Op *peek_queue() {
+ assert(apply_lock.is_locked());
+ return q.front();
+ }
+- Op *dequeue() {
++
++ Op *dequeue(list<Context*> *to_queue) {
++ assert(to_queue);
+ assert(apply_lock.is_locked());
+ Mutex::Locker l(qlock);
+ Op *o = q.front();
+ q.pop_front();
+ cond.Signal();
++
++ _wake_flush_waiters(to_queue);
+ return o;
+ }
++
+ void flush() {
+ Mutex::Locker l(qlock);
+
+ while (g_conf->filestore_blackhole)
+ cond.Wait(qlock); // wait forever
+
++
+ // get max for journal _or_ op queues
+ uint64_t seq = 0;
+ if (!q.empty())
+ seq = q.back()->op;
+@@ -242,8 +299,19 @@
+ (!jq.empty() && jq.front() <= seq))
+ cond.Wait(qlock);
+ }
+ }
++ bool flush_commit(Context *c) {
++ Mutex::Locker l(qlock);
++ uint64_t seq = 0;
++ if (_get_max_uncompleted(&seq)) {
++ delete c;
++ return true;
++ } else {
++ flush_commit_waiters.push_back(make_pair(seq, c));
++ return false;
++ }
++ }
+
+ OpSequencer()
+ : qlock("FileStore::OpSequencer::qlock", false, false),
+ parent(0),
+--- a/src/os/GenericObjectMap.cc
++++ b/src/os/GenericObjectMap.cc
+@@ -688,10 +688,8 @@
+ remove_header(old_header->cid, old_header->oid, old_header, t);
+ old_header->cid = cid;
+ old_header->oid = target;
+ set_header(cid, target, *old_header, t);
+-
+- // "in_use" still hold the "seq"
+ }
+
+ int GenericObjectMap::init(bool do_upgrade)
+ {
+@@ -925,64 +923,43 @@
+ set<string> to_get;
+ to_get.insert(header_key(cid, oid));
+ _Header header;
+
+- while (1) {
+- map<string, bufferlist> out;
+- bool try_again = false;
++ map<string, bufferlist> out;
+
+- int r = db->get(GHOBJECT_TO_SEQ_PREFIX, to_get, &out);
+- if (r < 0)
+- return Header();
+- if (out.empty())
+- return Header();
+-
+- bufferlist::iterator iter = out.begin()->second.begin();
+- header.decode(iter);
+-
+- while (in_use.count(header.seq)) {
+- header_cond.Wait(header_lock);
+-
+- // Another thread is hold this header, wait for it.
+- // Because the seq of this object may change, such as clone
+- // and rename operation, here need to look up "seq" again
+- try_again = true;
+- }
++ int r = db->get(GHOBJECT_TO_SEQ_PREFIX, to_get, &out);
++ if (r < 0)
++ return Header();
++ if (out.empty())
++ return Header();
+
+- if (!try_again) {
+- break;
+- }
+- }
++ bufferlist::iterator iter = out.begin()->second.begin();
++ header.decode(iter);
+
+- Header ret = Header(new _Header(header), RemoveOnDelete(this));
+- in_use.insert(ret->seq);
++ Header ret = Header(new _Header(header));
+ return ret;
+ }
+
+ GenericObjectMap::Header GenericObjectMap::_generate_new_header(
+ const coll_t &cid, const ghobject_t &oid, Header parent,
+ KeyValueDB::Transaction t)
+ {
+- Header header = Header(new _Header(), RemoveOnDelete(this));
++ Header header = Header(new _Header());
+ header->seq = state.seq++;
+ if (parent) {
+ header->parent = parent->seq;
+ }
+ header->num_children = 1;
+ header->oid = oid;
+ header->cid = cid;
+- assert(!in_use.count(header->seq));
+- in_use.insert(header->seq);
+
+ write_state(t);
+ return header;
+ }
+
+ GenericObjectMap::Header GenericObjectMap::lookup_parent(Header input)
+ {
+ Mutex::Locker l(header_lock);
+- while (in_use.count(input->parent))
+- header_cond.Wait(header_lock);
+ map<string, bufferlist> out;
+ set<string> keys;
+ keys.insert(PARENT_KEY);
+
+@@ -998,15 +975,14 @@
+ assert(0);
+ return Header();
+ }
+
+- Header header = Header(new _Header(), RemoveOnDelete(this));
++ Header header = Header(new _Header());
+ header->seq = input->parent;
+ bufferlist::iterator iter = out.begin()->second.begin();
+ header->decode(iter);
+ dout(20) << "lookup_parent: parent seq is " << header->seq << " with parent "
+ << header->parent << dendl;
+- in_use.insert(header->seq);
+ return header;
+ }
+
+ GenericObjectMap::Header GenericObjectMap::lookup_create_header(
+--- a/src/os/GenericObjectMap.h
++++ b/src/os/GenericObjectMap.h
+@@ -73,14 +73,8 @@
+ /**
+ * Serializes access to next_seq as well as the in_use set
+ */
+ Mutex header_lock;
+- Cond header_cond;
+-
+- /**
+- * Set of headers currently in use
+- */
+- set<uint64_t> in_use;
+
+ GenericObjectMap(KeyValueDB *db) : db(db), header_lock("GenericObjectMap") {}
+
+ int get(
+@@ -370,8 +364,14 @@
+ GenericObjectMapIterator _get_iterator(Header header, string prefix) {
+ return GenericObjectMapIterator(new GenericObjectMapIteratorImpl(this, header, prefix));
+ }
+
++ Header generate_new_header(const coll_t &cid, const ghobject_t &oid,
++ Header parent, KeyValueDB::Transaction t) {
++ Mutex::Locker l(header_lock);
++ return _generate_new_header(cid, oid, parent, t);
++ }
++
+ // Scan keys in header into out_keys and out_values (if nonnull)
+ int scan(Header header, const string &prefix, const set<string> &in_keys,
+ set<string> *out_keys, map<string, bufferlist> *out_values);
+
+@@ -393,13 +393,8 @@
+ * Has the side effect of syncronously saving the new GenericObjectMap state
+ */
+ Header _generate_new_header(const coll_t &cid, const ghobject_t &oid,
+ Header parent, KeyValueDB::Transaction t);
+- Header generate_new_header(const coll_t &cid, const ghobject_t &oid,
+- Header parent, KeyValueDB::Transaction t) {
+- Mutex::Locker l(header_lock);
+- return _generate_new_header(cid, oid, parent, t);
+- }
+
+ // Lookup leaf header for c oid
+ Header _lookup_header(const coll_t &cid, const ghobject_t &oid);
+
+@@ -424,28 +419,8 @@
+
+ // Sets header @see set_header
+ void _set_header(Header header, const bufferlist &bl,
+ KeyValueDB::Transaction t);
+-
+- /**
+- * Removes header seq lock once Header is out of scope
+- * @see _lookup_header
+- * @see lookup_parent
+- * @see generate_new_header
+- */
+- class RemoveOnDelete {
+- public:
+- GenericObjectMap *db;
+- RemoveOnDelete(GenericObjectMap *db) :
+- db(db) {}
+- void operator() (_Header *header) {
+- Mutex::Locker l(db->header_lock);
+- db->in_use.erase(header->seq);
+- db->header_cond.Signal();
+- delete header;
+- }
+- };
+- friend class RemoveOnDelete;
+ };
+ WRITE_CLASS_ENCODER(GenericObjectMap::_Header)
+ WRITE_CLASS_ENCODER(GenericObjectMap::State)
+
+--- a/src/os/KeyValueStore.cc
++++ b/src/os/KeyValueStore.cc
+@@ -68,90 +68,78 @@
+ const string KeyValueStore::COLLECTION_ATTR = "__COLL_ATTR__";
+
+ // ============== StripObjectMap Implementation =================
+
+-void StripObjectMap::sync_wrap(StripObjectHeader &strip_header,
+- KeyValueDB::Transaction t,
+- const SequencerPosition &spos)
+-{
+- dout(10) << __func__ << " cid: " << strip_header.cid << "oid: "
+- << strip_header.oid << " setting spos to " << strip_header.spos
+- << dendl;
+- strip_header.spos = spos;
+- strip_header.header->data.clear();
+- ::encode(strip_header, strip_header.header->data);
+-
+- sync(strip_header.header, t);
+-}
+-
+-bool StripObjectMap::check_spos(const StripObjectHeader &header,
+- const SequencerPosition &spos)
+-{
+- if (spos > header.spos) {
+- stringstream out;
+- dout(10) << "cid: " << "oid: " << header.oid
+- << " not skipping op, *spos " << spos << dendl;
+- dout(10) << " > header.spos " << header.spos << dendl;
+- return false;
+- } else {
+- dout(10) << "cid: " << "oid: " << header.oid << " skipping op, spos "
+- << spos << " <= header.spos " << header.spos << dendl;
+- return true;
+- }
+-}
+-
+-int StripObjectMap::save_strip_header(StripObjectHeader &strip_header,
+- const SequencerPosition &spos,
++int StripObjectMap::save_strip_header(StripObjectHeaderRef strip_header,
+ KeyValueDB::Transaction t)
+ {
+- strip_header.spos = spos;
+- strip_header.header->data.clear();
+- ::encode(strip_header, strip_header.header->data);
++ strip_header->header->data.clear();
++ ::encode(*strip_header, strip_header->header->data);
+
+- set_header(strip_header.cid, strip_header.oid, *(strip_header.header), t);
++ set_header(strip_header->cid, strip_header->oid, *(strip_header->header), t);
+ return 0;
+ }
+
+ int StripObjectMap::create_strip_header(const coll_t &cid,
+ const ghobject_t &oid,
+- StripObjectHeader &strip_header,
++ StripObjectHeaderRef *strip_header,
+ KeyValueDB::Transaction t)
+ {
+- Header header = lookup_create_header(cid, oid, t);
++ Header header = generate_new_header(cid, oid, Header(), t);
+ if (!header)
+ return -EINVAL;
+
+- strip_header.oid = oid;
+- strip_header.cid = cid;
+- strip_header.header = header;
++ StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
++ tmp->oid = oid;
++ tmp->cid = cid;
++ tmp->header = header;
++ if (strip_header)
++ *strip_header = tmp;
+
+ return 0;
+ }
+
+ int StripObjectMap::lookup_strip_header(const coll_t &cid,
+ const ghobject_t &oid,
+- StripObjectHeader &strip_header)
++ StripObjectHeaderRef *strip_header)
+ {
++ if (cid != coll_t()) {
++ Mutex::Locker l(lock);
++ pair<coll_t, StripObjectHeaderRef> p;
++ if (caches.lookup(oid, &p)) {
++ if (p.first == cid) {
++ *strip_header = p.second;
++ return 0;
++ }
++ }
++ }
+ Header header = lookup_header(cid, oid);
+
+ if (!header) {
+ dout(20) << "lookup_strip_header failed to get strip_header "
+ << " cid " << cid <<" oid " << oid << dendl;
+ return -ENOENT;
+ }
+
++
++ StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
+ if (header->data.length()) {
+ bufferlist::iterator bliter = header->data.begin();
+- ::decode(strip_header, bliter);
++ ::decode(*tmp, bliter);
+ }
+
+- if (strip_header.strip_size == 0)
+- strip_header.strip_size = default_strip_size;
++ if (tmp->strip_size == 0)
++ tmp->strip_size = default_strip_size;
+
+- strip_header.oid = oid;
+- strip_header.cid = cid;
+- strip_header.header = header;
++ tmp->oid = oid;
++ tmp->cid = cid;
++ tmp->header = header;
+
++ {
++ Mutex::Locker l(lock);
++ caches.add(oid, make_pair(cid, tmp));
++ }
++ *strip_header = tmp;
+ dout(10) << "lookup_strip_header done " << " cid " << cid << " oid "
+ << oid << dendl;
+ return 0;
+ }
+@@ -193,125 +181,114 @@
+ dout(10) << "file_to_extents done " << dendl;
+ return 0;
+ }
+
+-void StripObjectMap::clone_wrap(StripObjectHeader &old_header,
++void StripObjectMap::clone_wrap(StripObjectHeaderRef old_header,
+ const coll_t &cid, const ghobject_t &oid,
+ KeyValueDB::Transaction t,
+- StripObjectHeader *origin_header,
+- StripObjectHeader *target_header)
++ StripObjectHeaderRef *target_header)
+ {
+ Header new_origin_header;
++ StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
+
+- if (target_header)
+- *target_header = old_header;
+- if (origin_header)
+- *origin_header = old_header;
+-
+- clone(old_header.header, cid, oid, t, &new_origin_header,
+- &target_header->header);
++ clone(old_header->header, cid, oid, t, &new_origin_header,
++ &tmp->header);
+
+- if(origin_header)
+- origin_header->header = new_origin_header;
++ tmp->oid = oid;
++ tmp->cid = cid;
++ tmp->strip_size = old_header->strip_size;
++ tmp->max_size = old_header->max_size;
++ tmp->bits = old_header->bits;
++ old_header->header = new_origin_header;
+
+- if (target_header) {
+- target_header->oid = oid;
+- target_header->cid = cid;
+- }
++ if (target_header)
++ *target_header = tmp;
+ }
+
+-void StripObjectMap::rename_wrap(const coll_t &cid, const ghobject_t &oid,
++void StripObjectMap::rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid,
+ KeyValueDB::Transaction t,
+- StripObjectHeader *header)
++ StripObjectHeaderRef *new_header)
+ {
+- assert(header);
+- rename(header->header, cid, oid, t);
++ rename(old_header->header, cid, oid, t);
+
+- if (header) {
+- header->oid = oid;
+- header->cid = cid;
+- }
++ StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
++ tmp->strip_size = old_header->strip_size;
++ tmp->max_size = old_header->max_size;
++ tmp->bits = old_header->bits;
++ tmp->header = old_header->header;
++ tmp->oid = oid;
++ tmp->cid = cid;
++
++ if (new_header)
++ *new_header = tmp;
++
++ old_header->header = Header();
++ old_header->deleted = true;
+ }
+
+-int StripObjectMap::get_values_with_header(const StripObjectHeader &header,
++int StripObjectMap::get_values_with_header(const StripObjectHeaderRef header,
+ const string &prefix,
+ const set<string> &keys,
+ map<string, bufferlist> *out)
+ {
+- return scan(header.header, prefix, keys, 0, out);
++ return scan(header->header, prefix, keys, 0, out);
+ }
+
+-int StripObjectMap::get_keys_with_header(const StripObjectHeader &header,
++int StripObjectMap::get_keys_with_header(const StripObjectHeaderRef header,
+ const string &prefix,
+ set<string> *keys)
+ {
+- ObjectMap::ObjectMapIterator iter = _get_iterator(header.header, prefix);
++ ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix);
+ for (; iter->valid(); iter->next()) {
+ if (iter->status())
+ return iter->status();
+ keys->insert(iter->key());
+ }
+ return 0;
+ }
+
+-int StripObjectMap::get_with_header(const StripObjectHeader &header,
++int StripObjectMap::get_with_header(const StripObjectHeaderRef header,
+ const string &prefix, map<string, bufferlist> *out)
+ {
+- ObjectMap::ObjectMapIterator iter = _get_iterator(header.header, prefix);
++ ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ if (iter->status())
+ return iter->status();
+ out->insert(make_pair(iter->key(), iter->value()));
+ }
+
+ return 0;
+ }
+-// =========== KeyValueStore::SubmitManager Implementation ==============
+-
+-uint64_t KeyValueStore::SubmitManager::op_submit_start()
+-{
+- lock.Lock();
+- uint64_t op = ++op_seq;
+- dout(10) << "op_submit_start " << op << dendl;
+- return op;
+-}
+-
+-void KeyValueStore::SubmitManager::op_submit_finish(uint64_t op)
+-{
+- dout(10) << "op_submit_finish " << op << dendl;
+- if (op != op_submitted + 1) {
+- dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1)
+- << ", OUT OF ORDER" << dendl;
+- assert(0 == "out of order op_submit_finish");
+- }
+- op_submitted = op;
+- lock.Unlock();
+-}
+-
+
+ // ========= KeyValueStore::BufferTransaction Implementation ============
+
+ int KeyValueStore::BufferTransaction::lookup_cached_header(
+ const coll_t &cid, const ghobject_t &oid,
+- StripObjectMap::StripObjectHeader **strip_header,
++ StripObjectMap::StripObjectHeaderRef *strip_header,
+ bool create_if_missing)
+ {
+- StripObjectMap::StripObjectHeader header;
++ StripObjectMap::StripObjectHeaderRef header;
+ int r = 0;
+
+ StripHeaderMap::iterator it = strip_headers.find(make_pair(cid, oid));
+ if (it != strip_headers.end()) {
+- if (it->second.deleted)
++
++ if (!it->second->deleted) {
++ if (strip_header)
++ *strip_header = it->second;
++ return 0;
++ } else if (!create_if_missing) {
+ return -ENOENT;
++ }
+
+- if (strip_header)
+- *strip_header = &it->second;
+- return 0;
++ // If (it->second.deleted && create_if_missing) go down
++ r = -ENOENT;
++ } else {
++ r = store->backend->lookup_strip_header(cid, oid, &header);
+ }
+
+- r = store->backend->lookup_strip_header(cid, oid, header);
+- if (r < 0 && create_if_missing) {
+- r = store->backend->create_strip_header(cid, oid, header, t);
++ if (r == -ENOENT && create_if_missing) {
++ r = store->backend->create_strip_header(cid, oid, &header, t);
+ }
+
+ if (r < 0) {
+ dout(10) << __func__ << " " << cid << "/" << oid << " "
+@@ -320,23 +297,23 @@
+ }
+
+ strip_headers[make_pair(cid, oid)] = header;
+ if (strip_header)
+- *strip_header = &strip_headers[make_pair(cid, oid)];
++ *strip_header = strip_headers[make_pair(cid, oid)];
+ return r;
+ }
+
+ int KeyValueStore::BufferTransaction::get_buffer_keys(
+- StripObjectMap::StripObjectHeader &strip_header, const string &prefix,
++ StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix,
+ const set<string> &keys, map<string, bufferlist> *out)
+ {
+ set<string> need_lookup;
+
+ for (set<string>::iterator it = keys.begin(); it != keys.end(); ++it) {
+ map<pair<string, string>, bufferlist>::iterator i =
+- strip_header.buffers.find(make_pair(prefix, *it));
++ strip_header->buffers.find(make_pair(prefix, *it));
+
+- if (i != strip_header.buffers.end()) {
++ if (i != strip_header->buffers.end()) {
+ (*out)[*it].swap(i->second);
+ } else {
+ need_lookup.insert(*it);
+ }
+@@ -345,117 +322,118 @@
+ if (!need_lookup.empty()) {
+ int r = store->backend->get_values_with_header(strip_header, prefix,
+ need_lookup, out);
+ if (r < 0) {
+- dout(10) << __func__ << " " << strip_header.cid << "/"
+- << strip_header.oid << " " << " r = " << r << dendl;
++ dout(10) << __func__ << " " << strip_header->cid << "/"
++ << strip_header->oid << " " << " r = " << r << dendl;
+ return r;
+ }
+ }
+
+ return 0;
+ }
+
+ void KeyValueStore::BufferTransaction::set_buffer_keys(
+- StripObjectMap::StripObjectHeader &strip_header,
++ StripObjectMap::StripObjectHeaderRef strip_header,
+ const string &prefix, map<string, bufferlist> &values)
+ {
+- store->backend->set_keys(strip_header.header, prefix, values, t);
++ store->backend->set_keys(strip_header->header, prefix, values, t);
+
+ for (map<string, bufferlist>::iterator iter = values.begin();
+ iter != values.end(); ++iter) {
+- strip_header.buffers[make_pair(prefix, iter->first)].swap(iter->second);
++ strip_header->buffers[make_pair(prefix, iter->first)].swap(iter->second);
+ }
+ }
+
+ int KeyValueStore::BufferTransaction::remove_buffer_keys(
+- StripObjectMap::StripObjectHeader &strip_header, const string &prefix,
++ StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix,
+ const set<string> &keys)
+ {
+ for (set<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
+- strip_header.buffers[make_pair(prefix, *iter)] = bufferlist();
++ strip_header->buffers[make_pair(prefix, *iter)] = bufferlist();
+ }
+
+- return store->backend->rm_keys(strip_header.header, prefix, keys, t);
++ return store->backend->rm_keys(strip_header->header, prefix, keys, t);
+ }
+
+ void KeyValueStore::BufferTransaction::clear_buffer_keys(
+- StripObjectMap::StripObjectHeader &strip_header, const string &prefix)
++ StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix)
+ {
+- for (map<pair<string, string>, bufferlist>::iterator iter = strip_header.buffers.begin();
+- iter != strip_header.buffers.end(); ++iter) {
++ for (map<pair<string, string>, bufferlist>::iterator iter = strip_header->buffers.begin();
++ iter != strip_header->buffers.end(); ++iter) {
+ if (iter->first.first == prefix)
+ iter->second = bufferlist();
+ }
+ }
+
+ int KeyValueStore::BufferTransaction::clear_buffer(
+- StripObjectMap::StripObjectHeader &strip_header)
++ StripObjectMap::StripObjectHeaderRef strip_header)
+ {
+- strip_header.deleted = true;
++ strip_header->deleted = true;
+
+- return store->backend->clear(strip_header.header, t);
++ InvalidateCacheContext *c = new InvalidateCacheContext(store, strip_header->cid, strip_header->oid);
++ finishes.push_back(c);
++ return store->backend->clear(strip_header->header, t);
+ }
+
+ void KeyValueStore::BufferTransaction::clone_buffer(
+- StripObjectMap::StripObjectHeader &old_header,
++ StripObjectMap::StripObjectHeaderRef old_header,
+ const coll_t &cid, const ghobject_t &oid)
+ {
+ // Remove target ahead to avoid dead lock
+ strip_headers.erase(make_pair(cid, oid));
+
+- StripObjectMap::StripObjectHeader new_origin_header, new_target_header;
++ StripObjectMap::StripObjectHeaderRef new_target_header;
+
+- store->backend->clone_wrap(old_header, cid, oid, t,
+- &new_origin_header, &new_target_header);
++ store->backend->clone_wrap(old_header, cid, oid, t, &new_target_header);
+
+ // FIXME: Lacking of lock for origin header(now become parent), it will
+ // cause other operation can get the origin header while submitting
+ // transactions
+- strip_headers[make_pair(cid, old_header.oid)] = new_origin_header;
+ strip_headers[make_pair(cid, oid)] = new_target_header;
+ }
+
+ void KeyValueStore::BufferTransaction::rename_buffer(
+- StripObjectMap::StripObjectHeader &old_header,
++ StripObjectMap::StripObjectHeaderRef old_header,
+ const coll_t &cid, const ghobject_t &oid)
+ {
+- if (store->backend->check_spos(old_header, spos))
+- return ;
+-
+ // FIXME: Lacking of lock for origin header, it will cause other operation
+ // can get the origin header while submitting transactions
+- store->backend->rename_wrap(cid, oid, t, &old_header);
++ StripObjectMap::StripObjectHeaderRef new_header;
++ store->backend->rename_wrap(old_header, cid, oid, t, &new_header);
+
+- strip_headers.erase(make_pair(old_header.cid, old_header.oid));
+- strip_headers[make_pair(cid, oid)] = old_header;
++ InvalidateCacheContext *c = new InvalidateCacheContext(store, old_header->cid, old_header->oid);
++ finishes.push_back(c);
++ strip_headers[make_pair(cid, oid)] = new_header;
+ }
+
+ int KeyValueStore::BufferTransaction::submit_transaction()
+ {
+ int r = 0;
+
+ for (StripHeaderMap::iterator header_iter = strip_headers.begin();
+ header_iter != strip_headers.end(); ++header_iter) {
+- StripObjectMap::StripObjectHeader header = header_iter->second;
++ StripObjectMap::StripObjectHeaderRef header = header_iter->second;
+
+- if (store->backend->check_spos(header, spos))
++ if (header->deleted)
+ continue;
+
+- if (header.deleted)
+- continue;
++ r = store->backend->save_strip_header(header, t);
+
+- r = store->backend->save_strip_header(header, spos, t);
+ if (r < 0) {
+ dout(10) << __func__ << " save strip header failed " << dendl;
+ goto out;
+ }
+ }
+
+-out:
++ r = store->backend->submit_transaction(t);
++ for (list<Context*>::iterator it = finishes.begin(); it != finishes.end(); ++it) {
++ (*it)->complete(r);
++ }
+
++out:
+ dout(5) << __func__ << " r = " << r << dendl;
+- return store->backend->submit_transaction(t);
++ return r;
+ }
+
+ // =========== KeyValueStore Intern Helper Implementation ==============
+
+@@ -494,9 +472,9 @@
+ const char *name, bool do_update) :
+ ObjectStore(base),
+ internal_name(name),
+ basedir(base),
+- fsid_fd(-1), op_fd(-1), current_fd(-1),
++ fsid_fd(-1), current_fd(-1),
+ kv_type(KV_TYPE_NONE),
+ backend(NULL),
+ ondisk_finisher(g_ceph_context),
+ lock("KeyValueStore::lock"),
+@@ -905,12 +883,8 @@
+ if (fsid_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+ }
+- if (op_fd >= 0) {
+- VOID_TEMP_FAILURE_RETRY(::close(op_fd));
+- op_fd = -1;
+- }
+ if (current_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+ current_fd = -1;
+ }
+@@ -962,16 +936,11 @@
+ }
+
+ Op *o = build_op(tls, ondisk, onreadable, onreadable_sync, osd_op);
+ op_queue_reserve_throttle(o, handle);
+- uint64_t op = submit_manager.op_submit_start();
+- o->op = op;
+- dout(5) << "queue_transactions (trailing journal) " << op << " "
+- << tls <<dendl;
++ dout(5) << "queue_transactions (trailing journal) " << " " << tls <<dendl;
+ queue_op(osr, o);
+
+- submit_manager.op_submit_finish(op);
+-
+ return 0;
+ }
+
+
+@@ -1087,9 +1056,10 @@
+ }
+
+ void KeyValueStore::_finish_op(OpSequencer *osr)
+ {
+- Op *o = osr->dequeue();
++ list<Context*> to_queue;
++ Op *o = osr->dequeue(&to_queue);
+
+ dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << dendl;
+ osr->apply_lock.Unlock(); // locked in _do_op
+ op_queue_release_throttle(o);
+@@ -1101,8 +1071,9 @@
+ if (o->onreadable_sync) {
+ o->onreadable_sync->complete(0);
+ }
+ op_finisher.queue(o->onreadable);
++ op_finisher.queue(to_queue);
+ delete o;
+ }
+
+ // Combine all the ops in the same transaction using "BufferTransaction" and
+@@ -1125,15 +1096,14 @@
+ ops += (*p)->get_num_ops();
+ }
+
+ int trans_num = 0;
+- SequencerPosition spos(op_seq, trans_num, 0);
+- BufferTransaction bt(this, spos);
++ BufferTransaction bt(this);
+
+ for (list<Transaction*>::iterator p = tls.begin();
+ p != tls.end();
+ ++p, trans_num++) {
+- r = _do_transaction(**p, bt, spos, handle);
++ r = _do_transaction(**p, bt, handle);
+ if (r < 0)
+ break;
+ if (handle)
+ handle->reset_tp_timeout();
+@@ -1148,14 +1118,14 @@
+ }
+
+ unsigned KeyValueStore::_do_transaction(Transaction& transaction,
+ BufferTransaction &t,
+- SequencerPosition& spos,
+ ThreadPool::TPHandle *handle)
+ {
+ dout(10) << "_do_transaction on " << &transaction << dendl;
+
+ Transaction::iterator i = transaction.begin();
++ uint64_t op_num = 0;
+
+ while (i.have_op()) {
+ if (handle)
+ handle->reset_tp_timeout();
+@@ -1448,9 +1418,15 @@
+ }
+ break;
+
+ case Transaction::OP_SETALLOCHINT:
+- // TODO: can kvstore make use of the hint?
++ {
++ // TODO: can kvstore make use of the hint?
++ coll_t cid(i.get_cid());
++ ghobject_t oid = i.get_oid();
++ (void)i.get_length(); // discard result
++ (void)i.get_length(); // discard result
++ }
+ break;
+
+ default:
+ derr << "bad op " << op << dendl;
+@@ -1486,10 +1462,9 @@
+ msg = "ENOTEMPTY suggests garbage data in osd data dir";
+ }
+
+ dout(0) << " error " << cpp_strerror(r) << " not handled on operation "
+- << op << " (" << spos << ", or op " << spos.op
+- << ", counting from 0)" << dendl;
++ << op << " op " << op_num << ", counting from 0)" << dendl;
+ dout(0) << msg << dendl;
+ dout(0) << " transaction dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+@@ -1504,9 +1479,9 @@
+ }
+ }
+ }
+
+- spos.op++;
++ op_num++;
+ }
+
+ return 0; // FIXME count errors
+ }
+@@ -1519,11 +1494,11 @@
+ {
+ dout(10) << __func__ << "collection: " << cid << " object: " << oid
+ << dendl;
+ int r;
+- StripObjectMap::StripObjectHeader header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+- r = backend->lookup_strip_header(cid, oid, header);
++ r = backend->lookup_strip_header(cid, oid, &header);
+ if (r < 0) {
+ return false;
+ }
+
+@@ -1534,44 +1509,44 @@
+ struct stat *st, bool allow_eio)
+ {
+ dout(10) << "stat " << cid << "/" << oid << dendl;
+
+- StripObjectMap::StripObjectHeader header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+- int r = backend->lookup_strip_header(cid, oid, header);
++ int r = backend->lookup_strip_header(cid, oid, &header);
+ if (r < 0) {
+ dout(10) << "stat " << cid << "/" << oid << "=" << r << dendl;
+ return -ENOENT;
+ }
+
+- st->st_blocks = header.max_size / header.strip_size;
+- if (header.max_size % header.strip_size)
++ st->st_blocks = header->max_size / header->strip_size;
++ if (header->max_size % header->strip_size)
+ st->st_blocks++;
+ st->st_nlink = 1;
+- st->st_size = header.max_size;
+- st->st_blksize = header.strip_size;
++ st->st_size = header->max_size;
++ st->st_blksize = header->strip_size;
+
+ return r;
+ }
+
+-int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeader &header,
++int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeaderRef header,
+ uint64_t offset, size_t len, bufferlist& bl,
+ bool allow_eio, BufferTransaction *bt)
+ {
+- if (header.max_size < offset) {
+- dout(10) << __func__ << " " << header.cid << "/" << header.oid << ")"
++ if (header->max_size < offset) {
++ dout(10) << __func__ << " " << header->cid << "/" << header->oid << ")"
+ << " offset exceed the length of bl"<< dendl;
+ return 0;
+ }
+
+ if (len == 0)
+- len = header.max_size - offset;
++ len = header->max_size - offset;
+
+- if (offset + len > header.max_size)
+- len = header.max_size - offset;
++ if (offset + len > header->max_size)
++ len = header->max_size - offset;
+
+ vector<StripObjectMap::StripExtent> extents;
+- StripObjectMap::file_to_extents(offset, len, header.strip_size,
++ StripObjectMap::file_to_extents(offset, len, header->strip_size,
+ extents);
+ map<string, bufferlist> out;
+ set<string> keys;
+
+@@ -1579,35 +1554,35 @@
+ iter != extents.end(); ++iter) {
+ bufferlist old;
+ string key = strip_object_key(iter->no);
+
+- if (bt && header.buffers.count(make_pair(OBJECT_STRIP_PREFIX, key))) {
++ if (bt && header->buffers.count(make_pair(OBJECT_STRIP_PREFIX, key))) {
+ // use strip_header buffer
+- assert(header.bits[iter->no]);
+- out[key] = header.buffers[make_pair(OBJECT_STRIP_PREFIX, key)];
+- } else if (header.bits[iter->no]) {
++ assert(header->bits[iter->no]);
++ out[key] = header->buffers[make_pair(OBJECT_STRIP_PREFIX, key)];
++ } else if (header->bits[iter->no]) {
+ keys.insert(key);
+ }
+ }
+
+ int r = backend->get_values_with_header(header, OBJECT_STRIP_PREFIX, keys, &out);
+ if (r < 0) {
+- dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
++ dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
+ << offset << "~" << len << " = " << r << dendl;
+ return r;
+ } else if (out.size() != keys.size()) {
+ dout(0) << __func__ << " broken header or missing data in backend "
+- << header.cid << "/" << header.oid << " " << offset << "~"
++ << header->cid << "/" << header->oid << " " << offset << "~"
+ << len << " = " << r << dendl;
+ return -EBADF;
+ }
+
+ for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
+ iter != extents.end(); ++iter) {
+ string key = strip_object_key(iter->no);
+
+- if (header.bits[iter->no]) {
+- if (iter->len == header.strip_size) {
++ if (header->bits[iter->no]) {
++ if (iter->len == header->strip_size) {
+ bl.claim_append(out[key]);
+ } else {
+ out[key].copy(iter->offset, iter->len, bl);
+ }
+@@ -1615,9 +1590,9 @@
+ bl.append_zero(iter->len);
+ }
+ }
+
+- dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
++ dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
+ << offset << "~" << bl.length() << "/" << len << " r = " << r
+ << dendl;
+
+ return bl.length();
+@@ -1629,11 +1604,11 @@
+ {
+ dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
+ << len << dendl;
+
+- StripObjectMap::StripObjectHeader header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+- int r = backend->lookup_strip_header(cid, oid, header);
++ int r = backend->lookup_strip_header(cid, oid, &header);
+
+ if (r < 0) {
+ dout(10) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
+ << len << " header isn't exist: r = " << r << dendl;
+@@ -1648,25 +1623,26 @@
+ {
+ dout(10) << __func__ << " " << cid << " " << oid << " " << offset << "~"
+ << len << dendl;
+ int r;
+- StripObjectMap::StripObjectHeader header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+- r = backend->lookup_strip_header(cid, oid, header);
++ r = backend->lookup_strip_header(cid, oid, &header);
+ if (r < 0) {
+ dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len
+ << " failed to get header: r = " << r << dendl;
+ return r;
+ }
+
+ vector<StripObjectMap::StripExtent> extents;
+- StripObjectMap::file_to_extents(offset, len, header.strip_size,
++ StripObjectMap::file_to_extents(offset, len, header->strip_size,
+ extents);
+
+ map<uint64_t, uint64_t> m;
+ for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
+ iter != extents.end(); ++iter) {
+- m[iter->offset] = iter->len;
++ uint64_t off = iter->no * header->strip_size + iter->offset;
++ m[off] = iter->len;
+ }
+ ::encode(m, bl);
+ return 0;
+ }
+@@ -1676,18 +1652,20 @@
+ {
+ dout(15) << __func__ << " " << cid << "/" << oid << dendl;
+
+ int r;
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ r = t.lookup_cached_header(cid, oid, &header, false);
+ if (r < 0) {
+ dout(10) << __func__ << " " << cid << "/" << oid << " "
+ << " failed to get header: r = " << r << dendl;
+ return r;
+ }
+
+- r = t.clear_buffer(*header);
++ header->max_size = 0;
++ header->bits.clear();
++ r = t.clear_buffer(header);
+
+ dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+ }
+@@ -1698,9 +1676,9 @@
+ dout(15) << __func__ << " " << cid << "/" << oid << " size " << size
+ << dendl;
+
+ int r;
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ r = t.lookup_cached_header(cid, oid, &header, false);
+ if (r < 0) {
+ dout(10) << __func__ << " " << cid << "/" << oid << " " << size
+@@ -1724,9 +1702,9 @@
+ set<string> lookup_keys;
+ string key = strip_object_key(iter->no);
+
+ lookup_keys.insert(key);
+- r = t.get_buffer_keys(*header, OBJECT_STRIP_PREFIX,
++ r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX,
+ lookup_keys, &values);
+ if (r < 0) {
+ dout(10) << __func__ << " " << cid << "/" << oid << " "
+ << size << " = " << r << dendl;
+@@ -1742,9 +1720,9 @@
+ value.append_zero(header->strip_size-iter->offset);
+ assert(value.length() == header->strip_size);
+ value.swap(values[key]);
+
+- t.set_buffer_keys(*header, OBJECT_STRIP_PREFIX, values);
++ t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values);
+ ++iter;
+ }
+
+ set<string> keys;
+@@ -1753,9 +1731,9 @@
+ keys.insert(strip_object_key(iter->no));
+ header->bits[iter->no] = 0;
+ }
+ }
+- r = t.remove_buffer_keys(*header, OBJECT_STRIP_PREFIX, keys);
++ r = t.remove_buffer_keys(header, OBJECT_STRIP_PREFIX, keys);
+ if (r < 0) {
+ dout(10) << __func__ << " " << cid << "/" << oid << " "
+ << size << " = " << r << dendl;
+ return r;
+@@ -1775,9 +1753,9 @@
+ {
+ dout(15) << __func__ << " " << cid << "/" << oid << dendl;
+
+ int r;
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ r = t.lookup_cached_header(cid, oid, &header, true);
+ if (r < 0) {
+ dout(10) << __func__ << " " << cid << "/" << oid << " "
+@@ -1789,44 +1767,44 @@
+ dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+ }
+
+-int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeader &header,
++int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeaderRef header,
+ uint64_t offset, size_t len,
+ const bufferlist& bl, BufferTransaction &t,
+ bool replica)
+ {
+ if (len > bl.length())
+ len = bl.length();
+
+- if (len + offset > header.max_size) {
+- header.max_size = len + offset;
+- header.bits.resize(header.max_size/header.strip_size+1);
++ if (len + offset > header->max_size) {
++ header->max_size = len + offset;
++ header->bits.resize(header->max_size/header->strip_size+1);
+ }
+
+ vector<StripObjectMap::StripExtent> extents;
+- StripObjectMap::file_to_extents(offset, len, header.strip_size,
++ StripObjectMap::file_to_extents(offset, len, header->strip_size,
+ extents);
+
+ map<string, bufferlist> out;
+ set<string> keys;
+ for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
+ iter != extents.end(); ++iter) {
+- if (header.bits[iter->no] && !(iter->offset == 0 &&
+- iter->len == header.strip_size))
++ if (header->bits[iter->no] && !(iter->offset == 0 &&
++ iter->len == header->strip_size))
+ keys.insert(strip_object_key(iter->no));
+ }
+
+ int r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, keys, &out);
+ if (r < 0) {
+- dout(10) << __func__ << " failed to get value " << header.cid << "/"
+- << header.oid << " " << offset << "~" << len << " = " << r
++ dout(10) << __func__ << " failed to get value " << header->cid << "/"
++ << header->oid << " " << offset << "~" << len << " = " << r
+ << dendl;
+ return r;
+ } else if (keys.size() != out.size()) {
+ // Error on header.bits or the corresponding key/value pair is missing
+ dout(0) << __func__ << " broken header or missing data in backend "
+- << header.cid << "/" << header.oid << " " << offset << "~"
++ << header->cid << "/" << header->oid << " " << offset << "~"
+ << len << " = " << r << dendl;
+ return -EBADF;
+ }
+
+@@ -1835,41 +1813,41 @@
+ for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
+ iter != extents.end(); ++iter) {
+ bufferlist value;
+ string key = strip_object_key(iter->no);
+- if (header.bits[iter->no]) {
+- if (iter->offset == 0 && iter->len == header.strip_size) {
++ if (header->bits[iter->no]) {
++ if (iter->offset == 0 && iter->len == header->strip_size) {
+ bl.copy(bl_offset, iter->len, value);
+ bl_offset += iter->len;
+ } else {
+- assert(out[key].length() == header.strip_size);
++ assert(out[key].length() == header->strip_size);
+
+ out[key].copy(0, iter->offset, value);
+ bl.copy(bl_offset, iter->len, value);
+ bl_offset += iter->len;
+
+- if (value.length() != header.strip_size)
+- out[key].copy(value.length(), header.strip_size-value.length(),
++ if (value.length() != header->strip_size)
++ out[key].copy(value.length(), header->strip_size-value.length(),
+ value);
+ }
+ } else {
+ if (iter->offset)
+ value.append_zero(iter->offset);
+ bl.copy(bl_offset, iter->len, value);
+ bl_offset += iter->len;
+
+- if (value.length() < header.strip_size)
+- value.append_zero(header.strip_size-value.length());
++ if (value.length() < header->strip_size)
++ value.append_zero(header->strip_size-value.length());
+
+- header.bits[iter->no] = 1;
++ header->bits[iter->no] = 1;
+ }
+- assert(value.length() == header.strip_size);
++ assert(value.length() == header->strip_size);
+ values[key].swap(value);
+ }
+ assert(bl_offset == len);
+
+ t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values);
+- dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
++ dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
+ << offset << "~" << len << " = " << r << dendl;
+
+ return r;
+ }
+@@ -1881,18 +1859,18 @@
+ dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
+ << len << dendl;
+
+ int r;
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ r = t.lookup_cached_header(cid, oid, &header, true);
+ if (r < 0) {
+ dout(10) << __func__ << " " << cid << "/" << oid << " " << offset
+ << "~" << len << " failed to get header: r = " << r << dendl;
+ return r;
+ }
+
+- return _generic_write(*header, offset, len, bl, t, replica);
++ return _generic_write(header, offset, len, bl, t, replica);
+ }
+
+ int KeyValueStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset,
+ size_t len, BufferTransaction &t)
+@@ -1919,18 +1897,18 @@
+ if (oldoid == newoid)
+ return 0;
+
+ int r;
+- StripObjectMap::StripObjectHeader *old_header;
++ StripObjectMap::StripObjectHeaderRef old_header;
+
+ r = t.lookup_cached_header(cid, oldoid, &old_header, false);
+ if (r < 0) {
+ dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
+ << newoid << " = " << r << dendl;
+ return r;
+ }
+
+- t.clone_buffer(*old_header, cid, newoid);
++ t.clone_buffer(old_header, cid, newoid);
+
+ dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
+ << newoid << " = " << r << dendl;
+ return r;
+@@ -1947,9 +1925,9 @@
+
+ int r;
+ bufferlist bl;
+
+- StripObjectMap::StripObjectHeader *old_header, *new_header;
++ StripObjectMap::StripObjectHeaderRef old_header, new_header;
+
+ r = t.lookup_cached_header(cid, oldoid, &old_header, false);
+ if (r < 0) {
+ dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
+@@ -1965,13 +1943,13 @@
+ << " can't create header: r = " << r << dendl;
+ return r;
+ }
+
+- r = _generic_read(*old_header, srcoff, len, bl, &t);
++ r = _generic_read(old_header, srcoff, len, bl, &t);
+ if (r < 0)
+ goto out;
+
+- r = _generic_write(*new_header, dstoff, len, bl, t);
++ r = _generic_write(new_header, dstoff, len, bl, t);
+
+ out:
+ dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
+ << newoid << " " << srcoff << "~" << len << " to " << dstoff
+@@ -1989,11 +1967,19 @@
+
+ int r;
+ map<string, bufferlist> got;
+ set<string> to_get;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ to_get.insert(string(name));
+- r = backend->get_values(cid, oid, OBJECT_XATTR, to_get, &got);
++
++ r = backend->lookup_strip_header(cid, oid, &header);
++ if (r < 0) {
++ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
++ return r;
++ }
++
++ r = backend->get_values_with_header(header, OBJECT_XATTR, to_get, &got);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " get_xattrs err r =" << r << dendl;
+ goto out;
+ }
+@@ -2055,9 +2041,9 @@
+ dout(15) << __func__ << " " << cid << "/" << oid << dendl;
+
+ int r;
+
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+ map<string, bufferlist> attrs;
+
+ r = t.lookup_cached_header(cid, oid, &header, false);
+ if (r < 0)
+@@ -2067,9 +2053,9 @@
+ it != aset.end(); ++it) {
+ attrs[it->first].push_back(it->second);
+ }
+
+- t.set_buffer_keys(*header, OBJECT_XATTR, attrs);
++ t.set_buffer_keys(header, OBJECT_XATTR, attrs);
+
+ out:
+ dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+@@ -2083,9 +2069,9 @@
+ << dendl;
+
+ int r;
+ set<string> to_remove;
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ r = t.lookup_cached_header(cid, oid, &header, false);
+ if (r < 0) {
+ dout(10) << __func__ << " could not find header r = " << r
+@@ -2093,9 +2079,9 @@
+ return r;
+ }
+
+ to_remove.insert(string(name));
+- r = t.remove_buffer_keys(*header, OBJECT_XATTR, to_remove);
++ r = t.remove_buffer_keys(header, OBJECT_XATTR, to_remove);
+
+ dout(10) << __func__ << " " << cid << "/" << oid << " '" << name << "' = "
+ << r << dendl;
+ return r;
+@@ -2108,25 +2094,25 @@
+
+ int r;
+ set<string> attrs;
+
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ r = t.lookup_cached_header(cid, oid, &header, false);
+ if (r < 0) {
+ dout(10) << __func__ << " could not find header r = " << r
+ << dendl;
+ return r;
+ }
+
+- r = backend->get_keys_with_header(*header, OBJECT_XATTR, &attrs);
++ r = backend->get_keys_with_header(header, OBJECT_XATTR, &attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get attrs r = " << r << dendl;
+ return r;
+ }
+
+- r = t.remove_buffer_keys(*header, OBJECT_XATTR, attrs);
+- t.clear_buffer_keys(*header, OBJECT_XATTR);
++ r = t.remove_buffer_keys(header, OBJECT_XATTR, attrs);
++ t.clear_buffer_keys(header, OBJECT_XATTR);
+
+ dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+ }
+@@ -2167,12 +2153,20 @@
+ << "'" << dendl;
+
+ set<string> keys;
+ map<string, bufferlist> out;
++ StripObjectMap::StripObjectHeaderRef header;
++
+ keys.insert(string(name));
+
+- int r = backend->get_values(get_coll_for_coll(), make_ghobject_for_coll(c),
+- COLLECTION_ATTR, keys, &out);
++ int r = backend->lookup_strip_header(get_coll_for_coll(),
++ make_ghobject_for_coll(c), &header);
++ if (r < 0) {
++ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
++ return r;
++ }
++
++ r = backend->get_values_with_header(header, COLLECTION_ATTR, keys, &out);
+ if (r < 0) {
+ dout(10) << __func__ << " could not get key" << string(name) << dendl;
+ r = -EINVAL;
+ }
+@@ -2191,16 +2185,23 @@
+ dout(10) << __func__ << " " << cid.to_str() << dendl;
+
+ map<string, bufferlist> out;
+ set<string> keys;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ for (map<string, bufferptr>::iterator it = aset.begin();
+ it != aset.end(); ++it) {
+ keys.insert(it->first);
+ }
+
+- int r = backend->get_values(get_coll_for_coll(), make_ghobject_for_coll(cid),
+- COLLECTION_ATTR, keys, &out);
++ int r = backend->lookup_strip_header(get_coll_for_coll(),
++ make_ghobject_for_coll(cid), &header);
++ if (r < 0) {
++ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
++ return r;
++ }
++
++ r = backend->get_values_with_header(header, COLLECTION_ATTR, keys, &out);
+ if (r < 0) {
+ dout(10) << __func__ << " could not get keys" << dendl;
+ r = -EINVAL;
+ goto out;
+@@ -2226,9 +2227,9 @@
+
+ int r;
+ bufferlist bl;
+ map<string, bufferlist> out;
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ r = t.lookup_cached_header(get_coll_for_coll(),
+ make_ghobject_for_coll(c),
+ &header, false);
+@@ -2239,9 +2240,9 @@
+
+ bl.append(reinterpret_cast<const char*>(value), size);
+ out.insert(make_pair(string(name), bl));
+
+- t.set_buffer_keys(*header, COLLECTION_ATTR, out);
++ t.set_buffer_keys(header, COLLECTION_ATTR, out);
+
+ dout(10) << __func__ << " " << c << " '"
+ << name << "' len " << size << " = " << r << dendl;
+ return r;
+@@ -2253,9 +2254,9 @@
+ dout(15) << __func__ << " " << c << dendl;
+
+ bufferlist bl;
+ set<string> out;
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ int r = t.lookup_cached_header(get_coll_for_coll(),
+ make_ghobject_for_coll(c), &header, false);
+ if (r < 0) {
+@@ -2263,9 +2264,9 @@
+ return r;
+ }
+
+ out.insert(string(name));
+- r = t.remove_buffer_keys(*header, COLLECTION_ATTR, out);
++ r = t.remove_buffer_keys(header, COLLECTION_ATTR, out);
+
+ dout(10) << __func__ << " " << c << " = " << r << dendl;
+ return r;
+ }
+@@ -2276,9 +2277,9 @@
+ {
+ dout(15) << __func__ << " " << cid << dendl;
+
+ map<string, bufferlist> attrs;
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+ int r = t.lookup_cached_header(get_coll_for_coll(),
+ make_ghobject_for_coll(cid),
+ &header, false);
+ if (r < 0) {
+@@ -2290,9 +2291,9 @@
+ ++it) {
+ attrs[it->first].push_back(it->second);
+ }
+
+- t.set_buffer_keys(*header, COLLECTION_ATTR, attrs);
++ t.set_buffer_keys(header, COLLECTION_ATTR, attrs);
+
+ dout(10) << __func__ << " " << cid << " = " << r << dendl;
+ return r;
+ }
+@@ -2304,9 +2305,9 @@
+ {
+ dout(15) << __func__ << " " << c << dendl;
+
+ int r;
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+ bufferlist bl;
+
+ r = t.lookup_cached_header(get_coll_for_coll(),
+ make_ghobject_for_coll(c), &header,
+@@ -2329,9 +2330,9 @@
+ dout(15) << __func__ << " " << c << dendl;
+
+ int r;
+ uint64_t modified_object = 0;
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+ vector<ghobject_t> oids;
+
+ r = t.lookup_cached_header(get_coll_for_coll(), make_ghobject_for_coll(c),
+ &header, false);
+@@ -2346,9 +2347,9 @@
+ if (iter->first.first != c)
+ continue;
+
+ modified_object++;
+- if (!iter->second.deleted) {
++ if (!iter->second->deleted) {
+ r = -ENOTEMPTY;
+ goto out;
+ }
+ }
+@@ -2368,9 +2369,9 @@
+ goto out;
+ }
+ }
+
+- r = t.clear_buffer(*header);
++ r = t.clear_buffer(header);
+
+ out:
+ dout(10) << __func__ << " " << c << " = " << r << dendl;
+ return r;
+@@ -2384,9 +2385,9 @@
+ dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/"
+ << o << dendl;
+
+ bufferlist bl;
+- StripObjectMap::StripObjectHeader *header, *old_header;
++ StripObjectMap::StripObjectHeaderRef header, old_header;
+
+ int r = t.lookup_cached_header(oldcid, o, &old_header, false);
+ if (r < 0) {
+ goto out;
+@@ -2399,15 +2400,15 @@
+ << o << " already exist " << dendl;
+ goto out;
+ }
+
+- r = _generic_read(*old_header, 0, old_header->max_size, bl, &t);
++ r = _generic_read(old_header, 0, old_header->max_size, bl, &t);
+ if (r < 0) {
+ r = -EINVAL;
+ goto out;
+ }
+
+- r = _generic_write(*header, 0, bl.length(), bl, t);
++ r = _generic_write(header, 0, bl.length(), bl, t);
+ if (r < 0) {
+ r = -EINVAL;
+ }
+
+@@ -2424,9 +2425,9 @@
+ {
+ dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/"
+ << oldoid << dendl;
+ int r;
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ r = t.lookup_cached_header(c, o, &header, false);
+ if (r == 0) {
+ dout(10) << __func__ << " " << oldcid << "/" << oldoid << " -> " << c
+@@ -2440,9 +2441,9 @@
+ << "/" << o << " = " << r << dendl;
+ return r;
+ }
+
+- t.rename_buffer(*header, c, o);
++ t.rename_buffer(header, c, o);
+
+ dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/"
+ << oldoid << " = " << r << dendl;
+ return r;
+@@ -2452,9 +2453,9 @@
+ BufferTransaction &t)
+ {
+ dout(15) << __func__ << " " << cid << dendl;
+
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ int r = t.lookup_cached_header(get_coll_for_coll(),
+ make_ghobject_for_coll(cid),
+ &header, false);
+@@ -2477,9 +2478,9 @@
+ return r;
+ }
+ }
+
+- r = t.clear_buffer(*header);
++ r = t.clear_buffer(header);
+
+ dout(10) << __func__ << " " << cid << " r = " << r << dendl;
+ return 0;
+ }
+@@ -2489,9 +2490,9 @@
+ {
+ dout(10) << __func__ << " origin cid " << cid << " new cid " << ncid
+ << dendl;
+
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ int r = t.lookup_cached_header(get_coll_for_coll(),
+ make_ghobject_for_coll(ncid),
+ &header, false);
+@@ -2531,9 +2532,9 @@
+ objects.clear();
+ current = next;
+ }
+
+- t.rename_buffer(*header, get_coll_for_coll(), make_ghobject_for_coll(ncid));
++ t.rename_buffer(header, get_coll_for_coll(), make_ghobject_for_coll(ncid));
+
+ dout(10) << __func__ << " origin cid " << cid << " new cid " << ncid
+ << dendl;
+ return 0;
+@@ -2559,11 +2560,11 @@
+ bool KeyValueStore::collection_exists(coll_t c)
+ {
+ dout(10) << __func__ << " " << dendl;
+
+- StripObjectMap::StripObjectHeader header;
++ StripObjectMap::StripObjectHeaderRef header;
+ int r = backend->lookup_strip_header(get_coll_for_coll(),
+- make_ghobject_for_coll(c), header);
++ make_ghobject_for_coll(c), &header);
+ if (r < 0) {
+ return false;
+ }
+ return true;
+@@ -2651,17 +2652,16 @@
+ bufferlist *bl, map<string, bufferlist> *out)
+ {
+ dout(15) << __func__ << " " << c << "/" << hoid << dendl;
+
+- StripObjectMap::StripObjectHeader header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+- int r = backend->lookup_strip_header(c, hoid, header);
++ int r = backend->lookup_strip_header(c, hoid, &header);
+ if (r < 0) {
+ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
+ return r;
+ }
+
+-
+ r = backend->get_with_header(header, OBJECT_OMAP, out);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " err r =" << r << dendl;
+ return r;
+@@ -2691,11 +2691,18 @@
+ dout(15) << __func__ << " " << c << "/" << hoid << dendl;
+
+ set<string> keys;
+ map<string, bufferlist> got;
++ StripObjectMap::StripObjectHeaderRef header;
++
++ int r = backend->lookup_strip_header(c, hoid, &header);
++ if (r < 0) {
++ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
++ return r;
++ }
+
+ keys.insert(OBJECT_OMAP_HEADER_KEY);
+- int r = backend->get_values(c, hoid, OBJECT_OMAP_HEADER, keys, &got);
++ r = backend->get_values_with_header(header, OBJECT_OMAP_HEADER, keys, &got);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " err r =" << r << dendl;
+ return r;
+ }
+@@ -2711,9 +2718,16 @@
+ int KeyValueStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set<string> *keys)
+ {
+ dout(15) << __func__ << " " << c << "/" << hoid << dendl;
+
+- int r = backend->get_keys(c, hoid, OBJECT_OMAP, keys);
++ StripObjectMap::StripObjectHeaderRef header;
++ int r = backend->lookup_strip_header(c, hoid, &header);
++ if (r < 0) {
++ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
++ return r;
++ }
++
++ r = backend->get_keys_with_header(header, OBJECT_OMAP, keys);
+ if (r < 0 && r != -ENOENT) {
+ return r;
+ }
+ return 0;
+@@ -2724,9 +2738,16 @@
+ map<string, bufferlist> *out)
+ {
+ dout(15) << __func__ << " " << c << "/" << hoid << dendl;
+
+- int r = backend->get_values(c, hoid, OBJECT_OMAP, keys, out);
++ StripObjectMap::StripObjectHeaderRef header;
++ int r = backend->lookup_strip_header(c, hoid, &header);
++ if (r < 0) {
++ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
++ return r;
++ }
++
++ r = backend->get_values_with_header(header, OBJECT_OMAP, keys, out);
+ if (r < 0 && r != -ENOENT) {
+ return r;
+ }
+ return 0;
+@@ -2755,9 +2776,9 @@
+ BufferTransaction &t)
+ {
+ dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
+
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ int r = t.lookup_cached_header(cid, hoid, &header, false);
+ if (r < 0) {
+ dout(10) << __func__ << " " << cid << "/" << hoid << " "
+@@ -2765,29 +2786,29 @@
+ return r;
+ }
+
+ set<string> keys;
+- r = backend->get_keys_with_header(*header, OBJECT_OMAP, &keys);
++ r = backend->get_keys_with_header(header, OBJECT_OMAP, &keys);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_keys r = " << r << dendl;
+ return r;
+ }
+
+- r = t.remove_buffer_keys(*header, OBJECT_OMAP, keys);
++ r = t.remove_buffer_keys(header, OBJECT_OMAP, keys);
+ if (r < 0) {
+ dout(10) << __func__ << " could not remove keys r = " << r << dendl;
+ return r;
+ }
+
+ keys.clear();
+ keys.insert(OBJECT_OMAP_HEADER_KEY);
+- r = t.remove_buffer_keys(*header, OBJECT_OMAP_HEADER, keys);
++ r = t.remove_buffer_keys(header, OBJECT_OMAP_HEADER, keys);
+ if (r < 0) {
+ dout(10) << __func__ << " could not remove keys r = " << r << dendl;
+ return r;
+ }
+
+- t.clear_buffer_keys(*header, OBJECT_OMAP_HEADER);
++ t.clear_buffer_keys(header, OBJECT_OMAP_HEADER);
+
+ dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl;
+ return 0;
+ }
+@@ -2797,18 +2818,18 @@
+ BufferTransaction &t)
+ {
+ dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
+
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ int r = t.lookup_cached_header(cid, hoid, &header, false);
+ if (r < 0) {
+ dout(10) << __func__ << " " << cid << "/" << hoid << " "
+ << " failed to get header: r = " << r << dendl;
+ return r;
+ }
+
+- t.set_buffer_keys(*header, OBJECT_OMAP, aset);
++ t.set_buffer_keys(header, OBJECT_OMAP, aset);
+
+ return 0;
+ }
+
+@@ -2817,18 +2838,18 @@
+ BufferTransaction &t)
+ {
+ dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
+
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ int r = t.lookup_cached_header(cid, hoid, &header, false);
+ if (r < 0) {
+ dout(10) << __func__ << " " << cid << "/" << hoid << " "
+ << " failed to get header: r = " << r << dendl;
+ return r;
+ }
+
+- r = t.remove_buffer_keys(*header, OBJECT_OMAP, keys);
++ r = t.remove_buffer_keys(header, OBJECT_OMAP, keys);
+
+ dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl;
+ return r;
+ }
+@@ -2860,9 +2881,9 @@
+ {
+ dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
+
+ map<string, bufferlist> sets;
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ int r = t.lookup_cached_header(cid, hoid, &header, false);
+ if (r < 0) {
+ dout(10) << __func__ << " " << cid << "/" << hoid << " "
+@@ -2870,9 +2891,9 @@
+ return r;
+ }
+
+ sets[OBJECT_OMAP_HEADER_KEY] = bl;
+- t.set_buffer_keys(*header, OBJECT_OMAP_HEADER, sets);
++ t.set_buffer_keys(header, OBJECT_OMAP_HEADER, sets);
+ return 0;
+ }
+
+ int KeyValueStore::_split_collection(coll_t cid, uint32_t bits, uint32_t rem,
+@@ -2880,9 +2901,9 @@
+ {
+ {
+ dout(15) << __func__ << " " << cid << " bits: " << bits << dendl;
+
+- StripObjectMap::StripObjectHeader *header;
++ StripObjectMap::StripObjectHeaderRef header;
+
+ int r = t.lookup_cached_header(get_coll_for_coll(),
+ make_ghobject_for_coll(cid),
+ &header, false);
+--- a/src/os/KeyValueStore.h
++++ b/src/os/KeyValueStore.h
+@@ -35,10 +35,10 @@
+ #include "common/fd.h"
+
+ #include "common/Mutex.h"
+ #include "GenericObjectMap.h"
+-#include "SequencerPosition.h"
+ #include "KeyValueDB.h"
++#include "common/random_cache.hpp"
+
+ #include "include/uuid.h"
+
+ enum kvstore_types {
+@@ -47,8 +47,10 @@
+ KV_TYPE_OTHER
+ };
+
+
++static uint64_t default_strip_size = 1024;
++
+ class StripObjectMap: public GenericObjectMap {
+ public:
+
+ struct StripExtent {
+@@ -64,9 +66,8 @@
+ // Persistent state
+ uint64_t strip_size;
+ uint64_t max_size;
+ vector<char> bits;
+- SequencerPosition spos;
+
+ // soft state
+ Header header; // FIXME: Hold lock to avoid concurrent operations, it will
+ // also block read operation which not should be permitted.
+@@ -81,67 +82,66 @@
+ ENCODE_START(1, 1, bl);
+ ::encode(strip_size, bl);
+ ::encode(max_size, bl);
+ ::encode(bits, bl);
+- ::encode(spos, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::iterator &bl) {
+ DECODE_START(1, bl);
+ ::decode(strip_size, bl);
+ ::decode(max_size, bl);
+ ::decode(bits, bl);
+- ::decode(spos, bl);
+ DECODE_FINISH(bl);
+ }
+ };
+-
+- bool check_spos(const StripObjectHeader &header,
+- const SequencerPosition &spos);
+- void sync_wrap(StripObjectHeader &strip_header, KeyValueDB::Transaction t,
+- const SequencerPosition &spos);
++ typedef ceph::shared_ptr<StripObjectHeader> StripObjectHeaderRef;
+
+ static int file_to_extents(uint64_t offset, size_t len, uint64_t strip_size,
+ vector<StripExtent> &extents);
+ int lookup_strip_header(const coll_t & cid, const ghobject_t &oid,
+- StripObjectHeader &header);
+- int save_strip_header(StripObjectHeader &header,
+- const SequencerPosition &spos,
+- KeyValueDB::Transaction t);
++ StripObjectHeaderRef *header);
++ int save_strip_header(StripObjectHeaderRef header, KeyValueDB::Transaction t);
+ int create_strip_header(const coll_t &cid, const ghobject_t &oid,
+- StripObjectHeader &strip_header,
++ StripObjectHeaderRef *strip_header,
+ KeyValueDB::Transaction t);
+- void clone_wrap(StripObjectHeader &old_header,
++ void clone_wrap(StripObjectHeaderRef old_header,
+ const coll_t &cid, const ghobject_t &oid,
+ KeyValueDB::Transaction t,
+- StripObjectHeader *origin_header,
+- StripObjectHeader *target_header);
+- void rename_wrap(const coll_t &cid, const ghobject_t &oid,
++ StripObjectHeaderRef *target_header);
++ void rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid,
+ KeyValueDB::Transaction t,
+- StripObjectHeader *header);
++ StripObjectHeaderRef *new_header);
+ // Already hold header to avoid lock header seq again
+ int get_with_header(
+- const StripObjectHeader &header,
++ const StripObjectHeaderRef header,
+ const string &prefix,
+ map<string, bufferlist> *out
+ );
+
+ int get_values_with_header(
+- const StripObjectHeader &header,
++ const StripObjectHeaderRef header,
+ const string &prefix,
+ const set<string> &keys,
+ map<string, bufferlist> *out
+ );
+ int get_keys_with_header(
+- const StripObjectHeader &header,
++ const StripObjectHeaderRef header,
+ const string &prefix,
+ set<string> *keys
+ );
+
+- StripObjectMap(KeyValueDB *db): GenericObjectMap(db) {}
++ Mutex lock;
++ void invalidate_cache(const coll_t &c, const ghobject_t &oid) {
++ Mutex::Locker l(lock);
++ caches.clear(oid);
++ }
+
+- static const uint64_t default_strip_size = 1024;
++ RandomCache<ghobject_t, pair<coll_t, StripObjectHeaderRef> > caches;
++ StripObjectMap(KeyValueDB *db): GenericObjectMap(db),
++ lock("StripObjectMap::lock"),
++ caches(g_conf->keyvaluestore_header_cache_size)
++ {}
+ };
+
+
+ class KeyValueStore : public ObjectStore,
+@@ -160,9 +160,9 @@
+ std::string current_fn;
+ std::string current_op_seq_fn;
+ uuid_d fsid;
+
+- int fsid_fd, op_fd, current_fd;
++ int fsid_fd, current_fd;
+
+ enum kvstore_types kv_type;
+
+ deque<uint64_t> snaps;
+@@ -209,41 +209,51 @@
+ // 3. Object modify(including omap, xattr)
+ // 4. Clone or rename
+ struct BufferTransaction {
+ typedef pair<coll_t, ghobject_t> uniq_id;
+- typedef map<uniq_id, StripObjectMap::StripObjectHeader> StripHeaderMap;
++ typedef map<uniq_id, StripObjectMap::StripObjectHeaderRef> StripHeaderMap;
+
+ //Dirty records
+ StripHeaderMap strip_headers;
++ list<Context*> finishes;
+
+ KeyValueStore *store;
+
+- SequencerPosition spos;
+ KeyValueDB::Transaction t;
+
+ int lookup_cached_header(const coll_t &cid, const ghobject_t &oid,
+- StripObjectMap::StripObjectHeader **strip_header,
++ StripObjectMap::StripObjectHeaderRef *strip_header,
+ bool create_if_missing);
+- int get_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
++ int get_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
+ const string &prefix, const set<string> &keys,
+ map<string, bufferlist> *out);
+- void set_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
++ void set_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
+ const string &prefix, map<string, bufferlist> &bl);
+- int remove_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
++ int remove_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
+ const string &prefix, const set<string> &keys);
+- void clear_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
++ void clear_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
+ const string &prefix);
+- int clear_buffer(StripObjectMap::StripObjectHeader &strip_header);
+- void clone_buffer(StripObjectMap::StripObjectHeader &old_header,
++ int clear_buffer(StripObjectMap::StripObjectHeaderRef strip_header);
++ void clone_buffer(StripObjectMap::StripObjectHeaderRef old_header,
+ const coll_t &cid, const ghobject_t &oid);
+- void rename_buffer(StripObjectMap::StripObjectHeader &old_header,
++ void rename_buffer(StripObjectMap::StripObjectHeaderRef old_header,
+ const coll_t &cid, const ghobject_t &oid);
+ int submit_transaction();
+
+- BufferTransaction(KeyValueStore *store,
+- SequencerPosition &spos): store(store), spos(spos) {
++ BufferTransaction(KeyValueStore *store): store(store) {
+ t = store->backend->get_transaction();
+ }
++
++ struct InvalidateCacheContext : public Context {
++ KeyValueStore *store;
++ const coll_t cid;
++ const ghobject_t oid;
++ InvalidateCacheContext(KeyValueStore *s, const coll_t &c, const ghobject_t &oid): store(s), cid(c), oid(oid) {}
++ void finish(int r) {
++ if (r == 0)
++ store->backend->invalidate_cache(cid, oid);
++ }
++ };
+ };
+
+ // -- op workqueue --
+ struct Op {
+@@ -256,52 +266,111 @@
+ };
+ class OpSequencer : public Sequencer_impl {
+ Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
+ list<Op*> q;
+- list<uint64_t> jq;
+ Cond cond;
++ list<pair<uint64_t, Context*> > flush_commit_waiters;
++ uint64_t op; // used by flush() to know the sequence of op
+ public:
+ Sequencer *parent;
+ Mutex apply_lock; // for apply mutual exclusion
++
++ /// get_max_uncompleted
++ bool _get_max_uncompleted(
++ uint64_t *seq ///< [out] max uncompleted seq
++ ) {
++ assert(qlock.is_locked());
++ assert(seq);
++ *seq = 0;
++ if (q.empty()) {
++ return true;
++ } else {
++ *seq = q.back()->op;
++ return false;
++ }
++ } /// @returns true if the queue is empty
++
++ /// get_min_uncompleted
++ bool _get_min_uncompleted(
++ uint64_t *seq ///< [out] min uncompleted seq
++ ) {
++ assert(qlock.is_locked());
++ assert(seq);
++ *seq = 0;
++ if (q.empty()) {
++ return true;
++ } else {
++ *seq = q.front()->op;
++ return false;
++ }
++ } /// @returns true if the queue is empty
++
++ void _wake_flush_waiters(list<Context*> *to_queue) {
++ uint64_t seq;
++ if (_get_min_uncompleted(&seq))
++ seq = -1;
++
++ for (list<pair<uint64_t, Context*> >::iterator i =
++ flush_commit_waiters.begin();
++ i != flush_commit_waiters.end() && i->first < seq;
++ flush_commit_waiters.erase(i++)) {
++ to_queue->push_back(i->second);
++ }
++ }
+
+ void queue(Op *o) {
+ Mutex::Locker l(qlock);
+ q.push_back(o);
++ op++;
++ o->op = op;
+ }
+ Op *peek_queue() {
+ assert(apply_lock.is_locked());
+ return q.front();
+ }
+- Op *dequeue() {
++
++ Op *dequeue(list<Context*> *to_queue) {
++ assert(to_queue);
+ assert(apply_lock.is_locked());
+ Mutex::Locker l(qlock);
+ Op *o = q.front();
+ q.pop_front();
+ cond.Signal();
++
++ _wake_flush_waiters(to_queue);
+ return o;
+ }
++
+ void flush() {
+ Mutex::Locker l(qlock);
+
+ // get max for journal _or_ op queues
+ uint64_t seq = 0;
+ if (!q.empty())
+ seq = q.back()->op;
+- if (!jq.empty() && jq.back() > seq)
+- seq = jq.back();
+
+ if (seq) {
+ // everything prior to our watermark to drain through either/both
+ // queues
+- while ((!q.empty() && q.front()->op <= seq) ||
+- (!jq.empty() && jq.front() <= seq))
++ while (!q.empty() && q.front()->op <= seq)
+ cond.Wait(qlock);
+ }
+ }
++ bool flush_commit(Context *c) {
++ Mutex::Locker l(qlock);
++ uint64_t seq = 0;
++ if (_get_max_uncompleted(&seq)) {
++ delete c;
++ return true;
++ } else {
++ flush_commit_waiters.push_back(make_pair(seq, c));
++ return false;
++ }
++ }
+
+ OpSequencer()
+ : qlock("KeyValueStore::OpSequencer::qlock", false, false),
+- parent(0),
++ op(0), parent(0),
+ apply_lock("KeyValueStore::OpSequencer::apply_lock", false, false) {}
+ ~OpSequencer() {
+ assert(q.empty());
+ }
+@@ -416,9 +485,8 @@
+ return _do_transactions(tls, op_seq, 0);
+ }
+ unsigned _do_transaction(Transaction& transaction,
+ BufferTransaction &bt,
+- SequencerPosition& spos,
+ ThreadPool::TPHandle *handle);
+
+ int queue_transactions(Sequencer *osr, list<Transaction*>& tls,
+ TrackedOpRef op = TrackedOpRef(),
+@@ -427,12 +495,12 @@
+
+ // ------------------
+ // objects
+
+- int _generic_read(StripObjectMap::StripObjectHeader &header,
++ int _generic_read(StripObjectMap::StripObjectHeaderRef header,
+ uint64_t offset, size_t len, bufferlist& bl,
+ bool allow_eio = false, BufferTransaction *bt = 0);
+- int _generic_write(StripObjectMap::StripObjectHeader &header,
++ int _generic_write(StripObjectMap::StripObjectHeaderRef header,
+ uint64_t offset, size_t len, const bufferlist& bl,
+ BufferTransaction &t, bool replica = false);
+
+ bool exists(coll_t cid, const ghobject_t& oid);
+@@ -571,28 +639,8 @@
+ static const string OBJECT_OMAP_HEADER_KEY;
+ static const string COLLECTION;
+ static const string COLLECTION_ATTR;
+ static const uint32_t COLLECTION_VERSION = 1;
+-
+- class SubmitManager {
+- Mutex lock;
+- uint64_t op_seq;
+- uint64_t op_submitted;
+- public:
+- SubmitManager() :
+- lock("JOS::SubmitManager::lock", false, true, false, g_ceph_context),
+- op_seq(0), op_submitted(0)
+- {}
+- uint64_t op_submit_start();
+- void op_submit_finish(uint64_t op);
+- void set_op_seq(uint64_t seq) {
+- Mutex::Locker l(lock);
+- op_submitted = op_seq = seq;
+- }
+- uint64_t get_op_seq() {
+- return op_seq;
+- }
+- } submit_manager;
+ };
+
+ WRITE_CLASS_ENCODER(StripObjectMap::StripObjectHeader)
+
+--- a/src/os/LFNIndex.cc
++++ b/src/os/LFNIndex.cc
+@@ -60,8 +60,19 @@
+ ++current_failure;
+ }
+ }
+
++// Helper to close fd's when we leave scope. This is useful when used
++// in combination with RetryException, thrown by the above.
++struct FDCloser {
++ int fd;
++ FDCloser(int f) : fd(f) {}
++ ~FDCloser() {
++ VOID_TEMP_FAILURE_RETRY(::close(fd));
++ }
++};
++
++
+ /* Public methods */
+
+ void LFNIndex::set_ref(ceph::shared_ptr<CollectionIndex> ref)
+ {
+@@ -159,11 +170,11 @@
+ maybe_inject_failure();
+ int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY);
+ if (fd < 0)
+ return -errno;
++ FDCloser f(fd);
+ maybe_inject_failure();
+ int r = ::fsync(fd);
+- VOID_TEMP_FAILURE_RETRY(::close(fd));
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ else
+@@ -752,9 +763,10 @@
+ char buf[FILENAME_MAX_LEN + 1];
+ for ( ; ; ++i) {
+ candidate = lfn_get_short_name(oid, i);
+ candidate_path = get_full_path(path, candidate);
+- r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(), buf, sizeof(buf));
++ r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(),
++ buf, sizeof(buf));
+ if (r < 0) {
+ if (errno != ENODATA && errno != ENOENT)
+ return -errno;
+ if (errno == ENODATA) {
+@@ -783,8 +795,40 @@
+ if (exists)
+ *exists = 1;
+ return 0;
+ }
++ r = chain_getxattr(candidate_path.c_str(), get_alt_lfn_attr().c_str(),
++ buf, sizeof(buf));
++ if (r > 0) {
++ // only consider alt name if nlink > 1
++ struct stat st;
++ int rc = ::stat(candidate_path.c_str(), &st);
++ if (rc < 0)
++ return -errno;
++ if (st.st_nlink <= 1) {
++ // left over from incomplete unlink, remove
++ maybe_inject_failure();
++ dout(20) << __func__ << " found extra alt attr for " << candidate_path
++ << ", long name " << string(buf, r) << dendl;
++ rc = chain_removexattr(candidate_path.c_str(),
++ get_alt_lfn_attr().c_str());
++ maybe_inject_failure();
++ if (rc < 0)
++ return rc;
++ continue;
++ }
++ buf[MIN((int)sizeof(buf) - 1, r)] = '\0';
++ if (!strcmp(buf, full_name.c_str())) {
++ dout(20) << __func__ << " used alt attr for " << full_name << dendl;
++ if (mangled_name)
++ *mangled_name = candidate;
++ if (out_path)
++ *out_path = candidate_path;
++ if (exists)
++ *exists = 1;
++ return 0;
++ }
++ }
+ }
+ assert(0); // Unreachable
+ return 0;
+ }
+@@ -797,9 +841,26 @@
+ return 0;
+ string full_path = get_full_path(path, mangled_name);
+ string full_name = lfn_generate_object_name(oid);
+ maybe_inject_failure();
+- return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
++
++ // if the main attr exists and is different, move it to the alt attr.
++ char buf[FILENAME_MAX_LEN + 1];
++ int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(),
++ buf, sizeof(buf));
++ if (r >= 0 && (r != (int)full_name.length() ||
++ memcmp(buf, full_name.c_str(), full_name.length()))) {
++ dout(20) << __func__ << " " << mangled_name
++ << " moving old name to alt attr "
++ << string(buf, r)
++ << ", new name is " << full_name << dendl;
++ r = chain_setxattr(full_path.c_str(), get_alt_lfn_attr().c_str(),
++ buf, r);
++ if (r < 0)
++ return r;
++ }
++
++ return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
+ full_name.c_str(), full_name.size());
+ }
+
+ int LFNIndex::lfn_unlink(const vector<string> &path,
+@@ -838,28 +899,37 @@
+ return -errno;
+ }
+ }
+ }
++ string full_path = get_full_path(path, mangled_name);
++ int fd = ::open(full_path.c_str(), O_RDONLY);
++ if (fd < 0)
++ return -errno;
++ FDCloser f(fd);
+ if (i == removed_index + 1) {
+- string full_path = get_full_path(path, mangled_name);
+ maybe_inject_failure();
+ int r = ::unlink(full_path.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+- else
+- return 0;
+ } else {
+- string rename_to = get_full_path(path, mangled_name);
++ string& rename_to = full_path;
+ string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1));
+ maybe_inject_failure();
+ int r = ::rename(rename_from.c_str(), rename_to.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+- else
+- return 0;
+ }
++ struct stat st;
++ int r = ::fstat(fd, &st);
++ if (r == 0 && st.st_nlink > 0) {
++ // remove alt attr
++ dout(20) << __func__ << " removing alt attr from " << full_path << dendl;
++ fsync_dir(path);
++ chain_fremovexattr(fd, get_alt_lfn_attr().c_str());
++ }
++ return r;
+ }
+
+ int LFNIndex::lfn_translate(const vector<string> &path,
+ const string &short_name,
+--- a/src/os/LFNIndex.h
++++ b/src/os/LFNIndex.h
+@@ -122,9 +122,9 @@
+ error_injection_enabled = false;
+ }
+
+ private:
+- string lfn_attribute;
++ string lfn_attribute, lfn_alt_attribute;
+ coll_t collection;
+
+ public:
+ /// Constructor
+@@ -145,9 +145,10 @@
+ } else {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d", index_version);
+ lfn_attribute = LFN_ATTR + string(buf);
+- }
++ lfn_alt_attribute = LFN_ATTR + string(buf) + "-alt";
++ }
+ }
+
+ coll_t coll() const { return collection; }
+
+@@ -422,8 +423,11 @@
+ */
+ const string &get_lfn_attr() const {
+ return lfn_attribute;
+ }
++ const string &get_alt_lfn_attr() const {
++ return lfn_alt_attribute;
++ }
+
+ /**
+ * Gets the filename corresponsing to oid in path.
+ *
+--- a/src/os/MemStore.cc
++++ b/src/os/MemStore.cc
+@@ -949,9 +949,14 @@
+ }
+ break;
+
+ case Transaction::OP_SETALLOCHINT:
+- // nop
++ {
++ coll_t cid(i.get_cid());
++ ghobject_t oid = i.get_oid();
++ (void)i.get_length(); // discard result
++ (void)i.get_length(); // discard result
++ }
+ break;
+
+ default:
+ derr << "bad op " << op << dendl;
+--- a/src/os/ObjectStore.cc
++++ b/src/os/ObjectStore.cc
+@@ -143,9 +143,13 @@
+ int ObjectStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
+ snapid_t seq, vector<hobject_t> *ls)
+ {
+ vector<ghobject_t> go;
+- ghobject_t gstart(start), gend(end);
++ // Starts with the smallest shard id and generation to
++ // make sure the result list has the marker object
++ ghobject_t gstart(start, 0, shard_id_t(0));
++ // Exclusive end, choose the smallest end ghobject
++ ghobject_t gend(end, 0, shard_id_t(0));
+ int ret = collection_list_range(c, gstart, gend, seq, &go);
+ if (ret == 0) {
+ ls->reserve(go.size());
+ for (vector<ghobject_t>::iterator i = go.begin(); i != go.end() ; ++i)
+--- a/src/os/ObjectStore.h
++++ b/src/os/ObjectStore.h
+@@ -127,8 +127,24 @@
+ * created in ...::queue_transaction(s)
+ */
+ struct Sequencer_impl {
+ virtual void flush() = 0;
++
++ /**
++ * Async flush_commit
++ *
++ * There are two cases:
++ * 1) sequencer is currently idle: the method returns true and
++ * c is deleted
++ * 2) sequencer is not idle: the method returns false and c is
++ * called asyncronously with a value of 0 once all transactions
++ * queued on this sequencer prior to the call have been applied
++ * and committed.
++ */
++ virtual bool flush_commit(
++ Context *c ///< [in] context to call upon flush/commit
++ ) = 0; ///< @return true if idle, false otherwise
++
+ virtual ~Sequencer_impl() {}
+ };
+
+ /**
+@@ -152,8 +168,18 @@
+ void flush() {
+ if (p)
+ p->flush();
+ }
++
++ /// @see Sequencer_impl::flush_commit()
++ bool flush_commit(Context *c) {
++ if (!p) {
++ delete c;
++ return true;
++ } else {
++ return p->flush_commit(c);
++ }
++ }
+ };
+
+ /*********************************
+ *
+--- a/src/osd/ECBackend.cc
++++ b/src/osd/ECBackend.cc
+@@ -104,15 +104,15 @@
+ }
+
+ void ECBackend::ReadOp::dump(Formatter *f) const
+ {
+- f->dump_stream("tid") << tid;
++ f->dump_unsigned("tid", tid);
+ if (op && op->get_req()) {
+ f->dump_stream("op") << *(op->get_req());
+ }
+ f->dump_stream("to_read") << to_read;
+ f->dump_stream("complete") << complete;
+- f->dump_stream("priority") << priority;
++ f->dump_int("priority", priority);
+ f->dump_stream("obj_to_source") << obj_to_source;
+ f->dump_stream("source_to_obj") << source_to_obj;
+ f->dump_stream("in_progress") << in_progress;
+ }
+@@ -157,9 +157,9 @@
+ f->dump_stream("missing_on") << missing_on;
+ f->dump_stream("missing_on_shards") << missing_on_shards;
+ f->dump_stream("recovery_info") << recovery_info;
+ f->dump_stream("recovery_progress") << recovery_progress;
+- f->dump_stream("pending_read") << pending_read;
++ f->dump_bool("pending_read", pending_read);
+ f->dump_stream("state") << tostr(state);
+ f->dump_stream("waiting_on_pushes") << waiting_on_pushes;
+ f->dump_stream("extent_requested") << extent_requested;
+ }
+@@ -828,8 +828,9 @@
+ get_parent()->log_operation(
+ op.log_entries,
+ op.updated_hit_set_history,
+ op.trim_to,
++ op.trim_rollback_to,
+ !(op.t.empty()),
+ localt);
+ localt->append(op.t);
+ if (on_local_applied_sync) {
+@@ -1210,8 +1211,9 @@
+ const hobject_t &hoid,
+ const eversion_t &at_version,
+ PGTransaction *_t,
+ const eversion_t &trim_to,
++ const eversion_t &trim_rollback_to,
+ vector<pg_log_entry_t> &log_entries,
+ boost::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_local_applied_sync,
+ Context *on_all_applied,
+@@ -1225,8 +1227,9 @@
+ Op *op = &(tid_to_op_map[tid]);
+ op->hoid = hoid;
+ op->version = at_version;
+ op->trim_to = trim_to;
++ op->trim_rollback_to = trim_rollback_to;
+ op->log_entries.swap(log_entries);
+ std::swap(op->updated_hit_set_history, hset_history);
+ op->on_local_applied_sync = on_local_applied_sync;
+ op->on_all_applied = on_all_applied;
+@@ -1531,8 +1534,9 @@
+ stats,
+ should_send ? iter->second : ObjectStore::Transaction(),
+ op->version,
+ op->trim_to,
++ op->trim_rollback_to,
+ op->log_entries,
+ op->updated_hit_set_history,
+ op->temp_added,
+ op->temp_cleared);
+--- a/src/osd/ECBackend.h
++++ b/src/osd/ECBackend.h
+@@ -96,8 +96,9 @@
+ const hobject_t &hoid,
+ const eversion_t &at_version,
+ PGTransaction *t,
+ const eversion_t &trim_to,
++ const eversion_t &trim_rollback_to,
+ vector<pg_log_entry_t> &log_entries,
+ boost::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_local_applied_sync,
+ Context *on_all_applied,
+@@ -325,8 +326,9 @@
+ struct Op {
+ hobject_t hoid;
+ eversion_t version;
+ eversion_t trim_to;
++ eversion_t trim_rollback_to;
+ vector<pg_log_entry_t> log_entries;
+ boost::optional<pg_hit_set_history_t> updated_hit_set_history;
+ Context *on_local_applied_sync;
+ Context *on_all_applied;
+--- a/src/osd/ECMsgTypes.cc
++++ b/src/osd/ECMsgTypes.cc
+@@ -15,9 +15,9 @@
+ #include "ECMsgTypes.h"
+
+ void ECSubWrite::encode(bufferlist &bl) const
+ {
+- ENCODE_START(2, 1, bl);
++ ENCODE_START(3, 1, bl);
+ ::encode(from, bl);
+ ::encode(tid, bl);
+ ::encode(reqid, bl);
+ ::encode(soid, bl);
+@@ -28,14 +28,15 @@
+ ::encode(log_entries, bl);
+ ::encode(temp_added, bl);
+ ::encode(temp_removed, bl);
+ ::encode(updated_hit_set_history, bl);
++ ::encode(trim_rollback_to, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void ECSubWrite::decode(bufferlist::iterator &bl)
+ {
+- DECODE_START(2, bl);
++ DECODE_START(3, bl);
+ ::decode(from, bl);
+ ::decode(tid, bl);
+ ::decode(reqid, bl);
+ ::decode(soid, bl);
+@@ -48,8 +49,13 @@
+ ::decode(temp_removed, bl);
+ if (struct_v >= 2) {
+ ::decode(updated_hit_set_history, bl);
+ }
++ if (struct_v >= 3) {
++ ::decode(trim_rollback_to, bl);
++ } else {
++ trim_rollback_to = trim_to;
++ }
+ DECODE_FINISH(bl);
+ }
+
+ std::ostream &operator<<(
+@@ -57,20 +63,22 @@
+ {
+ lhs << "ECSubWrite(tid=" << rhs.tid
+ << ", reqid=" << rhs.reqid
+ << ", at_version=" << rhs.at_version
+- << ", trim_to=" << rhs.trim_to;
++ << ", trim_to=" << rhs.trim_to
++ << ", trim_rollback_to=" << rhs.trim_rollback_to;
+ if (rhs.updated_hit_set_history)
+ lhs << ", has_updated_hit_set_history";
+ return lhs << ")";
+ }
+
+ void ECSubWrite::dump(Formatter *f) const
+ {
+- f->dump_stream("tid") << tid;
++ f->dump_unsigned("tid", tid);
+ f->dump_stream("reqid") << reqid;
+ f->dump_stream("at_version") << at_version;
+ f->dump_stream("trim_to") << trim_to;
++ f->dump_stream("trim_rollback_to") << trim_rollback_to;
+ f->dump_stream("has_updated_hit_set_history")
+ << static_cast<bool>(updated_hit_set_history);
+ }
+
+@@ -84,8 +92,14 @@
+ o.back()->tid = 4;
+ o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
+ o.back()->at_version = eversion_t(10, 300);
+ o.back()->trim_to = eversion_t(5, 42);
++ o.push_back(new ECSubWrite());
++ o.back()->tid = 9;
++ o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
++ o.back()->at_version = eversion_t(10, 300);
++ o.back()->trim_to = eversion_t(5, 42);
++ o.back()->trim_rollback_to = eversion_t(8, 250);
+ }
+
+ void ECSubWriteReply::encode(bufferlist &bl) const
+ {
+@@ -120,9 +134,9 @@
+ }
+
+ void ECSubWriteReply::dump(Formatter *f) const
+ {
+- f->dump_stream("tid") << tid;
++ f->dump_unsigned("tid", tid);
+ f->dump_stream("last_complete") << last_complete;
+ f->dump_stream("committed") << committed;
+ f->dump_stream("applied") << applied;
+ }
+@@ -170,9 +184,9 @@
+
+ void ECSubRead::dump(Formatter *f) const
+ {
+ f->dump_stream("from") << from;
+- f->dump_stream("tid") << tid;
++ f->dump_unsigned("tid", tid);
+ f->open_array_section("objects");
+ for (map<hobject_t, list<pair<uint64_t, uint64_t> > >::const_iterator i =
+ to_read.begin();
+ i != to_read.end();
+@@ -258,9 +272,9 @@
+
+ void ECSubReadReply::dump(Formatter *f) const
+ {
+ f->dump_stream("from") << from;
+- f->dump_stream("tid") << tid;
++ f->dump_unsigned("tid", tid);
+ f->open_array_section("buffers_read");
+ for (map<hobject_t, list<pair<uint64_t, bufferlist> > >::const_iterator i =
+ buffers_read.begin();
+ i != buffers_read.end();
+--- a/src/osd/ECMsgTypes.h
++++ b/src/osd/ECMsgTypes.h
+@@ -27,8 +27,9 @@
+ pg_stat_t stats;
+ ObjectStore::Transaction t;
+ eversion_t at_version;
+ eversion_t trim_to;
++ eversion_t trim_rollback_to;
+ vector<pg_log_entry_t> log_entries;
+ set<hobject_t> temp_added;
+ set<hobject_t> temp_removed;
+ boost::optional<pg_hit_set_history_t> updated_hit_set_history;
+@@ -41,16 +42,18 @@
+ const pg_stat_t &stats,
+ const ObjectStore::Transaction &t,
+ eversion_t at_version,
+ eversion_t trim_to,
++ eversion_t trim_rollback_to,
+ vector<pg_log_entry_t> log_entries,
+ boost::optional<pg_hit_set_history_t> updated_hit_set_history,
+ const set<hobject_t> &temp_added,
+ const set<hobject_t> &temp_removed)
+ : from(from), tid(tid), reqid(reqid),
+ soid(soid), stats(stats), t(t),
+ at_version(at_version),
+- trim_to(trim_to), log_entries(log_entries),
++ trim_to(trim_to), trim_rollback_to(trim_rollback_to),
++ log_entries(log_entries),
+ temp_added(temp_added),
+ temp_removed(temp_removed),
+ updated_hit_set_history(updated_hit_set_history) {}
+ void encode(bufferlist &bl) const;
+--- a/src/osd/HitSet.h
++++ b/src/osd/HitSet.h
+@@ -368,9 +368,9 @@
+ double get_fpp() const {
+ return (double)fpp_micro / 1000000.0;
+ }
+ void set_fpp(double f) {
+- fpp_micro = (unsigned)(f * 1000000.0);
++ fpp_micro = (unsigned)(llrintl(f * (double)1000000.0));
+ }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+--- a/src/osd/OSD.cc
++++ b/src/osd/OSD.cc
+@@ -41,8 +41,9 @@
+ #include "osdc/Objecter.h"
+
+ #include "common/ceph_argparse.h"
+ #include "common/version.h"
++#include "common/io_priority.h"
+
+ #include "os/ObjectStore.h"
+
+ #include "ReplicatedPG.h"
+@@ -190,8 +191,9 @@
+ rep_scrub_wq(osd->rep_scrub_wq),
+ push_wq("push_wq", cct->_conf->osd_recovery_thread_timeout, &osd->recovery_tp),
+ gen_wq("gen_wq", cct->_conf->osd_recovery_thread_timeout, &osd->recovery_tp),
+ class_handler(osd->class_handler),
++ pg_epoch_lock("OSDService::pg_epoch_lock"),
+ publish_lock("OSDService::publish_lock"),
+ pre_publish_lock("OSDService::pre_publish_lock"),
+ sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
+ scrubs_active(0),
+@@ -1276,8 +1278,10 @@
+ recovery_tp.start();
+ disk_tp.start();
+ command_tp.start();
+
++ set_disk_tp_priority();
++
+ // start the heartbeat
+ heartbeat_thread.create();
+
+ // tick
+@@ -1304,8 +1308,10 @@
+ osd_lock.Lock();
+ if (is_stopping())
+ return 0;
+
++ check_config();
++
+ dout(10) << "ensuring pgs have consumed prior maps" << dendl;
+ consume_map();
+ peering_wq.drain();
+
+@@ -1662,10 +1668,12 @@
+ recovery_tp.stop();
+ dout(10) << "recovery tp stopped" << dendl;
+
+ op_tp.drain();
++ peering_wq.clear();
++ scrub_finalize_wq.clear();
+ op_tp.stop();
+- dout(10) << "op tp stopped" << dendl;
++ dout(10) << "osd tp stopped" << dendl;
+
+ command_tp.drain();
+ command_tp.stop();
+ dout(10) << "command tp stopped" << dendl;
+@@ -1707,9 +1715,8 @@
+ Mutex::Locker l(pg_stat_queue_lock);
+ assert(pg_stat_queue.empty());
+ }
+
+- peering_wq.clear();
+ // Remove PGs
+ #ifdef PG_DEBUG_REFS
+ service.dump_live_pgids();
+ #endif
+@@ -1853,8 +1860,10 @@
+ PG* pg = _make_pg(createmap, pgid);
+
+ pg_map[pgid] = pg;
+
++ service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
++
+ pg->lock(no_lockdep_check);
+ pg->get("PGMap"); // because it's in pg_map
+ return pg;
+ }
+@@ -1884,8 +1893,9 @@
+ {
+ epoch_t e(service.get_osdmap()->get_epoch());
+ pg->get("PGMap"); // For pg_map
+ pg_map[pg->info.pgid] = pg;
++ service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
+ dout(10) << "Adding newly split pg " << *pg << dendl;
+ vector<int> up, acting;
+ pg->get_osdmap()->pg_to_up_acting_osds(pg->info.pgid.pgid, up, acting);
+ int role = OSDMap::calc_pg_role(service.whoami, acting);
+@@ -4391,11 +4401,10 @@
+ // 1MB block sizes are big enough so that we get more stuff done.
+ // However, to avoid the osd from getting hung on this and having
+ // timers being triggered, we are going to limit the count assuming
+ // a configurable throughput and duration.
+- int64_t total_throughput =
++ int64_t max_count =
+ g_conf->osd_bench_large_size_max_throughput * duration;
+- int64_t max_count = (int64_t) (total_throughput / bsize);
+ if (count > max_count) {
+ ss << "'count' values greater than " << max_count
+ << " for a block size of " << prettybyte_t(bsize) << ", assuming "
+ << prettybyte_t(g_conf->osd_bench_large_size_max_throughput) << "/s,"
+@@ -5712,13 +5721,14 @@
+ client_messenger->set_default_policy(p);
+ }
+ }
+ {
+- Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_MON);
++ Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
+ uint64_t mask;
+ uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
+ if ((p.features_required & mask) != features) {
+ dout(0) << "crush map has features " << features
++ << " was " << p.features_required
+ << ", adjusting msgr requires for mons" << dendl;
+ p.features_required = (p.features_required & ~mask) | features;
+ client_messenger->set_policy(entity_name_t::TYPE_MON, p);
+ }
+@@ -5747,9 +5757,9 @@
+ }
+ }
+ }
+
+-void OSD::advance_pg(
++bool OSD::advance_pg(
+ epoch_t osd_epoch, PG *pg,
+ ThreadPool::TPHandle &handle,
+ PG::RecoveryCtx *rctx,
+ set<boost::intrusive_ptr<PG> > *new_pgs)
+@@ -5758,13 +5768,21 @@
+ epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
+ OSDMapRef lastmap = pg->get_osdmap();
+
+ if (lastmap->get_epoch() == osd_epoch)
+- return;
++ return true;
+ assert(lastmap->get_epoch() < osd_epoch);
+
++ epoch_t min_epoch = service.get_min_pg_epoch();
++ epoch_t max;
++ if (min_epoch) {
++ max = min_epoch + g_conf->osd_map_max_advance;
++ } else {
++ max = next_epoch + g_conf->osd_map_max_advance;
++ }
++
+ for (;
+- next_epoch <= osd_epoch;
++ next_epoch <= osd_epoch && next_epoch <= max;
+ ++next_epoch) {
+ OSDMapRef nextmap = service.try_get_map(next_epoch);
+ if (!nextmap)
+ continue;
+@@ -5794,9 +5812,17 @@
+
+ lastmap = nextmap;
+ handle.reset_tp_timeout();
+ }
++ service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
+ pg->handle_activate_map(rctx);
++ if (next_epoch <= osd_epoch) {
++ dout(10) << __func__ << " advanced by max " << g_conf->osd_map_max_advance
++ << " past min epoch " << min_epoch
++ << " ... will requeue " << *pg << dendl;
++ return false;
++ }
++ return true;
+ }
+
+ /**
+ * scan placement groups, initiate any replication
+@@ -6126,9 +6152,9 @@
+ }
+ return true;
+ }
+
+-bool OSD::require_osd_peer(OpRequestRef op)
++bool OSD::require_osd_peer(OpRequestRef& op)
+ {
+ if (!op->get_req()->get_connection()->peer_is_osd()) {
+ dout(0) << "require_osd_peer received from non-osd " << op->get_req()->get_connection()->get_peer_addr()
+ << " " << *op->get_req() << dendl;
+@@ -6136,13 +6162,66 @@
+ }
+ return true;
+ }
+
++bool OSD::require_self_aliveness(OpRequestRef& op, epoch_t epoch)
++{
++ if (epoch < up_epoch) {
++ dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
++ return false;
++ }
++
++ if (!is_active()) {
++ dout(7) << "still in boot state, dropping message " << *op->get_req() << dendl;
++ return false;
++ }
++
++ return true;
++}
++
++bool OSD::require_same_peer_instance(OpRequestRef& op, OSDMapRef& map)
++{
++ Message *m = op->get_req();
++ int from = m->get_source().num();
++
++ if (!map->have_inst(from) ||
++ (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
++ dout(5) << "from dead osd." << from << ", marking down, "
++ << " msg was " << m->get_source_inst().addr
++ << " expected " << (map->have_inst(from) ?
++ map->get_cluster_addr(from) : entity_addr_t())
++ << dendl;
++ ConnectionRef con = m->get_connection();
++ cluster_messenger->mark_down(con.get());
++ Session *s = static_cast<Session*>(con->get_priv());
++ if (s) {
++ con->set_priv(NULL); // break ref <-> session cycle, if any
++ s->put();
++ }
++ return false;
++ }
++ return true;
++}
++
++bool OSD::require_up_osd_peer(OpRequestRef& op, OSDMapRef& map,
++ epoch_t their_epoch)
++{
++ if (!require_self_aliveness(op, their_epoch)) {
++ return false;
++ } else if (!require_osd_peer(op)) {
++ return false;
++ } else if (map->get_epoch() >= their_epoch &&
++ !require_same_peer_instance(op, map)) {
++ return false;
++ }
++ return true;
++}
++
+ /*
+ * require that we have same (or newer) map, and that
+ * the source is the pg primary.
+ */
+-bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch)
++bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch)
+ {
+ Message *m = op->get_req();
+ dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
+
+@@ -6154,32 +6233,15 @@
+ wait_for_new_map(op);
+ return false;
+ }
+
+- if (epoch < up_epoch) {
+- dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
++ if (!require_self_aliveness(op, epoch)) {
+ return false;
+ }
+
+ // ok, our map is same or newer.. do they still exist?
+- if (m->get_connection()->get_messenger() == cluster_messenger) {
+- int from = m->get_source().num();
+- if (!osdmap->have_inst(from) ||
+- osdmap->get_cluster_addr(from) != m->get_source_inst().addr) {
+- dout(5) << "from dead osd." << from << ", marking down, "
+- << " msg was " << m->get_source_inst().addr
+- << " expected " << (osdmap->have_inst(from) ? osdmap->get_cluster_addr(from) : entity_addr_t())
+- << dendl;
+- ConnectionRef con = m->get_connection();
+- con->set_priv(NULL); // break ref <-> session cycle, if any
+- cluster_messenger->mark_down(con.get());
+- return false;
+- }
+- }
+-
+- // ok, we have at least as new a map as they do. are we (re)booting?
+- if (!is_active()) {
+- dout(7) << "still in boot state, dropping message " << *m << dendl;
++ if (m->get_connection()->get_messenger() == cluster_messenger &&
++ !require_same_peer_instance(op, osdmap)) {
+ return false;
+ }
+
+ return true;
+@@ -7141,8 +7203,10 @@
+ PGRef(pg))
+ );
+ remove_wq.queue(make_pair(PGRef(pg), deleting));
+
++ service.pg_remove_epoch(pg->info.pgid);
++
+ // remove from map
+ pg_map.erase(pg->info.pgid);
+ pg->put("PGMap"); // since we've taken it out of map
+ }
+@@ -7554,9 +7618,9 @@
+ dout(3) << "replica op from before up" << dendl;
+ return;
+ }
+
+- if (!require_osd_peer(op))
++ if (!require_up_osd_peer(op, osdmap, m->map_epoch))
+ return;
+
+ // must be a rep op.
+ assert(m->get_source().is_osd());
+@@ -7769,10 +7833,11 @@
+ if (pg->deleting) {
+ pg->unlock();
+ continue;
+ }
+- advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs);
+- if (!pg->peering_queue.empty()) {
++ if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
++ pg->queue_null(curmap->get_epoch(), curmap->get_epoch());
++ } else if (!pg->peering_queue.empty()) {
+ PG::CephPeeringEvtRef evt = pg->peering_queue.front();
+ pg->peering_queue.pop_front();
+ pg->handle_peering_event(evt, &rctx);
+ }
+@@ -7807,8 +7872,13 @@
+ static const char* KEYS[] = {
+ "osd_max_backfills",
+ "osd_op_complaint_time", "osd_op_log_threshold",
+ "osd_op_history_size", "osd_op_history_duration",
++ "osd_map_cache_size",
++ "osd_map_max_advance",
++ "osd_pg_epoch_persisted_max_stale",
++ "osd_disk_thread_ioprio_class",
++ "osd_disk_thread_ioprio_priority",
+ NULL
+ };
+ return KEYS;
+ }
+@@ -7829,8 +7899,40 @@
+ changed.count("osd_op_history_duration")) {
+ op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
+ cct->_conf->osd_op_history_duration);
+ }
++ if (changed.count("osd_disk_thread_ioprio_class") ||
++ changed.count("osd_disk_thread_ioprio_priority")) {
++ set_disk_tp_priority();
++ }
++
++ check_config();
++}
++
++void OSD::check_config()
++{
++ // some sanity checks
++ if (g_conf->osd_map_cache_size <= g_conf->osd_map_max_advance + 2) {
++ clog.warn() << "osd_map_cache_size (" << g_conf->osd_map_cache_size << ")"
++ << " is not > osd_map_max_advance ("
++ << g_conf->osd_map_max_advance << ")";
++ }
++ if (g_conf->osd_map_cache_size <= (int)g_conf->osd_pg_epoch_persisted_max_stale + 2) {
++ clog.warn() << "osd_map_cache_size (" << g_conf->osd_map_cache_size << ")"
++ << " is not > osd_pg_epoch_persisted_max_stale ("
++ << g_conf->osd_pg_epoch_persisted_max_stale << ")";
++ }
++}
++
++void OSD::set_disk_tp_priority()
++{
++ dout(10) << __func__
++ << " class " << cct->_conf->osd_disk_thread_ioprio_class
++ << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
++ << dendl;
++ int cls =
++ ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
++ disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
+ }
+
+ // --------------------------------
+
+--- a/src/osd/OSD.h
++++ b/src/osd/OSD.h
+@@ -333,8 +333,44 @@
+ ClassHandler *&class_handler;
+
+ void dequeue_pg(PG *pg, list<OpRequestRef> *dequeued);
+
++ // -- map epoch lower bound --
++ Mutex pg_epoch_lock;
++ multiset<epoch_t> pg_epochs;
++ map<spg_t,epoch_t> pg_epoch;
++
++ void pg_add_epoch(spg_t pgid, epoch_t epoch) {
++ Mutex::Locker l(pg_epoch_lock);
++ map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
++ assert(t == pg_epoch.end());
++ pg_epoch[pgid] = epoch;
++ pg_epochs.insert(epoch);
++ }
++ void pg_update_epoch(spg_t pgid, epoch_t epoch) {
++ Mutex::Locker l(pg_epoch_lock);
++ map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
++ assert(t != pg_epoch.end());
++ pg_epochs.erase(pg_epochs.find(t->second));
++ t->second = epoch;
++ pg_epochs.insert(epoch);
++ }
++ void pg_remove_epoch(spg_t pgid) {
++ Mutex::Locker l(pg_epoch_lock);
++ map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
++ if (t != pg_epoch.end()) {
++ pg_epochs.erase(pg_epochs.find(t->second));
++ pg_epoch.erase(t);
++ }
++ }
++ epoch_t get_min_pg_epoch() {
++ Mutex::Locker l(pg_epoch_lock);
++ if (pg_epochs.empty())
++ return 0;
++ else
++ return *pg_epochs.begin();
++ }
++
+ // -- superblock --
+ Mutex publish_lock, pre_publish_lock; // pre-publish orders before publish
+ OSDSuperblock superblock;
+ OSDSuperblock get_superblock() {
+@@ -783,8 +819,9 @@
+ // config observer bits
+ virtual const char** get_tracked_conf_keys() const;
+ virtual void handle_conf_change(const struct md_config_t *conf,
+ const std::set <std::string> &changed);
++ void check_config();
+
+ protected:
+ Mutex osd_lock; // global lock
+ SafeTimer tick_timer; // safe timer (osd_lock)
+@@ -943,8 +980,10 @@
+ ThreadPool command_tp;
+
+ bool paused_recovery;
+
++ void set_disk_tp_priority();
++
+ // -- sessions --
+ public:
+ struct Session : public RefCountedObject {
+ EntityName entity_name;
+@@ -1254,9 +1293,9 @@
+ void handle_osd_map(class MOSDMap *m);
+ void note_down_osd(int osd);
+ void note_up_osd(int osd);
+
+- void advance_pg(
++ bool advance_pg(
+ epoch_t advance_to, PG *pg,
+ ThreadPool::TPHandle &handle,
+ PG::RecoveryCtx *rctx,
+ set<boost::intrusive_ptr<PG> > *split_pgs
+@@ -1512,11 +1551,24 @@
+ OSDMapRef map);
+ void repeer(PG *pg, map< int, map<spg_t,pg_query_t> >& query_map);
+
+ bool require_mon_peer(Message *m);
+- bool require_osd_peer(OpRequestRef op);
++ bool require_osd_peer(OpRequestRef& op);
++ /***
++ * Verifies that we were alive in the given epoch, and that
++ * still are.
++ */
++ bool require_self_aliveness(OpRequestRef& op, epoch_t alive_since);
++ /**
++ * Verifies that the OSD who sent the given op has the same
++ * address as in the given map.
++ * @pre op was sent by an OSD using the cluster messenger
++ */
++ bool require_same_peer_instance(OpRequestRef& op, OSDMapRef& map);
++ bool require_up_osd_peer(OpRequestRef& Op, OSDMapRef& map,
++ epoch_t their_epoch);
+
+- bool require_same_or_newer_map(OpRequestRef op, epoch_t e);
++ bool require_same_or_newer_map(OpRequestRef& op, epoch_t e);
+
+ void handle_pg_query(OpRequestRef op);
+ void handle_pg_notify(OpRequestRef op);
+ void handle_pg_log(OpRequestRef op);
+--- a/src/osd/OSDMap.cc
++++ b/src/osd/OSDMap.cc
+@@ -958,12 +958,9 @@
+ if (crush->has_nondefault_tunables())
+ features |= CEPH_FEATURE_CRUSH_TUNABLES;
+ if (crush->has_nondefault_tunables2())
+ features |= CEPH_FEATURE_CRUSH_TUNABLES2;
+- if (crush->has_v2_rules())
+- features |= CEPH_FEATURE_CRUSH_V2;
+- if (crush->has_nondefault_tunables3() ||
+- crush->has_v3_rules())
++ if (crush->has_nondefault_tunables3())
+ features |= CEPH_FEATURE_CRUSH_TUNABLES3;
+ mask |= CEPH_FEATURES_CRUSH;
+
+ for (map<int64_t,pg_pool_t>::const_iterator p = pools.begin(); p != pools.end(); ++p) {
+@@ -977,8 +974,17 @@
+ if (!p->second.tiers.empty() ||
+ p->second.is_tier()) {
+ features |= CEPH_FEATURE_OSD_CACHEPOOL;
+ }
++ int ruleid = crush->find_rule(p->second.get_crush_ruleset(),
++ p->second.get_type(),
++ p->second.get_size());
++ if (ruleid >= 0) {
++ if (crush->is_v2_rule(ruleid))
++ features |= CEPH_FEATURE_CRUSH_V2;
++ if (crush->is_v3_rule(ruleid))
++ features |= CEPH_FEATURE_CRUSH_TUNABLES3;
++ }
+ }
+ mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
+ if (entity_type != CEPH_ENTITY_TYPE_CLIENT)
+ mask |= CEPH_FEATURE_OSD_ERASURE_CODES;
+@@ -1800,9 +1806,17 @@
+ {
+ ENCODE_START(1, 1, bl); // extended, osd-only data
+ ::encode(osd_addrs->hb_back_addr, bl);
+ ::encode(osd_info, bl);
+- ::encode(blacklist, bl);
++ {
++ // put this in a sorted, ordered map<> so that we encode in a
++ // deterministic order.
++ map<entity_addr_t,utime_t> blacklist_map;
++ for (ceph::unordered_map<entity_addr_t,utime_t>::const_iterator p =
++ blacklist.begin(); p != blacklist.end(); ++p)
++ blacklist_map.insert(make_pair(p->first, p->second));
++ ::encode(blacklist_map, bl);
++ }
+ ::encode(osd_addrs->cluster_addr, bl);
+ ::encode(cluster_snapshot_epoch, bl);
+ ::encode(cluster_snapshot, bl);
+ ::encode(*osd_uuid, bl);
+@@ -2158,8 +2172,9 @@
+ o.push_back(new OSDMap);
+ uuid_d fsid;
+ o.back()->build_simple(cct, 1, fsid, 16, 7, 8);
+ o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
++ o.back()->blacklist[entity_addr_t()] = utime_t(5, 6);
+ cct->put();
+ }
+
+ string OSDMap::get_flag_string(unsigned f)
+@@ -2550,15 +2565,27 @@
+ set_state(i, 0);
+ set_weight(i, CEPH_OSD_OUT);
+ }
+
+- map<string,string> erasure_code_profile_map;
+- r = get_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
+- ss,
+- &erasure_code_profile_map);
+- erasure_code_profile_map["directory"] =
++ map<string,string> profile_map;
++ r = get_erasure_code_profile_default(cct, profile_map, &ss);
++ if (r < 0) {
++ lderr(cct) << ss.str() << dendl;
++ return r;
++ }
++ set_erasure_code_profile("default", profile_map);
++ return 0;
++}
++
++int OSDMap::get_erasure_code_profile_default(CephContext *cct,
++ map<string,string> &profile_map,
++ ostream *ss)
++{
++ int r = get_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
++ *ss,
++ &profile_map);
++ profile_map["directory"] =
+ cct->_conf->osd_pool_default_erasure_code_directory;
+- set_erasure_code_profile("default", erasure_code_profile_map);
+ return r;
+ }
+
+ int OSDMap::_build_crush_types(CrushWrapper& crush)
+--- a/src/osd/OSDMap.h
++++ b/src/osd/OSDMap.h
+@@ -379,8 +379,11 @@
+ map<string,map<string,string> >::const_iterator i =
+ erasure_code_profiles.find(name);
+ return i != erasure_code_profiles.end();
+ }
++ int get_erasure_code_profile_default(CephContext *cct,
++ map<string,string> &profile_map,
++ ostream *ss);
+ void set_erasure_code_profile(const string &name,
+ const map<string,string> &profile) {
+ erasure_code_profiles[name] = profile;
+ }
+--- a/src/osd/OpRequest.cc
++++ b/src/osd/OpRequest.cc
+@@ -32,9 +32,9 @@
+ f->open_object_section("client_info");
+ stringstream client_name;
+ client_name << m->get_orig_source();
+ f->dump_string("client", client_name.str());
+- f->dump_int("tid", m->get_tid());
++ f->dump_unsigned("tid", m->get_tid());
+ f->close_section(); // client_info
+ }
+ {
+ f->open_array_section("events");
+--- a/src/osd/OpRequest.h
++++ b/src/osd/OpRequest.h
+@@ -73,8 +73,12 @@
+ void set_pg_op();
+
+ void _dump(utime_t now, Formatter *f) const;
+
++ bool has_feature(uint64_t f) const {
++ return request->get_connection()->has_feature(f);
++ }
++
+ private:
+ osd_reqid_t reqid;
+ uint8_t hit_flag_points;
+ uint8_t latest_flag_point;
+--- a/src/osd/PG.cc
++++ b/src/osd/PG.cc
+@@ -1442,9 +1442,9 @@
+ last_update_ondisk = info.last_update;
+ min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
+ }
+ last_update_applied = info.last_update;
+-
++ last_rollback_info_trimmed_to_applied = pg_log.get_rollback_trimmed_to();
+
+ need_up_thru = false;
+
+ // write pg info, log
+@@ -2640,9 +2640,12 @@
+ }
+
+
+ void PG::append_log(
+- vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t,
++ vector<pg_log_entry_t>& logv,
++ eversion_t trim_to,
++ eversion_t trim_rollback_to,
++ ObjectStore::Transaction &t,
+ bool transaction_applied)
+ {
+ if (transaction_applied)
+ update_snap_map(logv, t);
+@@ -2654,15 +2657,35 @@
+ ++p) {
+ p->offset = 0;
+ add_log_entry(*p, keys[p->get_key_name()]);
+ }
+- if (!transaction_applied)
+- pg_log.clear_can_rollback_to();
++
++ PGLogEntryHandler handler;
++ if (!transaction_applied) {
++ pg_log.clear_can_rollback_to(&handler);
++ t.register_on_applied(
++ new C_UpdateLastRollbackInfoTrimmedToApplied(
++ this,
++ get_osdmap()->get_epoch(),
++ info.last_update));
++ } else if (trim_rollback_to > pg_log.get_rollback_trimmed_to()) {
++ pg_log.trim_rollback_info(
++ trim_rollback_to,
++ &handler);
++ t.register_on_applied(
++ new C_UpdateLastRollbackInfoTrimmedToApplied(
++ this,
++ get_osdmap()->get_epoch(),
++ trim_rollback_to));
++ }
+
+ dout(10) << "append_log adding " << keys.size() << " keys" << dendl;
+ t.omap_setkeys(coll_t::META_COLL, log_oid, keys);
+- PGLogEntryHandler handler;
++
+ pg_log.trim(&handler, trim_to, info);
++
++ dout(10) << __func__ << ": trimming to " << trim_rollback_to
++ << " entries " << handler.to_trim << dendl;
+ handler.apply(this, &t);
+
+ // update the local pg, pg log
+ dirty_info = true;
+@@ -3003,9 +3026,10 @@
+ }
+
+ void PG::reg_next_scrub()
+ {
+- if (scrubber.must_scrub) {
++ if (scrubber.must_scrub ||
++ (info.stats.stats_invalid && g_conf->osd_scrub_invalid_stats)) {
+ scrubber.scrub_reg_stamp = utime_t();
+ } else {
+ scrubber.scrub_reg_stamp = info.history.last_scrub_stamp;
+ }
+@@ -3261,8 +3285,36 @@
+ osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
+ }
+ }
+
++void PG::_scan_rollback_obs(
++ const vector<ghobject_t> &rollback_obs,
++ ThreadPool::TPHandle &handle)
++{
++ ObjectStore::Transaction *t = NULL;
++ eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
++ for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
++ i != rollback_obs.end();
++ ++i) {
++ if (i->generation < trimmed_to.version) {
++ osd->clog.error() << "osd." << osd->whoami
++ << " pg " << info.pgid
++ << " found obsolete rollback obj "
++ << *i << " generation < trimmed_to "
++ << trimmed_to
++ << "...repaired";
++ if (!t)
++ t = new ObjectStore::Transaction;
++ t->remove(coll, *i);
++ }
++ }
++ if (t) {
++ derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
++ << dendl;
++ osd->store->queue_transaction_and_cleanup(osr.get(), t);
++ }
++}
++
+ void PG::_scan_snaps(ScrubMap &smap)
+ {
+ for (map<hobject_t, ScrubMap::object>::iterator i = smap.objects.begin();
+ i != smap.objects.end();
+@@ -3348,15 +3400,23 @@
+ map.valid_through = info.last_update;
+
+ // objects
+ vector<hobject_t> ls;
+- int ret = get_pgbackend()->objects_list_range(start, end, 0, &ls);
++ vector<ghobject_t> rollback_obs;
++ int ret = get_pgbackend()->objects_list_range(
++ start,
++ end,
++ 0,
++ &ls,
++ &rollback_obs);
+ if (ret < 0) {
+ dout(5) << "objects_list_range error: " << ret << dendl;
+ return ret;
+ }
+
++
+ get_pgbackend()->be_scan_list(map, ls, deep, handle);
++ _scan_rollback_obs(rollback_obs, handle);
+ _scan_snaps(map);
+
+ // pg attrs
+ osd->store->collection_getattrs(coll, map.attrs);
+@@ -3577,8 +3637,19 @@
+ */
+ void PG::scrub(ThreadPool::TPHandle &handle)
+ {
+ lock();
++ if (g_conf->osd_scrub_sleep > 0 &&
++ (scrubber.state == PG::Scrubber::NEW_CHUNK ||
++ scrubber.state == PG::Scrubber::INACTIVE)) {
++ dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
++ unlock();
++ utime_t t;
++ t.set_from_double(g_conf->osd_scrub_sleep);
++ t.sleep();
++ lock();
++ dout(20) << __func__ << " slept for " << t << dendl;
++ }
+ if (deleting) {
+ unlock();
+ return;
+ }
+@@ -4630,8 +4701,23 @@
+ on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
+ on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
+ }
+
++void PG::reset_interval_flush()
++{
++ dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
++ recovery_state.clear_blocked_outgoing();
++
++ if (!osr->flush_commit(
++ new QueuePeeringEvt<IntervalFlush>(
++ this, get_osdmap()->get_epoch(), IntervalFlush()))) {
++ dout(10) << "Beginning to block outgoing recovery messages" << dendl;
++ recovery_state.begin_block_outgoing();
++ } else {
++ dout(10) << "Not blocking outgoing recovery messages" << dendl;
++ }
++}
++
+ /* Called before initializing peering during advance_map */
+ void PG::start_peering_interval(
+ const OSDMapRef lastmap,
+ const vector<int>& newup, int new_up_primary,
+@@ -4640,8 +4726,9 @@
+ {
+ const OSDMapRef osdmap = get_osdmap();
+
+ set_last_peering_reset();
++ reset_interval_flush();
+
+ vector<int> oldacting, oldup;
+ int oldrole = get_role();
+
+@@ -5049,9 +5136,9 @@
+ return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
+ case MSG_OSD_PG_PUSH_REPLY:
+ return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
+ case MSG_OSD_SUBOPREPLY:
+- return false;
++ return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
+
+ case MSG_OSD_EC_WRITE:
+ return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
+ case MSG_OSD_EC_WRITE_REPLY:
+@@ -5385,8 +5472,17 @@
+ context< RecoveryMachine >().log_enter(state_name);
+ }
+
+ boost::statechart::result
++PG::RecoveryState::Started::react(const IntervalFlush&)
++{
++ dout(10) << "Ending blocked outgoing recovery messages" << dendl;
++ context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
++ return discard_event();
++}
++
++
++boost::statechart::result
+ PG::RecoveryState::Started::react(const FlushedEvt&)
+ {
+ PG *pg = context< RecoveryMachine >().pg;
+ pg->on_flushed();
+@@ -5435,8 +5531,9 @@
+ NamedState(context< RecoveryMachine >().pg->cct, "Reset")
+ {
+ context< RecoveryMachine >().log_enter(state_name);
+ PG *pg = context< RecoveryMachine >().pg;
++
+ pg->flushes_in_progress = 0;
+ pg->set_last_peering_reset();
+ }
+
+@@ -5447,8 +5544,16 @@
+ pg->on_flushed();
+ return discard_event();
+ }
+
++boost::statechart::result
++PG::RecoveryState::Reset::react(const IntervalFlush&)
++{
++ dout(10) << "Ending blocked outgoing recovery messages" << dendl;
++ context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
++ return discard_event();
++}
++
+ boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
+ {
+ PG *pg = context< RecoveryMachine >().pg;
+ dout(10) << "Reset advmap" << dendl;
+@@ -5829,8 +5934,20 @@
+ {
+ context< RecoveryMachine >().log_enter(state_name);
+ }
+
++boost::statechart::result
++PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
++{
++ return discard_event();
++}
++
++boost::statechart::result
++PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
++{
++ return discard_event();
++}
++
+ void PG::RecoveryState::NotBackfilling::exit()
+ {
+ context< RecoveryMachine >().log_exit(state_name, enter_time);
+ PG *pg = context< RecoveryMachine >().pg;
+@@ -6587,19 +6704,23 @@
+ PG *pg = context< RecoveryMachine >().pg;
+ MOSDPGLog *msg = logevt.msg.get();
+ dout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
+
++ ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
+ if (msg->info.last_backfill == hobject_t()) {
+ // restart backfill
+ pg->unreg_next_scrub();
+ pg->info = msg->info;
+ pg->reg_next_scrub();
+ pg->dirty_info = true;
+ pg->dirty_big_info = true; // maybe.
+- pg->pg_log.claim_log(msg->log);
++
++ PGLogEntryHandler rollbacker;
++ pg->pg_log.claim_log_and_clear_rollback_info(msg->log, &rollbacker);
++ rollbacker.apply(pg, t);
++
+ pg->pg_log.reset_backfill();
+ } else {
+- ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
+ pg->merge_log(*t, msg->info, msg->log, logevt.from);
+ }
+
+ assert(pg->pg_log.get_head() == pg->info.last_update);
+@@ -7491,20 +7612,53 @@
+ }
+
+ void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
+ assert(!rctx);
+- rctx = new_ctx;
+- if (rctx)
++ assert(!orig_ctx);
++ orig_ctx = new_ctx;
++ if (new_ctx) {
++ if (messages_pending_flush) {
++ rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
++ } else {
++ rctx = *new_ctx;
++ }
+ rctx->start_time = ceph_clock_now(pg->cct);
++ }
++}
++
++void PG::RecoveryState::begin_block_outgoing() {
++ assert(!messages_pending_flush);
++ assert(orig_ctx);
++ assert(rctx);
++ messages_pending_flush = BufferedRecoveryMessages();
++ rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
++}
++
++void PG::RecoveryState::clear_blocked_outgoing() {
++ assert(orig_ctx);
++ assert(rctx);
++ messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
++}
++
++void PG::RecoveryState::end_block_outgoing() {
++ assert(messages_pending_flush);
++ assert(orig_ctx);
++ assert(rctx);
++
++ rctx = RecoveryCtx(*orig_ctx);
++ rctx->accept_buffered_messages(*messages_pending_flush);
++ messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
+ }
+
+ void PG::RecoveryState::end_handle() {
+ if (rctx) {
+ utime_t dur = ceph_clock_now(pg->cct) - rctx->start_time;
+ machine.event_time += dur;
+ }
++
+ machine.event_count++;
+- rctx = 0;
++ rctx = boost::optional<RecoveryCtx>();
++ orig_ctx = NULL;
+ }
+
+ void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
+ void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
+--- a/src/osd/PG.h
++++ b/src/osd/PG.h
+@@ -446,8 +446,27 @@
+ eversion_t last_update_ondisk; // last_update that has committed; ONLY DEFINED WHEN is_active()
+ eversion_t last_complete_ondisk; // last_complete that has committed.
+ eversion_t last_update_applied;
+
++
++ struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
++ PGRef pg;
++ epoch_t e;
++ eversion_t v;
++ C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
++ : pg(pg), e(e), v(v) {}
++ void finish(int) {
++ pg->lock();
++ if (!pg->pg_has_reset_since(e)) {
++ pg->last_rollback_info_trimmed_to_applied = v;
++ }
++ pg->unlock();
++ }
++ };
++ // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
++ // and the transaction has applied
++ eversion_t last_rollback_info_trimmed_to_applied;
++
+ // primary state
+ public:
+ pg_shard_t primary;
+ pg_shard_t pg_whoami;
+@@ -486,8 +505,14 @@
+ bool may_need_replay(const OSDMapRef osdmap) const;
+
+
+ public:
++ struct BufferedRecoveryMessages {
++ map<int, map<spg_t, pg_query_t> > query_map;
++ map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > info_map;
++ map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > notify_list;
++ };
++
+ struct RecoveryCtx {
+ utime_t start_time;
+ map<int, map<spg_t, pg_query_t> > *query_map;
+ map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map;
+@@ -507,8 +532,50 @@
+ notify_list(notify_list),
+ on_applied(on_applied),
+ on_safe(on_safe),
+ transaction(transaction) {}
++
++ RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
++ : query_map(&(buf.query_map)),
++ info_map(&(buf.info_map)),
++ notify_list(&(buf.notify_list)),
++ on_applied(rctx.on_applied),
++ on_safe(rctx.on_safe),
++ transaction(rctx.transaction) {}
++
++ void accept_buffered_messages(BufferedRecoveryMessages &m) {
++ assert(query_map);
++ assert(info_map);
++ assert(notify_list);
++ for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
++ i != m.query_map.end();
++ ++i) {
++ map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
++ for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
++ j != i->second.end();
++ ++j) {
++ omap[j->first] = j->second;
++ }
++ }
++ for (map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator i
++ = m.info_map.begin();
++ i != m.info_map.end();
++ ++i) {
++ vector<pair<pg_notify_t, pg_interval_map_t> > &ovec =
++ (*info_map)[i->first];
++ ovec.reserve(ovec.size() + i->second.size());
++ ovec.insert(ovec.end(), i->second.begin(), i->second.end());
++ }
++ for (map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator i
++ = m.notify_list.begin();
++ i != m.notify_list.end();
++ ++i) {
++ vector<pair<pg_notify_t, pg_interval_map_t> > &ovec =
++ (*notify_list)[i->first];
++ ovec.reserve(ovec.size() + i->second.size());
++ ovec.insert(ovec.end(), i->second.begin(), i->second.end());
++ }
++ }
+ };
+
+ struct NamedState {
+ const char *state_name;
+@@ -1107,8 +1174,11 @@
+ void scrub_finish();
+ void scrub_clear_state();
+ bool scrub_gather_replica_maps();
+ void _scan_snaps(ScrubMap &map);
++ void _scan_rollback_obs(
++ const vector<ghobject_t> &rollback_obs,
++ ThreadPool::TPHandle &handle);
+ void _request_scrub_map_classic(pg_shard_t replica, eversion_t version);
+ void _request_scrub_map(pg_shard_t replica, eversion_t version,
+ hobject_t start, hobject_t end, bool deep);
+ int build_scrub_map_chunk(
+@@ -1332,12 +1402,19 @@
+ TrivialEvent(GoClean)
+
+ TrivialEvent(AllReplicasActivated)
+
++ TrivialEvent(IntervalFlush)
++
+ /* Encapsulates PG recovery process */
+ class RecoveryState {
+ void start_handle(RecoveryCtx *new_ctx);
+ void end_handle();
++ public:
++ void begin_block_outgoing();
++ void end_block_outgoing();
++ void clear_blocked_outgoing();
++ private:
+
+ /* States */
+ struct Initial;
+ class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
+@@ -1359,42 +1436,49 @@
+ RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
+
+ /* Accessor functions for state methods */
+ ObjectStore::Transaction* get_cur_transaction() {
++ assert(state->rctx);
+ assert(state->rctx->transaction);
+ return state->rctx->transaction;
+ }
+
+ void send_query(pg_shard_t to, const pg_query_t &query) {
++ assert(state->rctx);
+ assert(state->rctx->query_map);
+ (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
+ query;
+ }
+
+ map<int, map<spg_t, pg_query_t> > *get_query_map() {
++ assert(state->rctx);
+ assert(state->rctx->query_map);
+ return state->rctx->query_map;
+ }
+
+ map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *get_info_map() {
++ assert(state->rctx);
+ assert(state->rctx->info_map);
+ return state->rctx->info_map;
+ }
+
+ list< Context* > *get_on_safe_context_list() {
++ assert(state->rctx);
+ assert(state->rctx->on_safe);
+ return &(state->rctx->on_safe->contexts);
+ }
+
+ list< Context * > *get_on_applied_context_list() {
++ assert(state->rctx);
+ assert(state->rctx->on_applied);
+ return &(state->rctx->on_applied->contexts);
+ }
+
+- RecoveryCtx *get_recovery_ctx() { return state->rctx; }
++ RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
+
+ void send_notify(pg_shard_t to,
+ const pg_notify_t &info, const pg_interval_map_t &pi) {
++ assert(state->rctx);
+ assert(state->rctx->notify_list);
+ (*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
+ }
+ };
+@@ -1438,14 +1522,16 @@
+ boost::statechart::custom_reaction< AdvMap >,
+ boost::statechart::custom_reaction< ActMap >,
+ boost::statechart::custom_reaction< NullEvt >,
+ boost::statechart::custom_reaction< FlushedEvt >,
++ boost::statechart::custom_reaction< IntervalFlush >,
+ boost::statechart::transition< boost::statechart::event_base, Crashed >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const AdvMap&);
+ boost::statechart::result react(const ActMap&);
+ boost::statechart::result react(const FlushedEvt&);
++ boost::statechart::result react(const IntervalFlush&);
+ boost::statechart::result react(const boost::statechart::event_base&) {
+ return discard_event();
+ }
+ };
+@@ -1460,13 +1546,15 @@
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< AdvMap >,
+ boost::statechart::custom_reaction< NullEvt >,
+ boost::statechart::custom_reaction< FlushedEvt >,
++ boost::statechart::custom_reaction< IntervalFlush >,
+ boost::statechart::transition< boost::statechart::event_base, Crashed >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const AdvMap&);
+ boost::statechart::result react(const FlushedEvt&);
++ boost::statechart::result react(const IntervalFlush&);
+ boost::statechart::result react(const boost::statechart::event_base&) {
+ return discard_event();
+ }
+ };
+@@ -1634,12 +1722,16 @@
+ };
+
+ struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
+ typedef boost::mpl::list<
+- boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>
++ boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
++ boost::statechart::custom_reaction< RemoteBackfillReserved >,
++ boost::statechart::custom_reaction< RemoteReservationRejected >
+ > reactions;
+ NotBackfilling(my_context ctx);
+ void exit();
++ boost::statechart::result react(const RemoteBackfillReserved& evt);
++ boost::statechart::result react(const RemoteReservationRejected& evt);
+ };
+
+ struct RepNotRecovering;
+ struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
+@@ -1854,12 +1946,25 @@
+
+
+ RecoveryMachine machine;
+ PG *pg;
+- RecoveryCtx *rctx;
++
++ /// context passed in by state machine caller
++ RecoveryCtx *orig_ctx;
++
++ /// populated if we are buffering messages pending a flush
++ boost::optional<BufferedRecoveryMessages> messages_pending_flush;
++
++ /**
++ * populated between start_handle() and end_handle(), points into
++ * the message lists for messages_pending_flush while blocking messages
++ * or into orig_ctx otherwise
++ */
++ boost::optional<RecoveryCtx> rctx;
+
+ public:
+- RecoveryState(PG *pg) : machine(this, pg), pg(pg), rctx(0) {
++ RecoveryState(PG *pg)
++ : machine(this, pg), pg(pg), orig_ctx(0) {
+ machine.initiate();
+ }
+
+ void handle_event(const boost::statechart::event_base &evt,
+@@ -1995,9 +2100,12 @@
+ }
+
+ void add_log_entry(pg_log_entry_t& e, bufferlist& log_bl);
+ void append_log(
+- vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t,
++ vector<pg_log_entry_t>& logv,
++ eversion_t trim_to,
++ eversion_t trim_rollback_to,
++ ObjectStore::Transaction &t,
+ bool transaction_applied = true);
+ bool check_log_for_corruption(ObjectStore *store);
+ void trim_peers();
+
+@@ -2025,8 +2133,9 @@
+ void share_pg_info();
+ /// share new pg log entries after a pg is active
+ void share_pg_log();
+
++ void reset_interval_flush();
+ void start_peering_interval(
+ const OSDMapRef lastmap,
+ const vector<int>& newup, int up_primary,
+ const vector<int>& newacting, int acting_primary,
+--- a/src/osd/PGBackend.cc
++++ b/src/osd/PGBackend.cc
+@@ -114,9 +114,13 @@
+ vector<hobject_t> *ls,
+ hobject_t *next)
+ {
+ assert(ls);
+- ghobject_t _next(begin);
++ // Starts with the smallest shard id and generation to
++ // make sure the result list has the marker object (
++ // it might have multiple generations though, which would
++ // be filtered).
++ ghobject_t _next(begin, 0, shard_id_t(0));
+ ls->reserve(max);
+ int r = 0;
+ while (!_next.is_max() && ls->size() < (unsigned)min) {
+ vector<ghobject_t> objects;
+@@ -146,9 +150,10 @@
+ int PGBackend::objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ snapid_t seq,
+- vector<hobject_t> *ls)
++ vector<hobject_t> *ls,
++ vector<ghobject_t> *gen_obs)
+ {
+ assert(ls);
+ vector<ghobject_t> objects;
+ int r = store->collection_list_range(
+@@ -162,8 +167,10 @@
+ i != objects.end();
+ ++i) {
+ if (i->is_no_gen()) {
+ ls->push_back(i->hobj);
++ } else if (gen_obs) {
++ gen_obs->push_back(*i);
+ }
+ }
+ return r;
+ }
+--- a/src/osd/PGBackend.h
++++ b/src/osd/PGBackend.h
+@@ -176,8 +176,9 @@
+ virtual void log_operation(
+ vector<pg_log_entry_t> &logv,
+ boost::optional<pg_hit_set_history_t> &hset_history,
+ const eversion_t &trim_to,
++ const eversion_t &trim_rollback_to,
+ bool transaction_applied,
+ ObjectStore::Transaction *t) = 0;
+
+ virtual void update_peer_last_complete_ondisk(
+@@ -495,8 +496,9 @@
+ const hobject_t &hoid, ///< [in] object
+ const eversion_t &at_version, ///< [in] version
+ PGTransaction *t, ///< [in] trans to execute
+ const eversion_t &trim_to, ///< [in] trim log to here
++ const eversion_t &trim_rollback_to, ///< [in] trim rollback info to here
+ vector<pg_log_entry_t> &log_entries, ///< [in] log entries for t
+ /// [in] hitset history (if updated with this transaction)
+ boost::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_local_applied_sync, ///< [in] called when applied locally
+@@ -554,9 +556,10 @@
+ int objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ snapid_t seq,
+- vector<hobject_t> *ls);
++ vector<hobject_t> *ls,
++ vector<ghobject_t> *gen_obs=0);
+
+ int objects_get_attr(
+ const hobject_t &hoid,
+ const string &attr,
+--- a/src/osd/PGLog.cc
++++ b/src/osd/PGLog.cc
+@@ -23,8 +23,27 @@
+ #define dout_subsys ceph_subsys_osd
+
+ //////////////////// PGLog::IndexedLog ////////////////////
+
++void PGLog::IndexedLog::advance_rollback_info_trimmed_to(
++ eversion_t to,
++ LogEntryHandler *h)
++{
++ assert(to <= can_rollback_to);
++
++ if (to > rollback_info_trimmed_to)
++ rollback_info_trimmed_to = to;
++
++ while (rollback_info_trimmed_to_riter != log.rbegin()) {
++ --rollback_info_trimmed_to_riter;
++ if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) {
++ ++rollback_info_trimmed_to_riter;
++ break;
++ }
++ h->trim(*rollback_info_trimmed_to_riter);
++ }
++}
++
+ void PGLog::IndexedLog::split_into(
+ pg_t child_pgid,
+ unsigned split_bits,
+ PGLog::IndexedLog *olog)
+@@ -46,11 +65,13 @@
+ }
+ oldlog.erase(i++);
+ }
+
++
++ olog->can_rollback_to = can_rollback_to;
++
+ olog->index();
+ index();
+- olog->can_rollback_to = can_rollback_to;
+ }
+
+ void PGLog::IndexedLog::trim(
+ LogEntryHandler *handler,
+@@ -58,22 +79,33 @@
+ set<eversion_t> *trimmed)
+ {
+ if (complete_to != log.end() &&
+ complete_to->version <= s) {
+- generic_dout(0) << " bad trim to " << s << " when complete_to is " << complete_to->version
++ generic_dout(0) << " bad trim to " << s << " when complete_to is "
++ << complete_to->version
+ << " on " << *this << dendl;
+ }
+
++ if (s > can_rollback_to)
++ can_rollback_to = s;
++ advance_rollback_info_trimmed_to(s, handler);
++
+ while (!log.empty()) {
+ pg_log_entry_t &e = *log.begin();
+ if (e.version > s)
+ break;
+ generic_dout(20) << "trim " << e << dendl;
+ if (trimmed)
+ trimmed->insert(e.version);
+- handler->trim(e);
++
+ unindex(e); // remove from index,
+- log.pop_front(); // from log
++
++ if (e.version == rollback_info_trimmed_to_riter->version) {
++ log.pop_front();
++ rollback_info_trimmed_to_riter = log.rend();
++ } else {
++ log.pop_front();
++ }
+ }
+
+ // raise tail?
+ if (tail < s)
+@@ -103,9 +135,9 @@
+
+ void PGLog::clear() {
+ divergent_priors.clear();
+ missing.clear();
+- log.zero();
++ log.clear();
+ log_keys_debug.clear();
+ undirty();
+ }
+
+--- a/src/osd/PGLog.h
++++ b/src/osd/PGLog.h
+@@ -61,13 +61,35 @@
+ // recovery pointers
+ list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item
+ version_t last_requested; // last object requested by primary
+
++ //
++ private:
++ /**
++ * rollback_info_trimmed_to_riter points to the first log entry <=
++ * rollback_info_trimmed_to
++ *
++ * It's a reverse_iterator because rend() is a natural representation for
++ * tail, and rbegin() works nicely for head.
++ */
++ list<pg_log_entry_t>::reverse_iterator rollback_info_trimmed_to_riter;
++ public:
++ void advance_rollback_info_trimmed_to(eversion_t to, LogEntryHandler *h);
++
+ /****/
+- IndexedLog() : last_requested(0) {}
++ IndexedLog() :
++ complete_to(log.end()),
++ last_requested(0),
++ rollback_info_trimmed_to_riter(log.rbegin())
++ {}
++
++ void claim_log_and_clear_rollback_info(const pg_log_t& o) {
++ // we must have already trimmed the old entries
++ assert(rollback_info_trimmed_to == head);
++ assert(rollback_info_trimmed_to_riter == log.rbegin());
+
+- void claim_log(const pg_log_t& o) {
+ log = o.log;
++ rollback_info_trimmed_to = head;
+ head = o.head;
+ tail = o.tail;
+ index();
+ }
+@@ -77,12 +99,22 @@
+ unsigned split_bits,
+ IndexedLog *olog);
+
+ void zero() {
++ // we must have already trimmed the old entries
++ assert(rollback_info_trimmed_to == head);
++ assert(rollback_info_trimmed_to_riter == log.rbegin());
++
+ unindex();
+ pg_log_t::clear();
++ rollback_info_trimmed_to_riter = log.rbegin();
+ reset_recovery_pointers();
+ }
++ void clear() {
++ rollback_info_trimmed_to = head;
++ rollback_info_trimmed_to_riter = log.rbegin();
++ zero();
++ }
+ void reset_recovery_pointers() {
+ complete_to = log.end();
+ last_requested = 0;
+ }
+@@ -111,8 +143,13 @@
+ //assert(caller_ops.count(i->reqid) == 0); // divergent merge_log indexes new before unindexing old
+ caller_ops[i->reqid] = &(*i);
+ }
+ }
++
++ rollback_info_trimmed_to_riter = log.rbegin();
++ while (rollback_info_trimmed_to_riter != log.rend() &&
++ rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to)
++ rollback_info_trimmed_to_riter++;
+ }
+
+ void index(pg_log_entry_t& e) {
+ if (objects.count(e.soid) == 0 ||
+@@ -140,8 +177,13 @@
+ // actors
+ void add(pg_log_entry_t& e) {
+ // add to log
+ log.push_back(e);
++
++ // riter previously pointed to the previous entry
++ if (rollback_info_trimmed_to_riter == log.rbegin())
++ ++rollback_info_trimmed_to_riter;
++
+ assert(e.version > head);
+ assert(head.version == 0 || e.version.version > head.version);
+ head = e.version;
+
+@@ -324,16 +366,35 @@
+ LogEntryHandler *handler,
+ eversion_t trim_to,
+ pg_info_t &info);
+
+- void clear_can_rollback_to() {
++ void trim_rollback_info(
++ eversion_t trim_rollback_to,
++ LogEntryHandler *h) {
++ if (trim_rollback_to > log.can_rollback_to)
++ log.can_rollback_to = trim_rollback_to;
++ log.advance_rollback_info_trimmed_to(
++ trim_rollback_to,
++ h);
++ }
++
++ eversion_t get_rollback_trimmed_to() const {
++ return log.rollback_info_trimmed_to;
++ }
++
++ void clear_can_rollback_to(LogEntryHandler *h) {
+ log.can_rollback_to = log.head;
++ log.advance_rollback_info_trimmed_to(
++ log.head,
++ h);
+ }
+
+ //////////////////// get or set log & missing ////////////////////
+
+- void claim_log(const pg_log_t &o) {
+- log.claim_log(o);
++ void claim_log_and_clear_rollback_info(const pg_log_t &o, LogEntryHandler *h) {
++ log.can_rollback_to = log.head;
++ log.advance_rollback_info_trimmed_to(log.head, h);
++ log.claim_log_and_clear_rollback_info(o);
+ missing.clear();
+ mark_dirty_to(eversion_t::max());
+ }
+
+--- a/src/osd/ReplicatedBackend.cc
++++ b/src/osd/ReplicatedBackend.cc
+@@ -493,8 +493,9 @@
+ const hobject_t &soid,
+ const eversion_t &at_version,
+ PGTransaction *_t,
+ const eversion_t &trim_to,
++ const eversion_t &trim_rollback_to,
+ vector<pg_log_entry_t> &log_entries,
+ boost::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_local_applied_sync,
+ Context *on_all_acked,
+@@ -533,8 +534,9 @@
+ at_version,
+ tid,
+ reqid,
+ trim_to,
++ trim_rollback_to,
+ t->get_temp_added().size() ? *(t->get_temp_added().begin()) : hobject_t(),
+ t->get_temp_cleared().size() ?
+ *(t->get_temp_cleared().begin()) :hobject_t(),
+ log_entries,
+@@ -548,9 +550,15 @@
+ add_temp_objs(t->get_temp_added());
+ }
+ clear_temp_objs(t->get_temp_cleared());
+
+- parent->log_operation(log_entries, hset_history, trim_to, true, &local_t);
++ parent->log_operation(
++ log_entries,
++ hset_history,
++ trim_to,
++ trim_rollback_to,
++ true,
++ &local_t);
+ local_t.append(*op_t);
+ local_t.swap(*op_t);
+
+ op_t->register_on_applied_sync(on_local_applied_sync);
+--- a/src/osd/ReplicatedBackend.h
++++ b/src/osd/ReplicatedBackend.h
+@@ -341,8 +341,9 @@
+ const hobject_t &hoid,
+ const eversion_t &at_version,
+ PGTransaction *t,
+ const eversion_t &trim_to,
++ const eversion_t &trim_rollback_to,
+ vector<pg_log_entry_t> &log_entries,
+ boost::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_local_applied_sync,
+ Context *on_all_applied,
+@@ -358,8 +359,9 @@
+ const eversion_t &at_version,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ eversion_t pg_trim_to,
++ eversion_t pg_trim_rollback_to,
+ hobject_t new_temp_oid,
+ hobject_t discard_temp_oid,
+ vector<pg_log_entry_t> &log_entries,
+ boost::optional<pg_hit_set_history_t> &hset_history,
+--- a/src/osd/ReplicatedPG.cc
++++ b/src/osd/ReplicatedPG.cc
+@@ -1119,8 +1119,14 @@
+ dout(20) << " replay, waiting for active on " << op << dendl;
+ waiting_for_active.push_back(op);
+ return;
+ }
++ // verify client features
++ if ((pool.info.has_tiers() || pool.info.is_tier()) &&
++ !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
++ osd->reply_op_error(op, -EOPNOTSUPP);
++ return;
++ }
+ do_op(op); // do it now
+ break;
+
+ case MSG_OSD_SUBOP:
+@@ -1351,11 +1357,12 @@
+ if (hit_set->is_full() ||
+ hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
+ hit_set_persist();
+ }
++ }
+
+- if (agent_state)
+- agent_choose_mode();
++ if (agent_state) {
++ agent_choose_mode();
+ }
+
+ if ((m->get_flags() & CEPH_OSD_FLAG_IGNORE_CACHE) == 0 &&
+ maybe_handle_cache(op, write_ordered, obc, r, missing_oid, false))
+@@ -4853,10 +4860,11 @@
+ ctx->clone_obc->ssc->ref++;
+ if (pool.info.require_rollback())
+ ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
+ snap_oi = &ctx->clone_obc->obs.oi;
+- bool got = ctx->clone_obc->get_write(ctx->op);
++ bool got = ctx->clone_obc->get_write_greedy(ctx->op);
+ assert(got);
++ dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
+ } else {
+ snap_oi = &static_snap_oi;
+ }
+ snap_oi->version = ctx->at_version;
+@@ -5159,10 +5167,11 @@
+ eversion_t(),
+ 0, osd_reqid_t(), ctx->mtime));
+
+ ctx->snapset_obc = get_object_context(snapoid, true);
+- bool got = ctx->snapset_obc->get_write(ctx->op);
++ bool got = ctx->snapset_obc->get_write_greedy(ctx->op);
+ assert(got);
++ dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
+ ctx->release_snapset_obc = true;
+ if (pool.info.require_rollback() && !ctx->snapset_obc->obs.exists) {
+ ctx->log.back().mod_desc.create();
+ } else if (!pool.info.require_rollback()) {
+@@ -6025,8 +6034,13 @@
+ kick_object_context_blocked(cop->obc);
+ cop->results.should_requeue = requeue;
+ CopyCallbackResults result(-ECANCELED, &cop->results);
+ cop->cb->complete(result);
++
++ // There may still be an objecter callback referencing this copy op.
++ // That callback will not need the obc since it's been canceled, and
++ // we need the obc reference to go away prior to flush.
++ cop->obc = ObjectContextRef();
+ }
+
+ void ReplicatedPG::cancel_copy_ops(bool requeue)
+ {
+@@ -6441,9 +6455,9 @@
+ }
+
+ bool ReplicatedPG::is_present_clone(hobject_t coid)
+ {
+- if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
++ if (!pool.info.allow_incomplete_clones())
+ return true;
+ if (is_missing_object(coid))
+ return true;
+ ObjectContextRef obc = get_object_context(coid, false);
+@@ -6734,8 +6748,9 @@
+ soid,
+ repop->ctx->at_version,
+ repop->ctx->op_t,
+ pg_trim_to,
++ min_last_complete_ondisk,
+ repop->ctx->log,
+ repop->ctx->updated_hset_history,
+ onapplied_sync,
+ on_all_applied,
+@@ -6751,8 +6766,9 @@
+ const eversion_t &at_version,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ eversion_t pg_trim_to,
++ eversion_t pg_trim_rollback_to,
+ hobject_t new_temp_oid,
+ hobject_t discard_temp_oid,
+ vector<pg_log_entry_t> &log_entries,
+ boost::optional<pg_hit_set_history_t> &hset_hist,
+@@ -6806,8 +6822,9 @@
+ else
+ wr->pg_stats = get_info().stats;
+
+ wr->pg_trim_to = pg_trim_to;
++ wr->pg_trim_rollback_to = pg_trim_rollback_to;
+
+ wr->new_temp_oid = new_temp_oid;
+ wr->discard_temp_oid = discard_temp_oid;
+ wr->updated_hit_set_history = hset_hist;
+@@ -6840,8 +6857,14 @@
+
+ void ReplicatedPG::remove_repop(RepGather *repop)
+ {
+ dout(20) << __func__ << " " << *repop << dendl;
++ if (repop->ctx->obc)
++ dout(20) << " obc " << *repop->ctx->obc << dendl;
++ if (repop->ctx->clone_obc)
++ dout(20) << " clone_obc " << *repop->ctx->clone_obc << dendl;
++ if (repop->ctx->snapset_obc)
++ dout(20) << " snapset_obc " << *repop->ctx->snapset_obc << dendl;
+ release_op_ctx_locks(repop->ctx);
+ repop->ctx->finish(0); // FIXME: return value here is sloppy
+ repop_map.erase(repop->rep_tid);
+ repop->put();
+@@ -7606,8 +7629,9 @@
+ parent->log_operation(
+ log,
+ m->updated_hit_set_history,
+ m->pg_trim_to,
++ m->pg_trim_rollback_to,
+ update_snaps,
+ &(rm->localt));
+
+ rm->bytes_written = rm->opt.get_encoded_bytes();
+@@ -7701,10 +7725,10 @@
+ uint64_t size = obc->obs.oi.size;
+ if (size)
+ data_subset.insert(0, size);
+
+- if (get_parent()->get_pool().cache_mode != pg_pool_t::CACHEMODE_NONE) {
+- dout(10) << __func__ << ": caching enabled, skipping clone subsets" << dendl;
++ if (get_parent()->get_pool().allow_incomplete_clones()) {
++ dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
+ return;
+ }
+
+ if (!cct->_conf->osd_recover_clone_overlap) {
+@@ -7761,10 +7785,10 @@
+ uint64_t size = snapset.clone_size[soid.snap];
+ if (size)
+ data_subset.insert(0, size);
+
+- if (get_parent()->get_pool().cache_mode != pg_pool_t::CACHEMODE_NONE) {
+- dout(10) << __func__ << ": caching enabled, skipping clone subsets" << dendl;
++ if (get_parent()->get_pool().allow_incomplete_clones()) {
++ dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
+ return;
+ }
+
+ if (!cct->_conf->osd_recover_clone_overlap) {
+@@ -9464,8 +9488,19 @@
+
+ void ReplicatedPG::on_pool_change()
+ {
+ dout(10) << __func__ << dendl;
++ // requeue cache full waiters just in case the cache_mode is
++ // changing away from writeback mode. note that if we are not
++ // active the normal requeuing machinery is sufficient (and properly
++ // ordered).
++ if (is_active() &&
++ pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
++ !waiting_for_cache_not_full.empty()) {
++ dout(10) << __func__ << " requeuing full waiters (not in writeback) "
++ << dendl;
++ requeue_ops(waiting_for_cache_not_full);
++ }
+ hit_set_setup();
+ agent_setup();
+ }
+
+@@ -11288,9 +11323,10 @@
+ return false;
+ }
+ }
+
+- if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
++ if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL &&
++ hit_set) {
+ // is this object old and/or cold enough?
+ int atime = -1, temp = 0;
+ agent_estimate_atime_temp(soid, &atime, NULL /*FIXME &temp*/);
+
+@@ -11420,9 +11456,13 @@
+ else
+ num_dirty = 0;
+ }
+
+- dout(10) << __func__ << ": "
++ dout(10) << __func__
++ << " flush_mode: "
++ << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
++ << " evict_mode: "
++ << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
+ << " num_objects: " << info.stats.stats.sum.num_objects
+ << " num_bytes: " << info.stats.stats.sum.num_bytes
+ << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
+ << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
+@@ -11434,9 +11474,9 @@
+
+ // get dirty, full ratios
+ uint64_t dirty_micro = 0;
+ uint64_t full_micro = 0;
+- if (pool.info.target_max_bytes && info.stats.stats.sum.num_objects) {
++ if (pool.info.target_max_bytes && info.stats.stats.sum.num_objects > 0) {
+ uint64_t avg_size = info.stats.stats.sum.num_bytes /
+ info.stats.stats.sum.num_objects;
+ dirty_micro =
+ num_dirty * avg_size * 1000000 /
+@@ -11444,9 +11484,9 @@
+ full_micro =
+ num_user_objects * avg_size * 1000000 /
+ MAX(pool.info.target_max_bytes / divisor, 1);
+ }
+- if (pool.info.target_max_objects) {
++ if (pool.info.target_max_objects > 0) {
+ uint64_t dirty_objects_micro =
+ num_dirty * 1000000 /
+ MAX(pool.info.target_max_objects / divisor, 1);
+ if (dirty_objects_micro > dirty_micro)
+@@ -11530,10 +11570,12 @@
+ << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
+ << " -> "
+ << TierAgentState::get_evict_mode_name(evict_mode)
+ << dendl;
+- if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
++ if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
++ is_active()) {
+ requeue_ops(waiting_for_cache_not_full);
++ requeue_ops(waiting_for_active);
+ }
+ agent_state->evict_mode = evict_mode;
+ }
+ uint64_t old_effort = agent_state->evict_effort;
+@@ -11659,9 +11701,9 @@
+ ::decode(snapset, blp);
+
+ // did we finish the last oid?
+ if (head != hobject_t() &&
+- pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
++ !pool.info.allow_incomplete_clones()) {
+ osd->clog.error() << mode << " " << info.pgid << " " << head
+ << " missing clones";
+ ++scrubber.shallow_errors;
+ }
+@@ -11720,9 +11762,9 @@
+ //assert(data.length() == p->size);
+ //
+
+ if (!next_clone.is_min() && next_clone != soid &&
+- pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE) {
++ pool.info.allow_incomplete_clones()) {
+ // it is okay to be missing one or more clones in a cache tier.
+ // skip higher-numbered clones in the list.
+ while (curclone != snapset.clones.rend() &&
+ soid.snap < *curclone)
+@@ -11808,9 +11850,9 @@
+ scrub_cstat.add(stat, cat);
+ }
+
+ if (!next_clone.is_min() &&
+- pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
++ !pool.info.allow_incomplete_clones()) {
+ osd->clog.error() << mode << " " << info.pgid
+ << " expected clone " << next_clone;
+ ++scrubber.shallow_errors;
+ }
+--- a/src/osd/ReplicatedPG.h
++++ b/src/osd/ReplicatedPG.h
+@@ -346,15 +346,16 @@
+ void log_operation(
+ vector<pg_log_entry_t> &logv,
+ boost::optional<pg_hit_set_history_t> &hset_history,
+ const eversion_t &trim_to,
++ const eversion_t &trim_rollback_to,
+ bool transaction_applied,
+ ObjectStore::Transaction *t) {
+ if (hset_history) {
+ info.hit_set = *hset_history;
+ dirty_info = true;
+ }
+- append_log(logv, trim_to, *t, transaction_applied);
++ append_log(logv, trim_to, trim_rollback_to, *t, transaction_applied);
+ }
+
+ void op_applied(
+ const eversion_t &applied_version);
+--- a/src/osd/osd_types.cc
++++ b/src/osd/osd_types.cc
+@@ -2101,10 +2101,10 @@
+ void pg_notify_t::dump(Formatter *f) const
+ {
+ f->dump_int("from", from);
+ f->dump_int("to", to);
+- f->dump_stream("query_epoch") << query_epoch;
+- f->dump_stream("epoch_sent") << epoch_sent;
++ f->dump_unsigned("query_epoch", query_epoch);
++ f->dump_unsigned("epoch_sent", epoch_sent);
+ {
+ f->open_object_section("info");
+ info.dump(f);
+ f->close_section();
+@@ -2460,10 +2460,10 @@
+
+ void ObjectModDesc::dump(Formatter *f) const
+ {
+ f->open_object_section("object_mod_desc");
+- f->dump_stream("can_local_rollback") << can_local_rollback;
+- f->dump_stream("stashed") << stashed;
++ f->dump_bool("can_local_rollback", can_local_rollback);
++ f->dump_bool("rollback_info_completed", rollback_info_completed);
+ {
+ f->open_array_section("ops");
+ DumpVisitor vis(f);
+ visit(&vis);
+@@ -2496,17 +2496,17 @@
+ void ObjectModDesc::encode(bufferlist &_bl) const
+ {
+ ENCODE_START(1, 1, _bl);
+ ::encode(can_local_rollback, _bl);
+- ::encode(stashed, _bl);
++ ::encode(rollback_info_completed, _bl);
+ ::encode(bl, _bl);
+ ENCODE_FINISH(_bl);
+ }
+ void ObjectModDesc::decode(bufferlist::iterator &_bl)
+ {
+ DECODE_START(1, _bl);
+ ::decode(can_local_rollback, _bl);
+- ::decode(stashed, _bl);
++ ::decode(rollback_info_completed, _bl);
+ ::decode(bl, _bl);
+ DECODE_FINISH(_bl);
+ }
+
+@@ -2679,19 +2679,20 @@
+ // -- pg_log_t --
+
+ void pg_log_t::encode(bufferlist& bl) const
+ {
+- ENCODE_START(5, 3, bl);
++ ENCODE_START(6, 3, bl);
+ ::encode(head, bl);
+ ::encode(tail, bl);
+ ::encode(log, bl);
+ ::encode(can_rollback_to, bl);
++ ::encode(rollback_info_trimmed_to, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
+ {
+- DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
++ DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
+ ::decode(head, bl);
+ ::decode(tail, bl);
+ if (struct_v < 2) {
+ bool backlog;
+@@ -2699,8 +2700,13 @@
+ }
+ ::decode(log, bl);
+ if (struct_v >= 5)
+ ::decode(can_rollback_to, bl);
++
++ if (struct_v >= 6)
++ ::decode(rollback_info_trimmed_to, bl);
++ else
++ rollback_info_trimmed_to = tail;
+ DECODE_FINISH(bl);
+
+ // handle hobject_t format change
+ if (struct_v < 4) {
+--- a/src/osd/osd_types.h
++++ b/src/osd/osd_types.h
+@@ -810,18 +810,20 @@
+ return "replicated";
+ }
+
+ enum {
+- FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding)
+- FLAG_FULL = 2, // pool is full
++ FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
++ FLAG_FULL = 1<<1, // pool is full
+ FLAG_DEBUG_FAKE_EC_POOL = 1<<2, // require ReplicatedPG to act like an EC pg
++ FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
+ };
+
+ static const char *get_flag_name(int f) {
+ switch (f) {
+ case FLAG_HASHPSPOOL: return "hashpspool";
+ case FLAG_FULL: return "full";
+ case FLAG_DEBUG_FAKE_EC_POOL: return "require_local_rollback";
++ case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
+ default: return "???";
+ }
+ }
+ static string get_flags_string(uint64_t f) {
+@@ -867,8 +869,20 @@
+ }
+ const char *get_cache_mode_name() const {
+ return get_cache_mode_name(cache_mode);
+ }
++ bool cache_mode_requires_hit_set() const {
++ switch (cache_mode) {
++ case CACHEMODE_NONE:
++ case CACHEMODE_FORWARD:
++ case CACHEMODE_READONLY:
++ return false;
++ case CACHEMODE_WRITEBACK:
++ return true;
++ default:
++ assert(0 == "implement me");
++ }
++ }
+
+ uint64_t flags; ///< FLAG_*
+ __u8 type; ///< TYPE_*
+ __u8 size, min_size; ///< number of osds in each pg
+@@ -915,13 +929,31 @@
+ cache_mode_t cache_mode; ///< cache pool mode
+
+ bool is_tier() const { return tier_of >= 0; }
+ bool has_tiers() const { return !tiers.empty(); }
+- void clear_tier() { tier_of = -1; }
++ void clear_tier() {
++ tier_of = -1;
++ clear_read_tier();
++ clear_write_tier();
++ clear_tier_tunables();
++ }
+ bool has_read_tier() const { return read_tier >= 0; }
+ void clear_read_tier() { read_tier = -1; }
+ bool has_write_tier() const { return write_tier >= 0; }
+ void clear_write_tier() { write_tier = -1; }
++ void clear_tier_tunables() {
++ if (cache_mode != CACHEMODE_NONE)
++ flags |= FLAG_INCOMPLETE_CLONES;
++ cache_mode = CACHEMODE_NONE;
++
++ target_max_bytes = 0;
++ target_max_objects = 0;
++ cache_target_dirty_ratio_micro = 0;
++ cache_target_full_ratio_micro = 0;
++ hit_set_params = HitSet::Params();
++ hit_set_period = 0;
++ hit_set_count = 0;
++ }
+
+ uint64_t target_max_bytes; ///< tiering: target max pool size
+ uint64_t target_max_objects; ///< tiering: target max pool size
+
+@@ -963,8 +995,9 @@
+
+ void dump(Formatter *f) const;
+
+ uint64_t get_flags() const { return flags; }
++ bool has_flag(uint64_t f) const { return flags & f; }
+
+ /// This method will later return true for ec pools as well
+ bool ec_pool() const {
+ return type == TYPE_ERASURE;
+@@ -972,8 +1005,13 @@
+ bool require_rollback() const {
+ return ec_pool() || flags & FLAG_DEBUG_FAKE_EC_POOL;
+ }
+
++ /// true if incomplete clones may be present
++ bool allow_incomplete_clones() const {
++ return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
++ }
++
+ unsigned get_type() const { return type; }
+ unsigned get_size() const { return size; }
+ unsigned get_min_size() const { return min_size; }
+ int get_crush_ruleset() const { return crush_ruleset; }
+@@ -1810,9 +1848,9 @@
+
+ class PGBackend;
+ class ObjectModDesc {
+ bool can_local_rollback;
+- bool stashed;
++ bool rollback_info_completed;
+ public:
+ class Visitor {
+ public:
+ virtual void append(uint64_t old_offset) {}
+@@ -1830,75 +1868,76 @@
+ DELETE = 3,
+ CREATE = 4,
+ UPDATE_SNAPS = 5
+ };
+- ObjectModDesc() : can_local_rollback(true), stashed(false) {}
++ ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {}
+ void claim(ObjectModDesc &other) {
+ bl.clear();
+ bl.claim(other.bl);
+ can_local_rollback = other.can_local_rollback;
+- stashed = other.stashed;
++ rollback_info_completed = other.rollback_info_completed;
+ }
+ void claim_append(ObjectModDesc &other) {
+- if (!can_local_rollback || stashed)
++ if (!can_local_rollback || rollback_info_completed)
+ return;
+ if (!other.can_local_rollback) {
+ mark_unrollbackable();
+ return;
+ }
+ bl.claim_append(other.bl);
+- stashed = other.stashed;
++ rollback_info_completed = other.rollback_info_completed;
+ }
+ void swap(ObjectModDesc &other) {
+ bl.swap(other.bl);
+
+ bool temp = other.can_local_rollback;
+ other.can_local_rollback = can_local_rollback;
+ can_local_rollback = temp;
+
+- temp = other.stashed;
+- other.stashed = stashed;
+- stashed = temp;
++ temp = other.rollback_info_completed;
++ other.rollback_info_completed = rollback_info_completed;
++ rollback_info_completed = temp;
+ }
+ void append_id(ModID id) {
+ uint8_t _id(id);
+ ::encode(_id, bl);
+ }
+ void append(uint64_t old_size) {
+- if (!can_local_rollback || stashed)
++ if (!can_local_rollback || rollback_info_completed)
+ return;
+ ENCODE_START(1, 1, bl);
+ append_id(APPEND);
+ ::encode(old_size, bl);
+ ENCODE_FINISH(bl);
+ }
+ void setattrs(map<string, boost::optional<bufferlist> > &old_attrs) {
+- if (!can_local_rollback || stashed)
++ if (!can_local_rollback || rollback_info_completed)
+ return;
+ ENCODE_START(1, 1, bl);
+ append_id(SETATTRS);
+ ::encode(old_attrs, bl);
+ ENCODE_FINISH(bl);
+ }
+ bool rmobject(version_t deletion_version) {
+- if (!can_local_rollback || stashed)
++ if (!can_local_rollback || rollback_info_completed)
+ return false;
+ ENCODE_START(1, 1, bl);
+ append_id(DELETE);
+ ::encode(deletion_version, bl);
+ ENCODE_FINISH(bl);
+- stashed = true;
++ rollback_info_completed = true;
+ return true;
+ }
+ void create() {
+- if (!can_local_rollback || stashed)
++ if (!can_local_rollback || rollback_info_completed)
+ return;
++ rollback_info_completed = true;
+ ENCODE_START(1, 1, bl);
+ append_id(CREATE);
+ ENCODE_FINISH(bl);
+ }
+ void update_snaps(set<snapid_t> &old_snaps) {
+- if (!can_local_rollback || stashed)
++ if (!can_local_rollback || rollback_info_completed)
+ return;
+ ENCODE_START(1, 1, bl);
+ append_id(UPDATE_SNAPS);
+ ::encode(old_snaps, bl);
+@@ -2060,8 +2099,12 @@
+
+ // We can rollback rollback-able entries > can_rollback_to
+ eversion_t can_rollback_to;
+
++ // always <= can_rollback_to, indicates how far stashed rollback
++ // data can be found
++ eversion_t rollback_info_trimmed_to;
++
+ list<pg_log_entry_t> log; // the actual log.
+
+ pg_log_t() {}
+
+@@ -2761,21 +2804,23 @@
+ return false;
+ }
+ }
+
+- bool get_write(OpRequestRef op) {
+- if (get_write_lock()) {
++ bool get_write(OpRequestRef op, bool greedy=false) {
++ if (get_write_lock(greedy)) {
+ return true;
+ } // else
+ if (op)
+ waiters.push_back(op);
+ return false;
+ }
+- bool get_write_lock() {
+- // don't starve anybody!
+- if (!waiters.empty() ||
+- backfill_read_marker) {
+- return false;
++ bool get_write_lock(bool greedy=false) {
++ if (!greedy) {
++ // don't starve anybody!
++ if (!waiters.empty() ||
++ backfill_read_marker) {
++ return false;
++ }
+ }
+ switch (state) {
+ case RWNONE:
+ assert(count == 0);
+@@ -2822,9 +2867,12 @@
+ bool get_read(OpRequestRef op) {
+ return rwstate.get_read(op);
+ }
+ bool get_write(OpRequestRef op) {
+- return rwstate.get_write(op);
++ return rwstate.get_write(op, false);
++ }
++ bool get_write_greedy(OpRequestRef op) {
++ return rwstate.get_write(op, true);
+ }
+ bool get_snaptrimmer_write() {
+ if (rwstate.get_write_lock()) {
+ return true;
+--- a/src/osdc/Objecter.cc
++++ b/src/osdc/Objecter.cc
+@@ -1363,8 +1363,13 @@
+ }
+
+ ldout(cct, 10) << __func__ << " tid " << tid << dendl;
+ Op *op = p->second;
++ if (op->con) {
++ ldout(cct, 20) << " revoking rx buffer for " << tid
++ << " on " << op->con << dendl;
++ op->con->revoke_rx_buffer(tid);
++ }
+ if (op->onack) {
+ op->onack->complete(r);
+ op->onack = NULL;
+ }
+@@ -1433,9 +1438,9 @@
+ return -ENOENT;
+ return p->raw_hash_to_pg(p->hash_key(key, ns));
+ }
+
+-int Objecter::calc_target(op_target_t *t)
++int Objecter::calc_target(op_target_t *t, bool any_change)
+ {
+ bool is_read = t->flags & CEPH_OSD_FLAG_READ;
+ bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
+
+@@ -1490,9 +1495,10 @@
+ need_resend = true;
+ }
+
+ if (t->pgid != pgid ||
+- is_pg_changed(t->primary, t->acting, primary, acting, t->used_replica) ||
++ is_pg_changed(
++ t->primary, t->acting, primary, acting, t->used_replica || any_change) ||
+ force_resend) {
+ t->pgid = pgid;
+ t->acting = acting;
+ t->primary = primary;
+@@ -1569,9 +1575,9 @@
+ }
+
+ bool Objecter::recalc_linger_op_target(LingerOp *linger_op)
+ {
+- int r = calc_target(&linger_op->target);
++ int r = calc_target(&linger_op->target, true);
+ if (r == RECALC_OP_TARGET_NEED_RESEND) {
+ ldout(cct, 10) << "recalc_linger_op_target tid " << linger_op->linger_id
+ << " pgid " << linger_op->target.pgid
+ << " acting " << linger_op->target.acting << dendl;
+--- a/src/osdc/Objecter.h
++++ b/src/osdc/Objecter.h
+@@ -1479,9 +1479,9 @@
+ };
+ bool osdmap_full_flag() const;
+ bool target_should_be_paused(op_target_t *op);
+
+- int calc_target(op_target_t *t);
++ int calc_target(op_target_t *t, bool any_change=false);
+ int recalc_op_target(Op *op);
+ bool recalc_linger_op_target(LingerOp *op);
+
+ void send_linger(LingerOp *info);
+--- a/src/pybind/rbd.py
++++ b/src/pybind/rbd.py
+@@ -749,8 +749,16 @@
+ ret = self.librbd.rbd_flush(self.image)
+ if ret < 0:
+ raise make_ex(ret, 'error flushing image')
+
++ def invalidate_cache(self):
++ """
++ Drop any cached data for the image.
++ """
++ ret = self.librbd.rbd_invalidate_cache(self.image)
++ if ret < 0:
++ raise make_ex(ret, 'error invalidating cache')
++
+ def stripe_unit(self):
+ """
+ Returns the stripe unit used for the image.
+ """
+--- a/src/rgw/rgw_common.cc
++++ b/src/rgw/rgw_common.cc
+@@ -696,15 +696,17 @@
+ char dest[src_str.size() + 1];
+ int pos = 0;
+ char c;
+
++ bool in_query = false;
+ while (*src) {
+ if (*src != '%') {
+- if (*src != '+') {
+- dest[pos++] = *src++;
++ if (!in_query || *src != '+') {
++ if (*src == '?') in_query = true;
++ dest[pos++] = *src++;
+ } else {
+- dest[pos++] = ' ';
+- ++src;
++ dest[pos++] = ' ';
++ ++src;
+ }
+ } else {
+ src++;
+ if (!*src)
+--- a/src/rgw/rgw_op.cc
++++ b/src/rgw/rgw_op.cc
+@@ -1379,9 +1379,12 @@
+ };
+
+ int RGWPutObjProcessor_Multipart::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
+ {
+- RGWPutObjProcessor::prepare(store, obj_ctx, NULL);
++ int r = prepare_init(store, obj_ctx, NULL);
++ if (r < 0) {
++ return r;
++ }
+
+ string oid = obj_str;
+ upload_id = s->info.args.get("uploadId");
+ if (!oid_rand) {
+@@ -1418,9 +1421,9 @@
+ manifest.set_prefix(upload_prefix);
+
+ manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, num);
+
+- int r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, target_obj);
++ r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, target_obj);
+ if (r < 0) {
+ return r;
+ }
+
+@@ -1559,8 +1562,38 @@
+
+ return 0;
+ }
+
++static int put_data_and_throttle(RGWPutObjProcessor *processor, bufferlist& data, off_t ofs,
++ MD5 *hash, bool need_to_wait)
++{
++ const unsigned char *data_ptr = (hash ? (const unsigned char *)data.c_str() : NULL);
++ bool again;
++ uint64_t len = data.length();
++
++ do {
++ void *handle;
++
++ int ret = processor->handle_data(data, ofs, &handle, &again);
++ if (ret < 0)
++ return ret;
++
++ if (hash) {
++ hash->Update(data_ptr, len);
++ hash = NULL; /* only calculate hash once */
++ }
++
++ ret = processor->throttle_data(handle, need_to_wait);
++ if (ret < 0)
++ return ret;
++
++ need_to_wait = false; /* the need to wait only applies to the first iteration */
++ } while (again);
++
++ return 0;
++}
++
++
+ void RGWPutObj::execute()
+ {
+ RGWPutObjProcessor *processor = NULL;
+ char supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1];
+@@ -1632,25 +1665,14 @@
+ }
+ if (!len)
+ break;
+
+- void *handle;
+- const unsigned char *data_ptr = (const unsigned char *)data.c_str();
+-
+- ret = processor->handle_data(data, ofs, &handle);
+- if (ret < 0)
+- goto done;
+-
+- if (need_calc_md5) {
+- hash.Update(data_ptr, len);
+- }
+-
+ /* do we need this operation to be synchronous? if we're dealing with an object with immutable
+ * head, e.g., multipart object we need to make sure we're the first one writing to this object
+ */
+ bool need_to_wait = (ofs == 0) && multipart;
+
+- ret = processor->throttle_data(handle, need_to_wait);
++ ret = put_data_and_throttle(processor, data, ofs, (need_calc_md5 ? &hash : NULL), need_to_wait);
+ if (ret < 0) {
+ if (!need_to_wait || ret != -EEXIST) {
+ ldout(s->cct, 20) << "processor->thottle_data() returned ret=" << ret << dendl;
+ goto done;
+@@ -1673,17 +1695,10 @@
+ ldout(s->cct, 0) << "ERROR: processor->prepare() returned " << ret << dendl;
+ goto done;
+ }
+
+- ret = processor->handle_data(data, ofs, &handle);
++ ret = put_data_and_throttle(processor, data, ofs, NULL, false);
+ if (ret < 0) {
+- ldout(s->cct, 0) << "ERROR: processor->handle_data() returned " << ret << dendl;
+- goto done;
+- }
+-
+- ret = processor->throttle_data(handle, false);
+- if (ret < 0) {
+- ldout(s->cct, 0) << "ERROR: processor->throttle_data() returned " << ret << dendl;
+ goto done;
+ }
+ }
+
+@@ -1845,20 +1860,9 @@
+
+ if (!len)
+ break;
+
+- void *handle;
+- const unsigned char *data_ptr = (const unsigned char *)data.c_str();
+-
+- ret = processor->handle_data(data, ofs, &handle);
+- if (ret < 0)
+- goto done;
+-
+- hash.Update(data_ptr, len);
+-
+- ret = processor->throttle_data(handle, false);
+- if (ret < 0)
+- goto done;
++ ret = put_data_and_throttle(processor, data, ofs, &hash, false);
+
+ ofs += len;
+
+ if (ofs > max_len) {
+--- a/src/rgw/rgw_rados.cc
++++ b/src/rgw/rgw_rados.cc
+@@ -899,10 +899,12 @@
+
+ return 0;
+ };
+
+-int RGWPutObjProcessor_Plain::handle_data(bufferlist& bl, off_t _ofs, void **phandle)
++int RGWPutObjProcessor_Plain::handle_data(bufferlist& bl, off_t _ofs, void **phandle, bool *again)
+ {
++ *again = false;
++
+ if (ofs != _ofs)
+ return -EINVAL;
+
+ data.append(bl);
+@@ -1025,10 +1027,12 @@
+
+ return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
+ }
+
+-int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle)
++int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again)
+ {
++ *again = false;
++
+ *phandle = NULL;
+ if (extra_data_len) {
+ size_t extra_len = bl.length();
+ if (extra_len > extra_data_len)
+@@ -1043,15 +1047,18 @@
+ return 0;
+ }
+ }
+
+- uint64_t max_chunk_size = store->get_max_chunk_size();
++ uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
+
+ pending_data_bl.claim_append(bl);
+- if (pending_data_bl.length() < max_chunk_size)
++ if (pending_data_bl.length() < max_write_size)
+ return 0;
+
+- pending_data_bl.splice(0, max_chunk_size, &bl);
++ pending_data_bl.splice(0, max_write_size, &bl);
++
++ /* do we have enough data pending accumulated that needs to be written? */
++ *again = (pending_data_bl.length() >= max_chunk_size);
+
+ if (!data_ofs && !immutable_head()) {
+ first_chunk.claim(bl);
+ obj_len = (uint64_t)first_chunk.length();
+@@ -1069,19 +1076,32 @@
+ object and cleanup can be messy */
+ return write_data(bl, write_ofs, phandle, exclusive);
+ }
+
+-int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
++
++int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, void *obj_ctx, string *oid_rand)
+ {
+ RGWPutObjProcessor::prepare(store, obj_ctx, oid_rand);
+
+- head_obj.init(bucket, obj_str);
++ int r = store->get_max_chunk_size(bucket, &max_chunk_size);
++ if (r < 0) {
++ return r;
++ }
++
++ return 0;
++}
+
+- uint64_t max_chunk_size = store->get_max_chunk_size();
++int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
++{
++ int r = prepare_init(store, obj_ctx, oid_rand);
++ if (r < 0) {
++ return r;
++ }
++ head_obj.init(bucket, obj_str);
+
+ manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
+
+- int r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, head_obj);
++ r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, head_obj);
+ if (r < 0) {
+ return r;
+ }
+
+@@ -1200,8 +1220,46 @@
+ objs_state[new_obj].prefetch_data = true;
+ }
+ }
+
++int RGWRados::get_required_alignment(rgw_bucket& bucket, uint64_t *alignment)
++{
++ IoCtx ioctx;
++ int r = open_bucket_data_ctx(bucket, ioctx);
++ if (r < 0) {
++ ldout(cct, 0) << "ERROR: open_bucket_data_ctx() returned " << r << dendl;
++ return r;
++ }
++
++ *alignment = ioctx.pool_required_alignment();
++ return 0;
++}
++
++int RGWRados::get_max_chunk_size(rgw_bucket& bucket, uint64_t *max_chunk_size)
++{
++ uint64_t alignment;
++ int r = get_required_alignment(bucket, &alignment);
++ if (r < 0) {
++ return r;
++ }
++
++ uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
++
++ if (alignment == 0) {
++ *max_chunk_size = config_chunk_size;
++ return 0;
++ }
++
++ if (config_chunk_size <= alignment) {
++ *max_chunk_size = alignment;
++ return 0;
++ }
++
++ *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
++
++ return 0;
++}
++
+ void RGWRados::finalize()
+ {
+ if (need_watch_notify()) {
+ finalize_watch();
+@@ -1235,10 +1293,8 @@
+ int RGWRados::init_rados()
+ {
+ int ret;
+
+- max_chunk_size = cct->_conf->rgw_max_chunk_size;
+-
+ rados = new Rados();
+ if (!rados)
+ return -ENOMEM;
+
+@@ -2956,27 +3012,35 @@
+ progress_data(_progress_data) {}
+ int handle_data(bufferlist& bl, off_t ofs, off_t len) {
+ progress_cb(ofs, progress_data);
+
+- void *handle;
+- int ret = processor->handle_data(bl, ofs, &handle);
+- if (ret < 0)
+- return ret;
++ bool again;
+
+- if (opstate) {
+- /* need to update opstate repository with new state. This is ratelimited, so we're not
+- * really doing it every time
+- */
+- ret = opstate->renew_state();
+- if (ret < 0) {
+- /* could not renew state! might have been marked as cancelled */
++ bool need_opstate = true;
++
++ do {
++ void *handle;
++ int ret = processor->handle_data(bl, ofs, &handle, &again);
++ if (ret < 0)
+ return ret;
++
++ if (need_opstate && opstate) {
++ /* need to update opstate repository with new state. This is ratelimited, so we're not
++ * really doing it every time
++ */
++ ret = opstate->renew_state();
++ if (ret < 0) {
++ /* could not renew state! might have been marked as cancelled */
++ return ret;
++ }
++
++ need_opstate = false;
+ }
+- }
+
+- ret = processor->throttle_data(handle, false);
+- if (ret < 0)
+- return ret;
++ ret = processor->throttle_data(handle, false);
++ if (ret < 0)
++ return ret;
++ } while (again);
+
+ return 0;
+ }
+
+@@ -3191,26 +3255,8 @@
+ return ret;
+
+ vector<rgw_obj> ref_objs;
+
+- bool copy_data = !astate->has_manifest;
+- bool copy_first = false;
+- if (astate->has_manifest) {
+- if (!astate->manifest.has_tail()) {
+- copy_data = true;
+- } else {
+- uint64_t head_size = astate->manifest.get_head_size();
+-
+- if (head_size > 0) {
+- if (head_size > max_chunk_size) // should never happen
+- copy_data = true;
+- else
+- copy_first = true;
+- }
+- }
+- }
+-
+-
+ if (remote_dest) {
+ /* dest is in a different region, copy it there */
+
+ string etag;
+@@ -3229,10 +3275,37 @@
+ if (ret < 0)
+ return ret;
+
+ return 0;
+- } else if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
+- return copy_obj_data(ctx, dest_bucket_info.owner, &handle, end, dest_obj, src_obj, mtime, src_attrs, category, ptag, err);
++ }
++
++ uint64_t max_chunk_size;
++
++ ret = get_max_chunk_size(dest_obj.bucket, &max_chunk_size);
++ if (ret < 0) {
++ ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
++ return ret;
++ }
++
++ bool copy_data = !astate->has_manifest;
++ bool copy_first = false;
++ if (astate->has_manifest) {
++ if (!astate->manifest.has_tail()) {
++ copy_data = true;
++ } else {
++ uint64_t head_size = astate->manifest.get_head_size();
++
++ if (head_size > 0) {
++ if (head_size > max_chunk_size)
++ copy_data = true;
++ else
++ copy_first = true;
++ }
++ }
++ }
++
++ if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
++ return copy_obj_data(ctx, dest_bucket_info.owner, &handle, end, dest_obj, src_obj, max_chunk_size, mtime, src_attrs, category, ptag, err);
+ }
+
+ RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
+
+@@ -3340,8 +3413,9 @@
+ const string& owner,
+ void **handle, off_t end,
+ rgw_obj& dest_obj,
+ rgw_obj& src_obj,
++ uint64_t max_chunk_size,
+ time_t *mtime,
+ map<string, bufferlist>& attrs,
+ RGWObjCategory category,
+ string *ptag,
+@@ -4472,8 +4546,10 @@
+
+ bool merge_bl = false;
+ bufferlist *pbl = &bl;
+ bufferlist read_bl;
++ uint64_t max_chunk_size;
++
+
+ get_obj_bucket_and_oid_key(obj, bucket, oid, key);
+
+ if (!rctx) {
+@@ -4504,8 +4580,14 @@
+ get_obj_bucket_and_oid_key(read_obj, bucket, oid, key);
+ }
+ }
+
++ r = get_max_chunk_size(bucket, &max_chunk_size);
++ if (r < 0) {
++ ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << bucket << dendl;
++ goto done_ret;
++ }
++
+ if (len > max_chunk_size)
+ len = max_chunk_size;
+
+
+--- a/src/rgw/rgw_rados.h
++++ b/src/rgw/rgw_rados.h
+@@ -547,9 +547,9 @@
+ store = _store;
+ obj_ctx = _o;
+ return 0;
+ };
+- virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle) = 0;
++ virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again) = 0;
+ virtual int throttle_data(void *handle, bool need_to_wait) = 0;
+ virtual int complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs);
+ };
+
+@@ -563,9 +563,9 @@
+ off_t ofs;
+
+ protected:
+ int prepare(RGWRados *store, void *obj_ctx, string *oid_rand);
+- int handle_data(bufferlist& bl, off_t ofs, void **phandle);
++ int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again);
+ int do_complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs);
+
+ public:
+ int throttle_data(void *handle, bool need_to_wait) { return 0; }
+@@ -612,8 +612,10 @@
+
+ uint64_t extra_data_len;
+ bufferlist extra_data_bl;
+ bufferlist pending_data_bl;
++ uint64_t max_chunk_size;
++
+ protected:
+ rgw_bucket bucket;
+ string obj_str;
+
+@@ -630,8 +632,10 @@
+ int prepare_next_part(off_t ofs);
+ int complete_parts();
+ int complete_writing_data();
+
++ int prepare_init(RGWRados *store, void *obj_ctx, string *oid_rand);
++
+ public:
+ ~RGWPutObjProcessor_Atomic() {}
+ RGWPutObjProcessor_Atomic(const string& bucket_owner, rgw_bucket& _b, const string& _o, uint64_t _p, const string& _t) :
+ RGWPutObjProcessor_Aio(bucket_owner),
+@@ -640,17 +644,18 @@
+ next_part_ofs(_p),
+ cur_part_id(0),
+ data_ofs(0),
+ extra_data_len(0),
++ max_chunk_size(0),
+ bucket(_b),
+ obj_str(_o),
+ unique_tag(_t) {}
+ int prepare(RGWRados *store, void *obj_ctx, string *oid_rand);
+ virtual bool immutable_head() { return false; }
+ void set_extra_data_len(uint64_t len) {
+ extra_data_len = len;
+ }
+- virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle);
++ virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again);
+ bufferlist& get_extra_data() { return extra_data_bl; }
+ };
+
+
+@@ -1220,10 +1225,8 @@
+ int get_obj_ioctx(const rgw_obj& obj, librados::IoCtx *ioctx);
+ int get_obj_ref(const rgw_obj& obj, rgw_rados_ref *ref, rgw_bucket *bucket, bool ref_system_obj = false);
+ uint64_t max_bucket_id;
+
+- uint64_t max_chunk_size;
+-
+ int get_obj_state(RGWRadosCtx *rctx, rgw_obj& obj, RGWObjState **state, RGWObjVersionTracker *objv_tracker);
+ int append_atomic_test(RGWRadosCtx *rctx, rgw_obj& obj,
+ librados::ObjectOperation& op, RGWObjState **state);
+ int prepare_atomic_for_write_impl(RGWRadosCtx *rctx, rgw_obj& obj,
+@@ -1286,9 +1289,8 @@
+ gc(NULL), use_gc_thread(false), quota_threads(false),
+ num_watchers(0), watchers(NULL), watch_handles(NULL),
+ watch_initialized(false),
+ bucket_id_lock("rados_bucket_id"), max_bucket_id(0),
+- max_chunk_size(0),
+ cct(NULL), rados(NULL),
+ pools_initialized(false),
+ quota_handler(NULL),
+ rest_master_conn(NULL),
+@@ -1324,11 +1326,10 @@
+ delete rados;
+ }
+ }
+
+- uint64_t get_max_chunk_size() {
+- return max_chunk_size;
+- }
++ int get_required_alignment(rgw_bucket& bucket, uint64_t *alignment);
++ int get_max_chunk_size(rgw_bucket& bucket, uint64_t *max_chunk_size);
+
+ int list_raw_objects(rgw_bucket& pool, const string& prefix_filter, int max,
+ RGWListRawObjsCtx& ctx, list<string>& oids,
+ bool *is_truncated);
+@@ -1562,8 +1563,9 @@
+ const string& owner,
+ void **handle, off_t end,
+ rgw_obj& dest_obj,
+ rgw_obj& src_obj,
++ uint64_t max_chunk_size,
+ time_t *mtime,
+ map<string, bufferlist>& attrs,
+ RGWObjCategory category,
+ string *ptag,
+--- a/src/rgw/rgw_rest.cc
++++ b/src/rgw/rgw_rest.cc
+@@ -179,9 +179,9 @@
+ {
+ std::ostringstream oss;
+ formatter->flush(oss);
+ std::string outs(oss.str());
+- if (!outs.empty()) {
++ if (!outs.empty() && s->op != OP_HEAD) {
+ s->cio->write(outs.c_str(), outs.size());
+ }
+
+ s->formatter->reset();
+@@ -191,9 +191,9 @@
+ {
+ std::ostringstream oss;
+ formatter->flush(oss);
+ std::string outs(oss.str());
+- if (!outs.empty()) {
++ if (!outs.empty() && s->op != OP_HEAD) {
+ s->cio->write(outs.c_str(), outs.size());
+ }
+ }
+
+--- a/src/rgw/rgw_rest_swift.cc
++++ b/src/rgw/rgw_rest_swift.cc
+@@ -626,20 +626,18 @@
+ string hdrs, exp_hdrs;
+ uint32_t max_age = CORS_MAX_AGE_INVALID;
+ /*EACCES means, there is no CORS registered yet for the bucket
+ *ENOENT means, there is no match of the Origin in the list of CORSRule
+- *ENOTSUPP means, the HTTP_METHOD is not supported
+ */
+ if (ret == -ENOENT)
+ ret = -EACCES;
+- if (ret != -EACCES) {
+- get_response_params(hdrs, exp_hdrs, &max_age);
+- } else {
++ if (ret < 0) {
+ set_req_state_err(s, ret);
+ dump_errno(s);
+ end_header(s, NULL);
+ return;
+ }
++ get_response_params(hdrs, exp_hdrs, &max_age);
+ dump_errno(s);
+ dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(), max_age);
+ end_header(s, NULL);
+ }
+--- a/src/test/crush/TestCrushWrapper.cc
++++ b/src/test/crush/TestCrushWrapper.cc
+@@ -537,8 +537,13 @@
+ EXPECT_NE(string::npos,
+ ss.str().find("<item_name>default</item_name></step>"));
+ }
+
++ map<int,float> wm;
++ c->get_rule_weight_osd_map(0, &wm);
++ ASSERT_TRUE(wm.size() == 1);
++ ASSERT_TRUE(wm[0] == 1.0);
++
+ delete c;
+ }
+
+ TEST(CrushWrapper, distance) {
+--- a/src/test/erasure-code/TestErasureCodeJerasure.cc
++++ b/src/test/erasure-code/TestErasureCodeJerasure.cc
+@@ -287,8 +287,38 @@
+ c->insert_item(g_ceph_context, osd, 1.0, string("osd.") + stringify(osd), loc);
+ }
+ }
+
++ //
++ // The ruleid may be different from the ruleset when a crush rule is
++ // removed because the removed ruleid will be reused but the removed
++ // ruleset will not be reused.
++ //
++ // This also asserts that the create_ruleset() method returns a
++ // ruleset and not a ruleid http://tracker.ceph.com/issues/9044
++ //
++ {
++ stringstream ss;
++ ErasureCodeJerasureReedSolomonVandermonde jerasure;
++ map<std::string,std::string> parameters;
++ parameters["k"] = "2";
++ parameters["m"] = "2";
++ parameters["w"] = "8";
++ jerasure.init(parameters);
++ int FIRST = jerasure.create_ruleset("FIRST", *c, &ss);
++ int SECOND = jerasure.create_ruleset("SECOND", *c, &ss);
++ int FIRST_ruleid = c->get_rule_id("FIRST");
++ EXPECT_EQ(0, c->remove_rule(FIRST_ruleid));
++ int ruleset = jerasure.create_ruleset("myrule", *c, &ss);
++ EXPECT_NE(FIRST, ruleset);
++ EXPECT_NE(SECOND, ruleset);
++ EXPECT_NE(ruleset, c->get_rule_id("myrule"));
++ int SECOND_ruleid = c->get_rule_id("SECOND");
++ EXPECT_EQ(0, c->remove_rule(SECOND_ruleid));
++ int myrule_ruleid = c->get_rule_id("myrule");
++ EXPECT_EQ(0, c->remove_rule(myrule_ruleid));
++ }
++
+ {
+ stringstream ss;
+ ErasureCodeJerasureReedSolomonVandermonde jerasure;
+ map<std::string,std::string> parameters;
+--- a/src/test/librados/TestCase.cc
++++ b/src/test/librados/TestCase.cc
+@@ -7,8 +7,9 @@
+
+ using namespace librados;
+
+ std::string RadosTest::pool_name;
++std::string RadosTest::nspace;
+ rados_t RadosTest::s_cluster = NULL;
+
+ void RadosTest::SetUpTestCase()
+ {
+@@ -24,9 +25,9 @@
+ void RadosTest::SetUp()
+ {
+ cluster = RadosTest::s_cluster;
+ ASSERT_EQ(0, rados_ioctx_create(cluster, pool_name.c_str(), &ioctx));
+- std::string nspace = get_temp_pool_name();
++ nspace = get_temp_pool_name();
+ rados_ioctx_set_namespace(ioctx, nspace.c_str());
+ ASSERT_FALSE(rados_ioctx_pool_requires_alignment(ioctx));
+ }
+
+@@ -205,26 +206,8 @@
+ cleanup_default_namespace(ioctx);
+ rados_ioctx_destroy(ioctx);
+ }
+
+-void RadosTestEC::cleanup_default_namespace(rados_ioctx_t ioctx)
+-{
+- // remove all objects from the default namespace to avoid polluting
+- // other tests
+- rados_ioctx_set_namespace(ioctx, "");
+- rados_list_ctx_t list_ctx;
+- ASSERT_EQ(0, rados_objects_list_open(ioctx, &list_ctx));
+- int r;
+- const char *entry = NULL;
+- const char *key = NULL;
+- while ((r = rados_objects_list_next(list_ctx, &entry, &key)) != -ENOENT) {
+- ASSERT_EQ(0, r);
+- rados_ioctx_locator_set_key(ioctx, key);
+- ASSERT_EQ(0, rados_remove(ioctx, entry));
+- }
+- rados_objects_list_close(list_ctx);
+-}
+-
+ std::string RadosTestECPP::pool_name;
+ Rados RadosTestECPP::s_cluster;
+
+ void RadosTestECPP::SetUpTestCase()
+@@ -253,15 +236,4 @@
+ cleanup_default_namespace(ioctx);
+ ioctx.close();
+ }
+
+-void RadosTestECPP::cleanup_default_namespace(librados::IoCtx ioctx)
+-{
+- // remove all objects from the default namespace to avoid polluting
+- // other tests
+- ioctx.set_namespace("");
+- for (ObjectIterator it = ioctx.objects_begin();
+- it != ioctx.objects_end(); ++it) {
+- ioctx.locator_set_key(it->second);
+- ASSERT_EQ(0, ioctx.remove(it->first));
+- }
+-}
+--- a/src/test/librados/TestCase.h
++++ b/src/test/librados/TestCase.h
+@@ -27,8 +27,9 @@
+ static void TearDownTestCase();
+ static void cleanup_default_namespace(rados_ioctx_t ioctx);
+ static rados_t s_cluster;
+ static std::string pool_name;
++ static std::string nspace;
+
+ virtual void SetUp();
+ virtual void TearDown();
+ rados_t cluster;
+@@ -71,16 +72,15 @@
+ librados::IoCtx ioctx;
+ std::string ns;
+ };
+
+-class RadosTestEC : public ::testing::Test {
++class RadosTestEC : public RadosTest {
+ public:
+ RadosTestEC() {}
+ virtual ~RadosTestEC() {}
+ protected:
+ static void SetUpTestCase();
+ static void TearDownTestCase();
+- static void cleanup_default_namespace(rados_ioctx_t ioctx);
+ static rados_t s_cluster;
+ static std::string pool_name;
+
+ virtual void SetUp();
+@@ -89,16 +89,15 @@
+ rados_ioctx_t ioctx;
+ uint64_t alignment;
+ };
+
+-class RadosTestECPP : public ::testing::Test {
++class RadosTestECPP : public RadosTestPP {
+ public:
+ RadosTestECPP() : cluster(s_cluster) {};
+ virtual ~RadosTestECPP() {};
+ protected:
+ static void SetUpTestCase();
+ static void TearDownTestCase();
+- static void cleanup_default_namespace(librados::IoCtx ioctx);
+ static librados::Rados s_cluster;
+ static std::string pool_name;
+
+ virtual void SetUp();
+--- a/src/test/librados/io.cc
++++ b/src/test/librados/io.cc
+@@ -24,8 +24,60 @@
+ rados_ioctx_set_namespace(ioctx, "nspace");
+ ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+ }
+
++TEST_F(LibRadosIo, ReadTimeout) {
++ char buf[128];
++ memset(buf, 'a', sizeof(buf));
++ ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
++
++ {
++ // set up a second client
++ rados_t cluster;
++ rados_ioctx_t ioctx;
++ rados_create(&cluster, "admin");
++ rados_conf_read_file(cluster, NULL);
++ rados_conf_parse_env(cluster, NULL);
++ rados_conf_set(cluster, "rados_osd_op_timeout", "0.00001"); // use any small value that will result in a timeout
++ rados_connect(cluster);
++ rados_ioctx_create(cluster, pool_name.c_str(), &ioctx);
++ rados_ioctx_set_namespace(ioctx, nspace.c_str());
++
++ // then we show that the buffer is changed after rados_read returned
++ // with a timeout
++ for (int i=0; i<5; i++) {
++ char buf2[sizeof(buf)];
++ memset(buf2, 0, sizeof(buf2));
++ int err = rados_read(ioctx, "foo", buf2, sizeof(buf2), 0);
++ if (err == -110) {
++ int startIndex = 0;
++ // find the index until which librados already read the object before the timeout occurred
++ for (unsigned b=0; b<sizeof(buf); b++) {
++ if (buf2[b] != buf[b]) {
++ startIndex = b;
++ break;
++ }
++ }
++
++ // wait some time to give librados a chance to do something
++ sleep(1);
++
++ // then check if the buffer was changed after the call
++ if (buf2[startIndex] == 'a') {
++ printf("byte at index %d was changed after the timeout to %d\n",
++ startIndex, (int)buf[startIndex]);
++ ASSERT_TRUE(0);
++ break;
++ }
++ } else {
++ printf("no timeout :/\n");
++ }
++ }
++ rados_ioctx_destroy(ioctx);
++ rados_shutdown(cluster);
++ }
++}
++
+ TEST_F(LibRadosIoPP, SimpleWritePP) {
+ char buf[128];
+ memset(buf, 0xcc, sizeof(buf));
+ bufferlist bl;
+--- a/src/test/librados/tier.cc
++++ b/src/test/librados/tier.cc
+@@ -33,8 +33,40 @@
+
+ typedef RadosTestPP LibRadosTierPP;
+ typedef RadosTestECPP LibRadosTierECPP;
+
++void flush_evict_all(librados::Rados& cluster, librados::IoCtx& cache_ioctx)
++{
++ bufferlist inbl;
++ cache_ioctx.set_namespace("");
++ for (ObjectIterator it = cache_ioctx.objects_begin();
++ it != cache_ioctx.objects_end(); ++it) {
++ cache_ioctx.locator_set_key(it->second);
++ {
++ ObjectReadOperation op;
++ op.cache_flush();
++ librados::AioCompletion *completion = cluster.aio_create_completion();
++ cache_ioctx.aio_operate(
++ it->first, completion, &op,
++ librados::OPERATION_IGNORE_OVERLAY, NULL);
++ completion->wait_for_safe();
++ completion->get_return_value();
++ completion->release();
++ }
++ {
++ ObjectReadOperation op;
++ op.cache_evict();
++ librados::AioCompletion *completion = cluster.aio_create_completion();
++ cache_ioctx.aio_operate(
++ it->first, completion, &op,
++ librados::OPERATION_IGNORE_OVERLAY, NULL);
++ completion->wait_for_safe();
++ completion->get_return_value();
++ completion->release();
++ }
++ }
++}
++
+ class LibRadosTwoPoolsPP : public RadosTestPP
+ {
+ public:
+ LibRadosTwoPoolsPP() {};
+@@ -58,9 +90,28 @@
+ cache_ioctx.set_namespace(ns);
+ }
+ virtual void TearDown() {
+ RadosTestPP::TearDown();
++
++ // flush + evict cache
++ flush_evict_all(cluster, cache_ioctx);
++
++ bufferlist inbl;
++ // tear down tiers
++ ASSERT_EQ(0, cluster.mon_command(
++ "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
++ "\"}",
++ inbl, NULL, NULL));
++ ASSERT_EQ(0, cluster.mon_command(
++ "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
++ "\", \"tierpool\": \"" + cache_pool_name + "\"}",
++ inbl, NULL, NULL));
++
++ // wait for maps to settle before next test
++ cluster.wait_for_latest_osdmap();
++
+ cleanup_default_namespace(cache_ioctx);
++
+ cache_ioctx.close();
+ }
+ librados::IoCtx cache_ioctx;
+ };
+@@ -179,21 +230,8 @@
+ ASSERT_EQ(0, completion->get_return_value());
+ completion->release();
+ ASSERT_EQ('b', bl[0]);
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsPP, Promote) {
+ // create object
+@@ -246,21 +284,8 @@
+ ASSERT_TRUE(it->first == string("foo") || it->first == string("bar"));
+ ++it;
+ ASSERT_TRUE(it == cache_ioctx.objects_end());
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsPP, PromoteSnap) {
+ // create object
+@@ -399,21 +424,8 @@
+ {
+ bufferlist bl;
+ ASSERT_EQ(-ENOENT, ioctx.read("baz", bl, 1, 0));
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsPP, PromoteSnapScrub) {
+ int num = 100;
+@@ -508,21 +520,8 @@
+ cout << "done waiting" << std::endl;
+ }
+
+ ioctx.snap_set_read(librados::SNAP_HEAD);
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+
+ TEST_F(LibRadosTwoPoolsPP, PromoteSnapTrimRace) {
+@@ -576,21 +575,8 @@
+ {
+ bufferlist bl;
+ ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsPP, Whiteout) {
+ // create object
+@@ -652,21 +638,8 @@
+ bufferlist bl;
+ ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+ ASSERT_EQ('h', bl[0]);
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsPP, Evict) {
+ // create object
+@@ -755,21 +728,8 @@
+ completion->wait_for_safe();
+ ASSERT_EQ(-EBUSY, completion->get_return_value());
+ completion->release();
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsPP, EvictSnap) {
+ // create object
+@@ -1003,21 +963,8 @@
+ completion->wait_for_safe();
+ ASSERT_EQ(0, completion->get_return_value());
+ completion->release();
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsPP, TryFlush) {
+ // configure cache
+@@ -1124,21 +1071,8 @@
+ {
+ ObjectIterator it = cache_ioctx.objects_begin();
+ ASSERT_TRUE(it == cache_ioctx.objects_end());
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsPP, Flush) {
+ // configure cache
+@@ -1297,21 +1231,8 @@
+ {
+ ObjectIterator it = ioctx.objects_begin();
+ ASSERT_TRUE(it == ioctx.objects_end());
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsPP, FlushSnap) {
+ // configure cache
+@@ -1469,20 +1390,13 @@
+ ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+ ASSERT_EQ('a', bl[0]);
+ }
+
+- // tear down tiers
++ // remove overlay
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+ "\"}",
+ inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle
+- cluster.wait_for_latest_osdmap();
+
+ // verify i can read the snaps from the base pool
+ ioctx.snap_set_read(librados::SNAP_HEAD);
+ {
+@@ -1501,8 +1415,13 @@
+ bufferlist bl;
+ ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+ ASSERT_EQ('a', bl[0]);
+ }
++
++ ASSERT_EQ(0, cluster.mon_command(
++ "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
++ "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
++ inbl, NULL, NULL));
+ }
+
+ TEST_F(LibRadosTierPP, FlushWriteRaces) {
+ Rados cluster;
+@@ -1785,21 +1704,8 @@
+ ASSERT_EQ(0, completion2->get_return_value());
+ completion->release();
+ completion2->release();
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+
+ IoCtx *read_ioctx = 0;
+@@ -1894,21 +1800,8 @@
+ max_reads = 0;
+ while (num_reads > 0)
+ cond.Wait(test_lock);
+ test_lock.Unlock();
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTierPP, HitSetNone) {
+ {
+@@ -1943,23 +1836,30 @@
+ + string("\",\"var\": \"") + var + string("\",\"val\": \"")
+ + stringify(val) + string("\"}");
+ }
+
+-TEST_F(LibRadosTierPP, HitSetRead) {
+- // enable hitset tracking for this pool
++TEST_F(LibRadosTwoPoolsPP, HitSetRead) {
++ // make it a tier
+ bufferlist inbl;
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 2),
++ ASSERT_EQ(0, cluster.mon_command(
++ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
++ "\", \"tierpool\": \"" + cache_pool_name +
++ "\", \"force_nonempty\": \"--force-nonempty\" }",
++ inbl, NULL, NULL));
++
++ // enable hitset tracking for this pool
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 2),
+ inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
+ inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
+ "explicit_object"),
+ inbl, NULL, NULL));
+
+ // wait for maps to settle
+ cluster.wait_for_latest_osdmap();
+
+- ioctx.set_namespace("");
++ cache_ioctx.set_namespace("");
+
+ // keep reading until we see our object appear in the HitSet
+ utime_t start = ceph_clock_now(NULL);
+ utime_t hard_stop = start + utime_t(600, 0);
+@@ -1968,18 +1868,18 @@
+ utime_t now = ceph_clock_now(NULL);
+ ASSERT_TRUE(now < hard_stop);
+
+ string name = "foo";
+- uint32_t hash = ioctx.get_object_hash_position(name);
++ uint32_t hash = cache_ioctx.get_object_hash_position(name);
+ hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash,
+- cluster.pool_lookup(pool_name.c_str()), "");
++ cluster.pool_lookup(cache_pool_name.c_str()), "");
+
+ bufferlist bl;
+- ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
++ ASSERT_EQ(-ENOENT, cache_ioctx.read("foo", bl, 1, 0));
+
+ bufferlist hbl;
+ AioCompletion *c = librados::Rados::aio_create_completion();
+- ASSERT_EQ(0, ioctx.hit_set_get(hash, c, now.sec(), &hbl));
++ ASSERT_EQ(0, cache_ioctx.hit_set_get(hash, c, now.sec(), &hbl));
+ c->wait_for_complete();
+ c->release();
+
+ if (hbl.length()) {
+@@ -2027,49 +1927,58 @@
+ return -1;
+ }
+
+
+-TEST_F(LibRadosTierPP, HitSetWrite) {
++TEST_F(LibRadosTwoPoolsPP, HitSetWrite) {
+ int num_pg = _get_pg_num(cluster, pool_name);
+ assert(num_pg > 0);
+
+- // enable hitset tracking for this pool
++ // make it a tier
+ bufferlist inbl;
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 8),
++ ASSERT_EQ(0, cluster.mon_command(
++ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
++ "\", \"tierpool\": \"" + cache_pool_name +
++ "\", \"force_nonempty\": \"--force-nonempty\" }",
++ inbl, NULL, NULL));
++
++ // enable hitset tracking for this pool
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 8),
+ inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
+ inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
+ "explicit_hash"),
+ inbl, NULL, NULL));
+
+ // wait for maps to settle
+ cluster.wait_for_latest_osdmap();
+
+- ioctx.set_namespace("");
++ cache_ioctx.set_namespace("");
++
++ int num = 200;
+
+ // do a bunch of writes
+- for (int i=0; i<1000; ++i) {
++ for (int i=0; i<num; ++i) {
+ bufferlist bl;
+ bl.append("a");
+- ASSERT_EQ(0, ioctx.write(stringify(i), bl, 1, 0));
++ ASSERT_EQ(0, cache_ioctx.write(stringify(i), bl, 1, 0));
+ }
+
+ // get HitSets
+ std::map<int,HitSet> hitsets;
+ for (int i=0; i<num_pg; ++i) {
+ list< pair<time_t,time_t> > ls;
+ AioCompletion *c = librados::Rados::aio_create_completion();
+- ASSERT_EQ(0, ioctx.hit_set_list(i, c, &ls));
++ ASSERT_EQ(0, cache_ioctx.hit_set_list(i, c, &ls));
+ c->wait_for_complete();
+ c->release();
+ std::cout << "pg " << i << " ls " << ls << std::endl;
+ ASSERT_FALSE(ls.empty());
+
+ // get the latest
+ c = librados::Rados::aio_create_completion();
+ bufferlist bl;
+- ASSERT_EQ(0, ioctx.hit_set_get(i, c, ls.back().first, &bl));
++ ASSERT_EQ(0, cache_ioctx.hit_set_get(i, c, ls.back().first, &bl));
+ c->wait_for_complete();
+ c->release();
+
+ //std::cout << "bl len is " << bl.length() << "\n";
+@@ -2080,16 +1989,16 @@
+ ::decode(hitsets[i], p);
+
+ // cope with racing splits by refreshing pg_num
+ if (i == num_pg - 1)
+- num_pg = _get_pg_num(cluster, pool_name);
++ num_pg = _get_pg_num(cluster, cache_pool_name);
+ }
+
+- for (int i=0; i<1000; ++i) {
++ for (int i=0; i<num; ++i) {
+ string n = stringify(i);
+- uint32_t hash = ioctx.get_object_hash_position(n);
++ uint32_t hash = cache_ioctx.get_object_hash_position(n);
+ hobject_t oid(sobject_t(n, CEPH_NOSNAP), "", hash,
+- cluster.pool_lookup(pool_name.c_str()), "");
++ cluster.pool_lookup(cache_pool_name.c_str()), "");
+ std::cout << "checking for " << oid << std::endl;
+ bool found = false;
+ for (int p=0; p<num_pg; ++p) {
+ if (hitsets[p].contains(oid)) {
+@@ -2100,45 +2009,52 @@
+ ASSERT_TRUE(found);
+ }
+ }
+
+-TEST_F(LibRadosTierPP, HitSetTrim) {
++TEST_F(LibRadosTwoPoolsPP, HitSetTrim) {
+ unsigned count = 3;
+ unsigned period = 3;
+
+- // enable hitset tracking for this pool
++ // make it a tier
+ bufferlist inbl;
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", count),
++ ASSERT_EQ(0, cluster.mon_command(
++ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
++ "\", \"tierpool\": \"" + cache_pool_name +
++ "\", \"force_nonempty\": \"--force-nonempty\" }",
++ inbl, NULL, NULL));
++
++ // enable hitset tracking for this pool
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", count),
+ inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", period),
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", period),
+ inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type", "bloom"),
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
+ inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_fpp", ".01"),
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_fpp", ".01"),
+ inbl, NULL, NULL));
+
+ // wait for maps to settle
+ cluster.wait_for_latest_osdmap();
+
+- ioctx.set_namespace("");
++ cache_ioctx.set_namespace("");
+
+ // do a bunch of writes and make sure the hitsets rotate
+ utime_t start = ceph_clock_now(NULL);
+ utime_t hard_stop = start + utime_t(count * period * 50, 0);
+
+ time_t first = 0;
+ while (true) {
+ string name = "foo";
+- uint32_t hash = ioctx.get_object_hash_position(name);
++ uint32_t hash = cache_ioctx.get_object_hash_position(name);
+ hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash, -1, "");
+
+ bufferlist bl;
+ bl.append("f");
+- ASSERT_EQ(0, ioctx.write("foo", bl, 1, 0));
++ ASSERT_EQ(0, cache_ioctx.write("foo", bl, 1, 0));
+
+ list<pair<time_t, time_t> > ls;
+ AioCompletion *c = librados::Rados::aio_create_completion();
+- ASSERT_EQ(0, ioctx.hit_set_list(hash, c, &ls));
++ ASSERT_EQ(0, cache_ioctx.hit_set_list(hash, c, &ls));
+ c->wait_for_complete();
+ c->release();
+
+ ASSERT_TRUE(ls.size() <= count + 1);
+@@ -2186,11 +2102,31 @@
+ cache_ioctx.set_namespace(ns);
+ }
+ virtual void TearDown() {
+ RadosTestECPP::TearDown();
++
++ // flush + evict cache
++ flush_evict_all(cluster, cache_ioctx);
++
++ bufferlist inbl;
++ // tear down tiers
++ ASSERT_EQ(0, cluster.mon_command(
++ "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
++ "\"}",
++ inbl, NULL, NULL));
++ ASSERT_EQ(0, cluster.mon_command(
++ "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
++ "\", \"tierpool\": \"" + cache_pool_name + "\"}",
++ inbl, NULL, NULL));
++
++ // wait for maps to settle before next test
++ cluster.wait_for_latest_osdmap();
++
+ cleanup_default_namespace(cache_ioctx);
++
+ cache_ioctx.close();
+ }
++
+ librados::IoCtx cache_ioctx;
+ };
+
+ std::string LibRadosTwoPoolsECPP::cache_pool_name;
+@@ -2307,21 +2243,8 @@
+ ASSERT_EQ(0, completion->get_return_value());
+ completion->release();
+ ASSERT_EQ('b', bl[0]);
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsECPP, Promote) {
+ // create object
+@@ -2374,21 +2297,8 @@
+ ASSERT_TRUE(it->first == string("foo") || it->first == string("bar"));
+ ++it;
+ ASSERT_TRUE(it == cache_ioctx.objects_end());
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsECPP, PromoteSnap) {
+ // create object
+@@ -2551,21 +2461,8 @@
+ {
+ bufferlist bl;
+ ASSERT_EQ(-ENOENT, ioctx.read("baz", bl, 1, 0));
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsECPP, PromoteSnapTrimRace) {
+ // create object
+@@ -2618,21 +2515,8 @@
+ {
+ bufferlist bl;
+ ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsECPP, Whiteout) {
+ // create object
+@@ -2694,21 +2578,8 @@
+ bufferlist bl;
+ ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+ ASSERT_EQ('h', bl[0]);
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsECPP, Evict) {
+ // create object
+@@ -2797,21 +2668,8 @@
+ completion->wait_for_safe();
+ ASSERT_EQ(-EBUSY, completion->get_return_value());
+ completion->release();
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsECPP, EvictSnap) {
+ // create object
+@@ -3045,21 +2903,8 @@
+ completion->wait_for_safe();
+ ASSERT_EQ(0, completion->get_return_value());
+ completion->release();
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsECPP, TryFlush) {
+ // configure cache
+@@ -3166,21 +3011,8 @@
+ {
+ ObjectIterator it = cache_ioctx.objects_begin();
+ ASSERT_TRUE(it == cache_ioctx.objects_end());
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsECPP, Flush) {
+ // configure cache
+@@ -3339,21 +3171,8 @@
+ {
+ ObjectIterator it = ioctx.objects_begin();
+ ASSERT_TRUE(it == ioctx.objects_end());
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsECPP, FlushSnap) {
+ // configure cache
+@@ -3516,12 +3335,8 @@
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+ "\"}",
+ inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+
+ // wait for maps to settle
+ cluster.wait_for_latest_osdmap();
+
+@@ -3543,8 +3358,13 @@
+ bufferlist bl;
+ ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+ ASSERT_EQ('a', bl[0]);
+ }
++
++ ASSERT_EQ(0, cluster.mon_command(
++ "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
++ "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
++ inbl, NULL, NULL));
+ }
+
+ TEST_F(LibRadosTierECPP, FlushWriteRaces) {
+ Rados cluster;
+@@ -3827,21 +3647,8 @@
+ ASSERT_EQ(0, completion2->get_return_value());
+ completion->release();
+ completion2->release();
+ }
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTwoPoolsECPP, TryFlushReadRace) {
+ // configure cache
+@@ -3902,21 +3709,8 @@
+ max_reads = 0;
+ while (num_reads > 0)
+ cond.Wait(test_lock);
+ test_lock.Unlock();
+-
+- // tear down tiers
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+- "\"}",
+- inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(
+- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+- inbl, NULL, NULL));
+-
+- // wait for maps to settle before next test
+- cluster.wait_for_latest_osdmap();
+ }
+
+ TEST_F(LibRadosTierECPP, HitSetNone) {
+ {
+@@ -3937,23 +3731,30 @@
+ c->release();
+ }
+ }
+
+-TEST_F(LibRadosTierECPP, HitSetRead) {
+- // enable hitset tracking for this pool
++TEST_F(LibRadosTwoPoolsECPP, HitSetRead) {
++ // make it a tier
+ bufferlist inbl;
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 2),
++ ASSERT_EQ(0, cluster.mon_command(
++ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
++ "\", \"tierpool\": \"" + cache_pool_name +
++ "\", \"force_nonempty\": \"--force-nonempty\" }",
++ inbl, NULL, NULL));
++
++ // enable hitset tracking for this pool
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 2),
+ inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
+ inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
+ "explicit_object"),
+ inbl, NULL, NULL));
+
+ // wait for maps to settle
+ cluster.wait_for_latest_osdmap();
+
+- ioctx.set_namespace("");
++ cache_ioctx.set_namespace("");
+
+ // keep reading until we see our object appear in the HitSet
+ utime_t start = ceph_clock_now(NULL);
+ utime_t hard_stop = start + utime_t(600, 0);
+@@ -3962,18 +3763,18 @@
+ utime_t now = ceph_clock_now(NULL);
+ ASSERT_TRUE(now < hard_stop);
+
+ string name = "foo";
+- uint32_t hash = ioctx.get_object_hash_position(name);
++ uint32_t hash = cache_ioctx.get_object_hash_position(name);
+ hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash,
+- cluster.pool_lookup(pool_name.c_str()), "");
++ cluster.pool_lookup(cache_pool_name.c_str()), "");
+
+ bufferlist bl;
+- ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
++ ASSERT_EQ(-ENOENT, cache_ioctx.read("foo", bl, 1, 0));
+
+ bufferlist hbl;
+ AioCompletion *c = librados::Rados::aio_create_completion();
+- ASSERT_EQ(0, ioctx.hit_set_get(hash, c, now.sec(), &hbl));
++ ASSERT_EQ(0, cache_ioctx.hit_set_get(hash, c, now.sec(), &hbl));
+ c->wait_for_complete();
+ c->release();
+
+ if (hbl.length()) {
+@@ -4068,27 +3869,34 @@
+ }
+ }
+ #endif
+
+-TEST_F(LibRadosTierECPP, HitSetTrim) {
++TEST_F(LibRadosTwoPoolsECPP, HitSetTrim) {
+ unsigned count = 3;
+ unsigned period = 3;
+
+- // enable hitset tracking for this pool
++ // make it a tier
+ bufferlist inbl;
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", count),
++ ASSERT_EQ(0, cluster.mon_command(
++ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
++ "\", \"tierpool\": \"" + cache_pool_name +
++ "\", \"force_nonempty\": \"--force-nonempty\" }",
++ inbl, NULL, NULL));
++
++ // enable hitset tracking for this pool
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", count),
+ inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", period),
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", period),
+ inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type", "bloom"),
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
+ inbl, NULL, NULL));
+- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_fpp", ".01"),
++ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_fpp", ".01"),
+ inbl, NULL, NULL));
+
+ // wait for maps to settle
+ cluster.wait_for_latest_osdmap();
+
+- ioctx.set_namespace("");
++ cache_ioctx.set_namespace("");
+
+ // do a bunch of writes and make sure the hitsets rotate
+ utime_t start = ceph_clock_now(NULL);
+ utime_t hard_stop = start + utime_t(count * period * 50, 0);
+@@ -4099,18 +3907,18 @@
+ memset(buf, 'f', bsize);
+
+ while (true) {
+ string name = "foo";
+- uint32_t hash = ioctx.get_object_hash_position(name);
++ uint32_t hash = cache_ioctx.get_object_hash_position(name);
+ hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash, -1, "");
+
+ bufferlist bl;
+ bl.append(buf, bsize);
+- ASSERT_EQ(0, ioctx.append("foo", bl, bsize));
++ ASSERT_EQ(0, cache_ioctx.append("foo", bl, bsize));
+
+ list<pair<time_t, time_t> > ls;
+ AioCompletion *c = librados::Rados::aio_create_completion();
+- ASSERT_EQ(0, ioctx.hit_set_list(hash, c, &ls));
++ ASSERT_EQ(0, cache_ioctx.hit_set_list(hash, c, &ls));
+ c->wait_for_complete();
+ c->release();
+
+ ASSERT_TRUE(ls.size() <= count + 1);
+--- a/src/test/objectstore/store_test.cc
++++ b/src/test/objectstore/store_test.cc
+@@ -1114,8 +1114,113 @@
+ ASSERT_EQ(1u, newomap.size());
+ ASSERT_TRUE(newomap.count("omap_key"));
+ ASSERT_TRUE(newomap["omap_key"].contents_equal(omap["omap_key"]));
+ }
++ {
++ ObjectStore::Transaction t;
++ t.remove(cid, oid);
++ t.remove_collection(cid);
++ t.remove_collection(temp_cid);
++ r = store->apply_transaction(t);
++ ASSERT_EQ(r, 0);
++ }
++}
++
++TEST_P(StoreTest, BigRGWObjectName) {
++ store->set_allow_sharded_objects();
++ store->sync_and_flush();
++ coll_t temp_cid("mytemp");
++ hobject_t temp_oid("tmp_oid", "", CEPH_NOSNAP, 0, 0, "");
++ coll_t cid("dest");
++ ghobject_t oid(
++ hobject_t(
++ "default.4106.50_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa [...]
++ "",
++ CEPH_NOSNAP,
++ 0x81920472,
++ 3,
++ ""),
++ 15,
++ shard_id_t(1));
++ ghobject_t oid2(oid);
++ oid2.generation = 17;
++ ghobject_t oidhead(oid);
++ oidhead.generation = ghobject_t::NO_GEN;
++
++ int r;
++ {
++ ObjectStore::Transaction t;
++ t.create_collection(cid);
++ t.touch(cid, oidhead);
++ t.collection_move_rename(cid, oidhead, cid, oid);
++ t.touch(cid, oidhead);
++ t.collection_move_rename(cid, oidhead, cid, oid2);
++ r = store->apply_transaction(t);
++ ASSERT_EQ(r, 0);
++ }
++
++ {
++ ObjectStore::Transaction t;
++ t.remove(cid, oid);
++ r = store->apply_transaction(t);
++ ASSERT_EQ(r, 0);
++ }
++
++ {
++ vector<ghobject_t> objects;
++ r = store->collection_list(cid, objects);
++ ASSERT_EQ(r, 0);
++ ASSERT_EQ(objects.size(), 1u);
++ ASSERT_EQ(objects[0], oid2);
++ }
++
++ ASSERT_FALSE(store->exists(cid, oid));
++
++ {
++ ObjectStore::Transaction t;
++ t.remove(cid, oid2);
++ t.remove_collection(cid);
++ r = store->apply_transaction(t);
++ ASSERT_EQ(r, 0);
++
++ }
++}
++
++TEST_P(StoreTest, SetAllocHint) {
++ coll_t cid("alloc_hint");
++ ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, 0, ""));
++ int r;
++ {
++ ObjectStore::Transaction t;
++ t.create_collection(cid);
++ t.touch(cid, hoid);
++ r = store->apply_transaction(t);
++ ASSERT_EQ(r, 0);
++ }
++ {
++ ObjectStore::Transaction t;
++ t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
++ r = store->apply_transaction(t);
++ ASSERT_EQ(r, 0);
++ }
++ {
++ ObjectStore::Transaction t;
++ t.remove(cid, hoid);
++ r = store->apply_transaction(t);
++ ASSERT_EQ(r, 0);
++ }
++ {
++ ObjectStore::Transaction t;
++ t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
++ r = store->apply_transaction(t);
++ ASSERT_EQ(r, 0);
++ }
++ {
++ ObjectStore::Transaction t;
++ t.remove_collection(cid);
++ r = store->apply_transaction(t);
++ ASSERT_EQ(r, 0);
++ }
+ }
+
+ INSTANTIATE_TEST_CASE_P(
+ ObjectStore,
+--- a/src/test/osd/TestOSDMap.cc
++++ b/src/test/osd/TestOSDMap.cc
+@@ -49,15 +49,26 @@
+ pending_inc.new_uuid[i] = sample_uuid;
+ }
+ osdmap.apply_incremental(pending_inc);
+
+- // kludge to get an erasure coding rule and pool
++ // Create an EC ruleset and a pool using it
+ int r = osdmap.crush->add_simple_ruleset("erasure", "default", "osd",
+ "indep", pg_pool_t::TYPE_ERASURE,
+ &cerr);
+- pg_pool_t *p = (pg_pool_t *)osdmap.get_pg_pool(2);
++
++ OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
++ new_pool_inc.new_pool_max = osdmap.get_pool_max();
++ new_pool_inc.fsid = osdmap.get_fsid();
++ pg_pool_t empty;
++ uint64_t pool_id = ++new_pool_inc.new_pool_max;
++ pg_pool_t *p = new_pool_inc.get_new_pool(pool_id, &empty);
++ p->size = 3;
++ p->set_pg_num(64);
++ p->set_pgp_num(64);
+ p->type = pg_pool_t::TYPE_ERASURE;
+ p->crush_ruleset = r;
++ new_pool_inc.new_pool_names[pool_id] = "ec";
++ osdmap.apply_incremental(new_pool_inc);
+ }
+ unsigned int get_num_osds() { return num_osds; }
+
+ void test_mappings(int pool,
+@@ -85,8 +96,50 @@
+ ASSERT_EQ(get_num_osds(), (unsigned)osdmap.get_max_osd());
+ ASSERT_EQ(get_num_osds(), osdmap.get_num_in_osds());
+ }
+
++TEST_F(OSDMapTest, Features) {
++ // with EC pool
++ set_up_map();
++ uint64_t features = osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
++ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
++ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
++ ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
++ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
++ ASSERT_TRUE(features & CEPH_FEATURE_OSD_ERASURE_CODES);
++ ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
++ ASSERT_FALSE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
++
++ // clients have a slightly different view
++ features = osdmap.get_features(CEPH_ENTITY_TYPE_CLIENT, NULL);
++ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
++ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
++ ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
++ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
++ ASSERT_FALSE(features & CEPH_FEATURE_OSD_ERASURE_CODES); // don't need this
++ ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
++ ASSERT_FALSE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
++
++ // remove the EC pool, but leave the rule. add primary affinity.
++ {
++ OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
++ new_pool_inc.old_pools.insert(osdmap.lookup_pg_pool_name("ec"));
++ new_pool_inc.new_primary_affinity[0] = 0x8000;
++ osdmap.apply_incremental(new_pool_inc);
++ }
++
++ features = osdmap.get_features(CEPH_ENTITY_TYPE_MON, NULL);
++ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
++ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
++ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3); // shared bit with primary affinity
++ ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_V2);
++ ASSERT_FALSE(features & CEPH_FEATURE_OSD_ERASURE_CODES);
++ ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
++ ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
++
++ // FIXME: test tiering feature bits
++}
++
+ TEST_F(OSDMapTest, MapPG) {
+ set_up_map();
+
+ pg_t rawpg(0, 0, -1);
+--- a/src/test/osd/osd-test-helpers.sh
++++ b/src/test/osd/osd-test-helpers.sh
+@@ -36,8 +36,9 @@
+ local ceph_args="$CEPH_ARGS"
+ ceph_args+=" --osd-journal-size=100"
+ ceph_args+=" --osd-data=$osd_data"
+ ceph_args+=" --chdir="
++ ceph_args+=" --osd-pool-default-erasure-code-directory=.libs"
+ ceph_args+=" --run-dir=$dir"
+ ceph_args+=" --debug-osd=20"
+ ceph_args+=" --log-file=$dir/osd-\$id.log"
+ ceph_args+=" --pid-file=$dir/osd-\$id.pidfile"
+--- a/src/test/strtol.cc
++++ b/src/test/strtol.cc
+@@ -13,8 +13,9 @@
+ */
+
+ #include "common/strtol.h"
+ #include <string>
++#include <map>
+
+ #include "gtest/gtest.h"
+
+ static void test_strict_strtoll(const char *str, long long expected)
+@@ -133,4 +134,78 @@
+ test_strict_strtod_err("34.0 garbo");
+
+ test_strict_strtof_err("0.05.0");
+ }
++
++
++static void test_strict_sistrtoll(const char *str)
++{
++ std::string err;
++ strict_sistrtoll(str, &err);
++ ASSERT_EQ(err, "");
++}
++
++static void test_strict_sistrtoll_units(const std::string& foo,
++ char u, const int m)
++{
++ std::string s(foo);
++ s.push_back(u);
++ const char *str = s.c_str();
++ std::string err;
++ uint64_t r = strict_sistrtoll(str, &err);
++ ASSERT_EQ(err, "");
++
++ str = foo.c_str();
++ std::string err2;
++ long long tmp = strict_strtoll(str, 10, &err2);
++ ASSERT_EQ(err2, "");
++ tmp = (tmp << m);
++ ASSERT_EQ(tmp, (long long)r);
++}
++
++TEST(SIStrToLL, WithUnits) {
++ std::map<char,int> units;
++ units['B'] = 0;
++ units['K'] = 10;
++ units['M'] = 20;
++ units['G'] = 30;
++ units['T'] = 40;
++ units['P'] = 50;
++ units['E'] = 60;
++
++ for (std::map<char,int>::iterator p = units.begin();
++ p != units.end(); ++p) {
++ test_strict_sistrtoll_units("1024", p->first, p->second);
++ test_strict_sistrtoll_units("1", p->first, p->second);
++ test_strict_sistrtoll_units("0", p->first, p->second);
++ }
++}
++
++TEST(SIStrToLL, WithoutUnits) {
++ test_strict_sistrtoll("1024");
++ test_strict_sistrtoll("1152921504606846976");
++ test_strict_sistrtoll("0");
++}
++
++static void test_strict_sistrtoll_err(const char *str)
++{
++ std::string err;
++ strict_sistrtoll(str, &err);
++ ASSERT_NE(err, "");
++}
++
++TEST(SIStrToLL, Error) {
++ test_strict_sistrtoll_err("1024F");
++ test_strict_sistrtoll_err("QDDSA");
++ test_strict_sistrtoll_err("1b");
++ test_strict_sistrtoll_err("100k");
++ test_strict_sistrtoll_err("1000m");
++ test_strict_sistrtoll_err("1g");
++ test_strict_sistrtoll_err("20t");
++ test_strict_sistrtoll_err("100p");
++ test_strict_sistrtoll_err("1000e");
++ test_strict_sistrtoll_err("B");
++ test_strict_sistrtoll_err("M");
++ test_strict_sistrtoll_err("BM");
++ test_strict_sistrtoll_err("B0wef");
++ test_strict_sistrtoll_err("0m");
++}
diff --git a/debian/patches/fix-blkdev-BLKGETSIZE-check.patch b/debian/patches/fix-blkdev-BLKGETSIZE-check.patch
deleted file mode 100644
index bef37bb..0000000
--- a/debian/patches/fix-blkdev-BLKGETSIZE-check.patch
+++ /dev/null
@@ -1,35 +0,0 @@
-Last-Update: 2014-08-24
-Forwarded: https://github.com/ceph/ceph/pull/2311
-From: Michael Cree <mcree at orcon.net.nz>
-Bug-Debian: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=756892
-Description: fix FTBFS on alpha due to incorrect check on BLKGETSIZE
- Ceph FTBFS on Alpha with:
-
- ~~~~
- libtool: compile: g++ -DHAVE_CONFIG_H -I. -D__CEPH__ -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -D__STDC_FORMAT_MACROS -D_GNU_SOURCE -DCEPH_LIBDIR=\"/usr/lib/alpha-linux-gnu\" -DCEPH_PKGLIBDIR=\"/usr/lib/alpha-linux-gnu/ceph\" -DGTEST_HAS_TR1_TUPLE=0 -D_FORTIFY_SOURCE=2 -I/usr/include/nss -I/usr/include/nspr -Wall -Wtype-limits -Wignored-qualifiers -Winit-self -Wpointer-arith -Werror=format-security -fno-strict-aliasing -fsigned-char -rdynamic -ftemplate-depth-1024 -Wnon-virtua [...]
- In file included from /usr/include/alpha-linux-gnu/asm/ioctls.h:4:0,
- from /usr/include/alpha-linux-gnu/bits/ioctls.h:23,
- from /usr/include/alpha-linux-gnu/sys/ioctl.h:26,
- from common/blkdev.cc:3:
- common/blkdev.cc:13:7: error: missing binary operator before token "int"
- #elif BLKGETSIZE
- ^
- ~~~~
-
- This error occurs because the value of BLKGETSIZE is tested in a
- c-preprocessor conditional compilation test whereas the test should
- be for existence.
-
---- a/src/common/blkdev.cc
-+++ b/src/common/blkdev.cc
-@@ -9,9 +9,9 @@
- int get_block_device_size(int fd, int64_t *psize)
- {
- #ifdef BLKGETSIZE64
- int ret = ::ioctl(fd, BLKGETSIZE64, psize);
--#elif BLKGETSIZE
-+#elif defined(BLKGETSIZE)
- unsigned long sectors = 0;
- int ret = ::ioctl(fd, BLKGETSIZE, §ors);
- *psize = sectors * 512ULL;
- #else
diff --git a/debian/patches/series b/debian/patches/series
index 24098c2..b46c8ca 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,4 +1,5 @@
## Backported / Upstream
+firefly-latest.patch
bug-8342.patch
bug-8624a.patch
bug-8624b.patch
@@ -9,7 +10,6 @@ client-sleep3.patch
sleep-recover.patch
backfill-prio.patch
bash-completion.patch
-ceph-ao-require-cas.patch
rbdmap1-mount.patch
rbdmap2-hooks.patch
@@ -19,4 +19,3 @@ arch.patch
modules.patch
sample.ceph.conf.patch
virtualenv-never-download.patch
-fix-blkdev-BLKGETSIZE-check.patch
diff --git a/debian/patches/sleep-recover.patch b/debian/patches/sleep-recover.patch
index 23c42ce..bf89722 100644
--- a/debian/patches/sleep-recover.patch
+++ b/debian/patches/sleep-recover.patch
@@ -6,7 +6,8 @@ Description: fix fuse-client hang after wake-up from suspend.
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
-@@ -9015,6 +9015,7 @@ void Client::ms_handle_remote_reset(Conn
+@@ -9048,8 +9048,9 @@
+
case MetaSession::STATE_OPEN:
ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
s->state = MetaSession::STATE_STALE;
@@ -14,3 +15,4 @@ Description: fix fuse-client hang after wake-up from suspend.
break;
case MetaSession::STATE_NEW:
+ case MetaSession::STATE_CLOSED:
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git
More information about the Pkg-ceph-commits
mailing list