[Pkg-ceph-commits] [ceph] 01/01: New upstream release [0.80.6]
Dmitry Smirnov
onlyjob at moszumanska.debian.org
Thu Oct 2 16:07:46 UTC 2014
This is an automated email from the git hooks/post-receive script.
onlyjob pushed a commit to branch master
in repository ceph.
commit 7aac2f4 (HEAD, master)
Author: Dmitry Smirnov <onlyjob at member.fsf.org>
Date: Thu Oct 2 14:00:36 2014
New upstream release [0.80.6]
---
debian/changelog | 7 +
debian/copyright | 19 +-
debian/patches/backfill-prio.patch | 10 +-
debian/patches/bug-8821.patch | 2 +-
debian/patches/firefly-latest.patch | 11188 +---------------------------------
5 files changed, 43 insertions(+), 11183 deletions(-)
diff --git a/debian/changelog b/debian/changelog
index c25f4db..37b801f 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,10 @@
+ceph (0.80.6-1) unstable; urgency=medium
+
+ * New upstream release [October 2014].
+ * Standards-Version: 3.9.6.
+
+ -- Dmitry Smirnov <onlyjob at debian.org> Thu, 02 Oct 2014 23:07:04 +1000
+
ceph (0.80.5-2) unstable; urgency=low
* Patchworks:
diff --git a/debian/copyright b/debian/copyright
index 462177b..da07f5f 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -8,6 +8,7 @@ Copyright: 2004-2013 Sage Weil <sage at newdream.net>
2004-2014 Inktank <info at inktank.com>
Inktank, Inc
Inktank Storage, Inc.
+ 2012-2014 Red Hat <contact at redhat.com>
2013-2014 Cloudwatt <libre.licensing at cloudwatt.com>
2004-2011 Dreamhost
2013 eNovance SAS <licensing at enovance.com>
@@ -16,12 +17,11 @@ Copyright: 2004-2013 Sage Weil <sage at newdream.net>
2014 John Spray <john.spray at inktank.com
2004-2012 New Dream Network
2011 Stanislav Sedov <stas at FreeBSD.org>
- 2013 UnitedStack <haomai at unitedstack.com>
+ 2013-2014 UnitedStack <haomai at unitedstack.com>
2011 Wido den Hollander <wido at widodh.nl>
License: LGPL-2.1
-Files: src/erasure-code/jerasure/vectorop.h
- src/erasure-code/jerasure/ErasureCode*
+Files: src/erasure-code/jerasure/ErasureCode*
src/erasure-code/ErasureCode*
src/include/str_map.h
src/test/common/test_str_map.cc
@@ -36,7 +36,7 @@ Files: src/mount/canonicalize.c
src/test/common/test_config.cc
src/test/crush/TestCrushWrapper.cc
src/test/common/Throttle.cc
- src/test/filestore/chain_xattr.cc
+ src/test/objectstore/chain_xattr.cc
src/test/mon/mon-test-helpers.sh
src/test/objectstore/chain_xattr.cc
src/test/osd/osd-test-helpers.sh
@@ -53,7 +53,7 @@ Copyright: 2007 Oracle. All rights reserved.
2014 Inktank <info at inktank.com>
License: GPL-2
-Files: src/include/ceph_hash.cc
+Files: src/common/ceph_hash.cc
Copyright: 1995-1997 Robert J. Jenkins Jr.
License: public-domain
This file uses Robert Jenkin's hash function as detailed at:
@@ -177,7 +177,7 @@ License: BSD-3-clause
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
THE POSSIBILITY OF SUCH DAMAGE.
-Files: src/ceph/ceph-0.79/src/erasure-code/jerasure/gf-complete/*
+Files: src/erasure-code/jerasure/gf-complete/*/*
Copyright: 2013 James S. Plank
Ethan L. Miller
Kevin M. Greenan
@@ -217,12 +217,7 @@ License: BSD-3-clause
Comment:
https://bitbucket.org/jimplank/gf-complete
-Files: src/erasure-code/jerasure/cauchy.*
- src/erasure-code/jerasure/galois.*
- src/erasure-code/jerasure/jerasure.*
- src/erasure-code/jerasure/liberation.*
- src/erasure-code/jerasure/reed_sol.*
- src/erasure-code/jerasure/jerasure/*
+Files: src/erasure-code/jerasure/jerasure/*/*
Copyright: 2011-2013 James S. Plank <plank at cs.utk.edu>
2013 Kevin Greenan
License: BSD-3-clause
diff --git a/debian/patches/backfill-prio.patch b/debian/patches/backfill-prio.patch
index ae3669e..c163aeb 100644
--- a/debian/patches/backfill-prio.patch
+++ b/debian/patches/backfill-prio.patch
@@ -151,7 +151,7 @@ Date: Tue Jun 24 02:09:49 2014
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
-@@ -1873,8 +1873,28 @@
+@@ -1885,8 +1885,28 @@
dirty_info = true;
}
@@ -180,7 +180,7 @@ Date: Tue Jun 24 02:09:49 2014
{
dout(10) << "finish_recovery" << dendl;
assert(info.last_complete == info.last_update);
-@@ -5839,15 +5859,14 @@
+@@ -5852,15 +5872,14 @@
ConnectionRef con = pg->osd->get_con_osd_cluster(
backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
if (con) {
@@ -198,7 +198,7 @@ Date: Tue Jun 24 02:09:49 2014
} else {
post_event(RemoteBackfillReserved());
}
-@@ -5914,10 +5933,10 @@
+@@ -5927,10 +5946,10 @@
pg->osd->local_reserver.request_reservation(
pg->info.pgid,
new QueuePeeringEvt<LocalBackfillReserved>(
@@ -211,7 +211,7 @@ Date: Tue Jun 24 02:09:49 2014
void PG::RecoveryState::WaitLocalBackfillReserved::exit()
{
-@@ -5982,9 +6001,10 @@
+@@ -5995,9 +6014,10 @@
pg->osd->remote_reserver.request_reservation(
pg->info.pgid,
new QueuePeeringEvt<RemoteRecoveryReserved>(
@@ -223,7 +223,7 @@ Date: Tue Jun 24 02:09:49 2014
boost::statechart::result
PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
-@@ -6123,9 +6143,10 @@
+@@ -6136,9 +6156,10 @@
pg->osd->local_reserver.request_reservation(
pg->info.pgid,
new QueuePeeringEvt<LocalRecoveryReserved>(
diff --git a/debian/patches/bug-8821.patch b/debian/patches/bug-8821.patch
index fe8b99c..d2abce2 100644
--- a/debian/patches/bug-8821.patch
+++ b/debian/patches/bug-8821.patch
@@ -30,7 +30,7 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
-@@ -748,10 +748,10 @@
+@@ -749,10 +749,10 @@
* affected by rbd_default_order.
*/
OPTION(rbd_default_format, OPT_INT, 1)
diff --git a/debian/patches/firefly-latest.patch b/debian/patches/firefly-latest.patch
index 8414fc0..b042b23 100644
--- a/debian/patches/firefly-latest.patch
+++ b/debian/patches/firefly-latest.patch
@@ -1,11172 +1,30 @@
-Last-Update: 2014-09-16
+Last-Update: 2014-10-02
Forwarded: not-needed
Origin: upstream
Author: Dmitry Smirnov <onlyjob at member.fsf.org>
-Description: fixes from "firefly" branch since 0.80.5 release
+Description: fixes from "firefly" branch since 0.80.6 release
---- a/configure.ac
-+++ b/configure.ac
-@@ -471,11 +471,16 @@
- [AC_MSG_FAILURE(
- [no libatomic-ops found (use --without-libatomic-ops to disable)])
- ])])
- AS_IF([test "$HAVE_ATOMIC_OPS" = "1"],
-- [],
-+ [
-+ AC_CHECK_SIZEOF(AO_t, [], [
-+ #include <atomic_ops.h>
-+ ])
-+ ],
- [AC_DEFINE([NO_ATOMIC_OPS], [1], [Defined if you do not have atomic_ops])])
-
-+
- AM_CONDITIONAL(WITH_LIBATOMIC, [test "$HAVE_ATOMIC_OPS" = "1"])
-
- # newsyn? requires mpi.
- #AC_ARG_WITH([newsyn],
---- /dev/null
-+++ b/doc/_templates/layout.html
-@@ -0,0 +1,5 @@
-+{% extends "!layout.html" %}
-+
-+{%- block extrahead %}
-+ <script type="text/javascript" src="http://ayni.ceph.com/public/js/ceph.js"></script>
-+{% endblock %}
---- a/src/ceph-disk
-+++ b/src/ceph-disk
-@@ -118,8 +118,11 @@
- STATEDIR = '/var/lib/ceph'
-
- SYSCONFDIR = '/etc/ceph'
-
-+# only warn once about some things
-+warned_about = {}
-+
- # Nuke the TERM variable to avoid confusing any subprocesses we call.
- # For example, libreadline will print weird control sequences for some
- # TERM values.
- if 'TERM' in os.environ:
-@@ -130,10 +133,8 @@
- LOG_NAME = os.path.basename(sys.argv[0])
- LOG = logging.getLogger(LOG_NAME)
-
-
--
--
- ###### lock ########
-
- class filelock(object):
- def __init__(self, fn):
-@@ -149,10 +150,12 @@
- assert self.fd
- fcntl.lockf(self.fd, fcntl.LOCK_UN)
- self.fd = None
-
-+
- ###### exceptions ########
-
-+
- class Error(Exception):
- """
- Error
- """
-@@ -160,51 +163,60 @@
- def __str__(self):
- doc = self.__doc__.strip()
- return ': '.join([doc] + [str(a) for a in self.args])
-
-+
- class MountError(Error):
- """
- Mounting filesystem failed
- """
-
-+
- class UnmountError(Error):
- """
- Unmounting filesystem failed
- """
-
-+
- class BadMagicError(Error):
- """
- Does not look like a Ceph OSD, or incompatible version
- """
-
-+
- class TruncatedLineError(Error):
- """
- Line is truncated
- """
-
-+
- class TooManyLinesError(Error):
- """
- Too many lines
- """
-
-+
- class FilesystemTypeError(Error):
- """
- Cannot discover filesystem type
- """
-
-+
- class CephDiskException(Exception):
- """
- A base exception for ceph-disk to provide custom (ad-hoc) messages that
- will be caught and dealt with when main() is executed
- """
- pass
-
-+
- class ExecutableNotFound(CephDiskException):
- """
- Exception to report on executables not available in PATH
- """
- pass
-
-+
- ####### utils
-
-
- def maybe_mkdir(*a, **kw):
-@@ -299,9 +311,9 @@
- of making sure that executables *will* be found and will error nicely
- otherwise.
- """
- arguments = _get_command_executable(arguments)
-- LOG.info('Running command: %s' % ' '.join(arguments))
-+ LOG.info('Running command: %s', ' '.join(arguments))
- return subprocess.check_call(arguments)
-
-
- def platform_distro():
-@@ -339,35 +351,67 @@
- str(codename).strip()
- )
-
-
--# a device "name" is something like
--# sdb
--# cciss!c0d1
- def get_dev_name(path):
- """
-- get device name from path. e.g., /dev/sda -> sdas, /dev/cciss/c0d1 -> cciss!c0d1
-+ get device name from path. e.g.::
-+
-+ /dev/sda -> sdas, /dev/cciss/c0d1 -> cciss!c0d1
-+
-+ a device "name" is something like::
-+
-+ sdb
-+ cciss!c0d1
-+
- """
- assert path.startswith('/dev/')
- base = path[5:]
- return base.replace('/', '!')
-
--# a device "path" is something like
--# /dev/sdb
--# /dev/cciss/c0d1
-+
- def get_dev_path(name):
- """
- get a path (/dev/...) from a name (cciss!c0d1)
-+ a device "path" is something like::
-+
-+ /dev/sdb
-+ /dev/cciss/c0d1
-+
- """
- return '/dev/' + name.replace('!', '/')
-
-+
- def get_dev_relpath(name):
- """
- get a relative path to /dev from a name (cciss!c0d1)
- """
- return name.replace('!', '/')
-
-
-+def get_dev_size(dev, size='megabytes'):
-+ """
-+ Attempt to get the size of a device so that we can prevent errors
-+ from actions to devices that are smaller, and improve error reporting.
-+
-+ Because we want to avoid breakage in case this approach is not robust, we
-+ will issue a warning if we failed to get the size.
-+
-+ :param size: bytes or megabytes
-+ :param dev: the device to calculate the size
-+ """
-+ fd = os.open(dev, os.O_RDONLY)
-+ dividers = {'bytes': 1, 'megabytes': 1024*1024}
-+ try:
-+ device_size = os.lseek(fd, 0, os.SEEK_END)
-+ divider = dividers.get(size, 1024*1024) # default to megabytes
-+ return device_size/divider
-+ except Exception as error:
-+ LOG.warning('failed to get size of %s: %s' % (dev, str(error)))
-+ finally:
-+ os.close(fd)
-+
-+
- def get_partition_dev(dev, pnum):
- """
- get the device name for a partition
-
-@@ -388,8 +432,9 @@
- return get_dev_path(partname)
- else:
- raise Error('partition %d for %s does not appear to exist' % (pnum, dev))
-
-+
- def list_all_partitions():
- """
- Return a list of devices and partitions
- """
-@@ -402,8 +447,9 @@
- continue
- dev_part_list[name] = list_partitions(name)
- return dev_part_list
-
-+
- def list_partitions(basename):
- """
- Return a list of partitions on the given device name
- """
-@@ -412,8 +458,25 @@
- if name.startswith(basename):
- partitions.append(name)
- return partitions
-
-+def get_partition_base(dev):
-+ """
-+ Get the base device for a partition
-+ """
-+ dev = os.path.realpath(dev)
-+ if not stat.S_ISBLK(os.lstat(dev).st_mode):
-+ raise Error('not a block device', dev)
-+
-+ name = get_dev_name(dev)
-+ if os.path.exists(os.path.join('/sys/block', name)):
-+ raise Error('not a partition', dev)
-+
-+ # find the base
-+ for basename in os.listdir('/sys/block'):
-+ if os.path.exists(os.path.join('/sys/block', basename, name)):
-+ return '/dev/' + basename
-+ raise Error('no parent device for partition', dev)
-
- def is_partition(dev):
- """
- Check whether a given device path is a partition or a full disk.
-@@ -475,23 +538,23 @@
- base = base[:-1]
- return []
-
-
--def verify_not_in_use(dev):
-+def verify_not_in_use(dev, check_partitions=False):
- """
- Verify if a given device (path) is in use (e.g. mounted or
- in use by device-mapper).
-
- :raises: Error if device is in use.
- """
- assert os.path.exists(dev)
-- if is_partition(dev):
-- if is_mounted(dev):
-- raise Error('Device is mounted', dev)
-- holders = is_held(dev)
-- if holders:
-- raise Error('Device is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders))
-- else:
-+ if is_mounted(dev):
-+ raise Error('Device is mounted', dev)
-+ holders = is_held(dev)
-+ if holders:
-+ raise Error('Device is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders))
-+
-+ if check_partitions and not is_partition(dev):
- basename = get_dev_name(os.path.realpath(dev))
- for partname in list_partitions(basename):
- partition = get_dev_path(partname)
- if is_mounted(partition):
-@@ -535,12 +598,14 @@
-
- try:
- line = must_be_one_line(line)
- except (TruncatedLineError, TooManyLinesError) as e:
-- raise Error('File is corrupt: {path}: {msg}'.format(
-+ raise Error(
-+ 'File is corrupt: {path}: {msg}'.format(
- path=path,
- msg=e,
-- ))
-+ )
-+ )
- return line
-
-
- def write_one_line(parent, name, text):
-@@ -745,9 +810,9 @@
- Maps a device to a dmcrypt device.
-
- :return: Path to the dmcrypt device.
- """
-- dev = '/dev/mapper/'+ _uuid
-+ dev = '/dev/mapper/' + _uuid
- args = [
- 'cryptsetup',
- '--key-file',
- keypath,
-@@ -791,8 +856,14 @@
- """
- Mounts a device with given filessystem type and
- mount options to a tempfile path under /var/lib/ceph/tmp.
- """
-+ # sanity check: none of the arguments are None
-+ if dev is None:
-+ raise ValueError('dev may not be None')
-+ if fstype is None:
-+ raise ValueError('fstype may not be None')
-+
- # pick best-of-breed mount options based on fs type
- if options is None:
- options = MOUNT_OPTIONS.get(fstype, '')
-
-@@ -966,8 +1037,17 @@
- size=journal_size,
- )
- LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
-
-+ dev_size = get_dev_size(journal)
-+
-+ if journal_size > dev_size:
-+ LOG.error('refusing to create journal on %s' % journal)
-+ LOG.error('journal size (%sM) is bigger than device (%sM)' % (journal_size, dev_size))
-+ raise Error(
-+ '%s device size (%sM) is not big enough for journal' % (journal, dev_size)
-+ )
-+
- try:
- LOG.debug('Creating journal partition num %d size %d on %s', num, journal_size, journal)
- command_check_call(
- [
-@@ -1043,9 +1123,9 @@
- journal):
-
- if not os.path.exists(journal):
- LOG.debug('Creating journal file %s with size 0 (ceph-osd will resize and allocate)', journal)
-- with file(journal, 'wb') as journal_file:
-+ with file(journal, 'wb') as journal_file: # noqa
- pass
-
- LOG.debug('Journal is file %s', journal)
- LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
-@@ -1109,15 +1189,16 @@
- os.symlink(target, path)
- except:
- raise Error('unable to create symlink %s -> %s' % (path, target))
-
-+
- def prepare_dir(
- path,
- journal,
- cluster_uuid,
- osd_uuid,
- journal_uuid,
-- journal_dmcrypt = None,
-+ journal_dmcrypt=None,
- ):
-
- if os.path.exists(os.path.join(path, 'magic')):
- LOG.debug('Data dir %s already exists', path)
-@@ -1182,11 +1263,8 @@
- if is_partition(data):
- LOG.debug('OSD data device %s is a partition', data)
- rawdev = data
- else:
-- if journal_dmcrypt is not None:
-- dmcrypt_unmap(journal)
--
- LOG.debug('Creating osd partition on %s', data)
- try:
- command_check_call(
- [
-@@ -1237,11 +1315,11 @@
- args.extend(['-f']) # always force
- else:
- args.extend(MKFS_ARGS.get(fstype, []))
- args.extend([
-- '--',
-- dev,
-- ])
-+ '--',
-+ dev,
-+ ])
- try:
- LOG.debug('Creating %s fs on %s', fstype, dev)
- command_check_call(args)
- except subprocess.CalledProcessError as e:
-@@ -1266,10 +1344,8 @@
- unmount(path)
- finally:
- if rawdev != dev:
- dmcrypt_unmap(osd_uuid)
-- if journal_dmcrypt is not None:
-- dmcrypt_unmap(journal)
-
- if not is_partition(data):
- try:
- command_check_call(
-@@ -1288,9 +1364,9 @@
- journal_dm_keypath = None
- osd_dm_keypath = None
-
- try:
-- prepare_lock.acquire()
-+ prepare_lock.acquire() # noqa
- if not os.path.exists(args.data):
- if args.data_dev:
- raise Error('data path does not exist', args.data)
- else:
-@@ -1298,14 +1374,14 @@
-
- # in use?
- dmode = os.stat(args.data).st_mode
- if stat.S_ISBLK(dmode):
-- verify_not_in_use(args.data)
-+ verify_not_in_use(args.data, True)
-
- if args.journal and os.path.exists(args.journal):
- jmode = os.stat(args.journal).st_mode
- if stat.S_ISBLK(jmode):
-- verify_not_in_use(args.journal)
-+ verify_not_in_use(args.journal, False)
-
- if args.zap_disk is not None:
- if stat.S_ISBLK(dmode) and not is_partition(args.data):
- zap(args.data)
-@@ -1420,9 +1496,9 @@
- osd_dm_keypath=osd_dm_keypath,
- )
- else:
- raise Error('not a dir or block device', args.data)
-- prepare_lock.release()
-+ prepare_lock.release() # noqa
-
- if stat.S_ISBLK(dmode):
- # try to make sure the kernel refreshes the table. note
- # that if this gets ebusy, we are probably racing with
-@@ -1456,9 +1532,9 @@
- if journal_dm_keypath:
- os.unlink(journal_dm_keypath)
- if osd_dm_keypath:
- os.unlink(osd_dm_keypath)
-- prepare_lock.release()
-+ prepare_lock.release() # noqa
- raise e
-
-
- ###########################
-@@ -1622,20 +1698,23 @@
- command_check_call(
- [
- svc,
- 'ceph',
-+ '--cluster',
-+ '{cluster}'.format(cluster=cluster),
- 'start',
- 'osd.{osd_id}'.format(osd_id=osd_id),
- ],
- )
- else:
- raise Error('{cluster} osd.{osd_id} is not tagged with an init system'.format(
-- cluster=cluster,
-- osd_id=osd_id,
-- ))
-+ cluster=cluster,
-+ osd_id=osd_id,
-+ ))
- except subprocess.CalledProcessError as e:
- raise Error('ceph osd start failed', e)
-
-+
- def detect_fstype(
- dev,
- ):
- fstype = _check_output(
-@@ -1703,10 +1782,10 @@
- other = False
- src_dev = os.stat(path).st_dev
- try:
- dst_dev = os.stat((STATEDIR + '/osd/{cluster}-{osd_id}').format(
-- cluster=cluster,
-- osd_id=osd_id)).st_dev
-+ cluster=cluster,
-+ osd_id=osd_id)).st_dev
- if src_dev == dst_dev:
- active = True
- else:
- parent_dev = os.stat(STATEDIR + '/osd').st_dev
-@@ -1759,9 +1838,9 @@
- )
-
- (osd_id, cluster) = activate(path, activate_key_template, init)
-
-- if init not in ( None, 'none' ):
-+ if init not in (None, 'none' ):
- canonical = (STATEDIR + '/osd/{cluster}-{osd_id}').format(
- cluster=cluster,
- osd_id=osd_id)
- if path != canonical:
-@@ -1814,8 +1893,9 @@
- LOG.warning('No fsid defined in ' + SYSCONFDIR + '/ceph.conf; using anyway')
- return 'ceph'
- return None
-
-+
- def activate(
- path,
- activate_key_template,
- init,
-@@ -1860,9 +1940,9 @@
- fsid=fsid,
- keyring=keyring,
- )
-
-- if init not in ( None, 'none' ):
-+ if init not in (None, 'none' ):
- if init == 'auto':
- conf_val = get_conf(
- cluster=cluster,
- variable='init'
-@@ -1911,9 +1991,9 @@
- if is_suppressed(args.path):
- LOG.info('suppressed activate request on %s', args.path)
- return
-
-- activate_lock.acquire()
-+ activate_lock.acquire() # noqa
- try:
- mode = os.stat(args.path).st_mode
- if stat.S_ISBLK(mode):
- (cluster, osd_id) = mount_activate(
-@@ -1931,9 +2011,9 @@
-
- if args.mark_init == 'none':
- command_check_call(
- [
-- 'ceph-osd',
-+ 'ceph-osd',
- '--cluster={cluster}'.format(cluster=cluster),
- '--id={osd_id}'.format(osd_id=osd_id),
- '--osd-data={path}'.format(path=args.path),
- '--osd-journal={path}/journal'.format(path=args.path),
-@@ -1942,17 +2022,17 @@
-
- else:
- raise Error('%s is not a directory or block device' % args.path)
-
-- if args.mark_init not in ( None, 'none' ):
-+ if args.mark_init not in (None, 'none' ):
-
- start_daemon(
- cluster=cluster,
- osd_id=osd_id,
- )
-
- finally:
-- activate_lock.release()
-+ activate_lock.release() # noqa
-
-
- ###########################
-
-@@ -1983,16 +2063,17 @@
- value = str(out).split('\n', 1)[0]
- LOG.debug('Journal %s has OSD UUID %s', path, value)
- return value
-
-+
- def main_activate_journal(args):
- if not os.path.exists(args.dev):
- raise Error('%s does not exist' % args.dev)
-
- cluster = None
- osd_id = None
- osd_uuid = None
-- activate_lock.acquire()
-+ activate_lock.acquire() # noqa
- try:
- osd_uuid = get_journal_osd_uuid(args.dev)
- path = os.path.join('/dev/disk/by-partuuid/', osd_uuid.lower())
-
-@@ -2007,12 +2088,14 @@
- osd_id=osd_id,
- )
-
- finally:
-- activate_lock.release()
-+ activate_lock.release() # noqa
-+
-
- ###########################
-
-+
- def main_activate_all(args):
- dir = '/dev/disk/by-parttypeuuid'
- LOG.debug('Scanning %s', dir)
- if not os.path.exists(dir):
-@@ -2021,12 +2104,18 @@
- for name in os.listdir(dir):
- if name.find('.') < 0:
- continue
- (tag, uuid) = name.split('.')
-- if tag == OSD_UUID:
-- path = os.path.join(dir, name)
-+
-+ if tag == OSD_UUID or tag == DMCRYPT_OSD_UUID:
-+
-+ if tag == DMCRYPT_OSD_UUID:
-+ path = os.path.join('/dev/mapper', uuid)
-+ else:
-+ path = os.path.join(dir, name)
-+
- LOG.info('Activating %s', path)
-- activate_lock.acquire()
-+ activate_lock.acquire() # noqa
- try:
- (cluster, osd_id) = mount_activate(
- dev=path,
- activate_key_template=args.activate_key_template,
-@@ -2044,9 +2133,9 @@
- )
- err = True
-
- finally:
-- activate_lock.release()
-+ activate_lock.release() # noqa
- if err:
- raise Error('One or more partitions failed to activate')
-
-
-@@ -2065,15 +2154,17 @@
- if swaps_dev == dev:
- return True
- return False
-
-+
- def get_oneliner(base, name):
- path = os.path.join(base, name)
- if os.path.isfile(path):
- with open(path, 'r') as _file:
- return _file.readline().rstrip()
- return None
-
-+
- def get_dev_fs(dev):
- fscheck, _ = command(
- [
- 'blkid',
-@@ -2087,9 +2178,58 @@
- return fstype
- else:
- return None
-
-+
- def get_partition_type(part):
-+ """
-+ Get the GPT partition type UUID. If we have an old blkid and can't
-+ get it that way, use sgdisk and use the description instead (and hope
-+ dmcrypt isn't being used).
-+ """
-+ blkid, _ = command(
-+ [
-+ 'blkid',
-+ '-p',
-+ '-o', 'udev',
-+ part,
-+ ]
-+ )
-+ saw_part_entry = False
-+ for line in blkid.splitlines():
-+ (key, value) = line.split('=')
-+ if key == 'ID_PART_ENTRY_TYPE':
-+ return value
-+ if key == 'ID_PART_ENTRY_SCHEME':
-+ table_type = value
-+ if key.startswith('ID_PART_ENTRY_'):
-+ saw_part_entry = True
-+
-+ # hmm, is it in fact GPT?
-+ table_type = None
-+ base = get_partition_base(part)
-+ blkid, _ = command(
-+ [
-+ 'blkid',
-+ '-p',
-+ '-o', 'udev',
-+ base
-+ ]
-+ )
-+ for line in blkid.splitlines():
-+ (key, value) = line.split('=')
-+ if key == 'ID_PART_TABLE_TYPE':
-+ table_type = value
-+ if table_type != 'gpt':
-+ return None # not even GPT
-+
-+ if saw_part_entry:
-+ return None # GPT, and blkid appears to be new, so we're done.
-+
-+ # bah, fall back to sgdisk.
-+ if 'blkid' not in warned_about:
-+ LOG.warning('Old blkid does not support ID_PART_ENTRY_* fields, trying sgdisk; may not correctly identify ceph volumes with dmcrypt')
-+ warned_about['blkid'] = True
- (base, partnum) = re.match('(\D+)(\d+)', part).group(1, 2)
- sgdisk, _ = command(
- [
- 'sgdisk',
-@@ -2103,11 +2243,18 @@
- if m is not None:
- num = m.group(1)
- if num != partnum:
- continue
-- return m.group(2)
-+ desc = m.group(2)
-+ # assume unencrypted ... blkid has failed us :(
-+ if desc == 'ceph data':
-+ return OSD_UUID
-+ if desc == 'ceph journal':
-+ return JOURNAL_UUID
-+
- return None
-
-+
- def get_partition_uuid(dev):
- (base, partnum) = re.match('(\D+)(\d+)', dev).group(1, 2)
- out, _ = command(['sgdisk', '-i', partnum, base])
- for line in out.splitlines():
-@@ -2115,8 +2262,9 @@
- if m:
- return m.group(1).lower()
- return None
-
-+
- def more_osd_info(path, uuid_map):
- desc = []
- ceph_fsid = get_oneliner(path, 'ceph_fsid')
- if ceph_fsid:
-@@ -2137,46 +2285,71 @@
- desc.append('journal %s' % uuid_map[journal_uuid])
-
- return desc
-
-+def list_dev_osd(dev, uuid_map):
-+ path = is_mounted(dev)
-+ fs_type = get_dev_fs(dev)
-+ desc = []
-+ if path:
-+ desc.append('active')
-+ desc.extend(more_osd_info(path, uuid_map))
-+ elif fs_type:
-+ try:
-+ tpath = mount(dev=dev, fstype=fs_type, options='')
-+ if tpath:
-+ try:
-+ magic = get_oneliner(tpath, 'magic')
-+ if magic is not None:
-+ desc.append('prepared')
-+ desc.extend(more_osd_info(tpath, uuid_map))
-+ finally:
-+ unmount(tpath)
-+ except MountError:
-+ pass
-+ return desc
-
- def list_dev(dev, uuid_map, journal_map):
- ptype = 'unknown'
- prefix = ''
- if is_partition(dev):
- ptype = get_partition_type(dev)
- prefix = ' '
-- fs_type = get_dev_fs(dev)
-- path = is_mounted(dev)
-
- desc = []
-- if ptype == 'ceph data':
-- if path:
-- desc.append('active')
-- desc.extend(more_osd_info(path, uuid_map))
-- elif fs_type:
-- try:
-- tpath = mount(dev=dev, fstype=fs_type, options='')
-- if tpath:
-- try:
-- magic = get_oneliner(tpath, 'magic')
-- if magic is not None:
-- desc.append('prepared')
-- desc.extend(more_osd_info(tpath, uuid_map))
-- finally:
-- unmount(tpath)
-- except MountError:
-- pass
-+ if ptype == OSD_UUID:
-+ desc = list_dev_osd(dev, uuid_map)
- if desc:
- desc = ['ceph data'] + desc
- else:
- desc = ['ceph data', 'unprepared']
-- elif ptype == 'ceph journal':
-+ elif ptype == DMCRYPT_OSD_UUID:
-+ holders = is_held(dev)
-+ if not holders:
-+ desc = ['ceph data (dmcrypt)', 'not currently mapped']
-+ elif len(holders) == 1:
-+ holder = '/dev/' + holders[0]
-+ fs_desc = list_dev_osd(holder, uuid_map)
-+ desc = ['ceph data (dmcrypt %s)' % holder] + fs_desc
-+ else:
-+ desc = ['ceph data (dmcrypt)', 'holders: ' + ','.join(holders)]
-+ elif ptype == JOURNAL_UUID:
- desc.append('ceph journal')
- part_uuid = get_partition_uuid(dev)
- if part_uuid and part_uuid in journal_map:
- desc.append('for %s' % journal_map[part_uuid])
-+ elif ptype == DMCRYPT_JOURNAL_UUID:
-+ holders = is_held(dev)
-+ if len(holders) == 1:
-+ desc = ['ceph journal (dmcrypt /dev/%s)' % holders[0]]
-+ else:
-+ desc = ['ceph journal (dmcrypt)']
-+ part_uuid = get_partition_uuid(dev)
-+ if part_uuid and part_uuid in journal_map:
-+ desc.append('for %s' % journal_map[part_uuid])
- else:
-+ path = is_mounted(dev)
-+ fs_type = get_dev_fs(dev)
- if is_swap(dev):
- desc.append('swap')
- else:
- desc.append('other')
-@@ -2189,9 +2362,8 @@
-
- print '%s%s %s' % (prefix, dev, ', '.join(desc))
-
-
--
- def main_list(args):
- partmap = list_all_partitions()
-
- uuid_map = {}
-@@ -2202,20 +2374,37 @@
- part_uuid = get_partition_uuid(dev)
- if part_uuid:
- uuid_map[part_uuid] = dev
- ptype = get_partition_type(dev)
-- if ptype == 'ceph data':
-+ if ptype == OSD_UUID:
- fs_type = get_dev_fs(dev)
-- try:
-- tpath = mount(dev=dev, fstype=fs_type, options='')
-+ if fs_type is not None:
- try:
-- journal_uuid = get_oneliner(tpath, 'journal_uuid')
-- if journal_uuid:
-- journal_map[journal_uuid.lower()] = dev
-- finally:
-- unmount(tpath)
-- except MountError:
-- pass
-+ tpath = mount(dev=dev, fstype=fs_type, options='')
-+ try:
-+ journal_uuid = get_oneliner(tpath, 'journal_uuid')
-+ if journal_uuid:
-+ journal_map[journal_uuid.lower()] = dev
-+ finally:
-+ unmount(tpath)
-+ except MountError:
-+ pass
-+ if ptype == DMCRYPT_OSD_UUID:
-+ holders = is_held(dev)
-+ if len(holders) == 1:
-+ holder = '/dev/' + holders[0]
-+ fs_type = get_dev_fs(holder)
-+ if fs_type is not None:
-+ try:
-+ tpath = mount(dev=holder, fstype=fs_type, options='')
-+ try:
-+ journal_uuid = get_oneliner(tpath, 'journal_uuid')
-+ if journal_uuid:
-+ journal_map[journal_uuid.lower()] = dev
-+ finally:
-+ unmount(tpath)
-+ except MountError:
-+ pass
-
- for base, parts in sorted(partmap.iteritems()):
- if parts:
- print '%s :' % get_dev_path(base)
-@@ -2243,26 +2432,28 @@
- if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(path).st_mode):
- return False
- base = get_dev_name(disk)
- while len(base):
-- if os.path.exists(SUPPRESS_PREFIX + base):
-+ if os.path.exists(SUPPRESS_PREFIX + base): # noqa
- return True
- base = base[:-1]
- except:
- return False
-
-+
- def set_suppress(path):
- disk = os.path.realpath(path)
- if not os.path.exists(disk):
- raise Error('does not exist', path)
- if not stat.S_ISBLK(os.lstat(path).st_mode):
- raise Error('not a block device', path)
- base = get_dev_name(disk)
-
-- with file(SUPPRESS_PREFIX + base, 'w') as f:
-+ with file(SUPPRESS_PREFIX + base, 'w') as f: # noqa
- pass
- LOG.info('set suppress flag on %s', base)
-
-+
- def unset_suppress(path):
- disk = os.path.realpath(path)
- if not os.path.exists(disk):
- raise Error('does not exist', path)
-@@ -2270,9 +2461,9 @@
- raise Error('not a block device', path)
- assert disk.startswith('/dev/')
- base = get_dev_name(disk)
-
-- fn = SUPPRESS_PREFIX + base
-+ fn = SUPPRESS_PREFIX + base # noqa
- if not os.path.exists(fn):
- raise Error('not marked as suppressed', path)
-
- try:
-@@ -2284,18 +2475,24 @@
-
- def main_suppress(args):
- set_suppress(args.path)
-
-+
- def main_unsuppress(args):
- unset_suppress(args.path)
-
-+
- def main_zap(args):
- for dev in args.dev:
- zap(dev)
-
- ###########################
-
-+
- def setup_statedir(dir):
-+ # XXX The following use of globals makes linting
-+ # really hard. Global state in Python is iffy and
-+ # should be avoided.
- global STATEDIR
- STATEDIR = dir
-
- if not os.path.exists(STATEDIR):
-@@ -2311,12 +2508,14 @@
-
- global SUPPRESS_PREFIX
- SUPPRESS_PREFIX = STATEDIR + '/tmp/suppress-activate.'
-
-+
- def setup_sysconfdir(dir):
- global SYSCONFDIR
- SYSCONFDIR = dir
-
-+
- def parse_args():
- parser = argparse.ArgumentParser(
- 'ceph-disk',
- )
-@@ -2588,4 +2787,5 @@
-
-
- if __name__ == '__main__':
- main()
-+ warned_about = {}
---- a/src/ceph.in
-+++ b/src/ceph.in
-@@ -105,8 +105,16 @@
- for mdsdict in infodict.values():
- l.append(mdsdict['name'])
- return l
-
-+# these args must be passed to all child programs
-+GLOBAL_ARGS = {
-+ 'client_id': '--id',
-+ 'client_name': '--name',
-+ 'cluster': '--cluster',
-+ 'cephconf': '--conf',
-+}
-+
- def parse_cmdargs(args=None, target=''):
- # alias: let the line-wrapping be sane
- AP = argparse.ArgumentParser
-
-@@ -338,17 +346,25 @@
-
- return ret
-
-
--def ceph_conf(field, name):
-+def ceph_conf(parsed_args, field, name):
-+ args=['ceph-conf']
-+
-+ if name:
-+ args.extend(['--name', name])
-+
-+ # add any args in GLOBAL_ARGS
-+ for key, val in GLOBAL_ARGS.iteritems():
-+ # ignore name in favor of argument name, if any
-+ if name and key == 'client_name':
-+ continue
-+ if getattr(parsed_args, key):
-+ args.extend([val, getattr(parsed_args, key)])
-+
-+ args.extend(['--show-config-value', field])
- p = subprocess.Popen(
-- args=[
-- 'ceph-conf',
-- '--show-config-value',
-- field,
-- '-n',
-- name,
-- ],
-+ args,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
- outdata, errdata = p.communicate()
- if (len(errdata)):
-@@ -537,9 +553,10 @@
- sockpath = childargs[1]
- else:
- # try resolve daemon name
- try:
-- sockpath = ceph_conf('admin_socket', childargs[1])
-+ sockpath = ceph_conf(parsed_args, 'admin_socket',
-+ childargs[1])
- except Exception as e:
- print >> sys.stderr, \
- 'Can\'t get admin socket path: ' + str(e)
- return errno.EINVAL
---- a/src/ceph_common.sh
-+++ b/src/ceph_common.sh
-@@ -49,14 +49,15 @@
- get_conf user "" "user"
-
- #echo host for $name is $host, i am $hostname
-
-- if [ -e "/var/lib/ceph/$type/ceph-$id/upstart" ]; then
-+ cluster=$1
-+ if [ -e "/var/lib/ceph/$type/$cluster-$id/upstart" ]; then
- return 1
- fi
-
- # sysvinit managed instance in standard location?
-- if [ -e "/var/lib/ceph/$type/ceph-$id/sysvinit" ]; then
-+ if [ -e "/var/lib/ceph/$type/$cluster-$id/sysvinit" ]; then
- host="$hostname"
- echo "=== $type.$id === "
- return 0
- fi
---- a/src/ceph_mon.cc
-+++ b/src/ceph_mon.cc
-@@ -42,8 +42,10 @@
- #include "global/signal_handler.h"
-
- #include "include/assert.h"
-
-+#include "erasure-code/ErasureCodePlugin.h"
-+
- #define dout_subsys ceph_subsys_mon
-
- Monitor *mon = NULL;
-
-@@ -183,8 +185,23 @@
- cerr << " where the mon store and keyring are located\n";
- generic_server_usage();
- }
-
-+int preload_erasure_code()
-+{
-+ string directory = g_conf->osd_pool_default_erasure_code_directory;
-+ string plugins = g_conf->osd_erasure_code_plugins;
-+ stringstream ss;
-+ int r = ErasureCodePluginRegistry::instance().preload(plugins,
-+ directory,
-+ ss);
-+ if (r)
-+ derr << ss.str() << dendl;
-+ else
-+ dout(10) << ss.str() << dendl;
-+ return r;
-+}
-+
- int main(int argc, const char **argv)
- {
- int err;
-
-@@ -415,8 +432,10 @@
- global_init_postfork_start(g_ceph_context);
- }
- common_init_finish(g_ceph_context);
- global_init_chdir(g_ceph_context);
-+ if (preload_erasure_code() < -1)
-+ prefork.exit(1);
- }
-
- MonitorDBStore *store = new MonitorDBStore(g_conf->mon_data);
-
---- a/src/ceph_osd.cc
-+++ b/src/ceph_osd.cc
-@@ -47,8 +47,10 @@
- #include "perfglue/heap_profiler.h"
-
- #include "include/assert.h"
-
-+#include "erasure-code/ErasureCodePlugin.h"
-+
- #define dout_subsys ceph_subsys_osd
-
- OSD *osd = NULL;
-
-@@ -65,8 +67,23 @@
- derr << " --debug_osd N set debug level (e.g. 10)" << dendl;
- generic_server_usage();
- }
-
-+int preload_erasure_code()
-+{
-+ string directory = g_conf->osd_pool_default_erasure_code_directory;
-+ string plugins = g_conf->osd_erasure_code_plugins;
-+ stringstream ss;
-+ int r = ErasureCodePluginRegistry::instance().preload(plugins,
-+ directory,
-+ ss);
-+ if (r)
-+ derr << ss.str() << dendl;
-+ else
-+ dout(10) << ss.str() << dendl;
-+ return r;
-+}
-+
- int main(int argc, const char **argv)
- {
- vector<const char*> args;
- argv_to_vec(argc, argv, args);
-@@ -450,8 +467,11 @@
- if (mc.build_initial_monmap() < 0)
- return -1;
- global_init_chdir(g_ceph_context);
-
-+ if (preload_erasure_code() < -1)
-+ return -1;
-+
- osd = new OSD(g_ceph_context,
- store,
- whoami,
- ms_cluster,
---- a/src/cls/rgw/cls_rgw.cc
-+++ b/src/cls/rgw/cls_rgw.cc
-@@ -669,9 +669,9 @@
- CLS_LOG(0, "rgw_bucket_complete_op(): entry.name=%s entry.meta.category=%d\n", remove_entry.name.c_str(), remove_entry.meta.category);
- unaccount_entry(header, remove_entry);
-
- if (op.log_op) {
-- rc = log_index_operation(hctx, op.name, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime,
-+ rc = log_index_operation(hctx, remove_oid_name, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime,
- remove_entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker);
- if (rc < 0)
- continue;
- }
---- a/src/common/Finisher.h
-+++ b/src/common/Finisher.h
-@@ -76,8 +76,17 @@
- ls.clear();
- if (logger)
- logger->inc(l_finisher_queue_len);
- }
-+ void queue(list<Context*>& ls) {
-+ finisher_lock.Lock();
-+ finisher_queue.insert(finisher_queue.end(), ls.begin(), ls.end());
-+ finisher_cond.Signal();
-+ finisher_lock.Unlock();
-+ ls.clear();
-+ if (logger)
-+ logger->inc(l_finisher_queue_len);
-+ }
-
- void start();
- void stop();
-
---- a/src/common/LogClient.cc
-+++ b/src/common/LogClient.cc
-@@ -123,8 +123,9 @@
- }
-
- Message *LogClient::_get_mon_log_message()
- {
-+ assert(log_lock.is_locked());
- if (log_queue.empty())
- return NULL;
-
- // only send entries that haven't been sent yet during this mon
-@@ -148,9 +149,9 @@
- << " sending " << num_send << dendl;
- assert(num_unsent <= log_queue.size());
- std::deque<LogEntry>::iterator p = log_queue.begin();
- std::deque<LogEntry> o;
-- while (p->seq < last_log_sent) {
-+ while (p->seq <= last_log_sent) {
- ++p;
- assert(p != log_queue.end());
- }
- while (num_send--) {
---- a/src/common/Makefile.am
-+++ b/src/common/Makefile.am
-@@ -12,8 +12,9 @@
- common/admin_socket.cc \
- common/admin_socket_client.cc \
- common/cmdparse.cc \
- common/escape.c \
-+ common/io_priority.cc \
- common/Clock.cc \
- common/Throttle.cc \
- common/Timer.cc \
- common/Finisher.cc \
-@@ -155,8 +156,9 @@
- common/perf_counters.h \
- common/OutputDataSocket.h \
- common/admin_socket.h \
- common/admin_socket_client.h \
-+ common/random_cache.hpp \
- common/shared_cache.hpp \
- common/tracked_int_ptr.hpp \
- common/simple_cache.hpp \
- common/sharedptr_registry.hpp \
-@@ -174,8 +176,9 @@
- common/TrackedOp.h \
- common/arch.h \
- common/armor.h \
- common/common_init.h \
-+ common/io_priority.h \
- common/pipe.h \
- common/code_environment.h \
- common/signal.h \
- common/simple_spin.h \
---- a/src/common/Thread.cc
-+++ b/src/common/Thread.cc
-@@ -15,8 +15,9 @@
- #include "common/Thread.h"
- #include "common/code_environment.h"
- #include "common/debug.h"
- #include "common/signal.h"
-+#include "common/io_priority.h"
-
- #include <dirent.h>
- #include <errno.h>
- #include <iostream>
-@@ -28,21 +29,38 @@
- #include <sys/types.h>
-
-
- Thread::Thread()
-- : thread_id(0)
-+ : thread_id(0),
-+ pid(0),
-+ ioprio_class(-1),
-+ ioprio_priority(-1)
- {
- }
-
- Thread::~Thread()
- {
- }
-
- void *Thread::_entry_func(void *arg) {
-- void *r = ((Thread*)arg)->entry();
-+ void *r = ((Thread*)arg)->entry_wrapper();
- return r;
- }
-
-+void *Thread::entry_wrapper()
-+{
-+ int p = ceph_gettid(); // may return -ENOSYS on other platforms
-+ if (p > 0)
-+ pid = p;
-+ if (ioprio_class >= 0 &&
-+ ioprio_priority >= 0) {
-+ ceph_ioprio_set(IOPRIO_WHO_PROCESS,
-+ pid,
-+ IOPRIO_PRIO_VALUE(ioprio_class, ioprio_priority));
-+ }
-+ return entry();
-+}
-+
- const pthread_t &Thread::get_thread_id()
- {
- return thread_id;
- }
-@@ -127,4 +145,16 @@
- int Thread::detach()
- {
- return pthread_detach(thread_id);
- }
-+
-+int Thread::set_ioprio(int cls, int prio)
-+{
-+ // fixme, maybe: this can race with create()
-+ ioprio_class = cls;
-+ ioprio_priority = prio;
-+ if (pid && cls >= 0 && prio >= 0)
-+ return ceph_ioprio_set(IOPRIO_WHO_PROCESS,
-+ pid,
-+ IOPRIO_PRIO_VALUE(cls, prio));
-+ return 0;
-+}
---- a/src/common/Thread.h
-+++ b/src/common/Thread.h
-@@ -20,8 +20,12 @@
-
- class Thread {
- private:
- pthread_t thread_id;
-+ pid_t pid;
-+ int ioprio_class, ioprio_priority;
-+
-+ void *entry_wrapper();
-
- public:
- Thread(const Thread& other);
- const Thread& operator=(const Thread& other);
-@@ -43,7 +47,8 @@
- int try_create(size_t stacksize);
- void create(size_t stacksize = 0);
- int join(void **prval = 0);
- int detach();
-+ int set_ioprio(int cls, int prio);
- };
-
- #endif
---- a/src/common/WorkQueue.cc
-+++ b/src/common/WorkQueue.cc
-@@ -15,8 +15,9 @@
- #include <sstream>
-
- #include "include/types.h"
- #include "include/utime.h"
-+#include "common/errno.h"
- #include "WorkQueue.h"
-
- #include "common/config.h"
- #include "common/HeartbeatMap.h"
-@@ -32,8 +33,10 @@
- _lock(lockname.c_str()), // this should be safe due to declaration order
- _stop(false),
- _pause(0),
- _draining(0),
-+ ioprio_class(-1),
-+ ioprio_priority(-1),
- _num_threads(n),
- last_work_queue(0),
- processing(0)
- {
-@@ -155,8 +158,13 @@
- while (_threads.size() < _num_threads) {
- WorkThread *wt = new WorkThread(this);
- ldout(cct, 10) << "start_threads creating and starting " << wt << dendl;
- _threads.insert(wt);
-+
-+ int r = wt->set_ioprio(ioprio_class, ioprio_priority);
-+ if (r < 0)
-+ lderr(cct) << " set_ioprio got " << cpp_strerror(r) << dendl;
-+
- wt->create();
- }
- }
-
-@@ -254,4 +262,17 @@
- _draining--;
- _lock.Unlock();
- }
-
-+void ThreadPool::set_ioprio(int cls, int priority)
-+{
-+ Mutex::Locker l(_lock);
-+ ioprio_class = cls;
-+ ioprio_priority = priority;
-+ for (set<WorkThread*>::iterator p = _threads.begin();
-+ p != _threads.end();
-+ ++p) {
-+ int r = (*p)->set_ioprio(cls, priority);
-+ if (r < 0)
-+ lderr(cct) << " set_ioprio got " << cpp_strerror(r) << dendl;
-+ }
-+}
---- a/src/common/WorkQueue.h
-+++ b/src/common/WorkQueue.h
-@@ -32,8 +32,9 @@
- bool _stop;
- int _pause;
- int _draining;
- Cond _wait_cond;
-+ int ioprio_class, ioprio_priority;
-
- public:
- class TPHandle {
- friend class ThreadPool;
-@@ -387,8 +388,11 @@
- /// resume work in thread pool. must match each pause() call 1:1 to resume.
- void unpause();
- /// wait for all work to complete
- void drain(WorkQueue_* wq = 0);
-+
-+ /// set io priority
-+ void set_ioprio(int cls, int priority);
- };
-
- class GenContextWQ :
- public ThreadPool::WorkQueueVal<GenContext<ThreadPool::TPHandle&>*> {
---- a/src/common/blkdev.cc
-+++ b/src/common/blkdev.cc
-@@ -9,9 +9,9 @@
- int get_block_device_size(int fd, int64_t *psize)
- {
- #ifdef BLKGETSIZE64
- int ret = ::ioctl(fd, BLKGETSIZE64, psize);
--#elif BLKGETSIZE
-+#elif defined(BLKGETSIZE)
- unsigned long sectors = 0;
- int ret = ::ioctl(fd, BLKGETSIZE, &sectors);
- *psize = sectors * 512ULL;
- #else
--- a/src/common/config.cc
+++ b/src/common/config.cc
-@@ -878,17 +878,17 @@
- assert(lock.is_locked());
- switch (opt->type) {
- case OPT_INT: {
- std::string err;
-- int f = strict_strtol(val, 10, &err);
-+ int f = strict_sistrtoll(val, &err);
- if (!err.empty())
- return -EINVAL;
- *(int*)opt->conf_ptr(this) = f;
- return 0;
- }
- case OPT_LONGLONG: {
- std::string err;
-- long long f = strict_strtoll(val, 10, &err);
-+ long long f = strict_sistrtoll(val, &err);
- if (!err.empty())
- return -EINVAL;
- *(long long*)opt->conf_ptr(this) = f;
- return 0;
-@@ -916,17 +916,17 @@
+@@ -946,9 +946,9 @@
+ return -ENOSYS;
+ }
+
+ static const char *CONF_METAVARIABLES[] =
+- { "cluster", "type", "name", "host", "num", "id", "pid" };
++ { "cluster", "type", "name", "host", "num", "id", "pid", "cctid" };
+ static const int NUM_CONF_METAVARIABLES =
+ (sizeof(CONF_METAVARIABLES) / sizeof(CONF_METAVARIABLES[0]));
+
+ void md_config_t::expand_all_meta()
+@@ -1058,8 +1058,10 @@
+ else if (var == "id")
+ out += name.get_id().c_str();
+ else if (var == "pid")
+ out += stringify(getpid());
++ else if (var == "cctid")
++ out += stringify((unsigned long long)this);
+ else
+ assert(0); // unreachable
+ expanded = true;
}
- return 0;
- case OPT_U32: {
- std::string err;
-- int f = strict_strtol(val, 10, &err);
-+ int f = strict_sistrtoll(val, &err);
- if (!err.empty())
- return -EINVAL;
- *(uint32_t*)opt->conf_ptr(this) = f;
- return 0;
- }
- case OPT_U64: {
- std::string err;
-- long long f = strict_strtoll(val, 10, &err);
-+ long long f = strict_sistrtoll(val, &err);
- if (!err.empty())
- return -EINVAL;
- *(uint64_t*)opt->conf_ptr(this) = f;
- return 0;
---- a/src/common/config_opts.h
-+++ b/src/common/config_opts.h
-@@ -176,8 +176,9 @@
- OPTION(mon_force_standby_active, OPT_BOOL, true) // should mons force standby-replay mds to be active
- OPTION(mon_warn_on_old_mons, OPT_BOOL, true) // should mons set health to WARN if part of quorum is old?
- OPTION(mon_warn_on_legacy_crush_tunables, OPT_BOOL, true) // warn if crush tunables are not optimal
- OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0'
-+OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL, true)
- OPTION(mon_min_osdmap_epochs, OPT_INT, 500)
- OPTION(mon_max_pgmap_epochs, OPT_INT, 500)
- OPTION(mon_max_log_epochs, OPT_INT, 500)
- OPTION(mon_max_mdsmap_epochs, OPT_INT, 500)
-@@ -433,8 +434,9 @@
- "technique=reed_sol_van "
- "k=2 "
- "m=1 "
- ) // default properties of osd pool create
-+OPTION(osd_erasure_code_plugins, OPT_STR, "jerasure") // list of erasure code plugins
- OPTION(osd_pool_default_flags, OPT_INT, 0) // default flags for new pools
- OPTION(osd_pool_default_flag_hashpspool, OPT_BOOL, true) // use new pg hashing to prevent pool/pg overlap
- OPTION(osd_pool_default_hit_set_bloom_fpp, OPT_FLOAT, .05)
- OPTION(osd_pool_default_cache_target_dirty_ratio, OPT_FLOAT, .4)
-@@ -449,16 +451,19 @@
- OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200)
- OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom")
-
- OPTION(osd_map_dedup, OPT_BOOL, true)
-+OPTION(osd_map_max_advance, OPT_INT, 200) // make this < cache_size!
- OPTION(osd_map_cache_size, OPT_INT, 500)
- OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message
- OPTION(osd_map_share_max_epochs, OPT_INT, 100) // cap on # of inc maps we send to peers, clients
- OPTION(osd_op_threads, OPT_INT, 2) // 0 == no threading
- OPTION(osd_peering_wq_batch_size, OPT_U64, 20)
- OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64, 4194304)
- OPTION(osd_op_pq_min_cost, OPT_U64, 65536)
- OPTION(osd_disk_threads, OPT_INT, 1)
-+OPTION(osd_disk_thread_ioprio_class, OPT_STR, "") // rt realtime be besteffort best effort idle
-+OPTION(osd_disk_thread_ioprio_priority, OPT_INT, -1) // 0-7
- OPTION(osd_recovery_threads, OPT_INT, 1)
- OPTION(osd_recover_clone_overlap, OPT_BOOL, true) // preserve clone_overlap during recovery/migration
-
- // Only use clone_overlap for recovery if there are fewer than
-@@ -472,8 +477,9 @@
- OPTION(osd_snap_trim_thread_timeout, OPT_INT, 60*60*1)
- OPTION(osd_snap_trim_sleep, OPT_FLOAT, 0)
- OPTION(osd_scrub_thread_timeout, OPT_INT, 60)
- OPTION(osd_scrub_finalize_thread_timeout, OPT_INT, 60*10)
-+OPTION(osd_scrub_invalid_stats, OPT_BOOL, true)
- OPTION(osd_remove_thread_timeout, OPT_INT, 60*60)
- OPTION(osd_command_thread_timeout, OPT_INT, 10*60)
- OPTION(osd_age, OPT_FLOAT, .8)
- OPTION(osd_age_time, OPT_INT, 0)
-@@ -508,8 +514,9 @@
- OPTION(osd_scrub_min_interval, OPT_FLOAT, 60*60*24) // if load is low
- OPTION(osd_scrub_max_interval, OPT_FLOAT, 7*60*60*24) // regardless of load
- OPTION(osd_scrub_chunk_min, OPT_INT, 5)
- OPTION(osd_scrub_chunk_max, OPT_INT, 25)
-+OPTION(osd_scrub_sleep, OPT_FLOAT, 0) // sleep between [deep]scrub ops
- OPTION(osd_deep_scrub_interval, OPT_FLOAT, 60*60*24*7) // once a week
- OPTION(osd_deep_scrub_stride, OPT_INT, 524288)
- OPTION(osd_scan_list_ping_tp_interval, OPT_U64, 100)
- OPTION(osd_auto_weight, OPT_BOOL, false)
-@@ -689,8 +696,11 @@
- OPTION(keyvaluestore_debug_check_backend, OPT_BOOL, 0) // Expensive debugging check on sync
- OPTION(keyvaluestore_op_threads, OPT_INT, 2)
- OPTION(keyvaluestore_op_thread_timeout, OPT_INT, 60)
- OPTION(keyvaluestore_op_thread_suicide_timeout, OPT_INT, 180)
-+OPTION(keyvaluestore_default_strip_size, OPT_INT, 4096) // Only affect new object
-+OPTION(keyvaluestore_max_expected_write_size, OPT_U64, 1ULL << 24) // bytes
-+OPTION(keyvaluestore_header_cache_size, OPT_INT, 4096) // Header cache size
-
- // max bytes to search ahead in journal searching for corruption
- OPTION(journal_max_corrupt_search, OPT_U64, 10<<20)
- OPTION(journal_block_align, OPT_BOOL, true)
-@@ -712,8 +722,9 @@
- OPTION(rbd_cache_size, OPT_LONGLONG, 32<<20) // cache size in bytes
- OPTION(rbd_cache_max_dirty, OPT_LONGLONG, 24<<20) // dirty limit in bytes - set to 0 for write-through caching
- OPTION(rbd_cache_target_dirty, OPT_LONGLONG, 16<<20) // target dirty limit in bytes
- OPTION(rbd_cache_max_dirty_age, OPT_FLOAT, 1.0) // seconds in cache before writeback starts
-+OPTION(rbd_cache_max_dirty_object, OPT_INT, 0) // dirty limit for objects - set to 0 for auto calculate from rbd_cache_size
- OPTION(rbd_cache_block_writes_upfront, OPT_BOOL, false) // whether to block writes to the cache before the aio_write call completes (true), or block before the aio completion is called (false)
- OPTION(rbd_concurrent_management_ops, OPT_INT, 10) // how many operations can be in flight for a management operation like deleting or resizing an image
- OPTION(rbd_balance_snap_reads, OPT_BOOL, false)
- OPTION(rbd_localize_snap_reads, OPT_BOOL, false)
---- /dev/null
-+++ b/src/common/io_priority.cc
-@@ -0,0 +1,54 @@
-+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-+// vim: ts=8 sw=2 smarttab
-+/*
-+ * Ceph - scalable distributed file system
-+ *
-+ * Copyright (C) 2012 Red Hat
-+ *
-+ * This is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License version 2.1, as published by the Free Software
-+ * Foundation. See file COPYING.
-+ *
-+ */
-+
-+#include <sys/types.h>
-+#include <unistd.h>
-+#include <sys/syscall.h> /* For SYS_xxx definitions */
-+#include <algorithm>
-+#include <errno.h>
-+
-+#include "common/errno.h"
-+#include "io_priority.h"
-+
-+pid_t ceph_gettid(void)
-+{
-+#ifdef __linux__
-+ return syscall(SYS_gettid);
-+#else
-+ return -ENOSYS;
-+#endif
-+}
-+
-+int ceph_ioprio_set(int whence, int who, int ioprio)
-+{
-+#ifdef __linux__
-+ return syscall(SYS_ioprio_set, whence, who, ioprio);
-+#else
-+ return -ENOSYS;
-+#endif
-+}
-+
-+int ceph_ioprio_string_to_class(const std::string& s)
-+{
-+ std::string l;
-+ std::transform(s.begin(), s.end(), l.begin(), ::tolower);
-+
-+ if (l == "idle")
-+ return IOPRIO_CLASS_IDLE;
-+ if (l == "be" || l == "besteffort" || l == "best effort")
-+ return IOPRIO_CLASS_BE;
-+ if (l == "rt" || l == "realtime" || l == "real time")
-+ return IOPRIO_CLASS_RT;
-+ return -EINVAL;
-+}
---- /dev/null
-+++ b/src/common/io_priority.h
-@@ -0,0 +1,44 @@
-+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-+// vim: ts=8 sw=2 smarttab
-+/*
-+ * Ceph - scalable distributed file system
-+ *
-+ * Copyright (C) 2012 Red Hat
-+ *
-+ * This is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License version 2.1, as published by the Free Software
-+ * Foundation. See file COPYING.
-+ *
-+ */
-+
-+#ifndef CEPH_COMMON_IO_PRIORITY_H
-+#define CEPH_COMMON_IO_PRIORITY_H
-+
-+#include <string>
-+
-+extern pid_t ceph_gettid();
-+
-+#ifndef IOPRIO_WHO_PROCESS
-+# define IOPRIO_WHO_PROCESS 1
-+#endif
-+#ifndef IOPRIO_PRIO_VALUE
-+# define IOPRIO_CLASS_SHIFT 13
-+# define IOPRIO_PRIO_VALUE(class, data) \
-+ (((class) << IOPRIO_CLASS_SHIFT) | (data))
-+#endif
-+#ifndef IOPRIO_CLASS_RT
-+# define IOPRIO_CLASS_RT 1
-+#endif
-+#ifndef IOPRIO_CLASS_BE
-+# define IOPRIO_CLASS_BE 2
-+#endif
-+#ifndef IOPRIO_CLASS_IDLE
-+# define IOPRIO_CLASS_IDLE 3
-+#endif
-+
-+extern int ceph_ioprio_set(int whence, int who, int ioprio);
-+
-+extern int ceph_ioprio_string_to_class(const std::string& s);
-+
-+#endif
---- /dev/null
-+++ b/src/common/random_cache.hpp
-@@ -0,0 +1,111 @@
-+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-+// vim: ts=8 sw=2 smarttab
-+/*
-+ * Ceph - scalable distributed file system
-+ *
-+ * Copyright (C) 2014 UnitedStack <haomai at unitedstack.com>
-+ *
-+ * Author: Haomai Wang <haomaiwang at gmail.com>
-+ *
-+ * This is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License version 2.1, as published by the Free Software
-+ * Foundation. See file COPYING.
-+ *
-+ */
-+
-+#ifndef CEPH_RANDOMCACHE_H
-+#define CEPH_RANDOMCACHE_H
-+
-+#include "common/Mutex.h"
-+#include "include/compat.h"
-+#include "include/unordered_map.h"
-+
-+
-+// Although This is a ramdom cache implementation, here still consider to make
-+// the trim progress more reasonable. Each item owns its lookup frequency,
-+// when the cache is full it will randomly pick up several items and compare the
-+// frequency associated with. The least frequency of items will be evicted.
-+template <class K, class V>
-+class RandomCache {
-+ // The first element of pair is the frequency of item, it's used to evict item
-+ ceph::unordered_map<K, pair<uint64_t, V> > contents;
-+ Mutex lock;
-+ uint64_t max_size;
-+ K last_trim_key;
-+
-+ // When cache reach full, consider to evict a certain number of items
-+ static const uint64_t EVICT_COUNT = 5;
-+ // Avoid too much overhead on comparing items's frequency, the number of
-+ // compare items is expected to small.
-+ static const uint64_t COMPARE_COUNT = 3;
-+
-+ // In order to make evict cache progress more lightweight and effective,
-+ // several items are expected to evicted in one call
-+ void trim_cache(uint64_t evict_count) {
-+ typename ceph::unordered_map<K, pair<uint64_t, V> >::iterator it = contents.find(last_trim_key);
-+ uint64_t total_compare = evict_count * COMPARE_COUNT;
-+ map<uint64_t, K> candidates;
-+
-+ while (total_compare--) {
-+ if (it == contents.end()) {
-+ it = contents.begin();
-+ }
-+
-+ candidates[it->second.first] = it->first;
-+ it++;
-+ }
-+ if (it != contents.end())
-+ last_trim_key = it->first;
-+ else
-+ last_trim_key = contents.begin()->first;
-+
-+ for (typename map<uint64_t, K>::iterator j = candidates.begin(); j != candidates.end(); j++) {
-+ contents.erase(j->second);
-+ evict_count--;
-+ if (!evict_count)
-+ break;
-+ }
-+ }
-+
-+ public:
-+ RandomCache(size_t max_size=20) : lock("RandomCache::lock"),
-+ max_size(max_size) {}
-+ ~RandomCache() {
-+ contents.clear();
-+ }
-+
-+ void clear(K key) {
-+ Mutex::Locker l(lock);
-+ contents.erase(key);
-+ }
-+
-+ void set_size(size_t new_size) {
-+ Mutex::Locker l(lock);
-+ max_size = new_size;
-+ if (max_size <= contents.size()) {
-+ trim_cache(contents.size() - max_size);
-+ }
-+ }
-+
-+ bool lookup(K key, V *out) {
-+ Mutex::Locker l(lock);
-+ typename ceph::unordered_map<K, pair<uint64_t, V> >::iterator it = contents.find(key);
-+ if (it != contents.end()) {
-+ it->second.first++;
-+ *out = it->second.second;
-+ return true;
-+ }
-+ return false;
-+ }
-+
-+ void add(K key, V value) {
-+ Mutex::Locker l(lock);
-+ if (max_size <= contents.size()) {
-+ trim_cache(EVICT_COUNT);
-+ }
-+ contents[key] = make_pair(1, value);
-+ }
-+};
-+
-+#endif
---- a/src/common/str_map.cc
-+++ b/src/common/str_map.cc
-@@ -23,9 +23,9 @@
-
- using namespace std;
-
- int get_str_map(const string &str,
-- stringstream &ss,
-+ ostream &ss,
- map<string,string> *str_map)
- {
- json_spirit::mValue json;
- try {
---- a/src/common/strtol.cc
-+++ b/src/common/strtol.cc
-@@ -16,8 +16,11 @@
- #include <limits.h>
- #include <sstream>
- #include <stdlib.h>
- #include <string>
-+extern "C" {
-+#include <stdint.h>
-+}
-
- using std::ostringstream;
-
- long long strict_strtoll(const char *str, int base, std::string *err)
-@@ -123,4 +126,44 @@
- }
- *err = "";
- return ret;
- }
-+
-+uint64_t strict_sistrtoll(const char *str, std::string *err)
-+{
-+ std::string s(str);
-+ if (s.size() == 0) {
-+ ostringstream oss;
-+ oss << "strict_sistrtoll: value not specified";
-+ *err = oss.str();
-+ return 0;
-+ }
-+ const char &u = s.at(s.size()-1); //str[std::strlen(str)-1];
-+ int m = 0;
-+ if (u == 'B')
-+ m = 0;
-+ else if (u == 'K')
-+ m = 10;
-+ else if (u == 'M')
-+ m = 20;
-+ else if (u == 'G')
-+ m = 30;
-+ else if (u == 'T')
-+ m = 40;
-+ else if (u == 'P')
-+ m = 50;
-+ else if (u == 'E')
-+ m = 60;
-+ else
-+ m = -1;
-+
-+ const char *v = NULL;
-+ if (m >= 0)
-+ s = std::string(str, s.size()-1);
-+ v = s.c_str();
-+
-+ uint64_t r = strict_strtoll(v, 10, err);
-+ if (err->empty() && m > 0) {
-+ r = (r << m);
-+ }
-+ return r;
-+}
---- a/src/common/strtol.h
-+++ b/src/common/strtol.h
-@@ -15,8 +15,11 @@
- #ifndef CEPH_COMMON_STRTOL_H
- #define CEPH_COMMON_STRTOL_H
-
- #include <string>
-+extern "C" {
-+#include <stdint.h>
-+}
-
- long long strict_strtoll(const char *str, int base, std::string *err);
-
- int strict_strtol(const char *str, int base, std::string *err);
-@@ -24,5 +27,7 @@
- double strict_strtod(const char *str, std::string *err);
-
- float strict_strtof(const char *str, std::string *err);
-
-+uint64_t strict_sistrtoll(const char *str, std::string *err);
-+
- #endif
---- a/src/crush/CrushWrapper.cc
-+++ b/src/crush/CrushWrapper.cc
-@@ -9,34 +9,56 @@
- #define dout_subsys ceph_subsys_crush
-
- bool CrushWrapper::has_v2_rules() const
- {
-- // check rules for use of indep or new SET_* rule steps
- for (unsigned i=0; i<crush->max_rules; i++) {
-- crush_rule *r = crush->rules[i];
-- if (!r)
-- continue;
-- for (unsigned j=0; j<r->len; j++) {
-- if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
-- r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
-- r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
-- r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES)
-- return true;
-+ if (is_v2_rule(i)) {
-+ return true;
-+ }
-+ }
-+ return false;
-+}
-+
-+bool CrushWrapper::is_v2_rule(unsigned ruleid) const
-+{
-+ // check rule for use of indep or new SET_* rule steps
-+ if (ruleid >= crush->max_rules)
-+ return false;
-+ crush_rule *r = crush->rules[ruleid];
-+ if (!r)
-+ return false;
-+ for (unsigned j=0; j<r->len; j++) {
-+ if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
-+ r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
-+ r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
-+ r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES) {
-+ return true;
- }
- }
- return false;
- }
-
- bool CrushWrapper::has_v3_rules() const
- {
-- // check rules for use of SET_CHOOSELEAF_VARY_R step
- for (unsigned i=0; i<crush->max_rules; i++) {
-- crush_rule *r = crush->rules[i];
-- if (!r)
-- continue;
-- for (unsigned j=0; j<r->len; j++) {
-- if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R)
-- return true;
-+ if (is_v3_rule(i)) {
-+ return true;
-+ }
-+ }
-+ return false;
-+}
-+
-+bool CrushWrapper::is_v3_rule(unsigned ruleid) const
-+{
-+ // check rule for use of SET_CHOOSELEAF_VARY_R step
-+ if (ruleid >= crush->max_rules)
-+ return false;
-+ crush_rule *r = crush->rules[ruleid];
-+ if (!r)
-+ return false;
-+ for (unsigned j=0; j<r->len; j++) {
-+ if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R) {
-+ return true;
- }
- }
- return false;
- }
-@@ -793,8 +815,61 @@
- have_rmaps = false;
- return rno;
- }
-
-+int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
-+{
-+ if (ruleno >= crush->max_rules)
-+ return -ENOENT;
-+ if (crush->rules[ruleno] == NULL)
-+ return -ENOENT;
-+ crush_rule *rule = crush->rules[ruleno];
-+
-+ // build a weight map for each TAKE in the rule, and then merge them
-+ for (unsigned i=0; i<rule->len; ++i) {
-+ map<int,float> m;
-+ float sum = 0;
-+ if (rule->steps[i].op == CRUSH_RULE_TAKE) {
-+ int n = rule->steps[i].arg1;
-+ if (n >= 0) {
-+ m[n] = 1.0;
-+ sum = 1.0;
-+ } else {
-+ list<int> q;
-+ q.push_back(n);
-+ //breadth first iterate the OSD tree
-+ while (!q.empty()) {
-+ int bno = q.front();
-+ q.pop_front();
-+ crush_bucket *b = crush->buckets[-1-bno];
-+ assert(b);
-+ for (unsigned j=0; j<b->size; ++j) {
-+ int item_id = b->items[j];
-+ if (item_id >= 0) //it's an OSD
-+ {
-+ float w = crush_get_bucket_item_weight(b, j);
-+ m[item_id] = w;
-+ sum += w;
-+ }
-+ else //not an OSD, expand the child later
-+ q.push_back(item_id);
-+ }
-+ }
-+ }
-+ }
-+ for (map<int,float>::iterator p = m.begin(); p != m.end(); ++p) {
-+ map<int,float>::iterator q = pmap->find(p->first);
-+ if (q == pmap->end()) {
-+ (*pmap)[p->first] = p->second / sum;
-+ } else {
-+ q->second += p->second / sum;
-+ }
-+ }
-+ }
-+
-+ return 0;
-+}
-+
- int CrushWrapper::remove_rule(int ruleno)
- {
- if (ruleno >= (int)crush->max_rules)
- return -ENOENT;
---- a/src/crush/CrushWrapper.h
-+++ b/src/crush/CrushWrapper.h
-@@ -215,8 +215,10 @@
- }
- bool has_v2_rules() const;
- bool has_v3_rules() const;
-
-+ bool is_v2_rule(unsigned ruleid) const;
-+ bool is_v3_rule(unsigned ruleid) const;
-
- // bucket types
- int get_num_type_names() const {
- return type_map.size();
-@@ -630,8 +632,20 @@
- if (IS_ERR(s)) return PTR_ERR(s);
- return s->arg2;
- }
-
-+ /**
-+ * calculate a map of osds to weights for a given rule
-+ *
-+ * Generate a map of which OSDs get how much relative weight for a
-+ * given rule.
-+ *
-+ * @param ruleno [in] rule id
-+ * @param pmap [out] map of osd to weight
-+ * @return 0 for success, or negative error code
-+ */
-+ int get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap);
-+
- /* modifiers */
- int add_rule(int len, int ruleset, int type, int minsize, int maxsize, int ruleno) {
- if (!crush) return -ENOENT;
- crush_rule *n = crush_make_rule(len, ruleset, type, minsize, maxsize);
---- a/src/erasure-code/ErasureCodeInterface.h
-+++ b/src/erasure-code/ErasureCodeInterface.h
-@@ -166,9 +166,9 @@
- *
- * @param [in] name of the ruleset to create
- * @param [in] crush crushmap in which the ruleset is created
- * @param [out] ss contains informative messages when an error occurs
-- * @return **0** on success or a negative errno on error.
-+ * @return a ruleset on success or a negative errno on error.
- */
- virtual int create_ruleset(const string &name,
- CrushWrapper &crush,
- ostream *ss) const = 0;
---- a/src/erasure-code/ErasureCodePlugin.cc
-+++ b/src/erasure-code/ErasureCodePlugin.cc
-@@ -3,8 +3,9 @@
- /*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2013,2014 Cloudwatt <libre.licensing at cloudwatt.com>
-+ * Copyright (C) 2014 Red Hat <contact at redhat.com>
- *
- * Author: Loic Dachary <loic at dachary.org>
- *
- * This library is free software; you can redistribute it and/or
-@@ -18,8 +19,9 @@
- #include <dlfcn.h>
-
- #include "ErasureCodePlugin.h"
- #include "common/errno.h"
-+#include "include/str_list.h"
-
- #define PLUGIN_PREFIX "libec_"
- #define PLUGIN_SUFFIX ".so"
- #define PLUGIN_INIT_FUNCTION "__erasure_code_init"
-@@ -129,7 +131,33 @@
- }
-
- (*plugin)->library = library;
-
-+ ss << __func__ << ": " << plugin_name << " ";
-+
- return 0;
- }
-
-+int ErasureCodePluginRegistry::preload(const std::string &plugins,
-+ const std::string &directory,
-+ ostream &ss)
-+{
-+ map<string,string> profile;
-+ profile["directory"] = directory;
-+ list<string> plugins_list;
-+ get_str_list(plugins, plugins_list);
-+ for (list<string>::iterator i = plugins_list.begin();
-+ i != plugins_list.end();
-+ i++) {
-+ ErasureCodePlugin *plugin;
-+ int r = load(*i, profile, &plugin, ss);
-+ if (r)
-+ return r;
-+
-+ ErasureCodeInterfaceRef erasure_code;
-+ profile["technique"] = "reed_sol_van";
-+ r = plugin->factory(profile, &erasure_code);
-+ if (r)
-+ return r;
-+ }
-+ return 0;
-+}
---- a/src/erasure-code/ErasureCodePlugin.h
-+++ b/src/erasure-code/ErasureCodePlugin.h
-@@ -66,8 +66,11 @@
- const map<std::string,std::string> &parameters,
- ErasureCodePlugin **plugin,
- ostream &ss);
-
-+ int preload(const std::string &plugins,
-+ const std::string &directory,
-+ ostream &ss);
- };
- }
-
- #endif
---- a/src/erasure-code/jerasure/ErasureCodeJerasure.cc
-+++ b/src/erasure-code/jerasure/ErasureCodeJerasure.cc
-@@ -43,10 +43,14 @@
- int ErasureCodeJerasure::create_ruleset(const string &name,
- CrushWrapper &crush,
- ostream *ss) const
- {
-- return crush.add_simple_ruleset(name, ruleset_root, ruleset_failure_domain,
-- "indep", pg_pool_t::TYPE_ERASURE, ss);
-+ int ruleid = crush.add_simple_ruleset(name, ruleset_root, ruleset_failure_domain,
-+ "indep", pg_pool_t::TYPE_ERASURE, ss);
-+ if (ruleid < 0)
-+ return ruleid;
-+ else
-+ return crush.get_rule_mask_ruleset(ruleid);
- }
-
- void ErasureCodeJerasure::init(const map<string,string> &parameters)
- {
---- a/src/include/atomic.h
-+++ b/src/include/atomic.h
-@@ -20,12 +20,68 @@
- # include "acconfig.h"
- #endif
-
- #include <stdlib.h>
-+#include "include/Spinlock.h"
-+
-+namespace ceph {
-+ template <class T>
-+ class atomic_spinlock_t {
-+ mutable ceph_spinlock_t lock;
-+ T val;
-+ public:
-+ atomic_spinlock_t(T i=0)
-+ : val(i) {
-+ ceph_spin_init(&lock);
-+ }
-+ ~atomic_spinlock_t() {
-+ ceph_spin_destroy(&lock);
-+ }
-+ void set(T v) {
-+ ceph_spin_lock(&lock);
-+ val = v;
-+ ceph_spin_unlock(&lock);
-+ }
-+ T inc() {
-+ ceph_spin_lock(&lock);
-+ T r = ++val;
-+ ceph_spin_unlock(&lock);
-+ return r;
-+ }
-+ T dec() {
-+ ceph_spin_lock(&lock);
-+ T r = --val;
-+ ceph_spin_unlock(&lock);
-+ return r;
-+ }
-+ void add(T d) {
-+ ceph_spin_lock(&lock);
-+ val += d;
-+ ceph_spin_unlock(&lock);
-+ }
-+ void sub(T d) {
-+ ceph_spin_lock(&lock);
-+ val -= d;
-+ ceph_spin_unlock(&lock);
-+ }
-+ T read() const {
-+ T ret;
-+ ceph_spin_lock(&lock);
-+ ret = val;
-+ ceph_spin_unlock(&lock);
-+ return ret;
-+ }
-+ private:
-+ // forbid copying
-+ atomic_spinlock_t(const atomic_spinlock_t<T> &other);
-+ atomic_spinlock_t &operator=(const atomic_spinlock_t<T> &rhs);
-+ };
-+}
-
- #ifndef NO_ATOMIC_OPS
-
- // libatomic_ops implementation
-+#define AO_REQUIRE_CAS
- #include <atomic_ops.h>
-
- // reinclude our assert to clobber the system one
- #include "include/assert.h"
-@@ -34,9 +90,9 @@
- class atomic_t {
- AO_t val;
- public:
- atomic_t(AO_t i=0) : val(i) {}
-- void set(size_t v) {
-+ void set(AO_t v) {
- AO_store(&val, v);
- }
- AO_t inc() {
- return AO_fetch_and_add1(&val) + 1;
-@@ -46,10 +102,10 @@
- }
- void add(AO_t add_me) {
- AO_fetch_and_add(&val, add_me);
- }
-- void sub(int sub_me) {
-- int negsub = 0 - sub_me;
-+ void sub(AO_t sub_me) {
-+ AO_t negsub = 0 - sub_me;
- AO_fetch_and_add_write(&val, (AO_t)negsub);
- }
- AO_t read() const {
- // cast away const on the pointer. this is only needed to build
-@@ -61,65 +117,26 @@
- // forbid copying
- atomic_t(const atomic_t &other);
- atomic_t &operator=(const atomic_t &rhs);
- };
-+
-+#if SIZEOF_AO_T == 8
-+ typedef atomic_t atomic64_t;
-+#else
-+ typedef atomic_spinlock_t<unsigned long long> atomic64_t;
-+#endif
-+
- }
-+
- #else
- /*
- * crappy slow implementation that uses a pthreads spinlock.
- */
- #include "include/Spinlock.h"
-
- namespace ceph {
-- class atomic_t {
-- mutable ceph_spinlock_t lock;
-- signed long val;
-- public:
-- atomic_t(int i=0)
-- : val(i) {
-- ceph_spin_init(&lock);
-- }
-- ~atomic_t() {
-- ceph_spin_destroy(&lock);
-- }
-- void set(size_t v) {
-- ceph_spin_lock(&lock);
-- val = v;
-- ceph_spin_unlock(&lock);
-- }
-- int inc() {
-- ceph_spin_lock(&lock);
-- int r = ++val;
-- ceph_spin_unlock(&lock);
-- return r;
-- }
-- int dec() {
-- ceph_spin_lock(&lock);
-- int r = --val;
-- ceph_spin_unlock(&lock);
-- return r;
-- }
-- void add(int d) {
-- ceph_spin_lock(&lock);
-- val += d;
-- ceph_spin_unlock(&lock);
-- }
-- void sub(int d) {
-- ceph_spin_lock(&lock);
-- val -= d;
-- ceph_spin_unlock(&lock);
-- }
-- int read() const {
-- signed long ret;
-- ceph_spin_lock(&lock);
-- ret = val;
-- ceph_spin_unlock(&lock);
-- return ret;
-- }
-- private:
-- // forbid copying
-- atomic_t(const atomic_t &other);
-- atomic_t &operator=(const atomic_t &rhs);
-- };
-+ typedef atomic_spinlock_t<unsigned> atomic_t;
-+ typedef atomic_spinlock_t<unsigned long long> atomic64_t;
- }
-+
- #endif
- #endif
---- a/src/include/intarith.h
-+++ b/src/include/intarith.h
-@@ -27,9 +27,9 @@
- # define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
- #endif
-
- #ifndef ROUND_UP_TO
--# define ROUND_UP_TO(n, d) (((n)+(d)-1) & ~((d)-1))
-+# define ROUND_UP_TO(n, d) ((n)%(d) ? ((n)+(d)-(n)%(d)) : (n))
- #endif
-
- #ifndef SHIFT_ROUND_UP
- # define SHIFT_ROUND_UP(x,y) (((x)+(1<<(y))-1) >> (y))
---- a/src/include/rbd/librbd.h
-+++ b/src/include/rbd/librbd.h
-@@ -38,8 +38,9 @@
- #define LIBRBD_VERSION_CODE LIBRBD_VERSION(LIBRBD_VER_MAJOR, LIBRBD_VER_MINOR, LIBRBD_VER_EXTRA)
-
- #define LIBRBD_SUPPORTS_WATCH 0
- #define LIBRBD_SUPPORTS_AIO_FLUSH 1
-+#define LIBRBD_SUPPORTS_INVALIDATE 1
-
- typedef void *rbd_snap_t;
- typedef void *rbd_image_t;
-
-@@ -375,8 +376,16 @@
- * @returns 0 on success, negative error code on failure
- */
- int rbd_aio_flush(rbd_image_t image, rbd_completion_t c);
-
-+/**
-+ * Drop any cached data for an image
-+ *
-+ * @param image the image to invalidate cached data for
-+ * @returns 0 on success, negative error code on failure
-+ */
-+int rbd_invalidate_cache(rbd_image_t image);
-+
- #ifdef __cplusplus
- }
- #endif
-
---- a/src/include/rbd/librbd.hpp
-+++ b/src/include/rbd/librbd.hpp
-@@ -215,8 +215,16 @@
- * @returns 0 on success, negative error code on failure
- */
- int aio_flush(RBD::AioCompletion *c);
-
-+ /**
-+ * Drop any cached data for an image
-+ *
-+ * @param image the image to invalidate cached data for
-+ * @returns 0 on success, negative error code on failure
-+ */
-+ int invalidate_cache();
-+
- private:
- friend class RBD;
-
- Image(const Image& rhs);
---- a/src/include/str_map.h
-+++ b/src/include/str_map.h
-@@ -52,8 +52,8 @@
- * @param [out] str_map key/value pairs read from str
- * @return **0** on success or a -EINVAL on error.
- */
- extern int get_str_map(const std::string &str,
-- std::stringstream &ss,
-+ std::ostream &ss,
- std::map<std::string,std::string> *str_map);
-
- #endif
---- a/src/init-ceph.in
-+++ b/src/init-ceph.in
-@@ -30,8 +30,9 @@
-
- usage_exit() {
- echo "usage: $0 [options] {start|stop|restart|condrestart} [mon|osd|mds]..."
- printf "\t-c ceph.conf\n"
-+ printf "\t--cluster [cluster name]\tdefine the cluster name\n"
- printf "\t--valgrind\trun via valgrind\n"
- printf "\t--hostname [hostname]\toverride hostname lookup\n"
- exit
- }
-@@ -112,8 +113,10 @@
- monaddr=
- dofsmount=1
- dofsumount=0
- verbose=0
-+use_default_conf=1
-+
-
- while echo $1 | grep -q '^-'; do # FIXME: why not '^-'?
- case $1 in
- -v | --verbose)
-@@ -152,10 +155,17 @@
- --conf | -c)
- [ -z "$2" ] && usage_exit
- options="$options $1"
- shift
-+ use_default_conf=0
- conf=$1
- ;;
-+ --cluster )
-+ [ -z "$2" ] && usage_exit
-+ options="$options $1"
-+ shift
-+ cluster=$1
-+ ;;
- --hostname )
- [ -z "$2" ] && usage_exit
- options="$options $1"
- shift
-@@ -169,8 +179,22 @@
- options="$options $1"
- shift
- done
-
-+
-+# if `--cluster` was not passed in, fallback to looking at the config name
-+if [ -z "$cluster" ]; then
-+ cluster=`echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1`
-+else
-+ # if we were told to use a given cluster name then $conf needs to be updated
-+ # but just define it if `--conf` was not specified, otherwise we would be silently
-+ # overriding $conf even if it was defined with `--conf`
-+ if [ $use_default_conf -eq 1 ]; then
-+ conf="/etc/ceph/$cluster.conf"
-+ fi
-+fi
-+
-+
- verify_conf
-
- command=$1
- [ -n "$*" ] && shift
-@@ -188,13 +212,12 @@
-
- for name in $what; do
- type=`echo $name | cut -c 1-3` # e.g. 'mon', if $item is 'mon1'
- id=`echo $name | cut -c 4- | sed 's/^\\.//'`
-- cluster=`echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1`
- num=$id
- name="$type.$id"
-
-- check_host || continue
-+ check_host $cluster || continue
-
- binary="$BINDIR/ceph-$type"
- cmd="$binary -i $id"
-
-@@ -234,9 +257,9 @@
- # conf file
- cmd="$cmd -c $conf"
-
- if echo $name | grep -q ^osd; then
-- get_conf osd_data "/var/lib/ceph/osd/ceph-$id" "osd data"
-+ get_conf osd_data "/var/lib/ceph/osd/$cluster-$id" "osd data"
- get_conf fs_path "$osd_data" "fs path" # mount point defaults so osd data
- get_conf fs_devs "" "devs"
- if [ -z "$fs_devs" ]; then
- # try to fallback to old keys
-@@ -334,9 +357,9 @@
- get_conf update_crush "" "osd crush update on start"
- if [ "${update_crush:-1}" = "1" -o "${update_crush:-1}" = "true" ]; then
- # update location in crush
- get_conf osd_location_hook "$BINDIR/ceph-crush-location" "osd crush location hook"
-- osd_location=`$osd_location_hook --cluster ceph --id $id --type osd`
-+ osd_location=`$osd_location_hook --cluster $cluster --id $id --type osd`
- get_conf osd_weight "" "osd crush initial weight"
- defaultweight="$(df -P -k $osd_data/. | tail -1 | awk '{ print sprintf("%.2f",$2/1073741824) }')"
- get_conf osd_keyring "$osd_data/keyring" "keyring"
- do_cmd "timeout 30 $BINDIR/ceph -c $conf --name=osd.$id --keyring=$osd_keyring osd crush create-or-move -- $id ${osd_weight:-${defaultweight:-1}} $osd_location"
-@@ -365,9 +388,9 @@
- # in creating these keys.
- get_conf mon_data "/var/lib/ceph/mon/ceph-$id" "mon data"
- if [ "$mon_data" = "/var/lib/ceph/mon/ceph-$id" -a "$asok" = "/var/run/ceph/ceph-mon.$id.asok" ]; then
- echo Starting ceph-create-keys on $host...
-- cmd2="$SBINDIR/ceph-create-keys -i $id 2> /dev/null &"
-+ cmd2="$SBINDIR/ceph-create-keys --cluster $cluster -i $id 2> /dev/null &"
- do_cmd "$cmd2"
- fi
- fi
-
---- a/src/init-radosgw.sysv
-+++ b/src/init-radosgw.sysv
-@@ -14,8 +14,9 @@
- . /etc/rc.d/init.d/functions
-
- daemon_is_running() {
- daemon=$1
-+ sleep 1
- if pidof $daemon >/dev/null; then
- echo "$daemon is running."
- exit 0
- else
-@@ -43,8 +44,12 @@
- [ $VERBOSE -eq 1 ] && echo "$RADOSGW could not start, it is not executable."
- exit 1
- fi
-
-+# detect systemd
-+SYSTEMD=0
-+grep -qs systemd /proc/1/comm && SYSTEMD=1
-+
- case "$1" in
- start)
- echo "Starting radosgw instance(s)..."
- for name in `ceph-conf --list-sections $PREFIX`;
-@@ -78,10 +83,14 @@
- touch "$log_file"
- chown $user $log_file
- fi
-
-- #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
-- daemon --user="$user" "ulimit -n 32768; $RADOSGW -n $name"
-+ if [ $SYSTEMD -eq 1 ]; then
-+ systemd-run -r bash -c "ulimit -n 32768; $RADOSGW -n $name"
-+ else
-+ #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
-+ daemon --user="$user" "ulimit -n 32768; $RADOSGW -n $name"
-+ fi
- echo "Starting $name..."
- done
- daemon_is_running $RADOSGW
- ;;
---- a/src/librados/RadosClient.cc
-+++ b/src/librados/RadosClient.cc
-@@ -102,10 +102,12 @@
-
- lock.Lock();
-
- int r = wait_for_osdmap();
-- if (r < 0)
-+ if (r < 0) {
-+ lock.Unlock();
- return r;
-+ }
- int64_t ret = osdmap.lookup_pg_pool_name(name);
- pool_cache_rwl.get_write();
- lock.Unlock();
- if (ret < 0) {
-@@ -581,10 +583,12 @@
- int librados::RadosClient::pool_delete(const char *name)
- {
- lock.Lock();
- int r = wait_for_osdmap();
-- if (r < 0)
-+ if (r < 0) {
-+ lock.Unlock();
- return r;
-+ }
- int tmp_pool_id = osdmap.lookup_pg_pool_name(name);
- if (tmp_pool_id < 0) {
- lock.Unlock();
- return -ENOENT;
---- a/src/librbd/ImageCtx.cc
-+++ b/src/librbd/ImageCtx.cc
-@@ -184,12 +184,16 @@
- }
-
- // size object cache appropriately
- if (object_cacher) {
-- uint64_t obj = cct->_conf->rbd_cache_size / (1ull << order);
-+ uint64_t obj = cct->_conf->rbd_cache_max_dirty_object;
-+ if (!obj) {
-+ obj = cct->_conf->rbd_cache_size / (1ull << order);
-+ obj = obj * 4 + 10;
-+ }
- ldout(cct, 10) << " cache bytes " << cct->_conf->rbd_cache_size << " order " << (int)order
- << " -> about " << obj << " objects" << dendl;
-- object_cacher->set_max_objects(obj * 4 + 10);
-+ object_cacher->set_max_objects(obj);
- }
-
- ldout(cct, 10) << "init_layout stripe_unit " << stripe_unit
- << " stripe_count " << stripe_count
-@@ -572,11 +576,11 @@
- md_lock.put_write();
- object_cacher->stop();
- }
-
-- void ImageCtx::invalidate_cache() {
-+ int ImageCtx::invalidate_cache() {
- if (!object_cacher)
-- return;
-+ return 0;
- cache_lock.Lock();
- object_cacher->release_set(object_set);
- cache_lock.Unlock();
- int r = flush_cache();
-@@ -584,10 +588,14 @@
- lderr(cct) << "flush_cache returned " << r << dendl;
- cache_lock.Lock();
- bool unclean = object_cacher->release_set(object_set);
- cache_lock.Unlock();
-- if (unclean)
-- lderr(cct) << "could not release all objects from cache" << dendl;
-+ if (unclean) {
-+ lderr(cct) << "could not release all objects from cache: "
-+ << unclean << " bytes remain" << dendl;
-+ return -EBUSY;
-+ }
-+ return r;
- }
-
- void ImageCtx::clear_nonexistence_cache() {
- if (!object_cacher)
---- a/src/librbd/ImageCtx.h
-+++ b/src/librbd/ImageCtx.h
-@@ -138,9 +138,9 @@
- void user_flushed();
- void flush_cache_aio(Context *onfinish);
- int flush_cache();
- void shutdown_cache();
-- void invalidate_cache();
-+ int invalidate_cache();
- void clear_nonexistence_cache();
- int register_watch();
- void unregister_watch();
- size_t parent_io_len(uint64_t offset, size_t length,
---- a/src/librbd/internal.cc
-+++ b/src/librbd/internal.cc
-@@ -831,8 +831,11 @@
- int create(IoCtx& io_ctx, const char *imgname, uint64_t size,
- bool old_format, uint64_t features, int *order,
- uint64_t stripe_unit, uint64_t stripe_count)
- {
-+ if (!order)
-+ return -EINVAL;
-+
- CephContext *cct = (CephContext *)io_ctx.cct();
- ldout(cct, 20) << "create " << &io_ctx << " name = " << imgname
- << " size = " << size << " old_format = " << old_format
- << " features = " << features << " order = " << *order
-@@ -856,11 +859,8 @@
- lderr(cct) << "rbd image " << imgname << " already exists" << dendl;
- return -EEXIST;
- }
-
-- if (!order)
-- return -EINVAL;
--
- if (!*order)
- *order = cct->_conf->rbd_default_order;
- if (!*order)
- *order = RBD_DEFAULT_OBJ_ORDER;
-@@ -1503,9 +1503,11 @@
- RWLock::WLocker l(ictx->md_lock);
- if (size < ictx->size && ictx->object_cacher) {
- // need to invalidate since we're deleting objects, and
- // ObjectCacher doesn't track non-existent objects
-- ictx->invalidate_cache();
-+ r = ictx->invalidate_cache();
-+ if (r < 0)
-+ return r;
- }
- resize_helper(ictx, size, prog_ctx);
-
- ldout(cct, 2) << "done." << dendl;
-@@ -1846,9 +1848,11 @@
-
- // need to flush any pending writes before resizing and rolling back -
- // writes might create new snapshots. Rolling back will replace
- // the current version, so we have to invalidate that too.
-- ictx->invalidate_cache();
-+ r = ictx->invalidate_cache();
-+ if (r < 0)
-+ return r;
-
- ldout(cct, 2) << "resizing to snapshot size..." << dendl;
- NoOpProgressContext no_op;
- r = resize_helper(ictx, new_size, no_op);
-@@ -2070,9 +2074,9 @@
- << "' snap_name = '"
- << ictx->snap_name << "'" << dendl;
- int r = ictx->init();
- if (r < 0)
-- return r;
-+ goto err_close;
-
- if (!ictx->read_only) {
- r = ictx->register_watch();
- if (r < 0) {
-@@ -2876,8 +2880,21 @@
-
- return r;
- }
-
-+ int invalidate_cache(ImageCtx *ictx)
-+ {
-+ CephContext *cct = ictx->cct;
-+ ldout(cct, 20) << "invalidate_cache " << ictx << dendl;
-+
-+ int r = ictx_check(ictx);
-+ if (r < 0)
-+ return r;
-+
-+ RWLock::WLocker l(ictx->md_lock);
-+ return ictx->invalidate_cache();
-+ }
-+
- int aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
- AioCompletion *c)
- {
- CephContext *cct = ictx->cct;
---- a/src/librbd/internal.h
-+++ b/src/librbd/internal.h
-@@ -187,8 +187,9 @@
- char *buf, bufferlist *pbl, AioCompletion *c);
- int aio_flush(ImageCtx *ictx, AioCompletion *c);
- int flush(ImageCtx *ictx);
- int _flush(ImageCtx *ictx);
-+ int invalidate_cache(ImageCtx *ictx);
-
- ssize_t handle_sparse_read(CephContext *cct,
- ceph::bufferlist data_bl,
- uint64_t block_ofs,
---- a/src/librbd/librbd.cc
-+++ b/src/librbd/librbd.cc
-@@ -513,8 +513,14 @@
- ImageCtx *ictx = (ImageCtx *)ctx;
- return librbd::aio_flush(ictx, (librbd::AioCompletion *)c->pc);
- }
-
-+ int Image::invalidate_cache()
-+ {
-+ ImageCtx *ictx = (ImageCtx *)ctx;
-+ return librbd::invalidate_cache(ictx);
-+ }
-+
- } // namespace librbd
-
- extern "C" void rbd_version(int *major, int *minor, int *extra)
- {
-@@ -1129,8 +1135,14 @@
- librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
- return librbd::aio_flush(ictx, (librbd::AioCompletion *)comp->pc);
- }
-
-+extern "C" int rbd_invalidate_cache(rbd_image_t image)
-+{
-+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
-+ return librbd::invalidate_cache(ictx);
-+}
-+
- extern "C" int rbd_aio_is_complete(rbd_completion_t c)
- {
- librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
- return comp->is_complete();
---- a/src/mds/Locker.cc
-+++ b/src/mds/Locker.cc
-@@ -2061,9 +2061,15 @@
-
- void Locker::calc_new_client_ranges(CInode *in, uint64_t size, map<client_t,client_writeable_range_t>& new_ranges)
- {
- inode_t *latest = in->get_projected_inode();
-- uint64_t ms = ROUND_UP_TO((size+1)<<1, latest->get_layout_size_increment());
-+ uint64_t ms;
-+ if(latest->has_layout()) {
-+ ms = ROUND_UP_TO((size+1)<<1, latest->get_layout_size_increment());
-+ } else {
-+ // Layout-less directories like ~mds0/, have zero size
-+ ms = 0;
-+ }
-
- // increase ranges as appropriate.
- // shrink to 0 if no WR|BUFFER caps issued.
- for (map<client_t,Capability*>::iterator p = in->client_caps.begin();
---- a/src/mds/MDCache.cc
-+++ b/src/mds/MDCache.cc
-@@ -348,8 +348,9 @@
- rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;
-
- root->inode.dirstat = rootdir->fnode.fragstat;
- root->inode.rstat = rootdir->fnode.rstat;
-+ ++root->inode.rstat.rsubdirs;
- root->inode.accounted_rstat = root->inode.rstat;
-
- rootdir->mark_complete();
- rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
-@@ -398,8 +399,9 @@
- mydir->fnode.accounted_rstat = mydir->fnode.rstat;
-
- myin->inode.dirstat = mydir->fnode.fragstat;
- myin->inode.rstat = mydir->fnode.rstat;
-+ ++myin->inode.rstat.rsubdirs;
- myin->inode.accounted_rstat = myin->inode.rstat;
-
-
- mydir->mark_complete();
---- a/src/messages/MOSDSubOp.h
-+++ b/src/messages/MOSDSubOp.h
-@@ -24,9 +24,9 @@
- */
-
- class MOSDSubOp : public Message {
-
-- static const int HEAD_VERSION = 10;
-+ static const int HEAD_VERSION = 11;
- static const int COMPAT_VERSION = 1;
-
- public:
- epoch_t map_epoch;
-@@ -62,8 +62,10 @@
- eversion_t version;
-
- // piggybacked osd/og state
- eversion_t pg_trim_to; // primary->replica: trim to here
-+ eversion_t pg_trim_rollback_to; // primary->replica: trim rollback
-+ // info to here
- osd_peer_stat_t peer_stat;
-
- map<string,bufferlist> attrset;
-
-@@ -174,8 +176,13 @@
- }
- if (header.version >= 10) {
- ::decode(updated_hit_set_history, p);
- }
-+ if (header.version >= 11) {
-+ ::decode(pg_trim_rollback_to, p);
-+ } else {
-+ pg_trim_rollback_to = pg_trim_to;
-+ }
- }
-
- virtual void encode_payload(uint64_t features) {
- ::encode(map_epoch, payload);
-@@ -223,8 +230,9 @@
- ::encode(discard_temp_oid, payload);
- ::encode(from, payload);
- ::encode(pgid.shard, payload);
- ::encode(updated_hit_set_history, payload);
-+ ::encode(pg_trim_rollback_to, payload);
- }
-
- MOSDSubOp()
- : Message(MSG_OSD_SUBOP, HEAD_VERSION, COMPAT_VERSION) { }
---- a/src/mon/DataHealthService.cc
-+++ b/src/mon/DataHealthService.cc
-@@ -227,9 +227,9 @@
- if (ours.latest_avail_percent <= g_conf->mon_data_avail_warn) {
- if (ours.latest_avail_percent != last_warned_percent)
- mon->clog.warn()
- << "reached concerning levels of available space on local monitor storage"
-- << " (" << ours.latest_avail_percent << "\% free)\n";
-+ << " (" << ours.latest_avail_percent << "% free)\n";
- last_warned_percent = ours.latest_avail_percent;
- } else {
- last_warned_percent = 0;
- }
---- a/src/mon/MonCommands.h
-+++ b/src/mon/MonCommands.h
-@@ -551,9 +551,9 @@
- "name=destpool,type=CephPoolname", \
- "rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
- COMMAND("osd pool get " \
- "name=pool,type=CephPoolname " \
-- "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid", \
-+ "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile", \
- "get pool parameter <var>", "osd", "r", "cli,rest")
- COMMAND("osd pool set " \
- "name=pool,type=CephPoolname " \
- "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid " \
-@@ -567,8 +567,12 @@
- "name=pool,type=CephPoolname " \
- "name=field,type=CephChoices,strings=max_objects|max_bytes " \
- "name=val,type=CephString",
- "set object or byte limit on pool", "osd", "rw", "cli,rest")
-+COMMAND("osd pool get-quota " \
-+ "name=pool,type=CephPoolname ",
-+ "obtain object or byte limits for pool",
-+ "osd", "r", "cli,rest")
- COMMAND("osd pool stats " \
- "name=name,type=CephString,req=false",
- "obtain stats from all pools, or from specified pool",
- "osd", "r", "cli,rest")
---- a/src/mon/Monitor.cc
-+++ b/src/mon/Monitor.cc
-@@ -620,8 +620,23 @@
-
- void Monitor::refresh_from_paxos(bool *need_bootstrap)
- {
- dout(10) << __func__ << dendl;
-+
-+ bufferlist bl;
-+ int r = store->get(MONITOR_NAME, "cluster_fingerprint", bl);
-+ if (r >= 0) {
-+ try {
-+ bufferlist::iterator p = bl.begin();
-+ ::decode(fingerprint, p);
-+ }
-+ catch (buffer::error& e) {
-+ dout(10) << __func__ << " failed to decode cluster_fingerprint" << dendl;
-+ }
-+ } else {
-+ dout(10) << __func__ << " no cluster_fingerprint" << dendl;
-+ }
-+
- for (int i = 0; i < PAXOS_NUM; ++i) {
- paxos_service[i]->refresh(need_bootstrap);
- }
- for (int i = 0; i < PAXOS_NUM; ++i) {
-@@ -2392,8 +2407,9 @@
- // this must be formatted, in its current form
- if (!f)
- f.reset(new_formatter("json-pretty"));
- f->open_object_section("report");
-+ f->dump_stream("cluster_fingerprint") << fingerprint;
- f->dump_string("version", ceph_version_to_str());
- f->dump_string("commit", git_version_to_str());
- f->dump_stream("timestamp") << ceph_clock_now(NULL);
-
-@@ -2865,10 +2881,11 @@
- // let it go through and be dispatched immediately!
- return dispatch(s, m, false);
- }
- dout(1) << __func__ << " dropping stray message " << *m
-- << " from " << m->get_source_inst() << dendl;
-- return false;
-+ << " from " << m->get_source_inst() << dendl;
-+ m->put();
-+ return true;
- }
-
- if (!exited_quorum.is_zero() && !src_is_mon) {
- waitlist_or_zap_client(m);
-@@ -3846,11 +3863,31 @@
- if (!maybe_wait_for_quorum.empty()) {
- finish_contexts(g_ceph_context, maybe_wait_for_quorum);
- }
-
-+ if (is_leader() && paxos->is_active() && fingerprint.is_zero()) {
-+ // this is only necessary on upgraded clusters.
-+ MonitorDBStore::Transaction t;
-+ prepare_new_fingerprint(&t);
-+ bufferlist tbl;
-+ t.encode(tbl);
-+ paxos->propose_new_value(tbl, new C_NoopContext);
-+ }
-+
- new_tick();
- }
-
-+void Monitor::prepare_new_fingerprint(MonitorDBStore::Transaction *t)
-+{
-+ uuid_d nf;
-+ nf.generate_random();
-+ dout(10) << __func__ << " proposing cluster_fingerprint " << nf << dendl;
-+
-+ bufferlist bl;
-+ ::encode(nf, bl);
-+ t->put(MONITOR_NAME, "cluster_fingerprint", bl);
-+}
-+
- int Monitor::check_fsid()
- {
- if (!store->exists(MONITOR_NAME, "cluster_uuid"))
- return -ENOENT;
---- a/src/mon/Monitor.h
-+++ b/src/mon/Monitor.h
-@@ -127,8 +127,9 @@
- void register_cluster_logger();
- void unregister_cluster_logger();
-
- MonMap *monmap;
-+ uuid_d fingerprint;
-
- set<entity_addr_t> extra_probe_peers;
-
- LogClient clog;
-@@ -189,8 +190,10 @@
- bool is_peon() const { return state == STATE_PEON; }
-
- const utime_t &get_leader_since() const;
-
-+ void prepare_new_fingerprint(MonitorDBStore::Transaction *t);
-+
- // -- elector --
- private:
- Paxos *paxos;
- Elector elector;
---- a/src/mon/MonmapMonitor.cc
-+++ b/src/mon/MonmapMonitor.cc
-@@ -96,8 +96,13 @@
- pending_map.encode(bl, mon->get_quorum_features());
-
- put_version(t, pending_map.epoch, bl);
- put_last_committed(t, pending_map.epoch);
-+
-+ // generate a cluster fingerprint, too?
-+ if (pending_map.epoch == 1) {
-+ mon->prepare_new_fingerprint(t);
-+ }
- }
-
- void MonmapMonitor::on_active()
- {
---- a/src/mon/OSDMonitor.cc
-+++ b/src/mon/OSDMonitor.cc
-@@ -2066,8 +2066,34 @@
- }
- }
- }
-
-+ // hit_set-less cache_mode?
-+ if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
-+ int problem_cache_pools = 0;
-+ for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
-+ p != osdmap.pools.end();
-+ ++p) {
-+ const pg_pool_t& info = p->second;
-+ if (info.cache_mode_requires_hit_set() &&
-+ info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
-+ ++problem_cache_pools;
-+ if (detail) {
-+ ostringstream ss;
-+ ss << "pool '" << osdmap.get_pool_name(p->first)
-+ << "' with cache_mode " << info.get_cache_mode_name()
-+ << " needs hit_set_type to be set but it is not";
-+ detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-+ }
-+ }
-+ }
-+ if (problem_cache_pools) {
-+ ostringstream ss;
-+ ss << problem_cache_pools << " cache pools are missing hit_sets";
-+ summary.push_back(make_pair(HEALTH_WARN, ss.str()));
-+ }
-+ }
-+
- // Warn if 'mon_osd_down_out_interval' is set to zero.
- // Having this option set to zero on the leader acts much like the
- // 'noout' flag. It's hard to figure out what's going wrong with clusters
- // without the 'noout' flag set but acting like that just the same, so
-@@ -2452,8 +2478,28 @@
- const pg_pool_t *p = osdmap.get_pg_pool(pool);
- string var;
- cmd_getval(g_ceph_context, cmdmap, "var", var);
-
-+ if (!p->is_tier() &&
-+ (var == "hit_set_type" || var == "hit_set_period" ||
-+ var == "hit_set_count" || var == "hit_set_fpp" ||
-+ var == "target_max_objects" || var == "target_max_bytes" ||
-+ var == "cache_target_full_ratio" ||
-+ var == "cache_target_dirty_ratio" ||
-+ var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
-+ ss << "pool '" << poolstr
-+ << "' is not a tier pool: variable not applicable";
-+ r = -EACCES;
-+ goto reply;
-+ }
-+
-+ if (!p->is_erasure() && var == "erasure_code_profile") {
-+ ss << "pool '" << poolstr
-+ << "' is not a erasure pool: variable not applicable";
-+ r = -EACCES;
-+ goto reply;
-+ }
-+
- if (f) {
- f->open_object_section("pool");
- f->dump_string("pool", poolstr);
- f->dump_int("pool_id", pool);
-@@ -2487,8 +2533,28 @@
- } else {
- BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
- f->dump_float("hit_set_fpp", bloomp->get_fpp());
- }
-+ } else if (var == "target_max_objects") {
-+ f->dump_unsigned("target_max_objects", p->target_max_objects);
-+ } else if (var == "target_max_bytes") {
-+ f->dump_unsigned("target_max_bytes", p->target_max_bytes);
-+ } else if (var == "cache_target_dirty_ratio") {
-+ f->dump_unsigned("cache_target_dirty_ratio_micro",
-+ p->cache_target_dirty_ratio_micro);
-+ f->dump_float("cache_target_dirty_ratio",
-+ ((float)p->cache_target_dirty_ratio_micro/1000000));
-+ } else if (var == "cache_target_full_ratio") {
-+ f->dump_unsigned("cache_target_full_ratio_micro",
-+ p->cache_target_full_ratio_micro);
-+ f->dump_float("cache_target_full_ratio",
-+ ((float)p->cache_target_full_ratio_micro/1000000));
-+ } else if (var == "cache_min_flush_age") {
-+ f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
-+ } else if (var == "cache_min_evict_age") {
-+ f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
-+ } else if (var == "erasure_code_profile") {
-+ f->dump_string("erasure_code_profile", p->erasure_code_profile);
- }
-
- f->close_section();
- f->flush(rdata);
-@@ -2520,9 +2586,26 @@
- goto reply;
- }
- BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
- ss << "hit_set_fpp: " << bloomp->get_fpp();
-+ } else if (var == "target_max_objects") {
-+ ss << "target_max_objects: " << p->target_max_objects;
-+ } else if (var == "target_max_bytes") {
-+ ss << "target_max_bytes: " << p->target_max_bytes;
-+ } else if (var == "cache_target_dirty_ratio") {
-+ ss << "cache_target_dirty_ratio: "
-+ << ((float)p->cache_target_dirty_ratio_micro/1000000);
-+ } else if (var == "cache_target_full_ratio") {
-+ ss << "cache_target_full_ratio: "
-+ << ((float)p->cache_target_full_ratio_micro/1000000);
-+ } else if (var == "cache_min_flush_age") {
-+ ss << "cache_min_flush_age: " << p->cache_min_flush_age;
-+ } else if (var == "cache_min_evict_age") {
-+ ss << "cache_min_evict_age: " << p->cache_min_evict_age;
-+ } else if (var == "erasure_code_profile") {
-+ ss << "erasure_code_profile: " << p->erasure_code_profile;
- }
-+
- rdata.append(ss);
- ss.str("");
- }
- r = 0;
-@@ -2625,8 +2708,47 @@
- }
- rdata.append("\n");
- r = 0;
-
-+ } else if (prefix == "osd pool get-quota") {
-+ string pool_name;
-+ cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
-+
-+ int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
-+ if (poolid < 0) {
-+ assert(poolid == -ENOENT);
-+ ss << "unrecognized pool '" << pool_name << "'";
-+ r = -ENOENT;
-+ goto reply;
-+ }
-+ const pg_pool_t *p = osdmap.get_pg_pool(poolid);
-+
-+ if (f) {
-+ f->open_object_section("pool_quotas");
-+ f->dump_string("pool_name", pool_name);
-+ f->dump_unsigned("pool_id", poolid);
-+ f->dump_unsigned("quota_max_objects", p->quota_max_objects);
-+ f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
-+ f->close_section();
-+ f->flush(rdata);
-+ } else {
-+ stringstream rs;
-+ rs << "quotas for pool '" << pool_name << "':\n"
-+ << " max objects: ";
-+ if (p->quota_max_objects == 0)
-+ rs << "N/A";
-+ else
-+ rs << si_t(p->quota_max_objects) << " objects";
-+ rs << "\n"
-+ << " max bytes : ";
-+ if (p->quota_max_bytes == 0)
-+ rs << "N/A";
-+ else
-+ rs << si_t(p->quota_max_bytes) << "B";
-+ rdata.append(rs.str());
-+ }
-+ rdata.append("\n");
-+ r = 0;
- } else if (prefix == "osd crush rule list" ||
- prefix == "osd crush rule ls") {
- string format;
- cmd_getval(g_ceph_context, cmdmap, "format", format, string("json-pretty"));
-@@ -2924,17 +3046,20 @@
- const string &profile,
- int *ruleset,
- stringstream &ss)
- {
-- *ruleset = osdmap.crush->get_rule_id(name);
-- if (*ruleset != -ENOENT)
-+ int ruleid = osdmap.crush->get_rule_id(name);
-+ if (ruleid != -ENOENT) {
-+ *ruleset = osdmap.crush->get_rule_mask_ruleset(ruleid);
- return -EEXIST;
-+ }
-
- CrushWrapper newcrush;
- _get_pending_crush(newcrush);
-
-- *ruleset = newcrush.get_rule_id(name);
-- if (*ruleset != -ENOENT) {
-+ ruleid = newcrush.get_rule_id(name);
-+ if (ruleid != -ENOENT) {
-+ *ruleset = newcrush.get_rule_mask_ruleset(ruleid);
- return -EALREADY;
- } else {
- ErasureCodeInterfaceRef erasure_code;
- int err = get_erasure_code(profile, &erasure_code, ss);
-@@ -3088,22 +3213,25 @@
- }
-
- int OSDMonitor::prepare_pool_size(const unsigned pool_type,
- const string &erasure_code_profile,
-- unsigned *size,
-+ unsigned *size, unsigned *min_size,
- stringstream &ss)
- {
- int err = 0;
- switch (pool_type) {
- case pg_pool_t::TYPE_REPLICATED:
- *size = g_conf->osd_pool_default_size;
-+ *min_size = g_conf->get_osd_pool_default_min_size();
- break;
- case pg_pool_t::TYPE_ERASURE:
- {
- ErasureCodeInterfaceRef erasure_code;
- err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
-- if (err == 0)
-+ if (err == 0) {
- *size = erasure_code->get_chunk_count();
-+ *min_size = erasure_code->get_data_chunk_count();
-+ }
- }
- break;
- default:
- ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
-@@ -3218,10 +3346,10 @@
- r = prepare_pool_crush_ruleset(pool_type, erasure_code_profile,
- crush_ruleset_name, &crush_ruleset, ss);
- if (r)
- return r;
-- unsigned size;
-- r = prepare_pool_size(pool_type, erasure_code_profile, &size, ss);
-+ unsigned size, min_size;
-+ r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
- if (r)
- return r;
- uint32_t stripe_width = 0;
- r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
-@@ -3245,9 +3373,9 @@
- if (g_conf->osd_pool_default_flag_hashpspool)
- pi->flags |= pg_pool_t::FLAG_HASHPSPOOL;
-
- pi->size = size;
-- pi->min_size = g_conf->get_osd_pool_default_min_size();
-+ pi->min_size = min_size;
- pi->crush_ruleset = crush_ruleset;
- pi->object_hash = CEPH_STR_HASH_RJENKINS;
- pi->set_pg_num(pg_num ? pg_num : g_conf->osd_pool_default_pg_num);
- pi->set_pgp_num(pgp_num ? pgp_num : g_conf->osd_pool_default_pgp_num);
-@@ -3335,8 +3463,9 @@
- string val;
- string interr, floaterr;
- int64_t n = 0;
- double f = 0;
-+ int64_t uf = 0; // micro-f
- if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
- // wasn't a string; maybe an older mon forwarded json with an int?
- if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
- return -EINVAL; // no value!
-@@ -3344,8 +3473,19 @@
- // we got a string. see if it contains an int.
- n = strict_strtoll(val.c_str(), 10, &interr);
- // or a float
- f = strict_strtod(val.c_str(), &floaterr);
-+ uf = llrintl(f * (double)1000000.0);
-+ }
-+
-+ if (!p.is_tier() &&
-+ (var == "hit_set_type" || var == "hit_set_period" ||
-+ var == "hit_set_count" || var == "hit_set_fpp" ||
-+ var == "target_max_objects" || var == "target_max_bytes" ||
-+ var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
-+ var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
-+ ss << "pool '" << poolstr << "' is not a tier pool: variable not applicable";
-+ return -EACCES;
- }
-
- if (var == "size") {
- if (p.type == pg_pool_t::TYPE_ERASURE) {
-@@ -3398,9 +3538,9 @@
- force != "--yes-i-really-mean-it") {
- ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
- return -EPERM;
- }
-- int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
-+ int expected_osds = MAX(1, MIN(p.get_pg_num(), osdmap.get_num_osds()));
- int64_t new_pgs = n - p.get_pg_num();
- int64_t pgs_per_osd = new_pgs / expected_osds;
- if (pgs_per_osd > g_conf->mon_osd_max_split_count) {
- ss << "specified pg_num " << n << " is too large (creating "
-@@ -3486,8 +3626,9 @@
- return -EINVAL;
- }
- p.hit_set_period = n;
- } else if (var == "hit_set_count") {
-+
- if (interr.length()) {
- ss << "error parsing integer value '" << val << "': " << interr;
- return -EINVAL;
- }
-@@ -3527,9 +3668,9 @@
- if (f < 0 || f > 1.0) {
- ss << "value must be in the range 0..1";
- return -ERANGE;
- }
-- p.cache_target_dirty_ratio_micro = f * 1000000;
-+ p.cache_target_dirty_ratio_micro = uf;
- } else if (var == "cache_target_full_ratio") {
- if (floaterr.length()) {
- ss << "error parsing float '" << val << "': " << floaterr;
- return -EINVAL;
-@@ -3537,9 +3678,9 @@
- if (f < 0 || f > 1.0) {
- ss << "value must be in the range 0..1";
- return -ERANGE;
- }
-- p.cache_target_full_ratio_micro = f * 1000000;
-+ p.cache_target_full_ratio_micro = uf;
- } else if (var == "cache_min_flush_age") {
- if (interr.length()) {
- ss << "error parsing int '" << val << "': " << interr;
- return -EINVAL;
-@@ -4171,8 +4312,26 @@
- string profile;
- cmd_getval(g_ceph_context, cmdmap, "profile", profile);
- if (profile == "")
- profile = "default";
-+ if (profile == "default") {
-+ if (!osdmap.has_erasure_code_profile(profile)) {
-+ if (pending_inc.has_erasure_code_profile(profile)) {
-+ dout(20) << "erasure code profile " << profile << " already pending" << dendl;
-+ goto wait;
-+ }
-+
-+ map<string,string> profile_map;
-+ err = osdmap.get_erasure_code_profile_default(g_ceph_context,
-+ profile_map,
-+ &ss);
-+ if (err)
-+ goto reply;
-+ dout(20) << "erasure code profile " << profile << " set" << dendl;
-+ pending_inc.set_erasure_code_profile(profile, profile_map);
-+ goto wait;
-+ }
-+ }
-
- int ruleset;
- err = crush_ruleset_create_erasure(name, profile, &ruleset, ss);
- if (err < 0) {
-@@ -4846,8 +5005,27 @@
- string erasure_code_profile;
- cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
- if (erasure_code_profile == "")
- erasure_code_profile = "default";
-+ if (erasure_code_profile == "default") {
-+ if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
-+ if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
-+ dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
-+ goto wait;
-+ }
-+
-+ map<string,string> profile_map;
-+ err = osdmap.get_erasure_code_profile_default(g_ceph_context,
-+ profile_map,
-+ &ss);
-+ if (err)
-+ goto reply;
-+ dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
-+ pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
-+ goto wait;
-+ }
-+ }
-+
- if (ruleset_name == "") {
- if (erasure_code_profile == "default") {
- ruleset_name = "erasure-code";
- } else {
-@@ -5053,9 +5231,12 @@
- err = 0;
- goto reply;
- }
- if (tp->tier_of != pool_id) {
-- ss << "tier pool '" << tierpoolstr << "' is a tier of '" << tp->tier_of << "'";
-+ ss << "tier pool '" << tierpoolstr << "' is a tier of '"
-+ << osdmap.get_pool_name(tp->tier_of) << "': "
-+ // be scary about it; this is an inconsistency and bells must go off
-+ << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
- err = -EINVAL;
- goto reply;
- }
- if (p->read_tier == tierpool_id) {
-@@ -5181,10 +5362,69 @@
- ss << "'" << modestr << "' is not a valid cache mode";
- err = -EINVAL;
- goto reply;
- }
-+
-+ // pool already has this cache-mode set and there are no pending changes
-+ if (p->cache_mode == mode &&
-+ (pending_inc.new_pools.count(pool_id) == 0 ||
-+ pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
-+ ss << "set cache-mode for pool '" << poolstr << "'"
-+ << " to " << pg_pool_t::get_cache_mode_name(mode);
-+ err = 0;
-+ goto reply;
-+ }
-+
-+ /* Mode description:
-+ *
-+ * none: No cache-mode defined
-+ * forward: Forward all reads and writes to base pool
-+ * writeback: Cache writes, promote reads from base pool
-+ * readonly: Forward writes to base pool
-+ *
-+ * Hence, these are the allowed transitions:
-+ *
-+ * none -> any
-+ * forward -> writeback || any IF num_objects_dirty == 0
-+ * writeback -> forward
-+ * readonly -> any
-+ */
-+
-+ // We check if the transition is valid against the current pool mode, as
-+ // it is the only committed state thus far. We will blatantly squash
-+ // whatever mode is on the pending state.
-+
-+ if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
-+ mode != pg_pool_t::CACHEMODE_FORWARD) {
-+ ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
-+ << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
-+ << "' pool; only '"
-+ << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
-+ << "' allowed.";
-+ err = -EINVAL;
-+ goto reply;
-+ }
-+ if (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
-+ mode != pg_pool_t::CACHEMODE_WRITEBACK) {
-+
-+ const pool_stat_t& tier_stats =
-+ mon->pgmon()->pg_map.get_pg_pool_sum_stat(pool_id);
-+
-+ if (tier_stats.stats.sum.num_objects_dirty > 0) {
-+ ss << "unable to set cache-mode '"
-+ << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
-+ << "': dirty objects found";
-+ err = -EBUSY;
-+ goto reply;
-+ }
-+ }
-+
- // go
-- pending_inc.get_new_pool(pool_id, p)->cache_mode = mode;
-+ pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
-+ np->cache_mode = mode;
-+ // set this both when moving to and from cache_mode NONE. this is to
-+ // capture legacy pools that were set up before this flag existed.
-+ np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
- ss << "set cache-mode for pool '" << poolstr
- << "' to " << pg_pool_t::get_cache_mode_name(mode);
- wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
- get_last_committed() + 1));
-@@ -5622,10 +5862,14 @@
- << osdmap.get_pool_name(p->tier_of) << "'";
- return -EBUSY;
- }
- if (!p->tiers.empty()) {
-- *ss << "pool '" << poolstr << "' includes tiers "
-- << p->tiers;
-+ *ss << "pool '" << poolstr << "' has tiers";
-+ for(std::set<uint64_t>::iterator i = p->tiers.begin(); i != p->tiers.end(); ++i) {
-+ const char *name = osdmap.get_pool_name(*i);
-+ assert(name != NULL);
-+ *ss << " " << name;
-+ }
- return -EBUSY;
- }
- *ss << "pool '" << poolstr << "' removed";
- return 0;
---- a/src/mon/OSDMonitor.h
-+++ b/src/mon/OSDMonitor.h
-@@ -271,9 +271,9 @@
- map<string,string> *erasure_code_profile_map,
- stringstream &ss);
- int prepare_pool_size(const unsigned pool_type,
- const string &erasure_code_profile,
-- unsigned *size,
-+ unsigned *size, unsigned *min_size,
- stringstream &ss);
- int prepare_pool_stripe_width(const unsigned pool_type,
- const string &erasure_code_profile,
- unsigned *stripe_width,
---- a/src/mon/PGMonitor.cc
-+++ b/src/mon/PGMonitor.cc
-@@ -1214,13 +1214,15 @@
- }
-
- //void PGMonitor::dump_object_stat_sum(stringstream& ss, Formatter *f,
- void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f,
-- object_stat_sum_t &sum, bool verbose)
-+ object_stat_sum_t &sum, uint64_t avail,
-+ bool verbose)
- {
- if (f) {
- f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
- f->dump_int("bytes_used", sum.num_bytes);
-+ f->dump_unsigned("max_avail", avail);
- f->dump_int("objects", sum.num_objects);
- if (verbose) {
- f->dump_int("dirty", sum.num_objects_dirty);
- f->dump_int("rd", sum.num_rd);
-@@ -1231,8 +1233,9 @@
- } else {
- tbl << stringify(si_t(sum.num_bytes));
- int64_t kb_used = SHIFT_ROUND_UP(sum.num_bytes, 10);
- tbl << percentify(((float)kb_used / pg_map.osd_sum.kb)*100);
-+ tbl << si_t(avail);
- tbl << sum.num_objects;
- if (verbose) {
- tbl << stringify(si_t(sum.num_objects_dirty))
- << stringify(si_t(sum.num_rd))
-@@ -1240,8 +1243,26 @@
- }
- }
- }
-
-+int64_t PGMonitor::get_rule_avail(OSDMap& osdmap, int ruleno)
-+{
-+ map<int,float> wm;
-+ int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
-+ if (r < 0)
-+ return r;
-+ if(wm.size() == 0)
-+ return 0;
-+ int64_t min = -1;
-+ for (map<int,float>::iterator p = wm.begin(); p != wm.end(); ++p) {
-+ int64_t proj = (float)(pg_map.osd_stat[p->first].kb_avail * 1024ull) /
-+ (double)p->second;
-+ if (min < 0 || proj < min)
-+ min = proj;
-+ }
-+ return min;
-+}
-+
- void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
- {
- TextTable tbl;
-
-@@ -1251,18 +1272,20 @@
- tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
- tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
- if (verbose)
- tbl.define_column("CATEGORY", TextTable::LEFT, TextTable::LEFT);
-- tbl.define_column("USED", TextTable::LEFT, TextTable::LEFT);
-- tbl.define_column("\%USED", TextTable::LEFT, TextTable::LEFT);
-- tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::LEFT);
-- if (verbose) {
-- tbl.define_column("DIRTY", TextTable::LEFT, TextTable::LEFT);
-- tbl.define_column("READ", TextTable::LEFT, TextTable::LEFT);
-- tbl.define_column("WRITE", TextTable::LEFT, TextTable::LEFT);
-+ tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
-+ tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
-+ tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
-+ tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
-+ if (verbose) {
-+ tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
-+ tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
-+ tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
- }
- }
-
-+ map<int,uint64_t> avail_by_rule;
- OSDMap &osdmap = mon->osdmon()->osdmap;
- for (map<int64_t,pg_pool_t>::const_iterator p = osdmap.get_pools().begin();
- p != osdmap.get_pools().end(); ++p) {
- int64_t pool_id = p->first;
-@@ -1270,8 +1293,40 @@
- continue;
- string pool_name = osdmap.get_pool_name(pool_id);
- pool_stat_t &stat = pg_map.pg_pool_sum[pool_id];
-
-+ const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
-+ int ruleno = osdmap.crush->find_rule(pool->get_crush_ruleset(),
-+ pool->get_type(),
-+ pool->get_size());
-+ uint64_t avail;
-+ if (avail_by_rule.count(ruleno) == 0) {
-+ avail = get_rule_avail(osdmap, ruleno);
-+ avail_by_rule[ruleno] = avail;
-+ } else {
-+ avail = avail_by_rule[ruleno];
-+ }
-+ switch (pool->get_type()) {
-+ case pg_pool_t::TYPE_REPLICATED:
-+ avail /= pool->get_size();
-+ break;
-+ case pg_pool_t::TYPE_ERASURE:
-+ {
-+ const map<string,string>& ecp =
-+ osdmap.get_erasure_code_profile(pool->erasure_code_profile);
-+ map<string,string>::const_iterator pm = ecp.find("m");
-+ map<string,string>::const_iterator pk = ecp.find("k");
-+ if (pm != ecp.end() && pk != ecp.end()) {
-+ int k = atoi(pk->second.c_str());
-+ int m = atoi(pm->second.c_str());
-+ avail = avail * k / (m + k);
-+ }
-+ }
-+ break;
-+ default:
-+ assert(0 == "unrecognized pool type");
-+ }
-+
- if (f) {
- f->open_object_section("pool");
- f->dump_string("name", pool_name);
- f->dump_int("id", pool_id);
-@@ -1281,9 +1336,9 @@
- << pool_id;
- if (verbose)
- tbl << "-";
- }
-- dump_object_stat_sum(tbl, f, stat.stats.sum, verbose);
-+ dump_object_stat_sum(tbl, f, stat.stats.sum, avail, verbose);
- if (f)
- f->close_section(); // stats
- else
- tbl << TextTable::endrow;
-@@ -1300,9 +1355,9 @@
- tbl << ""
- << ""
- << it->first;
- }
-- dump_object_stat_sum(tbl, f, it->second, verbose);
-+ dump_object_stat_sum(tbl, f, it->second, avail, verbose);
- if (f)
- f->close_section(); // category name
- else
- tbl << TextTable::endrow;
-@@ -1334,14 +1389,14 @@
- }
- f->close_section();
- } else {
- TextTable tbl;
-- tbl.define_column("SIZE", TextTable::LEFT, TextTable::LEFT);
-- tbl.define_column("AVAIL", TextTable::LEFT, TextTable::LEFT);
-- tbl.define_column("RAW USED", TextTable::LEFT, TextTable::LEFT);
-- tbl.define_column("\%RAW USED", TextTable::LEFT, TextTable::LEFT);
-+ tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
-+ tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
-+ tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
-+ tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
- if (verbose) {
-- tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::LEFT);
-+ tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
- }
- tbl << stringify(si_t(pg_map.osd_sum.kb*1024))
- << stringify(si_t(pg_map.osd_sum.kb_avail*1024))
- << stringify(si_t(pg_map.osd_sum.kb_used*1024));
---- a/src/mon/PGMonitor.h
-+++ b/src/mon/PGMonitor.h
-@@ -145,9 +145,13 @@
- int threshold,
- vector<string>& args) const;
-
- void dump_object_stat_sum(TextTable &tbl, Formatter *f,
-- object_stat_sum_t &sum, bool verbose);
-+ object_stat_sum_t &sum,
-+ uint64_t avail,
-+ bool verbose);
-+
-+ int64_t get_rule_avail(OSDMap& osdmap, int ruleno);
-
- public:
- PGMonitor(Monitor *mn, Paxos *p, const string& service_name)
- : PaxosService(mn, p, service_name),
---- a/src/mon/Paxos.cc
-+++ b/src/mon/Paxos.cc
-@@ -1263,9 +1263,10 @@
- // -- READ --
-
- bool Paxos::is_readable(version_t v)
- {
-- dout(1) << "is_readable now=" << ceph_clock_now(g_ceph_context) << " lease_expire=" << lease_expire
-+ dout(5) << "is_readable now=" << ceph_clock_now(g_ceph_context)
-+ << " lease_expire=" << lease_expire
- << " has v" << v << " lc " << last_committed << dendl;
- if (v > last_committed)
- return false;
- return
---- a/src/msg/SimpleMessenger.cc
-+++ b/src/msg/SimpleMessenger.cc
-@@ -85,8 +85,11 @@
- {
- ldout(cct,10) << "shutdown " << get_myaddr() << dendl;
- mark_down_all();
- dispatch_queue.shutdown();
-+
-+ // break ref cycles on the loopback connection
-+ local_connection->set_priv(NULL);
- return 0;
- }
-
- int SimpleMessenger::_send_message(Message *m, const entity_inst_t& dest,
---- a/src/os/FileJournal.cc
-+++ b/src/os/FileJournal.cc
-@@ -1757,9 +1757,14 @@
-
- // ok!
- if (seq)
- *seq = h->seq;
-- journalq.push_back(pair<uint64_t,off64_t>(h->seq, pos));
-+
-+ // works around an apparent GCC 4.8(?) compiler bug about unaligned
-+ // bind by reference to (packed) h->seq
-+ journalq.push_back(
-+ pair<uint64_t,off64_t>(static_cast<uint64_t>(h->seq),
-+ static_cast<off64_t>(pos)));
-
- if (next_pos)
- *next_pos = pos;
-
---- a/src/os/FileStore.cc
-+++ b/src/os/FileStore.cc
-@@ -125,9 +125,9 @@
- PerfCounters &logger)
- {
- os_commit_latency.consume_next(
- logger.get_tavg_ms(
-- l_os_commit_lat));
-+ l_os_j_lat));
- os_apply_latency.consume_next(
- logger.get_tavg_ms(
- l_os_apply_lat));
- }
-@@ -1557,8 +1557,10 @@
- delete backend;
- backend = generic_backend;
- }
-
-+ force_sync = false;
-+
- object_map.reset();
-
- {
- Mutex::Locker l(sync_entry_timeo_lock);
-@@ -1710,9 +1712,10 @@
- }
-
- void FileStore::_finish_op(OpSequencer *osr)
- {
-- Op *o = osr->dequeue();
-+ list<Context*> to_queue;
-+ Op *o = osr->dequeue(&to_queue);
-
- dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << dendl;
- osr->apply_lock.Unlock(); // locked in _do_op
-
-@@ -1728,8 +1731,9 @@
- }
- if (o->onreadable) {
- op_finisher.queue(o->onreadable);
- }
-+ op_finisher.queue(to_queue);
- delete o;
- }
-
-
-@@ -1843,16 +1847,18 @@
-
- // this should queue in order because the journal does it's completions in order.
- queue_op(osr, o);
-
-- osr->dequeue_journal();
-+ list<Context*> to_queue;
-+ osr->dequeue_journal(&to_queue);
-
- // do ondisk completions async, to prevent any onreadable_sync completions
- // getting blocked behind an ondisk completion.
- if (ondisk) {
- dout(10) << " queueing ondisk " << ondisk << dendl;
- ondisk_finisher.queue(ondisk);
- }
-+ ondisk_finisher.queue(to_queue);
- }
-
- int FileStore::_do_transactions(
- list<Transaction*> &tls,
-@@ -2544,13 +2550,14 @@
- t.dump(&f);
- f.close_section();
- f.flush(*_dout);
- *_dout << dendl;
-- assert(0 == "unexpected error");
-
- if (r == -EMFILE) {
- dump_open_fds(g_ceph_context);
- }
-+
-+ assert(0 == "unexpected error");
- }
- }
-
- spos.op++;
---- a/src/os/FileStore.h
-+++ b/src/os/FileStore.h
-@@ -192,21 +192,72 @@
- class OpSequencer : public Sequencer_impl {
- Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
- list<Op*> q;
- list<uint64_t> jq;
-+ list<pair<uint64_t, Context*> > flush_commit_waiters;
- Cond cond;
- public:
- Sequencer *parent;
- Mutex apply_lock; // for apply mutual exclusion
-
-+ /// get_max_uncompleted
-+ bool _get_max_uncompleted(
-+ uint64_t *seq ///< [out] max uncompleted seq
-+ ) {
-+ assert(qlock.is_locked());
-+ assert(seq);
-+ *seq = 0;
-+ if (q.empty() && jq.empty())
-+ return true;
-+
-+ if (!q.empty())
-+ *seq = q.back()->op;
-+ if (!jq.empty() && jq.back() > *seq)
-+ *seq = jq.back();
-+
-+ return false;
-+ } /// @returns true if both queues are empty
-+
-+ /// get_min_uncompleted
-+ bool _get_min_uncompleted(
-+ uint64_t *seq ///< [out] min uncompleted seq
-+ ) {
-+ assert(qlock.is_locked());
-+ assert(seq);
-+ *seq = 0;
-+ if (q.empty() && jq.empty())
-+ return true;
-+
-+ if (!q.empty())
-+ *seq = q.front()->op;
-+ if (!jq.empty() && jq.front() < *seq)
-+ *seq = jq.front();
-+
-+ return false;
-+ } /// @returns true if both queues are empty
-+
-+ void _wake_flush_waiters(list<Context*> *to_queue) {
-+ uint64_t seq;
-+ if (_get_min_uncompleted(&seq))
-+ seq = -1;
-+
-+ for (list<pair<uint64_t, Context*> >::iterator i =
-+ flush_commit_waiters.begin();
-+ i != flush_commit_waiters.end() && i->first < seq;
-+ flush_commit_waiters.erase(i++)) {
-+ to_queue->push_back(i->second);
-+ }
-+ }
-+
- void queue_journal(uint64_t s) {
- Mutex::Locker l(qlock);
- jq.push_back(s);
- }
-- void dequeue_journal() {
-+ void dequeue_journal(list<Context*> *to_queue) {
- Mutex::Locker l(qlock);
- jq.pop_front();
- cond.Signal();
-+ _wake_flush_waiters(to_queue);
- }
- void queue(Op *o) {
- Mutex::Locker l(qlock);
- q.push_back(o);
-@@ -214,22 +265,28 @@
- Op *peek_queue() {
- assert(apply_lock.is_locked());
- return q.front();
- }
-- Op *dequeue() {
-+
-+ Op *dequeue(list<Context*> *to_queue) {
-+ assert(to_queue);
- assert(apply_lock.is_locked());
- Mutex::Locker l(qlock);
- Op *o = q.front();
- q.pop_front();
- cond.Signal();
-+
-+ _wake_flush_waiters(to_queue);
- return o;
- }
-+
- void flush() {
- Mutex::Locker l(qlock);
-
- while (g_conf->filestore_blackhole)
- cond.Wait(qlock); // wait forever
-
-+
- // get max for journal _or_ op queues
- uint64_t seq = 0;
- if (!q.empty())
- seq = q.back()->op;
-@@ -242,8 +299,19 @@
- (!jq.empty() && jq.front() <= seq))
- cond.Wait(qlock);
- }
- }
-+ bool flush_commit(Context *c) {
-+ Mutex::Locker l(qlock);
-+ uint64_t seq = 0;
-+ if (_get_max_uncompleted(&seq)) {
-+ delete c;
-+ return true;
-+ } else {
-+ flush_commit_waiters.push_back(make_pair(seq, c));
-+ return false;
-+ }
-+ }
-
- OpSequencer()
- : qlock("FileStore::OpSequencer::qlock", false, false),
- parent(0),
---- a/src/os/GenericObjectMap.cc
-+++ b/src/os/GenericObjectMap.cc
-@@ -688,10 +688,8 @@
- remove_header(old_header->cid, old_header->oid, old_header, t);
- old_header->cid = cid;
- old_header->oid = target;
- set_header(cid, target, *old_header, t);
--
-- // "in_use" still hold the "seq"
- }
-
- int GenericObjectMap::init(bool do_upgrade)
- {
-@@ -925,64 +923,43 @@
- set<string> to_get;
- to_get.insert(header_key(cid, oid));
- _Header header;
-
-- while (1) {
-- map<string, bufferlist> out;
-- bool try_again = false;
-+ map<string, bufferlist> out;
-
-- int r = db->get(GHOBJECT_TO_SEQ_PREFIX, to_get, &out);
-- if (r < 0)
-- return Header();
-- if (out.empty())
-- return Header();
--
-- bufferlist::iterator iter = out.begin()->second.begin();
-- header.decode(iter);
--
-- while (in_use.count(header.seq)) {
-- header_cond.Wait(header_lock);
--
-- // Another thread is hold this header, wait for it.
-- // Because the seq of this object may change, such as clone
-- // and rename operation, here need to look up "seq" again
-- try_again = true;
-- }
-+ int r = db->get(GHOBJECT_TO_SEQ_PREFIX, to_get, &out);
-+ if (r < 0)
-+ return Header();
-+ if (out.empty())
-+ return Header();
-
-- if (!try_again) {
-- break;
-- }
-- }
-+ bufferlist::iterator iter = out.begin()->second.begin();
-+ header.decode(iter);
-
-- Header ret = Header(new _Header(header), RemoveOnDelete(this));
-- in_use.insert(ret->seq);
-+ Header ret = Header(new _Header(header));
- return ret;
- }
-
- GenericObjectMap::Header GenericObjectMap::_generate_new_header(
- const coll_t &cid, const ghobject_t &oid, Header parent,
- KeyValueDB::Transaction t)
- {
-- Header header = Header(new _Header(), RemoveOnDelete(this));
-+ Header header = Header(new _Header());
- header->seq = state.seq++;
- if (parent) {
- header->parent = parent->seq;
- }
- header->num_children = 1;
- header->oid = oid;
- header->cid = cid;
-- assert(!in_use.count(header->seq));
-- in_use.insert(header->seq);
-
- write_state(t);
- return header;
- }
-
- GenericObjectMap::Header GenericObjectMap::lookup_parent(Header input)
- {
- Mutex::Locker l(header_lock);
-- while (in_use.count(input->parent))
-- header_cond.Wait(header_lock);
- map<string, bufferlist> out;
- set<string> keys;
- keys.insert(PARENT_KEY);
-
-@@ -998,15 +975,14 @@
- assert(0);
- return Header();
- }
-
-- Header header = Header(new _Header(), RemoveOnDelete(this));
-+ Header header = Header(new _Header());
- header->seq = input->parent;
- bufferlist::iterator iter = out.begin()->second.begin();
- header->decode(iter);
- dout(20) << "lookup_parent: parent seq is " << header->seq << " with parent "
- << header->parent << dendl;
-- in_use.insert(header->seq);
- return header;
- }
-
- GenericObjectMap::Header GenericObjectMap::lookup_create_header(
---- a/src/os/GenericObjectMap.h
-+++ b/src/os/GenericObjectMap.h
-@@ -73,14 +73,8 @@
- /**
- * Serializes access to next_seq as well as the in_use set
- */
- Mutex header_lock;
-- Cond header_cond;
--
-- /**
-- * Set of headers currently in use
-- */
-- set<uint64_t> in_use;
-
- GenericObjectMap(KeyValueDB *db) : db(db), header_lock("GenericObjectMap") {}
-
- int get(
-@@ -370,8 +364,14 @@
- GenericObjectMapIterator _get_iterator(Header header, string prefix) {
- return GenericObjectMapIterator(new GenericObjectMapIteratorImpl(this, header, prefix));
- }
-
-+ Header generate_new_header(const coll_t &cid, const ghobject_t &oid,
-+ Header parent, KeyValueDB::Transaction t) {
-+ Mutex::Locker l(header_lock);
-+ return _generate_new_header(cid, oid, parent, t);
-+ }
-+
- // Scan keys in header into out_keys and out_values (if nonnull)
- int scan(Header header, const string &prefix, const set<string> &in_keys,
- set<string> *out_keys, map<string, bufferlist> *out_values);
-
-@@ -393,13 +393,8 @@
- * Has the side effect of syncronously saving the new GenericObjectMap state
- */
- Header _generate_new_header(const coll_t &cid, const ghobject_t &oid,
- Header parent, KeyValueDB::Transaction t);
-- Header generate_new_header(const coll_t &cid, const ghobject_t &oid,
-- Header parent, KeyValueDB::Transaction t) {
-- Mutex::Locker l(header_lock);
-- return _generate_new_header(cid, oid, parent, t);
-- }
-
- // Lookup leaf header for c oid
- Header _lookup_header(const coll_t &cid, const ghobject_t &oid);
-
-@@ -424,28 +419,8 @@
-
- // Sets header @see set_header
- void _set_header(Header header, const bufferlist &bl,
- KeyValueDB::Transaction t);
--
-- /**
-- * Removes header seq lock once Header is out of scope
-- * @see _lookup_header
-- * @see lookup_parent
-- * @see generate_new_header
-- */
-- class RemoveOnDelete {
-- public:
-- GenericObjectMap *db;
-- RemoveOnDelete(GenericObjectMap *db) :
-- db(db) {}
-- void operator() (_Header *header) {
-- Mutex::Locker l(db->header_lock);
-- db->in_use.erase(header->seq);
-- db->header_cond.Signal();
-- delete header;
-- }
-- };
-- friend class RemoveOnDelete;
- };
- WRITE_CLASS_ENCODER(GenericObjectMap::_Header)
- WRITE_CLASS_ENCODER(GenericObjectMap::State)
-
---- a/src/os/KeyValueStore.cc
-+++ b/src/os/KeyValueStore.cc
-@@ -68,90 +68,78 @@
- const string KeyValueStore::COLLECTION_ATTR = "__COLL_ATTR__";
-
- // ============== StripObjectMap Implementation =================
-
--void StripObjectMap::sync_wrap(StripObjectHeader &strip_header,
-- KeyValueDB::Transaction t,
-- const SequencerPosition &spos)
--{
-- dout(10) << __func__ << " cid: " << strip_header.cid << "oid: "
-- << strip_header.oid << " setting spos to " << strip_header.spos
-- << dendl;
-- strip_header.spos = spos;
-- strip_header.header->data.clear();
-- ::encode(strip_header, strip_header.header->data);
--
-- sync(strip_header.header, t);
--}
--
--bool StripObjectMap::check_spos(const StripObjectHeader &header,
-- const SequencerPosition &spos)
--{
-- if (spos > header.spos) {
-- stringstream out;
-- dout(10) << "cid: " << "oid: " << header.oid
-- << " not skipping op, *spos " << spos << dendl;
-- dout(10) << " > header.spos " << header.spos << dendl;
-- return false;
-- } else {
-- dout(10) << "cid: " << "oid: " << header.oid << " skipping op, spos "
-- << spos << " <= header.spos " << header.spos << dendl;
-- return true;
-- }
--}
--
--int StripObjectMap::save_strip_header(StripObjectHeader &strip_header,
-- const SequencerPosition &spos,
-+int StripObjectMap::save_strip_header(StripObjectHeaderRef strip_header,
- KeyValueDB::Transaction t)
- {
-- strip_header.spos = spos;
-- strip_header.header->data.clear();
-- ::encode(strip_header, strip_header.header->data);
-+ strip_header->header->data.clear();
-+ ::encode(*strip_header, strip_header->header->data);
-
-- set_header(strip_header.cid, strip_header.oid, *(strip_header.header), t);
-+ set_header(strip_header->cid, strip_header->oid, *(strip_header->header), t);
- return 0;
- }
-
- int StripObjectMap::create_strip_header(const coll_t &cid,
- const ghobject_t &oid,
-- StripObjectHeader &strip_header,
-+ StripObjectHeaderRef *strip_header,
- KeyValueDB::Transaction t)
- {
-- Header header = lookup_create_header(cid, oid, t);
-+ Header header = generate_new_header(cid, oid, Header(), t);
- if (!header)
- return -EINVAL;
-
-- strip_header.oid = oid;
-- strip_header.cid = cid;
-- strip_header.header = header;
-+ StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
-+ tmp->oid = oid;
-+ tmp->cid = cid;
-+ tmp->header = header;
-+ if (strip_header)
-+ *strip_header = tmp;
-
- return 0;
- }
-
- int StripObjectMap::lookup_strip_header(const coll_t &cid,
- const ghobject_t &oid,
-- StripObjectHeader &strip_header)
-+ StripObjectHeaderRef *strip_header)
- {
-+ if (cid != coll_t()) {
-+ Mutex::Locker l(lock);
-+ pair<coll_t, StripObjectHeaderRef> p;
-+ if (caches.lookup(oid, &p)) {
-+ if (p.first == cid) {
-+ *strip_header = p.second;
-+ return 0;
-+ }
-+ }
-+ }
- Header header = lookup_header(cid, oid);
-
- if (!header) {
- dout(20) << "lookup_strip_header failed to get strip_header "
- << " cid " << cid <<" oid " << oid << dendl;
- return -ENOENT;
- }
-
-+
-+ StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
- if (header->data.length()) {
- bufferlist::iterator bliter = header->data.begin();
-- ::decode(strip_header, bliter);
-+ ::decode(*tmp, bliter);
- }
-
-- if (strip_header.strip_size == 0)
-- strip_header.strip_size = default_strip_size;
-+ if (tmp->strip_size == 0)
-+ tmp->strip_size = default_strip_size;
-
-- strip_header.oid = oid;
-- strip_header.cid = cid;
-- strip_header.header = header;
-+ tmp->oid = oid;
-+ tmp->cid = cid;
-+ tmp->header = header;
-
-+ {
-+ Mutex::Locker l(lock);
-+ caches.add(oid, make_pair(cid, tmp));
-+ }
-+ *strip_header = tmp;
- dout(10) << "lookup_strip_header done " << " cid " << cid << " oid "
- << oid << dendl;
- return 0;
- }
-@@ -193,125 +181,114 @@
- dout(10) << "file_to_extents done " << dendl;
- return 0;
- }
-
--void StripObjectMap::clone_wrap(StripObjectHeader &old_header,
-+void StripObjectMap::clone_wrap(StripObjectHeaderRef old_header,
- const coll_t &cid, const ghobject_t &oid,
- KeyValueDB::Transaction t,
-- StripObjectHeader *origin_header,
-- StripObjectHeader *target_header)
-+ StripObjectHeaderRef *target_header)
- {
- Header new_origin_header;
-+ StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
-
-- if (target_header)
-- *target_header = old_header;
-- if (origin_header)
-- *origin_header = old_header;
--
-- clone(old_header.header, cid, oid, t, &new_origin_header,
-- &target_header->header);
-+ clone(old_header->header, cid, oid, t, &new_origin_header,
-+ &tmp->header);
-
-- if(origin_header)
-- origin_header->header = new_origin_header;
-+ tmp->oid = oid;
-+ tmp->cid = cid;
-+ tmp->strip_size = old_header->strip_size;
-+ tmp->max_size = old_header->max_size;
-+ tmp->bits = old_header->bits;
-+ old_header->header = new_origin_header;
-
-- if (target_header) {
-- target_header->oid = oid;
-- target_header->cid = cid;
-- }
-+ if (target_header)
-+ *target_header = tmp;
- }
-
--void StripObjectMap::rename_wrap(const coll_t &cid, const ghobject_t &oid,
-+void StripObjectMap::rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid,
- KeyValueDB::Transaction t,
-- StripObjectHeader *header)
-+ StripObjectHeaderRef *new_header)
- {
-- assert(header);
-- rename(header->header, cid, oid, t);
-+ rename(old_header->header, cid, oid, t);
-
-- if (header) {
-- header->oid = oid;
-- header->cid = cid;
-- }
-+ StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
-+ tmp->strip_size = old_header->strip_size;
-+ tmp->max_size = old_header->max_size;
-+ tmp->bits = old_header->bits;
-+ tmp->header = old_header->header;
-+ tmp->oid = oid;
-+ tmp->cid = cid;
-+
-+ if (new_header)
-+ *new_header = tmp;
-+
-+ old_header->header = Header();
-+ old_header->deleted = true;
- }
-
--int StripObjectMap::get_values_with_header(const StripObjectHeader &header,
-+int StripObjectMap::get_values_with_header(const StripObjectHeaderRef header,
- const string &prefix,
- const set<string> &keys,
- map<string, bufferlist> *out)
- {
-- return scan(header.header, prefix, keys, 0, out);
-+ return scan(header->header, prefix, keys, 0, out);
- }
-
--int StripObjectMap::get_keys_with_header(const StripObjectHeader &header,
-+int StripObjectMap::get_keys_with_header(const StripObjectHeaderRef header,
- const string &prefix,
- set<string> *keys)
- {
-- ObjectMap::ObjectMapIterator iter = _get_iterator(header.header, prefix);
-+ ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix);
- for (; iter->valid(); iter->next()) {
- if (iter->status())
- return iter->status();
- keys->insert(iter->key());
- }
- return 0;
- }
-
--int StripObjectMap::get_with_header(const StripObjectHeader &header,
-+int StripObjectMap::get_with_header(const StripObjectHeaderRef header,
- const string &prefix, map<string, bufferlist> *out)
- {
-- ObjectMap::ObjectMapIterator iter = _get_iterator(header.header, prefix);
-+ ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix);
- for (iter->seek_to_first(); iter->valid(); iter->next()) {
- if (iter->status())
- return iter->status();
- out->insert(make_pair(iter->key(), iter->value()));
- }
-
- return 0;
- }
--// =========== KeyValueStore::SubmitManager Implementation ==============
--
--uint64_t KeyValueStore::SubmitManager::op_submit_start()
--{
-- lock.Lock();
-- uint64_t op = ++op_seq;
-- dout(10) << "op_submit_start " << op << dendl;
-- return op;
--}
--
--void KeyValueStore::SubmitManager::op_submit_finish(uint64_t op)
--{
-- dout(10) << "op_submit_finish " << op << dendl;
-- if (op != op_submitted + 1) {
-- dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1)
-- << ", OUT OF ORDER" << dendl;
-- assert(0 == "out of order op_submit_finish");
-- }
-- op_submitted = op;
-- lock.Unlock();
--}
--
-
- // ========= KeyValueStore::BufferTransaction Implementation ============
-
- int KeyValueStore::BufferTransaction::lookup_cached_header(
- const coll_t &cid, const ghobject_t &oid,
-- StripObjectMap::StripObjectHeader **strip_header,
-+ StripObjectMap::StripObjectHeaderRef *strip_header,
- bool create_if_missing)
- {
-- StripObjectMap::StripObjectHeader header;
-+ StripObjectMap::StripObjectHeaderRef header;
- int r = 0;
-
- StripHeaderMap::iterator it = strip_headers.find(make_pair(cid, oid));
- if (it != strip_headers.end()) {
-- if (it->second.deleted)
-+
-+ if (!it->second->deleted) {
-+ if (strip_header)
-+ *strip_header = it->second;
-+ return 0;
-+ } else if (!create_if_missing) {
- return -ENOENT;
-+ }
-
-- if (strip_header)
-- *strip_header = &it->second;
-- return 0;
-+ // If (it->second->deleted && create_if_missing), fall through and re-create the header below
-+ r = -ENOENT;
-+ } else {
-+ r = store->backend->lookup_strip_header(cid, oid, &header);
- }
-
-- r = store->backend->lookup_strip_header(cid, oid, header);
-- if (r < 0 && create_if_missing) {
-- r = store->backend->create_strip_header(cid, oid, header, t);
-+ if (r == -ENOENT && create_if_missing) {
-+ r = store->backend->create_strip_header(cid, oid, &header, t);
- }
-
- if (r < 0) {
- dout(10) << __func__ << " " << cid << "/" << oid << " "
-@@ -320,23 +297,23 @@
- }
-
- strip_headers[make_pair(cid, oid)] = header;
- if (strip_header)
-- *strip_header = &strip_headers[make_pair(cid, oid)];
-+ *strip_header = strip_headers[make_pair(cid, oid)];
- return r;
- }
-
- int KeyValueStore::BufferTransaction::get_buffer_keys(
-- StripObjectMap::StripObjectHeader &strip_header, const string &prefix,
-+ StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix,
- const set<string> &keys, map<string, bufferlist> *out)
- {
- set<string> need_lookup;
-
- for (set<string>::iterator it = keys.begin(); it != keys.end(); ++it) {
- map<pair<string, string>, bufferlist>::iterator i =
-- strip_header.buffers.find(make_pair(prefix, *it));
-+ strip_header->buffers.find(make_pair(prefix, *it));
-
-- if (i != strip_header.buffers.end()) {
-+ if (i != strip_header->buffers.end()) {
- (*out)[*it].swap(i->second);
- } else {
- need_lookup.insert(*it);
- }
-@@ -345,117 +322,118 @@
- if (!need_lookup.empty()) {
- int r = store->backend->get_values_with_header(strip_header, prefix,
- need_lookup, out);
- if (r < 0) {
-- dout(10) << __func__ << " " << strip_header.cid << "/"
-- << strip_header.oid << " " << " r = " << r << dendl;
-+ dout(10) << __func__ << " " << strip_header->cid << "/"
-+ << strip_header->oid << " " << " r = " << r << dendl;
- return r;
- }
- }
-
- return 0;
- }
-
- void KeyValueStore::BufferTransaction::set_buffer_keys(
-- StripObjectMap::StripObjectHeader &strip_header,
-+ StripObjectMap::StripObjectHeaderRef strip_header,
- const string &prefix, map<string, bufferlist> &values)
- {
-- store->backend->set_keys(strip_header.header, prefix, values, t);
-+ store->backend->set_keys(strip_header->header, prefix, values, t);
-
- for (map<string, bufferlist>::iterator iter = values.begin();
- iter != values.end(); ++iter) {
-- strip_header.buffers[make_pair(prefix, iter->first)].swap(iter->second);
-+ strip_header->buffers[make_pair(prefix, iter->first)].swap(iter->second);
- }
- }
-
- int KeyValueStore::BufferTransaction::remove_buffer_keys(
-- StripObjectMap::StripObjectHeader &strip_header, const string &prefix,
-+ StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix,
- const set<string> &keys)
- {
- for (set<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
-- strip_header.buffers[make_pair(prefix, *iter)] = bufferlist();
-+ strip_header->buffers[make_pair(prefix, *iter)] = bufferlist();
- }
-
-- return store->backend->rm_keys(strip_header.header, prefix, keys, t);
-+ return store->backend->rm_keys(strip_header->header, prefix, keys, t);
- }
-
- void KeyValueStore::BufferTransaction::clear_buffer_keys(
-- StripObjectMap::StripObjectHeader &strip_header, const string &prefix)
-+ StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix)
- {
-- for (map<pair<string, string>, bufferlist>::iterator iter = strip_header.buffers.begin();
-- iter != strip_header.buffers.end(); ++iter) {
-+ for (map<pair<string, string>, bufferlist>::iterator iter = strip_header->buffers.begin();
-+ iter != strip_header->buffers.end(); ++iter) {
- if (iter->first.first == prefix)
- iter->second = bufferlist();
- }
- }
-
- int KeyValueStore::BufferTransaction::clear_buffer(
-- StripObjectMap::StripObjectHeader &strip_header)
-+ StripObjectMap::StripObjectHeaderRef strip_header)
- {
-- strip_header.deleted = true;
-+ strip_header->deleted = true;
-
-- return store->backend->clear(strip_header.header, t);
-+ InvalidateCacheContext *c = new InvalidateCacheContext(store, strip_header->cid, strip_header->oid);
-+ finishes.push_back(c);
-+ return store->backend->clear(strip_header->header, t);
- }
-
- void KeyValueStore::BufferTransaction::clone_buffer(
-- StripObjectMap::StripObjectHeader &old_header,
-+ StripObjectMap::StripObjectHeaderRef old_header,
- const coll_t &cid, const ghobject_t &oid)
- {
- // Remove target ahead to avoid dead lock
- strip_headers.erase(make_pair(cid, oid));
-
-- StripObjectMap::StripObjectHeader new_origin_header, new_target_header;
-+ StripObjectMap::StripObjectHeaderRef new_target_header;
-
-- store->backend->clone_wrap(old_header, cid, oid, t,
-- &new_origin_header, &new_target_header);
-+ store->backend->clone_wrap(old_header, cid, oid, t, &new_target_header);
-
- // FIXME: Lacking of lock for origin header(now become parent), it will
- // cause other operation can get the origin header while submitting
- // transactions
-- strip_headers[make_pair(cid, old_header.oid)] = new_origin_header;
- strip_headers[make_pair(cid, oid)] = new_target_header;
- }
-
- void KeyValueStore::BufferTransaction::rename_buffer(
-- StripObjectMap::StripObjectHeader &old_header,
-+ StripObjectMap::StripObjectHeaderRef old_header,
- const coll_t &cid, const ghobject_t &oid)
- {
-- if (store->backend->check_spos(old_header, spos))
-- return ;
--
- // FIXME: Lacking of lock for origin header, it will cause other operation
- // can get the origin header while submitting transactions
-- store->backend->rename_wrap(cid, oid, t, &old_header);
-+ StripObjectMap::StripObjectHeaderRef new_header;
-+ store->backend->rename_wrap(old_header, cid, oid, t, &new_header);
-
-- strip_headers.erase(make_pair(old_header.cid, old_header.oid));
-- strip_headers[make_pair(cid, oid)] = old_header;
-+ InvalidateCacheContext *c = new InvalidateCacheContext(store, old_header->cid, old_header->oid);
-+ finishes.push_back(c);
-+ strip_headers[make_pair(cid, oid)] = new_header;
- }
-
- int KeyValueStore::BufferTransaction::submit_transaction()
- {
- int r = 0;
-
- for (StripHeaderMap::iterator header_iter = strip_headers.begin();
- header_iter != strip_headers.end(); ++header_iter) {
-- StripObjectMap::StripObjectHeader header = header_iter->second;
-+ StripObjectMap::StripObjectHeaderRef header = header_iter->second;
-
-- if (store->backend->check_spos(header, spos))
-+ if (header->deleted)
- continue;
-
-- if (header.deleted)
-- continue;
-+ r = store->backend->save_strip_header(header, t);
-
-- r = store->backend->save_strip_header(header, spos, t);
- if (r < 0) {
- dout(10) << __func__ << " save strip header failed " << dendl;
- goto out;
- }
- }
-
--out:
-+ r = store->backend->submit_transaction(t);
-+ for (list<Context*>::iterator it = finishes.begin(); it != finishes.end(); ++it) {
-+ (*it)->complete(r);
-+ }
-
-+out:
- dout(5) << __func__ << " r = " << r << dendl;
-- return store->backend->submit_transaction(t);
-+ return r;
- }
-
- // =========== KeyValueStore Intern Helper Implementation ==============
-
-@@ -494,9 +472,9 @@
- const char *name, bool do_update) :
- ObjectStore(base),
- internal_name(name),
- basedir(base),
-- fsid_fd(-1), op_fd(-1), current_fd(-1),
-+ fsid_fd(-1), current_fd(-1),
- kv_type(KV_TYPE_NONE),
- backend(NULL),
- ondisk_finisher(g_ceph_context),
- lock("KeyValueStore::lock"),
-@@ -905,12 +883,8 @@
- if (fsid_fd >= 0) {
- VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
- fsid_fd = -1;
- }
-- if (op_fd >= 0) {
-- VOID_TEMP_FAILURE_RETRY(::close(op_fd));
-- op_fd = -1;
-- }
- if (current_fd >= 0) {
- VOID_TEMP_FAILURE_RETRY(::close(current_fd));
- current_fd = -1;
- }
-@@ -962,16 +936,11 @@
- }
-
- Op *o = build_op(tls, ondisk, onreadable, onreadable_sync, osd_op);
- op_queue_reserve_throttle(o, handle);
-- uint64_t op = submit_manager.op_submit_start();
-- o->op = op;
-- dout(5) << "queue_transactions (trailing journal) " << op << " "
-- << tls <<dendl;
-+ dout(5) << "queue_transactions (trailing journal) " << " " << tls <<dendl;
- queue_op(osr, o);
-
-- submit_manager.op_submit_finish(op);
--
- return 0;
- }
-
-
-@@ -1087,9 +1056,10 @@
- }
-
- void KeyValueStore::_finish_op(OpSequencer *osr)
- {
-- Op *o = osr->dequeue();
-+ list<Context*> to_queue;
-+ Op *o = osr->dequeue(&to_queue);
-
- dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << dendl;
- osr->apply_lock.Unlock(); // locked in _do_op
- op_queue_release_throttle(o);
-@@ -1101,8 +1071,9 @@
- if (o->onreadable_sync) {
- o->onreadable_sync->complete(0);
- }
- op_finisher.queue(o->onreadable);
-+ op_finisher.queue(to_queue);
- delete o;
- }
-
- // Combine all the ops in the same transaction using "BufferTransaction" and
-@@ -1125,15 +1096,14 @@
- ops += (*p)->get_num_ops();
- }
-
- int trans_num = 0;
-- SequencerPosition spos(op_seq, trans_num, 0);
-- BufferTransaction bt(this, spos);
-+ BufferTransaction bt(this);
-
- for (list<Transaction*>::iterator p = tls.begin();
- p != tls.end();
- ++p, trans_num++) {
-- r = _do_transaction(**p, bt, spos, handle);
-+ r = _do_transaction(**p, bt, handle);
- if (r < 0)
- break;
- if (handle)
- handle->reset_tp_timeout();
-@@ -1148,14 +1118,14 @@
- }
-
- unsigned KeyValueStore::_do_transaction(Transaction& transaction,
- BufferTransaction &t,
-- SequencerPosition& spos,
- ThreadPool::TPHandle *handle)
- {
- dout(10) << "_do_transaction on " << &transaction << dendl;
-
- Transaction::iterator i = transaction.begin();
-+ uint64_t op_num = 0;
-
- while (i.have_op()) {
- if (handle)
- handle->reset_tp_timeout();
-@@ -1448,9 +1418,15 @@
- }
- break;
-
- case Transaction::OP_SETALLOCHINT:
-- // TODO: can kvstore make use of the hint?
-+ {
-+ // TODO: can kvstore make use of the hint?
-+ coll_t cid(i.get_cid());
-+ ghobject_t oid = i.get_oid();
-+ (void)i.get_length(); // discard result
-+ (void)i.get_length(); // discard result
-+ }
- break;
-
- default:
- derr << "bad op " << op << dendl;
-@@ -1486,10 +1462,9 @@
- msg = "ENOTEMPTY suggests garbage data in osd data dir";
- }
-
- dout(0) << " error " << cpp_strerror(r) << " not handled on operation "
-- << op << " (" << spos << ", or op " << spos.op
-- << ", counting from 0)" << dendl;
-+ << op << " op " << op_num << ", counting from 0)" << dendl;
- dout(0) << msg << dendl;
- dout(0) << " transaction dump:\n";
- JSONFormatter f(true);
- f.open_object_section("transaction");
-@@ -1504,9 +1479,9 @@
- }
- }
- }
-
-- spos.op++;
-+ op_num++;
- }
-
- return 0; // FIXME count errors
- }
-@@ -1519,11 +1494,11 @@
- {
- dout(10) << __func__ << "collection: " << cid << " object: " << oid
- << dendl;
- int r;
-- StripObjectMap::StripObjectHeader header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
-- r = backend->lookup_strip_header(cid, oid, header);
-+ r = backend->lookup_strip_header(cid, oid, &header);
- if (r < 0) {
- return false;
- }
-
-@@ -1534,44 +1509,44 @@
- struct stat *st, bool allow_eio)
- {
- dout(10) << "stat " << cid << "/" << oid << dendl;
-
-- StripObjectMap::StripObjectHeader header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
-- int r = backend->lookup_strip_header(cid, oid, header);
-+ int r = backend->lookup_strip_header(cid, oid, &header);
- if (r < 0) {
- dout(10) << "stat " << cid << "/" << oid << "=" << r << dendl;
- return -ENOENT;
- }
-
-- st->st_blocks = header.max_size / header.strip_size;
-- if (header.max_size % header.strip_size)
-+ st->st_blocks = header->max_size / header->strip_size;
-+ if (header->max_size % header->strip_size)
- st->st_blocks++;
- st->st_nlink = 1;
-- st->st_size = header.max_size;
-- st->st_blksize = header.strip_size;
-+ st->st_size = header->max_size;
-+ st->st_blksize = header->strip_size;
-
- return r;
- }
-
--int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeader &header,
-+int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeaderRef header,
- uint64_t offset, size_t len, bufferlist& bl,
- bool allow_eio, BufferTransaction *bt)
- {
-- if (header.max_size < offset) {
-- dout(10) << __func__ << " " << header.cid << "/" << header.oid << ")"
-+ if (header->max_size < offset) {
-+ dout(10) << __func__ << " " << header->cid << "/" << header->oid << ")"
- << " offset exceed the length of bl"<< dendl;
- return 0;
- }
-
- if (len == 0)
-- len = header.max_size - offset;
-+ len = header->max_size - offset;
-
-- if (offset + len > header.max_size)
-- len = header.max_size - offset;
-+ if (offset + len > header->max_size)
-+ len = header->max_size - offset;
-
- vector<StripObjectMap::StripExtent> extents;
-- StripObjectMap::file_to_extents(offset, len, header.strip_size,
-+ StripObjectMap::file_to_extents(offset, len, header->strip_size,
- extents);
- map<string, bufferlist> out;
- set<string> keys;
-
-@@ -1579,35 +1554,35 @@
- iter != extents.end(); ++iter) {
- bufferlist old;
- string key = strip_object_key(iter->no);
-
-- if (bt && header.buffers.count(make_pair(OBJECT_STRIP_PREFIX, key))) {
-+ if (bt && header->buffers.count(make_pair(OBJECT_STRIP_PREFIX, key))) {
- // use strip_header buffer
-- assert(header.bits[iter->no]);
-- out[key] = header.buffers[make_pair(OBJECT_STRIP_PREFIX, key)];
-- } else if (header.bits[iter->no]) {
-+ assert(header->bits[iter->no]);
-+ out[key] = header->buffers[make_pair(OBJECT_STRIP_PREFIX, key)];
-+ } else if (header->bits[iter->no]) {
- keys.insert(key);
- }
- }
-
- int r = backend->get_values_with_header(header, OBJECT_STRIP_PREFIX, keys, &out);
- if (r < 0) {
-- dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
-+ dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
- << offset << "~" << len << " = " << r << dendl;
- return r;
- } else if (out.size() != keys.size()) {
- dout(0) << __func__ << " broken header or missing data in backend "
-- << header.cid << "/" << header.oid << " " << offset << "~"
-+ << header->cid << "/" << header->oid << " " << offset << "~"
- << len << " = " << r << dendl;
- return -EBADF;
- }
-
- for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
- iter != extents.end(); ++iter) {
- string key = strip_object_key(iter->no);
-
-- if (header.bits[iter->no]) {
-- if (iter->len == header.strip_size) {
-+ if (header->bits[iter->no]) {
-+ if (iter->len == header->strip_size) {
- bl.claim_append(out[key]);
- } else {
- out[key].copy(iter->offset, iter->len, bl);
- }
-@@ -1615,9 +1590,9 @@
- bl.append_zero(iter->len);
- }
- }
-
-- dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
-+ dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
- << offset << "~" << bl.length() << "/" << len << " r = " << r
- << dendl;
-
- return bl.length();
-@@ -1629,11 +1604,11 @@
- {
- dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
- << len << dendl;
-
-- StripObjectMap::StripObjectHeader header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
-- int r = backend->lookup_strip_header(cid, oid, header);
-+ int r = backend->lookup_strip_header(cid, oid, &header);
-
- if (r < 0) {
- dout(10) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
- << len << " header isn't exist: r = " << r << dendl;
-@@ -1648,25 +1623,26 @@
- {
- dout(10) << __func__ << " " << cid << " " << oid << " " << offset << "~"
- << len << dendl;
- int r;
-- StripObjectMap::StripObjectHeader header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
-- r = backend->lookup_strip_header(cid, oid, header);
-+ r = backend->lookup_strip_header(cid, oid, &header);
- if (r < 0) {
- dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len
- << " failed to get header: r = " << r << dendl;
- return r;
- }
-
- vector<StripObjectMap::StripExtent> extents;
-- StripObjectMap::file_to_extents(offset, len, header.strip_size,
-+ StripObjectMap::file_to_extents(offset, len, header->strip_size,
- extents);
-
- map<uint64_t, uint64_t> m;
- for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
- iter != extents.end(); ++iter) {
-- m[iter->offset] = iter->len;
-+ uint64_t off = iter->no * header->strip_size + iter->offset;
-+ m[off] = iter->len;
- }
- ::encode(m, bl);
- return 0;
- }
-@@ -1676,18 +1652,20 @@
- {
- dout(15) << __func__ << " " << cid << "/" << oid << dendl;
-
- int r;
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- r = t.lookup_cached_header(cid, oid, &header, false);
- if (r < 0) {
- dout(10) << __func__ << " " << cid << "/" << oid << " "
- << " failed to get header: r = " << r << dendl;
- return r;
- }
-
-- r = t.clear_buffer(*header);
-+ header->max_size = 0;
-+ header->bits.clear();
-+ r = t.clear_buffer(header);
-
- dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
- return r;
- }
-@@ -1698,9 +1676,9 @@
- dout(15) << __func__ << " " << cid << "/" << oid << " size " << size
- << dendl;
-
- int r;
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- r = t.lookup_cached_header(cid, oid, &header, false);
- if (r < 0) {
- dout(10) << __func__ << " " << cid << "/" << oid << " " << size
-@@ -1724,9 +1702,9 @@
- set<string> lookup_keys;
- string key = strip_object_key(iter->no);
-
- lookup_keys.insert(key);
-- r = t.get_buffer_keys(*header, OBJECT_STRIP_PREFIX,
-+ r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX,
- lookup_keys, &values);
- if (r < 0) {
- dout(10) << __func__ << " " << cid << "/" << oid << " "
- << size << " = " << r << dendl;
-@@ -1742,9 +1720,9 @@
- value.append_zero(header->strip_size-iter->offset);
- assert(value.length() == header->strip_size);
- value.swap(values[key]);
-
-- t.set_buffer_keys(*header, OBJECT_STRIP_PREFIX, values);
-+ t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values);
- ++iter;
- }
-
- set<string> keys;
-@@ -1753,9 +1731,9 @@
- keys.insert(strip_object_key(iter->no));
- header->bits[iter->no] = 0;
- }
- }
-- r = t.remove_buffer_keys(*header, OBJECT_STRIP_PREFIX, keys);
-+ r = t.remove_buffer_keys(header, OBJECT_STRIP_PREFIX, keys);
- if (r < 0) {
- dout(10) << __func__ << " " << cid << "/" << oid << " "
- << size << " = " << r << dendl;
- return r;
-@@ -1775,9 +1753,9 @@
- {
- dout(15) << __func__ << " " << cid << "/" << oid << dendl;
-
- int r;
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- r = t.lookup_cached_header(cid, oid, &header, true);
- if (r < 0) {
- dout(10) << __func__ << " " << cid << "/" << oid << " "
-@@ -1789,44 +1767,44 @@
- dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
- return r;
- }
-
--int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeader &header,
-+int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeaderRef header,
- uint64_t offset, size_t len,
- const bufferlist& bl, BufferTransaction &t,
- bool replica)
- {
- if (len > bl.length())
- len = bl.length();
-
-- if (len + offset > header.max_size) {
-- header.max_size = len + offset;
-- header.bits.resize(header.max_size/header.strip_size+1);
-+ if (len + offset > header->max_size) {
-+ header->max_size = len + offset;
-+ header->bits.resize(header->max_size/header->strip_size+1);
- }
-
- vector<StripObjectMap::StripExtent> extents;
-- StripObjectMap::file_to_extents(offset, len, header.strip_size,
-+ StripObjectMap::file_to_extents(offset, len, header->strip_size,
- extents);
-
- map<string, bufferlist> out;
- set<string> keys;
- for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
- iter != extents.end(); ++iter) {
-- if (header.bits[iter->no] && !(iter->offset == 0 &&
-- iter->len == header.strip_size))
-+ if (header->bits[iter->no] && !(iter->offset == 0 &&
-+ iter->len == header->strip_size))
- keys.insert(strip_object_key(iter->no));
- }
-
- int r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, keys, &out);
- if (r < 0) {
-- dout(10) << __func__ << " failed to get value " << header.cid << "/"
-- << header.oid << " " << offset << "~" << len << " = " << r
-+ dout(10) << __func__ << " failed to get value " << header->cid << "/"
-+ << header->oid << " " << offset << "~" << len << " = " << r
- << dendl;
- return r;
- } else if (keys.size() != out.size()) {
- // Error on header.bits or the corresponding key/value pair is missing
- dout(0) << __func__ << " broken header or missing data in backend "
-- << header.cid << "/" << header.oid << " " << offset << "~"
-+ << header->cid << "/" << header->oid << " " << offset << "~"
- << len << " = " << r << dendl;
- return -EBADF;
- }
-
-@@ -1835,41 +1813,41 @@
- for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
- iter != extents.end(); ++iter) {
- bufferlist value;
- string key = strip_object_key(iter->no);
-- if (header.bits[iter->no]) {
-- if (iter->offset == 0 && iter->len == header.strip_size) {
-+ if (header->bits[iter->no]) {
-+ if (iter->offset == 0 && iter->len == header->strip_size) {
- bl.copy(bl_offset, iter->len, value);
- bl_offset += iter->len;
- } else {
-- assert(out[key].length() == header.strip_size);
-+ assert(out[key].length() == header->strip_size);
-
- out[key].copy(0, iter->offset, value);
- bl.copy(bl_offset, iter->len, value);
- bl_offset += iter->len;
-
-- if (value.length() != header.strip_size)
-- out[key].copy(value.length(), header.strip_size-value.length(),
-+ if (value.length() != header->strip_size)
-+ out[key].copy(value.length(), header->strip_size-value.length(),
- value);
- }
- } else {
- if (iter->offset)
- value.append_zero(iter->offset);
- bl.copy(bl_offset, iter->len, value);
- bl_offset += iter->len;
-
-- if (value.length() < header.strip_size)
-- value.append_zero(header.strip_size-value.length());
-+ if (value.length() < header->strip_size)
-+ value.append_zero(header->strip_size-value.length());
-
-- header.bits[iter->no] = 1;
-+ header->bits[iter->no] = 1;
- }
-- assert(value.length() == header.strip_size);
-+ assert(value.length() == header->strip_size);
- values[key].swap(value);
- }
- assert(bl_offset == len);
-
- t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values);
-- dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
-+ dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
- << offset << "~" << len << " = " << r << dendl;
-
- return r;
- }
-@@ -1881,18 +1859,18 @@
- dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
- << len << dendl;
-
- int r;
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- r = t.lookup_cached_header(cid, oid, &header, true);
- if (r < 0) {
- dout(10) << __func__ << " " << cid << "/" << oid << " " << offset
- << "~" << len << " failed to get header: r = " << r << dendl;
- return r;
- }
-
-- return _generic_write(*header, offset, len, bl, t, replica);
-+ return _generic_write(header, offset, len, bl, t, replica);
- }
-
- int KeyValueStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset,
- size_t len, BufferTransaction &t)
-@@ -1919,18 +1897,18 @@
- if (oldoid == newoid)
- return 0;
-
- int r;
-- StripObjectMap::StripObjectHeader *old_header;
-+ StripObjectMap::StripObjectHeaderRef old_header;
-
- r = t.lookup_cached_header(cid, oldoid, &old_header, false);
- if (r < 0) {
- dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
- << newoid << " = " << r << dendl;
- return r;
- }
-
-- t.clone_buffer(*old_header, cid, newoid);
-+ t.clone_buffer(old_header, cid, newoid);
-
- dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
- << newoid << " = " << r << dendl;
- return r;
-@@ -1947,9 +1925,9 @@
-
- int r;
- bufferlist bl;
-
-- StripObjectMap::StripObjectHeader *old_header, *new_header;
-+ StripObjectMap::StripObjectHeaderRef old_header, new_header;
-
- r = t.lookup_cached_header(cid, oldoid, &old_header, false);
- if (r < 0) {
- dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
-@@ -1965,13 +1943,13 @@
- << " can't create header: r = " << r << dendl;
- return r;
- }
-
-- r = _generic_read(*old_header, srcoff, len, bl, &t);
-+ r = _generic_read(old_header, srcoff, len, bl, &t);
- if (r < 0)
- goto out;
-
-- r = _generic_write(*new_header, dstoff, len, bl, t);
-+ r = _generic_write(new_header, dstoff, len, bl, t);
-
- out:
- dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
- << newoid << " " << srcoff << "~" << len << " to " << dstoff
-@@ -1989,11 +1967,19 @@
-
- int r;
- map<string, bufferlist> got;
- set<string> to_get;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- to_get.insert(string(name));
-- r = backend->get_values(cid, oid, OBJECT_XATTR, to_get, &got);
-+
-+ r = backend->lookup_strip_header(cid, oid, &header);
-+ if (r < 0) {
-+ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
-+ return r;
-+ }
-+
-+ r = backend->get_values_with_header(header, OBJECT_XATTR, to_get, &got);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " get_xattrs err r =" << r << dendl;
- goto out;
- }
-@@ -2055,9 +2041,9 @@
- dout(15) << __func__ << " " << cid << "/" << oid << dendl;
-
- int r;
-
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
- map<string, bufferlist> attrs;
-
- r = t.lookup_cached_header(cid, oid, &header, false);
- if (r < 0)
-@@ -2067,9 +2053,9 @@
- it != aset.end(); ++it) {
- attrs[it->first].push_back(it->second);
- }
-
-- t.set_buffer_keys(*header, OBJECT_XATTR, attrs);
-+ t.set_buffer_keys(header, OBJECT_XATTR, attrs);
-
- out:
- dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
- return r;
-@@ -2083,9 +2069,9 @@
- << dendl;
-
- int r;
- set<string> to_remove;
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- r = t.lookup_cached_header(cid, oid, &header, false);
- if (r < 0) {
- dout(10) << __func__ << " could not find header r = " << r
-@@ -2093,9 +2079,9 @@
- return r;
- }
-
- to_remove.insert(string(name));
-- r = t.remove_buffer_keys(*header, OBJECT_XATTR, to_remove);
-+ r = t.remove_buffer_keys(header, OBJECT_XATTR, to_remove);
-
- dout(10) << __func__ << " " << cid << "/" << oid << " '" << name << "' = "
- << r << dendl;
- return r;
-@@ -2108,25 +2094,25 @@
-
- int r;
- set<string> attrs;
-
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- r = t.lookup_cached_header(cid, oid, &header, false);
- if (r < 0) {
- dout(10) << __func__ << " could not find header r = " << r
- << dendl;
- return r;
- }
-
-- r = backend->get_keys_with_header(*header, OBJECT_XATTR, &attrs);
-+ r = backend->get_keys_with_header(header, OBJECT_XATTR, &attrs);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get attrs r = " << r << dendl;
- return r;
- }
-
-- r = t.remove_buffer_keys(*header, OBJECT_XATTR, attrs);
-- t.clear_buffer_keys(*header, OBJECT_XATTR);
-+ r = t.remove_buffer_keys(header, OBJECT_XATTR, attrs);
-+ t.clear_buffer_keys(header, OBJECT_XATTR);
-
- dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
- return r;
- }
-@@ -2167,12 +2153,20 @@
- << "'" << dendl;
-
- set<string> keys;
- map<string, bufferlist> out;
-+ StripObjectMap::StripObjectHeaderRef header;
-+
- keys.insert(string(name));
-
-- int r = backend->get_values(get_coll_for_coll(), make_ghobject_for_coll(c),
-- COLLECTION_ATTR, keys, &out);
-+ int r = backend->lookup_strip_header(get_coll_for_coll(),
-+ make_ghobject_for_coll(c), &header);
-+ if (r < 0) {
-+ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
-+ return r;
-+ }
-+
-+ r = backend->get_values_with_header(header, COLLECTION_ATTR, keys, &out);
- if (r < 0) {
- dout(10) << __func__ << " could not get key" << string(name) << dendl;
- r = -EINVAL;
- }
-@@ -2191,16 +2185,23 @@
- dout(10) << __func__ << " " << cid.to_str() << dendl;
-
- map<string, bufferlist> out;
- set<string> keys;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- for (map<string, bufferptr>::iterator it = aset.begin();
- it != aset.end(); ++it) {
- keys.insert(it->first);
- }
-
-- int r = backend->get_values(get_coll_for_coll(), make_ghobject_for_coll(cid),
-- COLLECTION_ATTR, keys, &out);
-+ int r = backend->lookup_strip_header(get_coll_for_coll(),
-+ make_ghobject_for_coll(cid), &header);
-+ if (r < 0) {
-+ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
-+ return r;
-+ }
-+
-+ r = backend->get_values_with_header(header, COLLECTION_ATTR, keys, &out);
- if (r < 0) {
- dout(10) << __func__ << " could not get keys" << dendl;
- r = -EINVAL;
- goto out;
-@@ -2226,9 +2227,9 @@
-
- int r;
- bufferlist bl;
- map<string, bufferlist> out;
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- r = t.lookup_cached_header(get_coll_for_coll(),
- make_ghobject_for_coll(c),
- &header, false);
-@@ -2239,9 +2240,9 @@
-
- bl.append(reinterpret_cast<const char*>(value), size);
- out.insert(make_pair(string(name), bl));
-
-- t.set_buffer_keys(*header, COLLECTION_ATTR, out);
-+ t.set_buffer_keys(header, COLLECTION_ATTR, out);
-
- dout(10) << __func__ << " " << c << " '"
- << name << "' len " << size << " = " << r << dendl;
- return r;
-@@ -2253,9 +2254,9 @@
- dout(15) << __func__ << " " << c << dendl;
-
- bufferlist bl;
- set<string> out;
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- int r = t.lookup_cached_header(get_coll_for_coll(),
- make_ghobject_for_coll(c), &header, false);
- if (r < 0) {
-@@ -2263,9 +2264,9 @@
- return r;
- }
-
- out.insert(string(name));
-- r = t.remove_buffer_keys(*header, COLLECTION_ATTR, out);
-+ r = t.remove_buffer_keys(header, COLLECTION_ATTR, out);
-
- dout(10) << __func__ << " " << c << " = " << r << dendl;
- return r;
- }
-@@ -2276,9 +2277,9 @@
- {
- dout(15) << __func__ << " " << cid << dendl;
-
- map<string, bufferlist> attrs;
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
- int r = t.lookup_cached_header(get_coll_for_coll(),
- make_ghobject_for_coll(cid),
- &header, false);
- if (r < 0) {
-@@ -2290,9 +2291,9 @@
- ++it) {
- attrs[it->first].push_back(it->second);
- }
-
-- t.set_buffer_keys(*header, COLLECTION_ATTR, attrs);
-+ t.set_buffer_keys(header, COLLECTION_ATTR, attrs);
-
- dout(10) << __func__ << " " << cid << " = " << r << dendl;
- return r;
- }
-@@ -2304,9 +2305,9 @@
- {
- dout(15) << __func__ << " " << c << dendl;
-
- int r;
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
- bufferlist bl;
-
- r = t.lookup_cached_header(get_coll_for_coll(),
- make_ghobject_for_coll(c), &header,
-@@ -2329,9 +2330,9 @@
- dout(15) << __func__ << " " << c << dendl;
-
- int r;
- uint64_t modified_object = 0;
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
- vector<ghobject_t> oids;
-
- r = t.lookup_cached_header(get_coll_for_coll(), make_ghobject_for_coll(c),
- &header, false);
-@@ -2346,9 +2347,9 @@
- if (iter->first.first != c)
- continue;
-
- modified_object++;
-- if (!iter->second.deleted) {
-+ if (!iter->second->deleted) {
- r = -ENOTEMPTY;
- goto out;
- }
- }
-@@ -2368,9 +2369,9 @@
- goto out;
- }
- }
-
-- r = t.clear_buffer(*header);
-+ r = t.clear_buffer(header);
-
- out:
- dout(10) << __func__ << " " << c << " = " << r << dendl;
- return r;
-@@ -2384,9 +2385,9 @@
- dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/"
- << o << dendl;
-
- bufferlist bl;
-- StripObjectMap::StripObjectHeader *header, *old_header;
-+ StripObjectMap::StripObjectHeaderRef header, old_header;
-
- int r = t.lookup_cached_header(oldcid, o, &old_header, false);
- if (r < 0) {
- goto out;
-@@ -2399,15 +2400,15 @@
- << o << " already exist " << dendl;
- goto out;
- }
-
-- r = _generic_read(*old_header, 0, old_header->max_size, bl, &t);
-+ r = _generic_read(old_header, 0, old_header->max_size, bl, &t);
- if (r < 0) {
- r = -EINVAL;
- goto out;
- }
-
-- r = _generic_write(*header, 0, bl.length(), bl, t);
-+ r = _generic_write(header, 0, bl.length(), bl, t);
- if (r < 0) {
- r = -EINVAL;
- }
-
-@@ -2424,9 +2425,9 @@
- {
- dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/"
- << oldoid << dendl;
- int r;
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- r = t.lookup_cached_header(c, o, &header, false);
- if (r == 0) {
- dout(10) << __func__ << " " << oldcid << "/" << oldoid << " -> " << c
-@@ -2440,9 +2441,9 @@
- << "/" << o << " = " << r << dendl;
- return r;
- }
-
-- t.rename_buffer(*header, c, o);
-+ t.rename_buffer(header, c, o);
-
- dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/"
- << oldoid << " = " << r << dendl;
- return r;
-@@ -2452,9 +2453,9 @@
- BufferTransaction &t)
- {
- dout(15) << __func__ << " " << cid << dendl;
-
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- int r = t.lookup_cached_header(get_coll_for_coll(),
- make_ghobject_for_coll(cid),
- &header, false);
-@@ -2477,9 +2478,9 @@
- return r;
- }
- }
-
-- r = t.clear_buffer(*header);
-+ r = t.clear_buffer(header);
-
- dout(10) << __func__ << " " << cid << " r = " << r << dendl;
- return 0;
- }
-@@ -2489,9 +2490,9 @@
- {
- dout(10) << __func__ << " origin cid " << cid << " new cid " << ncid
- << dendl;
-
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- int r = t.lookup_cached_header(get_coll_for_coll(),
- make_ghobject_for_coll(ncid),
- &header, false);
-@@ -2531,9 +2532,9 @@
- objects.clear();
- current = next;
- }
-
-- t.rename_buffer(*header, get_coll_for_coll(), make_ghobject_for_coll(ncid));
-+ t.rename_buffer(header, get_coll_for_coll(), make_ghobject_for_coll(ncid));
-
- dout(10) << __func__ << " origin cid " << cid << " new cid " << ncid
- << dendl;
- return 0;
-@@ -2559,11 +2560,11 @@
- bool KeyValueStore::collection_exists(coll_t c)
- {
- dout(10) << __func__ << " " << dendl;
-
-- StripObjectMap::StripObjectHeader header;
-+ StripObjectMap::StripObjectHeaderRef header;
- int r = backend->lookup_strip_header(get_coll_for_coll(),
-- make_ghobject_for_coll(c), header);
-+ make_ghobject_for_coll(c), &header);
- if (r < 0) {
- return false;
- }
- return true;
-@@ -2651,17 +2652,16 @@
- bufferlist *bl, map<string, bufferlist> *out)
- {
- dout(15) << __func__ << " " << c << "/" << hoid << dendl;
-
-- StripObjectMap::StripObjectHeader header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
-- int r = backend->lookup_strip_header(c, hoid, header);
-+ int r = backend->lookup_strip_header(c, hoid, &header);
- if (r < 0) {
- dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
- return r;
- }
-
--
- r = backend->get_with_header(header, OBJECT_OMAP, out);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " err r =" << r << dendl;
- return r;
-@@ -2691,11 +2691,18 @@
- dout(15) << __func__ << " " << c << "/" << hoid << dendl;
-
- set<string> keys;
- map<string, bufferlist> got;
-+ StripObjectMap::StripObjectHeaderRef header;
-+
-+ int r = backend->lookup_strip_header(c, hoid, &header);
-+ if (r < 0) {
-+ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
-+ return r;
-+ }
-
- keys.insert(OBJECT_OMAP_HEADER_KEY);
-- int r = backend->get_values(c, hoid, OBJECT_OMAP_HEADER, keys, &got);
-+ r = backend->get_values_with_header(header, OBJECT_OMAP_HEADER, keys, &got);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " err r =" << r << dendl;
- return r;
- }
-@@ -2711,9 +2718,16 @@
- int KeyValueStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set<string> *keys)
- {
- dout(15) << __func__ << " " << c << "/" << hoid << dendl;
-
-- int r = backend->get_keys(c, hoid, OBJECT_OMAP, keys);
-+ StripObjectMap::StripObjectHeaderRef header;
-+ int r = backend->lookup_strip_header(c, hoid, &header);
-+ if (r < 0) {
-+ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
-+ return r;
-+ }
-+
-+ r = backend->get_keys_with_header(header, OBJECT_OMAP, keys);
- if (r < 0 && r != -ENOENT) {
- return r;
- }
- return 0;
-@@ -2724,9 +2738,16 @@
- map<string, bufferlist> *out)
- {
- dout(15) << __func__ << " " << c << "/" << hoid << dendl;
-
-- int r = backend->get_values(c, hoid, OBJECT_OMAP, keys, out);
-+ StripObjectMap::StripObjectHeaderRef header;
-+ int r = backend->lookup_strip_header(c, hoid, &header);
-+ if (r < 0) {
-+ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
-+ return r;
-+ }
-+
-+ r = backend->get_values_with_header(header, OBJECT_OMAP, keys, out);
- if (r < 0 && r != -ENOENT) {
- return r;
- }
- return 0;
-@@ -2755,9 +2776,9 @@
- BufferTransaction &t)
- {
- dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
-
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- int r = t.lookup_cached_header(cid, hoid, &header, false);
- if (r < 0) {
- dout(10) << __func__ << " " << cid << "/" << hoid << " "
-@@ -2765,29 +2786,29 @@
- return r;
- }
-
- set<string> keys;
-- r = backend->get_keys_with_header(*header, OBJECT_OMAP, &keys);
-+ r = backend->get_keys_with_header(header, OBJECT_OMAP, &keys);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_keys r = " << r << dendl;
- return r;
- }
-
-- r = t.remove_buffer_keys(*header, OBJECT_OMAP, keys);
-+ r = t.remove_buffer_keys(header, OBJECT_OMAP, keys);
- if (r < 0) {
- dout(10) << __func__ << " could not remove keys r = " << r << dendl;
- return r;
- }
-
- keys.clear();
- keys.insert(OBJECT_OMAP_HEADER_KEY);
-- r = t.remove_buffer_keys(*header, OBJECT_OMAP_HEADER, keys);
-+ r = t.remove_buffer_keys(header, OBJECT_OMAP_HEADER, keys);
- if (r < 0) {
- dout(10) << __func__ << " could not remove keys r = " << r << dendl;
- return r;
- }
-
-- t.clear_buffer_keys(*header, OBJECT_OMAP_HEADER);
-+ t.clear_buffer_keys(header, OBJECT_OMAP_HEADER);
-
- dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl;
- return 0;
- }
-@@ -2797,18 +2818,18 @@
- BufferTransaction &t)
- {
- dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
-
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- int r = t.lookup_cached_header(cid, hoid, &header, false);
- if (r < 0) {
- dout(10) << __func__ << " " << cid << "/" << hoid << " "
- << " failed to get header: r = " << r << dendl;
- return r;
- }
-
-- t.set_buffer_keys(*header, OBJECT_OMAP, aset);
-+ t.set_buffer_keys(header, OBJECT_OMAP, aset);
-
- return 0;
- }
-
-@@ -2817,18 +2838,18 @@
- BufferTransaction &t)
- {
- dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
-
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- int r = t.lookup_cached_header(cid, hoid, &header, false);
- if (r < 0) {
- dout(10) << __func__ << " " << cid << "/" << hoid << " "
- << " failed to get header: r = " << r << dendl;
- return r;
- }
-
-- r = t.remove_buffer_keys(*header, OBJECT_OMAP, keys);
-+ r = t.remove_buffer_keys(header, OBJECT_OMAP, keys);
-
- dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl;
- return r;
- }
-@@ -2860,9 +2881,9 @@
- {
- dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
-
- map<string, bufferlist> sets;
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- int r = t.lookup_cached_header(cid, hoid, &header, false);
- if (r < 0) {
- dout(10) << __func__ << " " << cid << "/" << hoid << " "
-@@ -2870,9 +2891,9 @@
- return r;
- }
-
- sets[OBJECT_OMAP_HEADER_KEY] = bl;
-- t.set_buffer_keys(*header, OBJECT_OMAP_HEADER, sets);
-+ t.set_buffer_keys(header, OBJECT_OMAP_HEADER, sets);
- return 0;
- }
-
- int KeyValueStore::_split_collection(coll_t cid, uint32_t bits, uint32_t rem,
-@@ -2880,9 +2901,9 @@
- {
- {
- dout(15) << __func__ << " " << cid << " bits: " << bits << dendl;
-
-- StripObjectMap::StripObjectHeader *header;
-+ StripObjectMap::StripObjectHeaderRef header;
-
- int r = t.lookup_cached_header(get_coll_for_coll(),
- make_ghobject_for_coll(cid),
- &header, false);
---- a/src/os/KeyValueStore.h
-+++ b/src/os/KeyValueStore.h
-@@ -35,10 +35,10 @@
- #include "common/fd.h"
-
- #include "common/Mutex.h"
- #include "GenericObjectMap.h"
--#include "SequencerPosition.h"
- #include "KeyValueDB.h"
-+#include "common/random_cache.hpp"
-
- #include "include/uuid.h"
-
- enum kvstore_types {
-@@ -47,8 +47,10 @@
- KV_TYPE_OTHER
- };
-
-
-+static uint64_t default_strip_size = 1024;
-+
- class StripObjectMap: public GenericObjectMap {
- public:
-
- struct StripExtent {
-@@ -64,9 +66,8 @@
- // Persistent state
- uint64_t strip_size;
- uint64_t max_size;
- vector<char> bits;
-- SequencerPosition spos;
-
- // soft state
- Header header; // FIXME: Hold lock to avoid concurrent operations, it will
- // also block read operation which not should be permitted.
-@@ -81,67 +82,66 @@
- ENCODE_START(1, 1, bl);
- ::encode(strip_size, bl);
- ::encode(max_size, bl);
- ::encode(bits, bl);
-- ::encode(spos, bl);
- ENCODE_FINISH(bl);
- }
-
- void decode(bufferlist::iterator &bl) {
- DECODE_START(1, bl);
- ::decode(strip_size, bl);
- ::decode(max_size, bl);
- ::decode(bits, bl);
-- ::decode(spos, bl);
- DECODE_FINISH(bl);
- }
- };
--
-- bool check_spos(const StripObjectHeader &header,
-- const SequencerPosition &spos);
-- void sync_wrap(StripObjectHeader &strip_header, KeyValueDB::Transaction t,
-- const SequencerPosition &spos);
-+ typedef ceph::shared_ptr<StripObjectHeader> StripObjectHeaderRef;
-
- static int file_to_extents(uint64_t offset, size_t len, uint64_t strip_size,
- vector<StripExtent> &extents);
- int lookup_strip_header(const coll_t & cid, const ghobject_t &oid,
-- StripObjectHeader &header);
-- int save_strip_header(StripObjectHeader &header,
-- const SequencerPosition &spos,
-- KeyValueDB::Transaction t);
-+ StripObjectHeaderRef *header);
-+ int save_strip_header(StripObjectHeaderRef header, KeyValueDB::Transaction t);
- int create_strip_header(const coll_t &cid, const ghobject_t &oid,
-- StripObjectHeader &strip_header,
-+ StripObjectHeaderRef *strip_header,
- KeyValueDB::Transaction t);
-- void clone_wrap(StripObjectHeader &old_header,
-+ void clone_wrap(StripObjectHeaderRef old_header,
- const coll_t &cid, const ghobject_t &oid,
- KeyValueDB::Transaction t,
-- StripObjectHeader *origin_header,
-- StripObjectHeader *target_header);
-- void rename_wrap(const coll_t &cid, const ghobject_t &oid,
-+ StripObjectHeaderRef *target_header);
-+ void rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid,
- KeyValueDB::Transaction t,
-- StripObjectHeader *header);
-+ StripObjectHeaderRef *new_header);
- // Already hold header to avoid lock header seq again
- int get_with_header(
-- const StripObjectHeader &header,
-+ const StripObjectHeaderRef header,
- const string &prefix,
- map<string, bufferlist> *out
- );
-
- int get_values_with_header(
-- const StripObjectHeader &header,
-+ const StripObjectHeaderRef header,
- const string &prefix,
- const set<string> &keys,
- map<string, bufferlist> *out
- );
- int get_keys_with_header(
-- const StripObjectHeader &header,
-+ const StripObjectHeaderRef header,
- const string &prefix,
- set<string> *keys
- );
-
-- StripObjectMap(KeyValueDB *db): GenericObjectMap(db) {}
-+ Mutex lock;
-+ void invalidate_cache(const coll_t &c, const ghobject_t &oid) {
-+ Mutex::Locker l(lock);
-+ caches.clear(oid);
-+ }
-
-- static const uint64_t default_strip_size = 1024;
-+ RandomCache<ghobject_t, pair<coll_t, StripObjectHeaderRef> > caches;
-+ StripObjectMap(KeyValueDB *db): GenericObjectMap(db),
-+ lock("StripObjectMap::lock"),
-+ caches(g_conf->keyvaluestore_header_cache_size)
-+ {}
- };
-
-
- class KeyValueStore : public ObjectStore,
-@@ -160,9 +160,9 @@
- std::string current_fn;
- std::string current_op_seq_fn;
- uuid_d fsid;
-
-- int fsid_fd, op_fd, current_fd;
-+ int fsid_fd, current_fd;
-
- enum kvstore_types kv_type;
-
- deque<uint64_t> snaps;
-@@ -209,41 +209,51 @@
- // 3. Object modify(including omap, xattr)
- // 4. Clone or rename
- struct BufferTransaction {
- typedef pair<coll_t, ghobject_t> uniq_id;
-- typedef map<uniq_id, StripObjectMap::StripObjectHeader> StripHeaderMap;
-+ typedef map<uniq_id, StripObjectMap::StripObjectHeaderRef> StripHeaderMap;
-
- //Dirty records
- StripHeaderMap strip_headers;
-+ list<Context*> finishes;
-
- KeyValueStore *store;
-
-- SequencerPosition spos;
- KeyValueDB::Transaction t;
-
- int lookup_cached_header(const coll_t &cid, const ghobject_t &oid,
-- StripObjectMap::StripObjectHeader **strip_header,
-+ StripObjectMap::StripObjectHeaderRef *strip_header,
- bool create_if_missing);
-- int get_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
-+ int get_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
- const string &prefix, const set<string> &keys,
- map<string, bufferlist> *out);
-- void set_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
-+ void set_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
- const string &prefix, map<string, bufferlist> &bl);
-- int remove_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
-+ int remove_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
- const string &prefix, const set<string> &keys);
-- void clear_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
-+ void clear_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
- const string &prefix);
-- int clear_buffer(StripObjectMap::StripObjectHeader &strip_header);
-- void clone_buffer(StripObjectMap::StripObjectHeader &old_header,
-+ int clear_buffer(StripObjectMap::StripObjectHeaderRef strip_header);
-+ void clone_buffer(StripObjectMap::StripObjectHeaderRef old_header,
- const coll_t &cid, const ghobject_t &oid);
-- void rename_buffer(StripObjectMap::StripObjectHeader &old_header,
-+ void rename_buffer(StripObjectMap::StripObjectHeaderRef old_header,
- const coll_t &cid, const ghobject_t &oid);
- int submit_transaction();
-
-- BufferTransaction(KeyValueStore *store,
-- SequencerPosition &spos): store(store), spos(spos) {
-+ BufferTransaction(KeyValueStore *store): store(store) {
- t = store->backend->get_transaction();
- }
-+
-+ struct InvalidateCacheContext : public Context {
-+ KeyValueStore *store;
-+ const coll_t cid;
-+ const ghobject_t oid;
-+ InvalidateCacheContext(KeyValueStore *s, const coll_t &c, const ghobject_t &oid): store(s), cid(c), oid(oid) {}
-+ void finish(int r) {
-+ if (r == 0)
-+ store->backend->invalidate_cache(cid, oid);
-+ }
-+ };
- };
-
- // -- op workqueue --
- struct Op {
-@@ -256,52 +266,111 @@
- };
- class OpSequencer : public Sequencer_impl {
- Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
- list<Op*> q;
-- list<uint64_t> jq;
- Cond cond;
-+ list<pair<uint64_t, Context*> > flush_commit_waiters;
-+ uint64_t op; // used by flush() to know the sequence of op
- public:
- Sequencer *parent;
- Mutex apply_lock; // for apply mutual exclusion
-+
-+ /// get_max_uncompleted
-+ bool _get_max_uncompleted(
-+ uint64_t *seq ///< [out] max uncompleted seq
-+ ) {
-+ assert(qlock.is_locked());
-+ assert(seq);
-+ *seq = 0;
-+ if (q.empty()) {
-+ return true;
-+ } else {
-+ *seq = q.back()->op;
-+ return false;
-+ }
-+ } /// @returns true if the queue is empty
-+
-+ /// get_min_uncompleted
-+ bool _get_min_uncompleted(
-+ uint64_t *seq ///< [out] min uncompleted seq
-+ ) {
-+ assert(qlock.is_locked());
-+ assert(seq);
-+ *seq = 0;
-+ if (q.empty()) {
-+ return true;
-+ } else {
-+ *seq = q.front()->op;
-+ return false;
-+ }
-+ } /// @returns true if both queues are empty
-+
-+ void _wake_flush_waiters(list<Context*> *to_queue) {
-+ uint64_t seq;
-+ if (_get_min_uncompleted(&seq))
-+ seq = -1;
-+
-+ for (list<pair<uint64_t, Context*> >::iterator i =
-+ flush_commit_waiters.begin();
-+ i != flush_commit_waiters.end() && i->first < seq;
-+ flush_commit_waiters.erase(i++)) {
-+ to_queue->push_back(i->second);
-+ }
-+ }
-
- void queue(Op *o) {
- Mutex::Locker l(qlock);
- q.push_back(o);
-+ op++;
-+ o->op = op;
- }
- Op *peek_queue() {
- assert(apply_lock.is_locked());
- return q.front();
- }
-- Op *dequeue() {
-+
-+ Op *dequeue(list<Context*> *to_queue) {
-+ assert(to_queue);
- assert(apply_lock.is_locked());
- Mutex::Locker l(qlock);
- Op *o = q.front();
- q.pop_front();
- cond.Signal();
-+
-+ _wake_flush_waiters(to_queue);
- return o;
- }
-+
- void flush() {
- Mutex::Locker l(qlock);
-
- // get max for journal _or_ op queues
- uint64_t seq = 0;
- if (!q.empty())
- seq = q.back()->op;
-- if (!jq.empty() && jq.back() > seq)
-- seq = jq.back();
-
- if (seq) {
- // everything prior to our watermark to drain through either/both
- // queues
-- while ((!q.empty() && q.front()->op <= seq) ||
-- (!jq.empty() && jq.front() <= seq))
-+ while (!q.empty() && q.front()->op <= seq)
- cond.Wait(qlock);
- }
- }
-+ bool flush_commit(Context *c) {
-+ Mutex::Locker l(qlock);
-+ uint64_t seq = 0;
-+ if (_get_max_uncompleted(&seq)) {
-+ delete c;
-+ return true;
-+ } else {
-+ flush_commit_waiters.push_back(make_pair(seq, c));
-+ return false;
-+ }
-+ }
-
- OpSequencer()
- : qlock("KeyValueStore::OpSequencer::qlock", false, false),
-- parent(0),
-+ op(0), parent(0),
- apply_lock("KeyValueStore::OpSequencer::apply_lock", false, false) {}
- ~OpSequencer() {
- assert(q.empty());
- }
-@@ -416,9 +485,8 @@
- return _do_transactions(tls, op_seq, 0);
- }
- unsigned _do_transaction(Transaction& transaction,
- BufferTransaction &bt,
-- SequencerPosition& spos,
- ThreadPool::TPHandle *handle);
-
- int queue_transactions(Sequencer *osr, list<Transaction*>& tls,
- TrackedOpRef op = TrackedOpRef(),
-@@ -427,12 +495,12 @@
-
- // ------------------
- // objects
-
-- int _generic_read(StripObjectMap::StripObjectHeader &header,
-+ int _generic_read(StripObjectMap::StripObjectHeaderRef header,
- uint64_t offset, size_t len, bufferlist& bl,
- bool allow_eio = false, BufferTransaction *bt = 0);
-- int _generic_write(StripObjectMap::StripObjectHeader &header,
-+ int _generic_write(StripObjectMap::StripObjectHeaderRef header,
- uint64_t offset, size_t len, const bufferlist& bl,
- BufferTransaction &t, bool replica = false);
-
- bool exists(coll_t cid, const ghobject_t& oid);
-@@ -571,28 +639,8 @@
- static const string OBJECT_OMAP_HEADER_KEY;
- static const string COLLECTION;
- static const string COLLECTION_ATTR;
- static const uint32_t COLLECTION_VERSION = 1;
--
-- class SubmitManager {
-- Mutex lock;
-- uint64_t op_seq;
-- uint64_t op_submitted;
-- public:
-- SubmitManager() :
-- lock("JOS::SubmitManager::lock", false, true, false, g_ceph_context),
-- op_seq(0), op_submitted(0)
-- {}
-- uint64_t op_submit_start();
-- void op_submit_finish(uint64_t op);
-- void set_op_seq(uint64_t seq) {
-- Mutex::Locker l(lock);
-- op_submitted = op_seq = seq;
-- }
-- uint64_t get_op_seq() {
-- return op_seq;
-- }
-- } submit_manager;
- };
-
- WRITE_CLASS_ENCODER(StripObjectMap::StripObjectHeader)
-
---- a/src/os/LFNIndex.cc
-+++ b/src/os/LFNIndex.cc
-@@ -60,8 +60,19 @@
- ++current_failure;
- }
- }
-
-+// Helper to close fd's when we leave scope. This is useful when used
-+// in combination with RetryException, thrown by the above.
-+struct FDCloser {
-+ int fd;
-+ FDCloser(int f) : fd(f) {}
-+ ~FDCloser() {
-+ VOID_TEMP_FAILURE_RETRY(::close(fd));
-+ }
-+};
-+
-+
- /* Public methods */
-
- void LFNIndex::set_ref(ceph::shared_ptr<CollectionIndex> ref)
- {
-@@ -159,11 +170,11 @@
- maybe_inject_failure();
- int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY);
- if (fd < 0)
- return -errno;
-+ FDCloser f(fd);
- maybe_inject_failure();
- int r = ::fsync(fd);
-- VOID_TEMP_FAILURE_RETRY(::close(fd));
- maybe_inject_failure();
- if (r < 0)
- return -errno;
- else
-@@ -752,9 +763,10 @@
- char buf[FILENAME_MAX_LEN + 1];
- for ( ; ; ++i) {
- candidate = lfn_get_short_name(oid, i);
- candidate_path = get_full_path(path, candidate);
-- r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(), buf, sizeof(buf));
-+ r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(),
-+ buf, sizeof(buf));
- if (r < 0) {
- if (errno != ENODATA && errno != ENOENT)
- return -errno;
- if (errno == ENODATA) {
-@@ -783,8 +795,40 @@
- if (exists)
- *exists = 1;
- return 0;
- }
-+ r = chain_getxattr(candidate_path.c_str(), get_alt_lfn_attr().c_str(),
-+ buf, sizeof(buf));
-+ if (r > 0) {
-+ // only consider alt name if nlink > 1
-+ struct stat st;
-+ int rc = ::stat(candidate_path.c_str(), &st);
-+ if (rc < 0)
-+ return -errno;
-+ if (st.st_nlink <= 1) {
-+ // left over from incomplete unlink, remove
-+ maybe_inject_failure();
-+ dout(20) << __func__ << " found extra alt attr for " << candidate_path
-+ << ", long name " << string(buf, r) << dendl;
-+ rc = chain_removexattr(candidate_path.c_str(),
-+ get_alt_lfn_attr().c_str());
-+ maybe_inject_failure();
-+ if (rc < 0)
-+ return rc;
-+ continue;
-+ }
-+ buf[MIN((int)sizeof(buf) - 1, r)] = '\0';
-+ if (!strcmp(buf, full_name.c_str())) {
-+ dout(20) << __func__ << " used alt attr for " << full_name << dendl;
-+ if (mangled_name)
-+ *mangled_name = candidate;
-+ if (out_path)
-+ *out_path = candidate_path;
-+ if (exists)
-+ *exists = 1;
-+ return 0;
-+ }
-+ }
- }
- assert(0); // Unreachable
- return 0;
- }
-@@ -797,9 +841,26 @@
- return 0;
- string full_path = get_full_path(path, mangled_name);
- string full_name = lfn_generate_object_name(oid);
- maybe_inject_failure();
-- return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
-+
-+ // if the main attr exists and is different, move it to the alt attr.
-+ char buf[FILENAME_MAX_LEN + 1];
-+ int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(),
-+ buf, sizeof(buf));
-+ if (r >= 0 && (r != (int)full_name.length() ||
-+ memcmp(buf, full_name.c_str(), full_name.length()))) {
-+ dout(20) << __func__ << " " << mangled_name
-+ << " moving old name to alt attr "
-+ << string(buf, r)
-+ << ", new name is " << full_name << dendl;
-+ r = chain_setxattr(full_path.c_str(), get_alt_lfn_attr().c_str(),
-+ buf, r);
-+ if (r < 0)
-+ return r;
-+ }
-+
-+ return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
- full_name.c_str(), full_name.size());
- }
-
- int LFNIndex::lfn_unlink(const vector<string> &path,
-@@ -838,28 +899,37 @@
- return -errno;
- }
- }
- }
-+ string full_path = get_full_path(path, mangled_name);
-+ int fd = ::open(full_path.c_str(), O_RDONLY);
-+ if (fd < 0)
-+ return -errno;
-+ FDCloser f(fd);
- if (i == removed_index + 1) {
-- string full_path = get_full_path(path, mangled_name);
- maybe_inject_failure();
- int r = ::unlink(full_path.c_str());
- maybe_inject_failure();
- if (r < 0)
- return -errno;
-- else
-- return 0;
- } else {
-- string rename_to = get_full_path(path, mangled_name);
-+ string& rename_to = full_path;
- string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1));
- maybe_inject_failure();
- int r = ::rename(rename_from.c_str(), rename_to.c_str());
- maybe_inject_failure();
- if (r < 0)
- return -errno;
-- else
-- return 0;
- }
-+ struct stat st;
-+ int r = ::fstat(fd, &st);
-+ if (r == 0 && st.st_nlink > 0) {
-+ // remove alt attr
-+ dout(20) << __func__ << " removing alt attr from " << full_path << dendl;
-+ fsync_dir(path);
-+ chain_fremovexattr(fd, get_alt_lfn_attr().c_str());
-+ }
-+ return r;
- }
-
- int LFNIndex::lfn_translate(const vector<string> &path,
- const string &short_name,
---- a/src/os/LFNIndex.h
-+++ b/src/os/LFNIndex.h
-@@ -122,9 +122,9 @@
- error_injection_enabled = false;
- }
-
- private:
-- string lfn_attribute;
-+ string lfn_attribute, lfn_alt_attribute;
- coll_t collection;
-
- public:
- /// Constructor
-@@ -145,9 +145,10 @@
- } else {
- char buf[100];
- snprintf(buf, sizeof(buf), "%d", index_version);
- lfn_attribute = LFN_ATTR + string(buf);
-- }
-+ lfn_alt_attribute = LFN_ATTR + string(buf) + "-alt";
-+ }
- }
-
- coll_t coll() const { return collection; }
-
-@@ -422,8 +423,11 @@
- */
- const string &get_lfn_attr() const {
- return lfn_attribute;
- }
-+ const string &get_alt_lfn_attr() const {
-+ return lfn_alt_attribute;
-+ }
-
- /**
- * Gets the filename corresponsing to oid in path.
- *
---- a/src/os/MemStore.cc
-+++ b/src/os/MemStore.cc
-@@ -949,9 +949,14 @@
- }
- break;
-
- case Transaction::OP_SETALLOCHINT:
-- // nop
-+ {
-+ coll_t cid(i.get_cid());
-+ ghobject_t oid = i.get_oid();
-+ (void)i.get_length(); // discard result
-+ (void)i.get_length(); // discard result
-+ }
- break;
-
- default:
- derr << "bad op " << op << dendl;
---- a/src/os/ObjectStore.cc
-+++ b/src/os/ObjectStore.cc
-@@ -143,9 +143,13 @@
- int ObjectStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
- snapid_t seq, vector<hobject_t> *ls)
- {
- vector<ghobject_t> go;
-- ghobject_t gstart(start), gend(end);
-+ // Starts with the smallest shard id and generation to
-+ // make sure the result list has the marker object
-+ ghobject_t gstart(start, 0, shard_id_t(0));
-+ // Exclusive end, choose the smallest end ghobject
-+ ghobject_t gend(end, 0, shard_id_t(0));
- int ret = collection_list_range(c, gstart, gend, seq, &go);
- if (ret == 0) {
- ls->reserve(go.size());
- for (vector<ghobject_t>::iterator i = go.begin(); i != go.end() ; ++i)
---- a/src/os/ObjectStore.h
-+++ b/src/os/ObjectStore.h
-@@ -127,8 +127,24 @@
- * created in ...::queue_transaction(s)
- */
- struct Sequencer_impl {
- virtual void flush() = 0;
-+
-+ /**
-+ * Async flush_commit
-+ *
-+ * There are two cases:
-+ * 1) sequencer is currently idle: the method returns true and
-+ * c is deleted
-+ * 2) sequencer is not idle: the method returns false and c is
-+ * called asyncronously with a value of 0 once all transactions
-+ * queued on this sequencer prior to the call have been applied
-+ * and committed.
-+ */
-+ virtual bool flush_commit(
-+ Context *c ///< [in] context to call upon flush/commit
-+ ) = 0; ///< @return true if idle, false otherwise
-+
- virtual ~Sequencer_impl() {}
- };
-
- /**
-@@ -152,8 +168,18 @@
- void flush() {
- if (p)
- p->flush();
- }
-+
-+ /// @see Sequencer_impl::flush_commit()
-+ bool flush_commit(Context *c) {
-+ if (!p) {
-+ delete c;
-+ return true;
-+ } else {
-+ return p->flush_commit(c);
-+ }
-+ }
- };
-
- /*********************************
- *
---- a/src/osd/ECBackend.cc
-+++ b/src/osd/ECBackend.cc
-@@ -104,15 +104,15 @@
- }
-
- void ECBackend::ReadOp::dump(Formatter *f) const
- {
-- f->dump_stream("tid") << tid;
-+ f->dump_unsigned("tid", tid);
- if (op && op->get_req()) {
- f->dump_stream("op") << *(op->get_req());
- }
- f->dump_stream("to_read") << to_read;
- f->dump_stream("complete") << complete;
-- f->dump_stream("priority") << priority;
-+ f->dump_int("priority", priority);
- f->dump_stream("obj_to_source") << obj_to_source;
- f->dump_stream("source_to_obj") << source_to_obj;
- f->dump_stream("in_progress") << in_progress;
- }
-@@ -157,9 +157,9 @@
- f->dump_stream("missing_on") << missing_on;
- f->dump_stream("missing_on_shards") << missing_on_shards;
- f->dump_stream("recovery_info") << recovery_info;
- f->dump_stream("recovery_progress") << recovery_progress;
-- f->dump_stream("pending_read") << pending_read;
-+ f->dump_bool("pending_read", pending_read);
- f->dump_stream("state") << tostr(state);
- f->dump_stream("waiting_on_pushes") << waiting_on_pushes;
- f->dump_stream("extent_requested") << extent_requested;
- }
-@@ -828,8 +828,9 @@
- get_parent()->log_operation(
- op.log_entries,
- op.updated_hit_set_history,
- op.trim_to,
-+ op.trim_rollback_to,
- !(op.t.empty()),
- localt);
- localt->append(op.t);
- if (on_local_applied_sync) {
-@@ -1210,8 +1211,9 @@
- const hobject_t &hoid,
- const eversion_t &at_version,
- PGTransaction *_t,
- const eversion_t &trim_to,
-+ const eversion_t &trim_rollback_to,
- vector<pg_log_entry_t> &log_entries,
- boost::optional<pg_hit_set_history_t> &hset_history,
- Context *on_local_applied_sync,
- Context *on_all_applied,
-@@ -1225,8 +1227,9 @@
- Op *op = &(tid_to_op_map[tid]);
- op->hoid = hoid;
- op->version = at_version;
- op->trim_to = trim_to;
-+ op->trim_rollback_to = trim_rollback_to;
- op->log_entries.swap(log_entries);
- std::swap(op->updated_hit_set_history, hset_history);
- op->on_local_applied_sync = on_local_applied_sync;
- op->on_all_applied = on_all_applied;
-@@ -1531,8 +1534,9 @@
- stats,
- should_send ? iter->second : ObjectStore::Transaction(),
- op->version,
- op->trim_to,
-+ op->trim_rollback_to,
- op->log_entries,
- op->updated_hit_set_history,
- op->temp_added,
- op->temp_cleared);
---- a/src/osd/ECBackend.h
-+++ b/src/osd/ECBackend.h
-@@ -96,8 +96,9 @@
- const hobject_t &hoid,
- const eversion_t &at_version,
- PGTransaction *t,
- const eversion_t &trim_to,
-+ const eversion_t &trim_rollback_to,
- vector<pg_log_entry_t> &log_entries,
- boost::optional<pg_hit_set_history_t> &hset_history,
- Context *on_local_applied_sync,
- Context *on_all_applied,
-@@ -325,8 +326,9 @@
- struct Op {
- hobject_t hoid;
- eversion_t version;
- eversion_t trim_to;
-+ eversion_t trim_rollback_to;
- vector<pg_log_entry_t> log_entries;
- boost::optional<pg_hit_set_history_t> updated_hit_set_history;
- Context *on_local_applied_sync;
- Context *on_all_applied;
---- a/src/osd/ECMsgTypes.cc
-+++ b/src/osd/ECMsgTypes.cc
-@@ -15,9 +15,9 @@
- #include "ECMsgTypes.h"
-
- void ECSubWrite::encode(bufferlist &bl) const
- {
-- ENCODE_START(2, 1, bl);
-+ ENCODE_START(3, 1, bl);
- ::encode(from, bl);
- ::encode(tid, bl);
- ::encode(reqid, bl);
- ::encode(soid, bl);
-@@ -28,14 +28,15 @@
- ::encode(log_entries, bl);
- ::encode(temp_added, bl);
- ::encode(temp_removed, bl);
- ::encode(updated_hit_set_history, bl);
-+ ::encode(trim_rollback_to, bl);
- ENCODE_FINISH(bl);
- }
-
- void ECSubWrite::decode(bufferlist::iterator &bl)
- {
-- DECODE_START(2, bl);
-+ DECODE_START(3, bl);
- ::decode(from, bl);
- ::decode(tid, bl);
- ::decode(reqid, bl);
- ::decode(soid, bl);
-@@ -48,8 +49,13 @@
- ::decode(temp_removed, bl);
- if (struct_v >= 2) {
- ::decode(updated_hit_set_history, bl);
- }
-+ if (struct_v >= 3) {
-+ ::decode(trim_rollback_to, bl);
-+ } else {
-+ trim_rollback_to = trim_to;
-+ }
- DECODE_FINISH(bl);
- }
-
- std::ostream &operator<<(
-@@ -57,20 +63,22 @@
- {
- lhs << "ECSubWrite(tid=" << rhs.tid
- << ", reqid=" << rhs.reqid
- << ", at_version=" << rhs.at_version
-- << ", trim_to=" << rhs.trim_to;
-+ << ", trim_to=" << rhs.trim_to
-+ << ", trim_rollback_to=" << rhs.trim_rollback_to;
- if (rhs.updated_hit_set_history)
- lhs << ", has_updated_hit_set_history";
- return lhs << ")";
- }
-
- void ECSubWrite::dump(Formatter *f) const
- {
-- f->dump_stream("tid") << tid;
-+ f->dump_unsigned("tid", tid);
- f->dump_stream("reqid") << reqid;
- f->dump_stream("at_version") << at_version;
- f->dump_stream("trim_to") << trim_to;
-+ f->dump_stream("trim_rollback_to") << trim_rollback_to;
- f->dump_stream("has_updated_hit_set_history")
- << static_cast<bool>(updated_hit_set_history);
- }
-
-@@ -84,8 +92,14 @@
- o.back()->tid = 4;
- o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
- o.back()->at_version = eversion_t(10, 300);
- o.back()->trim_to = eversion_t(5, 42);
-+ o.push_back(new ECSubWrite());
-+ o.back()->tid = 9;
-+ o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
-+ o.back()->at_version = eversion_t(10, 300);
-+ o.back()->trim_to = eversion_t(5, 42);
-+ o.back()->trim_rollback_to = eversion_t(8, 250);
- }
-
- void ECSubWriteReply::encode(bufferlist &bl) const
- {
-@@ -120,9 +134,9 @@
- }
-
- void ECSubWriteReply::dump(Formatter *f) const
- {
-- f->dump_stream("tid") << tid;
-+ f->dump_unsigned("tid", tid);
- f->dump_stream("last_complete") << last_complete;
- f->dump_stream("committed") << committed;
- f->dump_stream("applied") << applied;
- }
-@@ -170,9 +184,9 @@
-
- void ECSubRead::dump(Formatter *f) const
- {
- f->dump_stream("from") << from;
-- f->dump_stream("tid") << tid;
-+ f->dump_unsigned("tid", tid);
- f->open_array_section("objects");
- for (map<hobject_t, list<pair<uint64_t, uint64_t> > >::const_iterator i =
- to_read.begin();
- i != to_read.end();
-@@ -258,9 +272,9 @@
-
- void ECSubReadReply::dump(Formatter *f) const
- {
- f->dump_stream("from") << from;
-- f->dump_stream("tid") << tid;
-+ f->dump_unsigned("tid", tid);
- f->open_array_section("buffers_read");
- for (map<hobject_t, list<pair<uint64_t, bufferlist> > >::const_iterator i =
- buffers_read.begin();
- i != buffers_read.end();
---- a/src/osd/ECMsgTypes.h
-+++ b/src/osd/ECMsgTypes.h
-@@ -27,8 +27,9 @@
- pg_stat_t stats;
- ObjectStore::Transaction t;
- eversion_t at_version;
- eversion_t trim_to;
-+ eversion_t trim_rollback_to;
- vector<pg_log_entry_t> log_entries;
- set<hobject_t> temp_added;
- set<hobject_t> temp_removed;
- boost::optional<pg_hit_set_history_t> updated_hit_set_history;
-@@ -41,16 +42,18 @@
- const pg_stat_t &stats,
- const ObjectStore::Transaction &t,
- eversion_t at_version,
- eversion_t trim_to,
-+ eversion_t trim_rollback_to,
- vector<pg_log_entry_t> log_entries,
- boost::optional<pg_hit_set_history_t> updated_hit_set_history,
- const set<hobject_t> &temp_added,
- const set<hobject_t> &temp_removed)
- : from(from), tid(tid), reqid(reqid),
- soid(soid), stats(stats), t(t),
- at_version(at_version),
-- trim_to(trim_to), log_entries(log_entries),
-+ trim_to(trim_to), trim_rollback_to(trim_rollback_to),
-+ log_entries(log_entries),
- temp_added(temp_added),
- temp_removed(temp_removed),
- updated_hit_set_history(updated_hit_set_history) {}
- void encode(bufferlist &bl) const;
---- a/src/osd/HitSet.h
-+++ b/src/osd/HitSet.h
-@@ -368,9 +368,9 @@
- double get_fpp() const {
- return (double)fpp_micro / 1000000.0;
- }
- void set_fpp(double f) {
-- fpp_micro = (unsigned)(f * 1000000.0);
-+ fpp_micro = (unsigned)(llrintl(f * (double)1000000.0));
- }
-
- void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
---- a/src/osd/OSD.cc
-+++ b/src/osd/OSD.cc
-@@ -41,8 +41,9 @@
- #include "osdc/Objecter.h"
-
- #include "common/ceph_argparse.h"
- #include "common/version.h"
-+#include "common/io_priority.h"
-
- #include "os/ObjectStore.h"
-
- #include "ReplicatedPG.h"
-@@ -190,8 +191,9 @@
- rep_scrub_wq(osd->rep_scrub_wq),
- push_wq("push_wq", cct->_conf->osd_recovery_thread_timeout, &osd->recovery_tp),
- gen_wq("gen_wq", cct->_conf->osd_recovery_thread_timeout, &osd->recovery_tp),
- class_handler(osd->class_handler),
-+ pg_epoch_lock("OSDService::pg_epoch_lock"),
- publish_lock("OSDService::publish_lock"),
- pre_publish_lock("OSDService::pre_publish_lock"),
- sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
- scrubs_active(0),
-@@ -1276,8 +1278,10 @@
- recovery_tp.start();
- disk_tp.start();
- command_tp.start();
-
-+ set_disk_tp_priority();
-+
- // start the heartbeat
- heartbeat_thread.create();
-
- // tick
-@@ -1304,8 +1308,10 @@
- osd_lock.Lock();
- if (is_stopping())
- return 0;
-
-+ check_config();
-+
- dout(10) << "ensuring pgs have consumed prior maps" << dendl;
- consume_map();
- peering_wq.drain();
-
-@@ -1662,10 +1668,12 @@
- recovery_tp.stop();
- dout(10) << "recovery tp stopped" << dendl;
-
- op_tp.drain();
-+ peering_wq.clear();
-+ scrub_finalize_wq.clear();
- op_tp.stop();
-- dout(10) << "op tp stopped" << dendl;
-+ dout(10) << "osd tp stopped" << dendl;
-
- command_tp.drain();
- command_tp.stop();
- dout(10) << "command tp stopped" << dendl;
-@@ -1707,9 +1715,8 @@
- Mutex::Locker l(pg_stat_queue_lock);
- assert(pg_stat_queue.empty());
- }
-
-- peering_wq.clear();
- // Remove PGs
- #ifdef PG_DEBUG_REFS
- service.dump_live_pgids();
- #endif
-@@ -1853,8 +1860,10 @@
- PG* pg = _make_pg(createmap, pgid);
-
- pg_map[pgid] = pg;
-
-+ service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
-+
- pg->lock(no_lockdep_check);
- pg->get("PGMap"); // because it's in pg_map
- return pg;
- }
-@@ -1884,8 +1893,9 @@
- {
- epoch_t e(service.get_osdmap()->get_epoch());
- pg->get("PGMap"); // For pg_map
- pg_map[pg->info.pgid] = pg;
-+ service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
- dout(10) << "Adding newly split pg " << *pg << dendl;
- vector<int> up, acting;
- pg->get_osdmap()->pg_to_up_acting_osds(pg->info.pgid.pgid, up, acting);
- int role = OSDMap::calc_pg_role(service.whoami, acting);
-@@ -4391,11 +4401,10 @@
- // 1MB block sizes are big enough so that we get more stuff done.
- // However, to avoid the osd from getting hung on this and having
- // timers being triggered, we are going to limit the count assuming
- // a configurable throughput and duration.
-- int64_t total_throughput =
-+ int64_t max_count =
- g_conf->osd_bench_large_size_max_throughput * duration;
-- int64_t max_count = (int64_t) (total_throughput / bsize);
- if (count > max_count) {
- ss << "'count' values greater than " << max_count
- << " for a block size of " << prettybyte_t(bsize) << ", assuming "
- << prettybyte_t(g_conf->osd_bench_large_size_max_throughput) << "/s,"
-@@ -5712,13 +5721,14 @@
- client_messenger->set_default_policy(p);
- }
- }
- {
-- Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_MON);
-+ Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
- uint64_t mask;
- uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
- if ((p.features_required & mask) != features) {
- dout(0) << "crush map has features " << features
-+ << " was " << p.features_required
- << ", adjusting msgr requires for mons" << dendl;
- p.features_required = (p.features_required & ~mask) | features;
- client_messenger->set_policy(entity_name_t::TYPE_MON, p);
- }
-@@ -5747,9 +5757,9 @@
- }
- }
- }
-
--void OSD::advance_pg(
-+bool OSD::advance_pg(
- epoch_t osd_epoch, PG *pg,
- ThreadPool::TPHandle &handle,
- PG::RecoveryCtx *rctx,
- set<boost::intrusive_ptr<PG> > *new_pgs)
-@@ -5758,13 +5768,21 @@
- epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
- OSDMapRef lastmap = pg->get_osdmap();
-
- if (lastmap->get_epoch() == osd_epoch)
-- return;
-+ return true;
- assert(lastmap->get_epoch() < osd_epoch);
-
-+ epoch_t min_epoch = service.get_min_pg_epoch();
-+ epoch_t max;
-+ if (min_epoch) {
-+ max = min_epoch + g_conf->osd_map_max_advance;
-+ } else {
-+ max = next_epoch + g_conf->osd_map_max_advance;
-+ }
-+
- for (;
-- next_epoch <= osd_epoch;
-+ next_epoch <= osd_epoch && next_epoch <= max;
- ++next_epoch) {
- OSDMapRef nextmap = service.try_get_map(next_epoch);
- if (!nextmap)
- continue;
-@@ -5794,9 +5812,17 @@
-
- lastmap = nextmap;
- handle.reset_tp_timeout();
- }
-+ service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
- pg->handle_activate_map(rctx);
-+ if (next_epoch <= osd_epoch) {
-+ dout(10) << __func__ << " advanced by max " << g_conf->osd_map_max_advance
-+ << " past min epoch " << min_epoch
-+ << " ... will requeue " << *pg << dendl;
-+ return false;
-+ }
-+ return true;
- }
-
- /**
- * scan placement groups, initiate any replication
-@@ -6126,9 +6152,9 @@
- }
- return true;
- }
-
--bool OSD::require_osd_peer(OpRequestRef op)
-+bool OSD::require_osd_peer(OpRequestRef& op)
- {
- if (!op->get_req()->get_connection()->peer_is_osd()) {
- dout(0) << "require_osd_peer received from non-osd " << op->get_req()->get_connection()->get_peer_addr()
- << " " << *op->get_req() << dendl;
-@@ -6136,13 +6162,66 @@
- }
- return true;
- }
-
-+bool OSD::require_self_aliveness(OpRequestRef& op, epoch_t epoch)
-+{
-+ if (epoch < up_epoch) {
-+ dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
-+ return false;
-+ }
-+
-+ if (!is_active()) {
-+ dout(7) << "still in boot state, dropping message " << *op->get_req() << dendl;
-+ return false;
-+ }
-+
-+ return true;
-+}
-+
-+bool OSD::require_same_peer_instance(OpRequestRef& op, OSDMapRef& map)
-+{
-+ Message *m = op->get_req();
-+ int from = m->get_source().num();
-+
-+ if (!map->have_inst(from) ||
-+ (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
-+ dout(5) << "from dead osd." << from << ", marking down, "
-+ << " msg was " << m->get_source_inst().addr
-+ << " expected " << (map->have_inst(from) ?
-+ map->get_cluster_addr(from) : entity_addr_t())
-+ << dendl;
-+ ConnectionRef con = m->get_connection();
-+ cluster_messenger->mark_down(con.get());
-+ Session *s = static_cast<Session*>(con->get_priv());
-+ if (s) {
-+ con->set_priv(NULL); // break ref <-> session cycle, if any
-+ s->put();
-+ }
-+ return false;
-+ }
-+ return true;
-+}
-+
-+bool OSD::require_up_osd_peer(OpRequestRef& op, OSDMapRef& map,
-+ epoch_t their_epoch)
-+{
-+ if (!require_self_aliveness(op, their_epoch)) {
-+ return false;
-+ } else if (!require_osd_peer(op)) {
-+ return false;
-+ } else if (map->get_epoch() >= their_epoch &&
-+ !require_same_peer_instance(op, map)) {
-+ return false;
-+ }
-+ return true;
-+}
-+
- /*
- * require that we have same (or newer) map, and that
- * the source is the pg primary.
- */
--bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch)
-+bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch)
- {
- Message *m = op->get_req();
- dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
-
-@@ -6154,32 +6233,15 @@
- wait_for_new_map(op);
- return false;
- }
-
-- if (epoch < up_epoch) {
-- dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
-+ if (!require_self_aliveness(op, epoch)) {
- return false;
- }
-
- // ok, our map is same or newer.. do they still exist?
-- if (m->get_connection()->get_messenger() == cluster_messenger) {
-- int from = m->get_source().num();
-- if (!osdmap->have_inst(from) ||
-- osdmap->get_cluster_addr(from) != m->get_source_inst().addr) {
-- dout(5) << "from dead osd." << from << ", marking down, "
-- << " msg was " << m->get_source_inst().addr
-- << " expected " << (osdmap->have_inst(from) ? osdmap->get_cluster_addr(from) : entity_addr_t())
-- << dendl;
-- ConnectionRef con = m->get_connection();
-- con->set_priv(NULL); // break ref <-> session cycle, if any
-- cluster_messenger->mark_down(con.get());
-- return false;
-- }
-- }
--
-- // ok, we have at least as new a map as they do. are we (re)booting?
-- if (!is_active()) {
-- dout(7) << "still in boot state, dropping message " << *m << dendl;
-+ if (m->get_connection()->get_messenger() == cluster_messenger &&
-+ !require_same_peer_instance(op, osdmap)) {
- return false;
- }
-
- return true;
-@@ -7141,8 +7203,10 @@
- PGRef(pg))
- );
- remove_wq.queue(make_pair(PGRef(pg), deleting));
-
-+ service.pg_remove_epoch(pg->info.pgid);
-+
- // remove from map
- pg_map.erase(pg->info.pgid);
- pg->put("PGMap"); // since we've taken it out of map
- }
-@@ -7554,9 +7618,9 @@
- dout(3) << "replica op from before up" << dendl;
- return;
- }
-
-- if (!require_osd_peer(op))
-+ if (!require_up_osd_peer(op, osdmap, m->map_epoch))
- return;
-
- // must be a rep op.
- assert(m->get_source().is_osd());
-@@ -7769,10 +7833,11 @@
- if (pg->deleting) {
- pg->unlock();
- continue;
- }
-- advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs);
-- if (!pg->peering_queue.empty()) {
-+ if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
-+ pg->queue_null(curmap->get_epoch(), curmap->get_epoch());
-+ } else if (!pg->peering_queue.empty()) {
- PG::CephPeeringEvtRef evt = pg->peering_queue.front();
- pg->peering_queue.pop_front();
- pg->handle_peering_event(evt, &rctx);
- }
-@@ -7807,8 +7872,13 @@
- static const char* KEYS[] = {
- "osd_max_backfills",
- "osd_op_complaint_time", "osd_op_log_threshold",
- "osd_op_history_size", "osd_op_history_duration",
-+ "osd_map_cache_size",
-+ "osd_map_max_advance",
-+ "osd_pg_epoch_persisted_max_stale",
-+ "osd_disk_thread_ioprio_class",
-+ "osd_disk_thread_ioprio_priority",
- NULL
- };
- return KEYS;
- }
-@@ -7829,8 +7899,40 @@
- changed.count("osd_op_history_duration")) {
- op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
- cct->_conf->osd_op_history_duration);
- }
-+ if (changed.count("osd_disk_thread_ioprio_class") ||
-+ changed.count("osd_disk_thread_ioprio_priority")) {
-+ set_disk_tp_priority();
-+ }
-+
-+ check_config();
-+}
-+
-+void OSD::check_config()
-+{
-+ // some sanity checks
-+ if (g_conf->osd_map_cache_size <= g_conf->osd_map_max_advance + 2) {
-+ clog.warn() << "osd_map_cache_size (" << g_conf->osd_map_cache_size << ")"
-+ << " is not > osd_map_max_advance ("
-+ << g_conf->osd_map_max_advance << ")";
-+ }
-+ if (g_conf->osd_map_cache_size <= (int)g_conf->osd_pg_epoch_persisted_max_stale + 2) {
-+ clog.warn() << "osd_map_cache_size (" << g_conf->osd_map_cache_size << ")"
-+ << " is not > osd_pg_epoch_persisted_max_stale ("
-+ << g_conf->osd_pg_epoch_persisted_max_stale << ")";
-+ }
-+}
-+
-+void OSD::set_disk_tp_priority()
-+{
-+ dout(10) << __func__
-+ << " class " << cct->_conf->osd_disk_thread_ioprio_class
-+ << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
-+ << dendl;
-+ int cls =
-+ ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
-+ disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
- }
-
- // --------------------------------
-
---- a/src/osd/OSD.h
-+++ b/src/osd/OSD.h
-@@ -333,8 +333,44 @@
- ClassHandler *&class_handler;
-
- void dequeue_pg(PG *pg, list<OpRequestRef> *dequeued);
-
-+ // -- map epoch lower bound --
-+ Mutex pg_epoch_lock;
-+ multiset<epoch_t> pg_epochs;
-+ map<spg_t,epoch_t> pg_epoch;
-+
-+ void pg_add_epoch(spg_t pgid, epoch_t epoch) {
-+ Mutex::Locker l(pg_epoch_lock);
-+ map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
-+ assert(t == pg_epoch.end());
-+ pg_epoch[pgid] = epoch;
-+ pg_epochs.insert(epoch);
-+ }
-+ void pg_update_epoch(spg_t pgid, epoch_t epoch) {
-+ Mutex::Locker l(pg_epoch_lock);
-+ map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
-+ assert(t != pg_epoch.end());
-+ pg_epochs.erase(pg_epochs.find(t->second));
-+ t->second = epoch;
-+ pg_epochs.insert(epoch);
-+ }
-+ void pg_remove_epoch(spg_t pgid) {
-+ Mutex::Locker l(pg_epoch_lock);
-+ map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
-+ if (t != pg_epoch.end()) {
-+ pg_epochs.erase(pg_epochs.find(t->second));
-+ pg_epoch.erase(t);
-+ }
-+ }
-+ epoch_t get_min_pg_epoch() {
-+ Mutex::Locker l(pg_epoch_lock);
-+ if (pg_epochs.empty())
-+ return 0;
-+ else
-+ return *pg_epochs.begin();
-+ }
-+
- // -- superblock --
- Mutex publish_lock, pre_publish_lock; // pre-publish orders before publish
- OSDSuperblock superblock;
- OSDSuperblock get_superblock() {
-@@ -783,8 +819,9 @@
- // config observer bits
- virtual const char** get_tracked_conf_keys() const;
- virtual void handle_conf_change(const struct md_config_t *conf,
- const std::set <std::string> &changed);
-+ void check_config();
-
- protected:
- Mutex osd_lock; // global lock
- SafeTimer tick_timer; // safe timer (osd_lock)
-@@ -943,8 +980,10 @@
- ThreadPool command_tp;
-
- bool paused_recovery;
-
-+ void set_disk_tp_priority();
-+
- // -- sessions --
- public:
- struct Session : public RefCountedObject {
- EntityName entity_name;
-@@ -1254,9 +1293,9 @@
- void handle_osd_map(class MOSDMap *m);
- void note_down_osd(int osd);
- void note_up_osd(int osd);
-
-- void advance_pg(
-+ bool advance_pg(
- epoch_t advance_to, PG *pg,
- ThreadPool::TPHandle &handle,
- PG::RecoveryCtx *rctx,
- set<boost::intrusive_ptr<PG> > *split_pgs
-@@ -1512,11 +1551,24 @@
- OSDMapRef map);
- void repeer(PG *pg, map< int, map<spg_t,pg_query_t> >& query_map);
-
- bool require_mon_peer(Message *m);
-- bool require_osd_peer(OpRequestRef op);
-+ bool require_osd_peer(OpRequestRef& op);
-+ /***
-+ * Verifies that we were alive in the given epoch, and that
-+ * still are.
-+ */
-+ bool require_self_aliveness(OpRequestRef& op, epoch_t alive_since);
-+ /**
-+ * Verifies that the OSD who sent the given op has the same
-+ * address as in the given map.
-+ * @pre op was sent by an OSD using the cluster messenger
-+ */
-+ bool require_same_peer_instance(OpRequestRef& op, OSDMapRef& map);
-+ bool require_up_osd_peer(OpRequestRef& Op, OSDMapRef& map,
-+ epoch_t their_epoch);
-
-- bool require_same_or_newer_map(OpRequestRef op, epoch_t e);
-+ bool require_same_or_newer_map(OpRequestRef& op, epoch_t e);
-
- void handle_pg_query(OpRequestRef op);
- void handle_pg_notify(OpRequestRef op);
- void handle_pg_log(OpRequestRef op);
---- a/src/osd/OSDMap.cc
-+++ b/src/osd/OSDMap.cc
-@@ -958,12 +958,9 @@
- if (crush->has_nondefault_tunables())
- features |= CEPH_FEATURE_CRUSH_TUNABLES;
- if (crush->has_nondefault_tunables2())
- features |= CEPH_FEATURE_CRUSH_TUNABLES2;
-- if (crush->has_v2_rules())
-- features |= CEPH_FEATURE_CRUSH_V2;
-- if (crush->has_nondefault_tunables3() ||
-- crush->has_v3_rules())
-+ if (crush->has_nondefault_tunables3())
- features |= CEPH_FEATURE_CRUSH_TUNABLES3;
- mask |= CEPH_FEATURES_CRUSH;
-
- for (map<int64_t,pg_pool_t>::const_iterator p = pools.begin(); p != pools.end(); ++p) {
-@@ -977,8 +974,17 @@
- if (!p->second.tiers.empty() ||
- p->second.is_tier()) {
- features |= CEPH_FEATURE_OSD_CACHEPOOL;
- }
-+ int ruleid = crush->find_rule(p->second.get_crush_ruleset(),
-+ p->second.get_type(),
-+ p->second.get_size());
-+ if (ruleid >= 0) {
-+ if (crush->is_v2_rule(ruleid))
-+ features |= CEPH_FEATURE_CRUSH_V2;
-+ if (crush->is_v3_rule(ruleid))
-+ features |= CEPH_FEATURE_CRUSH_TUNABLES3;
-+ }
- }
- mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
- if (entity_type != CEPH_ENTITY_TYPE_CLIENT)
- mask |= CEPH_FEATURE_OSD_ERASURE_CODES;
-@@ -1800,9 +1806,17 @@
- {
- ENCODE_START(1, 1, bl); // extended, osd-only data
- ::encode(osd_addrs->hb_back_addr, bl);
- ::encode(osd_info, bl);
-- ::encode(blacklist, bl);
-+ {
-+ // put this in a sorted, ordered map<> so that we encode in a
-+ // deterministic order.
-+ map<entity_addr_t,utime_t> blacklist_map;
-+ for (ceph::unordered_map<entity_addr_t,utime_t>::const_iterator p =
-+ blacklist.begin(); p != blacklist.end(); ++p)
-+ blacklist_map.insert(make_pair(p->first, p->second));
-+ ::encode(blacklist_map, bl);
-+ }
- ::encode(osd_addrs->cluster_addr, bl);
- ::encode(cluster_snapshot_epoch, bl);
- ::encode(cluster_snapshot, bl);
- ::encode(*osd_uuid, bl);
-@@ -2158,8 +2172,9 @@
- o.push_back(new OSDMap);
- uuid_d fsid;
- o.back()->build_simple(cct, 1, fsid, 16, 7, 8);
- o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
-+ o.back()->blacklist[entity_addr_t()] = utime_t(5, 6);
- cct->put();
- }
-
- string OSDMap::get_flag_string(unsigned f)
-@@ -2550,15 +2565,27 @@
- set_state(i, 0);
- set_weight(i, CEPH_OSD_OUT);
- }
-
-- map<string,string> erasure_code_profile_map;
-- r = get_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
-- ss,
-- &erasure_code_profile_map);
-- erasure_code_profile_map["directory"] =
-+ map<string,string> profile_map;
-+ r = get_erasure_code_profile_default(cct, profile_map, &ss);
-+ if (r < 0) {
-+ lderr(cct) << ss.str() << dendl;
-+ return r;
-+ }
-+ set_erasure_code_profile("default", profile_map);
-+ return 0;
-+}
-+
-+int OSDMap::get_erasure_code_profile_default(CephContext *cct,
-+ map<string,string> &profile_map,
-+ ostream *ss)
-+{
-+ int r = get_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
-+ *ss,
-+ &profile_map);
-+ profile_map["directory"] =
- cct->_conf->osd_pool_default_erasure_code_directory;
-- set_erasure_code_profile("default", erasure_code_profile_map);
- return r;
- }
-
- int OSDMap::_build_crush_types(CrushWrapper& crush)
---- a/src/osd/OSDMap.h
-+++ b/src/osd/OSDMap.h
-@@ -379,8 +379,11 @@
- map<string,map<string,string> >::const_iterator i =
- erasure_code_profiles.find(name);
- return i != erasure_code_profiles.end();
- }
-+ int get_erasure_code_profile_default(CephContext *cct,
-+ map<string,string> &profile_map,
-+ ostream *ss);
- void set_erasure_code_profile(const string &name,
- const map<string,string> &profile) {
- erasure_code_profiles[name] = profile;
- }
---- a/src/osd/OpRequest.cc
-+++ b/src/osd/OpRequest.cc
-@@ -32,9 +32,9 @@
- f->open_object_section("client_info");
- stringstream client_name;
- client_name << m->get_orig_source();
- f->dump_string("client", client_name.str());
-- f->dump_int("tid", m->get_tid());
-+ f->dump_unsigned("tid", m->get_tid());
- f->close_section(); // client_info
- }
- {
- f->open_array_section("events");
---- a/src/osd/OpRequest.h
-+++ b/src/osd/OpRequest.h
-@@ -73,8 +73,12 @@
- void set_pg_op();
-
- void _dump(utime_t now, Formatter *f) const;
-
-+ bool has_feature(uint64_t f) const {
-+ return request->get_connection()->has_feature(f);
-+ }
-+
- private:
- osd_reqid_t reqid;
- uint8_t hit_flag_points;
- uint8_t latest_flag_point;
---- a/src/osd/PG.cc
-+++ b/src/osd/PG.cc
-@@ -1442,9 +1442,9 @@
- last_update_ondisk = info.last_update;
- min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
- }
- last_update_applied = info.last_update;
--
-+ last_rollback_info_trimmed_to_applied = pg_log.get_rollback_trimmed_to();
-
- need_up_thru = false;
-
- // write pg info, log
-@@ -2640,9 +2640,12 @@
- }
-
-
- void PG::append_log(
-- vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t,
-+ vector<pg_log_entry_t>& logv,
-+ eversion_t trim_to,
-+ eversion_t trim_rollback_to,
-+ ObjectStore::Transaction &t,
- bool transaction_applied)
- {
- if (transaction_applied)
- update_snap_map(logv, t);
-@@ -2654,15 +2657,35 @@
- ++p) {
- p->offset = 0;
- add_log_entry(*p, keys[p->get_key_name()]);
- }
-- if (!transaction_applied)
-- pg_log.clear_can_rollback_to();
-+
-+ PGLogEntryHandler handler;
-+ if (!transaction_applied) {
-+ pg_log.clear_can_rollback_to(&handler);
-+ t.register_on_applied(
-+ new C_UpdateLastRollbackInfoTrimmedToApplied(
-+ this,
-+ get_osdmap()->get_epoch(),
-+ info.last_update));
-+ } else if (trim_rollback_to > pg_log.get_rollback_trimmed_to()) {
-+ pg_log.trim_rollback_info(
-+ trim_rollback_to,
-+ &handler);
-+ t.register_on_applied(
-+ new C_UpdateLastRollbackInfoTrimmedToApplied(
-+ this,
-+ get_osdmap()->get_epoch(),
-+ trim_rollback_to));
-+ }
-
- dout(10) << "append_log adding " << keys.size() << " keys" << dendl;
- t.omap_setkeys(coll_t::META_COLL, log_oid, keys);
-- PGLogEntryHandler handler;
-+
- pg_log.trim(&handler, trim_to, info);
-+
-+ dout(10) << __func__ << ": trimming to " << trim_rollback_to
-+ << " entries " << handler.to_trim << dendl;
- handler.apply(this, &t);
-
- // update the local pg, pg log
- dirty_info = true;
-@@ -3003,9 +3026,10 @@
- }
-
- void PG::reg_next_scrub()
- {
-- if (scrubber.must_scrub) {
-+ if (scrubber.must_scrub ||
-+ (info.stats.stats_invalid && g_conf->osd_scrub_invalid_stats)) {
- scrubber.scrub_reg_stamp = utime_t();
- } else {
- scrubber.scrub_reg_stamp = info.history.last_scrub_stamp;
- }
-@@ -3261,8 +3285,36 @@
- osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
- }
- }
-
-+void PG::_scan_rollback_obs(
-+ const vector<ghobject_t> &rollback_obs,
-+ ThreadPool::TPHandle &handle)
-+{
-+ ObjectStore::Transaction *t = NULL;
-+ eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
-+ for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
-+ i != rollback_obs.end();
-+ ++i) {
-+ if (i->generation < trimmed_to.version) {
-+ osd->clog.error() << "osd." << osd->whoami
-+ << " pg " << info.pgid
-+ << " found obsolete rollback obj "
-+ << *i << " generation < trimmed_to "
-+ << trimmed_to
-+ << "...repaired";
-+ if (!t)
-+ t = new ObjectStore::Transaction;
-+ t->remove(coll, *i);
-+ }
-+ }
-+ if (t) {
-+ derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
-+ << dendl;
-+ osd->store->queue_transaction_and_cleanup(osr.get(), t);
-+ }
-+}
-+
- void PG::_scan_snaps(ScrubMap &smap)
- {
- for (map<hobject_t, ScrubMap::object>::iterator i = smap.objects.begin();
- i != smap.objects.end();
-@@ -3348,15 +3400,23 @@
- map.valid_through = info.last_update;
-
- // objects
- vector<hobject_t> ls;
-- int ret = get_pgbackend()->objects_list_range(start, end, 0, &ls);
-+ vector<ghobject_t> rollback_obs;
-+ int ret = get_pgbackend()->objects_list_range(
-+ start,
-+ end,
-+ 0,
-+ &ls,
-+ &rollback_obs);
- if (ret < 0) {
- dout(5) << "objects_list_range error: " << ret << dendl;
- return ret;
- }
-
-+
- get_pgbackend()->be_scan_list(map, ls, deep, handle);
-+ _scan_rollback_obs(rollback_obs, handle);
- _scan_snaps(map);
-
- // pg attrs
- osd->store->collection_getattrs(coll, map.attrs);
-@@ -3577,8 +3637,19 @@
- */
- void PG::scrub(ThreadPool::TPHandle &handle)
- {
- lock();
-+ if (g_conf->osd_scrub_sleep > 0 &&
-+ (scrubber.state == PG::Scrubber::NEW_CHUNK ||
-+ scrubber.state == PG::Scrubber::INACTIVE)) {
-+ dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
-+ unlock();
-+ utime_t t;
-+ t.set_from_double(g_conf->osd_scrub_sleep);
-+ t.sleep();
-+ lock();
-+ dout(20) << __func__ << " slept for " << t << dendl;
-+ }
- if (deleting) {
- unlock();
- return;
- }
-@@ -4630,8 +4701,23 @@
- on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
- on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
- }
-
-+void PG::reset_interval_flush()
-+{
-+ dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
-+ recovery_state.clear_blocked_outgoing();
-+
-+ if (!osr->flush_commit(
-+ new QueuePeeringEvt<IntervalFlush>(
-+ this, get_osdmap()->get_epoch(), IntervalFlush()))) {
-+ dout(10) << "Beginning to block outgoing recovery messages" << dendl;
-+ recovery_state.begin_block_outgoing();
-+ } else {
-+ dout(10) << "Not blocking outgoing recovery messages" << dendl;
-+ }
-+}
-+
- /* Called before initializing peering during advance_map */
- void PG::start_peering_interval(
- const OSDMapRef lastmap,
- const vector<int>& newup, int new_up_primary,
-@@ -4640,8 +4726,9 @@
- {
- const OSDMapRef osdmap = get_osdmap();
-
- set_last_peering_reset();
-+ reset_interval_flush();
-
- vector<int> oldacting, oldup;
- int oldrole = get_role();
-
-@@ -5049,9 +5136,9 @@
- return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
- case MSG_OSD_PG_PUSH_REPLY:
- return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
- case MSG_OSD_SUBOPREPLY:
-- return false;
-+ return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
-
- case MSG_OSD_EC_WRITE:
- return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
- case MSG_OSD_EC_WRITE_REPLY:
-@@ -5385,8 +5472,17 @@
- context< RecoveryMachine >().log_enter(state_name);
- }
-
- boost::statechart::result
-+PG::RecoveryState::Started::react(const IntervalFlush&)
-+{
-+ dout(10) << "Ending blocked outgoing recovery messages" << dendl;
-+ context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
-+ return discard_event();
-+}
-+
-+
-+boost::statechart::result
- PG::RecoveryState::Started::react(const FlushedEvt&)
- {
- PG *pg = context< RecoveryMachine >().pg;
- pg->on_flushed();
-@@ -5435,8 +5531,9 @@
- NamedState(context< RecoveryMachine >().pg->cct, "Reset")
- {
- context< RecoveryMachine >().log_enter(state_name);
- PG *pg = context< RecoveryMachine >().pg;
-+
- pg->flushes_in_progress = 0;
- pg->set_last_peering_reset();
- }
-
-@@ -5447,8 +5544,16 @@
- pg->on_flushed();
- return discard_event();
- }
-
-+boost::statechart::result
-+PG::RecoveryState::Reset::react(const IntervalFlush&)
-+{
-+ dout(10) << "Ending blocked outgoing recovery messages" << dendl;
-+ context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
-+ return discard_event();
-+}
-+
- boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
- {
- PG *pg = context< RecoveryMachine >().pg;
- dout(10) << "Reset advmap" << dendl;
-@@ -5829,8 +5934,20 @@
- {
- context< RecoveryMachine >().log_enter(state_name);
- }
-
-+boost::statechart::result
-+PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
-+{
-+ return discard_event();
-+}
-+
-+boost::statechart::result
-+PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
-+{
-+ return discard_event();
-+}
-+
- void PG::RecoveryState::NotBackfilling::exit()
- {
- context< RecoveryMachine >().log_exit(state_name, enter_time);
- PG *pg = context< RecoveryMachine >().pg;
-@@ -6587,19 +6704,23 @@
- PG *pg = context< RecoveryMachine >().pg;
- MOSDPGLog *msg = logevt.msg.get();
- dout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
-
-+ ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
- if (msg->info.last_backfill == hobject_t()) {
- // restart backfill
- pg->unreg_next_scrub();
- pg->info = msg->info;
- pg->reg_next_scrub();
- pg->dirty_info = true;
- pg->dirty_big_info = true; // maybe.
-- pg->pg_log.claim_log(msg->log);
-+
-+ PGLogEntryHandler rollbacker;
-+ pg->pg_log.claim_log_and_clear_rollback_info(msg->log, &rollbacker);
-+ rollbacker.apply(pg, t);
-+
- pg->pg_log.reset_backfill();
- } else {
-- ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
- pg->merge_log(*t, msg->info, msg->log, logevt.from);
- }
-
- assert(pg->pg_log.get_head() == pg->info.last_update);
-@@ -7491,20 +7612,53 @@
- }
-
- void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
- assert(!rctx);
-- rctx = new_ctx;
-- if (rctx)
-+ assert(!orig_ctx);
-+ orig_ctx = new_ctx;
-+ if (new_ctx) {
-+ if (messages_pending_flush) {
-+ rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
-+ } else {
-+ rctx = *new_ctx;
-+ }
- rctx->start_time = ceph_clock_now(pg->cct);
-+ }
-+}
-+
-+void PG::RecoveryState::begin_block_outgoing() {
-+ assert(!messages_pending_flush);
-+ assert(orig_ctx);
-+ assert(rctx);
-+ messages_pending_flush = BufferedRecoveryMessages();
-+ rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
-+}
-+
-+void PG::RecoveryState::clear_blocked_outgoing() {
-+ assert(orig_ctx);
-+ assert(rctx);
-+ messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
-+}
-+
-+void PG::RecoveryState::end_block_outgoing() {
-+ assert(messages_pending_flush);
-+ assert(orig_ctx);
-+ assert(rctx);
-+
-+ rctx = RecoveryCtx(*orig_ctx);
-+ rctx->accept_buffered_messages(*messages_pending_flush);
-+ messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
- }
-
- void PG::RecoveryState::end_handle() {
- if (rctx) {
- utime_t dur = ceph_clock_now(pg->cct) - rctx->start_time;
- machine.event_time += dur;
- }
-+
- machine.event_count++;
-- rctx = 0;
-+ rctx = boost::optional<RecoveryCtx>();
-+ orig_ctx = NULL;
- }
-
- void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
- void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
---- a/src/osd/PG.h
-+++ b/src/osd/PG.h
-@@ -446,8 +446,27 @@
- eversion_t last_update_ondisk; // last_update that has committed; ONLY DEFINED WHEN is_active()
- eversion_t last_complete_ondisk; // last_complete that has committed.
- eversion_t last_update_applied;
-
-+
-+ struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
-+ PGRef pg;
-+ epoch_t e;
-+ eversion_t v;
-+ C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
-+ : pg(pg), e(e), v(v) {}
-+ void finish(int) {
-+ pg->lock();
-+ if (!pg->pg_has_reset_since(e)) {
-+ pg->last_rollback_info_trimmed_to_applied = v;
-+ }
-+ pg->unlock();
-+ }
-+ };
-+ // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
-+ // and the transaction has applied
-+ eversion_t last_rollback_info_trimmed_to_applied;
-+
- // primary state
- public:
- pg_shard_t primary;
- pg_shard_t pg_whoami;
-@@ -486,8 +505,14 @@
- bool may_need_replay(const OSDMapRef osdmap) const;
-
-
- public:
-+ struct BufferedRecoveryMessages {
-+ map<int, map<spg_t, pg_query_t> > query_map;
-+ map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > info_map;
-+ map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > notify_list;
-+ };
-+
- struct RecoveryCtx {
- utime_t start_time;
- map<int, map<spg_t, pg_query_t> > *query_map;
- map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map;
-@@ -507,8 +532,50 @@
- notify_list(notify_list),
- on_applied(on_applied),
- on_safe(on_safe),
- transaction(transaction) {}
-+
-+ RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
-+ : query_map(&(buf.query_map)),
-+ info_map(&(buf.info_map)),
-+ notify_list(&(buf.notify_list)),
-+ on_applied(rctx.on_applied),
-+ on_safe(rctx.on_safe),
-+ transaction(rctx.transaction) {}
-+
-+ void accept_buffered_messages(BufferedRecoveryMessages &m) {
-+ assert(query_map);
-+ assert(info_map);
-+ assert(notify_list);
-+ for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
-+ i != m.query_map.end();
-+ ++i) {
-+ map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
-+ for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
-+ j != i->second.end();
-+ ++j) {
-+ omap[j->first] = j->second;
-+ }
-+ }
-+ for (map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator i
-+ = m.info_map.begin();
-+ i != m.info_map.end();
-+ ++i) {
-+ vector<pair<pg_notify_t, pg_interval_map_t> > &ovec =
-+ (*info_map)[i->first];
-+ ovec.reserve(ovec.size() + i->second.size());
-+ ovec.insert(ovec.end(), i->second.begin(), i->second.end());
-+ }
-+ for (map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator i
-+ = m.notify_list.begin();
-+ i != m.notify_list.end();
-+ ++i) {
-+ vector<pair<pg_notify_t, pg_interval_map_t> > &ovec =
-+ (*notify_list)[i->first];
-+ ovec.reserve(ovec.size() + i->second.size());
-+ ovec.insert(ovec.end(), i->second.begin(), i->second.end());
-+ }
-+ }
- };
-
- struct NamedState {
- const char *state_name;
-@@ -1107,8 +1174,11 @@
- void scrub_finish();
- void scrub_clear_state();
- bool scrub_gather_replica_maps();
- void _scan_snaps(ScrubMap &map);
-+ void _scan_rollback_obs(
-+ const vector<ghobject_t> &rollback_obs,
-+ ThreadPool::TPHandle &handle);
- void _request_scrub_map_classic(pg_shard_t replica, eversion_t version);
- void _request_scrub_map(pg_shard_t replica, eversion_t version,
- hobject_t start, hobject_t end, bool deep);
- int build_scrub_map_chunk(
-@@ -1332,12 +1402,19 @@
- TrivialEvent(GoClean)
-
- TrivialEvent(AllReplicasActivated)
-
-+ TrivialEvent(IntervalFlush)
-+
- /* Encapsulates PG recovery process */
- class RecoveryState {
- void start_handle(RecoveryCtx *new_ctx);
- void end_handle();
-+ public:
-+ void begin_block_outgoing();
-+ void end_block_outgoing();
-+ void clear_blocked_outgoing();
-+ private:
-
- /* States */
- struct Initial;
- class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
-@@ -1359,42 +1436,49 @@
- RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
-
- /* Accessor functions for state methods */
- ObjectStore::Transaction* get_cur_transaction() {
-+ assert(state->rctx);
- assert(state->rctx->transaction);
- return state->rctx->transaction;
- }
-
- void send_query(pg_shard_t to, const pg_query_t &query) {
-+ assert(state->rctx);
- assert(state->rctx->query_map);
- (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
- query;
- }
-
- map<int, map<spg_t, pg_query_t> > *get_query_map() {
-+ assert(state->rctx);
- assert(state->rctx->query_map);
- return state->rctx->query_map;
- }
-
- map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *get_info_map() {
-+ assert(state->rctx);
- assert(state->rctx->info_map);
- return state->rctx->info_map;
- }
-
- list< Context* > *get_on_safe_context_list() {
-+ assert(state->rctx);
- assert(state->rctx->on_safe);
- return &(state->rctx->on_safe->contexts);
- }
-
- list< Context * > *get_on_applied_context_list() {
-+ assert(state->rctx);
- assert(state->rctx->on_applied);
- return &(state->rctx->on_applied->contexts);
- }
-
-- RecoveryCtx *get_recovery_ctx() { return state->rctx; }
-+ RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
-
- void send_notify(pg_shard_t to,
- const pg_notify_t &info, const pg_interval_map_t &pi) {
-+ assert(state->rctx);
- assert(state->rctx->notify_list);
- (*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
- }
- };
-@@ -1438,14 +1522,16 @@
- boost::statechart::custom_reaction< AdvMap >,
- boost::statechart::custom_reaction< ActMap >,
- boost::statechart::custom_reaction< NullEvt >,
- boost::statechart::custom_reaction< FlushedEvt >,
-+ boost::statechart::custom_reaction< IntervalFlush >,
- boost::statechart::transition< boost::statechart::event_base, Crashed >
- > reactions;
- boost::statechart::result react(const QueryState& q);
- boost::statechart::result react(const AdvMap&);
- boost::statechart::result react(const ActMap&);
- boost::statechart::result react(const FlushedEvt&);
-+ boost::statechart::result react(const IntervalFlush&);
- boost::statechart::result react(const boost::statechart::event_base&) {
- return discard_event();
- }
- };
-@@ -1460,13 +1546,15 @@
- boost::statechart::custom_reaction< QueryState >,
- boost::statechart::custom_reaction< AdvMap >,
- boost::statechart::custom_reaction< NullEvt >,
- boost::statechart::custom_reaction< FlushedEvt >,
-+ boost::statechart::custom_reaction< IntervalFlush >,
- boost::statechart::transition< boost::statechart::event_base, Crashed >
- > reactions;
- boost::statechart::result react(const QueryState& q);
- boost::statechart::result react(const AdvMap&);
- boost::statechart::result react(const FlushedEvt&);
-+ boost::statechart::result react(const IntervalFlush&);
- boost::statechart::result react(const boost::statechart::event_base&) {
- return discard_event();
- }
- };
-@@ -1634,12 +1722,16 @@
- };
-
- struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
- typedef boost::mpl::list<
-- boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>
-+ boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
-+ boost::statechart::custom_reaction< RemoteBackfillReserved >,
-+ boost::statechart::custom_reaction< RemoteReservationRejected >
- > reactions;
- NotBackfilling(my_context ctx);
- void exit();
-+ boost::statechart::result react(const RemoteBackfillReserved& evt);
-+ boost::statechart::result react(const RemoteReservationRejected& evt);
- };
-
- struct RepNotRecovering;
- struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
-@@ -1854,12 +1946,25 @@
-
-
- RecoveryMachine machine;
- PG *pg;
-- RecoveryCtx *rctx;
-+
-+ /// context passed in by state machine caller
-+ RecoveryCtx *orig_ctx;
-+
-+ /// populated if we are buffering messages pending a flush
-+ boost::optional<BufferedRecoveryMessages> messages_pending_flush;
-+
-+ /**
-+ * populated between start_handle() and end_handle(), points into
-+ * the message lists for messages_pending_flush while blocking messages
-+ * or into orig_ctx otherwise
-+ */
-+ boost::optional<RecoveryCtx> rctx;
-
- public:
-- RecoveryState(PG *pg) : machine(this, pg), pg(pg), rctx(0) {
-+ RecoveryState(PG *pg)
-+ : machine(this, pg), pg(pg), orig_ctx(0) {
- machine.initiate();
- }
-
- void handle_event(const boost::statechart::event_base &evt,
-@@ -1995,9 +2100,12 @@
- }
-
- void add_log_entry(pg_log_entry_t& e, bufferlist& log_bl);
- void append_log(
-- vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t,
-+ vector<pg_log_entry_t>& logv,
-+ eversion_t trim_to,
-+ eversion_t trim_rollback_to,
-+ ObjectStore::Transaction &t,
- bool transaction_applied = true);
- bool check_log_for_corruption(ObjectStore *store);
- void trim_peers();
-
-@@ -2025,8 +2133,9 @@
- void share_pg_info();
- /// share new pg log entries after a pg is active
- void share_pg_log();
-
-+ void reset_interval_flush();
- void start_peering_interval(
- const OSDMapRef lastmap,
- const vector<int>& newup, int up_primary,
- const vector<int>& newacting, int acting_primary,
---- a/src/osd/PGBackend.cc
-+++ b/src/osd/PGBackend.cc
-@@ -114,9 +114,13 @@
- vector<hobject_t> *ls,
- hobject_t *next)
- {
- assert(ls);
-- ghobject_t _next(begin);
-+ // Starts with the smallest shard id and generation to
-+ // make sure the result list has the marker object (
-+ // it might have multiple generations though, which would
-+ // be filtered).
-+ ghobject_t _next(begin, 0, shard_id_t(0));
- ls->reserve(max);
- int r = 0;
- while (!_next.is_max() && ls->size() < (unsigned)min) {
- vector<ghobject_t> objects;
-@@ -146,9 +150,10 @@
- int PGBackend::objects_list_range(
- const hobject_t &start,
- const hobject_t &end,
- snapid_t seq,
-- vector<hobject_t> *ls)
-+ vector<hobject_t> *ls,
-+ vector<ghobject_t> *gen_obs)
- {
- assert(ls);
- vector<ghobject_t> objects;
- int r = store->collection_list_range(
-@@ -162,8 +167,10 @@
- i != objects.end();
- ++i) {
- if (i->is_no_gen()) {
- ls->push_back(i->hobj);
-+ } else if (gen_obs) {
-+ gen_obs->push_back(*i);
- }
- }
- return r;
- }
---- a/src/osd/PGBackend.h
-+++ b/src/osd/PGBackend.h
-@@ -176,8 +176,9 @@
- virtual void log_operation(
- vector<pg_log_entry_t> &logv,
- boost::optional<pg_hit_set_history_t> &hset_history,
- const eversion_t &trim_to,
-+ const eversion_t &trim_rollback_to,
- bool transaction_applied,
- ObjectStore::Transaction *t) = 0;
-
- virtual void update_peer_last_complete_ondisk(
-@@ -495,8 +496,9 @@
- const hobject_t &hoid, ///< [in] object
- const eversion_t &at_version, ///< [in] version
- PGTransaction *t, ///< [in] trans to execute
- const eversion_t &trim_to, ///< [in] trim log to here
-+ const eversion_t &trim_rollback_to, ///< [in] trim rollback info to here
- vector<pg_log_entry_t> &log_entries, ///< [in] log entries for t
- /// [in] hitset history (if updated with this transaction)
- boost::optional<pg_hit_set_history_t> &hset_history,
- Context *on_local_applied_sync, ///< [in] called when applied locally
-@@ -554,9 +556,10 @@
- int objects_list_range(
- const hobject_t &start,
- const hobject_t &end,
- snapid_t seq,
-- vector<hobject_t> *ls);
-+ vector<hobject_t> *ls,
-+ vector<ghobject_t> *gen_obs=0);
-
- int objects_get_attr(
- const hobject_t &hoid,
- const string &attr,
---- a/src/osd/PGLog.cc
-+++ b/src/osd/PGLog.cc
-@@ -23,8 +23,27 @@
- #define dout_subsys ceph_subsys_osd
-
- //////////////////// PGLog::IndexedLog ////////////////////
-
-+void PGLog::IndexedLog::advance_rollback_info_trimmed_to(
-+ eversion_t to,
-+ LogEntryHandler *h)
-+{
-+ assert(to <= can_rollback_to);
-+
-+ if (to > rollback_info_trimmed_to)
-+ rollback_info_trimmed_to = to;
-+
-+ while (rollback_info_trimmed_to_riter != log.rbegin()) {
-+ --rollback_info_trimmed_to_riter;
-+ if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) {
-+ ++rollback_info_trimmed_to_riter;
-+ break;
-+ }
-+ h->trim(*rollback_info_trimmed_to_riter);
-+ }
-+}
-+
- void PGLog::IndexedLog::split_into(
- pg_t child_pgid,
- unsigned split_bits,
- PGLog::IndexedLog *olog)
-@@ -46,11 +65,13 @@
- }
- oldlog.erase(i++);
- }
-
-+
-+ olog->can_rollback_to = can_rollback_to;
-+
- olog->index();
- index();
-- olog->can_rollback_to = can_rollback_to;
- }
-
- void PGLog::IndexedLog::trim(
- LogEntryHandler *handler,
-@@ -58,22 +79,33 @@
- set<eversion_t> *trimmed)
- {
- if (complete_to != log.end() &&
- complete_to->version <= s) {
-- generic_dout(0) << " bad trim to " << s << " when complete_to is " << complete_to->version
-+ generic_dout(0) << " bad trim to " << s << " when complete_to is "
-+ << complete_to->version
- << " on " << *this << dendl;
- }
-
-+ if (s > can_rollback_to)
-+ can_rollback_to = s;
-+ advance_rollback_info_trimmed_to(s, handler);
-+
- while (!log.empty()) {
- pg_log_entry_t &e = *log.begin();
- if (e.version > s)
- break;
- generic_dout(20) << "trim " << e << dendl;
- if (trimmed)
- trimmed->insert(e.version);
-- handler->trim(e);
-+
- unindex(e); // remove from index,
-- log.pop_front(); // from log
-+
-+ if (e.version == rollback_info_trimmed_to_riter->version) {
-+ log.pop_front();
-+ rollback_info_trimmed_to_riter = log.rend();
-+ } else {
-+ log.pop_front();
-+ }
- }
-
- // raise tail?
- if (tail < s)
-@@ -103,9 +135,9 @@
-
- void PGLog::clear() {
- divergent_priors.clear();
- missing.clear();
-- log.zero();
-+ log.clear();
- log_keys_debug.clear();
- undirty();
- }
-
---- a/src/osd/PGLog.h
-+++ b/src/osd/PGLog.h
-@@ -61,13 +61,35 @@
- // recovery pointers
- list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item
- version_t last_requested; // last object requested by primary
-
-+ //
-+ private:
-+ /**
-+ * rollback_info_trimmed_to_riter points to the first log entry <=
-+ * rollback_info_trimmed_to
-+ *
-+ * It's a reverse_iterator because rend() is a natural representation for
-+ * tail, and rbegin() works nicely for head.
-+ */
-+ list<pg_log_entry_t>::reverse_iterator rollback_info_trimmed_to_riter;
-+ public:
-+ void advance_rollback_info_trimmed_to(eversion_t to, LogEntryHandler *h);
-+
- /****/
-- IndexedLog() : last_requested(0) {}
-+ IndexedLog() :
-+ complete_to(log.end()),
-+ last_requested(0),
-+ rollback_info_trimmed_to_riter(log.rbegin())
-+ {}
-+
-+ void claim_log_and_clear_rollback_info(const pg_log_t& o) {
-+ // we must have already trimmed the old entries
-+ assert(rollback_info_trimmed_to == head);
-+ assert(rollback_info_trimmed_to_riter == log.rbegin());
-
-- void claim_log(const pg_log_t& o) {
- log = o.log;
-+ rollback_info_trimmed_to = head;
- head = o.head;
- tail = o.tail;
- index();
- }
-@@ -77,12 +99,22 @@
- unsigned split_bits,
- IndexedLog *olog);
-
- void zero() {
-+ // we must have already trimmed the old entries
-+ assert(rollback_info_trimmed_to == head);
-+ assert(rollback_info_trimmed_to_riter == log.rbegin());
-+
- unindex();
- pg_log_t::clear();
-+ rollback_info_trimmed_to_riter = log.rbegin();
- reset_recovery_pointers();
- }
-+ void clear() {
-+ rollback_info_trimmed_to = head;
-+ rollback_info_trimmed_to_riter = log.rbegin();
-+ zero();
-+ }
- void reset_recovery_pointers() {
- complete_to = log.end();
- last_requested = 0;
- }
-@@ -111,8 +143,13 @@
- //assert(caller_ops.count(i->reqid) == 0); // divergent merge_log indexes new before unindexing old
- caller_ops[i->reqid] = &(*i);
- }
- }
-+
-+ rollback_info_trimmed_to_riter = log.rbegin();
-+ while (rollback_info_trimmed_to_riter != log.rend() &&
-+ rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to)
-+ rollback_info_trimmed_to_riter++;
- }
-
- void index(pg_log_entry_t& e) {
- if (objects.count(e.soid) == 0 ||
-@@ -140,8 +177,13 @@
- // actors
- void add(pg_log_entry_t& e) {
- // add to log
- log.push_back(e);
-+
-+ // riter previously pointed to the previous entry
-+ if (rollback_info_trimmed_to_riter == log.rbegin())
-+ ++rollback_info_trimmed_to_riter;
-+
- assert(e.version > head);
- assert(head.version == 0 || e.version.version > head.version);
- head = e.version;
-
-@@ -324,16 +366,35 @@
- LogEntryHandler *handler,
- eversion_t trim_to,
- pg_info_t &info);
-
-- void clear_can_rollback_to() {
-+ void trim_rollback_info(
-+ eversion_t trim_rollback_to,
-+ LogEntryHandler *h) {
-+ if (trim_rollback_to > log.can_rollback_to)
-+ log.can_rollback_to = trim_rollback_to;
-+ log.advance_rollback_info_trimmed_to(
-+ trim_rollback_to,
-+ h);
-+ }
-+
-+ eversion_t get_rollback_trimmed_to() const {
-+ return log.rollback_info_trimmed_to;
-+ }
-+
-+ void clear_can_rollback_to(LogEntryHandler *h) {
- log.can_rollback_to = log.head;
-+ log.advance_rollback_info_trimmed_to(
-+ log.head,
-+ h);
- }
-
- //////////////////// get or set log & missing ////////////////////
-
-- void claim_log(const pg_log_t &o) {
-- log.claim_log(o);
-+ void claim_log_and_clear_rollback_info(const pg_log_t &o, LogEntryHandler *h) {
-+ log.can_rollback_to = log.head;
-+ log.advance_rollback_info_trimmed_to(log.head, h);
-+ log.claim_log_and_clear_rollback_info(o);
- missing.clear();
- mark_dirty_to(eversion_t::max());
- }
-
---- a/src/osd/ReplicatedBackend.cc
-+++ b/src/osd/ReplicatedBackend.cc
-@@ -493,8 +493,9 @@
- const hobject_t &soid,
- const eversion_t &at_version,
- PGTransaction *_t,
- const eversion_t &trim_to,
-+ const eversion_t &trim_rollback_to,
- vector<pg_log_entry_t> &log_entries,
- boost::optional<pg_hit_set_history_t> &hset_history,
- Context *on_local_applied_sync,
- Context *on_all_acked,
-@@ -533,8 +534,9 @@
- at_version,
- tid,
- reqid,
- trim_to,
-+ trim_rollback_to,
- t->get_temp_added().size() ? *(t->get_temp_added().begin()) : hobject_t(),
- t->get_temp_cleared().size() ?
- *(t->get_temp_cleared().begin()) :hobject_t(),
- log_entries,
-@@ -548,9 +550,15 @@
- add_temp_objs(t->get_temp_added());
- }
- clear_temp_objs(t->get_temp_cleared());
-
-- parent->log_operation(log_entries, hset_history, trim_to, true, &local_t);
-+ parent->log_operation(
-+ log_entries,
-+ hset_history,
-+ trim_to,
-+ trim_rollback_to,
-+ true,
-+ &local_t);
- local_t.append(*op_t);
- local_t.swap(*op_t);
-
- op_t->register_on_applied_sync(on_local_applied_sync);
---- a/src/osd/ReplicatedBackend.h
-+++ b/src/osd/ReplicatedBackend.h
-@@ -341,8 +341,9 @@
- const hobject_t &hoid,
- const eversion_t &at_version,
- PGTransaction *t,
- const eversion_t &trim_to,
-+ const eversion_t &trim_rollback_to,
- vector<pg_log_entry_t> &log_entries,
- boost::optional<pg_hit_set_history_t> &hset_history,
- Context *on_local_applied_sync,
- Context *on_all_applied,
-@@ -358,8 +359,9 @@
- const eversion_t &at_version,
- ceph_tid_t tid,
- osd_reqid_t reqid,
- eversion_t pg_trim_to,
-+ eversion_t pg_trim_rollback_to,
- hobject_t new_temp_oid,
- hobject_t discard_temp_oid,
- vector<pg_log_entry_t> &log_entries,
- boost::optional<pg_hit_set_history_t> &hset_history,
---- a/src/osd/ReplicatedPG.cc
-+++ b/src/osd/ReplicatedPG.cc
-@@ -1119,8 +1119,14 @@
- dout(20) << " replay, waiting for active on " << op << dendl;
- waiting_for_active.push_back(op);
- return;
- }
-+ // verify client features
-+ if ((pool.info.has_tiers() || pool.info.is_tier()) &&
-+ !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
-+ osd->reply_op_error(op, -EOPNOTSUPP);
-+ return;
-+ }
- do_op(op); // do it now
- break;
-
- case MSG_OSD_SUBOP:
-@@ -1351,11 +1357,12 @@
- if (hit_set->is_full() ||
- hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
- hit_set_persist();
- }
-+ }
-
-- if (agent_state)
-- agent_choose_mode();
-+ if (agent_state) {
-+ agent_choose_mode();
- }
-
- if ((m->get_flags() & CEPH_OSD_FLAG_IGNORE_CACHE) == 0 &&
- maybe_handle_cache(op, write_ordered, obc, r, missing_oid, false))
-@@ -4853,10 +4860,11 @@
- ctx->clone_obc->ssc->ref++;
- if (pool.info.require_rollback())
- ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
- snap_oi = &ctx->clone_obc->obs.oi;
-- bool got = ctx->clone_obc->get_write(ctx->op);
-+ bool got = ctx->clone_obc->get_write_greedy(ctx->op);
- assert(got);
-+ dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
- } else {
- snap_oi = &static_snap_oi;
- }
- snap_oi->version = ctx->at_version;
-@@ -5159,10 +5167,11 @@
- eversion_t(),
- 0, osd_reqid_t(), ctx->mtime));
-
- ctx->snapset_obc = get_object_context(snapoid, true);
-- bool got = ctx->snapset_obc->get_write(ctx->op);
-+ bool got = ctx->snapset_obc->get_write_greedy(ctx->op);
- assert(got);
-+ dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
- ctx->release_snapset_obc = true;
- if (pool.info.require_rollback() && !ctx->snapset_obc->obs.exists) {
- ctx->log.back().mod_desc.create();
- } else if (!pool.info.require_rollback()) {
-@@ -6025,8 +6034,13 @@
- kick_object_context_blocked(cop->obc);
- cop->results.should_requeue = requeue;
- CopyCallbackResults result(-ECANCELED, &cop->results);
- cop->cb->complete(result);
-+
-+ // There may still be an objecter callback referencing this copy op.
-+ // That callback will not need the obc since it's been canceled, and
-+ // we need the obc reference to go away prior to flush.
-+ cop->obc = ObjectContextRef();
- }
-
- void ReplicatedPG::cancel_copy_ops(bool requeue)
- {
-@@ -6441,9 +6455,9 @@
- }
-
- bool ReplicatedPG::is_present_clone(hobject_t coid)
- {
-- if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
-+ if (!pool.info.allow_incomplete_clones())
- return true;
- if (is_missing_object(coid))
- return true;
- ObjectContextRef obc = get_object_context(coid, false);
-@@ -6734,8 +6748,9 @@
- soid,
- repop->ctx->at_version,
- repop->ctx->op_t,
- pg_trim_to,
-+ min_last_complete_ondisk,
- repop->ctx->log,
- repop->ctx->updated_hset_history,
- onapplied_sync,
- on_all_applied,
-@@ -6751,8 +6766,9 @@
- const eversion_t &at_version,
- ceph_tid_t tid,
- osd_reqid_t reqid,
- eversion_t pg_trim_to,
-+ eversion_t pg_trim_rollback_to,
- hobject_t new_temp_oid,
- hobject_t discard_temp_oid,
- vector<pg_log_entry_t> &log_entries,
- boost::optional<pg_hit_set_history_t> &hset_hist,
-@@ -6806,8 +6822,9 @@
- else
- wr->pg_stats = get_info().stats;
-
- wr->pg_trim_to = pg_trim_to;
-+ wr->pg_trim_rollback_to = pg_trim_rollback_to;
-
- wr->new_temp_oid = new_temp_oid;
- wr->discard_temp_oid = discard_temp_oid;
- wr->updated_hit_set_history = hset_hist;
-@@ -6840,8 +6857,14 @@
-
- void ReplicatedPG::remove_repop(RepGather *repop)
- {
- dout(20) << __func__ << " " << *repop << dendl;
-+ if (repop->ctx->obc)
-+ dout(20) << " obc " << *repop->ctx->obc << dendl;
-+ if (repop->ctx->clone_obc)
-+ dout(20) << " clone_obc " << *repop->ctx->clone_obc << dendl;
-+ if (repop->ctx->snapset_obc)
-+ dout(20) << " snapset_obc " << *repop->ctx->snapset_obc << dendl;
- release_op_ctx_locks(repop->ctx);
- repop->ctx->finish(0); // FIXME: return value here is sloppy
- repop_map.erase(repop->rep_tid);
- repop->put();
-@@ -7606,8 +7629,9 @@
- parent->log_operation(
- log,
- m->updated_hit_set_history,
- m->pg_trim_to,
-+ m->pg_trim_rollback_to,
- update_snaps,
- &(rm->localt));
-
- rm->bytes_written = rm->opt.get_encoded_bytes();
-@@ -7701,10 +7725,10 @@
- uint64_t size = obc->obs.oi.size;
- if (size)
- data_subset.insert(0, size);
-
-- if (get_parent()->get_pool().cache_mode != pg_pool_t::CACHEMODE_NONE) {
-- dout(10) << __func__ << ": caching enabled, skipping clone subsets" << dendl;
-+ if (get_parent()->get_pool().allow_incomplete_clones()) {
-+ dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
- return;
- }
-
- if (!cct->_conf->osd_recover_clone_overlap) {
-@@ -7761,10 +7785,10 @@
- uint64_t size = snapset.clone_size[soid.snap];
- if (size)
- data_subset.insert(0, size);
-
-- if (get_parent()->get_pool().cache_mode != pg_pool_t::CACHEMODE_NONE) {
-- dout(10) << __func__ << ": caching enabled, skipping clone subsets" << dendl;
-+ if (get_parent()->get_pool().allow_incomplete_clones()) {
-+ dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
- return;
- }
-
- if (!cct->_conf->osd_recover_clone_overlap) {
-@@ -9464,8 +9488,19 @@
-
- void ReplicatedPG::on_pool_change()
- {
- dout(10) << __func__ << dendl;
-+ // requeue cache full waiters just in case the cache_mode is
-+ // changing away from writeback mode. note that if we are not
-+ // active the normal requeuing machinery is sufficient (and properly
-+ // ordered).
-+ if (is_active() &&
-+ pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
-+ !waiting_for_cache_not_full.empty()) {
-+ dout(10) << __func__ << " requeuing full waiters (not in writeback) "
-+ << dendl;
-+ requeue_ops(waiting_for_cache_not_full);
-+ }
- hit_set_setup();
- agent_setup();
- }
-
-@@ -11288,9 +11323,10 @@
- return false;
- }
- }
-
-- if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
-+ if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL &&
-+ hit_set) {
- // is this object old and/or cold enough?
- int atime = -1, temp = 0;
- agent_estimate_atime_temp(soid, &atime, NULL /*FIXME &temp*/);
-
-@@ -11420,9 +11456,13 @@
- else
- num_dirty = 0;
- }
-
-- dout(10) << __func__ << ": "
-+ dout(10) << __func__
-+ << " flush_mode: "
-+ << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
-+ << " evict_mode: "
-+ << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
- << " num_objects: " << info.stats.stats.sum.num_objects
- << " num_bytes: " << info.stats.stats.sum.num_bytes
- << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
- << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
-@@ -11434,9 +11474,9 @@
-
- // get dirty, full ratios
- uint64_t dirty_micro = 0;
- uint64_t full_micro = 0;
-- if (pool.info.target_max_bytes && info.stats.stats.sum.num_objects) {
-+ if (pool.info.target_max_bytes && info.stats.stats.sum.num_objects > 0) {
- uint64_t avg_size = info.stats.stats.sum.num_bytes /
- info.stats.stats.sum.num_objects;
- dirty_micro =
- num_dirty * avg_size * 1000000 /
-@@ -11444,9 +11484,9 @@
- full_micro =
- num_user_objects * avg_size * 1000000 /
- MAX(pool.info.target_max_bytes / divisor, 1);
- }
-- if (pool.info.target_max_objects) {
-+ if (pool.info.target_max_objects > 0) {
- uint64_t dirty_objects_micro =
- num_dirty * 1000000 /
- MAX(pool.info.target_max_objects / divisor, 1);
- if (dirty_objects_micro > dirty_micro)
-@@ -11530,10 +11570,12 @@
- << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
- << " -> "
- << TierAgentState::get_evict_mode_name(evict_mode)
- << dendl;
-- if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
-+ if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
-+ is_active()) {
- requeue_ops(waiting_for_cache_not_full);
-+ requeue_ops(waiting_for_active);
- }
- agent_state->evict_mode = evict_mode;
- }
- uint64_t old_effort = agent_state->evict_effort;
-@@ -11659,9 +11701,9 @@
- ::decode(snapset, blp);
-
- // did we finish the last oid?
- if (head != hobject_t() &&
-- pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
-+ !pool.info.allow_incomplete_clones()) {
- osd->clog.error() << mode << " " << info.pgid << " " << head
- << " missing clones";
- ++scrubber.shallow_errors;
- }
-@@ -11720,9 +11762,9 @@
- //assert(data.length() == p->size);
- //
-
- if (!next_clone.is_min() && next_clone != soid &&
-- pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE) {
-+ pool.info.allow_incomplete_clones()) {
- // it is okay to be missing one or more clones in a cache tier.
- // skip higher-numbered clones in the list.
- while (curclone != snapset.clones.rend() &&
- soid.snap < *curclone)
-@@ -11808,9 +11850,9 @@
- scrub_cstat.add(stat, cat);
- }
-
- if (!next_clone.is_min() &&
-- pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
-+ !pool.info.allow_incomplete_clones()) {
- osd->clog.error() << mode << " " << info.pgid
- << " expected clone " << next_clone;
- ++scrubber.shallow_errors;
- }
---- a/src/osd/ReplicatedPG.h
-+++ b/src/osd/ReplicatedPG.h
-@@ -346,15 +346,16 @@
- void log_operation(
- vector<pg_log_entry_t> &logv,
- boost::optional<pg_hit_set_history_t> &hset_history,
- const eversion_t &trim_to,
-+ const eversion_t &trim_rollback_to,
- bool transaction_applied,
- ObjectStore::Transaction *t) {
- if (hset_history) {
- info.hit_set = *hset_history;
- dirty_info = true;
- }
-- append_log(logv, trim_to, *t, transaction_applied);
-+ append_log(logv, trim_to, trim_rollback_to, *t, transaction_applied);
- }
-
- void op_applied(
- const eversion_t &applied_version);
---- a/src/osd/osd_types.cc
-+++ b/src/osd/osd_types.cc
-@@ -2101,10 +2101,10 @@
- void pg_notify_t::dump(Formatter *f) const
- {
- f->dump_int("from", from);
- f->dump_int("to", to);
-- f->dump_stream("query_epoch") << query_epoch;
-- f->dump_stream("epoch_sent") << epoch_sent;
-+ f->dump_unsigned("query_epoch", query_epoch);
-+ f->dump_unsigned("epoch_sent", epoch_sent);
- {
- f->open_object_section("info");
- info.dump(f);
- f->close_section();
-@@ -2460,10 +2460,10 @@
-
- void ObjectModDesc::dump(Formatter *f) const
- {
- f->open_object_section("object_mod_desc");
-- f->dump_stream("can_local_rollback") << can_local_rollback;
-- f->dump_stream("stashed") << stashed;
-+ f->dump_bool("can_local_rollback", can_local_rollback);
-+ f->dump_bool("rollback_info_completed", rollback_info_completed);
- {
- f->open_array_section("ops");
- DumpVisitor vis(f);
- visit(&vis);
-@@ -2496,17 +2496,17 @@
- void ObjectModDesc::encode(bufferlist &_bl) const
- {
- ENCODE_START(1, 1, _bl);
- ::encode(can_local_rollback, _bl);
-- ::encode(stashed, _bl);
-+ ::encode(rollback_info_completed, _bl);
- ::encode(bl, _bl);
- ENCODE_FINISH(_bl);
- }
- void ObjectModDesc::decode(bufferlist::iterator &_bl)
- {
- DECODE_START(1, _bl);
- ::decode(can_local_rollback, _bl);
-- ::decode(stashed, _bl);
-+ ::decode(rollback_info_completed, _bl);
- ::decode(bl, _bl);
- DECODE_FINISH(_bl);
- }
-
-@@ -2679,19 +2679,20 @@
- // -- pg_log_t --
-
- void pg_log_t::encode(bufferlist& bl) const
- {
-- ENCODE_START(5, 3, bl);
-+ ENCODE_START(6, 3, bl);
- ::encode(head, bl);
- ::encode(tail, bl);
- ::encode(log, bl);
- ::encode(can_rollback_to, bl);
-+ ::encode(rollback_info_trimmed_to, bl);
- ENCODE_FINISH(bl);
- }
-
- void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
- {
-- DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
-+ DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
- ::decode(head, bl);
- ::decode(tail, bl);
- if (struct_v < 2) {
- bool backlog;
-@@ -2699,8 +2700,13 @@
- }
- ::decode(log, bl);
- if (struct_v >= 5)
- ::decode(can_rollback_to, bl);
-+
-+ if (struct_v >= 6)
-+ ::decode(rollback_info_trimmed_to, bl);
-+ else
-+ rollback_info_trimmed_to = tail;
- DECODE_FINISH(bl);
-
- // handle hobject_t format change
- if (struct_v < 4) {
---- a/src/osd/osd_types.h
-+++ b/src/osd/osd_types.h
-@@ -810,18 +810,20 @@
- return "replicated";
- }
-
- enum {
-- FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding)
-- FLAG_FULL = 2, // pool is full
-+ FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
-+ FLAG_FULL = 1<<1, // pool is full
- FLAG_DEBUG_FAKE_EC_POOL = 1<<2, // require ReplicatedPG to act like an EC pg
-+ FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
- };
-
- static const char *get_flag_name(int f) {
- switch (f) {
- case FLAG_HASHPSPOOL: return "hashpspool";
- case FLAG_FULL: return "full";
- case FLAG_DEBUG_FAKE_EC_POOL: return "require_local_rollback";
-+ case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
- default: return "???";
- }
- }
- static string get_flags_string(uint64_t f) {
-@@ -867,8 +869,20 @@
- }
- const char *get_cache_mode_name() const {
- return get_cache_mode_name(cache_mode);
- }
-+ bool cache_mode_requires_hit_set() const {
-+ switch (cache_mode) {
-+ case CACHEMODE_NONE:
-+ case CACHEMODE_FORWARD:
-+ case CACHEMODE_READONLY:
-+ return false;
-+ case CACHEMODE_WRITEBACK:
-+ return true;
-+ default:
-+ assert(0 == "implement me");
-+ }
-+ }
-
- uint64_t flags; ///< FLAG_*
- __u8 type; ///< TYPE_*
- __u8 size, min_size; ///< number of osds in each pg
-@@ -915,13 +929,31 @@
- cache_mode_t cache_mode; ///< cache pool mode
-
- bool is_tier() const { return tier_of >= 0; }
- bool has_tiers() const { return !tiers.empty(); }
-- void clear_tier() { tier_of = -1; }
-+ void clear_tier() {
-+ tier_of = -1;
-+ clear_read_tier();
-+ clear_write_tier();
-+ clear_tier_tunables();
-+ }
- bool has_read_tier() const { return read_tier >= 0; }
- void clear_read_tier() { read_tier = -1; }
- bool has_write_tier() const { return write_tier >= 0; }
- void clear_write_tier() { write_tier = -1; }
-+ void clear_tier_tunables() {
-+ if (cache_mode != CACHEMODE_NONE)
-+ flags |= FLAG_INCOMPLETE_CLONES;
-+ cache_mode = CACHEMODE_NONE;
-+
-+ target_max_bytes = 0;
-+ target_max_objects = 0;
-+ cache_target_dirty_ratio_micro = 0;
-+ cache_target_full_ratio_micro = 0;
-+ hit_set_params = HitSet::Params();
-+ hit_set_period = 0;
-+ hit_set_count = 0;
-+ }
-
- uint64_t target_max_bytes; ///< tiering: target max pool size
- uint64_t target_max_objects; ///< tiering: target max pool size
-
-@@ -963,8 +995,9 @@
-
- void dump(Formatter *f) const;
-
- uint64_t get_flags() const { return flags; }
-+ bool has_flag(uint64_t f) const { return flags & f; }
-
- /// This method will later return true for ec pools as well
- bool ec_pool() const {
- return type == TYPE_ERASURE;
-@@ -972,8 +1005,13 @@
- bool require_rollback() const {
- return ec_pool() || flags & FLAG_DEBUG_FAKE_EC_POOL;
- }
-
-+ /// true if incomplete clones may be present
-+ bool allow_incomplete_clones() const {
-+ return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
-+ }
-+
- unsigned get_type() const { return type; }
- unsigned get_size() const { return size; }
- unsigned get_min_size() const { return min_size; }
- int get_crush_ruleset() const { return crush_ruleset; }
-@@ -1810,9 +1848,9 @@
-
- class PGBackend;
- class ObjectModDesc {
- bool can_local_rollback;
-- bool stashed;
-+ bool rollback_info_completed;
- public:
- class Visitor {
- public:
- virtual void append(uint64_t old_offset) {}
-@@ -1830,75 +1868,76 @@
- DELETE = 3,
- CREATE = 4,
- UPDATE_SNAPS = 5
- };
-- ObjectModDesc() : can_local_rollback(true), stashed(false) {}
-+ ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {}
- void claim(ObjectModDesc &other) {
- bl.clear();
- bl.claim(other.bl);
- can_local_rollback = other.can_local_rollback;
-- stashed = other.stashed;
-+ rollback_info_completed = other.rollback_info_completed;
- }
- void claim_append(ObjectModDesc &other) {
-- if (!can_local_rollback || stashed)
-+ if (!can_local_rollback || rollback_info_completed)
- return;
- if (!other.can_local_rollback) {
- mark_unrollbackable();
- return;
- }
- bl.claim_append(other.bl);
-- stashed = other.stashed;
-+ rollback_info_completed = other.rollback_info_completed;
- }
- void swap(ObjectModDesc &other) {
- bl.swap(other.bl);
-
- bool temp = other.can_local_rollback;
- other.can_local_rollback = can_local_rollback;
- can_local_rollback = temp;
-
-- temp = other.stashed;
-- other.stashed = stashed;
-- stashed = temp;
-+ temp = other.rollback_info_completed;
-+ other.rollback_info_completed = rollback_info_completed;
-+ rollback_info_completed = temp;
- }
- void append_id(ModID id) {
- uint8_t _id(id);
- ::encode(_id, bl);
- }
- void append(uint64_t old_size) {
-- if (!can_local_rollback || stashed)
-+ if (!can_local_rollback || rollback_info_completed)
- return;
- ENCODE_START(1, 1, bl);
- append_id(APPEND);
- ::encode(old_size, bl);
- ENCODE_FINISH(bl);
- }
- void setattrs(map<string, boost::optional<bufferlist> > &old_attrs) {
-- if (!can_local_rollback || stashed)
-+ if (!can_local_rollback || rollback_info_completed)
- return;
- ENCODE_START(1, 1, bl);
- append_id(SETATTRS);
- ::encode(old_attrs, bl);
- ENCODE_FINISH(bl);
- }
- bool rmobject(version_t deletion_version) {
-- if (!can_local_rollback || stashed)
-+ if (!can_local_rollback || rollback_info_completed)
- return false;
- ENCODE_START(1, 1, bl);
- append_id(DELETE);
- ::encode(deletion_version, bl);
- ENCODE_FINISH(bl);
-- stashed = true;
-+ rollback_info_completed = true;
- return true;
- }
- void create() {
-- if (!can_local_rollback || stashed)
-+ if (!can_local_rollback || rollback_info_completed)
- return;
-+ rollback_info_completed = true;
- ENCODE_START(1, 1, bl);
- append_id(CREATE);
- ENCODE_FINISH(bl);
- }
- void update_snaps(set<snapid_t> &old_snaps) {
-- if (!can_local_rollback || stashed)
-+ if (!can_local_rollback || rollback_info_completed)
- return;
- ENCODE_START(1, 1, bl);
- append_id(UPDATE_SNAPS);
- ::encode(old_snaps, bl);
-@@ -2060,8 +2099,12 @@
-
- // We can rollback rollback-able entries > can_rollback_to
- eversion_t can_rollback_to;
-
-+ // always <= can_rollback_to, indicates how far stashed rollback
-+ // data can be found
-+ eversion_t rollback_info_trimmed_to;
-+
- list<pg_log_entry_t> log; // the actual log.
-
- pg_log_t() {}
-
-@@ -2761,21 +2804,23 @@
- return false;
- }
- }
-
-- bool get_write(OpRequestRef op) {
-- if (get_write_lock()) {
-+ bool get_write(OpRequestRef op, bool greedy=false) {
-+ if (get_write_lock(greedy)) {
- return true;
- } // else
- if (op)
- waiters.push_back(op);
- return false;
- }
-- bool get_write_lock() {
-- // don't starve anybody!
-- if (!waiters.empty() ||
-- backfill_read_marker) {
-- return false;
-+ bool get_write_lock(bool greedy=false) {
-+ if (!greedy) {
-+ // don't starve anybody!
-+ if (!waiters.empty() ||
-+ backfill_read_marker) {
-+ return false;
-+ }
- }
- switch (state) {
- case RWNONE:
- assert(count == 0);
-@@ -2822,9 +2867,12 @@
- bool get_read(OpRequestRef op) {
- return rwstate.get_read(op);
- }
- bool get_write(OpRequestRef op) {
-- return rwstate.get_write(op);
-+ return rwstate.get_write(op, false);
-+ }
-+ bool get_write_greedy(OpRequestRef op) {
-+ return rwstate.get_write(op, true);
- }
- bool get_snaptrimmer_write() {
- if (rwstate.get_write_lock()) {
- return true;
---- a/src/osdc/Objecter.cc
-+++ b/src/osdc/Objecter.cc
-@@ -1363,8 +1363,13 @@
- }
-
- ldout(cct, 10) << __func__ << " tid " << tid << dendl;
- Op *op = p->second;
-+ if (op->con) {
-+ ldout(cct, 20) << " revoking rx buffer for " << tid
-+ << " on " << op->con << dendl;
-+ op->con->revoke_rx_buffer(tid);
-+ }
- if (op->onack) {
- op->onack->complete(r);
- op->onack = NULL;
- }
-@@ -1433,9 +1438,9 @@
- return -ENOENT;
- return p->raw_hash_to_pg(p->hash_key(key, ns));
- }
-
--int Objecter::calc_target(op_target_t *t)
-+int Objecter::calc_target(op_target_t *t, bool any_change)
- {
- bool is_read = t->flags & CEPH_OSD_FLAG_READ;
- bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
-
-@@ -1490,9 +1495,10 @@
- need_resend = true;
- }
-
- if (t->pgid != pgid ||
-- is_pg_changed(t->primary, t->acting, primary, acting, t->used_replica) ||
-+ is_pg_changed(
-+ t->primary, t->acting, primary, acting, t->used_replica || any_change) ||
- force_resend) {
- t->pgid = pgid;
- t->acting = acting;
- t->primary = primary;
-@@ -1569,9 +1575,9 @@
- }
-
- bool Objecter::recalc_linger_op_target(LingerOp *linger_op)
- {
-- int r = calc_target(&linger_op->target);
-+ int r = calc_target(&linger_op->target, true);
- if (r == RECALC_OP_TARGET_NEED_RESEND) {
- ldout(cct, 10) << "recalc_linger_op_target tid " << linger_op->linger_id
- << " pgid " << linger_op->target.pgid
- << " acting " << linger_op->target.acting << dendl;
---- a/src/osdc/Objecter.h
-+++ b/src/osdc/Objecter.h
-@@ -1479,9 +1479,9 @@
- };
- bool osdmap_full_flag() const;
- bool target_should_be_paused(op_target_t *op);
-
-- int calc_target(op_target_t *t);
-+ int calc_target(op_target_t *t, bool any_change=false);
- int recalc_op_target(Op *op);
- bool recalc_linger_op_target(LingerOp *op);
-
- void send_linger(LingerOp *info);
---- a/src/pybind/rbd.py
-+++ b/src/pybind/rbd.py
-@@ -749,8 +749,16 @@
- ret = self.librbd.rbd_flush(self.image)
- if ret < 0:
- raise make_ex(ret, 'error flushing image')
-
-+ def invalidate_cache(self):
-+ """
-+ Drop any cached data for the image.
-+ """
-+ ret = self.librbd.rbd_invalidate_cache(self.image)
-+ if ret < 0:
-+ raise make_ex(ret, 'error invalidating cache')
-+
- def stripe_unit(self):
- """
- Returns the stripe unit used for the image.
- """
---- a/src/rgw/rgw_common.cc
-+++ b/src/rgw/rgw_common.cc
-@@ -696,15 +696,17 @@
- char dest[src_str.size() + 1];
- int pos = 0;
- char c;
-
-+ bool in_query = false;
- while (*src) {
- if (*src != '%') {
-- if (*src != '+') {
-- dest[pos++] = *src++;
-+ if (!in_query || *src != '+') {
-+ if (*src == '?') in_query = true;
-+ dest[pos++] = *src++;
- } else {
-- dest[pos++] = ' ';
-- ++src;
-+ dest[pos++] = ' ';
-+ ++src;
- }
- } else {
- src++;
- if (!*src)
---- a/src/rgw/rgw_op.cc
-+++ b/src/rgw/rgw_op.cc
-@@ -1379,9 +1379,12 @@
- };
-
- int RGWPutObjProcessor_Multipart::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
- {
-- RGWPutObjProcessor::prepare(store, obj_ctx, NULL);
-+ int r = prepare_init(store, obj_ctx, NULL);
-+ if (r < 0) {
-+ return r;
-+ }
-
- string oid = obj_str;
- upload_id = s->info.args.get("uploadId");
- if (!oid_rand) {
-@@ -1418,9 +1421,9 @@
- manifest.set_prefix(upload_prefix);
-
- manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, num);
-
-- int r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, target_obj);
-+ r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, target_obj);
- if (r < 0) {
- return r;
- }
-
-@@ -1559,8 +1562,38 @@
-
- return 0;
- }
-
-+static int put_data_and_throttle(RGWPutObjProcessor *processor, bufferlist& data, off_t ofs,
-+ MD5 *hash, bool need_to_wait)
-+{
-+ const unsigned char *data_ptr = (hash ? (const unsigned char *)data.c_str() : NULL);
-+ bool again;
-+ uint64_t len = data.length();
-+
-+ do {
-+ void *handle;
-+
-+ int ret = processor->handle_data(data, ofs, &handle, &again);
-+ if (ret < 0)
-+ return ret;
-+
-+ if (hash) {
-+ hash->Update(data_ptr, len);
-+ hash = NULL; /* only calculate hash once */
-+ }
-+
-+ ret = processor->throttle_data(handle, need_to_wait);
-+ if (ret < 0)
-+ return ret;
-+
-+ need_to_wait = false; /* the need to wait only applies to the first iteration */
-+ } while (again);
-+
-+ return 0;
-+}
-+
-+
- void RGWPutObj::execute()
- {
- RGWPutObjProcessor *processor = NULL;
- char supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1];
-@@ -1632,25 +1665,14 @@
- }
- if (!len)
- break;
-
-- void *handle;
-- const unsigned char *data_ptr = (const unsigned char *)data.c_str();
--
-- ret = processor->handle_data(data, ofs, &handle);
-- if (ret < 0)
-- goto done;
--
-- if (need_calc_md5) {
-- hash.Update(data_ptr, len);
-- }
--
- /* do we need this operation to be synchronous? if we're dealing with an object with immutable
- * head, e.g., multipart object we need to make sure we're the first one writing to this object
- */
- bool need_to_wait = (ofs == 0) && multipart;
-
-- ret = processor->throttle_data(handle, need_to_wait);
-+ ret = put_data_and_throttle(processor, data, ofs, (need_calc_md5 ? &hash : NULL), need_to_wait);
- if (ret < 0) {
- if (!need_to_wait || ret != -EEXIST) {
- ldout(s->cct, 20) << "processor->thottle_data() returned ret=" << ret << dendl;
- goto done;
-@@ -1673,17 +1695,10 @@
- ldout(s->cct, 0) << "ERROR: processor->prepare() returned " << ret << dendl;
- goto done;
- }
-
-- ret = processor->handle_data(data, ofs, &handle);
-+ ret = put_data_and_throttle(processor, data, ofs, NULL, false);
- if (ret < 0) {
-- ldout(s->cct, 0) << "ERROR: processor->handle_data() returned " << ret << dendl;
-- goto done;
-- }
--
-- ret = processor->throttle_data(handle, false);
-- if (ret < 0) {
-- ldout(s->cct, 0) << "ERROR: processor->throttle_data() returned " << ret << dendl;
- goto done;
- }
- }
-
-@@ -1845,20 +1860,9 @@
-
- if (!len)
- break;
-
-- void *handle;
-- const unsigned char *data_ptr = (const unsigned char *)data.c_str();
--
-- ret = processor->handle_data(data, ofs, &handle);
-- if (ret < 0)
-- goto done;
--
-- hash.Update(data_ptr, len);
--
-- ret = processor->throttle_data(handle, false);
-- if (ret < 0)
-- goto done;
-+ ret = put_data_and_throttle(processor, data, ofs, &hash, false);
-
- ofs += len;
-
- if (ofs > max_len) {
---- a/src/rgw/rgw_rados.cc
-+++ b/src/rgw/rgw_rados.cc
-@@ -899,10 +899,12 @@
-
- return 0;
- };
-
--int RGWPutObjProcessor_Plain::handle_data(bufferlist& bl, off_t _ofs, void **phandle)
-+int RGWPutObjProcessor_Plain::handle_data(bufferlist& bl, off_t _ofs, void **phandle, bool *again)
- {
-+ *again = false;
-+
- if (ofs != _ofs)
- return -EINVAL;
-
- data.append(bl);
-@@ -1025,10 +1027,12 @@
-
- return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
- }
-
--int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle)
-+int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again)
- {
-+ *again = false;
-+
- *phandle = NULL;
- if (extra_data_len) {
- size_t extra_len = bl.length();
- if (extra_len > extra_data_len)
-@@ -1043,15 +1047,18 @@
- return 0;
- }
- }
-
-- uint64_t max_chunk_size = store->get_max_chunk_size();
-+ uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
-
- pending_data_bl.claim_append(bl);
-- if (pending_data_bl.length() < max_chunk_size)
-+ if (pending_data_bl.length() < max_write_size)
- return 0;
-
-- pending_data_bl.splice(0, max_chunk_size, &bl);
-+ pending_data_bl.splice(0, max_write_size, &bl);
-+
-+ /* do we have enough data pending accumulated that needs to be written? */
-+ *again = (pending_data_bl.length() >= max_chunk_size);
-
- if (!data_ofs && !immutable_head()) {
- first_chunk.claim(bl);
- obj_len = (uint64_t)first_chunk.length();
-@@ -1069,19 +1076,32 @@
- object and cleanup can be messy */
- return write_data(bl, write_ofs, phandle, exclusive);
- }
-
--int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
-+
-+int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, void *obj_ctx, string *oid_rand)
- {
- RGWPutObjProcessor::prepare(store, obj_ctx, oid_rand);
-
-- head_obj.init(bucket, obj_str);
-+ int r = store->get_max_chunk_size(bucket, &max_chunk_size);
-+ if (r < 0) {
-+ return r;
-+ }
-+
-+ return 0;
-+}
-
-- uint64_t max_chunk_size = store->get_max_chunk_size();
-+int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
-+{
-+ int r = prepare_init(store, obj_ctx, oid_rand);
-+ if (r < 0) {
-+ return r;
-+ }
-+ head_obj.init(bucket, obj_str);
-
- manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
-
-- int r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, head_obj);
-+ r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, head_obj);
- if (r < 0) {
- return r;
- }
-
-@@ -1200,8 +1220,46 @@
- objs_state[new_obj].prefetch_data = true;
- }
- }
-
-+int RGWRados::get_required_alignment(rgw_bucket& bucket, uint64_t *alignment)
-+{
-+ IoCtx ioctx;
-+ int r = open_bucket_data_ctx(bucket, ioctx);
-+ if (r < 0) {
-+ ldout(cct, 0) << "ERROR: open_bucket_data_ctx() returned " << r << dendl;
-+ return r;
-+ }
-+
-+ *alignment = ioctx.pool_required_alignment();
-+ return 0;
-+}
-+
-+int RGWRados::get_max_chunk_size(rgw_bucket& bucket, uint64_t *max_chunk_size)
-+{
-+ uint64_t alignment;
-+ int r = get_required_alignment(bucket, &alignment);
-+ if (r < 0) {
-+ return r;
-+ }
-+
-+ uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
-+
-+ if (alignment == 0) {
-+ *max_chunk_size = config_chunk_size;
-+ return 0;
-+ }
-+
-+ if (config_chunk_size <= alignment) {
-+ *max_chunk_size = alignment;
-+ return 0;
-+ }
-+
-+ *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
-+
-+ return 0;
-+}
-+
- void RGWRados::finalize()
- {
- if (need_watch_notify()) {
- finalize_watch();
-@@ -1235,10 +1293,8 @@
- int RGWRados::init_rados()
- {
- int ret;
-
-- max_chunk_size = cct->_conf->rgw_max_chunk_size;
--
- rados = new Rados();
- if (!rados)
- return -ENOMEM;
-
-@@ -2956,27 +3012,35 @@
- progress_data(_progress_data) {}
- int handle_data(bufferlist& bl, off_t ofs, off_t len) {
- progress_cb(ofs, progress_data);
-
-- void *handle;
-- int ret = processor->handle_data(bl, ofs, &handle);
-- if (ret < 0)
-- return ret;
-+ bool again;
-
-- if (opstate) {
-- /* need to update opstate repository with new state. This is ratelimited, so we're not
-- * really doing it every time
-- */
-- ret = opstate->renew_state();
-- if (ret < 0) {
-- /* could not renew state! might have been marked as cancelled */
-+ bool need_opstate = true;
-+
-+ do {
-+ void *handle;
-+ int ret = processor->handle_data(bl, ofs, &handle, &again);
-+ if (ret < 0)
- return ret;
-+
-+ if (need_opstate && opstate) {
-+ /* need to update opstate repository with new state. This is ratelimited, so we're not
-+ * really doing it every time
-+ */
-+ ret = opstate->renew_state();
-+ if (ret < 0) {
-+ /* could not renew state! might have been marked as cancelled */
-+ return ret;
-+ }
-+
-+ need_opstate = false;
- }
-- }
-
-- ret = processor->throttle_data(handle, false);
-- if (ret < 0)
-- return ret;
-+ ret = processor->throttle_data(handle, false);
-+ if (ret < 0)
-+ return ret;
-+ } while (again);
-
- return 0;
- }
-
-@@ -3191,26 +3255,8 @@
- return ret;
-
- vector<rgw_obj> ref_objs;
-
-- bool copy_data = !astate->has_manifest;
-- bool copy_first = false;
-- if (astate->has_manifest) {
-- if (!astate->manifest.has_tail()) {
-- copy_data = true;
-- } else {
-- uint64_t head_size = astate->manifest.get_head_size();
--
-- if (head_size > 0) {
-- if (head_size > max_chunk_size) // should never happen
-- copy_data = true;
-- else
-- copy_first = true;
-- }
-- }
-- }
--
--
- if (remote_dest) {
- /* dest is in a different region, copy it there */
-
- string etag;
-@@ -3229,10 +3275,37 @@
- if (ret < 0)
- return ret;
-
- return 0;
-- } else if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
-- return copy_obj_data(ctx, dest_bucket_info.owner, &handle, end, dest_obj, src_obj, mtime, src_attrs, category, ptag, err);
-+ }
-+
-+ uint64_t max_chunk_size;
-+
-+ ret = get_max_chunk_size(dest_obj.bucket, &max_chunk_size);
-+ if (ret < 0) {
-+ ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
-+ return ret;
-+ }
-+
-+ bool copy_data = !astate->has_manifest;
-+ bool copy_first = false;
-+ if (astate->has_manifest) {
-+ if (!astate->manifest.has_tail()) {
-+ copy_data = true;
-+ } else {
-+ uint64_t head_size = astate->manifest.get_head_size();
-+
-+ if (head_size > 0) {
-+ if (head_size > max_chunk_size)
-+ copy_data = true;
-+ else
-+ copy_first = true;
-+ }
-+ }
-+ }
-+
-+ if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
-+ return copy_obj_data(ctx, dest_bucket_info.owner, &handle, end, dest_obj, src_obj, max_chunk_size, mtime, src_attrs, category, ptag, err);
- }
-
- RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
-
-@@ -3340,8 +3413,9 @@
- const string& owner,
- void **handle, off_t end,
- rgw_obj& dest_obj,
- rgw_obj& src_obj,
-+ uint64_t max_chunk_size,
- time_t *mtime,
- map<string, bufferlist>& attrs,
- RGWObjCategory category,
- string *ptag,
-@@ -4472,8 +4546,10 @@
-
- bool merge_bl = false;
- bufferlist *pbl = &bl;
- bufferlist read_bl;
-+ uint64_t max_chunk_size;
-+
-
- get_obj_bucket_and_oid_key(obj, bucket, oid, key);
-
- if (!rctx) {
-@@ -4504,8 +4580,14 @@
- get_obj_bucket_and_oid_key(read_obj, bucket, oid, key);
- }
- }
-
-+ r = get_max_chunk_size(bucket, &max_chunk_size);
-+ if (r < 0) {
-+ ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << bucket << dendl;
-+ goto done_ret;
-+ }
-+
- if (len > max_chunk_size)
- len = max_chunk_size;
-
-
---- a/src/rgw/rgw_rados.h
-+++ b/src/rgw/rgw_rados.h
-@@ -547,9 +547,9 @@
- store = _store;
- obj_ctx = _o;
- return 0;
- };
-- virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle) = 0;
-+ virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again) = 0;
- virtual int throttle_data(void *handle, bool need_to_wait) = 0;
- virtual int complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs);
- };
-
-@@ -563,9 +563,9 @@
- off_t ofs;
-
- protected:
- int prepare(RGWRados *store, void *obj_ctx, string *oid_rand);
-- int handle_data(bufferlist& bl, off_t ofs, void **phandle);
-+ int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again);
- int do_complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs);
-
- public:
- int throttle_data(void *handle, bool need_to_wait) { return 0; }
-@@ -612,8 +612,10 @@
-
- uint64_t extra_data_len;
- bufferlist extra_data_bl;
- bufferlist pending_data_bl;
-+ uint64_t max_chunk_size;
-+
- protected:
- rgw_bucket bucket;
- string obj_str;
-
-@@ -630,8 +632,10 @@
- int prepare_next_part(off_t ofs);
- int complete_parts();
- int complete_writing_data();
-
-+ int prepare_init(RGWRados *store, void *obj_ctx, string *oid_rand);
-+
- public:
- ~RGWPutObjProcessor_Atomic() {}
- RGWPutObjProcessor_Atomic(const string& bucket_owner, rgw_bucket& _b, const string& _o, uint64_t _p, const string& _t) :
- RGWPutObjProcessor_Aio(bucket_owner),
-@@ -640,17 +644,18 @@
- next_part_ofs(_p),
- cur_part_id(0),
- data_ofs(0),
- extra_data_len(0),
-+ max_chunk_size(0),
- bucket(_b),
- obj_str(_o),
- unique_tag(_t) {}
- int prepare(RGWRados *store, void *obj_ctx, string *oid_rand);
- virtual bool immutable_head() { return false; }
- void set_extra_data_len(uint64_t len) {
- extra_data_len = len;
- }
-- virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle);
-+ virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again);
- bufferlist& get_extra_data() { return extra_data_bl; }
- };
-
-
-@@ -1220,10 +1225,8 @@
- int get_obj_ioctx(const rgw_obj& obj, librados::IoCtx *ioctx);
- int get_obj_ref(const rgw_obj& obj, rgw_rados_ref *ref, rgw_bucket *bucket, bool ref_system_obj = false);
- uint64_t max_bucket_id;
-
-- uint64_t max_chunk_size;
--
- int get_obj_state(RGWRadosCtx *rctx, rgw_obj& obj, RGWObjState **state, RGWObjVersionTracker *objv_tracker);
- int append_atomic_test(RGWRadosCtx *rctx, rgw_obj& obj,
- librados::ObjectOperation& op, RGWObjState **state);
- int prepare_atomic_for_write_impl(RGWRadosCtx *rctx, rgw_obj& obj,
-@@ -1286,9 +1289,8 @@
- gc(NULL), use_gc_thread(false), quota_threads(false),
- num_watchers(0), watchers(NULL), watch_handles(NULL),
- watch_initialized(false),
- bucket_id_lock("rados_bucket_id"), max_bucket_id(0),
-- max_chunk_size(0),
- cct(NULL), rados(NULL),
- pools_initialized(false),
- quota_handler(NULL),
- rest_master_conn(NULL),
-@@ -1324,11 +1326,10 @@
- delete rados;
- }
- }
-
-- uint64_t get_max_chunk_size() {
-- return max_chunk_size;
-- }
-+ int get_required_alignment(rgw_bucket& bucket, uint64_t *alignment);
-+ int get_max_chunk_size(rgw_bucket& bucket, uint64_t *max_chunk_size);
-
- int list_raw_objects(rgw_bucket& pool, const string& prefix_filter, int max,
- RGWListRawObjsCtx& ctx, list<string>& oids,
- bool *is_truncated);
-@@ -1562,8 +1563,9 @@
- const string& owner,
- void **handle, off_t end,
- rgw_obj& dest_obj,
- rgw_obj& src_obj,
-+ uint64_t max_chunk_size,
- time_t *mtime,
- map<string, bufferlist>& attrs,
- RGWObjCategory category,
- string *ptag,
---- a/src/rgw/rgw_rest.cc
-+++ b/src/rgw/rgw_rest.cc
-@@ -179,9 +179,9 @@
- {
- std::ostringstream oss;
- formatter->flush(oss);
- std::string outs(oss.str());
-- if (!outs.empty()) {
-+ if (!outs.empty() && s->op != OP_HEAD) {
- s->cio->write(outs.c_str(), outs.size());
- }
-
- s->formatter->reset();
-@@ -191,9 +191,9 @@
- {
- std::ostringstream oss;
- formatter->flush(oss);
- std::string outs(oss.str());
-- if (!outs.empty()) {
-+ if (!outs.empty() && s->op != OP_HEAD) {
- s->cio->write(outs.c_str(), outs.size());
- }
- }
-
---- a/src/rgw/rgw_rest_swift.cc
-+++ b/src/rgw/rgw_rest_swift.cc
-@@ -626,20 +626,18 @@
- string hdrs, exp_hdrs;
- uint32_t max_age = CORS_MAX_AGE_INVALID;
- /*EACCES means, there is no CORS registered yet for the bucket
- *ENOENT means, there is no match of the Origin in the list of CORSRule
-- *ENOTSUPP means, the HTTP_METHOD is not supported
- */
- if (ret == -ENOENT)
- ret = -EACCES;
-- if (ret != -EACCES) {
-- get_response_params(hdrs, exp_hdrs, &max_age);
-- } else {
-+ if (ret < 0) {
- set_req_state_err(s, ret);
- dump_errno(s);
- end_header(s, NULL);
- return;
- }
-+ get_response_params(hdrs, exp_hdrs, &max_age);
- dump_errno(s);
- dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(), max_age);
- end_header(s, NULL);
- }
---- a/src/test/crush/TestCrushWrapper.cc
-+++ b/src/test/crush/TestCrushWrapper.cc
-@@ -537,8 +537,13 @@
- EXPECT_NE(string::npos,
- ss.str().find("<item_name>default</item_name></step>"));
- }
-
-+ map<int,float> wm;
-+ c->get_rule_weight_osd_map(0, &wm);
-+ ASSERT_TRUE(wm.size() == 1);
-+ ASSERT_TRUE(wm[0] == 1.0);
-+
- delete c;
- }
-
- TEST(CrushWrapper, distance) {
---- a/src/test/erasure-code/TestErasureCodeJerasure.cc
-+++ b/src/test/erasure-code/TestErasureCodeJerasure.cc
-@@ -287,8 +287,38 @@
- c->insert_item(g_ceph_context, osd, 1.0, string("osd.") + stringify(osd), loc);
- }
- }
-
-+ //
-+ // The ruleid may be different from the ruleset when a crush rule is
-+ // removed because the removed ruleid will be reused but the removed
-+ // ruleset will not be reused.
-+ //
-+ // This also asserts that the create_ruleset() method returns a
-+ // ruleset and not a ruleid http://tracker.ceph.com/issues/9044
-+ //
-+ {
-+ stringstream ss;
-+ ErasureCodeJerasureReedSolomonVandermonde jerasure;
-+ map<std::string,std::string> parameters;
-+ parameters["k"] = "2";
-+ parameters["m"] = "2";
-+ parameters["w"] = "8";
-+ jerasure.init(parameters);
-+ int FIRST = jerasure.create_ruleset("FIRST", *c, &ss);
-+ int SECOND = jerasure.create_ruleset("SECOND", *c, &ss);
-+ int FIRST_ruleid = c->get_rule_id("FIRST");
-+ EXPECT_EQ(0, c->remove_rule(FIRST_ruleid));
-+ int ruleset = jerasure.create_ruleset("myrule", *c, &ss);
-+ EXPECT_NE(FIRST, ruleset);
-+ EXPECT_NE(SECOND, ruleset);
-+ EXPECT_NE(ruleset, c->get_rule_id("myrule"));
-+ int SECOND_ruleid = c->get_rule_id("SECOND");
-+ EXPECT_EQ(0, c->remove_rule(SECOND_ruleid));
-+ int myrule_ruleid = c->get_rule_id("myrule");
-+ EXPECT_EQ(0, c->remove_rule(myrule_ruleid));
-+ }
-+
- {
- stringstream ss;
- ErasureCodeJerasureReedSolomonVandermonde jerasure;
- map<std::string,std::string> parameters;
---- a/src/test/librados/TestCase.cc
-+++ b/src/test/librados/TestCase.cc
-@@ -7,8 +7,9 @@
-
- using namespace librados;
-
- std::string RadosTest::pool_name;
-+std::string RadosTest::nspace;
- rados_t RadosTest::s_cluster = NULL;
-
- void RadosTest::SetUpTestCase()
- {
-@@ -24,9 +25,9 @@
- void RadosTest::SetUp()
- {
- cluster = RadosTest::s_cluster;
- ASSERT_EQ(0, rados_ioctx_create(cluster, pool_name.c_str(), &ioctx));
-- std::string nspace = get_temp_pool_name();
-+ nspace = get_temp_pool_name();
- rados_ioctx_set_namespace(ioctx, nspace.c_str());
- ASSERT_FALSE(rados_ioctx_pool_requires_alignment(ioctx));
- }
-
-@@ -205,26 +206,8 @@
- cleanup_default_namespace(ioctx);
- rados_ioctx_destroy(ioctx);
- }
-
--void RadosTestEC::cleanup_default_namespace(rados_ioctx_t ioctx)
--{
-- // remove all objects from the default namespace to avoid polluting
-- // other tests
-- rados_ioctx_set_namespace(ioctx, "");
-- rados_list_ctx_t list_ctx;
-- ASSERT_EQ(0, rados_objects_list_open(ioctx, &list_ctx));
-- int r;
-- const char *entry = NULL;
-- const char *key = NULL;
-- while ((r = rados_objects_list_next(list_ctx, &entry, &key)) != -ENOENT) {
-- ASSERT_EQ(0, r);
-- rados_ioctx_locator_set_key(ioctx, key);
-- ASSERT_EQ(0, rados_remove(ioctx, entry));
-- }
-- rados_objects_list_close(list_ctx);
--}
--
- std::string RadosTestECPP::pool_name;
- Rados RadosTestECPP::s_cluster;
-
- void RadosTestECPP::SetUpTestCase()
-@@ -253,15 +236,4 @@
- cleanup_default_namespace(ioctx);
- ioctx.close();
- }
-
--void RadosTestECPP::cleanup_default_namespace(librados::IoCtx ioctx)
--{
-- // remove all objects from the default namespace to avoid polluting
-- // other tests
-- ioctx.set_namespace("");
-- for (ObjectIterator it = ioctx.objects_begin();
-- it != ioctx.objects_end(); ++it) {
-- ioctx.locator_set_key(it->second);
-- ASSERT_EQ(0, ioctx.remove(it->first));
-- }
--}
---- a/src/test/librados/TestCase.h
-+++ b/src/test/librados/TestCase.h
-@@ -27,8 +27,9 @@
- static void TearDownTestCase();
- static void cleanup_default_namespace(rados_ioctx_t ioctx);
- static rados_t s_cluster;
- static std::string pool_name;
-+ static std::string nspace;
-
- virtual void SetUp();
- virtual void TearDown();
- rados_t cluster;
-@@ -71,16 +72,15 @@
- librados::IoCtx ioctx;
- std::string ns;
- };
-
--class RadosTestEC : public ::testing::Test {
-+class RadosTestEC : public RadosTest {
- public:
- RadosTestEC() {}
- virtual ~RadosTestEC() {}
- protected:
- static void SetUpTestCase();
- static void TearDownTestCase();
-- static void cleanup_default_namespace(rados_ioctx_t ioctx);
- static rados_t s_cluster;
- static std::string pool_name;
-
- virtual void SetUp();
-@@ -89,16 +89,15 @@
- rados_ioctx_t ioctx;
- uint64_t alignment;
- };
-
--class RadosTestECPP : public ::testing::Test {
-+class RadosTestECPP : public RadosTestPP {
- public:
- RadosTestECPP() : cluster(s_cluster) {};
- virtual ~RadosTestECPP() {};
- protected:
- static void SetUpTestCase();
- static void TearDownTestCase();
-- static void cleanup_default_namespace(librados::IoCtx ioctx);
- static librados::Rados s_cluster;
- static std::string pool_name;
-
- virtual void SetUp();
---- a/src/test/librados/io.cc
-+++ b/src/test/librados/io.cc
-@@ -24,8 +24,60 @@
- rados_ioctx_set_namespace(ioctx, "nspace");
- ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
- }
-
-+TEST_F(LibRadosIo, ReadTimeout) {
-+ char buf[128];
-+ memset(buf, 'a', sizeof(buf));
-+ ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
-+
-+ {
-+ // set up a second client
-+ rados_t cluster;
-+ rados_ioctx_t ioctx;
-+ rados_create(&cluster, "admin");
-+ rados_conf_read_file(cluster, NULL);
-+ rados_conf_parse_env(cluster, NULL);
-+ rados_conf_set(cluster, "rados_osd_op_timeout", "0.00001"); // use any small value that will result in a timeout
-+ rados_connect(cluster);
-+ rados_ioctx_create(cluster, pool_name.c_str(), &ioctx);
-+ rados_ioctx_set_namespace(ioctx, nspace.c_str());
-+
-+ // then we show that the buffer is changed after rados_read returned
-+ // with a timeout
-+ for (int i=0; i<5; i++) {
-+ char buf2[sizeof(buf)];
-+ memset(buf2, 0, sizeof(buf2));
-+ int err = rados_read(ioctx, "foo", buf2, sizeof(buf2), 0);
-+ if (err == -110) {
-+ int startIndex = 0;
-+ // find the index until which librados already read the object before the timeout occurred
-+ for (unsigned b=0; b<sizeof(buf); b++) {
-+ if (buf2[b] != buf[b]) {
-+ startIndex = b;
-+ break;
-+ }
-+ }
-+
-+ // wait some time to give librados a change to do something
-+ sleep(1);
-+
-+ // then check if the buffer was changed after the call
-+ if (buf2[startIndex] == 'a') {
-+ printf("byte at index %d was changed after the timeout to %d\n",
-+ startIndex, (int)buf[startIndex]);
-+ ASSERT_TRUE(0);
-+ break;
-+ }
-+ } else {
-+ printf("no timeout :/\n");
-+ }
-+ }
-+ rados_ioctx_destroy(ioctx);
-+ rados_shutdown(cluster);
-+ }
-+}
-+
- TEST_F(LibRadosIoPP, SimpleWritePP) {
- char buf[128];
- memset(buf, 0xcc, sizeof(buf));
- bufferlist bl;
---- a/src/test/librados/tier.cc
-+++ b/src/test/librados/tier.cc
-@@ -33,8 +33,40 @@
-
- typedef RadosTestPP LibRadosTierPP;
- typedef RadosTestECPP LibRadosTierECPP;
-
-+void flush_evict_all(librados::Rados& cluster, librados::IoCtx& cache_ioctx)
-+{
-+ bufferlist inbl;
-+ cache_ioctx.set_namespace("");
-+ for (ObjectIterator it = cache_ioctx.objects_begin();
-+ it != cache_ioctx.objects_end(); ++it) {
-+ cache_ioctx.locator_set_key(it->second);
-+ {
-+ ObjectReadOperation op;
-+ op.cache_flush();
-+ librados::AioCompletion *completion = cluster.aio_create_completion();
-+ cache_ioctx.aio_operate(
-+ it->first, completion, &op,
-+ librados::OPERATION_IGNORE_OVERLAY, NULL);
-+ completion->wait_for_safe();
-+ completion->get_return_value();
-+ completion->release();
-+ }
-+ {
-+ ObjectReadOperation op;
-+ op.cache_evict();
-+ librados::AioCompletion *completion = cluster.aio_create_completion();
-+ cache_ioctx.aio_operate(
-+ it->first, completion, &op,
-+ librados::OPERATION_IGNORE_OVERLAY, NULL);
-+ completion->wait_for_safe();
-+ completion->get_return_value();
-+ completion->release();
-+ }
-+ }
-+}
-+
- class LibRadosTwoPoolsPP : public RadosTestPP
- {
- public:
- LibRadosTwoPoolsPP() {};
-@@ -58,9 +90,28 @@
- cache_ioctx.set_namespace(ns);
- }
- virtual void TearDown() {
- RadosTestPP::TearDown();
-+
-+ // flush + evict cache
-+ flush_evict_all(cluster, cache_ioctx);
-+
-+ bufferlist inbl;
-+ // tear down tiers
-+ ASSERT_EQ(0, cluster.mon_command(
-+ "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-+ "\"}",
-+ inbl, NULL, NULL));
-+ ASSERT_EQ(0, cluster.mon_command(
-+ "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-+ "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-+ inbl, NULL, NULL));
-+
-+ // wait for maps to settle before next test
-+ cluster.wait_for_latest_osdmap();
-+
- cleanup_default_namespace(cache_ioctx);
-+
- cache_ioctx.close();
- }
- librados::IoCtx cache_ioctx;
- };
-@@ -179,21 +230,8 @@
- ASSERT_EQ(0, completion->get_return_value());
- completion->release();
- ASSERT_EQ('b', bl[0]);
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsPP, Promote) {
- // create object
-@@ -246,21 +284,8 @@
- ASSERT_TRUE(it->first == string("foo") || it->first == string("bar"));
- ++it;
- ASSERT_TRUE(it == cache_ioctx.objects_end());
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsPP, PromoteSnap) {
- // create object
-@@ -399,21 +424,8 @@
- {
- bufferlist bl;
- ASSERT_EQ(-ENOENT, ioctx.read("baz", bl, 1, 0));
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsPP, PromoteSnapScrub) {
- int num = 100;
-@@ -508,21 +520,8 @@
- cout << "done waiting" << std::endl;
- }
-
- ioctx.snap_set_read(librados::SNAP_HEAD);
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
-
- TEST_F(LibRadosTwoPoolsPP, PromoteSnapTrimRace) {
-@@ -576,21 +575,8 @@
- {
- bufferlist bl;
- ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsPP, Whiteout) {
- // create object
-@@ -652,21 +638,8 @@
- bufferlist bl;
- ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
- ASSERT_EQ('h', bl[0]);
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsPP, Evict) {
- // create object
-@@ -755,21 +728,8 @@
- completion->wait_for_safe();
- ASSERT_EQ(-EBUSY, completion->get_return_value());
- completion->release();
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsPP, EvictSnap) {
- // create object
-@@ -1003,21 +963,8 @@
- completion->wait_for_safe();
- ASSERT_EQ(0, completion->get_return_value());
- completion->release();
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsPP, TryFlush) {
- // configure cache
-@@ -1124,21 +1071,8 @@
- {
- ObjectIterator it = cache_ioctx.objects_begin();
- ASSERT_TRUE(it == cache_ioctx.objects_end());
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsPP, Flush) {
- // configure cache
-@@ -1297,21 +1231,8 @@
- {
- ObjectIterator it = ioctx.objects_begin();
- ASSERT_TRUE(it == ioctx.objects_end());
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsPP, FlushSnap) {
- // configure cache
-@@ -1469,20 +1390,13 @@
- ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
- ASSERT_EQ('a', bl[0]);
- }
-
-- // tear down tiers
-+ // remove overlay
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle
-- cluster.wait_for_latest_osdmap();
-
- // verify i can read the snaps from the base pool
- ioctx.snap_set_read(librados::SNAP_HEAD);
- {
-@@ -1501,8 +1415,13 @@
- bufferlist bl;
- ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
- ASSERT_EQ('a', bl[0]);
- }
-+
-+ ASSERT_EQ(0, cluster.mon_command(
-+ "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
-+ "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
-+ inbl, NULL, NULL));
- }
-
- TEST_F(LibRadosTierPP, FlushWriteRaces) {
- Rados cluster;
-@@ -1785,21 +1704,8 @@
- ASSERT_EQ(0, completion2->get_return_value());
- completion->release();
- completion2->release();
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
-
- IoCtx *read_ioctx = 0;
-@@ -1894,21 +1800,8 @@
- max_reads = 0;
- while (num_reads > 0)
- cond.Wait(test_lock);
- test_lock.Unlock();
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTierPP, HitSetNone) {
- {
-@@ -1943,23 +1836,30 @@
- + string("\",\"var\": \"") + var + string("\",\"val\": \"")
- + stringify(val) + string("\"}");
- }
-
--TEST_F(LibRadosTierPP, HitSetRead) {
-- // enable hitset tracking for this pool
-+TEST_F(LibRadosTwoPoolsPP, HitSetRead) {
-+ // make it a tier
- bufferlist inbl;
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 2),
-+ ASSERT_EQ(0, cluster.mon_command(
-+ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
-+ "\", \"tierpool\": \"" + cache_pool_name +
-+ "\", \"force_nonempty\": \"--force-nonempty\" }",
-+ inbl, NULL, NULL));
-+
-+ // enable hitset tracking for this pool
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 2),
- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
- "explicit_object"),
- inbl, NULL, NULL));
-
- // wait for maps to settle
- cluster.wait_for_latest_osdmap();
-
-- ioctx.set_namespace("");
-+ cache_ioctx.set_namespace("");
-
- // keep reading until we see our object appear in the HitSet
- utime_t start = ceph_clock_now(NULL);
- utime_t hard_stop = start + utime_t(600, 0);
-@@ -1968,18 +1868,18 @@
- utime_t now = ceph_clock_now(NULL);
- ASSERT_TRUE(now < hard_stop);
-
- string name = "foo";
-- uint32_t hash = ioctx.get_object_hash_position(name);
-+ uint32_t hash = cache_ioctx.get_object_hash_position(name);
- hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash,
-- cluster.pool_lookup(pool_name.c_str()), "");
-+ cluster.pool_lookup(cache_pool_name.c_str()), "");
-
- bufferlist bl;
-- ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
-+ ASSERT_EQ(-ENOENT, cache_ioctx.read("foo", bl, 1, 0));
-
- bufferlist hbl;
- AioCompletion *c = librados::Rados::aio_create_completion();
-- ASSERT_EQ(0, ioctx.hit_set_get(hash, c, now.sec(), &hbl));
-+ ASSERT_EQ(0, cache_ioctx.hit_set_get(hash, c, now.sec(), &hbl));
- c->wait_for_complete();
- c->release();
-
- if (hbl.length()) {
-@@ -2027,49 +1927,58 @@
- return -1;
- }
-
-
--TEST_F(LibRadosTierPP, HitSetWrite) {
-+TEST_F(LibRadosTwoPoolsPP, HitSetWrite) {
- int num_pg = _get_pg_num(cluster, pool_name);
- assert(num_pg > 0);
-
-- // enable hitset tracking for this pool
-+ // make it a tier
- bufferlist inbl;
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 8),
-+ ASSERT_EQ(0, cluster.mon_command(
-+ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
-+ "\", \"tierpool\": \"" + cache_pool_name +
-+ "\", \"force_nonempty\": \"--force-nonempty\" }",
-+ inbl, NULL, NULL));
-+
-+ // enable hitset tracking for this pool
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 8),
- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
- "explicit_hash"),
- inbl, NULL, NULL));
-
- // wait for maps to settle
- cluster.wait_for_latest_osdmap();
-
-- ioctx.set_namespace("");
-+ cache_ioctx.set_namespace("");
-+
-+ int num = 200;
-
- // do a bunch of writes
-- for (int i=0; i<1000; ++i) {
-+ for (int i=0; i<num; ++i) {
- bufferlist bl;
- bl.append("a");
-- ASSERT_EQ(0, ioctx.write(stringify(i), bl, 1, 0));
-+ ASSERT_EQ(0, cache_ioctx.write(stringify(i), bl, 1, 0));
- }
-
- // get HitSets
- std::map<int,HitSet> hitsets;
- for (int i=0; i<num_pg; ++i) {
- list< pair<time_t,time_t> > ls;
- AioCompletion *c = librados::Rados::aio_create_completion();
-- ASSERT_EQ(0, ioctx.hit_set_list(i, c, &ls));
-+ ASSERT_EQ(0, cache_ioctx.hit_set_list(i, c, &ls));
- c->wait_for_complete();
- c->release();
- std::cout << "pg " << i << " ls " << ls << std::endl;
- ASSERT_FALSE(ls.empty());
-
- // get the latest
- c = librados::Rados::aio_create_completion();
- bufferlist bl;
-- ASSERT_EQ(0, ioctx.hit_set_get(i, c, ls.back().first, &bl));
-+ ASSERT_EQ(0, cache_ioctx.hit_set_get(i, c, ls.back().first, &bl));
- c->wait_for_complete();
- c->release();
-
- //std::cout << "bl len is " << bl.length() << "\n";
-@@ -2080,16 +1989,16 @@
- ::decode(hitsets[i], p);
-
- // cope with racing splits by refreshing pg_num
- if (i == num_pg - 1)
-- num_pg = _get_pg_num(cluster, pool_name);
-+ num_pg = _get_pg_num(cluster, cache_pool_name);
- }
-
-- for (int i=0; i<1000; ++i) {
-+ for (int i=0; i<num; ++i) {
- string n = stringify(i);
-- uint32_t hash = ioctx.get_object_hash_position(n);
-+ uint32_t hash = cache_ioctx.get_object_hash_position(n);
- hobject_t oid(sobject_t(n, CEPH_NOSNAP), "", hash,
-- cluster.pool_lookup(pool_name.c_str()), "");
-+ cluster.pool_lookup(cache_pool_name.c_str()), "");
- std::cout << "checking for " << oid << std::endl;
- bool found = false;
- for (int p=0; p<num_pg; ++p) {
- if (hitsets[p].contains(oid)) {
-@@ -2100,45 +2009,52 @@
- ASSERT_TRUE(found);
- }
- }
-
--TEST_F(LibRadosTierPP, HitSetTrim) {
-+TEST_F(LibRadosTwoPoolsPP, HitSetTrim) {
- unsigned count = 3;
- unsigned period = 3;
-
-- // enable hitset tracking for this pool
-+ // make it a tier
- bufferlist inbl;
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", count),
-+ ASSERT_EQ(0, cluster.mon_command(
-+ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
-+ "\", \"tierpool\": \"" + cache_pool_name +
-+ "\", \"force_nonempty\": \"--force-nonempty\" }",
-+ inbl, NULL, NULL));
-+
-+ // enable hitset tracking for this pool
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", count),
- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", period),
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", period),
- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type", "bloom"),
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_fpp", ".01"),
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_fpp", ".01"),
- inbl, NULL, NULL));
-
- // wait for maps to settle
- cluster.wait_for_latest_osdmap();
-
-- ioctx.set_namespace("");
-+ cache_ioctx.set_namespace("");
-
- // do a bunch of writes and make sure the hitsets rotate
- utime_t start = ceph_clock_now(NULL);
- utime_t hard_stop = start + utime_t(count * period * 50, 0);
-
- time_t first = 0;
- while (true) {
- string name = "foo";
-- uint32_t hash = ioctx.get_object_hash_position(name);
-+ uint32_t hash = cache_ioctx.get_object_hash_position(name);
- hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash, -1, "");
-
- bufferlist bl;
- bl.append("f");
-- ASSERT_EQ(0, ioctx.write("foo", bl, 1, 0));
-+ ASSERT_EQ(0, cache_ioctx.write("foo", bl, 1, 0));
-
- list<pair<time_t, time_t> > ls;
- AioCompletion *c = librados::Rados::aio_create_completion();
-- ASSERT_EQ(0, ioctx.hit_set_list(hash, c, &ls));
-+ ASSERT_EQ(0, cache_ioctx.hit_set_list(hash, c, &ls));
- c->wait_for_complete();
- c->release();
-
- ASSERT_TRUE(ls.size() <= count + 1);
-@@ -2186,11 +2102,31 @@
- cache_ioctx.set_namespace(ns);
- }
- virtual void TearDown() {
- RadosTestECPP::TearDown();
-+
-+ // flush + evict cache
-+ flush_evict_all(cluster, cache_ioctx);
-+
-+ bufferlist inbl;
-+ // tear down tiers
-+ ASSERT_EQ(0, cluster.mon_command(
-+ "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-+ "\"}",
-+ inbl, NULL, NULL));
-+ ASSERT_EQ(0, cluster.mon_command(
-+ "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-+ "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-+ inbl, NULL, NULL));
-+
-+ // wait for maps to settle before next test
-+ cluster.wait_for_latest_osdmap();
-+
- cleanup_default_namespace(cache_ioctx);
-+
- cache_ioctx.close();
- }
-+
- librados::IoCtx cache_ioctx;
- };
-
- std::string LibRadosTwoPoolsECPP::cache_pool_name;
-@@ -2307,21 +2243,8 @@
- ASSERT_EQ(0, completion->get_return_value());
- completion->release();
- ASSERT_EQ('b', bl[0]);
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsECPP, Promote) {
- // create object
-@@ -2374,21 +2297,8 @@
- ASSERT_TRUE(it->first == string("foo") || it->first == string("bar"));
- ++it;
- ASSERT_TRUE(it == cache_ioctx.objects_end());
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsECPP, PromoteSnap) {
- // create object
-@@ -2551,21 +2461,8 @@
- {
- bufferlist bl;
- ASSERT_EQ(-ENOENT, ioctx.read("baz", bl, 1, 0));
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsECPP, PromoteSnapTrimRace) {
- // create object
-@@ -2618,21 +2515,8 @@
- {
- bufferlist bl;
- ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsECPP, Whiteout) {
- // create object
-@@ -2694,21 +2578,8 @@
- bufferlist bl;
- ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
- ASSERT_EQ('h', bl[0]);
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsECPP, Evict) {
- // create object
-@@ -2797,21 +2668,8 @@
- completion->wait_for_safe();
- ASSERT_EQ(-EBUSY, completion->get_return_value());
- completion->release();
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsECPP, EvictSnap) {
- // create object
-@@ -3045,21 +2903,8 @@
- completion->wait_for_safe();
- ASSERT_EQ(0, completion->get_return_value());
- completion->release();
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsECPP, TryFlush) {
- // configure cache
-@@ -3166,21 +3011,8 @@
- {
- ObjectIterator it = cache_ioctx.objects_begin();
- ASSERT_TRUE(it == cache_ioctx.objects_end());
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsECPP, Flush) {
- // configure cache
-@@ -3339,21 +3171,8 @@
- {
- ObjectIterator it = ioctx.objects_begin();
- ASSERT_TRUE(it == ioctx.objects_end());
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsECPP, FlushSnap) {
- // configure cache
-@@ -3516,12 +3335,8 @@
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
-
- // wait for maps to settle
- cluster.wait_for_latest_osdmap();
-
-@@ -3543,8 +3358,13 @@
- bufferlist bl;
- ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
- ASSERT_EQ('a', bl[0]);
- }
-+
-+ ASSERT_EQ(0, cluster.mon_command(
-+ "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
-+ "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
-+ inbl, NULL, NULL));
- }
-
- TEST_F(LibRadosTierECPP, FlushWriteRaces) {
- Rados cluster;
-@@ -3827,21 +3647,8 @@
- ASSERT_EQ(0, completion2->get_return_value());
- completion->release();
- completion2->release();
- }
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTwoPoolsECPP, TryFlushReadRace) {
- // configure cache
-@@ -3902,21 +3709,8 @@
- max_reads = 0;
- while (num_reads > 0)
- cond.Wait(test_lock);
- test_lock.Unlock();
--
-- // tear down tiers
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-- "\"}",
-- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(
-- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-- inbl, NULL, NULL));
--
-- // wait for maps to settle before next test
-- cluster.wait_for_latest_osdmap();
- }
-
- TEST_F(LibRadosTierECPP, HitSetNone) {
- {
-@@ -3937,23 +3731,30 @@
- c->release();
- }
- }
-
--TEST_F(LibRadosTierECPP, HitSetRead) {
-- // enable hitset tracking for this pool
-+TEST_F(LibRadosTwoPoolsECPP, HitSetRead) {
-+ // make it a tier
- bufferlist inbl;
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 2),
-+ ASSERT_EQ(0, cluster.mon_command(
-+ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
-+ "\", \"tierpool\": \"" + cache_pool_name +
-+ "\", \"force_nonempty\": \"--force-nonempty\" }",
-+ inbl, NULL, NULL));
-+
-+ // enable hitset tracking for this pool
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 2),
- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
- "explicit_object"),
- inbl, NULL, NULL));
-
- // wait for maps to settle
- cluster.wait_for_latest_osdmap();
-
-- ioctx.set_namespace("");
-+ cache_ioctx.set_namespace("");
-
- // keep reading until we see our object appear in the HitSet
- utime_t start = ceph_clock_now(NULL);
- utime_t hard_stop = start + utime_t(600, 0);
-@@ -3962,18 +3763,18 @@
- utime_t now = ceph_clock_now(NULL);
- ASSERT_TRUE(now < hard_stop);
-
- string name = "foo";
-- uint32_t hash = ioctx.get_object_hash_position(name);
-+ uint32_t hash = cache_ioctx.get_object_hash_position(name);
- hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash,
-- cluster.pool_lookup(pool_name.c_str()), "");
-+ cluster.pool_lookup(cache_pool_name.c_str()), "");
-
- bufferlist bl;
-- ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
-+ ASSERT_EQ(-ENOENT, cache_ioctx.read("foo", bl, 1, 0));
-
- bufferlist hbl;
- AioCompletion *c = librados::Rados::aio_create_completion();
-- ASSERT_EQ(0, ioctx.hit_set_get(hash, c, now.sec(), &hbl));
-+ ASSERT_EQ(0, cache_ioctx.hit_set_get(hash, c, now.sec(), &hbl));
- c->wait_for_complete();
- c->release();
-
- if (hbl.length()) {
-@@ -4068,27 +3869,34 @@
- }
- }
- #endif
-
--TEST_F(LibRadosTierECPP, HitSetTrim) {
-+TEST_F(LibRadosTwoPoolsECPP, HitSetTrim) {
- unsigned count = 3;
- unsigned period = 3;
-
-- // enable hitset tracking for this pool
-+ // make it a tier
- bufferlist inbl;
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", count),
-+ ASSERT_EQ(0, cluster.mon_command(
-+ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
-+ "\", \"tierpool\": \"" + cache_pool_name +
-+ "\", \"force_nonempty\": \"--force-nonempty\" }",
-+ inbl, NULL, NULL));
-+
-+ // enable hitset tracking for this pool
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", count),
- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", period),
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", period),
- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type", "bloom"),
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
- inbl, NULL, NULL));
-- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_fpp", ".01"),
-+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_fpp", ".01"),
- inbl, NULL, NULL));
-
- // wait for maps to settle
- cluster.wait_for_latest_osdmap();
-
-- ioctx.set_namespace("");
-+ cache_ioctx.set_namespace("");
-
- // do a bunch of writes and make sure the hitsets rotate
- utime_t start = ceph_clock_now(NULL);
- utime_t hard_stop = start + utime_t(count * period * 50, 0);
-@@ -4099,18 +3907,18 @@
- memset(buf, 'f', bsize);
-
- while (true) {
- string name = "foo";
-- uint32_t hash = ioctx.get_object_hash_position(name);
-+ uint32_t hash = cache_ioctx.get_object_hash_position(name);
- hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash, -1, "");
-
- bufferlist bl;
- bl.append(buf, bsize);
-- ASSERT_EQ(0, ioctx.append("foo", bl, bsize));
-+ ASSERT_EQ(0, cache_ioctx.append("foo", bl, bsize));
-
- list<pair<time_t, time_t> > ls;
- AioCompletion *c = librados::Rados::aio_create_completion();
-- ASSERT_EQ(0, ioctx.hit_set_list(hash, c, &ls));
-+ ASSERT_EQ(0, cache_ioctx.hit_set_list(hash, c, &ls));
- c->wait_for_complete();
- c->release();
-
- ASSERT_TRUE(ls.size() <= count + 1);
---- a/src/test/objectstore/store_test.cc
-+++ b/src/test/objectstore/store_test.cc
-@@ -1114,8 +1114,113 @@
- ASSERT_EQ(1u, newomap.size());
- ASSERT_TRUE(newomap.count("omap_key"));
- ASSERT_TRUE(newomap["omap_key"].contents_equal(omap["omap_key"]));
- }
-+ {
-+ ObjectStore::Transaction t;
-+ t.remove(cid, oid);
-+ t.remove_collection(cid);
-+ t.remove_collection(temp_cid);
-+ r = store->apply_transaction(t);
-+ ASSERT_EQ(r, 0);
-+ }
-+}
-+
-+TEST_P(StoreTest, BigRGWObjectName) {
-+ store->set_allow_sharded_objects();
-+ store->sync_and_flush();
-+ coll_t temp_cid("mytemp");
-+ hobject_t temp_oid("tmp_oid", "", CEPH_NOSNAP, 0, 0, "");
-+ coll_t cid("dest");
-+ ghobject_t oid(
-+ hobject_t(
-+ "default.4106.50_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa [...]
-+ "",
-+ CEPH_NOSNAP,
-+ 0x81920472,
-+ 3,
-+ ""),
-+ 15,
-+ shard_id_t(1));
-+ ghobject_t oid2(oid);
-+ oid2.generation = 17;
-+ ghobject_t oidhead(oid);
-+ oidhead.generation = ghobject_t::NO_GEN;
-+
-+ int r;
-+ {
-+ ObjectStore::Transaction t;
-+ t.create_collection(cid);
-+ t.touch(cid, oidhead);
-+ t.collection_move_rename(cid, oidhead, cid, oid);
-+ t.touch(cid, oidhead);
-+ t.collection_move_rename(cid, oidhead, cid, oid2);
-+ r = store->apply_transaction(t);
-+ ASSERT_EQ(r, 0);
-+ }
-+
-+ {
-+ ObjectStore::Transaction t;
-+ t.remove(cid, oid);
-+ r = store->apply_transaction(t);
-+ ASSERT_EQ(r, 0);
-+ }
-+
-+ {
-+ vector<ghobject_t> objects;
-+ r = store->collection_list(cid, objects);
-+ ASSERT_EQ(r, 0);
-+ ASSERT_EQ(objects.size(), 1u);
-+ ASSERT_EQ(objects[0], oid2);
-+ }
-+
-+ ASSERT_FALSE(store->exists(cid, oid));
-+
-+ {
-+ ObjectStore::Transaction t;
-+ t.remove(cid, oid2);
-+ t.remove_collection(cid);
-+ r = store->apply_transaction(t);
-+ ASSERT_EQ(r, 0);
-+
-+ }
-+}
-+
-+TEST_P(StoreTest, SetAllocHint) {
-+ coll_t cid("alloc_hint");
-+ ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, 0, ""));
-+ int r;
-+ {
-+ ObjectStore::Transaction t;
-+ t.create_collection(cid);
-+ t.touch(cid, hoid);
-+ r = store->apply_transaction(t);
-+ ASSERT_EQ(r, 0);
-+ }
-+ {
-+ ObjectStore::Transaction t;
-+ t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
-+ r = store->apply_transaction(t);
-+ ASSERT_EQ(r, 0);
-+ }
-+ {
-+ ObjectStore::Transaction t;
-+ t.remove(cid, hoid);
-+ r = store->apply_transaction(t);
-+ ASSERT_EQ(r, 0);
-+ }
-+ {
-+ ObjectStore::Transaction t;
-+ t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
-+ r = store->apply_transaction(t);
-+ ASSERT_EQ(r, 0);
-+ }
-+ {
-+ ObjectStore::Transaction t;
-+ t.remove_collection(cid);
-+ r = store->apply_transaction(t);
-+ ASSERT_EQ(r, 0);
-+ }
- }
-
- INSTANTIATE_TEST_CASE_P(
- ObjectStore,
---- a/src/test/osd/TestOSDMap.cc
-+++ b/src/test/osd/TestOSDMap.cc
-@@ -49,15 +49,26 @@
- pending_inc.new_uuid[i] = sample_uuid;
- }
- osdmap.apply_incremental(pending_inc);
-
-- // kludge to get an erasure coding rule and pool
-+ // Create an EC ruleset and a pool using it
- int r = osdmap.crush->add_simple_ruleset("erasure", "default", "osd",
- "indep", pg_pool_t::TYPE_ERASURE,
- &cerr);
-- pg_pool_t *p = (pg_pool_t *)osdmap.get_pg_pool(2);
-+
-+ OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
-+ new_pool_inc.new_pool_max = osdmap.get_pool_max();
-+ new_pool_inc.fsid = osdmap.get_fsid();
-+ pg_pool_t empty;
-+ uint64_t pool_id = ++new_pool_inc.new_pool_max;
-+ pg_pool_t *p = new_pool_inc.get_new_pool(pool_id, &empty);
-+ p->size = 3;
-+ p->set_pg_num(64);
-+ p->set_pgp_num(64);
- p->type = pg_pool_t::TYPE_ERASURE;
- p->crush_ruleset = r;
-+ new_pool_inc.new_pool_names[pool_id] = "ec";
-+ osdmap.apply_incremental(new_pool_inc);
- }
- unsigned int get_num_osds() { return num_osds; }
-
- void test_mappings(int pool,
-@@ -85,8 +96,50 @@
- ASSERT_EQ(get_num_osds(), (unsigned)osdmap.get_max_osd());
- ASSERT_EQ(get_num_osds(), osdmap.get_num_in_osds());
- }
-
-+TEST_F(OSDMapTest, Features) {
-+ // with EC pool
-+ set_up_map();
-+ uint64_t features = osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
-+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
-+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
-+ ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
-+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
-+ ASSERT_TRUE(features & CEPH_FEATURE_OSD_ERASURE_CODES);
-+ ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
-+ ASSERT_FALSE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
-+
-+ // clients have a slightly different view
-+ features = osdmap.get_features(CEPH_ENTITY_TYPE_CLIENT, NULL);
-+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
-+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
-+ ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
-+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
-+ ASSERT_FALSE(features & CEPH_FEATURE_OSD_ERASURE_CODES); // dont' need this
-+ ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
-+ ASSERT_FALSE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
-+
-+ // remove teh EC pool, but leave the rule. add primary affinity.
-+ {
-+ OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
-+ new_pool_inc.old_pools.insert(osdmap.lookup_pg_pool_name("ec"));
-+ new_pool_inc.new_primary_affinity[0] = 0x8000;
-+ osdmap.apply_incremental(new_pool_inc);
-+ }
-+
-+ features = osdmap.get_features(CEPH_ENTITY_TYPE_MON, NULL);
-+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
-+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
-+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3); // shared bit with primary affinity
-+ ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_V2);
-+ ASSERT_FALSE(features & CEPH_FEATURE_OSD_ERASURE_CODES);
-+ ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
-+ ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
-+
-+ // FIXME: test tiering feature bits
-+}
-+
- TEST_F(OSDMapTest, MapPG) {
- set_up_map();
-
- pg_t rawpg(0, 0, -1);
---- a/src/test/osd/osd-test-helpers.sh
-+++ b/src/test/osd/osd-test-helpers.sh
-@@ -36,8 +36,9 @@
- local ceph_args="$CEPH_ARGS"
- ceph_args+=" --osd-journal-size=100"
- ceph_args+=" --osd-data=$osd_data"
- ceph_args+=" --chdir="
-+ ceph_args+=" --osd-pool-default-erasure-code-directory=.libs"
- ceph_args+=" --run-dir=$dir"
- ceph_args+=" --debug-osd=20"
- ceph_args+=" --log-file=$dir/osd-\$id.log"
- ceph_args+=" --pid-file=$dir/osd-\$id.pidfile"
---- a/src/test/strtol.cc
-+++ b/src/test/strtol.cc
-@@ -13,8 +13,9 @@
- */
-
- #include "common/strtol.h"
- #include <string>
-+#include <map>
-
- #include "gtest/gtest.h"
-
- static void test_strict_strtoll(const char *str, long long expected)
-@@ -133,4 +134,78 @@
- test_strict_strtod_err("34.0 garbo");
-
- test_strict_strtof_err("0.05.0");
- }
-+
-+
-+static void test_strict_sistrtoll(const char *str)
-+{
-+ std::string err;
-+ strict_sistrtoll(str, &err);
-+ ASSERT_EQ(err, "");
-+}
-+
-+static void test_strict_sistrtoll_units(const std::string& foo,
-+ char u, const int m)
-+{
-+ std::string s(foo);
-+ s.push_back(u);
-+ const char *str = s.c_str();
-+ std::string err;
-+ uint64_t r = strict_sistrtoll(str, &err);
-+ ASSERT_EQ(err, "");
-+
-+ str = foo.c_str();
-+ std::string err2;
-+ long long tmp = strict_strtoll(str, 10, &err2);
-+ ASSERT_EQ(err2, "");
-+ tmp = (tmp << m);
-+ ASSERT_EQ(tmp, (long long)r);
-+}
-+
-+TEST(SIStrToLL, WithUnits) {
-+ std::map<char,int> units;
-+ units['B'] = 0;
-+ units['K'] = 10;
-+ units['M'] = 20;
-+ units['G'] = 30;
-+ units['T'] = 40;
-+ units['P'] = 50;
-+ units['E'] = 60;
-+
-+ for (std::map<char,int>::iterator p = units.begin();
-+ p != units.end(); ++p) {
-+ test_strict_sistrtoll_units("1024", p->first, p->second);
-+ test_strict_sistrtoll_units("1", p->first, p->second);
-+ test_strict_sistrtoll_units("0", p->first, p->second);
-+ }
-+}
-+
-+TEST(SIStrToLL, WithoutUnits) {
-+ test_strict_sistrtoll("1024");
-+ test_strict_sistrtoll("1152921504606846976");
-+ test_strict_sistrtoll("0");
-+}
-+
-+static void test_strict_sistrtoll_err(const char *str)
-+{
-+ std::string err;
-+ strict_sistrtoll(str, &err);
-+ ASSERT_NE(err, "");
-+}
-+
-+TEST(SIStrToLL, Error) {
-+ test_strict_sistrtoll_err("1024F");
-+ test_strict_sistrtoll_err("QDDSA");
-+ test_strict_sistrtoll_err("1b");
-+ test_strict_sistrtoll_err("100k");
-+ test_strict_sistrtoll_err("1000m");
-+ test_strict_sistrtoll_err("1g");
-+ test_strict_sistrtoll_err("20t");
-+ test_strict_sistrtoll_err("100p");
-+ test_strict_sistrtoll_err("1000e");
-+ test_strict_sistrtoll_err("B");
-+ test_strict_sistrtoll_err("M");
-+ test_strict_sistrtoll_err("BM");
-+ test_strict_sistrtoll_err("B0wef");
-+ test_strict_sistrtoll_err("0m");
-+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git
More information about the Pkg-ceph-commits
mailing list