[Pkg-ceph-commits] [ceph] 01/01: Imported Upstream version 0.80.11
Gaudenz Steinlin
gaudenz at moszumanska.debian.org
Mon Dec 26 20:48:02 UTC 2016
This is an automated email from the git hooks/post-receive script.
gaudenz pushed a commit to annotated tag upstream/0.80.11
in repository ceph.
commit a829670e143162f323f12bd013f1331406831dd0
Author: Gaudenz Steinlin <gaudenz at debian.org>
Date: Thu Jan 14 22:06:05 2016 +0100
Imported Upstream version 0.80.11
---
Makefile.am | 6 +-
Makefile.in | 6 +-
ceph.spec | 46 +++---
ceph.spec.in | 44 +++---
configure | 22 +--
configure.ac | 2 +-
src/.git_version | 4 +-
src/Makefile.in | 2 +-
src/ceph-disk | 106 ++++++++-----
src/ceph-post-file.in | 20 +--
src/ceph_fuse.cc | 4 +-
src/ceph_syn.cc | 5 +-
src/common/Mutex.cc | 1 +
src/common/RWLock.h | 3 +-
src/common/Thread.cc | 12 +-
src/common/Throttle.cc | 6 +
src/common/Throttle.h | 3 +-
src/common/WorkQueue.h | 35 ++++-
src/common/admin_socket.cc | 50 ++++--
src/common/admin_socket.h | 1 +
src/common/buffer.cc | 23 ++-
src/common/ceph_context.cc | 6 +
src/common/ceph_context.h | 19 +++
src/common/config.cc | 2 +-
src/common/config_opts.h | 17 +-
src/common/sync_filesystem.h | 12 +-
src/crush/CrushWrapper.cc | 27 ++++
src/crush/CrushWrapper.h | 1 +
src/include/interval_set.h | 1 +
src/init-radosgw | 16 +-
src/init-radosgw.sysv | 17 +-
src/json_spirit/json_spirit_reader_template.h | 34 +++-
src/libcephfs.cc | 17 +-
src/librados/IoCtxImpl.cc | 4 +-
src/librados/RadosClient.cc | 8 +-
src/librados/RadosClient.h | 4 +-
src/librbd/AioCompletion.cc | 48 +++++-
src/librbd/AioCompletion.h | 33 +---
src/librbd/AioRequest.cc | 15 +-
src/librbd/AioRequest.h | 6 +-
src/librbd/ImageCtx.cc | 34 +++-
src/librbd/ImageCtx.h | 3 +
src/librbd/internal.cc | 183 +++++++++-------------
src/librbd/internal.h | 17 +-
src/librbd/librbd.cc | 139 +++++++++++++++--
src/log/Log.cc | 2 +
src/mds/MDSUtility.cc | 2 +-
src/messages/MOSDBoot.h | 20 ++-
src/mon/AuthMonitor.cc | 3 +-
src/mon/Elector.cc | 9 +-
src/mon/MDSMonitor.cc | 47 +++---
src/mon/MDSMonitor.h | 6 +-
src/mon/MonClient.cc | 12 +-
src/mon/MonCommands.h | 4 +-
src/mon/Monitor.cc | 69 ++++-----
src/mon/MonitorDBStore.h | 19 ++-
src/mon/MonitorStore.cc | 16 +-
src/mon/OSDMonitor.cc | 70 ++++++++-
src/mon/OSDMonitor.h | 10 +-
src/mon/PGMonitor.cc | 8 +-
src/mon/PaxosService.cc | 10 ++
src/mon/PaxosService.h | 5 +-
src/msg/Messenger.cc | 7 +
src/msg/Messenger.h | 15 ++
src/os/FileJournal.cc | 13 +-
src/os/FileStore.cc | 6 +-
src/os/WBThrottle.cc | 1 +
src/os/chain_xattr.cc | 8 +
src/osd/ECBackend.cc | 2 +-
src/osd/OSD.cc | 61 ++++++--
src/osd/OSD.h | 36 ++---
src/osd/OSDMap.cc | 14 +-
src/osd/PG.cc | 6 +-
src/osd/PGLog.cc | 65 +++++---
src/osd/PGLog.h | 19 ++-
src/osd/ReplicatedPG.cc | 100 ++++++++++--
src/osd/ReplicatedPG.h | 6 +-
src/osd/osd_types.cc | 153 ++++++++++++++++--
src/osd/osd_types.h | 44 +++++-
src/osdc/Objecter.cc | 92 +++++++++--
src/osdc/Objecter.h | 80 ++++++++--
src/rgw/logrotate.conf | 2 +-
src/rgw/rgw_civetweb.cc | 11 +-
src/rgw/rgw_client_io.cc | 7 +-
src/rgw/rgw_main.cc | 2 +-
src/rgw/rgw_op.cc | 13 ++
src/rgw/rgw_rados.cc | 35 +++++
src/rgw/rgw_rados.h | 11 ++
src/rgw/rgw_rest.cc | 34 +++-
src/rgw/rgw_rest.h | 8 +-
src/rgw/rgw_rest_swift.cc | 43 ++++--
src/rgw/rgw_swift.cc | 5 +
src/test/Makefile.am | 2 +-
src/test/bufferlist.cc | 11 ++
src/test/librados/TestCase.cc | 9 +-
src/test/librados/tier.cc | 215 +++++++++++++++++++++++++-
src/test/librbd/test_librbd.cc | 71 ++++++++-
src/test/mon/test_mon_workloadgen.cc | 3 +-
src/test/objectstore/chain_xattr.cc | 39 +++++
src/test/osd/TestPGLog.cc | 84 +++++++---
src/tools/ceph_authtool.cc | 2 +-
src/upstart/ceph-mds.conf | 2 +-
src/upstart/ceph-mon.conf | 2 +-
src/upstart/ceph-osd.conf | 2 +-
104 files changed, 2053 insertions(+), 664 deletions(-)
diff --git a/Makefile.am b/Makefile.am
index cba3af2..7b6960c 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -20,9 +20,9 @@ EXTRA_DIST += \
# why is it so hard to make autotools to this?
install-data-local:
-mkdir -p $(DESTDIR)$(datadir)/ceph
- -install -m 644 share/known_hosts_drop.ceph.com $(DESTDIR)$(datadir)/ceph/known_hosts_drop.ceph.com
- -install -m 644 share/id_dsa_drop.ceph.com $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com
- -install -m 644 share/id_dsa_drop.ceph.com.pub $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com.pub
+ -install -m 600 share/known_hosts_drop.ceph.com $(DESTDIR)$(datadir)/ceph/known_hosts_drop.ceph.com
+ -install -m 600 share/id_dsa_drop.ceph.com $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com
+ -install -m 600 share/id_dsa_drop.ceph.com.pub $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com.pub
all-local:
if WITH_DEBUG
diff --git a/Makefile.in b/Makefile.in
index a1095c0..969724b 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -864,9 +864,9 @@ uninstall-am:
# why is it so hard to make autotools to this?
install-data-local:
-mkdir -p $(DESTDIR)$(datadir)/ceph
- -install -m 644 share/known_hosts_drop.ceph.com $(DESTDIR)$(datadir)/ceph/known_hosts_drop.ceph.com
- -install -m 644 share/id_dsa_drop.ceph.com $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com
- -install -m 644 share/id_dsa_drop.ceph.com.pub $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com.pub
+ -install -m 600 share/known_hosts_drop.ceph.com $(DESTDIR)$(datadir)/ceph/known_hosts_drop.ceph.com
+ -install -m 600 share/id_dsa_drop.ceph.com $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com
+ -install -m 600 share/id_dsa_drop.ceph.com.pub $(DESTDIR)$(datadir)/ceph/id_dsa_drop.ceph.com.pub
all-local:
# We need gtest to build the rados-api tests. We only build those in
diff --git a/ceph.spec b/ceph.spec
index 73fe7a8..772cc8a 100644
--- a/ceph.spec
+++ b/ceph.spec
@@ -5,11 +5,14 @@
%{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
%endif
+%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
+
#################################################################################
# common
#################################################################################
+
Name: ceph
-Version: 0.80.10
+Version: 0.80.11
Release: 0%{?dist}
Summary: User space components of the Ceph file system
License: GPL-2.0
@@ -24,7 +27,6 @@ Requires: librados2 = %{version}-%{release}
Requires: libcephfs1 = %{version}-%{release}
Requires: ceph-common = %{version}-%{release}
Requires: python
-Requires: python-argparse
Requires: python-ceph
Requires: python-requests
Requires: python-flask
@@ -44,7 +46,6 @@ BuildRequires: gdbm
BuildRequires: pkgconfig
BuildRequires: python
BuildRequires: python-nose
-BuildRequires: python-argparse
BuildRequires: libaio-devel
BuildRequires: libcurl-devel
BuildRequires: libxml2-devel
@@ -77,7 +78,6 @@ BuildRequires: %insserv_prereq
BuildRequires: mozilla-nss-devel
BuildRequires: keyutils-devel
BuildRequires: libatomic-ops-devel
-BuildRequires: fdupes
%else
Requires: gdisk
BuildRequires: nss-devel
@@ -107,6 +107,11 @@ Requires: librados2 = %{version}-%{release}
Requires: python-ceph = %{version}-%{release}
Requires: python-requests
Requires: redhat-lsb-core
+# python-argparse is only needed in distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+Requires: python-argparse
+BuildRequires: python-argparse
+%endif
%description -n ceph-common
common utilities to mount and interact with a ceph storage cluster
@@ -219,9 +224,6 @@ Group: System Environment/Libraries
License: LGPL-2.0
Requires: librados2 = %{version}-%{release}
Requires: librbd1 = %{version}-%{release}
-%if 0%{defined suse_version}
-%py_requires
-%endif
%description -n python-ceph
This package contains Python libraries for interacting with Cephs RADOS
object storage.
@@ -262,8 +264,13 @@ License: LGPL-2.0
Requires: java
Requires: libcephfs_jni1 = %{version}-%{release}
BuildRequires: java-devel
+%if 0%{?el6}
Requires: junit4
BuildRequires: junit4
+%else
+Requires: junit
+BuildRequires: junit
+%endif
%description -n cephfs-java
This package contains the Java libraries for the Ceph File System.
@@ -352,13 +359,8 @@ chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf
chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.fetch_config
# udev rules
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%else
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%endif
+install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT%{_udevrulesdir}/50-rbd.rules
+install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
%if (0%{?rhel} && 0%{?rhel} < 7)
install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
@@ -385,12 +387,6 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
-%if %{defined suse_version}
-# Fedora seems to have some problems with this macro, use it only on SUSE
-%fdupes -s $RPM_BUILD_ROOT/%{python_sitelib}
-%fdupes %buildroot
-%endif
-
%clean
rm -rf $RPM_BUILD_ROOT
@@ -477,13 +473,8 @@ fi
%{_libdir}/ceph/erasure-code/libec_jerasure*.so*
%{_libdir}/ceph/erasure-code/libec_test_jerasure*.so*
%{_libdir}/ceph/erasure-code/libec_missing_entry_point.so*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/usr/lib/udev/rules.d/95-ceph-osd.rules
-%else
-/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/lib/udev/rules.d/95-ceph-osd.rules
-%endif
+%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+%{_udevrulesdir}/95-ceph-osd.rules
%config %{_sysconfdir}/bash_completion.d/ceph
%config(noreplace) %{_sysconfdir}/logrotate.d/ceph
%config(noreplace) %{_sysconfdir}/logrotate.d/radosgw
@@ -543,6 +534,7 @@ fi
%config %{_sysconfdir}/bash_completion.d/rbd
%config(noreplace) %{_sysconfdir}/ceph/rbdmap
%{_initrddir}/rbdmap
+%{_udevrulesdir}/50-rbd.rules
%postun -n ceph-common
# Package removal cleanup
diff --git a/ceph.spec.in b/ceph.spec.in
index b1372aa..67a3825 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -5,9 +5,12 @@
%{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
%endif
+%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
+
#################################################################################
# common
#################################################################################
+
Name: ceph
Version: @VERSION@
Release: @RPM_RELEASE@%{?dist}
@@ -24,7 +27,6 @@ Requires: librados2 = %{version}-%{release}
Requires: libcephfs1 = %{version}-%{release}
Requires: ceph-common = %{version}-%{release}
Requires: python
-Requires: python-argparse
Requires: python-ceph
Requires: python-requests
Requires: python-flask
@@ -44,7 +46,6 @@ BuildRequires: gdbm
BuildRequires: pkgconfig
BuildRequires: python
BuildRequires: python-nose
-BuildRequires: python-argparse
BuildRequires: libaio-devel
BuildRequires: libcurl-devel
BuildRequires: libxml2-devel
@@ -77,7 +78,6 @@ BuildRequires: %insserv_prereq
BuildRequires: mozilla-nss-devel
BuildRequires: keyutils-devel
BuildRequires: libatomic-ops-devel
-BuildRequires: fdupes
%else
Requires: gdisk
BuildRequires: nss-devel
@@ -107,6 +107,11 @@ Requires: librados2 = %{version}-%{release}
Requires: python-ceph = %{version}-%{release}
Requires: python-requests
Requires: redhat-lsb-core
+# python-argparse is only needed in distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+Requires: python-argparse
+BuildRequires: python-argparse
+%endif
%description -n ceph-common
common utilities to mount and interact with a ceph storage cluster
@@ -219,9 +224,6 @@ Group: System Environment/Libraries
License: LGPL-2.0
Requires: librados2 = %{version}-%{release}
Requires: librbd1 = %{version}-%{release}
-%if 0%{defined suse_version}
-%py_requires
-%endif
%description -n python-ceph
This package contains Python libraries for interacting with Cephs RADOS
object storage.
@@ -262,8 +264,13 @@ License: LGPL-2.0
Requires: java
Requires: libcephfs_jni1 = %{version}-%{release}
BuildRequires: java-devel
+%if 0%{?el6}
Requires: junit4
BuildRequires: junit4
+%else
+Requires: junit
+BuildRequires: junit
+%endif
%description -n cephfs-java
This package contains the Java libraries for the Ceph File System.
@@ -352,13 +359,8 @@ chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.ceph.conf
chmod 0644 $RPM_BUILD_ROOT%{_docdir}/ceph/sample.fetch_config
# udev rules
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%else
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%endif
+install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT%{_udevrulesdir}/50-rbd.rules
+install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
%if (0%{?rhel} && 0%{?rhel} < 7)
install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
@@ -385,12 +387,6 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
-%if %{defined suse_version}
-# Fedora seems to have some problems with this macro, use it only on SUSE
-%fdupes -s $RPM_BUILD_ROOT/%{python_sitelib}
-%fdupes %buildroot
-%endif
-
%clean
rm -rf $RPM_BUILD_ROOT
@@ -477,13 +473,8 @@ fi
%{_libdir}/ceph/erasure-code/libec_jerasure*.so*
%{_libdir}/ceph/erasure-code/libec_test_jerasure*.so*
%{_libdir}/ceph/erasure-code/libec_missing_entry_point.so*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/usr/lib/udev/rules.d/95-ceph-osd.rules
-%else
-/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/lib/udev/rules.d/95-ceph-osd.rules
-%endif
+%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+%{_udevrulesdir}/95-ceph-osd.rules
%config %{_sysconfdir}/bash_completion.d/ceph
%config(noreplace) %{_sysconfdir}/logrotate.d/ceph
%config(noreplace) %{_sysconfdir}/logrotate.d/radosgw
@@ -543,6 +534,7 @@ fi
%config %{_sysconfdir}/bash_completion.d/rbd
%config(noreplace) %{_sysconfdir}/ceph/rbdmap
%{_initrddir}/rbdmap
+%{_udevrulesdir}/50-rbd.rules
%postun -n ceph-common
# Package removal cleanup
diff --git a/configure b/configure
index 18f6513..b39fb90 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for ceph 0.80.10.
+# Generated by GNU Autoconf 2.69 for ceph 0.80.11.
#
# Report bugs to <ceph-devel at vger.kernel.org>.
#
@@ -590,8 +590,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='ceph'
PACKAGE_TARNAME='ceph'
-PACKAGE_VERSION='0.80.10'
-PACKAGE_STRING='ceph 0.80.10'
+PACKAGE_VERSION='0.80.11'
+PACKAGE_STRING='ceph 0.80.11'
PACKAGE_BUGREPORT='ceph-devel at vger.kernel.org'
PACKAGE_URL=''
@@ -1459,7 +1459,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures ceph 0.80.10 to adapt to many kinds of systems.
+\`configure' configures ceph 0.80.11 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1530,7 +1530,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of ceph 0.80.10:";;
+ short | recursive ) echo "Configuration of ceph 0.80.11:";;
esac
cat <<\_ACEOF
@@ -1677,7 +1677,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-ceph configure 0.80.10
+ceph configure 0.80.11
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2707,7 +2707,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by ceph $as_me 0.80.10, which was
+It was created by ceph $as_me 0.80.11, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -4824,7 +4824,7 @@ fi
# Define the identity of the package.
PACKAGE='ceph'
- VERSION='0.80.10'
+ VERSION='0.80.11'
cat >>confdefs.h <<_ACEOF
@@ -12728,7 +12728,7 @@ fi
# Define the identity of the package.
PACKAGE='ceph'
- VERSION='0.80.10'
+ VERSION='0.80.11'
cat >>confdefs.h <<_ACEOF
@@ -22601,7 +22601,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by ceph $as_me 0.80.10, which was
+This file was extended by ceph $as_me 0.80.11, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -22667,7 +22667,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-ceph config.status 0.80.10
+ceph config.status 0.80.11
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index f667953..4b0bec2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
# VERSION define is not used by the code. It gets a version string
# from 'git describe'; see src/ceph_ver.[ch]
-AC_INIT([ceph], [0.80.10], [ceph-devel at vger.kernel.org])
+AC_INIT([ceph], [0.80.11], [ceph-devel at vger.kernel.org])
# Create release string. Used with VERSION for RPMs.
RPM_RELEASE=0
diff --git a/src/.git_version b/src/.git_version
index d488cda..654f55e 100644
--- a/src/.git_version
+++ b/src/.git_version
@@ -1,2 +1,2 @@
-ea6c958c38df1216bf95c927f143d8b13c4a9e70
-v0.80.10
+8424145d49264624a3b0a204aedb127835161070
+v0.80.11
diff --git a/src/Makefile.in b/src/Makefile.in
index ca219b8..1c86a61 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -5280,7 +5280,7 @@ ceph_test_librbd_LDADD = $(LIBRBD) $(LIBRADOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
ceph_test_librbd_CXXFLAGS = $(UNITTEST_CXXFLAGS)
@LINUX_TRUE at ceph_test_librbd_fsx_SOURCES = test/librbd/fsx.c
@LINUX_TRUE at ceph_test_librbd_fsx_LDADD = $(LIBRBD) $(LIBRADOS) -lm
- at LINUX_TRUE@ceph_test_librbd_fsx_CFLAGS = ${AM_CFLAGS} -Wno-format
+ at LINUX_TRUE@ceph_test_librbd_fsx_CFLAGS = ${AM_CFLAGS}
ceph_test_cls_rbd_SOURCES = test/cls_rbd/test_cls_rbd.cc
ceph_test_cls_rbd_LDADD = $(LIBRADOS) libcls_rbd_client.la libcls_lock_client.la $(UNITTEST_LDADD) $(RADOS_TEST_LDADD)
ceph_test_cls_rbd_CXXFLAGS = $(UNITTEST_CXXFLAGS)
diff --git a/src/ceph-disk b/src/ceph-disk
index 6bd0220..c5e7af6 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -919,6 +919,9 @@ def unmount(
###########################################
+def extract_parted_partition_numbers(partitions):
+ numbers_as_strings = re.findall('^\d+', partitions, re.MULTILINE)
+ return map(int, numbers_as_strings)
def get_free_partition_index(dev):
"""
@@ -943,31 +946,19 @@ def get_free_partition_index(dev):
if not lines:
raise Error('parted failed to output anything')
- lines = str(lines).splitlines(True)
-
- # work around buggy libreadline(?) library in rhel/centos.
- idiot_prefix = '\x1b\x5b\x3f\x31\x30\x33\x34\x68'
- if lines[0].startswith(idiot_prefix):
- lines[0] = lines[0][8:]
-
- if lines[0] not in ['CHS;\n', 'CYL;\n', 'BYT;\n']:
- raise Error('weird parted units', lines[0])
- del lines[0]
-
- if not lines[0].startswith('/dev/'):
- raise Error('weird parted disk entry', lines[0])
- del lines[0]
-
- seen = set()
- for line in lines:
- idx, _ = line.split(':', 1)
- idx = int(idx)
- seen.add(idx)
-
- num = 1
- while num in seen:
- num += 1
- return num
+ if ('CHS;' not in lines and
+ 'CYL;' not in lines and
+ 'BYT;' not in lines):
+ raise Error('parted output expected to contain one of ' +
+ 'CHH; CYL; or BYT; : ' + lines)
+ if dev not in lines:
+ raise Error('parted output expected to contain ' + dev + ': ' + lines)
+ _, partitions = lines.split(dev)
+ partition_numbers = extract_parted_partition_numbers(partitions)
+ if partition_numbers:
+ return max(partition_numbers) + 1
+ else:
+ return 1
def update_partition(action, dev, description):
@@ -1018,6 +1009,13 @@ def zap(dev):
[
'sgdisk',
'--zap-all',
+ '--',
+ dev,
+ ],
+ )
+ command_check_call(
+ [
+ 'sgdisk',
'--clear',
'--mbrtogpt',
'--',
@@ -1039,10 +1037,40 @@ def prepare_journal_dev(
journal_dm_keypath,
):
+ reusing_partition = False
+
if is_partition(journal):
LOG.debug('Journal %s is a partition', journal)
LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
- return (journal, None, None)
+ if get_partition_type(journal) == JOURNAL_UUID:
+ LOG.debug('Journal %s was previously prepared with ceph-disk. Reusing it.', journal)
+ reusing_partition = True
+ # Read and reuse the partition uuid from this journal's previous life.
+ # We reuse the uuid instead of changing it because udev does not reliably
+ # notice changes to an existing partition's GUID.
+ # See http://tracker.ceph.com/issues/10146
+ journal_uuid = get_partition_uuid(journal)
+ LOG.debug('Reusing journal with uuid %s', journal_uuid)
+ else:
+ LOG.warning('Journal %s was not prepared with ceph-disk. Symlinking directly.', journal)
+ return (journal, None, None)
+
+ journal_symlink = '/dev/disk/by-partuuid/{journal_uuid}'.format(
+ journal_uuid=journal_uuid,
+ )
+
+ journal_dmcrypt = None
+ if journal_dm_keypath:
+ journal_dmcrypt = journal_symlink
+ journal_symlink = '/dev/mapper/{uuid}'.format(uuid=journal_uuid)
+
+ if reusing_partition:
+ # confirm that the journal_symlink exists. It should since this was an active journal
+ # in the past. Continuing otherwise would be futile.
+ assert os.path.exists(journal_symlink)
+ return (journal_symlink, journal_dmcrypt, journal_uuid)
+
+ # From here on we are creating a new journal device, not reusing.
ptype = JOURNAL_UUID
if journal_dm_keypath:
@@ -1099,8 +1127,8 @@ def prepare_journal_dev(
'--mbrtogpt',
'--',
journal,
- ],
- )
+ ]
+ )
update_partition('-a', journal, 'prepared')
@@ -1112,16 +1140,11 @@ def prepare_journal_dev(
],
)
- journal_symlink = '/dev/disk/by-partuuid/{journal_uuid}'.format(
- journal_uuid=journal_uuid,
- )
+ LOG.debug('Journal is GPT partition %s', journal_symlink)
- journal_dmcrypt = None
- if journal_dm_keypath:
- journal_dmcrypt = journal_symlink
- journal_symlink = '/dev/mapper/{uuid}'.format(uuid=journal_uuid)
+ # udev should have created the symlink by now. If not, abort.
+ assert os.path.exists(journal_symlink)
- LOG.debug('Journal is GPT partition %s', journal_symlink)
return (journal_symlink, journal_dmcrypt, journal_uuid)
except subprocess.CalledProcessError as e:
@@ -2158,6 +2181,13 @@ def get_dev_fs(dev):
return None
+def split_dev_base_partnum(dev):
+ if 'loop' in dev or 'cciss' in dev or 'nvme' in dev:
+ return re.match('(.*\d+)p(\d+)', dev).group(1, 2)
+ else:
+ return re.match('(\D+)(\d+)', dev).group(1, 2)
+
+
def get_partition_type(part):
"""
Get the GPT partition type UUID. If we have an old blkid and can't
@@ -2207,7 +2237,7 @@ def get_partition_type(part):
if 'blkid' not in warned_about:
LOG.warning('Old blkid does not support ID_PART_ENTRY_* fields, trying sgdisk; may not correctly identify ceph volumes with dmcrypt')
warned_about['blkid'] = True
- (base, partnum) = re.match('(\D+)(\d+)', part).group(1, 2)
+ (base, partnum) = split_dev_base_partnum(part)
sgdisk, _ = command(
[
'sgdisk',
@@ -2233,7 +2263,7 @@ def get_partition_type(part):
def get_partition_uuid(dev):
- (base, partnum) = re.match('(\D+)(\d+)', dev).group(1, 2)
+ (base, partnum) = split_dev_base_partnum(dev)
out, _ = command(['sgdisk', '-i', partnum, base])
for line in out.splitlines():
m = re.match('Partition unique GUID: (\S+)', line)
diff --git a/src/ceph-post-file.in b/src/ceph-post-file.in
index 9b922a6..b278e8a 100755
--- a/src/ceph-post-file.in
+++ b/src/ceph-post-file.in
@@ -1,16 +1,16 @@
#!/bin/bash -e
-# if we start up as ./$0, assume we are running from a source
-# checkout.
-if [ `dirname $0` = "." ] && [ $PWD != "/usr/bin" ]; then
- known_hosts=../share/known_hosts_drop.ceph.com
- ssh_key=../share/id_dsa_drop.ceph.com
-else
- known_hosts=@datadir@/known_hosts_drop.ceph.com
- ssh_key=@datadir@/id_dsa_drop.ceph.com
+# If these files exist, assume we are a source install.
+if [[ -f ../share/known_hosts_drop.ceph.com && -f ../share/id_dsa_drop.ceph.com ]]
+ then # running from source install
+ known_hosts=../share/known_hosts_drop.ceph.com
+ ssh_key=../share/id_dsa_drop.ceph.com
+ else # running from a pkg install
+ known_hosts=@datadir@/known_hosts_drop.ceph.com
+ ssh_key=@datadir@/id_dsa_drop.ceph.com
fi
-usage() {
+function usage() {
echo "Usage: $0 [options] file1 [dir2 ...]
Easily upload files or directories to ceph.com for analysis by Ceph
@@ -155,7 +155,7 @@ done
cp "$ssh_key" "$t4"
cp "${ssh_key}.pub" "$t4.pub"
-sftp -i $t4 \
+sftp -o "IdentityFile=$t4" \
-C \
-oCheckHostIP=no \
-oGlobalKnownHostsFile=$known_hosts \
diff --git a/src/ceph_fuse.cc b/src/ceph_fuse.cc
index 54616f6..cc97938 100644
--- a/src/ceph_fuse.cc
+++ b/src/ceph_fuse.cc
@@ -120,9 +120,7 @@ int main(int argc, const char **argv, const char *envp[]) {
goto out_mc_start_failed;
// start up network
- messenger = Messenger::create(g_ceph_context,
- entity_name_t::CLIENT(), "client",
- getpid());
+ messenger = Messenger::create_client_messenger(g_ceph_context, "client");
messenger->set_default_policy(Messenger::Policy::lossy_client(0, 0));
messenger->set_policy(entity_name_t::TYPE_MDS,
Messenger::Policy::lossless_client(0, 0));
diff --git a/src/ceph_syn.cc b/src/ceph_syn.cc
index c3410aa..1d10fa2 100644
--- a/src/ceph_syn.cc
+++ b/src/ceph_syn.cc
@@ -65,9 +65,8 @@ int main(int argc, const char **argv, char *envp[])
cout << "ceph-syn: starting " << g_conf->num_client << " syn client(s)" << std::endl;
for (int i=0; i<g_conf->num_client; i++) {
- messengers[i] = Messenger::create(g_ceph_context,
- entity_name_t(entity_name_t::TYPE_CLIENT,-1), "synclient",
- i * 1000000 + getpid());
+ messengers[i] = Messenger::create_client_messenger(g_ceph_context,
+ "synclient");
messengers[i]->bind(g_conf->public_addr);
mclients[i] = new MonClient(g_ceph_context);
mclients[i]->build_initial_monmap();
diff --git a/src/common/Mutex.cc b/src/common/Mutex.cc
index f1e9a55..de66655 100644
--- a/src/common/Mutex.cc
+++ b/src/common/Mutex.cc
@@ -55,6 +55,7 @@ Mutex::Mutex(const char *n, bool r, bool ld,
pthread_mutexattr_init(&attr);
pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
pthread_mutex_init(&_m, &attr);
+ pthread_mutexattr_destroy(&attr);
if (g_lockdep)
_register();
}
diff --git a/src/common/RWLock.h b/src/common/RWLock.h
index 1a70ef1..308092c 100644
--- a/src/common/RWLock.h
+++ b/src/common/RWLock.h
@@ -19,6 +19,7 @@
#include <pthread.h>
#include "lockdep.h"
+#include "include/assert.h"
#include "include/atomic.h"
class RWLock
@@ -26,7 +27,7 @@ class RWLock
mutable pthread_rwlock_t L;
const char *name;
mutable int id;
- mutable atomic_t nrlock, nwlock;
+ mutable ceph::atomic_t nrlock, nwlock;
public:
RWLock(const RWLock& other);
diff --git a/src/common/Thread.cc b/src/common/Thread.cc
index a962e06..9f09a92 100644
--- a/src/common/Thread.cc
+++ b/src/common/Thread.cc
@@ -87,11 +87,11 @@ int Thread::kill(int signal)
int Thread::try_create(size_t stacksize)
{
pthread_attr_t *thread_attr = NULL;
+ pthread_attr_t thread_attr_loc;
+
stacksize &= CEPH_PAGE_MASK; // must be multiple of page
if (stacksize) {
- thread_attr = (pthread_attr_t*) malloc(sizeof(pthread_attr_t));
- if (!thread_attr)
- return -ENOMEM;
+ thread_attr = &thread_attr_loc;
pthread_attr_init(thread_attr);
pthread_attr_setstacksize(thread_attr, stacksize);
}
@@ -113,8 +113,10 @@ int Thread::try_create(size_t stacksize)
r = pthread_create(&thread_id, thread_attr, _entry_func, (void*)this);
restore_sigset(&old_sigset);
- if (thread_attr)
- free(thread_attr);
+ if (thread_attr) {
+ pthread_attr_destroy(thread_attr);
+ }
+
return r;
}
diff --git a/src/common/Throttle.cc b/src/common/Throttle.cc
index 026d731..5c68a1f 100644
--- a/src/common/Throttle.cc
+++ b/src/common/Throttle.cc
@@ -267,6 +267,12 @@ void SimpleThrottle::end_op(int r)
m_cond.Signal();
}
+bool SimpleThrottle::pending_error() const
+{
+ Mutex::Locker l(m_lock);
+ return (m_ret < 0);
+}
+
int SimpleThrottle::wait_for_ret()
{
Mutex::Locker l(m_lock);
diff --git a/src/common/Throttle.h b/src/common/Throttle.h
index 6d03988..b171e27 100644
--- a/src/common/Throttle.h
+++ b/src/common/Throttle.h
@@ -76,9 +76,10 @@ public:
~SimpleThrottle();
void start_op();
void end_op(int r);
+ bool pending_error() const;
int wait_for_ret();
private:
- Mutex m_lock;
+ mutable Mutex m_lock;
Cond m_cond;
uint64_t m_max;
uint64_t m_current;
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
index cbf49a8..a9eaffb 100644
--- a/src/common/WorkQueue.h
+++ b/src/common/WorkQueue.h
@@ -330,7 +330,7 @@ private:
public:
ThreadPool(CephContext *cct_, string nm, int n, const char *option = NULL);
- ~ThreadPool();
+ virtual ~ThreadPool();
/// return number of threads currently running
int get_num_threads() {
@@ -340,10 +340,12 @@ public:
/// assign a work queue to this thread pool
void add_work_queue(WorkQueue_* wq) {
+ Mutex::Locker l(_lock);
work_queues.push_back(wq);
}
/// remove a work queue from this thread pool
void remove_work_queue(WorkQueue_* wq) {
+ Mutex::Locker l(_lock);
unsigned i = 0;
while (work_queues[i] != wq)
i++;
@@ -433,4 +435,35 @@ public:
}
};
+class ContextWQ : public ThreadPool::WorkQueueVal<Context *> {
+public:
+ ContextWQ(const string &name, time_t ti, ThreadPool *tp)
+ : ThreadPool::WorkQueueVal<Context *>(name, ti, 0, tp) {}
+
+ void queue(Context *ctx) {
+ ThreadPool::WorkQueueVal<Context *>::queue(ctx);
+ }
+
+protected:
+ virtual void _enqueue(Context *item) {
+ _queue.push_back(item);
+ }
+ virtual void _enqueue_front(Context *item) {
+ _queue.push_front(item);
+ }
+ virtual bool _empty() {
+ return _queue.empty();
+ }
+ virtual Context *_dequeue() {
+ Context *item = _queue.front();
+ _queue.pop_front();
+ return item;
+ }
+ virtual void _process(Context *item) {
+ item->complete(0);
+ }
+private:
+ list<Context *> _queue;
+};
+
#endif
diff --git a/src/common/admin_socket.cc b/src/common/admin_socket.cc
index 4af2904..77048d4 100644
--- a/src/common/admin_socket.cc
+++ b/src/common/admin_socket.cc
@@ -149,6 +149,33 @@ std::string AdminSocket::create_shutdown_pipe(int *pipe_rd, int *pipe_wr)
return "";
}
+std::string AdminSocket::destroy_shutdown_pipe()
+{
+ // Send a byte to the shutdown pipe that the thread is listening to
+ char buf[1] = { 0x0 };
+ int ret = safe_write(m_shutdown_wr_fd, buf, sizeof(buf));
+
+ // Close write end
+ VOID_TEMP_FAILURE_RETRY(close(m_shutdown_wr_fd));
+ m_shutdown_wr_fd = -1;
+
+ if (ret != 0) {
+ ostringstream oss;
+ oss << "AdminSocket::destroy_shutdown_pipe error: failed to write"
+ "to thread shutdown pipe: error " << ret;
+ return oss.str();
+ }
+
+ join();
+
+ // Close read end. Doing this before join() blocks the listenter and prevents
+ // joining.
+ VOID_TEMP_FAILURE_RETRY(close(m_shutdown_rd_fd));
+ m_shutdown_rd_fd = -1;
+
+ return "";
+}
+
std::string AdminSocket::bind_and_listen(const std::string &sock_path, int *fd)
{
ldout(m_cct, 5) << "bind_and_listen " << sock_path << dendl;
@@ -544,30 +571,31 @@ bool AdminSocket::init(const std::string &path)
void AdminSocket::shutdown()
{
+ std::string err;
+
+ // Under normal operation this is unlikely to occur. However for some unit
+ // tests, some object members are not initialized and so cannot be deleted
+ // without fault.
if (m_shutdown_wr_fd < 0)
return;
ldout(m_cct, 5) << "shutdown" << dendl;
- // Send a byte to the shutdown pipe that the thread is listening to
- char buf[1] = { 0x0 };
- int ret = safe_write(m_shutdown_wr_fd, buf, sizeof(buf));
- VOID_TEMP_FAILURE_RETRY(close(m_shutdown_wr_fd));
- m_shutdown_wr_fd = -1;
-
- if (ret == 0) {
- join();
- } else {
- lderr(m_cct) << "AdminSocket::shutdown: failed to write "
- "to thread shutdown pipe: error " << ret << dendl;
+ err = destroy_shutdown_pipe();
+ if (!err.empty()) {
+ lderr(m_cct) << "AdminSocket::shutdown: error: " << err << dendl;
}
+ VOID_TEMP_FAILURE_RETRY(close(m_sock_fd));
+
unregister_command("version");
unregister_command("git_version");
unregister_command("0");
delete m_version_hook;
+
unregister_command("help");
delete m_help_hook;
+
unregister_command("get_command_descriptions");
delete m_getdescs_hook;
diff --git a/src/common/admin_socket.h b/src/common/admin_socket.h
index 3bc8483..5e855bc 100644
--- a/src/common/admin_socket.h
+++ b/src/common/admin_socket.h
@@ -79,6 +79,7 @@ private:
void shutdown();
std::string create_shutdown_pipe(int *pipe_rd, int *pipe_wr);
+ std::string destroy_shutdown_pipe();
std::string bind_and_listen(const std::string &sock_path, int *fd);
void *entry();
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index 35c5d36..fc00f0b 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -1060,12 +1060,23 @@ static uint32_t simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZE
it != _buffers.end();
++it) {
if (p + it->length() > o) {
- if (p >= o && p+it->length() <= o+l)
- it->zero(); // all
- else if (p >= o)
- it->zero(0, o+l-p); // head
- else
- it->zero(o-p, it->length()-(o-p)); // tail
+ if (p >= o && p+it->length() <= o+l) {
+ // 'o'------------- l -----------|
+ // 'p'-- it->length() --|
+ it->zero();
+ } else if (p >= o) {
+ // 'o'------------- l -----------|
+ // 'p'------- it->length() -------|
+ it->zero(0, o+l-p);
+ } else if (p + it->length() <= o+l) {
+ // 'o'------------- l -----------|
+ // 'p'------- it->length() -------|
+ it->zero(o-p, it->length()-(o-p));
+ } else {
+ // 'o'----------- l -----------|
+ // 'p'---------- it->length() ----------|
+ it->zero(o-p, l);
+ }
}
p += it->length();
if (o+l <= p)
diff --git a/src/common/ceph_context.cc b/src/common/ceph_context.cc
index 4ebf79e..77488b6 100644
--- a/src/common/ceph_context.cc
+++ b/src/common/ceph_context.cc
@@ -265,6 +265,7 @@ CephContext::CephContext(uint32_t module_type_)
_crypto_aes(NULL)
{
ceph_spin_init(&_service_thread_lock);
+ ceph_spin_init(&_associated_objs_lock);
_log = new ceph::log::Log(&_conf->subsys);
_log->start();
@@ -298,6 +299,10 @@ CephContext::~CephContext()
{
join_service_thread();
+ for (map<string, AssociatedSingletonObject*>::iterator it = _associated_objs.begin();
+ it != _associated_objs.end(); it++)
+ delete it->second;
+
if (_conf->lockdep) {
lockdep_unregister_ceph_context(this);
}
@@ -335,6 +340,7 @@ CephContext::~CephContext()
delete _conf;
ceph_spin_destroy(&_service_thread_lock);
+ ceph_spin_destroy(&_associated_objs_lock);
delete _crypto_none;
delete _crypto_aes;
diff --git a/src/common/ceph_context.h b/src/common/ceph_context.h
index ba60620..7241c85 100644
--- a/src/common/ceph_context.h
+++ b/src/common/ceph_context.h
@@ -17,6 +17,7 @@
#include <iostream>
#include <stdint.h>
+#include <string>
#include "include/buffer.h"
#include "include/atomic.h"
@@ -58,6 +59,10 @@ private:
~CephContext();
atomic_t nref;
public:
+ class AssociatedSingletonObject {
+ public:
+ virtual ~AssociatedSingletonObject() {}
+ };
CephContext *get() {
nref.inc();
return this;
@@ -102,6 +107,17 @@ public:
void do_command(std::string command, cmdmap_t& cmdmap, std::string format,
bufferlist *out);
+ template<typename T>
+ void lookup_or_create_singleton_object(T*& p, const std::string &name) {
+ ceph_spin_lock(&_associated_objs_lock);
+ if (!_associated_objs.count(name)) {
+ p = new T(this);
+ _associated_objs[name] = reinterpret_cast<AssociatedSingletonObject*>(p);
+ } else {
+ p = reinterpret_cast<T*>(_associated_objs[name]);
+ }
+ ceph_spin_unlock(&_associated_objs_lock);
+ }
/**
* get a crypto handler
*/
@@ -138,6 +154,9 @@ private:
ceph::HeartbeatMap *_heartbeat_map;
+ ceph_spinlock_t _associated_objs_lock;
+ std::map<std::string, AssociatedSingletonObject*> _associated_objs;
+
// crypto
CryptoNone *_crypto_none;
CryptoAES *_crypto_aes;
diff --git a/src/common/config.cc b/src/common/config.cc
index fc47083..7650b88 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -148,7 +148,7 @@ md_config_t::md_config_t()
#undef OPTION
#undef SUBSYS
#undef DEFAULT_SUBSYS
- lock("md_config_t", true)
+ lock("md_config_t", true, false)
{
init_subsys();
}
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index d8ecdc7..1033a8e 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -137,6 +137,8 @@ OPTION(mon_sync_fs_threshold, OPT_INT, 5) // sync() when writing this many obj
OPTION(mon_compact_on_start, OPT_BOOL, false) // compact leveldb on ceph-mon start
OPTION(mon_compact_on_bootstrap, OPT_BOOL, false) // trigger leveldb compaction on bootstrap
OPTION(mon_compact_on_trim, OPT_BOOL, true) // compact (a prefix) when we trim old states
+OPTION(mon_osd_cache_size, OPT_INT, 10) // the size of osdmaps cache, not to rely on underlying store's cache
+
OPTION(mon_tick_interval, OPT_INT, 5)
OPTION(mon_subscribe_interval, OPT_DOUBLE, 300)
OPTION(mon_delta_reset_interval, OPT_DOUBLE, 10) // seconds of inactivity before we reset the pg delta to 0
@@ -172,7 +174,8 @@ OPTION(mon_pg_warn_min_pool_objects, OPT_INT, 1000) // do not warn on pools bel
OPTION(mon_cache_target_full_warn_ratio, OPT_FLOAT, .66) // position between pool cache_target_full and max where we start warning
OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full"
OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full
-OPTION(mon_globalid_prealloc, OPT_INT, 100) // how many globalids to prealloc
+OPTION(mon_allow_pool_delete, OPT_BOOL, true) // allow pool deletion
+OPTION(mon_globalid_prealloc, OPT_INT, 10000) // how many globalids to prealloc
OPTION(mon_osd_report_timeout, OPT_INT, 900) // grace period before declaring unresponsive OSDs dead
OPTION(mon_force_standby_active, OPT_BOOL, true) // should mons force standby-replay mds to be active
OPTION(mon_warn_on_old_mons, OPT_BOOL, true) // should mons set health to WARN if part of quorum is old?
@@ -452,6 +455,7 @@ OPTION(osd_tier_default_cache_mode, OPT_STR, "writeback")
OPTION(osd_tier_default_cache_hit_set_count, OPT_INT, 4)
OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200)
OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom")
+OPTION(osd_tier_default_cache_min_read_recency_for_promote, OPT_INT, 1) // number of recent HitSets the object must appear in to be promoted (on read)
OPTION(osd_map_dedup, OPT_BOOL, true)
OPTION(osd_map_max_advance, OPT_INT, 200) // make this < cache_size!
@@ -475,14 +479,21 @@ OPTION(osd_recover_clone_overlap_limit, OPT_INT, 10)
OPTION(osd_backfill_scan_min, OPT_INT, 64)
OPTION(osd_backfill_scan_max, OPT_INT, 512)
OPTION(osd_op_thread_timeout, OPT_INT, 15)
+OPTION(osd_op_thread_suicide_timeout, OPT_INT, 150)
OPTION(osd_recovery_thread_timeout, OPT_INT, 30)
+OPTION(osd_recovery_thread_suicide_timeout, OPT_INT, 300)
OPTION(osd_snap_trim_thread_timeout, OPT_INT, 60*60*1)
+OPTION(osd_snap_trim_thread_suicide_timeout, OPT_INT, 60*60*10)
OPTION(osd_snap_trim_sleep, OPT_FLOAT, 0)
OPTION(osd_scrub_thread_timeout, OPT_INT, 60)
+OPTION(osd_scrub_thread_suicide_timeout, OPT_INT, 60)
OPTION(osd_scrub_finalize_thread_timeout, OPT_INT, 60*10)
+OPTION(osd_scrub_finalize_thread_suicide_timeout, OPT_INT, 60*10*10)
OPTION(osd_scrub_invalid_stats, OPT_BOOL, true)
OPTION(osd_remove_thread_timeout, OPT_INT, 60*60)
+OPTION(osd_remove_thread_suicide_timeout, OPT_INT, 10*60*60)
OPTION(osd_command_thread_timeout, OPT_INT, 10*60)
+OPTION(osd_command_thread_suicide_timeout, OPT_INT, 15*60)
OPTION(osd_age, OPT_FLOAT, .8)
OPTION(osd_age_time, OPT_INT, 0)
OPTION(osd_heartbeat_addr, OPT_ADDR, entity_addr_t())
@@ -727,6 +738,9 @@ OPTION(journal_ignore_corruption, OPT_BOOL, false) // assume journal is not corr
OPTION(rados_mon_op_timeout, OPT_DOUBLE, 0) // how many seconds to wait for a response from the monitor before returning an error from a rados operation. 0 means no limit.
OPTION(rados_osd_op_timeout, OPT_DOUBLE, 0) // how many seconds to wait for a response from osds before returning an error from a rados operation. 0 means no limit.
+OPTION(rbd_op_threads, OPT_INT, 1)
+OPTION(rbd_op_thread_timeout, OPT_INT, 60)
+OPTION(rbd_non_blocking_aio, OPT_BOOL, true) // process AIO ops from a worker thread to prevent blocking
OPTION(rbd_cache, OPT_BOOL, false) // whether to enable caching (writeback unless rbd_cache_max_dirty is 0)
OPTION(rbd_cache_writethrough_until_flush, OPT_BOOL, false) // whether to make writeback caching writethrough until flush is called, to be sure the user of librbd will send flushes so that writeback is safe
OPTION(rbd_cache_size, OPT_LONGLONG, 32<<20) // cache size in bytes
@@ -784,6 +798,7 @@ OPTION(rgw_swift_url_prefix, OPT_STR, "swift") // entry point for which a url is
OPTION(rgw_swift_auth_url, OPT_STR, "") // default URL to go and verify tokens for v1 auth (if not using internal swift auth)
OPTION(rgw_swift_auth_entry, OPT_STR, "auth") // entry point for which a url is considered a swift auth url
OPTION(rgw_swift_tenant_name, OPT_STR, "") // tenant name to use for swift access
+OPTION(rgw_swift_enforce_content_length, OPT_BOOL, false) // enforce generation of Content-Length even at the cost of performance or scalability
OPTION(rgw_keystone_url, OPT_STR, "") // url for keystone server
OPTION(rgw_keystone_admin_token, OPT_STR, "") // keystone admin token (shared secret)
OPTION(rgw_keystone_admin_user, OPT_STR, "") // keystone admin user name
diff --git a/src/common/sync_filesystem.h b/src/common/sync_filesystem.h
index eff18d2..7b392a2 100644
--- a/src/common/sync_filesystem.h
+++ b/src/common/sync_filesystem.h
@@ -42,13 +42,17 @@ inline int sync_filesystem(int fd)
return 0;
#endif
-#ifdef BTRFS_IOC_SYNC
- if (::ioctl(fd, BTRFS_IOC_SYNC) == 0)
+#if defined(HAVE_SYS_SYNCFS) || defined(SYS_syncfs) || defined(__NR_syncfs)
+ else if (errno == ENOSYS) {
+ sync();
return 0;
-#endif
-
+ } else {
+ return -errno;
+ }
+#else
sync();
return 0;
+#endif
}
#endif
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index b17829b..abf79af 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -113,6 +113,9 @@ bool CrushWrapper::_maybe_remove_last_instance(CephContext *cct, int item, bool
if (_search_item_exists(item)) {
return false;
}
+ if (item < 0 && _bucket_is_in_use(cct, item)) {
+ return false;
+ }
if (item < 0 && !unlink_only) {
crush_bucket *t = get_bucket(item);
@@ -140,6 +143,9 @@ int CrushWrapper::remove_item(CephContext *cct, int item, bool unlink_only)
<< " items, not empty" << dendl;
return -ENOTEMPTY;
}
+ if (_bucket_is_in_use(cct, item)) {
+ return -EBUSY;
+ }
}
for (int i = 0; i < crush->max_buckets; i++) {
@@ -179,6 +185,22 @@ bool CrushWrapper::_search_item_exists(int item) const
return false;
}
+bool CrushWrapper::_bucket_is_in_use(CephContext *cct, int item)
+{
+ for (unsigned i = 0; i < crush->max_rules; ++i) {
+ crush_rule *r = crush->rules[i];
+ if (!r)
+ continue;
+ for (unsigned j = 0; j < r->len; ++j) {
+ if (r->steps[j].op == CRUSH_RULE_TAKE &&
+ r->steps[j].arg1 == item) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
int CrushWrapper::_remove_item_under(CephContext *cct, int item, int ancestor, bool unlink_only)
{
ldout(cct, 5) << "_remove_item_under " << item << " under " << ancestor
@@ -214,6 +236,11 @@ int CrushWrapper::remove_item_under(CephContext *cct, int item, int ancestor, bo
{
ldout(cct, 5) << "remove_item_under " << item << " under " << ancestor
<< (unlink_only ? " unlink_only":"") << dendl;
+
+ if (!unlink_only && _bucket_is_in_use(cct, item)) {
+ return -EBUSY;
+ }
+
int ret = _remove_item_under(cct, item, ancestor, unlink_only);
if (ret < 0)
return ret;
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
index 9fac2fe..8e28597 100644
--- a/src/crush/CrushWrapper.h
+++ b/src/crush/CrushWrapper.h
@@ -518,6 +518,7 @@ public:
private:
bool _maybe_remove_last_instance(CephContext *cct, int id, bool unlink_only);
int _remove_item_under(CephContext *cct, int id, int ancestor, bool unlink_only);
+ bool _bucket_is_in_use(CephContext *cct, int id);
public:
int remove_item_under(CephContext *cct, int id, int ancestor, bool unlink_only);
diff --git a/src/include/interval_set.h b/src/include/interval_set.h
index 7affb87..9a7d2f2 100644
--- a/src/include/interval_set.h
+++ b/src/include/interval_set.h
@@ -371,6 +371,7 @@ class interval_set {
void erase(iterator &i) {
_size -= i.get_len();
+ assert(_size >= 0);
m.erase(i._iter);
}
diff --git a/src/init-radosgw b/src/init-radosgw
index 5aa658c..7af51ef 100644
--- a/src/init-radosgw
+++ b/src/init-radosgw
@@ -66,8 +66,10 @@ case "$1" in
fi
log_file=`$RADOSGW -n $name --show-config-value log_file`
- if [ -n "$log_file" ] && [ ! -e "$log_file" ]; then
- touch "$log_file"
+ if [ -n "$log_file" ]; then
+ if [ ! -e "$log_file" ]; then
+ touch "$log_file"
+ fi
chown $user $log_file
fi
@@ -85,7 +87,15 @@ case "$1" in
$0 start
;;
stop)
- start-stop-daemon --stop -x $RADOSGW --oknodo
+ timeout=0
+ for name in `ceph-conf --list-sections $PREFIX`;
+ do
+ t=`$RADOSGW -n $name --show-config-value rgw_exit_timeout_secs`
+ if [ $t -gt $timeout ]; then timeout=$t; fi
+ done
+
+ if [ $timeout -gt 0 ]; then TIMEOUT="-R $timeout"; fi
+ start-stop-daemon --stop -x $RADOSGW --oknodo $TIMEOUT
;;
status)
daemon_is_running $RADOSGW
diff --git a/src/init-radosgw.sysv b/src/init-radosgw.sysv
index fe56939..1bb8a89 100644
--- a/src/init-radosgw.sysv
+++ b/src/init-radosgw.sysv
@@ -73,8 +73,10 @@ case "$1" in
fi
log_file=`$RADOSGW -n $name --show-config-value log_file`
- if [ -n "$log_file" ] && [ ! -e "$log_file" ]; then
- touch "$log_file"
+ if [ -n "$log_file" ]; then
+ if [ ! -e "$log_file" ]; then
+ touch "$log_file"
+ fi
chown $user $log_file
fi
@@ -99,8 +101,19 @@ case "$1" in
;;
stop)
#start-stop-daemon --stop -x $RADOSGW --oknodo
+ timeout=0
+ for name in `ceph-conf --list-sections $PREFIX`;
+ do
+ t=`$RADOSGW -n $name --show-config-value rgw_exit_timeout_secs`
+ if [ $t -gt $timeout ]; then timeout=$t; fi
+ done
+
killproc $RADOSGW
echo "Stopping radosgw instance(s)..."
+ while pidof $RADOSGW >/dev/null && [ $timeout -gt 0 ] ; do
+ sleep 1
+ timeout=$(($timeout - 1))
+ done
;;
status)
daemon_is_running $RADOSGW
diff --git a/src/json_spirit/json_spirit_reader_template.h b/src/json_spirit/json_spirit_reader_template.h
index f87b593..c50f885 100644
--- a/src/json_spirit/json_spirit_reader_template.h
+++ b/src/json_spirit/json_spirit_reader_template.h
@@ -13,6 +13,8 @@
#include "json_spirit_value.h"
#include "json_spirit_error_position.h"
+#include "common/utf8.h"
+
#define BOOST_SPIRIT_THREADSAFE // uncomment for multithreaded use, requires linking to boost.thread
#include <boost/bind.hpp>
@@ -71,18 +73,30 @@ namespace json_spirit
return ( hex_to_num( c1 ) << 4 ) + hex_to_num( c2 );
}
- template< class Char_type, class Iter_type >
- Char_type unicode_str_to_char( Iter_type& begin )
+ template< class String_type, class Iter_type >
+ String_type unicode_str_to_utf8( Iter_type& begin );
+
+ template<>
+ std::string unicode_str_to_utf8( std::string::const_iterator & begin )
{
+ typedef std::string::value_type Char_type;
+
const Char_type c1( *( ++begin ) );
const Char_type c2( *( ++begin ) );
const Char_type c3( *( ++begin ) );
const Char_type c4( *( ++begin ) );
- return ( hex_to_num( c1 ) << 12 ) +
- ( hex_to_num( c2 ) << 8 ) +
- ( hex_to_num( c3 ) << 4 ) +
- hex_to_num( c4 );
+ unsigned long uc = ( hex_to_num( c1 ) << 12 ) +
+ ( hex_to_num( c2 ) << 8 ) +
+ ( hex_to_num( c3 ) << 4 ) +
+ hex_to_num( c4 );
+
+ unsigned char buf[7]; // MAX_UTF8_SZ is 6 (see src/common/utf8.c)
+ int r = encode_utf8(uc, buf);
+ if (r >= 0) {
+ return std::string(reinterpret_cast<char *>(buf), r);
+ }
+ return std::string("_");
}
template< class String_type >
@@ -116,7 +130,7 @@ namespace json_spirit
{
if( end - begin >= 5 ) // expecting "uHHHH..."
{
- s += unicode_str_to_char< Char_type >( begin );
+ s += unicode_str_to_utf8< String_type >( begin );
}
break;
}
@@ -178,11 +192,15 @@ namespace json_spirit
return get_str_< std::string >( begin, end );
}
+// Need this guard else it tries to instantiate unicode_str_to_utf8 with a
+// std::wstring, which isn't presently implemented
+#if defined( JSON_SPIRIT_WMVALUE_ENABLED ) && !defined( BOOST_NO_STD_WSTRING )
inline std::wstring get_str( std::wstring::const_iterator begin, std::wstring::const_iterator end )
{
return get_str_< std::wstring >( begin, end );
}
-
+#endif
+
template< class String_type, class Iter_type >
String_type get_str( Iter_type begin, Iter_type end )
{
diff --git a/src/libcephfs.cc b/src/libcephfs.cc
index 88e86ba..a8a32f4 100644
--- a/src/libcephfs.cc
+++ b/src/libcephfs.cc
@@ -34,9 +34,8 @@
struct ceph_mount_info
{
public:
- ceph_mount_info(uint64_t msgr_nonce_, CephContext *cct_)
- : msgr_nonce(msgr_nonce_),
- mounted(false),
+ ceph_mount_info(CephContext *cct_)
+ : mounted(false),
inited(false),
client(NULL),
monclient(NULL),
@@ -80,7 +79,7 @@ public:
goto fail;
//network connection
- messenger = Messenger::create(cct, entity_name_t::CLIENT(), "client", msgr_nonce);
+ messenger = Messenger::create_client_messenger(cct, "client");
//at last the client
ret = -1002;
@@ -215,7 +214,6 @@ public:
}
private:
- uint64_t msgr_nonce;
bool mounted;
bool inited;
Client *client;
@@ -242,14 +240,7 @@ extern "C" const char *ceph_version(int *pmajor, int *pminor, int *ppatch)
extern "C" int ceph_create_with_context(struct ceph_mount_info **cmount, CephContext *cct)
{
- uint64_t nonce = 0;
-
- // 6 bytes of random and 2 bytes of pid
- get_random_bytes((char*)&nonce, sizeof(nonce));
- nonce &= ~0xffff;
- nonce |= (uint64_t)getpid();
-
- *cmount = new struct ceph_mount_info(nonce, cct);
+ *cmount = new struct ceph_mount_info(cct);
return 0;
}
diff --git a/src/librados/IoCtxImpl.cc b/src/librados/IoCtxImpl.cc
index 6fc22ad..887b390 100644
--- a/src/librados/IoCtxImpl.cc
+++ b/src/librados/IoCtxImpl.cc
@@ -815,7 +815,7 @@ int librados::IoCtxImpl::hit_set_list(uint32_t hash, AioCompletionImpl *c,
::ObjectOperation rd;
rd.hit_set_ls(pls, NULL);
object_locator_t oloc(poolid);
- objecter->pg_read(hash, oloc, rd, NULL, 0, onack, NULL);
+ objecter->pg_read(hash, oloc, rd, NULL, 0, onack, NULL, NULL);
return 0;
}
@@ -831,7 +831,7 @@ int librados::IoCtxImpl::hit_set_get(uint32_t hash, AioCompletionImpl *c,
::ObjectOperation rd;
rd.hit_set_get(utime_t(stamp, 0), pbl, 0);
object_locator_t oloc(poolid);
- objecter->pg_read(hash, oloc, rd, NULL, 0, onack, NULL);
+ objecter->pg_read(hash, oloc, rd, NULL, 0, onack, NULL, NULL);
return 0;
}
diff --git a/src/librados/RadosClient.cc b/src/librados/RadosClient.cc
index a5e77a5..572aa25 100644
--- a/src/librados/RadosClient.cc
+++ b/src/librados/RadosClient.cc
@@ -30,7 +30,7 @@
#include "messages/MWatchNotify.h"
#include "messages/MLog.h"
-#include "msg/SimpleMessenger.h"
+#include "msg/Messenger.h"
// needed for static_cast
#include "messages/PaxosServiceMessage.h"
@@ -52,8 +52,6 @@
#undef dout_prefix
#define dout_prefix *_dout << "librados: "
-static atomic_t rados_instance;
-
bool librados::RadosClient::ms_get_authorizer(int dest_type,
AuthAuthorizer **authorizer,
bool force_new) {
@@ -206,7 +204,6 @@ int librados::RadosClient::connect()
common_init_finish(cct);
int err;
- uint64_t nonce;
// already connected?
if (state == CONNECTING)
@@ -221,8 +218,7 @@ int librados::RadosClient::connect()
goto out;
err = -ENOMEM;
- nonce = getpid() + (1000000 * (uint64_t)rados_instance.inc());
- messenger = new SimpleMessenger(cct, entity_name_t::CLIENT(-1), "radosclient", nonce);
+ messenger = Messenger::create_client_messenger(cct, "radosclient");
if (!messenger)
goto out;
diff --git a/src/librados/RadosClient.h b/src/librados/RadosClient.h
index e608ced..3e12b55 100644
--- a/src/librados/RadosClient.h
+++ b/src/librados/RadosClient.h
@@ -33,7 +33,7 @@ struct md_config_t;
class Message;
class MWatchNotify;
class MLog;
-class SimpleMessenger;
+class Messenger;
class librados::RadosClient : public Dispatcher
{
@@ -49,7 +49,7 @@ private:
OSDMap osdmap;
MonClient monclient;
- SimpleMessenger *messenger;
+ Messenger *messenger;
uint64_t instance_id;
diff --git a/src/librbd/AioCompletion.cc b/src/librbd/AioCompletion.cc
index 86b5b50..e818674 100644
--- a/src/librbd/AioCompletion.cc
+++ b/src/librbd/AioCompletion.cc
@@ -5,6 +5,7 @@
#include "common/ceph_context.h"
#include "common/dout.h"
+#include "common/errno.h"
#include "librbd/AioRequest.h"
#include "librbd/internal.h"
@@ -25,7 +26,7 @@ namespace librbd {
building = false;
if (!pending_count) {
finalize(cct, rval);
- complete();
+ complete(cct);
}
lock.Unlock();
}
@@ -54,6 +55,49 @@ namespace librbd {
}
}
+ void AioCompletion::complete(CephContext *cct) {
+ utime_t elapsed;
+ assert(lock.is_locked());
+ elapsed = ceph_clock_now(cct) - start_time;
+ switch (aio_type) {
+ case AIO_TYPE_READ:
+ ictx->perfcounter->tinc(l_librbd_aio_rd_latency, elapsed); break;
+ case AIO_TYPE_WRITE:
+ ictx->perfcounter->tinc(l_librbd_aio_wr_latency, elapsed); break;
+ case AIO_TYPE_DISCARD:
+ ictx->perfcounter->tinc(l_librbd_aio_discard_latency, elapsed); break;
+ case AIO_TYPE_FLUSH:
+ ictx->perfcounter->tinc(l_librbd_aio_flush_latency, elapsed); break;
+ default:
+ lderr(cct) << "completed invalid aio_type: " << aio_type << dendl;
+ break;
+ }
+
+ if (ictx != NULL) {
+ Mutex::Locker l(ictx->aio_lock);
+ assert(ictx->pending_aio != 0);
+ --ictx->pending_aio;
+ ictx->pending_aio_cond.Signal();
+ }
+
+ if (complete_cb) {
+ complete_cb(rbd_comp, complete_arg);
+ }
+ done = true;
+ cond.Signal();
+ }
+
+ void AioCompletion::fail(CephContext *cct, int r)
+ {
+ lderr(cct) << "AioCompletion::fail() " << this << ": " << cpp_strerror(r)
+ << dendl;
+ lock.Lock();
+ assert(pending_count == 0);
+ rval = r;
+ complete(cct);
+ put_unlock();
+ }
+
void AioCompletion::complete_request(CephContext *cct, ssize_t r)
{
ldout(cct, 20) << "AioCompletion::complete_request() "
@@ -70,7 +114,7 @@ namespace librbd {
int count = --pending_count;
if (!count && !building) {
finalize(cct, rval);
- complete();
+ complete(cct);
}
put_unlock();
}
diff --git a/src/librbd/AioCompletion.h b/src/librbd/AioCompletion.h
index e28cd6a..4dbad52 100644
--- a/src/librbd/AioCompletion.h
+++ b/src/librbd/AioCompletion.h
@@ -101,37 +101,8 @@ namespace librbd {
start_time = ceph_clock_now(ictx->cct);
}
- void complete() {
- utime_t elapsed;
- assert(lock.is_locked());
- elapsed = ceph_clock_now(ictx->cct) - start_time;
- switch (aio_type) {
- case AIO_TYPE_READ:
- ictx->perfcounter->tinc(l_librbd_aio_rd_latency, elapsed); break;
- case AIO_TYPE_WRITE:
- ictx->perfcounter->tinc(l_librbd_aio_wr_latency, elapsed); break;
- case AIO_TYPE_DISCARD:
- ictx->perfcounter->tinc(l_librbd_aio_discard_latency, elapsed); break;
- case AIO_TYPE_FLUSH:
- ictx->perfcounter->tinc(l_librbd_aio_flush_latency, elapsed); break;
- default:
- lderr(ictx->cct) << "completed invalid aio_type: " << aio_type << dendl;
- break;
- }
-
- {
- Mutex::Locker l(ictx->aio_lock);
- assert(ictx->pending_aio != 0);
- --ictx->pending_aio;
- ictx->pending_aio_cond.Signal();
- }
-
- if (complete_cb) {
- complete_cb(rbd_comp, complete_arg);
- }
- done = true;
- cond.Signal();
- }
+ void complete(CephContext *cct);
+ void fail(CephContext *cct, int r);
void set_complete_cb(void *cb_arg, callback_t cb) {
complete_cb = cb;
diff --git a/src/librbd/AioRequest.cc b/src/librbd/AioRequest.cc
index 5cf9a11..dee6eba 100644
--- a/src/librbd/AioRequest.cc
+++ b/src/librbd/AioRequest.cc
@@ -85,8 +85,9 @@ namespace librbd {
return true;
}
- int AioRead::send() {
- ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len << dendl;
+ void AioRead::send() {
+ ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " "
+ << m_object_off << "~" << m_object_len << dendl;
librados::AioCompletion *rados_completion =
librados::Rados::aio_create_completion(this, rados_req_cb, NULL);
@@ -99,10 +100,11 @@ namespace librbd {
} else {
op.read(m_object_off, m_object_len, &m_read_data, NULL);
}
+
r = m_ioctx->aio_operate(m_oid, rados_completion, &op, flags, NULL);
+ assert(r == 0);
rados_completion->release();
- return r;
}
/** write **/
@@ -224,16 +226,17 @@ namespace librbd {
return finished;
}
- int AbstractWrite::send() {
- ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len << dendl;
+ void AbstractWrite::send() {
+ ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " "
+ << m_object_off << "~" << m_object_len << dendl;
librados::AioCompletion *rados_completion =
librados::Rados::aio_create_completion(this, NULL, rados_req_cb);
int r;
assert(m_write.size());
r = m_ioctx->aio_operate(m_oid, rados_completion, &m_write,
m_snap_seq, m_snaps);
+ assert(r == 0);
rados_completion->release();
- return r;
}
void AbstractWrite::send_copyup() {
diff --git a/src/librbd/AioRequest.h b/src/librbd/AioRequest.h
index d6103f9..882b535 100644
--- a/src/librbd/AioRequest.h
+++ b/src/librbd/AioRequest.h
@@ -43,7 +43,7 @@ namespace librbd {
}
virtual bool should_complete(int r) = 0;
- virtual int send() = 0;
+ virtual void send() = 0;
protected:
void read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents);
@@ -73,7 +73,7 @@ namespace librbd {
}
virtual ~AioRead() {}
virtual bool should_complete(int r);
- virtual int send();
+ virtual void send();
ceph::bufferlist &data() {
return m_read_data;
@@ -100,7 +100,7 @@ namespace librbd {
bool hide_enoent);
virtual ~AbstractWrite() {}
virtual bool should_complete(int r);
- virtual int send();
+ virtual void send();
void guard_write();
bool has_parent() const {
diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc
index 1295d42..ce403a7 100644
--- a/src/librbd/ImageCtx.cc
+++ b/src/librbd/ImageCtx.cc
@@ -6,6 +6,7 @@
#include "common/dout.h"
#include "common/errno.h"
#include "common/perf_counters.h"
+#include "common/WorkQueue.h"
#include "librbd/internal.h"
#include "librbd/WatchCtx.h"
@@ -27,6 +28,23 @@ using librados::snap_t;
using librados::IoCtx;
namespace librbd {
+
+namespace {
+
+class ThreadPoolSingleton : public ThreadPool {
+public:
+ ThreadPoolSingleton(CephContext *cct)
+ : ThreadPool(cct, "librbd::thread_pool", cct->_conf->rbd_op_threads,
+ "rbd_op_threads") {
+ start();
+ }
+ virtual ~ThreadPoolSingleton() {
+ stop();
+ }
+};
+
+} // anonymous namespace
+
ImageCtx::ImageCtx(const string &image_name, const string &image_id,
const char *snap, IoCtx& p, bool ro)
: cct((CephContext*)p.cct()),
@@ -53,7 +71,7 @@ namespace librbd {
id(image_id), parent(NULL),
stripe_unit(0), stripe_count(0),
object_cacher(NULL), writeback_handler(NULL), object_set(NULL),
- pending_aio(0)
+ pending_aio(0), aio_work_queue(NULL)
{
md_ctx.dup(p);
data_ctx.dup(p);
@@ -98,6 +116,13 @@ namespace librbd {
object_set->return_enoent = true;
object_cacher->start();
}
+
+ ThreadPoolSingleton *thread_pool_singleton;
+ cct->lookup_or_create_singleton_object<ThreadPoolSingleton>(
+ thread_pool_singleton, "librbd::thread_pool");
+ aio_work_queue = new ContextWQ("librbd::aio_work_queue",
+ cct->_conf->rbd_op_thread_timeout,
+ thread_pool_singleton);
}
ImageCtx::~ImageCtx() {
@@ -115,6 +140,8 @@ namespace librbd {
object_set = NULL;
}
delete[] format_string;
+
+ delete aio_work_queue;
}
int ImageCtx::init() {
@@ -189,10 +216,9 @@ namespace librbd {
if (object_cacher) {
uint64_t obj = cct->_conf->rbd_cache_max_dirty_object;
if (!obj) {
- obj = cct->_conf->rbd_cache_size / (1ull << order);
- obj = obj * 4 + 10;
+ obj = MIN(2000, MAX(10, cct->_conf->rbd_cache_size / 100 / sizeof(ObjectCacher::Object)));
}
- ldout(cct, 10) << " cache bytes " << cct->_conf->rbd_cache_size << " order " << (int)order
+ ldout(cct, 10) << " cache bytes " << cct->_conf->rbd_cache_size
<< " -> about " << obj << " objects" << dendl;
object_cacher->set_max_objects(obj);
}
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
index 5a0d637..406192b 100644
--- a/src/librbd/ImageCtx.h
+++ b/src/librbd/ImageCtx.h
@@ -26,6 +26,7 @@
#include "librbd/parent_types.h"
class CephContext;
+class ContextWQ;
class PerfCounters;
namespace librbd {
@@ -95,6 +96,8 @@ namespace librbd {
Cond pending_aio_cond;
uint64_t pending_aio;
+ ContextWQ *aio_work_queue;
+
/**
* Either image_name or image_id must be set.
* If id is not known, pass the empty std::string,
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index 1456012..9ed7bd9 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -9,6 +9,7 @@
#include "common/dout.h"
#include "common/errno.h"
#include "common/Throttle.h"
+#include "common/WorkQueue.h"
#include "cls/lock/cls_lock_client.h"
#include "include/stringify.h"
@@ -1420,7 +1421,6 @@ reprotect_and_return_err:
close_image(ictx);
return -EBUSY;
}
- assert(watchers.size() == 1);
ictx->md_lock.get_read();
trim_image(ictx, 0, prog_ctx);
@@ -2013,13 +2013,7 @@ reprotect_and_return_err:
Context *ctx = new C_CopyWrite(m_throttle, m_bl);
AioCompletion *comp = aio_create_completion_internal(ctx, rbd_ctx_cb);
- r = aio_write(m_dest, m_offset, m_bl->length(), m_bl->c_str(), comp);
- if (r < 0) {
- ctx->complete(r);
- comp->release();
- lderr(m_dest->cct) << "error writing to destination image at offset "
- << m_offset << ": " << cpp_strerror(r) << dendl;
- }
+ aio_write(m_dest, m_offset, m_bl->length(), m_bl->c_str(), comp);
}
private:
SimpleThrottle *m_throttle;
@@ -2052,20 +2046,15 @@ reprotect_and_return_err:
SimpleThrottle throttle(cct->_conf->rbd_concurrent_management_ops, false);
uint64_t period = src->get_stripe_period();
for (uint64_t offset = 0; offset < src_size; offset += period) {
+ if (throttle.pending_error()) {
+ return throttle.wait_for_ret();
+ }
+
uint64_t len = min(period, src_size - offset);
bufferlist *bl = new bufferlist();
Context *ctx = new C_CopyRead(&throttle, dest, offset, bl);
AioCompletion *comp = aio_create_completion_internal(ctx, rbd_ctx_cb);
- r = aio_read(src, offset, len, NULL, bl, comp);
- if (r < 0) {
- ctx->complete(r);
- comp->release();
- throttle.wait_for_ret();
- lderr(cct) << "could not read from source image from "
- << offset << " to " << offset + len << ": "
- << cpp_strerror(r) << dendl;
- return r;
- }
+ aio_read(src, offset, len, NULL, bl, comp);
prog_ctx.update_progress(offset, src_size);
}
@@ -2151,6 +2140,9 @@ reprotect_and_return_err:
void close_image(ImageCtx *ictx)
{
ldout(ictx->cct, 20) << "close_image " << ictx << dendl;
+
+ ictx->aio_work_queue->drain();
+
if (ictx->object_cacher) {
ictx->shutdown_cache(); // implicitly flushes
} else {
@@ -2221,6 +2213,10 @@ reprotect_and_return_err:
SimpleThrottle throttle(cct->_conf->rbd_concurrent_management_ops, false);
for (uint64_t ono = 0; ono < overlap_objects; ono++) {
+ if (throttle.pending_error()) {
+ return throttle.wait_for_ret();
+ }
+
{
RWLock::RLocker l(ictx->parent_lock);
// stop early if the parent went away - it just means
@@ -2244,12 +2240,7 @@ reprotect_and_return_err:
Context *comp = new C_SimpleThrottle(&throttle);
AioWrite *req = new AioWrite(ictx, oid, ono, 0, objectx, object_overlap,
bl, snapc, CEPH_NOSNAP, comp);
- r = req->send();
- if (r < 0) {
- lderr(cct) << "failed to flatten object " << oid << dendl;
- goto err;
- }
-
+ req->send();
prog_ctx.update_progress(ono, overlap_objects);
}
@@ -2439,12 +2430,7 @@ reprotect_and_return_err:
Context *ctx = new C_SafeCond(&mylock, &cond, &done, &ret);
AioCompletion *c = aio_create_completion_internal(ctx, rbd_ctx_cb);
- r = aio_read(ictx, off, read_len, NULL, &bl, c);
- if (r < 0) {
- c->release();
- delete ctx;
- return r;
- }
+ aio_read(ictx, off, read_len, NULL, &bl, c);
mylock.Lock();
while (!done)
@@ -2674,12 +2660,7 @@ reprotect_and_return_err:
Context *ctx = new C_SafeCond(&mylock, &cond, &done, &ret);
AioCompletion *c = aio_create_completion_internal(ctx, rbd_ctx_cb);
- int r = aio_read(ictx, image_extents, buf, pbl, c);
- if (r < 0) {
- c->release();
- delete ctx;
- return r;
- }
+ aio_read(ictx, image_extents, buf, pbl, c);
mylock.Lock();
while (!done)
@@ -2708,12 +2689,7 @@ reprotect_and_return_err:
Context *ctx = new C_SafeCond(&mylock, &cond, &done, &ret);
AioCompletion *c = aio_create_completion_internal(ctx, rbd_ctx_cb);
- r = aio_write(ictx, off, mylen, buf, c);
- if (r < 0) {
- c->release();
- delete ctx;
- return r;
- }
+ aio_write(ictx, off, mylen, buf, c);
mylock.Lock();
while (!done)
@@ -2744,12 +2720,7 @@ reprotect_and_return_err:
Context *ctx = new C_SafeCond(&mylock, &cond, &done, &ret);
AioCompletion *c = aio_create_completion_internal(ctx, rbd_ctx_cb);
- int r = aio_discard(ictx, off, len, c);
- if (r < 0) {
- c->release();
- delete ctx;
- return r;
- }
+ aio_discard(ictx, off, len, c);
mylock.Lock();
while (!done)
@@ -2867,18 +2838,20 @@ reprotect_and_return_err:
return 0;
}
- int aio_flush(ImageCtx *ictx, AioCompletion *c)
+ void aio_flush(ImageCtx *ictx, AioCompletion *c)
{
CephContext *cct = ictx->cct;
ldout(cct, 20) << "aio_flush " << ictx << " completion " << c << dendl;
+ c->get();
int r = ictx_check(ictx);
- if (r < 0)
- return r;
+ if (r < 0) {
+ c->fail(cct, r);
+ return;
+ }
ictx->user_flushed();
- c->get();
c->add_request();
c->init_time(ictx, AIO_TYPE_FLUSH);
C_AioWrite *req_comp = new C_AioWrite(cct, c);
@@ -2893,8 +2866,6 @@ reprotect_and_return_err:
c->finish_adding_requests(cct);
c->put();
ictx->perfcounter->inc(l_librbd_aio_flush);
-
- return 0;
}
int flush(ImageCtx *ictx)
@@ -2942,21 +2913,26 @@ reprotect_and_return_err:
return ictx->invalidate_cache();
}
- int aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
- AioCompletion *c)
+ void aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
+ AioCompletion *c)
{
CephContext *cct = ictx->cct;
ldout(cct, 20) << "aio_write " << ictx << " off = " << off << " len = "
<< len << " buf = " << (void*)buf << dendl;
+ c->get();
int r = ictx_check(ictx);
- if (r < 0)
- return r;
+ if (r < 0) {
+ c->fail(cct, r);
+ return;
+ }
uint64_t mylen = len;
r = clip_io(ictx, off, &mylen);
- if (r < 0)
- return r;
+ if (r < 0) {
+ c->fail(cct, r);
+ return;
+ }
ictx->snap_lock.get_read();
snapid_t snap_id = ictx->snap_id;
@@ -2967,8 +2943,10 @@ reprotect_and_return_err:
ictx->parent_lock.put_read();
ictx->snap_lock.put_read();
- if (snap_id != CEPH_NOSNAP || ictx->read_only)
- return -EROFS;
+ if (snap_id != CEPH_NOSNAP || ictx->read_only) {
+ c->fail(cct, -EROFS);
+ return;
+ }
ldout(cct, 20) << " parent overlap " << overlap << dendl;
@@ -2979,7 +2957,6 @@ reprotect_and_return_err:
&ictx->layout, off, mylen, 0, extents);
}
- c->get();
c->init_time(ictx, AIO_TYPE_WRITE);
for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length
@@ -3008,35 +2985,35 @@ reprotect_and_return_err:
objectx, object_overlap,
bl, snapc, snap_id, req_comp);
c->add_request();
- r = req->send();
- if (r < 0)
- goto done;
+ req->send();
}
}
- done:
+
c->finish_adding_requests(ictx->cct);
c->put();
ictx->perfcounter->inc(l_librbd_aio_wr);
ictx->perfcounter->inc(l_librbd_aio_wr_bytes, mylen);
-
- /* FIXME: cleanup all the allocated stuff */
- return r;
}
- int aio_discard(ImageCtx *ictx, uint64_t off, uint64_t len, AioCompletion *c)
+ void aio_discard(ImageCtx *ictx, uint64_t off, uint64_t len, AioCompletion *c)
{
CephContext *cct = ictx->cct;
ldout(cct, 20) << "aio_discard " << ictx << " off = " << off << " len = "
<< len << dendl;
+ c->get();
int r = ictx_check(ictx);
- if (r < 0)
- return r;
+ if (r < 0) {
+ c->fail(cct, r);
+ return;
+ }
r = clip_io(ictx, off, &len);
- if (r < 0)
- return r;
+ if (r < 0) {
+ c->fail(cct, r);
+ return;
+ }
// TODO: check for snap
ictx->snap_lock.get_read();
@@ -3048,8 +3025,10 @@ reprotect_and_return_err:
ictx->parent_lock.put_read();
ictx->snap_lock.put_read();
- if (snap_id != CEPH_NOSNAP || ictx->read_only)
- return -EROFS;
+ if (snap_id != CEPH_NOSNAP || ictx->read_only) {
+ c->fail(cct, -EROFS);
+ return;
+ }
// map
vector<ObjectExtent> extents;
@@ -3058,7 +3037,6 @@ reprotect_and_return_err:
&ictx->layout, off, len, 0, extents);
}
- c->get();
c->init_time(ictx, AIO_TYPE_DISCARD);
for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length
@@ -3089,12 +3067,10 @@ reprotect_and_return_err:
snapc, snap_id, req_comp);
}
- r = req->send();
- if (r < 0)
- goto done;
+ req->send();
}
+
r = 0;
- done:
if (ictx->object_cacher) {
Mutex::Locker l(ictx->cache_lock);
ictx->object_cacher->discard_set(ictx->object_set, extents);
@@ -3105,9 +3081,6 @@ reprotect_and_return_err:
ictx->perfcounter->inc(l_librbd_aio_discard);
ictx->perfcounter->inc(l_librbd_aio_discard_bytes, len);
-
- /* FIXME: cleanup all the allocated stuff */
- return r;
}
void rbd_req_cb(completion_t cb, void *arg)
@@ -3117,23 +3090,27 @@ reprotect_and_return_err:
req->complete(comp->get_return_value());
}
- int aio_read(ImageCtx *ictx, uint64_t off, size_t len,
+ void aio_read(ImageCtx *ictx, uint64_t off, size_t len,
char *buf, bufferlist *bl,
AioCompletion *c)
{
vector<pair<uint64_t,uint64_t> > image_extents(1);
image_extents[0] = make_pair(off, len);
- return aio_read(ictx, image_extents, buf, bl, c);
+ aio_read(ictx, image_extents, buf, bl, c);
}
- int aio_read(ImageCtx *ictx, const vector<pair<uint64_t,uint64_t> >& image_extents,
- char *buf, bufferlist *pbl, AioCompletion *c)
+ void aio_read(ImageCtx *ictx, const vector<pair<uint64_t,uint64_t> >& image_extents,
+ char *buf, bufferlist *pbl, AioCompletion *c)
{
- ldout(ictx->cct, 20) << "aio_read " << ictx << " completion " << c << " " << image_extents << dendl;
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "aio_read " << ictx << " completion " << c << " " << image_extents << dendl;
+ c->get();
int r = ictx_check(ictx);
- if (r < 0)
- return r;
+ if (r < 0) {
+ c->fail(cct, r);
+ return;
+ }
ictx->snap_lock.get_read();
snap_t snap_id = ictx->snap_id;
@@ -3148,8 +3125,10 @@ reprotect_and_return_err:
++p) {
uint64_t len = p->second;
r = clip_io(ictx, p->first, &len);
- if (r < 0)
- return r;
+ if (r < 0) {
+ c->fail(cct, r);
+ return;
+ }
if (len == 0)
continue;
@@ -3158,13 +3137,10 @@ reprotect_and_return_err:
buffer_ofs += len;
}
- int64_t ret;
-
c->read_buf = buf;
c->read_buf_len = buffer_ofs;
c->read_bl = pbl;
- c->get();
c->init_time(ictx, AIO_TYPE_READ);
for (map<object_t,vector<ObjectExtent> >::iterator p = object_extents.begin(); p != object_extents.end(); ++p) {
for (vector<ObjectExtent>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
@@ -3185,25 +3161,16 @@ reprotect_and_return_err:
q->length, q->offset,
cache_comp);
} else {
- r = req->send();
- if (r < 0 && r == -ENOENT)
- r = 0;
- if (r < 0) {
- ret = r;
- goto done;
- }
+ req->send();
}
}
}
- ret = buffer_ofs;
- done:
+
c->finish_adding_requests(ictx->cct);
c->put();
ictx->perfcounter->inc(l_librbd_aio_rd);
ictx->perfcounter->inc(l_librbd_aio_rd_bytes, buffer_ofs);
-
- return ret;
}
AioCompletion *aio_create_completion() {
diff --git a/src/librbd/internal.h b/src/librbd/internal.h
index 1e9fd9a..7712a39 100644
--- a/src/librbd/internal.h
+++ b/src/librbd/internal.h
@@ -179,14 +179,15 @@ namespace librbd {
char *buf, bufferlist *pbl);
ssize_t write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf);
int discard(ImageCtx *ictx, uint64_t off, uint64_t len);
- int aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
- AioCompletion *c);
- int aio_discard(ImageCtx *ictx, uint64_t off, uint64_t len, AioCompletion *c);
- int aio_read(ImageCtx *ictx, uint64_t off, size_t len,
- char *buf, bufferlist *pbl, AioCompletion *c);
- int aio_read(ImageCtx *ictx, const vector<pair<uint64_t,uint64_t> >& image_extents,
- char *buf, bufferlist *pbl, AioCompletion *c);
- int aio_flush(ImageCtx *ictx, AioCompletion *c);
+
+ void aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
+ AioCompletion *c);
+ void aio_discard(ImageCtx *ictx, uint64_t off, uint64_t len, AioCompletion *c);
+ void aio_read(ImageCtx *ictx, uint64_t off, size_t len,
+ char *buf, bufferlist *pbl, AioCompletion *c);
+ void aio_read(ImageCtx *ictx, const vector<pair<uint64_t,uint64_t> >& image_extents,
+ char *buf, bufferlist *pbl, AioCompletion *c);
+ void aio_flush(ImageCtx *ictx, AioCompletion *c);
int flush(ImageCtx *ictx);
int _flush(ImageCtx *ictx);
int invalidate_cache(ImageCtx *ictx);
diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc
index 658f24b..244a5a0 100644
--- a/src/librbd/librbd.cc
+++ b/src/librbd/librbd.cc
@@ -20,6 +20,7 @@
#include "common/errno.h"
#include "common/snap_types.h"
#include "common/perf_counters.h"
+#include "common/WorkQueue.h"
#include "include/Context.h"
#include "include/rbd/librbd.hpp"
#include "osdc/ObjectCacher.h"
@@ -45,6 +46,117 @@ using ceph::bufferlist;
using librados::snap_t;
using librados::IoCtx;
+namespace {
+
+class C_AioReadWQ : public Context {
+public:
+ C_AioReadWQ(librbd::ImageCtx *ictx, uint64_t off, size_t len,
+ char *buf, bufferlist *pbl, librbd::AioCompletion *c)
+ : m_ictx(ictx), m_off(off), m_len(len), m_buf(buf), m_pbl(pbl), m_comp(c) {
+ }
+protected:
+ virtual void finish(int r) {
+ librbd::aio_read(m_ictx, m_off, m_len, m_buf, m_pbl, m_comp);
+ }
+private:
+ librbd::ImageCtx *m_ictx;
+ uint64_t m_off;
+ uint64_t m_len;
+ char *m_buf;
+ bufferlist *m_pbl;
+ librbd::AioCompletion *m_comp;
+};
+
+class C_AioWriteWQ : public Context {
+public:
+ C_AioWriteWQ(librbd::ImageCtx *ictx, uint64_t off, size_t len,
+ const char *buf, librbd::AioCompletion *c)
+ : m_ictx(ictx), m_off(off), m_len(len), m_buf(buf), m_comp(c) {
+ }
+protected:
+ virtual void finish(int r) {
+ librbd::aio_write(m_ictx, m_off, m_len, m_buf, m_comp);
+ }
+private:
+ librbd::ImageCtx *m_ictx;
+ uint64_t m_off;
+ uint64_t m_len;
+ const char *m_buf;
+ librbd::AioCompletion *m_comp;
+};
+
+class C_AioDiscardWQ : public Context {
+public:
+ C_AioDiscardWQ(librbd::ImageCtx *ictx, uint64_t off, uint64_t len,
+ librbd::AioCompletion *c)
+ : m_ictx(ictx), m_off(off), m_len(len), m_comp(c) {
+ }
+protected:
+ virtual void finish(int r) {
+ librbd::aio_discard(m_ictx, m_off, m_len, m_comp);
+ }
+private:
+ librbd::ImageCtx *m_ictx;
+ uint64_t m_off;
+ uint64_t m_len;
+ librbd::AioCompletion *m_comp;
+};
+
+class C_AioFlushWQ : public Context {
+public:
+ C_AioFlushWQ(librbd::ImageCtx *ictx, librbd::AioCompletion *c)
+ : m_ictx(ictx), m_comp(c) {
+ }
+protected:
+ virtual void finish(int r) {
+ librbd::aio_flush(m_ictx, m_comp);
+ }
+private:
+ librbd::ImageCtx *m_ictx;
+ librbd::AioCompletion *m_comp;
+};
+
+void submit_aio_read(librbd::ImageCtx *ictx, uint64_t off, size_t len,
+ char *buf, bufferlist *pbl, librbd::AioCompletion *c) {
+ if (ictx->cct->_conf->rbd_non_blocking_aio) {
+ ictx->aio_work_queue->queue(new C_AioReadWQ(ictx, off, len, buf, pbl, c));
+ } else {
+ librbd::aio_read(ictx, off, len, buf, pbl, c);
+ }
+}
+
+void submit_aio_write(librbd::ImageCtx *ictx, uint64_t off, size_t len,
+ const char *buf, librbd::AioCompletion *c) {
+ if (ictx->cct->_conf->rbd_non_blocking_aio) {
+ ictx->aio_work_queue->queue(new C_AioWriteWQ(ictx, off, len, buf, c));
+ } else {
+ librbd::aio_write(ictx, off, len, buf, c);
+ }
+}
+
+void submit_aio_discard(librbd::ImageCtx *ictx, uint64_t off, uint64_t len,
+ librbd::AioCompletion *c) {
+ if (ictx->cct->_conf->rbd_non_blocking_aio) {
+ ictx->aio_work_queue->queue(new C_AioDiscardWQ(ictx, off, len, c));
+ } else {
+ librbd::aio_discard(ictx, off, len, c);
+ }
+}
+
+void submit_aio_flush(librbd::ImageCtx *ictx, librbd::AioCompletion *c) {
+ if (ictx->cct->_conf->rbd_non_blocking_aio) {
+ ictx->aio_work_queue->queue(new C_AioFlushWQ(ictx, c));
+ } else {
+ librbd::aio_flush(ictx, c);
+ }
+}
+
+librbd::AioCompletion* get_aio_completion(librbd::RBD::AioCompletion *comp) {
+ return reinterpret_cast<librbd::AioCompletion *>(comp->pc);
+}
+
+} // anonymous namespace
+
namespace librbd {
ProgressContext::~ProgressContext()
{
@@ -483,14 +595,15 @@ namespace librbd {
ImageCtx *ictx = (ImageCtx *)ctx;
if (bl.length() < len)
return -EINVAL;
- return librbd::aio_write(ictx, off, len, bl.c_str(),
- (librbd::AioCompletion *)c->pc);
+ submit_aio_write(ictx, off, len, bl.c_str(), get_aio_completion(c));
+ return 0;
}
int Image::aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c)
{
ImageCtx *ictx = (ImageCtx *)ctx;
- return librbd::aio_discard(ictx, off, len, (librbd::AioCompletion *)c->pc);
+ submit_aio_discard(ictx, off, len, get_aio_completion(c));
+ return 0;
}
int Image::aio_read(uint64_t off, size_t len, bufferlist& bl,
@@ -499,7 +612,8 @@ namespace librbd {
ImageCtx *ictx = (ImageCtx *)ctx;
ldout(ictx->cct, 10) << "Image::aio_read() buf=" << (void *)bl.c_str() << "~"
<< (void *)(bl.c_str() + len - 1) << dendl;
- return librbd::aio_read(ictx, off, len, NULL, &bl, (librbd::AioCompletion *)c->pc);
+ submit_aio_read(ictx, off, len, NULL, &bl, get_aio_completion(c));
+ return 0;
}
int Image::flush()
@@ -511,7 +625,8 @@ namespace librbd {
int Image::aio_flush(RBD::AioCompletion *c)
{
ImageCtx *ictx = (ImageCtx *)ctx;
- return librbd::aio_flush(ictx, (librbd::AioCompletion *)c->pc);
+ submit_aio_flush(ictx, get_aio_completion(c));
+ return 0;
}
int Image::invalidate_cache()
@@ -1102,8 +1217,8 @@ extern "C" int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len,
{
librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
- return librbd::aio_write(ictx, off, len, buf,
- (librbd::AioCompletion *)comp->pc);
+ submit_aio_write(ictx, off, len, buf, get_aio_completion(comp));
+ return 0;
}
extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
@@ -1111,7 +1226,8 @@ extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
{
librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
- return librbd::aio_discard(ictx, off, len, (librbd::AioCompletion *)comp->pc);
+ submit_aio_discard(ictx, off, len, get_aio_completion(comp));
+ return 0;
}
extern "C" int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len,
@@ -1119,8 +1235,8 @@ extern "C" int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len,
{
librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
- return librbd::aio_read(ictx, off, len, buf, NULL,
- (librbd::AioCompletion *)comp->pc);
+ submit_aio_read(ictx, off, len, buf, NULL, get_aio_completion(comp));
+ return 0;
}
extern "C" int rbd_flush(rbd_image_t image)
@@ -1133,7 +1249,8 @@ extern "C" int rbd_aio_flush(rbd_image_t image, rbd_completion_t c)
{
librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
- return librbd::aio_flush(ictx, (librbd::AioCompletion *)comp->pc);
+ submit_aio_flush(ictx, get_aio_completion(comp));
+ return 0;
}
extern "C" int rbd_invalidate_cache(rbd_image_t image)
diff --git a/src/log/Log.cc b/src/log/Log.cc
index 37bb4ef..392b2c0 100644
--- a/src/log/Log.cc
+++ b/src/log/Log.cc
@@ -112,6 +112,7 @@ void Log::set_log_file(string fn)
void Log::reopen_log_file()
{
+ pthread_mutex_lock(&m_flush_mutex);
if (m_fd >= 0)
VOID_TEMP_FAILURE_RETRY(::close(m_fd));
if (m_log_file.length()) {
@@ -119,6 +120,7 @@ void Log::reopen_log_file()
} else {
m_fd = -1;
}
+ pthread_mutex_unlock(&m_flush_mutex);
}
void Log::set_syslog_level(int log, int crash)
diff --git a/src/mds/MDSUtility.cc b/src/mds/MDSUtility.cc
index 09be280..65cf5c6 100644
--- a/src/mds/MDSUtility.cc
+++ b/src/mds/MDSUtility.cc
@@ -25,7 +25,7 @@ MDSUtility::MDSUtility() :
waiting_for_mds_map(NULL)
{
monc = new MonClient(g_ceph_context);
- messenger = Messenger::create(g_ceph_context, entity_name_t::CLIENT(), "mds", getpid());
+ messenger = Messenger::create_client_messenger(g_ceph_context, "mds");
mdsmap = new MDSMap();
osdmap = new OSDMap();
objecter = new Objecter(g_ceph_context, messenger, monc, osdmap, lock, timer, 0, 0);
diff --git a/src/messages/MOSDBoot.h b/src/messages/MOSDBoot.h
index bfe7775..e59b03c 100644
--- a/src/messages/MOSDBoot.h
+++ b/src/messages/MOSDBoot.h
@@ -22,7 +22,7 @@
class MOSDBoot : public PaxosServiceMessage {
- static const int HEAD_VERSION = 5;
+ static const int HEAD_VERSION = 6;
static const int COMPAT_VERSION = 2;
public:
@@ -31,21 +31,24 @@ class MOSDBoot : public PaxosServiceMessage {
entity_addr_t cluster_addr;
epoch_t boot_epoch; // last epoch this daemon was added to the map (if any)
map<string,string> metadata; ///< misc metadata about this osd
+ uint64_t osd_features;
MOSDBoot()
: PaxosServiceMessage(MSG_OSD_BOOT, 0, HEAD_VERSION, COMPAT_VERSION),
- boot_epoch(0)
+ boot_epoch(0), osd_features(0)
{ }
MOSDBoot(OSDSuperblock& s, epoch_t be,
const entity_addr_t& hb_back_addr_ref,
const entity_addr_t& hb_front_addr_ref,
- const entity_addr_t& cluster_addr_ref)
+ const entity_addr_t& cluster_addr_ref,
+ uint64_t feat)
: PaxosServiceMessage(MSG_OSD_BOOT, s.current_epoch, HEAD_VERSION, COMPAT_VERSION),
sb(s),
hb_back_addr(hb_back_addr_ref),
hb_front_addr(hb_front_addr_ref),
cluster_addr(cluster_addr_ref),
- boot_epoch(be)
+ boot_epoch(be),
+ osd_features(feat)
{ }
private:
@@ -54,7 +57,9 @@ private:
public:
const char *get_type_name() const { return "osd_boot"; }
void print(ostream& out) const {
- out << "osd_boot(osd." << sb.whoami << " booted " << boot_epoch << " v" << version << ")";
+ out << "osd_boot(osd." << sb.whoami << " booted " << boot_epoch
+ << " features " << osd_features
+ << " v" << version << ")";
}
void encode_payload(uint64_t features) {
@@ -65,6 +70,7 @@ public:
::encode(boot_epoch, payload);
::encode(hb_front_addr, payload);
::encode(metadata, payload);
+ ::encode(osd_features, payload);
}
void decode_payload() {
bufferlist::iterator p = payload.begin();
@@ -79,6 +85,10 @@ public:
::decode(hb_front_addr, p);
if (header.version >= 5)
::decode(metadata, p);
+ if (header.version >= 6)
+ ::decode(osd_features, p);
+ else
+ osd_features = 0;
}
};
diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc
index 359c2e1..9e074e2 100644
--- a/src/mon/AuthMonitor.cc
+++ b/src/mon/AuthMonitor.cc
@@ -339,7 +339,8 @@ uint64_t AuthMonitor::assign_global_id(MAuth *m, bool should_increase_max)
// bump the max?
while (mon->is_leader() &&
- next_global_id >= max_global_id - g_conf->mon_globalid_prealloc / 2) {
+ (max_global_id < g_conf->mon_globalid_prealloc ||
+ next_global_id >= max_global_id - g_conf->mon_globalid_prealloc / 2)) {
increase_max_global_id();
}
diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc
index 2ee10f7..debe58c 100644
--- a/src/mon/Elector.cc
+++ b/src/mon/Elector.cc
@@ -78,8 +78,15 @@ void Elector::start()
init();
// start by trying to elect me
- if (epoch % 2 == 0)
+ if (epoch % 2 == 0) {
bump_epoch(epoch+1); // odd == election cycle
+ } else {
+ // do a trivial db write just to ensure it is writeable.
+ MonitorDBStore::Transaction t;
+ t.put(Monitor::MONITOR_NAME, "election_writeable_test", rand());
+ int r = mon->store->apply_transaction(t);
+ assert(r >= 0);
+ }
start_stamp = ceph_clock_now(g_ceph_context);
electing_me = true;
acked_me[mon->rank] = CEPH_FEATURES_ALL;
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index 12755b7..b05b8e8 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -208,17 +208,16 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
// check privileges, ignore if fails
MonSession *session = m->get_session();
- if (!session)
- goto out;
+ assert(session);
if (!session->is_capable("mds", MON_CAP_X)) {
dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
<< session->caps << dendl;
- goto out;
+ goto ignore;
}
if (m->get_fsid() != mon->monmap->fsid) {
dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon->monmap->fsid << dendl;
- goto out;
+ goto ignore;
}
dout(12) << "preprocess_beacon " << *m
@@ -229,13 +228,13 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
// make sure the address has a port
if (m->get_orig_source_addr().get_port() == 0) {
dout(1) << " ignoring boot message without a port" << dendl;
- goto out;
+ goto ignore;
}
// check compat
if (!m->get_compat().writeable(mdsmap.compat)) {
dout(1) << " mds " << m->get_source_inst() << " can't write to mdsmap " << mdsmap.compat << dendl;
- goto out;
+ goto ignore;
}
// fw to leader?
@@ -244,7 +243,7 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
if (pending_mdsmap.test_flag(CEPH_MDSMAP_DOWN)) {
dout(7) << " mdsmap DOWN flag set, ignoring mds " << m->get_source_inst() << " beacon" << dendl;
- goto out;
+ goto ignore;
}
// booted, but not in map?
@@ -252,7 +251,8 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
if (state != MDSMap::STATE_BOOT) {
dout(7) << "mds_beacon " << *m << " is not in mdsmap" << dendl;
mon->send_reply(m, new MMDSMap(mon->monmap->fsid, &mdsmap));
- goto out;
+ m->put();
+ return true;
} else {
return false; // not booted yet.
}
@@ -262,13 +262,13 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
// old seq?
if (info.state_seq > seq) {
dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
- goto out;
+ goto ignore;
}
if (mdsmap.get_epoch() != m->get_last_epoch_seen()) {
dout(10) << "mds_beacon " << *m
<< " ignoring requested state, because mds hasn't seen latest map" << dendl;
- goto ignore;
+ goto reply;
}
if (info.laggy()) {
@@ -277,17 +277,17 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
}
if (state == MDSMap::STATE_BOOT) {
// ignore, already booted.
- goto out;
+ goto ignore;
}
// is there a state change here?
- if (info.state != state) {
+ if (info.state != state) {
// legal state change?
if ((info.state == MDSMap::STATE_STANDBY ||
info.state == MDSMap::STATE_STANDBY_REPLAY ||
info.state == MDSMap::STATE_ONESHOT_REPLAY) && state > 0) {
dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
<< " -> " << ceph_mds_state_name(state) << ")" << dendl;
- goto ignore;
+ goto reply;
}
if (info.state == MDSMap::STATE_STANDBY &&
@@ -299,21 +299,24 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m)
dout(10) << "mds_beacon can't standby-replay mds." << m->get_standby_for_rank() << " at this time (cluster degraded, or mds not active)" << dendl;
dout(10) << "pending_mdsmap.is_degraded()==" << pending_mdsmap.is_degraded()
<< " rank state: " << ceph_mds_state_name(pending_mdsmap.get_state(m->get_standby_for_rank())) << dendl;
- goto ignore;
+ goto reply;
}
_note_beacon(m);
return false; // need to update map
}
- ignore:
+ reply:
// note time and reply
_note_beacon(m);
mon->send_reply(m,
new MMDSBeacon(mon->monmap->fsid, m->get_global_id(), m->get_name(),
mdsmap.get_epoch(), state, seq));
-
- // done
- out:
+ m->put();
+ return true;
+
+ ignore:
+ // I won't reply this beacon, drop it.
+ mon->no_reply(m);
m->put();
return true;
}
@@ -525,8 +528,14 @@ void MDSMonitor::_updated(MMDSBeacon *m)
if (m->get_state() == MDSMap::STATE_STOPPED) {
// send the map manually (they're out of the map, so they won't get it automatic)
mon->send_reply(m, new MMDSMap(mon->monmap->fsid, &mdsmap));
+ } else {
+ mon->send_reply(m, new MMDSBeacon(mon->monmap->fsid,
+ m->get_global_id(),
+ m->get_name(),
+ mdsmap.get_epoch(),
+ m->get_state(),
+ m->get_seq()));
}
-
m->put();
}
diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h
index 901e93e..230f0ac 100644
--- a/src/mon/MDSMonitor.h
+++ b/src/mon/MDSMonitor.h
@@ -56,10 +56,12 @@ class MDSMonitor : public PaxosService {
void finish(int r) {
if (r >= 0)
mm->_updated(m); // success
- else if (r == -ECANCELED)
+ else if (r == -ECANCELED) {
+ mm->mon->no_reply(m);
m->put();
- else
+ } else {
mm->dispatch((PaxosServiceMessage*)m); // try again
+ }
}
};
diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc
index 3a6dda4..0272dfd 100644
--- a/src/mon/MonClient.cc
+++ b/src/mon/MonClient.cc
@@ -12,7 +12,7 @@
*
*/
-#include "msg/SimpleMessenger.h"
+#include "msg/Messenger.h"
#include "messages/MMonGetMap.h"
#include "messages/MMonGetVersion.h"
#include "messages/MMonGetVersionReply.h"
@@ -114,11 +114,9 @@ int MonClient::get_monmap_privately()
Mutex::Locker l(monc_lock);
bool temp_msgr = false;
- SimpleMessenger* smessenger = NULL;
+ Messenger* smessenger = NULL;
if (!messenger) {
- messenger = smessenger = new SimpleMessenger(cct,
- entity_name_t::CLIENT(-1),
- "temp_mon_client", getpid());
+ messenger = smessenger = Messenger::create_client_messenger(cct, "temp_mon_client");
messenger->add_dispatcher_head(this);
smessenger->start();
temp_msgr = true;
@@ -213,9 +211,7 @@ int MonClient::ping_monitor(const string &mon_id, string *result_reply)
MonClientPinger *pinger = new MonClientPinger(cct, result_reply);
- Messenger *smsgr = new SimpleMessenger(cct,
- entity_name_t::CLIENT(-1),
- "temp_ping_client", getpid());
+ Messenger *smsgr = Messenger::create_client_messenger(cct, "temp_ping_client");
smsgr->add_dispatcher_head(pinger);
smsgr->start();
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index 461b3f2..d0908cb 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -564,11 +564,11 @@ COMMAND("osd pool rename " \
"rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
COMMAND("osd pool get " \
"name=pool,type=CephPoolname " \
- "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile", \
+ "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote", \
"get pool parameter <var>", "osd", "r", "cli,rest")
COMMAND("osd pool set " \
"name=pool,type=CephPoolname " \
- "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid " \
+ "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote " \
"name=val,type=CephString " \
"name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
"set pool parameter <var> to <val>", "osd", "rw", "cli,rest")
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 2240e41..339fd3e 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -642,7 +642,7 @@ void Monitor::refresh_from_paxos(bool *need_bootstrap)
paxos_service[i]->refresh(need_bootstrap);
}
for (int i = 0; i < PAXOS_NUM; ++i) {
- paxos_service[i]->post_paxos_update();
+ paxos_service[i]->post_refresh();
}
}
@@ -1925,30 +1925,6 @@ void Monitor::get_health(string& status, bufferlist *detailbl, Formatter *f)
health_monitor->get_health(f, summary, (detailbl ? &detail : NULL));
- if (f)
- f->open_array_section("summary");
- stringstream ss;
- health_status_t overall = HEALTH_OK;
- if (!summary.empty()) {
- ss << ' ';
- while (!summary.empty()) {
- if (overall > summary.front().first)
- overall = summary.front().first;
- ss << summary.front().second;
- if (f) {
- f->open_object_section("item");
- f->dump_stream("severity") << summary.front().first;
- f->dump_string("summary", summary.front().second);
- f->close_section();
- }
- summary.pop_front();
- if (!summary.empty())
- ss << "; ";
- }
- }
- if (f)
- f->close_section();
-
if (f) {
f->open_object_section("timechecks");
f->dump_int("epoch", get_epoch());
@@ -1957,6 +1933,8 @@ void Monitor::get_health(string& status, bufferlist *detailbl, Formatter *f)
<< ((timecheck_round%2) ? "on-going" : "finished");
}
+ stringstream ss;
+ health_status_t overall = HEALTH_OK;
if (!timecheck_skews.empty()) {
list<string> warns;
if (f)
@@ -2003,6 +1981,7 @@ void Monitor::get_health(string& status, bufferlist *detailbl, Formatter *f)
if (!warns.empty())
ss << ",";
}
+ summary.push_back(make_pair(HEALTH_WARN, "Monitor clock skew detected "));
}
if (f)
f->close_section();
@@ -2010,6 +1989,28 @@ void Monitor::get_health(string& status, bufferlist *detailbl, Formatter *f)
if (f)
f->close_section();
+ if (f)
+ f->open_array_section("summary");
+ if (!summary.empty()) {
+ ss << ' ';
+ while (!summary.empty()) {
+ if (overall > summary.front().first)
+ overall = summary.front().first;
+ ss << summary.front().second;
+ if (f) {
+ f->open_object_section("item");
+ f->dump_stream("severity") << summary.front().first;
+ f->dump_string("summary", summary.front().second);
+ f->close_section();
+ }
+ summary.pop_front();
+ if (!summary.empty())
+ ss << "; ";
+ }
+ }
+ if (f)
+ f->close_section();
+
stringstream fss;
fss << overall;
status = fss.str() + ss.str();
@@ -2577,7 +2578,8 @@ void Monitor::forward_request_leader(PaxosServiceMessage *req)
routed_requests[rr->tid] = rr;
session->routed_request_tids.insert(rr->tid);
- dout(10) << "forward_request " << rr->tid << " request " << *req << dendl;
+ dout(10) << "forward_request " << rr->tid << " request " << *req
+ << " features " << rr->con_features << dendl;
MForward *forward = new MForward(rr->tid, req,
rr->con_features,
@@ -2934,12 +2936,11 @@ bool Monitor::_ms_dispatch(Message *m)
dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl;
}
- if (s) {
- if (s->auth_handler) {
- s->entity_name = s->auth_handler->get_entity_name();
- }
- dout(20) << " caps " << s->caps.get_str() << dendl;
+ assert(s);
+ if (s->auth_handler) {
+ s->entity_name = s->auth_handler->get_entity_name();
}
+ dout(20) << " caps " << s->caps.get_str() << dendl;
if (is_synchronizing() && !src_is_mon) {
waitlist_or_zap_client(m);
@@ -2947,11 +2948,7 @@ bool Monitor::_ms_dispatch(Message *m)
}
ret = dispatch(s, m, src_is_mon);
-
- if (s) {
- s->put();
- }
-
+ s->put();
return ret;
}
diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h
index 1576db7..f16770c 100644
--- a/src/mon/MonitorDBStore.h
+++ b/src/mon/MonitorDBStore.h
@@ -283,6 +283,8 @@ class MonitorDBStore
db->compact_range_async(compact.front().first, compact.front().second.first, compact.front().second.second);
compact.pop_front();
}
+ } else {
+ assert(0 == "failed to write to db");
}
return r;
}
@@ -390,11 +392,15 @@ class MonitorDBStore
virtual pair<string,string> get_next_key() {
assert(iter->valid());
- pair<string,string> r = iter->raw_key();
- do {
- iter->next();
- } while (iter->valid() && sync_prefixes.count(iter->raw_key().first) == 0);
- return r;
+
+ for (; iter->valid(); iter->next()) {
+ pair<string,string> r = iter->raw_key();
+ if (sync_prefixes.count(r.first) > 0) {
+ iter->next();
+ return r;
+ }
+ }
+ return pair<string,string>();
}
virtual bool _is_valid() {
@@ -506,7 +512,8 @@ class MonitorDBStore
for (iter = prefixes.begin(); iter != prefixes.end(); ++iter) {
dbt->rmkeys_by_prefix((*iter));
}
- db->submit_transaction_sync(dbt);
+ int r = db->submit_transaction_sync(dbt);
+ assert(r >= 0);
}
void init_options() {
diff --git a/src/mon/MonitorStore.cc b/src/mon/MonitorStore.cc
index db21a94..afaddab 100644
--- a/src/mon/MonitorStore.cc
+++ b/src/mon/MonitorStore.cc
@@ -437,7 +437,13 @@ void MonitorStore::put_bl_sn_map(const char *a,
derr << "failed to open " << dir << ": " << cpp_strerror(err) << dendl;
assert(0 == "failed to open temp file");
}
- sync_filesystem(dirfd);
+
+ err = sync_filesystem(dirfd);
+ if (err < 0) {
+ derr << "sync_filesystem error " << cpp_strerror(err) << dendl;
+ assert(0 == "failed to sync_filesystem");
+ }
+
close_err = TEMP_FAILURE_RETRY(::close(dirfd));
assert (0 == close_err);
@@ -481,7 +487,13 @@ void MonitorStore::sync()
<< ": " << cpp_strerror(err) << dendl;
assert(0 == "failed to open dir for syncing");
}
- sync_filesystem(dirfd);
+
+ int ret = sync_filesystem(dirfd);
+ if (ret < 0) {
+ derr << __func__ << " sync_filesystem error " << cpp_strerror(ret) << dendl;
+ assert(0 == "failed to sync_filesystem");
+ }
+
int close_err = TEMP_FAILURE_RETRY(::close(dirfd));
assert (0 == close_err);
}
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 510b727..599de08 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -66,6 +66,12 @@ static ostream& _prefix(std::ostream *_dout, Monitor *mon, OSDMap& osdmap) {
<< ").osd e" << osdmap.get_epoch() << " ";
}
+OSDMonitor::OSDMonitor(Monitor *mn, Paxos *p, string service_name)
+ : PaxosService(mn, p, service_name),
+ inc_osd_cache(g_conf->mon_osd_cache_size),
+ full_osd_cache(g_conf->mon_osd_cache_size),
+ thrash_map(0), thrash_last_up_osd(-1) { }
+
bool OSDMonitor::_have_pending_crush()
{
return pending_inc.crush.length();
@@ -1349,8 +1355,13 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m)
xi.laggy_probability * (1.0 - g_conf->mon_osd_laggy_weight);
dout(10) << " laggy, now xi " << xi << dendl;
}
+
// set features shared by the osd
- xi.features = m->get_connection()->get_features();
+ if (m->osd_features)
+ xi.features = m->osd_features;
+ else
+ xi.features = m->get_connection()->get_features();
+
pending_inc.new_xinfo[from] = xi;
// wait
@@ -1765,6 +1776,29 @@ void OSDMonitor::send_incremental(epoch_t first, entity_inst_t& dest, bool oneti
}
}
+int OSDMonitor::get_version(version_t ver, bufferlist& bl)
+{
+ if (inc_osd_cache.lookup(ver, &bl)) {
+ return 0;
+ }
+ int ret = PaxosService::get_version(ver, bl);
+ if (!ret) {
+ inc_osd_cache.add(ver, bl);
+ }
+ return ret;
+}
+
+int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
+{
+ if (full_osd_cache.lookup(ver, &bl)) {
+ return 0;
+ }
+ int ret = PaxosService::get_version_full(ver, bl);
+ if (!ret) {
+ full_osd_cache.add(ver, bl);
+ }
+ return ret;
+}
@@ -2476,8 +2510,6 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
} else if (prefix == "osd crush get-tunable") {
string tunable;
cmd_getval(g_ceph_context, cmdmap, "tunable", tunable);
- int value;
- cmd_getval(g_ceph_context, cmdmap, "value", value);
ostringstream rss;
if (f)
f->open_object_section("tunable");
@@ -2587,6 +2619,8 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
} else if (var == "erasure_code_profile") {
f->dump_string("erasure_code_profile", p->erasure_code_profile);
+ } else if (var == "min_read_recency_for_promote") {
+ f->dump_int("min_read_recency_for_promote", p->min_read_recency_for_promote);
}
f->close_section();
@@ -2636,6 +2670,8 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
ss << "cache_min_evict_age: " << p->cache_min_evict_age;
} else if (var == "erasure_code_profile") {
ss << "erasure_code_profile: " << p->erasure_code_profile;
+ } else if (var == "min_read_recency_for_promote") {
+ ss << "min_read_recency_for_promote: " << p->min_read_recency_for_promote;
}
rdata.append(ss);
@@ -3039,7 +3075,7 @@ void OSDMonitor::get_pools_health(
} else if (warn_threshold > 0 &&
sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
ss << "pool '" << pool_name
- << "' has " << si_t(sum.num_bytes) << " objects"
+ << "' has " << si_t(sum.num_bytes) << " bytes"
<< " (max " << si_t(pool.quota_max_bytes) << ")";
status = HEALTH_WARN;
}
@@ -3752,6 +3788,12 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
return -EINVAL;
}
p.cache_min_evict_age = n;
+ } else if (var == "min_read_recency_for_promote") {
+ if (interr.length()) {
+ ss << "error parsing integer value '" << val << "': " << interr;
+ return -EINVAL;
+ }
+ p.min_read_recency_for_promote = n;
} else {
ss << "unrecognized variable '" << var << "'";
return -EINVAL;
@@ -4834,7 +4876,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
}
if (osdmap.exists(id)) {
pending_inc.new_weight[id] = ww;
- ss << "reweighted osd." << id << " to " << w << " (" << ios::hex << ww << ios::dec << ")";
+ ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
getline(ss, rs);
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
get_last_committed() + 1));
@@ -5310,6 +5352,17 @@ done:
err = -ENOTEMPTY;
goto reply;
}
+ if (tp->ec_pool()) {
+ ss << "tier pool '" << tierpoolstr
+ << "' is an ec pool, which cannot be a tier";
+ err = -ENOTSUP;
+ goto reply;
+ }
+ if (!tp->removed_snaps.empty() || !tp->snaps.empty()) {
+ ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
+ err = -ENOTEMPTY;
+ goto reply;
+ }
// go
pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
@@ -5637,6 +5690,7 @@ done:
ntp->cache_mode = mode;
ntp->hit_set_count = g_conf->osd_tier_default_cache_hit_set_count;
ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
+ ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
ntp->hit_set_params = hsp;
ntp->target_max_bytes = size;
ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
@@ -5990,6 +6044,12 @@ int OSDMonitor::_check_remove_pool(int64_t pool, const pg_pool_t *p,
}
return -EBUSY;
}
+
+ if (!g_conf->mon_allow_pool_delete) {
+ *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
+ return -EPERM;
+ }
+
*ss << "pool '" << poolstr << "' removed";
return 0;
}
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index fbce5fe..6428820 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -26,6 +26,7 @@
using namespace std;
#include "include/types.h"
+#include "common/simple_cache.hpp"
#include "msg/Messenger.h"
#include "osd/OSDMap.h"
@@ -139,6 +140,8 @@ private:
* optimization to try to avoid sending the same inc maps twice.
*/
map<int,epoch_t> osd_epoch;
+ SimpleLRU<version_t, bufferlist> inc_osd_cache;
+ SimpleLRU<version_t, bufferlist> full_osd_cache;
void check_failures(utime_t now);
bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
@@ -358,9 +361,7 @@ private:
bool prepare_remove_snaps(struct MRemoveSnaps *m);
public:
- OSDMonitor(Monitor *mn, Paxos *p, string service_name)
- : PaxosService(mn, p, service_name),
- thrash_map(0), thrash_last_up_osd(-1) { }
+ OSDMonitor(Monitor *mn, Paxos *p, string service_name);
void tick(); // check state, take actions
@@ -384,6 +385,9 @@ private:
send_incremental(m, start);
}
+ int get_version(version_t ver, bufferlist& bl);
+ int get_version_full(version_t ver, bufferlist& bl);
+
epoch_t blacklist(const entity_addr_t& a, utime_t until);
void dump_info(Formatter *f);
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index 3e0523b..c85d55b 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -1000,8 +1000,8 @@ bool PGMonitor::register_new_pgs()
++p) {
int64_t poolid = p->first;
pg_pool_t &pool = p->second;
- int ruleno = pool.get_crush_ruleset();
- if (!osdmap->crush->rule_exists(ruleno))
+ int ruleno = osdmap->crush->find_rule(pool.get_crush_ruleset(), pool.get_type(), pool.get_size());
+ if (ruleno < 0 || !osdmap->crush->rule_exists(ruleno))
continue;
if (pool.get_last_change() <= pg_map.last_pg_scan ||
@@ -1991,7 +1991,7 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
((1000000 - p->second.cache_target_full_ratio_micro) *
g_conf->mon_cache_target_full_warn_ratio);
if (p->second.target_max_objects && (uint64_t)st.stats.sum.num_objects >
- p->second.target_max_objects * ratio / 1000000) {
+ p->second.target_max_objects * (ratio / 1000000.0)) {
nearfull = true;
if (detail) {
ostringstream ss;
@@ -2003,7 +2003,7 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
}
}
if (p->second.target_max_bytes && (uint64_t)st.stats.sum.num_bytes >
- p->second.target_max_bytes * ratio / 1000000) {
+ p->second.target_max_bytes * (ratio / 1000000.0)) {
nearfull = true;
if (detail) {
ostringstream ss;
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
index 1b21689..7ba8e9c 100644
--- a/src/mon/PaxosService.cc
+++ b/src/mon/PaxosService.cc
@@ -127,6 +127,16 @@ void PaxosService::refresh(bool *need_bootstrap)
update_from_paxos(need_bootstrap);
}
+void PaxosService::post_refresh()
+{
+ dout(10) << __func__ << dendl;
+
+ post_paxos_update();
+
+ if (mon->is_peon() && !waiting_for_finished_proposal.empty()) {
+ finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);
+ }
+}
void PaxosService::remove_legacy_versions()
{
diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h
index 5321beb..6affd5c 100644
--- a/src/mon/PaxosService.h
+++ b/src/mon/PaxosService.h
@@ -322,6 +322,7 @@ public:
bool dispatch(PaxosServiceMessage *m);
void refresh(bool *need_bootstrap);
+ void post_refresh();
/**
* @defgroup PaxosService_h_override_funcs Functions that should be
@@ -857,7 +858,7 @@ public:
* @param bl The bufferlist to be populated
* @return 0 on success; <0 otherwise
*/
- int get_version(version_t ver, bufferlist& bl) {
+ virtual int get_version(version_t ver, bufferlist& bl) {
return mon->store->get(get_service_name(), ver, bl);
}
/**
@@ -867,7 +868,7 @@ public:
* @param bl The bufferlist to be populated
* @returns 0 on success; <0 otherwise
*/
- int get_version_full(version_t ver, bufferlist& bl) {
+ virtual int get_version_full(version_t ver, bufferlist& bl) {
string key = mon->store->combine_strings(full_prefix_name, ver);
return mon->store->get(get_service_name(), key, bl);
}
diff --git a/src/msg/Messenger.cc b/src/msg/Messenger.cc
index b80782d..5dc69a6 100644
--- a/src/msg/Messenger.cc
+++ b/src/msg/Messenger.cc
@@ -4,6 +4,13 @@
#include "SimpleMessenger.h"
+Messenger *Messenger::create_client_messenger(CephContext *cct, string lname)
+{
+ uint64_t nonce = 0;
+ get_random_bytes((char*)&nonce, sizeof(nonce));
+ return Messenger::create(cct, entity_name_t::CLIENT(), lname, nonce);
+}
+
Messenger *Messenger::create(CephContext *cct,
entity_name_t name,
string lname,
diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h
index 42feaf2..82ac8e6 100644
--- a/src/msg/Messenger.h
+++ b/src/msg/Messenger.h
@@ -148,6 +148,21 @@ public:
uint64_t nonce);
/**
+ * create a new messenger
+ *
+ * Create a new messenger instance.
+ * Same as the above, but a slightly simpler interface for clients:
+ * - Generate a random nonce
+ * - use the default feature bits
+ * - get the messenger type from cct
+ * - use the client entity_type
+ *
+ * @param cct context
+ * @param lname logical name of the messenger in this process (e.g., "client")
+ */
+ static Messenger *create_client_messenger(CephContext *cct, string lname);
+
+ /**
* @defgroup Accessors
* @{
*/
diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc
index b1d2db1..3d2fc8a 100644
--- a/src/os/FileJournal.cc
+++ b/src/os/FileJournal.cc
@@ -27,6 +27,7 @@
#include <limits.h>
#include <sstream>
#include <stdio.h>
+#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mount.h>
@@ -296,24 +297,28 @@ int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,
if (create && g_conf->journal_zero_on_create) {
derr << "FileJournal::_open_file : zeroing journal" << dendl;
uint64_t write_size = 1 << 20;
- char *buf = new char[write_size];
+ char *buf;
+ ret = ::posix_memalign((void **)&buf, block_size, write_size);
+ if (ret != 0) {
+ return ret;
+ }
memset(static_cast<void*>(buf), 0, write_size);
uint64_t i = 0;
for (; (i + write_size) <= (unsigned)max_size; i += write_size) {
ret = ::pwrite(fd, static_cast<void*>(buf), write_size, i);
if (ret < 0) {
- delete [] buf;
+ free(buf);
return -errno;
}
}
if (i < (unsigned)max_size) {
ret = ::pwrite(fd, static_cast<void*>(buf), max_size - i, i);
if (ret < 0) {
- delete [] buf;
+ free(buf);
return -errno;
}
}
- delete [] buf;
+ free(buf);
}
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index dd28f6a..a2528a5 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -1901,7 +1901,11 @@ void FileStore::_set_global_replay_guard(coll_t cid,
return;
// sync all previous operations on this sequencer
- sync_filesystem(basedir_fd);
+ int ret = sync_filesystem(basedir_fd);
+ if (ret < 0) {
+ derr << __func__ << " :sync_filesytem error " << cpp_strerror(ret) << dendl;
+ assert(0 == "_set_global_replay_guard failed");
+ }
char fn[PATH_MAX];
get_cdir(cid, fn, sizeof(fn));
diff --git a/src/os/WBThrottle.cc b/src/os/WBThrottle.cc
index 0354ceb..62b2ddc 100644
--- a/src/os/WBThrottle.cc
+++ b/src/os/WBThrottle.cc
@@ -246,6 +246,7 @@ void WBThrottle::clear_object(const ghobject_t &hoid)
pending_wbs.erase(i);
remove_object(hoid);
+ cond.Signal();
}
void WBThrottle::throttle()
diff --git a/src/os/chain_xattr.cc b/src/os/chain_xattr.cc
index c020c9d..e90462a 100644
--- a/src/os/chain_xattr.cc
+++ b/src/os/chain_xattr.cc
@@ -138,6 +138,10 @@ int chain_getxattr(const char *fn, const char *name, void *val, size_t size)
size -= chunk_size;
r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size);
+ if (i && r == -ENODATA) {
+ ret = pos;
+ break;
+ }
if (r < 0) {
ret = r;
break;
@@ -201,6 +205,10 @@ int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
size -= chunk_size;
r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size);
+ if (i && r == -ENODATA) {
+ ret = pos;
+ break;
+ }
if (r < 0) {
ret = r;
break;
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index 39e3429..5235d4d 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -81,7 +81,7 @@ ostream &operator<<(ostream &lhs, const ECBackend::read_result_t &rhs)
lhs << "read_result_t(r=" << rhs.r
<< ", errors=" << rhs.errors;
if (rhs.attrs) {
- lhs << ", attrs=" << rhs.attrs;
+ lhs << ", attrs=" << rhs.attrs.get();
} else {
lhs << ", noattrs";
}
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index c0f4bdd..0befa92 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -940,8 +940,16 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
finished_lock("OSD::finished_lock"),
op_tracker(cct, cct->_conf->osd_enable_op_tracker),
test_ops_hook(NULL),
- op_wq(this, cct->_conf->osd_op_thread_timeout, &op_tp),
- peering_wq(this, cct->_conf->osd_op_thread_timeout, &op_tp),
+ op_wq(
+ this,
+ cct->_conf->osd_op_thread_timeout,
+ cct->_conf->osd_op_thread_suicide_timeout,
+ &op_tp),
+ peering_wq(
+ this,
+ cct->_conf->osd_op_thread_timeout,
+ cct->_conf->osd_op_thread_suicide_timeout,
+ &op_tp),
map_lock("OSD::map_lock"),
peer_map_epoch_lock("OSD::peer_map_epoch_lock"),
debug_drop_pg_create_probability(cct->_conf->osd_debug_drop_pg_create_probability),
@@ -953,15 +961,42 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
osd_stat_updated(false),
pg_stat_tid(0), pg_stat_tid_flushed(0),
- command_wq(this, cct->_conf->osd_command_thread_timeout, &command_tp),
+ command_wq(
+ this,
+ cct->_conf->osd_command_thread_timeout,
+ cct->_conf->osd_command_thread_suicide_timeout,
+ &command_tp),
recovery_ops_active(0),
- recovery_wq(this, cct->_conf->osd_recovery_thread_timeout, &recovery_tp),
+ recovery_wq(
+ this,
+ cct->_conf->osd_recovery_thread_timeout,
+ cct->_conf->osd_recovery_thread_suicide_timeout,
+ &recovery_tp),
replay_queue_lock("OSD::replay_queue_lock"),
- snap_trim_wq(this, cct->_conf->osd_snap_trim_thread_timeout, &disk_tp),
- scrub_wq(this, cct->_conf->osd_scrub_thread_timeout, &disk_tp),
- scrub_finalize_wq(cct->_conf->osd_scrub_finalize_thread_timeout, &op_tp),
- rep_scrub_wq(this, cct->_conf->osd_scrub_thread_timeout, &disk_tp),
- remove_wq(store, cct->_conf->osd_remove_thread_timeout, &disk_tp),
+ snap_trim_wq(
+ this,
+ cct->_conf->osd_snap_trim_thread_timeout,
+ cct->_conf->osd_snap_trim_thread_suicide_timeout,
+ &disk_tp),
+ scrub_wq(
+ this,
+ cct->_conf->osd_scrub_thread_timeout,
+ cct->_conf->osd_scrub_thread_suicide_timeout,
+ &disk_tp),
+ scrub_finalize_wq(
+ cct->_conf->osd_scrub_finalize_thread_timeout,
+ cct->_conf->osd_scrub_finalize_thread_suicide_timeout,
+ &op_tp),
+ rep_scrub_wq(
+ this,
+ cct->_conf->osd_scrub_thread_timeout,
+ cct->_conf->osd_scrub_thread_suicide_timeout,
+ &disk_tp),
+ remove_wq(
+ store,
+ cct->_conf->osd_remove_thread_timeout,
+ cct->_conf->osd_remove_thread_suicide_timeout,
+ &disk_tp),
next_removal_seq(0),
service(this)
{
@@ -3771,7 +3806,9 @@ void OSD::_send_boot()
dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
}
- MOSDBoot *mboot = new MOSDBoot(superblock, boot_epoch, hb_back_addr, hb_front_addr, cluster_addr);
+ MOSDBoot *mboot = new MOSDBoot(superblock, boot_epoch,
+ hb_back_addr, hb_front_addr, cluster_addr,
+ CEPH_FEATURES_ALL);
dout(10) << " client_addr " << client_messenger->get_myaddr()
<< ", cluster_addr " << cluster_addr
<< ", hb_back_addr " << hb_back_addr
@@ -3949,8 +3986,8 @@ void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epo
return;
}
const entity_inst_t& peer_inst = next_osdmap->get_cluster_inst(peer);
- Connection *peer_con = osd->cluster_messenger->get_connection(peer_inst).get();
- osd->_share_map_outgoing(peer, peer_con, next_osdmap);
+ ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
+ osd->_share_map_outgoing(peer, peer_con.get(), next_osdmap);
osd->cluster_messenger->send_message(m, peer_inst);
}
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index daa18ca..d06f0e7 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -1147,9 +1147,9 @@ private:
map<PG*, list<OpRequestRef> > pg_for_processing;
OSD *osd;
PrioritizedQueue<pair<PGRef, OpRequestRef>, entity_inst_t > pqueue;
- OpWQ(OSD *o, time_t ti, ThreadPool *tp)
+ OpWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
: ThreadPool::WorkQueueVal<pair<PGRef, OpRequestRef>, PGRef >(
- "OSD::OpWQ", ti, ti*10, tp),
+ "OSD::OpWQ", ti, si, tp),
qlock("OpWQ::qlock"),
osd(o),
pqueue(o->cct->_conf->osd_op_pq_max_tokens_per_priority,
@@ -1211,9 +1211,9 @@ private:
list<PG*> peering_queue;
OSD *osd;
set<PG*> in_use;
- PeeringWQ(OSD *o, time_t ti, ThreadPool *tp)
+ PeeringWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
: ThreadPool::BatchWorkQueue<PG>(
- "OSD::PeeringWQ", ti, ti*10, tp), osd(o) {}
+ "OSD::PeeringWQ", ti, si, tp), osd(o) {}
void _dequeue(PG *pg) {
for (list<PG*>::iterator i = peering_queue.begin();
@@ -1599,8 +1599,8 @@ protected:
list<Command*> command_queue;
struct CommandWQ : public ThreadPool::WorkQueue<Command> {
OSD *osd;
- CommandWQ(OSD *o, time_t ti, ThreadPool *tp)
- : ThreadPool::WorkQueue<Command>("OSD::CommandWQ", ti, 0, tp), osd(o) {}
+ CommandWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
+ : ThreadPool::WorkQueue<Command>("OSD::CommandWQ", ti, si, tp), osd(o) {}
bool _empty() {
return osd->command_queue.empty();
@@ -1653,8 +1653,8 @@ protected:
struct RecoveryWQ : public ThreadPool::WorkQueue<PG> {
OSD *osd;
- RecoveryWQ(OSD *o, time_t ti, ThreadPool *tp)
- : ThreadPool::WorkQueue<PG>("OSD::RecoveryWQ", ti, ti*10, tp), osd(o) {}
+ RecoveryWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
+ : ThreadPool::WorkQueue<PG>("OSD::RecoveryWQ", ti, si, tp), osd(o) {}
bool _empty() {
return osd->recovery_queue.empty();
@@ -1711,8 +1711,8 @@ protected:
struct SnapTrimWQ : public ThreadPool::WorkQueue<PG> {
OSD *osd;
- SnapTrimWQ(OSD *o, time_t ti, ThreadPool *tp)
- : ThreadPool::WorkQueue<PG>("OSD::SnapTrimWQ", ti, 0, tp), osd(o) {}
+ SnapTrimWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
+ : ThreadPool::WorkQueue<PG>("OSD::SnapTrimWQ", ti, si, tp), osd(o) {}
bool _empty() {
return osd->snap_trim_queue.empty();
@@ -1756,8 +1756,8 @@ protected:
struct ScrubWQ : public ThreadPool::WorkQueue<PG> {
OSD *osd;
- ScrubWQ(OSD *o, time_t ti, ThreadPool *tp)
- : ThreadPool::WorkQueue<PG>("OSD::ScrubWQ", ti, 0, tp), osd(o) {}
+ ScrubWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
+ : ThreadPool::WorkQueue<PG>("OSD::ScrubWQ", ti, si, tp), osd(o) {}
bool _empty() {
return osd->scrub_queue.empty();
@@ -1802,8 +1802,8 @@ protected:
xlist<PG*> scrub_finalize_queue;
public:
- ScrubFinalizeWQ(time_t ti, ThreadPool *tp)
- : ThreadPool::WorkQueue<PG>("OSD::ScrubFinalizeWQ", ti, ti*10, tp) {}
+ ScrubFinalizeWQ(time_t ti, time_t si, ThreadPool *tp)
+ : ThreadPool::WorkQueue<PG>("OSD::ScrubFinalizeWQ", ti, si, tp) {}
bool _empty() {
return scrub_finalize_queue.empty();
@@ -1847,8 +1847,8 @@ protected:
list<MOSDRepScrub*> rep_scrub_queue;
public:
- RepScrubWQ(OSD *o, time_t ti, ThreadPool *tp)
- : ThreadPool::WorkQueue<MOSDRepScrub>("OSD::RepScrubWQ", ti, 0, tp), osd(o) {}
+ RepScrubWQ(OSD *o, time_t ti, time_t si, ThreadPool *tp)
+ : ThreadPool::WorkQueue<MOSDRepScrub>("OSD::RepScrubWQ", ti, si, tp), osd(o) {}
bool _empty() {
return rep_scrub_queue.empty();
@@ -1901,9 +1901,9 @@ protected:
public ThreadPool::WorkQueueVal<pair<PGRef, DeletingStateRef> > {
ObjectStore *&store;
list<pair<PGRef, DeletingStateRef> > remove_queue;
- RemoveWQ(ObjectStore *&o, time_t ti, ThreadPool *tp)
+ RemoveWQ(ObjectStore *&o, time_t ti, time_t si, ThreadPool *tp)
: ThreadPool::WorkQueueVal<pair<PGRef, DeletingStateRef> >(
- "OSD::RemoveWQ", ti, 0, tp),
+ "OSD::RemoveWQ", ti, si, tp),
store(o) {}
bool _empty() {
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 63dc1da..810c530 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -1183,13 +1183,6 @@ int OSDMap::apply_incremental(const Incremental &inc)
if (inc.new_pool_max != -1)
pool_max = inc.new_pool_max;
- for (set<int64_t>::const_iterator p = inc.old_pools.begin();
- p != inc.old_pools.end();
- ++p) {
- pools.erase(*p);
- name_pool.erase(pool_name[*p]);
- pool_name.erase(*p);
- }
for (map<int64_t,pg_pool_t>::const_iterator p = inc.new_pools.begin();
p != inc.new_pools.end();
++p) {
@@ -1204,6 +1197,13 @@ int OSDMap::apply_incremental(const Incremental &inc)
pool_name[p->first] = p->second;
name_pool[p->second] = p->first;
}
+ for (set<int64_t>::const_iterator p = inc.old_pools.begin();
+ p != inc.old_pools.end();
+ ++p) {
+ pools.erase(*p);
+ name_pool.erase(pool_name[*p]);
+ pool_name.erase(*p);
+ }
for (map<int32_t,uint32_t>::const_iterator i = inc.new_weight.begin();
i != inc.new_weight.end();
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index b9e9b1c..4b5ab6c 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1874,7 +1874,7 @@ void PG::mark_clean()
{
// only mark CLEAN if we have the desired number of replicas AND we
// are not remapped.
- if (acting.size() == get_osdmap()->get_pg_size(info.pgid.pgid) &&
+ if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid) &&
up == acting)
state_set(PG_STATE_CLEAN);
@@ -5323,12 +5323,12 @@ void PG::handle_advance_map(
<< dendl;
update_osdmap_ref(osdmap);
pool.update(osdmap);
- if (pool.info.last_change == osdmap_ref->get_epoch())
- on_pool_change();
AdvMap evt(
osdmap, lastmap, newup, up_primary,
newacting, acting_primary);
recovery_state.handle_event(evt, rctx);
+ if (pool.info.last_change == osdmap_ref->get_epoch())
+ on_pool_change();
}
void PG::handle_activate_map(RecoveryCtx *rctx)
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
index f081055..71526a5 100644
--- a/src/osd/PGLog.cc
+++ b/src/osd/PGLog.cc
@@ -185,6 +185,18 @@ void PGLog::proc_replica_log(
dout(10) << "proc_replica_log for osd." << from << ": "
<< oinfo << " " << olog << " " << omissing << dendl;
+ if (olog.head < log.tail) {
+ dout(10) << __func__ << ": osd." << from << " does not overlap, not looking "
+ << "for divergent objects" << dendl;
+ return;
+ }
+ if (olog.head == log.head) {
+ dout(10) << __func__ << ": osd." << from << " same log head, not looking "
+ << "for divergent objects" << dendl;
+ return;
+ }
+ assert(olog.head >= log.tail);
+
/*
basically what we're doing here is rewinding the remote log,
dropping divergent entries, until we find something that matches
@@ -202,48 +214,54 @@ void PGLog::proc_replica_log(
<< " have " << i->second.have << dendl;
}
- list<pg_log_entry_t>::const_iterator fromiter = log.log.end();
- eversion_t lower_bound = log.tail;
+ list<pg_log_entry_t>::const_reverse_iterator first_non_divergent =
+ log.log.rbegin();
while (1) {
- if (fromiter == log.log.begin())
+ if (first_non_divergent == log.log.rend())
break;
- --fromiter;
- if (fromiter->version <= olog.head) {
- dout(20) << "merge_log cut point (usually last shared) is "
- << *fromiter << dendl;
- lower_bound = fromiter->version;
- ++fromiter;
+ if (first_non_divergent->version <= olog.head) {
+ dout(20) << "merge_log point (usually last shared) is "
+ << *first_non_divergent << dendl;
break;
}
+ ++first_non_divergent;
}
+ /* Because olog.head >= log.tail, we know that both pgs must at least have
+ * the event represented by log.tail. Thus, lower_bound >= log.tail. It's
+ * possible that olog/log contain no actual events between olog.head and
+ * log.tail, however, since they might have been split out. Thus, if
+ * we cannot find an event e such that log.tail <= e.version <= log.head,
+ * the last_update must actually be log.tail.
+ */
+ eversion_t lu =
+ (first_non_divergent == log.log.rend() ||
+ first_non_divergent->version < log.tail) ?
+ log.tail :
+ first_non_divergent->version;
+
list<pg_log_entry_t> divergent;
list<pg_log_entry_t>::const_iterator pp = olog.log.end();
- eversion_t lu(oinfo.last_update);
while (true) {
- if (pp == olog.log.begin()) {
- if (pp != olog.log.end()) // no last_update adjustment if we discard nothing!
- lu = olog.tail;
+ if (pp == olog.log.begin())
break;
- }
+
--pp;
const pg_log_entry_t& oe = *pp;
// don't continue past the tail of our log.
if (oe.version <= log.tail) {
- lu = oe.version;
++pp;
break;
}
- if (oe.version <= lower_bound) {
- lu = oe.version;
+ if (oe.version <= lu) {
++pp;
break;
}
divergent.push_front(oe);
- }
+ }
IndexedLog folog;
@@ -560,6 +578,7 @@ void PGLog::merge_log(ObjectStore::Transaction& t,
dout(10) << "merge_log extending tail to " << olog.tail << dendl;
list<pg_log_entry_t>::iterator from = olog.log.begin();
list<pg_log_entry_t>::iterator to;
+ eversion_t last;
for (to = from;
to != olog.log.end();
++to) {
@@ -567,12 +586,10 @@ void PGLog::merge_log(ObjectStore::Transaction& t,
break;
log.index(*to);
dout(15) << *to << dendl;
+ last = to->version;
}
-
- if (to == olog.log.end())
- mark_dirty_to(oinfo.last_update);
- else
- mark_dirty_to(to->version);
+ mark_dirty_to(last);
+
// splice into our log.
log.log.splice(log.log.begin(),
olog.log, from, to);
@@ -794,7 +811,7 @@ void PGLog::_write_log(
map<string,bufferlist> keys;
for (list<pg_log_entry_t>::iterator p = log.log.begin();
- p != log.log.end() && p->version < dirty_to;
+ p != log.log.end() && p->version <= dirty_to;
++p) {
bufferlist bl(sizeof(*p) * 2);
p->encode_with_checksum(bl);
diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h
index c1563f8..508b295 100644
--- a/src/osd/PGLog.h
+++ b/src/osd/PGLog.h
@@ -212,9 +212,9 @@ protected:
/// Log is clean on [dirty_to, dirty_from)
bool touched_log;
- eversion_t dirty_to; ///< must clear/writeout all keys up to dirty_to
- eversion_t dirty_from; ///< must clear/writeout all keys past dirty_from
- eversion_t writeout_from; ///< must writout keys past writeout_from
+ eversion_t dirty_to; ///< must clear/writeout all keys <= dirty_to
+ eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from
+ eversion_t writeout_from; ///< must writout keys >= writeout_from
set<eversion_t> trimmed; ///< must clear keys in trimmed
bool dirty_divergent_priors;
CephContext *cct;
@@ -397,6 +397,19 @@ public:
missing.split_into(child_pgid, split_bits, &(opg_log->missing));
opg_log->mark_dirty_to(eversion_t::max());
mark_dirty_to(eversion_t::max());
+
+ unsigned mask = ~((~0)<<split_bits);
+ for (map<eversion_t, hobject_t>::iterator i = divergent_priors.begin();
+ i != divergent_priors.end();
+ ) {
+ if ((i->second.hash & mask) == child_pgid.m_seed) {
+ opg_log->add_divergent_prior(i->first, i->second);
+ divergent_priors.erase(i++);
+ dirty_divergent_priors = true;
+ } else {
+ ++i;
+ }
+ }
}
void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) {
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index c8fb01e..750638a 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -1353,7 +1353,10 @@ void ReplicatedPG::do_op(OpRequestRef op)
}
}
+ bool in_hit_set = false;
if (hit_set) {
+ if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
+ in_hit_set = true;
hit_set->insert(oid);
if (hit_set->is_full() ||
hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
@@ -1366,7 +1369,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
}
if ((m->get_flags() & CEPH_OSD_FLAG_IGNORE_CACHE) == 0 &&
- maybe_handle_cache(op, write_ordered, obc, r, missing_oid, false))
+ maybe_handle_cache(op, write_ordered, obc, r, missing_oid, false, in_hit_set))
return;
if (r) {
@@ -1535,6 +1538,10 @@ void ReplicatedPG::do_op(OpRequestRef op)
return;
}
+ if (m->get_flags() & CEPH_OSD_FLAG_IGNORE_CACHE) {
+ ctx->ignore_cache = true;
+ }
+
if ((op->may_read()) && (obc->obs.oi.is_lost())) {
// This object is lost. Reading from it returns an error.
dout(20) << __func__ << ": object " << obc->obs.oi.soid
@@ -1561,7 +1568,8 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op,
bool write_ordered,
ObjectContextRef obc,
int r, const hobject_t& missing_oid,
- bool must_promote)
+ bool must_promote,
+ bool in_hit_set)
{
if (obc)
dout(25) << __func__ << " " << obc->obs.oi << " "
@@ -1606,7 +1614,43 @@ bool ReplicatedPG::maybe_handle_cache(OpRequestRef op,
if (!must_promote && can_skip_promote(op, obc)) {
return false;
}
- promote_object(op, obc, missing_oid);
+ if (op->may_write() || write_ordered || must_promote || !hit_set) {
+ promote_object(op, obc, missing_oid);
+ } else {
+ switch (pool.info.min_read_recency_for_promote) {
+ case 0:
+ promote_object(op, obc, missing_oid);
+ break;
+ case 1:
+ // Check if in the current hit set
+ if (in_hit_set) {
+ promote_object(op, obc, missing_oid);
+ } else {
+ do_cache_redirect(op, obc);
+ }
+ break;
+ default:
+ if (in_hit_set) {
+ promote_object(op, obc, missing_oid);
+ } else {
+ // Check if in other hit sets
+ map<time_t,HitSetRef>::iterator itor;
+ bool in_other_hit_sets = false;
+ for (itor = agent_state->hit_set_map.begin(); itor != agent_state->hit_set_map.end(); itor++) {
+ if (itor->second->contains(missing_oid)) {
+ in_other_hit_sets = true;
+ break;
+ }
+ }
+ if (in_other_hit_sets) {
+ promote_object(op, obc, missing_oid);
+ } else {
+ do_cache_redirect(op, obc);
+ }
+ }
+ break;
+ }
+ }
return true;
case pg_pool_t::CACHEMODE_FORWARD:
@@ -3701,6 +3745,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
if (!obs.exists) {
+ if (pool.info.require_rollback() && op.extent.offset) {
+ result = -EOPNOTSUPP;
+ break;
+ }
ctx->mod_desc.create();
} else if (op.extent.offset == oi.size) {
ctx->mod_desc.append(oi.size);
@@ -3948,7 +3996,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
// Cannot delete an object with watchers
result = -EBUSY;
} else {
- result = _delete_oid(ctx, false);
+ result = _delete_oid(ctx, ctx->ignore_cache);
}
break;
@@ -5065,7 +5113,10 @@ void ReplicatedPG::do_osd_op_effects(OpContext *ctx)
for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
p != ctx->notify_acks.end();
++p) {
- dout(10) << "notify_ack " << make_pair(p->watch_cookie, p->notify_id) << dendl;
+ if (p->watch_cookie)
+ dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
+ else
+ dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
ctx->obc->watchers.begin();
i != ctx->obc->watchers.end();
@@ -5147,6 +5198,7 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
dout(20) << __func__ << " " << soid << " " << ctx
<< " op " << pg_log_entry_t::get_op_name(log_op_type)
<< dendl;
+ utime_t now = ceph_clock_now(cct);
// snapset
bufferlist bss;
@@ -5208,6 +5260,7 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
ctx->snapset_obc->obs.oi.version = ctx->at_version;
ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
+ ctx->snapset_obc->obs.oi.local_mtime = now;
bufferlist bv(sizeof(ctx->new_obs.oi));
::encode(ctx->snapset_obc->obs.oi, bv);
@@ -5248,6 +5301,7 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
if (ctx->mtime != utime_t()) {
ctx->new_obs.oi.mtime = ctx->mtime;
dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
+ ctx->new_obs.oi.local_mtime = now;
} else {
dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
}
@@ -6040,6 +6094,10 @@ void ReplicatedPG::finish_promote(int r, OpRequestRef op,
simple_repop_submit(repop);
osd->logger->inc(l_osd_tier_promote);
+
+ assert(agent_state);
+ if (agent_state->is_idle())
+ agent_choose_mode();
}
void ReplicatedPG::cancel_copy(CopyOpRef cop, bool requeue)
@@ -10952,8 +11010,10 @@ void ReplicatedPG::hit_set_persist()
info.hit_set.current_info.end = now;
dout(20) << __func__ << " archive " << oid << dendl;
- if (agent_state)
+ if (agent_state) {
agent_state->add_hit_set(info.hit_set.current_info.begin, hit_set);
+ hit_set_in_memory_trim();
+ }
// hold a ref until it is flushed to disk
hit_set_flushing[info.hit_set.current_info.begin] = hit_set;
@@ -11089,8 +11149,6 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
repop->ctx->op_t->remove(oid);
repop->ctx->log.back().mod_desc.mark_unrollbackable();
}
- if (agent_state)
- agent_state->remove_oldest_hit_set();
updated_hit_set_hist.history.pop_front();
ObjectContextRef obc = get_object_context(oid, false);
@@ -11101,6 +11159,19 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
}
}
+void ReplicatedPG::hit_set_in_memory_trim()
+{
+ unsigned max = pool.info.hit_set_count;
+ unsigned max_in_memory = pool.info.min_read_recency_for_promote > 0 ? pool.info.min_read_recency_for_promote - 1 : 0;
+
+ if (max_in_memory > max) {
+ max_in_memory = max;
+ }
+ while (agent_state->hit_set_map.size() > max_in_memory) {
+ agent_state->remove_oldest_hit_set();
+ }
+}
+
// =======================================
// cache agent
@@ -11293,6 +11364,9 @@ bool ReplicatedPG::agent_work(int start_max)
else
agent_state->position = next;
+ // Discard old in memory HitSets
+ hit_set_in_memory_trim();
+
if (need_delay) {
assert(agent_state->delaying == false);
agent_delay();
@@ -11307,7 +11381,6 @@ bool ReplicatedPG::agent_work(int start_max)
void ReplicatedPG::agent_load_hit_sets()
{
if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
- agent_state->discard_hit_sets();
return;
}
@@ -11378,7 +11451,14 @@ bool ReplicatedPG::agent_maybe_flush(ObjectContextRef& obc)
}
utime_t now = ceph_clock_now(NULL);
- if (obc->obs.oi.mtime + utime_t(pool.info.cache_min_flush_age, 0) > now) {
+ utime_t ob_local_mtime;
+ if (obc->obs.oi.local_mtime != utime_t()) {
+ ob_local_mtime = obc->obs.oi.local_mtime;
+ } else {
+ ob_local_mtime = obc->obs.oi.mtime;
+ }
+ bool evict_mode_full = (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
+ if (!evict_mode_full && (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
osd->logger->inc(l_osd_agent_skip);
return false;
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 4b0d1d6..c6de400 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -439,6 +439,7 @@ public:
bool user_modify; // user-visible modification
bool undirty; // user explicitly un-dirtying this object
bool cache_evict; ///< true if this is a cache eviction
+ bool ignore_cache; ///< true if IGNORE_CACHE flag is set
// side effects
list<watch_info_t> watch_connects;
@@ -541,6 +542,7 @@ public:
op(_op), reqid(_reqid), ops(_ops), obs(_obs), snapset(0),
new_obs(_obs->oi, _obs->exists),
modify(false), user_modify(false), undirty(false), cache_evict(false),
+ ignore_cache(false),
bytes_written(0), bytes_read(0), user_at_version(0),
current_osd_subop_num(0),
op_t(NULL),
@@ -800,6 +802,7 @@ protected:
void hit_set_persist(); ///< persist hit info
bool hit_set_apply_log(); ///< apply log entries to update in-memory HitSet
void hit_set_trim(RepGather *repop, unsigned max); ///< discard old HitSets
+ void hit_set_in_memory_trim(); ///< discard old in memory HitSets
hobject_t get_hit_set_current_object(utime_t stamp);
hobject_t get_hit_set_archive_object(utime_t start, utime_t end);
@@ -1054,7 +1057,8 @@ protected:
bool write_ordered,
ObjectContextRef obc, int r,
const hobject_t& missing_oid,
- bool must_promote);
+ bool must_promote,
+ bool in_hit_set = false);
/**
* This helper function tells the client to redirect their request elsewhere.
*/
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index d08e9b7..5f4d660 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -807,6 +807,7 @@ void pg_pool_t::dump(Formatter *f) const
f->close_section(); // hit_set_params
f->dump_unsigned("hit_set_period", hit_set_period);
f->dump_unsigned("hit_set_count", hit_set_count);
+ f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
f->dump_unsigned("stripe_width", get_stripe_width());
}
@@ -1058,8 +1059,56 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
return;
}
- __u8 encode_compat = 5;
- ENCODE_START(15, encode_compat, bl);
+ if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
+ // we simply added last_force_op_resend here, which is a fully
+ // backward compatible change. however, encoding the same map
+ // differently between monitors triggers scrub noise (even though
+ // they are decodable without the feature), so let's be pedantic
+ // about it.
+ ENCODE_START(14, 5, bl);
+ ::encode(type, bl);
+ ::encode(size, bl);
+ ::encode(crush_ruleset, bl);
+ ::encode(object_hash, bl);
+ ::encode(pg_num, bl);
+ ::encode(pgp_num, bl);
+ __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
+ ::encode(lpg_num, bl);
+ ::encode(lpgp_num, bl);
+ ::encode(last_change, bl);
+ ::encode(snap_seq, bl);
+ ::encode(snap_epoch, bl);
+ ::encode(snaps, bl, features);
+ ::encode(removed_snaps, bl);
+ ::encode(auid, bl);
+ ::encode(flags, bl);
+ ::encode(crash_replay_interval, bl);
+ ::encode(min_size, bl);
+ ::encode(quota_max_bytes, bl);
+ ::encode(quota_max_objects, bl);
+ ::encode(tiers, bl);
+ ::encode(tier_of, bl);
+ __u8 c = cache_mode;
+ ::encode(c, bl);
+ ::encode(read_tier, bl);
+ ::encode(write_tier, bl);
+ ::encode(properties, bl);
+ ::encode(hit_set_params, bl);
+ ::encode(hit_set_period, bl);
+ ::encode(hit_set_count, bl);
+ ::encode(stripe_width, bl);
+ ::encode(target_max_bytes, bl);
+ ::encode(target_max_objects, bl);
+ ::encode(cache_target_dirty_ratio_micro, bl);
+ ::encode(cache_target_full_ratio_micro, bl);
+ ::encode(cache_min_flush_age, bl);
+ ::encode(cache_min_evict_age, bl);
+ ::encode(erasure_code_profile, bl);
+ ENCODE_FINISH(bl);
+ return;
+ }
+
+ ENCODE_START(16, 5, bl);
::encode(type, bl);
::encode(size, bl);
::encode(crush_ruleset, bl);
@@ -1099,12 +1148,13 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
::encode(cache_min_evict_age, bl);
::encode(erasure_code_profile, bl);
::encode(last_force_op_resend, bl);
+ ::encode(min_read_recency_for_promote, bl);
ENCODE_FINISH(bl);
}
void pg_pool_t::decode(bufferlist::iterator& bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(15, 5, 5, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(16, 5, 5, bl);
::decode(type, bl);
::decode(size, bl);
::decode(crush_ruleset, bl);
@@ -1206,6 +1256,12 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
} else {
last_force_op_resend = 0;
}
+ if (struct_v >= 16) {
+ ::decode(min_read_recency_for_promote, bl);
+ } else {
+ pg_pool_t def;
+ min_read_recency_for_promote = def.min_read_recency_for_promote;
+ }
DECODE_FINISH(bl);
calc_pg_masks();
}
@@ -1251,6 +1307,7 @@ void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
a.hit_set_period = 3600;
a.hit_set_count = 8;
+ a.min_read_recency_for_promote = 1;
a.set_stripe_width(12345);
a.target_max_bytes = 1238132132;
a.target_max_objects = 1232132;
@@ -1303,6 +1360,8 @@ ostream& operator<<(ostream& out, const pg_pool_t& p)
<< " " << p.hit_set_period << "s"
<< " x" << p.hit_set_count;
}
+ if (p.min_read_recency_for_promote)
+ out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
out << " stripe_width " << p.get_stripe_width();
return out;
}
@@ -2196,6 +2255,62 @@ void pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
o.back()->maybe_went_rw = true;
}
+bool pg_interval_t::is_new_interval(
+ int old_acting_primary,
+ int new_acting_primary,
+ const vector<int> &old_acting,
+ const vector<int> &new_acting,
+ int old_up_primary,
+ int new_up_primary,
+ const vector<int> &old_up,
+ const vector<int> &new_up,
+ int old_size,
+ int new_size,
+ int old_min_size,
+ int new_min_size,
+ unsigned old_pg_num,
+ unsigned new_pg_num,
+ pg_t pgid) {
+ return old_acting_primary != new_acting_primary ||
+ new_acting != old_acting ||
+ old_up_primary != new_up_primary ||
+ new_up != old_up ||
+ old_min_size != new_min_size ||
+ old_size != new_size ||
+ pgid.is_split(old_pg_num, new_pg_num, 0);
+}
+
+bool pg_interval_t::is_new_interval(
+ int old_acting_primary,
+ int new_acting_primary,
+ const vector<int> &old_acting,
+ const vector<int> &new_acting,
+ int old_up_primary,
+ int new_up_primary,
+ const vector<int> &old_up,
+ const vector<int> &new_up,
+ OSDMapRef osdmap,
+ OSDMapRef lastmap,
+ int64_t pool_id,
+ pg_t pgid) {
+ return !(lastmap->get_pools().count(pgid.pool())) ||
+ is_new_interval(old_acting_primary,
+ new_acting_primary,
+ old_acting,
+ new_acting,
+ old_up_primary,
+ new_up_primary,
+ old_up,
+ new_up,
+ lastmap->get_pools().find(pgid.pool())->second.size,
+ osdmap->get_pools().find(pgid.pool())->second.size,
+ lastmap->get_pools().find(pgid.pool())->second.min_size,
+ osdmap->get_pools().find(pgid.pool())->second.min_size,
+ lastmap->get_pg_num(pgid.pool()),
+ osdmap->get_pg_num(pgid.pool()),
+ pgid);
+}
+
bool pg_interval_t::check_new_interval(
int old_acting_primary,
int new_acting_primary,
@@ -2218,15 +2333,19 @@ bool pg_interval_t::check_new_interval(
// NOTE: a change in the up set primary triggers an interval
// change, even though the interval members in the pg_interval_t
// do not change.
- if (old_acting_primary != new_acting_primary ||
- new_acting != old_acting ||
- old_up_primary != new_up_primary ||
- new_up != old_up ||
- (!(lastmap->get_pools().count(pool_id))) ||
- (lastmap->get_pools().find(pool_id)->second.min_size !=
- osdmap->get_pools().find(pool_id)->second.min_size) ||
- pgid.is_split(lastmap->get_pg_num(pgid.pool()),
- osdmap->get_pg_num(pgid.pool()), 0)) {
+ if (is_new_interval(
+ old_acting_primary,
+ new_acting_primary,
+ old_acting,
+ new_acting,
+ old_up_primary,
+ new_up_primary,
+ old_up,
+ new_up,
+ osdmap,
+ lastmap,
+ pool_id,
+ pgid)) {
pg_interval_t& i = (*past_intervals)[same_interval_since];
i.first = same_interval_since;
i.last = osdmap->get_epoch() - 1;
@@ -3613,6 +3732,7 @@ void object_info_t::copy_user_bits(const object_info_t& other)
// these bits are copied from head->clone.
size = other.size;
mtime = other.mtime;
+ local_mtime = other.local_mtime;
last_reqid = other.last_reqid;
truncate_seq = other.truncate_seq;
truncate_size = other.truncate_size;
@@ -3644,7 +3764,7 @@ void object_info_t::encode(bufferlist& bl) const
++i) {
old_watchers.insert(make_pair(i->first.second, i->second));
}
- ENCODE_START(13, 8, bl);
+ ENCODE_START(14, 8, bl);
::encode(soid, bl);
::encode(myoloc, bl); //Retained for compatibility
::encode(category, bl);
@@ -3669,6 +3789,7 @@ void object_info_t::encode(bufferlist& bl) const
::encode(watchers, bl);
__u32 _flags = flags;
::encode(_flags, bl);
+ ::encode(local_mtime, bl);
ENCODE_FINISH(bl);
}
@@ -3747,6 +3868,11 @@ void object_info_t::decode(bufferlist::iterator& bl)
::decode(_flags, bl);
flags = (flag_t)_flags;
}
+ if (struct_v >= 14) {
+ ::decode(local_mtime, bl);
+ } else {
+ local_mtime = utime_t();
+ }
DECODE_FINISH(bl);
}
@@ -3762,6 +3888,7 @@ void object_info_t::dump(Formatter *f) const
f->dump_unsigned("user_version", user_version);
f->dump_unsigned("size", size);
f->dump_stream("mtime") << mtime;
+ f->dump_stream("local_mtime") << local_mtime;
f->dump_unsigned("lost", (int)is_lost());
f->dump_unsigned("flags", (int)flags);
f->dump_stream("wrlock_by") << wrlock_by;
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index a296df0..5653039 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -973,6 +973,7 @@ public:
HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
uint32_t hit_set_period; ///< periodicity of HitSet segments (seconds)
uint32_t hit_set_count; ///< number of periods to retain
+ uint32_t min_read_recency_for_promote; ///< minimum number of HitSets to check before promote
uint32_t stripe_width; ///< erasure coded stripe size in bytes
@@ -997,6 +998,7 @@ public:
hit_set_params(),
hit_set_period(0),
hit_set_count(0),
+ min_read_recency_for_promote(0),
stripe_width(0)
{ }
@@ -1753,12 +1755,51 @@ struct pg_interval_t {
static void generate_test_instances(list<pg_interval_t*>& o);
/**
+ * Determines whether there is an interval change
+ */
+ static bool is_new_interval(
+ int old_acting_primary,
+ int new_acting_primary,
+ const vector<int> &old_acting,
+ const vector<int> &new_acting,
+ int old_up_primary,
+ int new_up_primary,
+ const vector<int> &old_up,
+ const vector<int> &new_up,
+ int old_size,
+ int new_size,
+ int old_min_size,
+ int new_min_size,
+ unsigned old_pg_num,
+ unsigned new_pg_num,
+ pg_t pgid
+ );
+
+ /**
+ * Determines whether there is an interval change
+ */
+ static bool is_new_interval(
+ int old_acting_primary, ///< [in] primary as of lastmap
+ int new_acting_primary, ///< [in] primary as of osdmap
+ const vector<int> &old_acting, ///< [in] acting as of lastmap
+ const vector<int> &new_acting, ///< [in] acting as of osdmap
+ int old_up_primary, ///< [in] up primary of lastmap
+ int new_up_primary, ///< [in] up primary of osdmap
+ const vector<int> &old_up, ///< [in] up as of lastmap
+ const vector<int> &new_up, ///< [in] up as of osdmap
+ ceph::shared_ptr<const OSDMap> osdmap, ///< [in] current map
+ ceph::shared_ptr<const OSDMap> lastmap, ///< [in] last map
+ int64_t poolid, ///< [in] pool for pg
+ pg_t pgid ///< [in] pgid for pg
+ );
+
+ /**
* Integrates a new map into *past_intervals, returns true
* if an interval was closed out.
*/
static bool check_new_interval(
int old_acting_primary, ///< [in] primary as of lastmap
- int new_acting_primary, ///< [in] primary as of lastmap
+ int new_acting_primary, ///< [in] primary as of osdmap
const vector<int> &old_acting, ///< [in] acting as of lastmap
const vector<int> &new_acting, ///< [in] acting as of osdmap
int old_up_primary, ///< [in] up primary of lastmap
@@ -2621,6 +2662,7 @@ struct object_info_t {
uint64_t size;
utime_t mtime;
+ utime_t local_mtime; // local mtime
// note: these are currently encoded into a total 16 bits; see
// encode()/decode() for the weirdness.
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index 649c61c..9649e27 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -1220,7 +1220,7 @@ public:
}
};
-ceph_tid_t Objecter::op_submit(Op *op)
+ceph_tid_t Objecter::op_submit(Op *op, int *ctx_budget)
{
assert(client_lock.is_locked());
assert(initialized);
@@ -1236,7 +1236,14 @@ ceph_tid_t Objecter::op_submit(Op *op)
// throttle. before we look at any state, because
// take_op_budget() may drop our lock while it blocks.
- take_op_budget(op);
+ if (!op->ctx_budgeted || (ctx_budget && (*ctx_budget == -1))) {
+ int op_budget = take_op_budget(op);
+ // take and pass out the budget for the first OP
+ // in the context session
+ if (ctx_budget && (*ctx_budget == -1)) {
+ *ctx_budget = op_budget;
+ }
+ }
return _op_submit(op);
}
@@ -1439,7 +1446,7 @@ int64_t Objecter::get_object_pg_hash_position(int64_t pool, const string& key,
return p->raw_hash_to_pg(p->hash_key(key, ns));
}
-int Objecter::calc_target(op_target_t *t, bool any_change)
+int Objecter::calc_target(op_target_t *t, epoch_t *last_force_resend, bool any_change)
{
bool is_read = t->flags & CEPH_OSD_FLAG_READ;
bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
@@ -1447,9 +1454,15 @@ int Objecter::calc_target(op_target_t *t, bool any_change)
const pg_pool_t *pi = osdmap->get_pg_pool(t->base_oloc.pool);
bool force_resend = false;
bool need_check_tiering = false;
+
if (pi && osdmap->get_epoch() == pi->last_force_op_resend) {
- force_resend = true;
+ if (last_force_resend && *last_force_resend < pi->last_force_op_resend) {
+ *last_force_resend = pi->last_force_op_resend;
+ force_resend = true;
+ } else if (last_force_resend == 0)
+ force_resend = true;
}
+
if (t->target_oid.name.empty() || force_resend) {
t->target_oid = t->base_oid;
need_check_tiering = true;
@@ -1483,9 +1496,33 @@ int Objecter::calc_target(op_target_t *t, bool any_change)
if (ret == -ENOENT)
return RECALC_OP_TARGET_POOL_DNE;
}
- int primary;
- vector<int> acting;
- osdmap->pg_to_acting_osds(pgid, &acting, &primary);
+
+ int size = pi->size;
+ int min_size = pi->min_size;
+ unsigned pg_num = pi->get_pg_num();
+ int up_primary, acting_primary;
+ vector<int> up, acting;
+ osdmap->pg_to_up_acting_osds(pgid, &up, &up_primary,
+ &acting, &acting_primary);
+ unsigned prev_seed = ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask);
+ if (any_change && pg_interval_t::is_new_interval(
+ t->acting_primary,
+ acting_primary,
+ t->acting,
+ acting,
+ t->up_primary,
+ up_primary,
+ t->up,
+ up,
+ t->size,
+ size,
+ t->min_size,
+ min_size,
+ t->pg_num,
+ pg_num,
+ pg_t(prev_seed, pgid.pool(), pgid.preferred()))) {
+ force_resend = true;
+ }
bool need_resend = false;
@@ -1497,15 +1534,22 @@ int Objecter::calc_target(op_target_t *t, bool any_change)
if (t->pgid != pgid ||
is_pg_changed(
- t->primary, t->acting, primary, acting, t->used_replica || any_change) ||
+ t->acting_primary, t->acting, acting_primary, acting,
+ t->used_replica || any_change) ||
force_resend) {
t->pgid = pgid;
t->acting = acting;
- t->primary = primary;
- ldout(cct, 10) << __func__ << " pgid " << pgid
- << " acting " << acting << dendl;
+ t->acting_primary = acting_primary;
+ t->up_primary = up_primary;
+ t->up = up;
+ t->size = size;
+ t->min_size = min_size;
+ t->pg_num = pg_num;
+ t->pg_num_mask = pi->get_pg_num_mask();
+ ldout(cct, 10) << __func__ << " "
+ << " pgid " << pgid << " acting " << acting << dendl;
t->used_replica = false;
- if (primary == -1) {
+ if (acting_primary == -1) {
t->osd = -1;
} else {
int osd;
@@ -1541,7 +1585,7 @@ int Objecter::calc_target(op_target_t *t, bool any_change)
assert(best >= 0);
osd = acting[best];
} else {
- osd = primary;
+ osd = acting_primary;
}
t->osd = osd;
}
@@ -1555,7 +1599,7 @@ int Objecter::calc_target(op_target_t *t, bool any_change)
int Objecter::recalc_op_target(Op *op)
{
- int r = calc_target(&op->target);
+ int r = calc_target(&op->target, &op->last_force_resend);
if (r == RECALC_OP_TARGET_NEED_RESEND) {
OSDSession *s = NULL;
if (op->target.osd >= 0)
@@ -1576,7 +1620,7 @@ int Objecter::recalc_op_target(Op *op)
bool Objecter::recalc_linger_op_target(LingerOp *linger_op)
{
- int r = calc_target(&linger_op->target, true);
+ int r = calc_target(&linger_op->target, &linger_op->last_force_resend, true);
if (r == RECALC_OP_TARGET_NEED_RESEND) {
ldout(cct, 10) << "recalc_linger_op_target tid " << linger_op->linger_id
<< " pgid " << linger_op->target.pgid
@@ -1610,7 +1654,7 @@ void Objecter::finish_op(Op *op)
ldout(cct, 15) << "finish_op " << op->tid << dendl;
op->session_item.remove_myself();
- if (op->budgeted)
+ if (!op->ctx_budgeted && op->budgeted)
put_op_budget(op);
ops.erase(op->tid);
@@ -1915,6 +1959,10 @@ void Objecter::list_objects(ListContext *list_context, Context *onfinish)
}
}
if (list_context->at_end_of_pool) {
+ // release the listing context's budget once all
+ // OPs (in the session) are finished
+ put_list_context_budget(list_context);
+
onfinish->complete(0);
return;
}
@@ -1943,7 +1991,7 @@ void Objecter::list_objects(ListContext *list_context, Context *onfinish)
C_List *onack = new C_List(list_context, onfinish, this);
object_locator_t oloc(list_context->pool_id, list_context->nspace);
pg_read(list_context->current_pg, oloc, op,
- &list_context->bl, 0, onack, &onack->epoch);
+ &list_context->bl, 0, onack, &onack->epoch, &list_context->ctx_budget);
}
void Objecter::_list_reply(ListContext *list_context, int r,
@@ -1989,6 +2037,9 @@ void Objecter::_list_reply(ListContext *list_context, int r,
}
if (!list_context->list.empty()) {
ldout(cct, 20) << " returning results so far" << dendl;
+ // release the listing context's budget once all
+ // OPs (in the session) are finished
+ put_list_context_budget(list_context);
final_finish->complete(0);
return;
}
@@ -1997,6 +2048,13 @@ void Objecter::_list_reply(ListContext *list_context, int r,
list_objects(list_context, final_finish);
}
+void Objecter::put_list_context_budget(ListContext *list_context) {
+ if (list_context->ctx_budget >= 0) {
+ ldout(cct, 10) << " release listing context's budget " << list_context->ctx_budget << dendl;
+ put_op_budget_bytes(list_context->ctx_budget);
+ list_context->ctx_budget = -1;
+ }
+ }
//snapshots
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 2ede888..b0739a1 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -1068,12 +1068,18 @@ public:
object_t target_oid;
object_locator_t target_oloc;
- bool precalc_pgid; ///< true if we are directed at base_pgid, not base_oid
- pg_t base_pgid; ///< explciti pg target, if any
-
- pg_t pgid; ///< last pg we mapped to
- vector<int> acting; ///< acting for last pg we mapped to
- int primary; ///< primary for last pg we mapped to
+ bool precalc_pgid; ///< true if we are directed at base_pgid, not base_oid
+ pg_t base_pgid; ///< explicit pg target, if any
+
+ pg_t pgid; ///< last pg we mapped to
+ unsigned pg_num; ///< last pg_num we mapped to
+ unsigned pg_num_mask; ///< last pg_num_mask we mapped to
+ vector<int> up; ///< set of up osds for last pg we mapped to
+ vector<int> acting; ///< set of acting osds for last pg we mapped to
+ int up_primary; ///< primary for last pg we mapped to based on the up set
+ int acting_primary; ///< primary for last pg we mapped to based on the acting set
+ int size; ///< the size of the pool when we were last mapped
+ int min_size; ///< the min size of the pool when we were last mapped
bool used_replica;
bool paused;
@@ -1085,7 +1091,12 @@ public:
base_oid(oid),
base_oloc(oloc),
precalc_pgid(false),
- primary(-1),
+ pg_num(0),
+ pg_num_mask(0),
+ up_primary(-1),
+ acting_primary(-1),
+ size(-1),
+ min_size(-1),
used_replica(false),
paused(false),
osd(-1)
@@ -1133,6 +1144,13 @@ public:
/// true if we should resend this message on failure
bool should_resend;
+ epoch_t last_force_resend;
+
+ /// true if the throttle budget is acquired/released for a series of OPs
+ /// rather than per OP; when this flag is set, the budget is acquired before
+ /// sending the very first OP of the series and released upon receiving the last OP reply.
+ bool ctx_budgeted;
+
Op(const object_t& o, const object_locator_t& ol, vector<OSDOp>& op,
int f, Context *ac, Context *co, version_t *ov) :
session(NULL), session_item(this), incarnation(0),
@@ -1146,7 +1164,9 @@ public:
objver(ov), reply_epoch(NULL),
map_dne_bound(0),
budgeted(false),
- should_resend(true) {
+ should_resend(true),
+ last_force_resend(0),
+ ctx_budgeted(false) {
ops.swap(op);
/* initialize out_* to match op vector */
@@ -1249,11 +1269,24 @@ public:
bufferlist extra_info;
+ // The budget associated with this context; once it is set (>= 0),
+ // the budget is not acquired/released on a per-OP basis; instead the
+ // budget is acquired before sending the first OP and released upon
+ // receiving the last op reply.
+ int ctx_budget;
+
ListContext() : current_pg(0), current_pg_epoch(0), starting_pg_num(0),
at_end_of_pool(false),
at_end_of_pg(false),
pool_id(0),
- pool_snap_seq(0), max_entries(0) {}
+ pool_snap_seq(0),
+ max_entries(0),
+ nspace(),
+ bl(),
+ list(),
+ filter(),
+ extra_info(),
+ ctx_budget(-1) {}
bool at_end() const {
return at_end_of_pool;
@@ -1372,6 +1405,7 @@ public:
ceph_tid_t register_tid;
epoch_t map_dne_bound;
+ epoch_t last_force_resend;
LingerOp() : linger_id(0),
target(object_t(), object_locator_t(), 0),
@@ -1381,7 +1415,8 @@ public:
on_reg_ack(NULL), on_reg_commit(NULL),
session(NULL), session_item(this),
register_tid(0),
- map_dne_bound(0) {}
+ map_dne_bound(0),
+ last_force_resend(0) {}
// no copy!
const LingerOp &operator=(const LingerOp& r);
@@ -1480,7 +1515,7 @@ public:
bool osdmap_full_flag() const;
bool target_should_be_paused(op_target_t *op);
- int calc_target(op_target_t *t, bool any_change=false);
+ int calc_target(op_target_t *t, epoch_t *last_force_resend=0, bool any_change=false);
int recalc_op_target(Op *op);
bool recalc_linger_op_target(LingerOp *op);
@@ -1517,7 +1552,7 @@ public:
*/
int calc_op_budget(Op *op);
void throttle_op(Op *op, int op_size=0);
- void take_op_budget(Op *op) {
+ int take_op_budget(Op *op) {
int op_budget = calc_op_budget(op);
if (keep_balanced_budget) {
throttle_op(op, op_budget);
@@ -1526,13 +1561,19 @@ public:
op_throttle_ops.take(1);
}
op->budgeted = true;
+ return op_budget;
+ }
+ void put_op_budget_bytes(int op_budget) {
+ assert(op_budget >= 0);
+ op_throttle_bytes.put(op_budget);
+ op_throttle_ops.put(1);
}
void put_op_budget(Op *op) {
assert(op->budgeted);
int op_budget = calc_op_budget(op);
- op_throttle_bytes.put(op_budget);
- op_throttle_ops.put(1);
+ put_op_budget_bytes(op_budget);
}
+ void put_list_context_budget(ListContext *list_context);
Throttle op_throttle_bytes, op_throttle_ops;
public:
@@ -1603,7 +1644,7 @@ private:
// public interface
public:
- ceph_tid_t op_submit(Op *op);
+ ceph_tid_t op_submit(Op *op, int *ctx_budget = NULL);
bool is_active() {
return !(ops.empty() && linger_ops.empty() && poolstat_ops.empty() && statfs_ops.empty());
}
@@ -1707,7 +1748,8 @@ public:
ObjectOperation& op,
bufferlist *pbl, int flags,
Context *onack,
- epoch_t *reply_epoch) {
+ epoch_t *reply_epoch,
+ int *ctx_budget) {
Op *o = new Op(object_t(), oloc,
op.ops, flags | global_op_flags | CEPH_OSD_FLAG_READ,
onack, NULL, NULL);
@@ -1720,7 +1762,11 @@ public:
o->out_handler.swap(op.out_handler);
o->out_rval.swap(op.out_rval);
o->reply_epoch = reply_epoch;
- return op_submit(o);
+ if (ctx_budget) {
+ // budget is tracked by listing context
+ o->ctx_budgeted = true;
+ }
+ return op_submit(o, ctx_budget);
}
ceph_tid_t linger_mutate(const object_t& oid, const object_locator_t& oloc,
ObjectOperation& op,
diff --git a/src/rgw/logrotate.conf b/src/rgw/logrotate.conf
index ec47f00..7e527e8 100644
--- a/src/rgw/logrotate.conf
+++ b/src/rgw/logrotate.conf
@@ -7,7 +7,7 @@
if which invoke-rc.d > /dev/null 2>&1 && [ -x `which invoke-rc.d` ]; then
invoke-rc.d radosgw reload >/dev/null
elif which service > /dev/null 2>&1 && [ -x `which service` ]; then
- service radosgw reload >/dev/null
+ service ceph-radosgw reload >/dev/null
fi
# Possibly reload twice, but depending on ceph.conf the reload above may be a no-op
if which initctl > /dev/null 2>&1 && [ -x `which initctl` ]; then
diff --git a/src/rgw/rgw_civetweb.cc b/src/rgw/rgw_civetweb.cc
index 7f246d1..5c15bcf 100644
--- a/src/rgw/rgw_civetweb.cc
+++ b/src/rgw/rgw_civetweb.cc
@@ -11,13 +11,18 @@ int RGWMongoose::write_data(const char *buf, int len)
{
if (!header_done) {
header_data.append(buf, len);
- return 0;
+ return len;
}
if (!sent_header) {
data.append(buf, len);
- return 0;
+ return len;
+ }
+ int r = mg_write(conn, buf, len);
+ if (r == 0) {
+ /* didn't send anything, error out */
+ return -EIO;
}
- return mg_write(conn, buf, len);
+ return r;
}
RGWMongoose::RGWMongoose(mg_connection *_conn, int _port) : conn(_conn), port(_port), header_done(false), sent_header(false), has_content_length(false),
diff --git a/src/rgw/rgw_client_io.cc b/src/rgw/rgw_client_io.cc
index 193f44e..32d99dc 100644
--- a/src/rgw/rgw_client_io.cc
+++ b/src/rgw/rgw_client_io.cc
@@ -54,7 +54,12 @@ int RGWClientIO::write(const char *buf, int len)
return ret;
if (account)
- bytes_sent += len;
+ bytes_sent += ret;
+
+ if (ret < len) {
+ /* sent less than tried to send, error out */
+ return -EIO;
+ }
return 0;
}
diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc
index 8c0b40d..1c8720c 100644
--- a/src/rgw/rgw_main.cc
+++ b/src/rgw/rgw_main.cc
@@ -371,7 +371,7 @@ void RGWLoadGenProcess::run()
int num_buckets;
conf->get_val("num_buckets", 1, &num_buckets);
- string buckets[num_buckets];
+ vector<string> buckets(num_buckets);
atomic_t failed;
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index ec64777..804917c 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -1798,6 +1798,12 @@ void RGWPostObj::execute()
goto done;
}
+ ret = store->check_quota(s->bucket_owner.get_id(), s->bucket,
+ user_quota, bucket_quota, s->content_length);
+ if (ret < 0) {
+ goto done;
+ }
+
processor = select_processor();
ret = processor->prepare(store, s->obj_ctx, NULL);
@@ -1833,6 +1839,12 @@ void RGWPostObj::execute()
s->obj_size = ofs;
+ ret = store->check_quota(s->bucket_owner.get_id(), s->bucket,
+ user_quota, bucket_quota, s->obj_size);
+ if (ret < 0) {
+ goto done;
+ }
+
hash.Final(m);
buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
@@ -2070,6 +2082,7 @@ int RGWCopyObj::verify_permission()
if (src_bucket_name.compare(dest_bucket_name) == 0) { /* will only happen if s->local_source */
dest_bucket_info = src_bucket_info;
+ dest_attrs = src_attrs;
} else {
ret = store->get_bucket_info(s->obj_ctx, dest_bucket_name, dest_bucket_info, NULL, &dest_attrs);
if (ret < 0)
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 25923e1..482b609 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -3534,6 +3534,31 @@ int RGWRados::copy_obj_data(void *ctx,
}
/**
+ * Check to see if the bucket metadata could be synced
+ * bucket: the bucket to check
+ * Returns false is the bucket is not synced
+ */
+bool RGWRados::is_syncing_bucket_meta(rgw_bucket& bucket)
+{
+ /* region is not master region */
+ if (!region.is_master) {
+ return false;
+ }
+
+ /* single region and a single zone */
+ if (region_map.regions.size() == 1 && region.zones.size() == 1) {
+ return false;
+ }
+
+ /* zone is not master */
+ if (region.master_zone.compare(zone_name) != 0) {
+ return false;
+ }
+
+ return true;
+}
+
+/**
* Delete a bucket.
* bucket: the name of the bucket to delete
* Returns 0 on success, -ERR# otherwise.
@@ -3572,6 +3597,16 @@ int RGWRados::delete_bucket(rgw_bucket& bucket, RGWObjVersionTracker& objv_track
if (r < 0)
return r;
+ /* if the bucket is not synced we can remove the meta file */
+ if (!is_syncing_bucket_meta(bucket)) {
+ RGWObjVersionTracker objv_tracker;
+ string entry;
+ get_bucket_instance_entry(bucket, entry);
+ r= rgw_bucket_instance_remove_entry(this, entry, &objv_tracker);
+ if (r < 0) {
+ return r;
+ }
+ }
return 0;
}
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index 2281f9a..5eee430 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -269,6 +269,12 @@ public:
::decode(rules, bl);
} else {
explicit_objs = true;
+ if (!objs.empty()) {
+ map<uint64_t, RGWObjManifestPart>::iterator iter = objs.begin();
+ head_obj = iter->second.loc;
+ head_size = iter->second.size;
+ max_head_size = head_size;
+ }
}
if (struct_v >= 4) {
@@ -1595,6 +1601,11 @@ public:
*/
virtual int delete_bucket(rgw_bucket& bucket, RGWObjVersionTracker& objv_tracker);
+ /**
+ * Check to see if the bucket metadata is synced
+ */
+ bool is_syncing_bucket_meta(rgw_bucket& bucket);
+
int set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner);
int set_buckets_enabled(std::vector<rgw_bucket>& buckets, bool enabled);
int bucket_suspended(rgw_bucket& bucket, bool *suspended);
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
index 768ca09..13d1643 100644
--- a/src/rgw/rgw_rest.cc
+++ b/src/rgw/rgw_rest.cc
@@ -294,8 +294,11 @@ void dump_bucket_from_state(struct req_state *s)
{
int expose_bucket = g_conf->rgw_expose_bucket;
if (expose_bucket) {
- if (!s->bucket_name_str.empty())
- s->cio->print("Bucket: \"%s\"\r\n", s->bucket_name_str.c_str());
+ if (!s->bucket_name_str.empty()) {
+ string b;
+ url_encode(s->bucket_name_str, b);
+ s->cio->print("Bucket: %s\r\n", b.c_str());
+ }
}
}
@@ -427,7 +430,8 @@ void dump_start(struct req_state *s)
}
}
-void end_header(struct req_state *s, RGWOp *op, const char *content_type)
+void end_header(struct req_state *s, RGWOp *op, const char *content_type, const int64_t proposed_content_length,
+ bool force_content_type)
{
string ctype;
@@ -435,7 +439,13 @@ void end_header(struct req_state *s, RGWOp *op, const char *content_type)
dump_access_control(s, op);
}
- if (!content_type || s->err.is_err()) {
+ if (s->prot_flags & RGW_REST_SWIFT && !content_type) {
+ force_content_type = true;
+ }
+
+ /* do not send content type if content length is zero
+ and the content type was not set by the user */
+ if (force_content_type || (!content_type && s->formatter->get_len() != 0) || s->err.is_err()){
switch (s->format) {
case RGW_FORMAT_XML:
ctype = "application/xml";
@@ -460,10 +470,18 @@ void end_header(struct req_state *s, RGWOp *op, const char *content_type)
s->formatter->dump_string("Message", s->err.message);
s->formatter->close_section();
dump_content_length(s, s->formatter->get_len());
+ } else {
+ if (proposed_content_length != NO_CONTENT_LENGTH) {
+ dump_content_length(s, proposed_content_length);
+ }
}
- int r = s->cio->print("Content-type: %s\r\n", content_type);
- if (r < 0) {
- ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
+
+ int r;
+ if (content_type) {
+ r = s->cio->print("Content-Type: %s\r\n", content_type);
+ if (r < 0) {
+ ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
+ }
}
r = s->cio->complete_header();
if (r < 0) {
@@ -1244,7 +1262,7 @@ int RGWREST::preprocess(struct req_state *s, RGWClientIO *cio)
s->content_length = 0;
} else {
string err;
- s->content_length = strict_strtol(s->length, 10, &err);
+ s->content_length = strict_strtoll(s->length, 10, &err);
if (!err.empty()) {
ldout(s->cct, 10) << "bad content length, aborting" << dendl;
return -EINVAL;
diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h
index a6108f4..0624e4f 100644
--- a/src/rgw/rgw_rest.h
+++ b/src/rgw/rgw_rest.h
@@ -354,10 +354,16 @@ public:
}
};
+static const int64_t NO_CONTENT_LENGTH = -1;
+
extern void set_req_state_err(struct req_state *s, int err_no);
extern void dump_errno(struct req_state *s);
extern void dump_errno(struct req_state *s, int ret);
-extern void end_header(struct req_state *s, RGWOp *op = NULL, const char *content_type = NULL);
+extern void end_header(struct req_state *s,
+ RGWOp *op = NULL,
+ const char *content_type = NULL,
+ const int64_t proposed_content_length = NO_CONTENT_LENGTH,
+ bool force_content_type = false);
extern void dump_start(struct req_state *s);
extern void list_all_buckets_start(struct req_state *s);
extern void dump_owner(struct req_state *s, string& id, string& name, const char *section = NULL);
diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc
index 2702472..4e4475a 100644
--- a/src/rgw/rgw_rest_swift.cc
+++ b/src/rgw/rgw_rest_swift.cc
@@ -51,8 +51,11 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_begin(bool has_buckets)
ret = STATUS_NO_CONTENT;
set_req_state_err(s, ret);
}
- dump_errno(s);
- end_header(s, NULL);
+
+ if (!g_conf->rgw_swift_enforce_content_length) {
+ dump_errno(s);
+ end_header(s, NULL, NULL, NO_CONTENT_LENGTH, true);
+ }
if (!ret) {
dump_start(s);
@@ -79,7 +82,9 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_data(RGWUserBuckets& buckets)
s->formatter->dump_int("bytes", obj.size);
}
s->formatter->close_section();
- rgw_flush_formatter(s, s->formatter);
+ if (!g_conf->rgw_swift_enforce_content_length) {
+ rgw_flush_formatter(s, s->formatter);
+ }
}
}
@@ -87,6 +92,14 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_end()
{
if (sent_data) {
s->formatter->close_section();
+ }
+
+ if (g_conf->rgw_swift_enforce_content_length) {
+ dump_errno(s);
+ end_header(s, NULL, NULL, s->formatter->get_len(), true);
+ }
+
+ if (sent_data || g_conf->rgw_swift_enforce_content_length) {
rgw_flush_formatter_and_reset(s, s->formatter);
}
}
@@ -204,14 +217,19 @@ next:
s->formatter->close_section();
- if (!ret && s->formatter->get_len() == 0)
- ret = STATUS_NO_CONTENT;
- else if (ret > 0)
+ int64_t content_len = 0;
+ if (!ret) {
+ content_len = s->formatter->get_len();
+ if (content_len == 0) {
+ ret = STATUS_NO_CONTENT;
+ }
+ } else if (ret > 0) {
ret = 0;
+ }
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s, this);
+ end_header(s, this, NULL, content_len);
if (ret < 0) {
return;
}
@@ -277,7 +295,8 @@ void RGWStatAccount_ObjStore_SWIFT::send_response()
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s, NULL);
+ end_header(s, NULL, NULL, 0, true);
+
dump_start(s);
}
@@ -291,7 +310,7 @@ void RGWStatBucket_ObjStore_SWIFT::send_response()
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s, this);
+ end_header(s, this,NULL,0, true);
dump_start(s);
}
@@ -371,7 +390,8 @@ void RGWCreateBucket_ObjStore_SWIFT::send_response()
ret = STATUS_ACCEPTED;
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s, NULL);
+ /* Propose ending HTTP header with 0 Content-Length header. */
+ end_header(s, NULL, NULL, 0);
rgw_flush_formatter_and_reset(s, s->formatter);
}
@@ -383,7 +403,7 @@ void RGWDeleteBucket_ObjStore_SWIFT::send_response()
set_req_state_err(s, r);
dump_errno(s);
- end_header(s, this);
+ end_header(s, this, NULL, 0);
rgw_flush_formatter_and_reset(s, s->formatter);
}
@@ -597,6 +617,7 @@ int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, off_t bl_ofs, o
dump_content_length(s, total_len);
dump_last_modified(s, lastmod);
+ s->cio->print("X-Timestamp: %lld\r\n", (long long)lastmod);
if (!ret) {
map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_ETAG);
diff --git a/src/rgw/rgw_swift.cc b/src/rgw/rgw_swift.cc
index 1b724ed..6418f5b 100644
--- a/src/rgw/rgw_swift.cc
+++ b/src/rgw/rgw_swift.cc
@@ -520,6 +520,11 @@ int RGWSwift::validate_keystone_token(RGWRados *store, const string& token, stru
if (ret < 0)
return ret;
+ if (t.expired()) {
+ ldout(cct, 0) << "got expired token: " << t.token.tenant.name << ":" << t.user.name << " expired: " << t.token.expires << dendl;
+ return -EPERM;
+ }
+
keystone_token_cache->add(token_id, t);
ret = update_user_info(store, info, rgw_user);
diff --git a/src/test/Makefile.am b/src/test/Makefile.am
index 69f9e84..9ede275 100644
--- a/src/test/Makefile.am
+++ b/src/test/Makefile.am
@@ -646,7 +646,7 @@ bin_DEBUGPROGRAMS += ceph_test_librbd
if LINUX
ceph_test_librbd_fsx_SOURCES = test/librbd/fsx.c
ceph_test_librbd_fsx_LDADD = $(LIBRBD) $(LIBRADOS) -lm
-ceph_test_librbd_fsx_CFLAGS = ${AM_CFLAGS} -Wno-format
+ceph_test_librbd_fsx_CFLAGS = ${AM_CFLAGS}
bin_DEBUGPROGRAMS += ceph_test_librbd_fsx
endif
diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc
index ff87238..9190d91 100644
--- a/src/test/bufferlist.cc
+++ b/src/test/bufferlist.cc
@@ -2093,6 +2093,17 @@ TEST(BufferList, zero) {
bl.zero((unsigned)3, (unsigned)3);
EXPECT_EQ(0, ::memcmp("ABC\0\0\0GHIKLM", bl.c_str(), 9));
}
+ {
+ bufferlist bl;
+ bufferptr ptr1(4);
+ bufferptr ptr2(4);
+ memset(ptr1.c_str(), 'a', 4);
+ memset(ptr2.c_str(), 'b', 4);
+ bl.append(ptr1);
+ bl.append(ptr2);
+ bl.zero((unsigned)2, (unsigned)4);
+ EXPECT_EQ(0, ::memcmp("aa\0\0\0\0bb", bl.c_str(), 8));
+ }
}
TEST(BufferList, EmptyAppend) {
diff --git a/src/test/librados/TestCase.cc b/src/test/librados/TestCase.cc
index 7f072fd..3747683 100644
--- a/src/test/librados/TestCase.cc
+++ b/src/test/librados/TestCase.cc
@@ -91,7 +91,14 @@ void RadosTestPP::cleanup_default_namespace(librados::IoCtx ioctx)
for (ObjectIterator it = ioctx.objects_begin();
it != ioctx.objects_end(); ++it) {
ioctx.locator_set_key(it->second);
- ASSERT_EQ(0, ioctx.remove(it->first));
+ ObjectWriteOperation op;
+ op.remove();
+ librados::AioCompletion *completion = s_cluster.aio_create_completion();
+ ASSERT_EQ(0, ioctx.aio_operate(it->first, completion, &op,
+ librados::OPERATION_IGNORE_CACHE));
+ completion->wait_for_safe();
+ ASSERT_EQ(0, completion->get_return_value());
+ completion->release();
}
}
diff --git a/src/test/librados/tier.cc b/src/test/librados/tier.cc
index a89d68b..9f3df30 100644
--- a/src/test/librados/tier.cc
+++ b/src/test/librados/tier.cc
@@ -75,16 +75,15 @@ protected:
static void SetUpTestCase() {
pool_name = get_temp_pool_name();
ASSERT_EQ("", create_one_pool_pp(pool_name, s_cluster));
- cache_pool_name = get_temp_pool_name();
- ASSERT_EQ(0, s_cluster.pool_create(cache_pool_name.c_str()));
}
static void TearDownTestCase() {
- ASSERT_EQ(0, s_cluster.pool_delete(cache_pool_name.c_str()));
ASSERT_EQ(0, destroy_one_pool_pp(pool_name, s_cluster));
}
static std::string cache_pool_name;
virtual void SetUp() {
+ cache_pool_name = get_temp_pool_name();
+ ASSERT_EQ(0, s_cluster.pool_create(cache_pool_name.c_str()));
RadosTestPP::SetUp();
ASSERT_EQ(0, cluster.ioctx_create(cache_pool_name.c_str(), cache_ioctx));
cache_ioctx.set_namespace(ns);
@@ -112,6 +111,7 @@ protected:
cleanup_default_namespace(cache_ioctx);
cache_ioctx.close();
+ ASSERT_EQ(0, s_cluster.pool_delete(cache_pool_name.c_str()));
}
librados::IoCtx cache_ioctx;
};
@@ -624,7 +624,24 @@ TEST_F(LibRadosTwoPoolsPP, Whiteout) {
ASSERT_TRUE(it == cache_ioctx.objects_end());
}
+ // delete a whiteout and verify it goes away
ASSERT_EQ(-ENOENT, ioctx.remove("foo"));
+ {
+ ObjectWriteOperation op;
+ op.remove();
+ librados::AioCompletion *completion = cluster.aio_create_completion();
+ ASSERT_EQ(0, ioctx.aio_operate("bar", completion, &op,
+ librados::OPERATION_IGNORE_CACHE));
+ completion->wait_for_safe();
+ ASSERT_EQ(0, completion->get_return_value());
+ completion->release();
+
+ ObjectIterator it = cache_ioctx.objects_begin();
+ ASSERT_TRUE(it != cache_ioctx.objects_end());
+ ASSERT_TRUE(it->first == string("foo"));
+ ++it;
+ ASSERT_TRUE(it == cache_ioctx.objects_end());
+ }
// recreate an object and verify we can read it
{
@@ -2154,6 +2171,91 @@ TEST_F(LibRadosTwoPoolsPP, HitSetTrim) {
}
}
+TEST_F(LibRadosTwoPoolsPP, PromoteOn2ndRead) {
+ // create object
+ {
+ bufferlist bl;
+ bl.append("hi there");
+ ObjectWriteOperation op;
+ op.write_full(bl);
+ ASSERT_EQ(0, ioctx.operate("foo", &op));
+ }
+
+ // configure cache
+ bufferlist inbl;
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+ "\", \"tierpool\": \"" + cache_pool_name +
+ "\", \"force_nonempty\": \"--force-nonempty\" }",
+ inbl, NULL, NULL));
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
+ "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
+ inbl, NULL, NULL));
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier cache-mode\", \"pool\": \"" + cache_pool_name +
+ "\", \"mode\": \"writeback\"}",
+ inbl, NULL, NULL));
+
+ // enable hitset tracking for this pool
+ ASSERT_EQ(0, cluster.mon_command(
+ set_pool_str(cache_pool_name, "hit_set_count", 2),
+ inbl, NULL, NULL));
+ ASSERT_EQ(0, cluster.mon_command(
+ set_pool_str(cache_pool_name, "hit_set_period", 600),
+ inbl, NULL, NULL));
+ ASSERT_EQ(0, cluster.mon_command(
+ set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
+ inbl, NULL, NULL));
+ ASSERT_EQ(0, cluster.mon_command(
+ set_pool_str(cache_pool_name, "min_read_recency_for_promote", 1),
+ inbl, NULL, NULL));
+
+ // wait for maps to settle
+ cluster.wait_for_latest_osdmap();
+
+ // 1st read, don't trigger a promote
+ {
+ bufferlist bl;
+ ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+ }
+
+ // verify the object is NOT present in the cache tier
+ {
+ ObjectIterator it = cache_ioctx.objects_begin();
+ ASSERT_TRUE(it == cache_ioctx.objects_end());
+ }
+
+ // Read until the object is present in the cache tier
+ while (true) {
+ bufferlist bl;
+ ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+
+ ObjectIterator it = cache_ioctx.objects_begin();
+ if (it != cache_ioctx.objects_end()) {
+ ASSERT_TRUE(it->first == string("foo"));
+ ++it;
+ ASSERT_TRUE(it == cache_ioctx.objects_end());
+ break;
+ }
+
+ sleep(1);
+ }
+
+ // tear down tiers
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+ "\"}",
+ inbl, NULL, NULL));
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+ "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+ inbl, NULL, NULL));
+
+ // wait for maps to settle before next test
+ cluster.wait_for_latest_osdmap();
+}
+
class LibRadosTwoPoolsECPP : public RadosTestECPP
{
public:
@@ -2163,16 +2265,15 @@ protected:
static void SetUpTestCase() {
pool_name = get_temp_pool_name();
ASSERT_EQ("", create_one_ec_pool_pp(pool_name, s_cluster));
- cache_pool_name = get_temp_pool_name();
- ASSERT_EQ(0, s_cluster.pool_create(cache_pool_name.c_str()));
}
static void TearDownTestCase() {
- ASSERT_EQ(0, s_cluster.pool_delete(cache_pool_name.c_str()));
ASSERT_EQ(0, destroy_one_ec_pool_pp(pool_name, s_cluster));
}
static std::string cache_pool_name;
virtual void SetUp() {
+ cache_pool_name = get_temp_pool_name();
+ ASSERT_EQ(0, s_cluster.pool_create(cache_pool_name.c_str()));
RadosTestECPP::SetUp();
ASSERT_EQ(0, cluster.ioctx_create(cache_pool_name.c_str(), cache_ioctx));
cache_ioctx.set_namespace(ns);
@@ -2200,6 +2301,7 @@ protected:
cleanup_default_namespace(cache_ioctx);
cache_ioctx.close();
+ ASSERT_EQ(0, s_cluster.pool_delete(cache_pool_name.c_str()));
}
librados::IoCtx cache_ioctx;
@@ -2640,7 +2742,23 @@ TEST_F(LibRadosTwoPoolsECPP, Whiteout) {
ASSERT_TRUE(it == cache_ioctx.objects_end());
}
+ // delete a whiteout and verify it goes away
ASSERT_EQ(-ENOENT, ioctx.remove("foo"));
+ {
+ ObjectWriteOperation op;
+ op.remove();
+ librados::AioCompletion *completion = cluster.aio_create_completion();
+ ASSERT_EQ(0, ioctx.aio_operate("bar", completion, &op,
+ librados::OPERATION_IGNORE_CACHE));
+ completion->wait_for_safe();
+ ASSERT_EQ(0, completion->get_return_value());
+ completion->release();
+
+ ObjectIterator it = cache_ioctx.objects_begin();
+ ASSERT_TRUE(it != cache_ioctx.objects_end());
+ ++it;
+ ASSERT_TRUE(it == cache_ioctx.objects_end());
+ }
// recreate an object and verify we can read it
{
@@ -4019,6 +4137,91 @@ TEST_F(LibRadosTwoPoolsECPP, HitSetTrim) {
delete[] buf;
}
+TEST_F(LibRadosTwoPoolsECPP, PromoteOn2ndRead) {
+ // create object
+ {
+ bufferlist bl;
+ bl.append("hi there");
+ ObjectWriteOperation op;
+ op.write_full(bl);
+ ASSERT_EQ(0, ioctx.operate("foo", &op));
+ }
+
+ // configure cache
+ bufferlist inbl;
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+ "\", \"tierpool\": \"" + cache_pool_name +
+ "\", \"force_nonempty\": \"--force-nonempty\" }",
+ inbl, NULL, NULL));
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
+ "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
+ inbl, NULL, NULL));
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier cache-mode\", \"pool\": \"" + cache_pool_name +
+ "\", \"mode\": \"writeback\"}",
+ inbl, NULL, NULL));
+
+ // enable hitset tracking for this pool
+ ASSERT_EQ(0, cluster.mon_command(
+ set_pool_str(cache_pool_name, "hit_set_count", 2),
+ inbl, NULL, NULL));
+ ASSERT_EQ(0, cluster.mon_command(
+ set_pool_str(cache_pool_name, "hit_set_period", 600),
+ inbl, NULL, NULL));
+ ASSERT_EQ(0, cluster.mon_command(
+ set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
+ inbl, NULL, NULL));
+ ASSERT_EQ(0, cluster.mon_command(
+ set_pool_str(cache_pool_name, "min_read_recency_for_promote", 1),
+ inbl, NULL, NULL));
+
+ // wait for maps to settle
+ cluster.wait_for_latest_osdmap();
+
+ // 1st read, don't trigger a promote
+ {
+ bufferlist bl;
+ ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+ }
+
+ // verify the object is NOT present in the cache tier
+ {
+ ObjectIterator it = cache_ioctx.objects_begin();
+ ASSERT_TRUE(it == cache_ioctx.objects_end());
+ }
+
+ // Read until the object is present in the cache tier
+ while (true) {
+ bufferlist bl;
+ ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+
+ ObjectIterator it = cache_ioctx.objects_begin();
+ if (it != cache_ioctx.objects_end()) {
+ ASSERT_TRUE(it->first == string("foo"));
+ ++it;
+ ASSERT_TRUE(it == cache_ioctx.objects_end());
+ break;
+ }
+
+ sleep(1);
+ }
+
+ // tear down tiers
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+ "\"}",
+ inbl, NULL, NULL));
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+ "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+ inbl, NULL, NULL));
+
+ // wait for maps to settle before next test
+ cluster.wait_for_latest_osdmap();
+}
+
int main(int argc, char **argv)
{
::testing::InitGoogleTest(&argc, argv);
diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc
index c37d884..867e197 100644
--- a/src/test/librbd/test_librbd.cc
+++ b/src/test/librbd/test_librbd.cc
@@ -750,8 +750,16 @@ TEST(LibRBD, TestIO)
ASSERT_EQ(10, rbd_write(image, info.size - 10, 100, test_data));
rbd_aio_create_completion(NULL, (rbd_callback_t) simple_read_cb, &comp);
- ASSERT_EQ(-EINVAL, rbd_aio_write(image, info.size, 1, test_data, comp));
- ASSERT_EQ(-EINVAL, rbd_aio_read(image, info.size, 1, test_data, comp));
+ ASSERT_EQ(0, rbd_aio_write(image, info.size, 1, test_data, comp));
+ ASSERT_EQ(0, rbd_aio_wait_for_complete(comp));
+ ASSERT_EQ(-EINVAL, rbd_aio_get_return_value(comp));
+ rbd_aio_release(comp);
+
+ rbd_aio_create_completion(NULL, (rbd_callback_t) simple_read_cb, &comp);
+ ASSERT_EQ(0, rbd_aio_read(image, info.size, 1, test_data, comp));
+ ASSERT_EQ(0, rbd_aio_wait_for_complete(comp));
+ ASSERT_EQ(-EINVAL, rbd_aio_get_return_value(comp));
+ rbd_aio_release(comp);
ASSERT_EQ(0, rbd_close(image));
@@ -1965,6 +1973,65 @@ TEST(LibRBD, TestPendingAio)
ASSERT_EQ(0, destroy_one_pool(pool_name, &cluster));
}
+TEST(LibRBD, BlockingAIO)
+{
+ librados::Rados rados;
+ librados::IoCtx ioctx;
+ string pool_name = get_temp_pool_name();
+
+ ASSERT_EQ("", create_one_pool_pp(pool_name, rados));
+ ASSERT_EQ(0, rados.ioctx_create(pool_name.c_str(), ioctx));
+
+ librbd::RBD rbd;
+ std::string name = "testimg";
+ uint64_t size = 1 << 20;
+ int order = 18;
+ ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+ CephContext *cct = reinterpret_cast<CephContext*>(ioctx.cct());
+ cct->_conf->set_val_or_die("rbd_non_blocking_aio", "0");
+
+ librbd::Image image;
+ ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
+
+ bufferlist bl;
+ bl.append(std::string(256, '1'));
+
+ librbd::RBD::AioCompletion *write_comp =
+ new librbd::RBD::AioCompletion(NULL, NULL);
+ ASSERT_EQ(0, image.aio_write(0, bl.length(), bl, write_comp));
+
+ librbd::RBD::AioCompletion *flush_comp =
+ new librbd::RBD::AioCompletion(NULL, NULL);
+ ASSERT_EQ(0, image.aio_flush(flush_comp));
+ ASSERT_EQ(0, flush_comp->wait_for_complete());
+ ASSERT_EQ(0, flush_comp->get_return_value());
+ flush_comp->release();
+
+ ASSERT_EQ(1, write_comp->is_complete());
+ ASSERT_EQ(0, write_comp->get_return_value());
+ write_comp->release();
+
+ librbd::RBD::AioCompletion *discard_comp =
+ new librbd::RBD::AioCompletion(NULL, NULL);
+ ASSERT_EQ(0, image.aio_discard(128, 128, discard_comp));
+ ASSERT_EQ(0, discard_comp->wait_for_complete());
+ discard_comp->release();
+
+ librbd::RBD::AioCompletion *read_comp =
+ new librbd::RBD::AioCompletion(NULL, NULL);
+ bufferlist read_bl;
+ image.aio_read(0, bl.length(), read_bl, read_comp);
+ ASSERT_EQ(0, read_comp->wait_for_complete());
+ ASSERT_EQ(bl.length(), read_comp->get_return_value());
+ read_comp->release();
+
+ bufferlist expected_bl;
+ expected_bl.append(std::string(128, '1'));
+ expected_bl.append(std::string(128, '\0'));
+ ASSERT_TRUE(expected_bl.contents_equal(read_bl));
+}
+
int main(int argc, char **argv)
{
::testing::InitGoogleTest(&argc, argv);
diff --git a/src/test/mon/test_mon_workloadgen.cc b/src/test/mon/test_mon_workloadgen.cc
index 3c6ff56..d1659d6 100644
--- a/src/test/mon/test_mon_workloadgen.cc
+++ b/src/test/mon/test_mon_workloadgen.cc
@@ -247,8 +247,7 @@ class ClientStub : public TestStub
return err;
}
- messenger.reset(Messenger::create(cct, entity_name_t::CLIENT(-1),
- "stubclient", getpid()));
+ messenger.reset(Messenger::create_client_messenger(cct, "stubclient"));
assert(messenger.get() != NULL);
messenger->set_default_policy(
diff --git a/src/test/objectstore/chain_xattr.cc b/src/test/objectstore/chain_xattr.cc
index 8346c02..7c08962 100644
--- a/src/test/objectstore/chain_xattr.cc
+++ b/src/test/objectstore/chain_xattr.cc
@@ -29,6 +29,7 @@
#include <gtest/gtest.h>
#define LARGE_BLOCK_LEN CHAIN_XATTR_MAX_BLOCK_LEN + 1024
+#define FILENAME "bufferlist"
TEST(chain_xattr, get_and_set) {
const char* file = "testfile";
@@ -147,6 +148,44 @@ TEST(chain_xattr, get_and_set) {
::unlink(file);
}
+TEST(chain_xattr, chunk_aligned) {
+ const char* file = FILENAME;
+ ::unlink(file);
+ int fd = ::open(file, O_CREAT|O_WRONLY|O_TRUNC, 0700);
+ const string user("user.");
+
+ // set N* chunk size
+ const string name = "user.foo";
+ const string name2 = "user.bar";
+
+ for (int len = CHAIN_XATTR_MAX_BLOCK_LEN - 10;
+ len < CHAIN_XATTR_MAX_BLOCK_LEN + 10;
+ ++len) {
+ cout << len << std::endl;
+ const string x(len, 'x');
+ char buf[len*2];
+ ASSERT_EQ(len, chain_setxattr(file, name.c_str(), x.c_str(), len));
+ char attrbuf[4096];
+ int l = ceph_os_listxattr(file, attrbuf, sizeof(attrbuf));
+ for (char *p = attrbuf; p - attrbuf < l; p += strlen(p) + 1) {
+ cout << " attr " << p << std::endl;
+ }
+ ASSERT_EQ(len, chain_getxattr(file, name.c_str(), buf, len*2));
+ ASSERT_EQ(0, chain_removexattr(file, name.c_str()));
+
+ ASSERT_EQ(len, chain_fsetxattr(fd, name2.c_str(), x.c_str(), len));
+ l = ceph_os_flistxattr(fd, attrbuf, sizeof(attrbuf));
+ for (char *p = attrbuf; p - attrbuf < l; p += strlen(p) + 1) {
+ cout << " attr " << p << std::endl;
+ }
+ ASSERT_EQ(len, chain_fgetxattr(fd, name2.c_str(), buf, len*2));
+ ASSERT_EQ(0, chain_fremovexattr(fd, name2.c_str()));
+ }
+
+ ::close(fd);
+ ::unlink(file);
+}
+
TEST(chain_xattr, listxattr) {
const char* file = "testfile";
::unlink(file);
diff --git a/src/test/osd/TestPGLog.cc b/src/test/osd/TestPGLog.cc
index c2063b8..6dc6bec 100644
--- a/src/test/osd/TestPGLog.cc
+++ b/src/test/osd/TestPGLog.cc
@@ -138,6 +138,14 @@ public:
fullauth.index();
fulldiv.index();
}
+ void set_div_bounds(eversion_t head, eversion_t tail) {
+ fulldiv.tail = divinfo.log_tail = tail;
+ fulldiv.head = divinfo.last_update = head;
+ }
+ void set_auth_bounds(eversion_t head, eversion_t tail) {
+ fullauth.tail = authinfo.log_tail = tail;
+ fullauth.head = authinfo.last_update = head;
+ }
const IndexedLog &get_fullauth() const { return fullauth; }
const IndexedLog &get_fulldiv() const { return fulldiv; }
const pg_info_t &get_authinfo() const { return authinfo; }
@@ -235,6 +243,8 @@ public:
proc_replica_log(
t, oinfo, olog, omissing, pg_shard_t(1, 0));
+ assert(oinfo.last_update >= log.tail);
+
if (!tcase.base.empty()) {
ASSERT_EQ(tcase.base.rbegin()->version, oinfo.last_update);
}
@@ -1270,8 +1280,8 @@ TEST_F(PGLogTest, proc_replica_log) {
pg_shard_t from;
eversion_t last_update(1, 1);
- oinfo.last_update = last_update;
- eversion_t last_complete(2, 1);
+ log.head = olog.head = oinfo.last_update = last_update;
+ eversion_t last_complete(1, 1);
oinfo.last_complete = last_complete;
EXPECT_TRUE(t.empty());
@@ -1470,12 +1480,12 @@ TEST_F(PGLogTest, proc_replica_log) {
}
/* +--------------------------+
- | log olog |
+ | olog log |
+--------+-------+---------+
| |object | |
|version | hash | version |
| | | |
- tail > (1,1) | x5 | (1,1) < tail
+ tail > (1,1) | x9 | (1,1) < tail
| | | |
| | | |
| (1,2) | x3 | (1,2) |
@@ -1503,34 +1513,38 @@ TEST_F(PGLogTest, proc_replica_log) {
pg_shard_t from;
eversion_t last_update(1, 2);
+ hobject_t divergent_object;
+ divergent_object.hash = 0x9;
{
pg_log_entry_t e;
e.mod_desc.mark_unrollbackable();
e.version = eversion_t(1, 1);
- e.soid.hash = 0x5;
+ e.soid = divergent_object;
log.tail = e.version;
log.log.push_back(e);
e.version = last_update;
e.soid.hash = 0x3;
log.log.push_back(e);
- e.version = eversion_t(1,3);
- e.soid.hash = 0x9;
+ e.version = eversion_t(2, 3);
+ e.prior_version = eversion_t(1, 1);
+ e.soid = divergent_object;
e.op = pg_log_entry_t::DELETE;
log.log.push_back(e);
log.head = e.version;
log.index();
e.version = eversion_t(1, 1);
- e.soid.hash = 0x5;
+ e.soid = divergent_object;
olog.tail = e.version;
olog.log.push_back(e);
e.version = last_update;
e.soid.hash = 0x3;
olog.log.push_back(e);
- e.version = eversion_t(2, 3);
- e.soid.hash = 0x9;
+ e.version = eversion_t(1, 3);
+ e.prior_version = eversion_t(1, 1);
+ e.soid = divergent_object;
e.op = pg_log_entry_t::DELETE;
olog.log.push_back(e);
olog.head = e.version;
@@ -1547,28 +1561,30 @@ TEST_F(PGLogTest, proc_replica_log) {
proc_replica_log(t, oinfo, olog, omissing, from);
EXPECT_TRUE(t.empty());
- EXPECT_FALSE(omissing.have_missing());
+ EXPECT_TRUE(omissing.have_missing());
+ EXPECT_TRUE(omissing.is_missing(divergent_object));
+ EXPECT_EQ(omissing.missing[divergent_object].have, eversion_t(0, 0));
+ EXPECT_EQ(omissing.missing[divergent_object].need, eversion_t(1, 1));
EXPECT_EQ(last_update, oinfo.last_update);
- EXPECT_EQ(last_update, oinfo.last_complete);
}
/* +--------------------------+
- | log olog |
+ | olog log |
+--------+-------+---------+
| |object | |
|version | hash | version |
| | | |
- tail > (1,1) | x5 | (1,1) < tail
+ tail > (1,1) | x9 | (1,1) < tail
| | | |
| | | |
| (1,2) | x3 | (1,2) |
| | | |
| | | |
head > (1,3) | x9 | |
- | DELETE | | |
+ | MODIFY | | |
| | | |
| | x9 | (2,3) < head
- | | | MODIFY |
+ | | | DELETE |
| | | |
+--------+-------+---------+
@@ -1593,28 +1609,30 @@ TEST_F(PGLogTest, proc_replica_log) {
e.mod_desc.mark_unrollbackable();
e.version = eversion_t(1, 1);
- e.soid.hash = 0x5;
+ e.soid = divergent_object;
log.tail = e.version;
log.log.push_back(e);
e.version = last_update;
e.soid.hash = 0x3;
log.log.push_back(e);
- e.version = eversion_t(1, 3);
- e.soid.hash = 0x9;
+ e.version = eversion_t(2, 3);
+ e.prior_version = eversion_t(1, 1);
+ e.soid = divergent_object;
e.op = pg_log_entry_t::DELETE;
log.log.push_back(e);
log.head = e.version;
log.index();
e.version = eversion_t(1, 1);
- e.soid.hash = 0x5;
+ e.soid = divergent_object;
olog.tail = e.version;
olog.log.push_back(e);
e.version = last_update;
e.soid.hash = 0x3;
olog.log.push_back(e);
- e.version = eversion_t(2, 3);
- e.soid.hash = 0x9;
+ e.version = eversion_t(1, 3);
+ e.prior_version = eversion_t(1, 1);
+ e.soid = divergent_object;
divergent_object = e.soid;
omissing.add(divergent_object, e.version, eversion_t());
e.op = pg_log_entry_t::MODIFY;
@@ -1628,16 +1646,18 @@ TEST_F(PGLogTest, proc_replica_log) {
EXPECT_TRUE(t.empty());
EXPECT_TRUE(omissing.have_missing());
EXPECT_TRUE(omissing.is_missing(divergent_object));
- EXPECT_EQ(eversion_t(2, 3), omissing.missing[divergent_object].need);
+ EXPECT_EQ(eversion_t(1, 3), omissing.missing[divergent_object].need);
EXPECT_EQ(olog.head, oinfo.last_update);
EXPECT_EQ(olog.head, oinfo.last_complete);
proc_replica_log(t, oinfo, olog, omissing, from);
EXPECT_TRUE(t.empty());
- EXPECT_FALSE(omissing.have_missing());
+ EXPECT_TRUE(omissing.have_missing());
+ EXPECT_TRUE(omissing.is_missing(divergent_object));
+ EXPECT_EQ(omissing.missing[divergent_object].have, eversion_t(0, 0));
+ EXPECT_EQ(omissing.missing[divergent_object].need, eversion_t(1, 1));
EXPECT_EQ(last_update, oinfo.last_update);
- EXPECT_EQ(last_update, oinfo.last_complete);
}
/* +--------------------------+
@@ -1862,6 +1882,20 @@ TEST_F(PGLogTest, merge_log_prior_version_have) {
run_test_case(t);
}
+TEST_F(PGLogTest, merge_log_split_missing_entries_at_head) {
+ TestCase t;
+ t.auth.push_back(mk_ple_mod_rb(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+ t.auth.push_back(mk_ple_mod_rb(mk_obj(1), mk_evt(15, 150), mk_evt(10, 100)));
+
+ t.div.push_back(mk_ple_mod(mk_obj(1), mk_evt(8, 70), mk_evt(8, 65)));
+
+ t.setup();
+ t.set_div_bounds(mk_evt(9, 79), mk_evt(8, 69));
+ t.set_auth_bounds(mk_evt(10, 160), mk_evt(9, 77));
+ t.final.add(mk_obj(1), mk_evt(15, 150), mk_evt(8, 70));
+ run_test_case(t);
+}
+
int main(int argc, char **argv) {
vector<const char*> args;
argv_to_vec(argc, (const char **)argv, args);
diff --git a/src/tools/ceph_authtool.cc b/src/tools/ceph_authtool.cc
index f66a3c6..1b6b90a 100644
--- a/src/tools/ceph_authtool.cc
+++ b/src/tools/ceph_authtool.cc
@@ -84,7 +84,7 @@ int main(int argc, const char **argv)
gen_print_key = true;
} else if (ceph_argparse_witharg(args, i, &val, "-a", "--add-key", (char*)NULL)) {
add_key = val;
- } else if (ceph_argparse_flag(args, i, &val, "-l", "--list", (char*)NULL)) {
+ } else if (ceph_argparse_flag(args, i, "-l", "--list", (char*)NULL)) {
list = true;
} else if (ceph_argparse_witharg(args, i, &val, "--caps", (char*)NULL)) {
caps_fn = val;
diff --git a/src/upstart/ceph-mds.conf b/src/upstart/ceph-mds.conf
index 77841cd..4063d91 100644
--- a/src/upstart/ceph-mds.conf
+++ b/src/upstart/ceph-mds.conf
@@ -4,7 +4,7 @@ start on ceph-mds
stop on runlevel [!2345] or stopping ceph-mds-all
respawn
-respawn limit 5 30
+respawn limit 3 1800
limit nofile 16384 16384
diff --git a/src/upstart/ceph-mon.conf b/src/upstart/ceph-mon.conf
index 0279f15..83c9858 100644
--- a/src/upstart/ceph-mon.conf
+++ b/src/upstart/ceph-mon.conf
@@ -4,7 +4,7 @@ start on ceph-mon
stop on runlevel [!2345] or stopping ceph-mon-all
respawn
-respawn limit 5 30
+respawn limit 3 1800
limit nofile 16384 16384
diff --git a/src/upstart/ceph-osd.conf b/src/upstart/ceph-osd.conf
index 7175c2d..6fa97ed 100644
--- a/src/upstart/ceph-osd.conf
+++ b/src/upstart/ceph-osd.conf
@@ -4,7 +4,7 @@ start on ceph-osd
stop on runlevel [!2345] or stopping ceph-osd-all
respawn
-respawn limit 5 30
+respawn limit 3 1800
limit nofile 32768 32768
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git
More information about the Pkg-ceph-commits
mailing list