[Pkg-ceph-commits] [ceph] 01/01: Imported Upstream version 0.80.6
Dmitry Smirnov
onlyjob at moszumanska.debian.org
Thu Oct 2 12:58:20 UTC 2014
This is an automated email from the git hooks/post-receive script.
onlyjob pushed a commit to branch upstream
in repository ceph.
commit 680e2ae (upstream)
Author: Dmitry Smirnov <onlyjob at member.fsf.org>
Date: Thu Oct 2 12:57:25 2014
Imported Upstream version 0.80.6
---
ceph.spec | 2 +-
configure | 240 ++++++++-
configure.ac | 9 +-
src/.git_version | 4 +-
src/Makefile.in | 86 ++--
src/acconfig.h.in | 3 +
src/ceph-disk | 376 +++++++++++----
src/ceph.in | 35 +-
src/ceph_common.sh | 5 +-
src/ceph_mon.cc | 22 +-
src/ceph_osd.cc | 20 +
src/cls/rgw/cls_rgw.cc | 2 +-
src/common/Finisher.h | 9 +
src/common/LogClient.cc | 3 +-
src/common/Makefile.am | 3 +
src/common/Thread.cc | 34 +-
src/common/Thread.h | 5 +
src/common/WorkQueue.cc | 21 +
src/common/WorkQueue.h | 4 +
src/common/blkdev.cc | 2 +-
src/common/config.cc | 10 +-
src/common/config_opts.h | 12 +
src/common/io_priority.cc | 54 +++
src/common/io_priority.h | 44 ++
src/common/random_cache.hpp | 111 +++++
src/common/str_map.cc | 2 +-
src/common/strtol.cc | 43 ++
src/common/strtol.h | 5 +
src/crush/CrushWrapper.cc | 109 ++++-
src/crush/CrushWrapper.h | 14 +
src/erasure-code/ErasureCodeInterface.h | 2 +-
src/erasure-code/ErasureCodePlugin.cc | 28 ++
src/erasure-code/ErasureCodePlugin.h | 3 +
src/erasure-code/jerasure/ErasureCodeJerasure.cc | 8 +-
src/global/global_init.cc | 10 +-
src/include/atomic.h | 123 +++--
src/include/intarith.h | 2 +-
src/include/rbd/librbd.h | 9 +
src/include/rbd/librbd.hpp | 8 +
src/include/str_map.h | 2 +-
src/init-ceph.in | 33 +-
src/init-radosgw.sysv | 13 +-
src/librados/RadosClient.cc | 8 +-
src/librbd/ImageCtx.cc | 20 +-
src/librbd/ImageCtx.h | 2 +-
src/librbd/internal.cc | 42 +-
src/librbd/internal.h | 2 +
src/librbd/librbd.cc | 12 +
src/mds/Locker.cc | 8 +-
src/mds/MDCache.cc | 2 +
src/messages/MOSDSubOp.h | 10 +-
src/mon/DataHealthService.cc | 2 +-
src/mon/MonCommands.h | 6 +-
src/mon/Monitor.cc | 41 +-
src/mon/Monitor.h | 3 +
src/mon/MonmapMonitor.cc | 5 +
src/mon/OSDMonitor.cc | 276 ++++++++++-
src/mon/OSDMonitor.h | 2 +-
src/mon/PGMonitor.cc | 83 +++-
src/mon/PGMonitor.h | 6 +-
src/mon/Paxos.cc | 3 +-
src/msg/SimpleMessenger.cc | 3 +
src/os/FileJournal.cc | 7 +-
src/os/FileStore.cc | 15 +-
src/os/FileStore.h | 72 ++-
src/os/GenericObjectMap.cc | 46 +-
src/os/GenericObjectMap.h | 37 +-
src/os/KeyValueStore.cc | 587 ++++++++++++-----------
src/os/KeyValueStore.h | 182 ++++---
src/os/LFNIndex.cc | 88 +++-
src/os/LFNIndex.h | 8 +-
src/os/MemStore.cc | 7 +-
src/os/ObjectStore.cc | 6 +-
src/os/ObjectStore.h | 26 +
src/osd/ECBackend.cc | 10 +-
src/osd/ECBackend.h | 2 +
src/osd/ECMsgTypes.cc | 28 +-
src/osd/ECMsgTypes.h | 5 +-
src/osd/HitSet.h | 2 +-
src/osd/OSD.cc | 168 +++++--
src/osd/OSD.h | 58 ++-
src/osd/OSDMap.cc | 49 +-
src/osd/OSDMap.h | 3 +
src/osd/OpRequest.cc | 2 +-
src/osd/OpRequest.h | 4 +
src/osd/PG.cc | 275 +++++++++--
src/osd/PG.h | 125 ++++-
src/osd/PGBackend.cc | 11 +-
src/osd/PGBackend.h | 5 +-
src/osd/PGLog.cc | 42 +-
src/osd/PGLog.h | 71 ++-
src/osd/ReplicatedBackend.cc | 10 +-
src/osd/ReplicatedBackend.h | 2 +
src/osd/ReplicatedPG.cc | 176 ++++---
src/osd/ReplicatedPG.h | 3 +-
src/osd/osd_types.cc | 22 +-
src/osd/osd_types.h | 121 ++++-
src/osdc/Objecter.cc | 12 +-
src/osdc/Objecter.h | 2 +-
src/pybind/rados.py | 7 +-
src/pybind/rbd.py | 8 +
src/rgw/rgw_common.cc | 10 +-
src/rgw/rgw_op.cc | 72 +--
src/rgw/rgw_rados.cc | 174 +++++--
src/rgw/rgw_rados.h | 20 +-
src/rgw/rgw_rest.cc | 4 +-
src/rgw/rgw_rest_swift.cc | 6 +-
src/test/crush/TestCrushWrapper.cc | 5 +
src/test/erasure-code/TestErasureCodeJerasure.cc | 30 ++
src/test/librados/TestCase.cc | 32 +-
src/test/librados/TestCase.h | 7 +-
src/test/librados/io.cc | 52 ++
src/test/librados/tier.cc | 538 +++++++--------------
src/test/objectstore/store_test.cc | 105 ++++
src/test/osd/TestOSDMap.cc | 57 ++-
src/test/osd/osd-test-helpers.sh | 1 +
src/test/strtol.cc | 75 +++
117 files changed, 4097 insertions(+), 1461 deletions(-)
diff --git a/ceph.spec b/ceph.spec
index 31b8960..20937c2 100644
--- a/ceph.spec
+++ b/ceph.spec
@@ -9,7 +9,7 @@
# common
#################################################################################
Name: ceph
-Version: 0.80.5
+Version: 0.80.6
Release: 0%{?dist}
Summary: User space components of the Ceph file system
License: GPL-2.0
diff --git a/configure b/configure
index 8d8c0ed..81da86d 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.68 for ceph 0.80.5.
+# Generated by GNU Autoconf 2.68 for ceph 0.80.6.
#
# Report bugs to <ceph-devel at vger.kernel.org>.
#
@@ -570,8 +570,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='ceph'
PACKAGE_TARNAME='ceph'
-PACKAGE_VERSION='0.80.5'
-PACKAGE_STRING='ceph 0.80.5'
+PACKAGE_VERSION='0.80.6'
+PACKAGE_STRING='ceph 0.80.6'
PACKAGE_BUGREPORT='ceph-devel at vger.kernel.org'
PACKAGE_URL=''
@@ -1441,7 +1441,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures ceph 0.80.5 to adapt to many kinds of systems.
+\`configure' configures ceph 0.80.6 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1512,7 +1512,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of ceph 0.80.5:";;
+ short | recursive ) echo "Configuration of ceph 0.80.6:";;
esac
cat <<\_ACEOF
@@ -1657,7 +1657,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-ceph configure 0.80.5
+ceph configure 0.80.6
generated by GNU Autoconf 2.68
Copyright (C) 2010 Free Software Foundation, Inc.
@@ -2144,6 +2144,184 @@ fi
} # ac_fn_c_check_header_mongrel
+# ac_fn_c_compute_int LINENO EXPR VAR INCLUDES
+# --------------------------------------------
+# Tries to find the compile-time value of EXPR in a program that includes
+# INCLUDES, setting VAR accordingly. Returns whether the value could be
+# computed
+ac_fn_c_compute_int ()
+{
+ as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ if test "$cross_compiling" = yes; then
+ # Depending upon the size, compute the lo and hi bounds.
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+$4
+int
+main ()
+{
+static int test_array [1 - 2 * !(($2) >= 0)];
+test_array [0] = 0
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ ac_lo=0 ac_mid=0
+ while :; do
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+$4
+int
+main ()
+{
+static int test_array [1 - 2 * !(($2) <= $ac_mid)];
+test_array [0] = 0
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ ac_hi=$ac_mid; break
+else
+ as_fn_arith $ac_mid + 1 && ac_lo=$as_val
+ if test $ac_lo -le $ac_mid; then
+ ac_lo= ac_hi=
+ break
+ fi
+ as_fn_arith 2 '*' $ac_mid + 1 && ac_mid=$as_val
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ done
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+$4
+int
+main ()
+{
+static int test_array [1 - 2 * !(($2) < 0)];
+test_array [0] = 0
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ ac_hi=-1 ac_mid=-1
+ while :; do
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+$4
+int
+main ()
+{
+static int test_array [1 - 2 * !(($2) >= $ac_mid)];
+test_array [0] = 0
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ ac_lo=$ac_mid; break
+else
+ as_fn_arith '(' $ac_mid ')' - 1 && ac_hi=$as_val
+ if test $ac_mid -le $ac_hi; then
+ ac_lo= ac_hi=
+ break
+ fi
+ as_fn_arith 2 '*' $ac_mid && ac_mid=$as_val
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ done
+else
+ ac_lo= ac_hi=
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+# Binary search between lo and hi bounds.
+while test "x$ac_lo" != "x$ac_hi"; do
+ as_fn_arith '(' $ac_hi - $ac_lo ')' / 2 + $ac_lo && ac_mid=$as_val
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+$4
+int
+main ()
+{
+static int test_array [1 - 2 * !(($2) <= $ac_mid)];
+test_array [0] = 0
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ ac_hi=$ac_mid
+else
+ as_fn_arith '(' $ac_mid ')' + 1 && ac_lo=$as_val
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+done
+case $ac_lo in #((
+?*) eval "$3=\$ac_lo"; ac_retval=0 ;;
+'') ac_retval=1 ;;
+esac
+ else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+$4
+static long int longval () { return $2; }
+static unsigned long int ulongval () { return $2; }
+#include <stdio.h>
+#include <stdlib.h>
+int
+main ()
+{
+
+ FILE *f = fopen ("conftest.val", "w");
+ if (! f)
+ return 1;
+ if (($2) < 0)
+ {
+ long int i = longval ();
+ if (i != ($2))
+ return 1;
+ fprintf (f, "%ld", i);
+ }
+ else
+ {
+ unsigned long int i = ulongval ();
+ if (i != ($2))
+ return 1;
+ fprintf (f, "%lu", i);
+ }
+ /* Do not output a trailing newline, as this causes \r\n confusion
+ on some platforms. */
+ return ferror (f) || fclose (f) != 0;
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_run "$LINENO"; then :
+ echo >>conftest.val; read $3 <conftest.val; ac_retval=0
+else
+ ac_retval=1
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+ conftest.$ac_objext conftest.beam conftest.$ac_ext
+rm -f conftest.val
+
+ fi
+ eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+ as_fn_set_status $ac_retval
+
+} # ac_fn_c_compute_int
+
# ac_fn_cxx_check_header_mongrel LINENO HEADER VAR INCLUDES
# ---------------------------------------------------------
# Tests whether HEADER exists, giving a warning if it cannot be compiled using
@@ -2504,7 +2682,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by ceph $as_me 0.80.5, which was
+It was created by ceph $as_me 0.80.6, which was
generated by GNU Autoconf 2.68. Invocation command line was
$ $0 $@
@@ -4504,7 +4682,7 @@ fi
# Define the identity of the package.
PACKAGE='ceph'
- VERSION='0.80.5'
+ VERSION='0.80.6'
cat >>confdefs.h <<_ACEOF
@@ -12482,7 +12660,7 @@ fi
# Define the identity of the package.
PACKAGE='ceph'
- VERSION='0.80.5'
+ VERSION='0.80.6'
cat >>confdefs.h <<_ACEOF
@@ -18906,7 +19084,7 @@ else
JAVA_TEST=Test.java
CLASS_TEST=Test.class
cat << \EOF > $JAVA_TEST
-/* #line 18909 "configure" */
+/* #line 19087 "configure" */
public class Test {
}
EOF
@@ -19239,12 +19417,50 @@ fi
fi
if test "$HAVE_ATOMIC_OPS" = "1"; then :
+ # The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of AO_t" >&5
+$as_echo_n "checking size of AO_t... " >&6; }
+if ${ac_cv_sizeof_AO_t+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (AO_t))" "ac_cv_sizeof_AO_t" "
+ #include <atomic_ops.h>
+
+"; then :
+
+else
+ if test "$ac_cv_type_AO_t" = yes; then
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (AO_t)
+See \`config.log' for more details" "$LINENO" 5; }
+ else
+ ac_cv_sizeof_AO_t=0
+ fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_AO_t" >&5
+$as_echo "$ac_cv_sizeof_AO_t" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_AO_T $ac_cv_sizeof_AO_t
+_ACEOF
+
+
+
else
$as_echo "#define NO_ATOMIC_OPS 1" >>confdefs.h
fi
+
if test "$HAVE_ATOMIC_OPS" = "1"; then
WITH_LIBATOMIC_TRUE=
WITH_LIBATOMIC_FALSE='#'
@@ -22248,7 +22464,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by ceph $as_me 0.80.5, which was
+This file was extended by ceph $as_me 0.80.6, which was
generated by GNU Autoconf 2.68. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -22314,7 +22530,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-ceph config.status 0.80.5
+ceph config.status 0.80.6
configured by $0, generated by GNU Autoconf 2.68,
with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index fb54df1..eb16aa5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
# VERSION define is not used by the code. It gets a version string
# from 'git describe'; see src/ceph_ver.[ch]
-AC_INIT([ceph], [0.80.5], [ceph-devel at vger.kernel.org])
+AC_INIT([ceph], [0.80.6], [ceph-devel at vger.kernel.org])
# Create release string. Used with VERSION for RPMs.
RPM_RELEASE=0
@@ -472,9 +472,14 @@ AS_IF([test "x$with_libatomic_ops" != xno],
[no libatomic-ops found (use --without-libatomic-ops to disable)])
])])
AS_IF([test "$HAVE_ATOMIC_OPS" = "1"],
- [],
+ [
+ AC_CHECK_SIZEOF(AO_t, [], [
+ #include <atomic_ops.h>
+ ])
+ ],
[AC_DEFINE([NO_ATOMIC_OPS], [1], [Defined if you do not have atomic_ops])])
+
AM_CONDITIONAL(WITH_LIBATOMIC, [test "$HAVE_ATOMIC_OPS" = "1"])
# newsyn? requires mpi.
diff --git a/src/.git_version b/src/.git_version
index 6bd39d8..338f76a 100644
--- a/src/.git_version
+++ b/src/.git_version
@@ -1,2 +1,2 @@
-38b73c67d375a2552d8ed67843c8a65c2c0feba6
-v0.80.5
+f93610a4421cb670b08e974c6550ee715ac528ae
+v0.80.6
diff --git a/src/Makefile.in b/src/Makefile.in
index e42ddf4..afa524b 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -487,11 +487,11 @@ am_libcommon_la_OBJECTS = ceph_ver.lo common/DecayCounter.lo \
common/BackTrace.lo common/perf_counters.lo common/Mutex.lo \
common/OutputDataSocket.lo common/admin_socket.lo \
common/admin_socket_client.lo common/cmdparse.lo \
- common/escape.lo common/Clock.lo common/Throttle.lo \
- common/Timer.lo common/Finisher.lo common/environment.lo \
- common/assert.lo common/run_cmd.lo common/WorkQueue.lo \
- common/ConfUtils.lo common/MemoryModel.lo common/armor.lo \
- common/fd.lo common/xattr.lo common/safe_io.lo \
+ common/escape.lo common/io_priority.lo common/Clock.lo \
+ common/Throttle.lo common/Timer.lo common/Finisher.lo \
+ common/environment.lo common/assert.lo common/run_cmd.lo \
+ common/WorkQueue.lo common/ConfUtils.lo common/MemoryModel.lo \
+ common/armor.lo common/fd.lo common/xattr.lo common/safe_io.lo \
common/snap_types.lo common/str_list.lo common/str_map.lo \
common/errno.lo common/RefCountedObj.lo common/blkdev.lo \
common/common_init.lo common/pipe.lo common/ceph_argparse.lo \
@@ -1854,11 +1854,11 @@ am__test_build_libcommon_SOURCES_DIST = test/buildtest_skeleton.cc \
common/perf_counters.cc common/Mutex.cc \
common/OutputDataSocket.cc common/admin_socket.cc \
common/admin_socket_client.cc common/cmdparse.cc \
- common/escape.c common/Clock.cc common/Throttle.cc \
- common/Timer.cc common/Finisher.cc common/environment.cc \
- common/assert.cc common/run_cmd.cc common/WorkQueue.cc \
- common/ConfUtils.cc common/MemoryModel.cc common/armor.c \
- common/fd.cc common/xattr.c common/safe_io.c \
+ common/escape.c common/io_priority.cc common/Clock.cc \
+ common/Throttle.cc common/Timer.cc common/Finisher.cc \
+ common/environment.cc common/assert.cc common/run_cmd.cc \
+ common/WorkQueue.cc common/ConfUtils.cc common/MemoryModel.cc \
+ common/armor.c common/fd.cc common/xattr.c common/safe_io.c \
common/snap_types.cc common/str_list.cc common/str_map.cc \
common/errno.cc common/RefCountedObj.cc common/blkdev.cc \
common/common_init.cc common/pipe.c common/ceph_argparse.cc \
@@ -1891,6 +1891,7 @@ am__objects_15 = test_build_libcommon-ceph_ver.$(OBJEXT) \
common/test_build_libcommon-admin_socket_client.$(OBJEXT) \
common/test_build_libcommon-cmdparse.$(OBJEXT) \
common/test_build_libcommon-escape.$(OBJEXT) \
+ common/test_build_libcommon-io_priority.$(OBJEXT) \
common/test_build_libcommon-Clock.$(OBJEXT) \
common/test_build_libcommon-Throttle.$(OBJEXT) \
common/test_build_libcommon-Timer.$(OBJEXT) \
@@ -3098,18 +3099,18 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/neon.h arch/probe.h \
common/Cond.h common/ConfUtils.h common/DecayCounter.h \
common/Finisher.h common/Formatter.h common/perf_counters.h \
common/OutputDataSocket.h common/admin_socket.h \
- common/admin_socket_client.h common/shared_cache.hpp \
- common/tracked_int_ptr.hpp common/simple_cache.hpp \
- common/sharedptr_registry.hpp common/map_cacher.hpp \
- common/MemoryModel.h common/Mutex.h \
+ common/admin_socket_client.h common/random_cache.hpp \
+ common/shared_cache.hpp common/tracked_int_ptr.hpp \
+ common/simple_cache.hpp common/sharedptr_registry.hpp \
+ common/map_cacher.hpp common/MemoryModel.h common/Mutex.h \
common/PrebufferedStreambuf.h common/RWLock.h \
common/Semaphore.h common/SimpleRNG.h common/TextTable.h \
common/Thread.h common/Throttle.h common/Timer.h \
common/TrackedOp.h common/arch.h common/armor.h \
- common/common_init.h common/pipe.h common/code_environment.h \
- common/signal.h common/simple_spin.h common/run_cmd.h \
- common/safe_io.h common/config.h common/config_obs.h \
- common/config_opts.h common/ceph_crypto.h \
+ common/common_init.h common/io_priority.h common/pipe.h \
+ common/code_environment.h common/signal.h common/simple_spin.h \
+ common/run_cmd.h common/safe_io.h common/config.h \
+ common/config_obs.h common/config_opts.h common/ceph_crypto.h \
common/ceph_crypto_cms.h common/ceph_json.h common/lru_map.h \
common/utf8.h common/mime.h common/pick_address.h \
common/secret.h common/strtol.h common/static_assert.h \
@@ -3642,18 +3643,18 @@ noinst_HEADERS = arch/intel.h arch/neon.h arch/probe.h \
common/Cond.h common/ConfUtils.h common/DecayCounter.h \
common/Finisher.h common/Formatter.h common/perf_counters.h \
common/OutputDataSocket.h common/admin_socket.h \
- common/admin_socket_client.h common/shared_cache.hpp \
- common/tracked_int_ptr.hpp common/simple_cache.hpp \
- common/sharedptr_registry.hpp common/map_cacher.hpp \
- common/MemoryModel.h common/Mutex.h \
+ common/admin_socket_client.h common/random_cache.hpp \
+ common/shared_cache.hpp common/tracked_int_ptr.hpp \
+ common/simple_cache.hpp common/sharedptr_registry.hpp \
+ common/map_cacher.hpp common/MemoryModel.h common/Mutex.h \
common/PrebufferedStreambuf.h common/RWLock.h \
common/Semaphore.h common/SimpleRNG.h common/TextTable.h \
common/Thread.h common/Throttle.h common/Timer.h \
common/TrackedOp.h common/arch.h common/armor.h \
- common/common_init.h common/pipe.h common/code_environment.h \
- common/signal.h common/simple_spin.h common/run_cmd.h \
- common/safe_io.h common/config.h common/config_obs.h \
- common/config_opts.h common/ceph_crypto.h \
+ common/common_init.h common/io_priority.h common/pipe.h \
+ common/code_environment.h common/signal.h common/simple_spin.h \
+ common/run_cmd.h common/safe_io.h common/config.h \
+ common/config_obs.h common/config_opts.h common/ceph_crypto.h \
common/ceph_crypto_cms.h common/ceph_json.h common/lru_map.h \
common/utf8.h common/mime.h common/pick_address.h \
common/secret.h common/strtol.h common/static_assert.h \
@@ -4242,11 +4243,11 @@ libcommon_la_SOURCES = ceph_ver.c common/DecayCounter.cc \
common/BackTrace.cc common/perf_counters.cc common/Mutex.cc \
common/OutputDataSocket.cc common/admin_socket.cc \
common/admin_socket_client.cc common/cmdparse.cc \
- common/escape.c common/Clock.cc common/Throttle.cc \
- common/Timer.cc common/Finisher.cc common/environment.cc \
- common/assert.cc common/run_cmd.cc common/WorkQueue.cc \
- common/ConfUtils.cc common/MemoryModel.cc common/armor.c \
- common/fd.cc common/xattr.c common/safe_io.c \
+ common/escape.c common/io_priority.cc common/Clock.cc \
+ common/Throttle.cc common/Timer.cc common/Finisher.cc \
+ common/environment.cc common/assert.cc common/run_cmd.cc \
+ common/WorkQueue.cc common/ConfUtils.cc common/MemoryModel.cc \
+ common/armor.c common/fd.cc common/xattr.c common/safe_io.c \
common/snap_types.cc common/str_list.cc common/str_map.cc \
common/errno.cc common/RefCountedObj.cc common/blkdev.cc \
common/common_init.cc common/pipe.c common/ceph_argparse.cc \
@@ -5765,6 +5766,8 @@ common/cmdparse.lo: common/$(am__dirstamp) \
common/$(DEPDIR)/$(am__dirstamp)
common/escape.lo: common/$(am__dirstamp) \
common/$(DEPDIR)/$(am__dirstamp)
+common/io_priority.lo: common/$(am__dirstamp) \
+ common/$(DEPDIR)/$(am__dirstamp)
common/Clock.lo: common/$(am__dirstamp) \
common/$(DEPDIR)/$(am__dirstamp)
common/Throttle.lo: common/$(am__dirstamp) \
@@ -7614,6 +7617,8 @@ common/test_build_libcommon-cmdparse.$(OBJEXT): \
common/$(am__dirstamp) common/$(DEPDIR)/$(am__dirstamp)
common/test_build_libcommon-escape.$(OBJEXT): common/$(am__dirstamp) \
common/$(DEPDIR)/$(am__dirstamp)
+common/test_build_libcommon-io_priority.$(OBJEXT): \
+ common/$(am__dirstamp) common/$(DEPDIR)/$(am__dirstamp)
common/test_build_libcommon-Clock.$(OBJEXT): common/$(am__dirstamp) \
common/$(DEPDIR)/$(am__dirstamp)
common/test_build_libcommon-Throttle.$(OBJEXT): \
@@ -8634,6 +8639,8 @@ mostlyclean-compile:
-rm -f common/histogram.lo
-rm -f common/hobject.$(OBJEXT)
-rm -f common/hobject.lo
+ -rm -f common/io_priority.$(OBJEXT)
+ -rm -f common/io_priority.lo
-rm -f common/ipaddr.$(OBJEXT)
-rm -f common/ipaddr.lo
-rm -f common/libcommon_crc_la-crc32c.$(OBJEXT)
@@ -8735,6 +8742,7 @@ mostlyclean-compile:
-rm -f common/test_build_libcommon-hex.$(OBJEXT)
-rm -f common/test_build_libcommon-histogram.$(OBJEXT)
-rm -f common/test_build_libcommon-hobject.$(OBJEXT)
+ -rm -f common/test_build_libcommon-io_priority.$(OBJEXT)
-rm -f common/test_build_libcommon-ipaddr.$(OBJEXT)
-rm -f common/test_build_libcommon-linux_version.$(OBJEXT)
-rm -f common/test_build_libcommon-lockdep.$(OBJEXT)
@@ -9666,6 +9674,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/hex.Plo at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/histogram.Plo at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/hobject.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/io_priority.Plo at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/ipaddr.Plo at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/libcommon_crc_la-crc32c.Plo at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/libcommon_crc_la-crc32c_intel_baseline.Plo at am__quote@
@@ -9743,6 +9752,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/test_build_libcommon-hex.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/test_build_libcommon-histogram.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/test_build_libcommon-hobject.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/test_build_libcommon-io_priority.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/test_build_libcommon-ipaddr.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/test_build_libcommon-linux_version.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at common/$(DEPDIR)/test_build_libcommon-lockdep.Po at am__quote@
@@ -12855,6 +12865,20 @@ common/test_build_libcommon-cmdparse.obj: common/cmdparse.cc
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -c -o common/test_build_libcommon-cmdparse.obj `if test -f 'common/cmdparse.cc'; then $(CYGPATH_W) 'common/cmdparse.cc'; else $(CYGPATH_W) '$(srcdir)/common/cmdparse.cc'; fi`
+common/test_build_libcommon-io_priority.o: common/io_priority.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -MT common/test_build_libcommon-io_priority.o -MD -MP -MF common/$(DEPDIR)/test_build_libcommon-io_priority.Tpo -c -o common/test_build_libcommon-io_priority.o `test -f 'common/io_priority.cc' || echo '$(srcdir)/'`common/io_priority.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) common/$(DEPDIR)/test_build_libcommon-io_priority.Tpo common/$(DEPDIR)/test_build_libcommon-io_priority.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='common/io_priority.cc' object='common/test_build_libcommon-io_priority.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -c -o common/test_build_libcommon-io_priority.o `test -f 'common/io_priority.cc' || echo '$(srcdir)/'`common/io_priority.cc
+
+common/test_build_libcommon-io_priority.obj: common/io_priority.cc
+ at am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -MT common/test_build_libcommon-io_priority.obj -MD -MP -MF common/$(DEPDIR)/test_build_libcommon-io_priority.Tpo -c -o common/test_build_libcommon-io_priority.obj `if test -f 'common/io_priority.cc'; then $(CYGPATH_W) 'common/io_priority.cc'; else $(CYGPATH_W) '$(srcdir)/common/io_priority.cc'; fi`
+ at am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) common/$(DEPDIR)/test_build_libcommon-io_priority.Tpo common/$(DEPDIR)/test_build_libcommon-io_priority.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='common/io_priority.cc' object='common/test_build_libcommon-io_priority.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@ $(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -c -o common/test_build_libcommon-io_priority.obj `if test -f 'common/io_priority.cc'; then $(CYGPATH_W) 'common/io_priority.cc'; else $(CYGPATH_W) '$(srcdir)/common/io_priority.cc'; fi`
+
common/test_build_libcommon-Clock.o: common/Clock.cc
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -MT common/test_build_libcommon-Clock.o -MD -MP -MF common/$(DEPDIR)/test_build_libcommon-Clock.Tpo -c -o common/test_build_libcommon-Clock.o `test -f 'common/Clock.cc' || echo '$(srcdir)/'`common/Clock.cc
@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) common/$(DEPDIR)/test_build_libcommon-Clock.Tpo common/$(DEPDIR)/test_build_libcommon-Clock.Po
diff --git a/src/acconfig.h.in b/src/acconfig.h.in
index 165c967..bed7a05 100644
--- a/src/acconfig.h.in
+++ b/src/acconfig.h.in
@@ -361,6 +361,9 @@
your system. */
#undef PTHREAD_CREATE_JOINABLE
+/* The size of `AO_t', as computed by sizeof. */
+#undef SIZEOF_AO_T
+
/* Define to 1 if you have the ANSI C header files. */
#undef STDC_HEADERS
diff --git a/src/ceph-disk b/src/ceph-disk
index c67f2f3..5d6071d 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -119,6 +119,9 @@ STATEDIR = '/var/lib/ceph'
SYSCONFDIR = '/etc/ceph'
+# only warn once about some things
+warned_about = {}
+
# Nuke the TERM variable to avoid confusing any subprocesses we call.
# For example, libreadline will print weird control sequences for some
# TERM values.
@@ -131,8 +134,6 @@ if LOG_NAME == '__main__':
LOG = logging.getLogger(LOG_NAME)
-
-
###### lock ########
class filelock(object):
@@ -150,8 +151,10 @@ class filelock(object):
fcntl.lockf(self.fd, fcntl.LOCK_UN)
self.fd = None
+
###### exceptions ########
+
class Error(Exception):
"""
Error
@@ -161,36 +164,43 @@ class Error(Exception):
doc = self.__doc__.strip()
return ': '.join([doc] + [str(a) for a in self.args])
+
class MountError(Error):
"""
Mounting filesystem failed
"""
+
class UnmountError(Error):
"""
Unmounting filesystem failed
"""
+
class BadMagicError(Error):
"""
Does not look like a Ceph OSD, or incompatible version
"""
+
class TruncatedLineError(Error):
"""
Line is truncated
"""
+
class TooManyLinesError(Error):
"""
Too many lines
"""
+
class FilesystemTypeError(Error):
"""
Cannot discover filesystem type
"""
+
class CephDiskException(Exception):
"""
A base exception for ceph-disk to provide custom (ad-hoc) messages that
@@ -198,12 +208,14 @@ class CephDiskException(Exception):
"""
pass
+
class ExecutableNotFound(CephDiskException):
"""
Exception to report on executables not available in PATH
"""
pass
+
####### utils
@@ -300,7 +312,7 @@ def command_check_call(arguments):
otherwise.
"""
arguments = _get_command_executable(arguments)
- LOG.info('Running command: %s' % ' '.join(arguments))
+ LOG.info('Running command: %s', ' '.join(arguments))
return subprocess.check_call(arguments)
@@ -340,26 +352,35 @@ def platform_information():
)
-# a device "name" is something like
-# sdb
-# cciss!c0d1
def get_dev_name(path):
"""
- get device name from path. e.g., /dev/sda -> sdas, /dev/cciss/c0d1 -> cciss!c0d1
+ get device name from path. e.g.::
+
+ /dev/sda -> sdas, /dev/cciss/c0d1 -> cciss!c0d1
+
+ a device "name" is something like::
+
+ sdb
+ cciss!c0d1
+
"""
assert path.startswith('/dev/')
base = path[5:]
return base.replace('/', '!')
-# a device "path" is something like
-# /dev/sdb
-# /dev/cciss/c0d1
+
def get_dev_path(name):
"""
get a path (/dev/...) from a name (cciss!c0d1)
+ a device "path" is something like::
+
+ /dev/sdb
+ /dev/cciss/c0d1
+
"""
return '/dev/' + name.replace('!', '/')
+
def get_dev_relpath(name):
"""
get a relative path to /dev from a name (cciss!c0d1)
@@ -367,6 +388,29 @@ def get_dev_relpath(name):
return name.replace('!', '/')
+def get_dev_size(dev, size='megabytes'):
+ """
+ Attempt to get the size of a device so that we can prevent errors
+ from actions to devices that are smaller, and improve error reporting.
+
+ Because we want to avoid breakage in case this approach is not robust, we
+ will issue a warning if we failed to get the size.
+
+ :param size: bytes or megabytes
+ :param dev: the device to calculate the size
+ """
+ fd = os.open(dev, os.O_RDONLY)
+ dividers = {'bytes': 1, 'megabytes': 1024*1024}
+ try:
+ device_size = os.lseek(fd, 0, os.SEEK_END)
+ divider = dividers.get(size, 1024*1024) # default to megabytes
+ return device_size/divider
+ except Exception as error:
+ LOG.warning('failed to get size of %s: %s' % (dev, str(error)))
+ finally:
+ os.close(fd)
+
+
def get_partition_dev(dev, pnum):
"""
get the device name for a partition
@@ -389,6 +433,7 @@ def get_partition_dev(dev, pnum):
else:
raise Error('partition %d for %s does not appear to exist' % (pnum, dev))
+
def list_all_partitions():
"""
Return a list of devices and partitions
@@ -403,6 +448,7 @@ def list_all_partitions():
dev_part_list[name] = list_partitions(name)
return dev_part_list
+
def list_partitions(basename):
"""
Return a list of partitions on the given device name
@@ -413,6 +459,23 @@ def list_partitions(basename):
partitions.append(name)
return partitions
+def get_partition_base(dev):
+ """
+ Get the base device for a partition
+ """
+ dev = os.path.realpath(dev)
+ if not stat.S_ISBLK(os.lstat(dev).st_mode):
+ raise Error('not a block device', dev)
+
+ name = get_dev_name(dev)
+ if os.path.exists(os.path.join('/sys/block', name)):
+ raise Error('not a partition', dev)
+
+ # find the base
+ for basename in os.listdir('/sys/block'):
+ if os.path.exists(os.path.join('/sys/block', basename, name)):
+ return '/dev/' + basename
+ raise Error('no parent device for partition', dev)
def is_partition(dev):
"""
@@ -476,7 +539,7 @@ def is_held(dev):
return []
-def verify_not_in_use(dev):
+def verify_not_in_use(dev, check_partitions=False):
"""
Verify if a given device (path) is in use (e.g. mounted or
in use by device-mapper).
@@ -484,13 +547,13 @@ def verify_not_in_use(dev):
:raises: Error if device is in use.
"""
assert os.path.exists(dev)
- if is_partition(dev):
- if is_mounted(dev):
- raise Error('Device is mounted', dev)
- holders = is_held(dev)
- if holders:
- raise Error('Device is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders))
- else:
+ if is_mounted(dev):
+ raise Error('Device is mounted', dev)
+ holders = is_held(dev)
+ if holders:
+ raise Error('Device is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders))
+
+ if check_partitions and not is_partition(dev):
basename = get_dev_name(os.path.realpath(dev))
for partname in list_partitions(basename):
partition = get_dev_path(partname)
@@ -536,10 +599,12 @@ def read_one_line(parent, name):
try:
line = must_be_one_line(line)
except (TruncatedLineError, TooManyLinesError) as e:
- raise Error('File is corrupt: {path}: {msg}'.format(
+ raise Error(
+ 'File is corrupt: {path}: {msg}'.format(
path=path,
msg=e,
- ))
+ )
+ )
return line
@@ -746,7 +811,7 @@ def dmcrypt_map(
:return: Path to the dmcrypt device.
"""
- dev = '/dev/mapper/'+ _uuid
+ dev = '/dev/mapper/' + _uuid
args = [
'cryptsetup',
'--key-file',
@@ -792,6 +857,12 @@ def mount(
Mounts a device with given filessystem type and
mount options to a tempfile path under /var/lib/ceph/tmp.
"""
+ # sanity check: none of the arguments are None
+ if dev is None:
+ raise ValueError('dev may not be None')
+ if fstype is None:
+ raise ValueError('fstype may not be None')
+
# pick best-of-breed mount options based on fs type
if options is None:
options = MOUNT_OPTIONS.get(fstype, '')
@@ -967,6 +1038,15 @@ def prepare_journal_dev(
)
LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
+ dev_size = get_dev_size(journal)
+
+ if journal_size > dev_size:
+ LOG.error('refusing to create journal on %s' % journal)
+ LOG.error('journal size (%sM) is bigger than device (%sM)' % (journal_size, dev_size))
+ raise Error(
+ '%s device size (%sM) is not big enough for journal' % (journal, dev_size)
+ )
+
try:
LOG.debug('Creating journal partition num %d size %d on %s', num, journal_size, journal)
command_check_call(
@@ -1044,7 +1124,7 @@ def prepare_journal_file(
if not os.path.exists(journal):
LOG.debug('Creating journal file %s with size 0 (ceph-osd will resize and allocate)', journal)
- with file(journal, 'wb') as journal_file:
+ with file(journal, 'wb') as journal_file: # noqa
pass
LOG.debug('Journal is file %s', journal)
@@ -1110,13 +1190,14 @@ def adjust_symlink(target, path):
except:
raise Error('unable to create symlink %s -> %s' % (path, target))
+
def prepare_dir(
path,
journal,
cluster_uuid,
osd_uuid,
journal_uuid,
- journal_dmcrypt = None,
+ journal_dmcrypt=None,
):
if os.path.exists(os.path.join(path, 'magic')):
@@ -1183,9 +1264,6 @@ def prepare_dev(
LOG.debug('OSD data device %s is a partition', data)
rawdev = data
else:
- if journal_dmcrypt is not None:
- dmcrypt_unmap(journal)
-
LOG.debug('Creating osd partition on %s', data)
try:
command_check_call(
@@ -1238,9 +1316,9 @@ def prepare_dev(
else:
args.extend(MKFS_ARGS.get(fstype, []))
args.extend([
- '--',
- dev,
- ])
+ '--',
+ dev,
+ ])
try:
LOG.debug('Creating %s fs on %s', fstype, dev)
command_check_call(args)
@@ -1267,8 +1345,6 @@ def prepare_dev(
finally:
if rawdev != dev:
dmcrypt_unmap(osd_uuid)
- if journal_dmcrypt is not None:
- dmcrypt_unmap(journal)
if not is_partition(data):
try:
@@ -1289,7 +1365,7 @@ def main_prepare(args):
osd_dm_keypath = None
try:
- prepare_lock.acquire()
+ prepare_lock.acquire() # noqa
if not os.path.exists(args.data):
if args.data_dev:
raise Error('data path does not exist', args.data)
@@ -1299,12 +1375,12 @@ def main_prepare(args):
# in use?
dmode = os.stat(args.data).st_mode
if stat.S_ISBLK(dmode):
- verify_not_in_use(args.data)
+ verify_not_in_use(args.data, True)
if args.journal and os.path.exists(args.journal):
jmode = os.stat(args.journal).st_mode
if stat.S_ISBLK(jmode):
- verify_not_in_use(args.journal)
+ verify_not_in_use(args.journal, False)
if args.zap_disk is not None:
if stat.S_ISBLK(dmode) and not is_partition(args.data):
@@ -1421,7 +1497,7 @@ def main_prepare(args):
)
else:
raise Error('not a dir or block device', args.data)
- prepare_lock.release()
+ prepare_lock.release() # noqa
if stat.S_ISBLK(dmode):
# try to make sure the kernel refreshes the table. note
@@ -1457,7 +1533,7 @@ def main_prepare(args):
os.unlink(journal_dm_keypath)
if osd_dm_keypath:
os.unlink(osd_dm_keypath)
- prepare_lock.release()
+ prepare_lock.release() # noqa
raise e
@@ -1623,18 +1699,21 @@ def start_daemon(
[
svc,
'ceph',
+ '--cluster',
+ '{cluster}'.format(cluster=cluster),
'start',
'osd.{osd_id}'.format(osd_id=osd_id),
],
)
else:
raise Error('{cluster} osd.{osd_id} is not tagged with an init system'.format(
- cluster=cluster,
- osd_id=osd_id,
- ))
+ cluster=cluster,
+ osd_id=osd_id,
+ ))
except subprocess.CalledProcessError as e:
raise Error('ceph osd start failed', e)
+
def detect_fstype(
dev,
):
@@ -1704,8 +1783,8 @@ def mount_activate(
src_dev = os.stat(path).st_dev
try:
dst_dev = os.stat((STATEDIR + '/osd/{cluster}-{osd_id}').format(
- cluster=cluster,
- osd_id=osd_id)).st_dev
+ cluster=cluster,
+ osd_id=osd_id)).st_dev
if src_dev == dst_dev:
active = True
else:
@@ -1760,7 +1839,7 @@ def activate_dir(
(osd_id, cluster) = activate(path, activate_key_template, init)
- if init not in ( None, 'none' ):
+ if init not in (None, 'none' ):
canonical = (STATEDIR + '/osd/{cluster}-{osd_id}').format(
cluster=cluster,
osd_id=osd_id)
@@ -1815,6 +1894,7 @@ def find_cluster_by_uuid(_uuid):
return 'ceph'
return None
+
def activate(
path,
activate_key_template,
@@ -1861,7 +1941,7 @@ def activate(
keyring=keyring,
)
- if init not in ( None, 'none' ):
+ if init not in (None, 'none' ):
if init == 'auto':
conf_val = get_conf(
cluster=cluster,
@@ -1912,7 +1992,7 @@ def main_activate(args):
LOG.info('suppressed activate request on %s', args.path)
return
- activate_lock.acquire()
+ activate_lock.acquire() # noqa
try:
mode = os.stat(args.path).st_mode
if stat.S_ISBLK(mode):
@@ -1932,7 +2012,7 @@ def main_activate(args):
if args.mark_init == 'none':
command_check_call(
[
- 'ceph-osd',
+ 'ceph-osd',
'--cluster={cluster}'.format(cluster=cluster),
'--id={osd_id}'.format(osd_id=osd_id),
'--osd-data={path}'.format(path=args.path),
@@ -1943,7 +2023,7 @@ def main_activate(args):
else:
raise Error('%s is not a directory or block device' % args.path)
- if args.mark_init not in ( None, 'none' ):
+ if args.mark_init not in (None, 'none' ):
start_daemon(
cluster=cluster,
@@ -1951,7 +2031,7 @@ def main_activate(args):
)
finally:
- activate_lock.release()
+ activate_lock.release() # noqa
###########################
@@ -1984,6 +2064,7 @@ def get_journal_osd_uuid(path):
LOG.debug('Journal %s has OSD UUID %s', path, value)
return value
+
def main_activate_journal(args):
if not os.path.exists(args.dev):
raise Error('%s does not exist' % args.dev)
@@ -1991,7 +2072,7 @@ def main_activate_journal(args):
cluster = None
osd_id = None
osd_uuid = None
- activate_lock.acquire()
+ activate_lock.acquire() # noqa
try:
osd_uuid = get_journal_osd_uuid(args.dev)
path = os.path.join('/dev/disk/by-partuuid/', osd_uuid.lower())
@@ -2008,10 +2089,12 @@ def main_activate_journal(args):
)
finally:
- activate_lock.release()
+ activate_lock.release() # noqa
+
###########################
+
def main_activate_all(args):
dir = '/dev/disk/by-parttypeuuid'
LOG.debug('Scanning %s', dir)
@@ -2022,10 +2105,16 @@ def main_activate_all(args):
if name.find('.') < 0:
continue
(tag, uuid) = name.split('.')
- if tag == OSD_UUID:
- path = os.path.join(dir, name)
+
+ if tag == OSD_UUID or tag == DMCRYPT_OSD_UUID:
+
+ if tag == DMCRYPT_OSD_UUID:
+ path = os.path.join('/dev/mapper', uuid)
+ else:
+ path = os.path.join(dir, name)
+
LOG.info('Activating %s', path)
- activate_lock.acquire()
+ activate_lock.acquire() # noqa
try:
(cluster, osd_id) = mount_activate(
dev=path,
@@ -2045,7 +2134,7 @@ def main_activate_all(args):
err = True
finally:
- activate_lock.release()
+ activate_lock.release() # noqa
if err:
raise Error('One or more partitions failed to activate')
@@ -2066,6 +2155,7 @@ def is_swap(dev):
return True
return False
+
def get_oneliner(base, name):
path = os.path.join(base, name)
if os.path.isfile(path):
@@ -2073,6 +2163,7 @@ def get_oneliner(base, name):
return _file.readline().rstrip()
return None
+
def get_dev_fs(dev):
fscheck, _ = command(
[
@@ -2088,7 +2179,56 @@ def get_dev_fs(dev):
else:
return None
+
def get_partition_type(part):
+ """
+ Get the GPT partition type UUID. If we have an old blkid and can't
+ get it that way, use sgdisk and use the description instead (and hope
+ dmcrypt isn't being used).
+ """
+ blkid, _ = command(
+ [
+ 'blkid',
+ '-p',
+ '-o', 'udev',
+ part,
+ ]
+ )
+ saw_part_entry = False
+ for line in blkid.splitlines():
+ (key, value) = line.split('=')
+ if key == 'ID_PART_ENTRY_TYPE':
+ return value
+ if key == 'ID_PART_ENTRY_SCHEME':
+ table_type = value
+ if key.startswith('ID_PART_ENTRY_'):
+ saw_part_entry = True
+
+ # hmm, is it in fact GPT?
+ table_type = None
+ base = get_partition_base(part)
+ blkid, _ = command(
+ [
+ 'blkid',
+ '-p',
+ '-o', 'udev',
+ base
+ ]
+ )
+ for line in blkid.splitlines():
+ (key, value) = line.split('=')
+ if key == 'ID_PART_TABLE_TYPE':
+ table_type = value
+ if table_type != 'gpt':
+ return None # not even GPT
+
+ if saw_part_entry:
+ return None # GPT, and blkid appears to be new, so we're done.
+
+ # bah, fall back to sgdisk.
+ if 'blkid' not in warned_about:
+ LOG.warning('Old blkid does not support ID_PART_ENTRY_* fields, trying sgdisk; may not correctly identify ceph volumes with dmcrypt')
+ warned_about['blkid'] = True
(base, partnum) = re.match('(\D+)(\d+)', part).group(1, 2)
sgdisk, _ = command(
[
@@ -2104,9 +2244,16 @@ def get_partition_type(part):
num = m.group(1)
if num != partnum:
continue
- return m.group(2)
+ desc = m.group(2)
+ # assume unencrypted ... blkid has failed us :(
+ if desc == 'ceph data':
+ return OSD_UUID
+ if desc == 'ceph journal':
+ return JOURNAL_UUID
+
return None
+
def get_partition_uuid(dev):
(base, partnum) = re.match('(\D+)(\d+)', dev).group(1, 2)
out, _ = command(['sgdisk', '-i', partnum, base])
@@ -2116,6 +2263,7 @@ def get_partition_uuid(dev):
return m.group(1).lower()
return None
+
def more_osd_info(path, uuid_map):
desc = []
ceph_fsid = get_oneliner(path, 'ceph_fsid')
@@ -2138,6 +2286,27 @@ def more_osd_info(path, uuid_map):
return desc
+def list_dev_osd(dev, uuid_map):
+ path = is_mounted(dev)
+ fs_type = get_dev_fs(dev)
+ desc = []
+ if path:
+ desc.append('active')
+ desc.extend(more_osd_info(path, uuid_map))
+ elif fs_type:
+ try:
+ tpath = mount(dev=dev, fstype=fs_type, options='')
+ if tpath:
+ try:
+ magic = get_oneliner(tpath, 'magic')
+ if magic is not None:
+ desc.append('prepared')
+ desc.extend(more_osd_info(tpath, uuid_map))
+ finally:
+ unmount(tpath)
+ except MountError:
+ pass
+ return desc
def list_dev(dev, uuid_map, journal_map):
ptype = 'unknown'
@@ -2145,37 +2314,41 @@ def list_dev(dev, uuid_map, journal_map):
if is_partition(dev):
ptype = get_partition_type(dev)
prefix = ' '
- fs_type = get_dev_fs(dev)
- path = is_mounted(dev)
desc = []
- if ptype == 'ceph data':
- if path:
- desc.append('active')
- desc.extend(more_osd_info(path, uuid_map))
- elif fs_type:
- try:
- tpath = mount(dev=dev, fstype=fs_type, options='')
- if tpath:
- try:
- magic = get_oneliner(tpath, 'magic')
- if magic is not None:
- desc.append('prepared')
- desc.extend(more_osd_info(tpath, uuid_map))
- finally:
- unmount(tpath)
- except MountError:
- pass
+ if ptype == OSD_UUID:
+ desc = list_dev_osd(dev, uuid_map)
if desc:
desc = ['ceph data'] + desc
else:
desc = ['ceph data', 'unprepared']
- elif ptype == 'ceph journal':
+ elif ptype == DMCRYPT_OSD_UUID:
+ holders = is_held(dev)
+ if not holders:
+ desc = ['ceph data (dmcrypt)', 'not currently mapped']
+ elif len(holders) == 1:
+ holder = '/dev/' + holders[0]
+ fs_desc = list_dev_osd(holder, uuid_map)
+ desc = ['ceph data (dmcrypt %s)' % holder] + fs_desc
+ else:
+ desc = ['ceph data (dmcrypt)', 'holders: ' + ','.join(holders)]
+ elif ptype == JOURNAL_UUID:
desc.append('ceph journal')
part_uuid = get_partition_uuid(dev)
if part_uuid and part_uuid in journal_map:
desc.append('for %s' % journal_map[part_uuid])
+ elif ptype == DMCRYPT_JOURNAL_UUID:
+ holders = is_held(dev)
+ if len(holders) == 1:
+ desc = ['ceph journal (dmcrypt /dev/%s)' % holders[0]]
+ else:
+ desc = ['ceph journal (dmcrypt)']
+ part_uuid = get_partition_uuid(dev)
+ if part_uuid and part_uuid in journal_map:
+ desc.append('for %s' % journal_map[part_uuid])
else:
+ path = is_mounted(dev)
+ fs_type = get_dev_fs(dev)
if is_swap(dev):
desc.append('swap')
else:
@@ -2190,7 +2363,6 @@ def list_dev(dev, uuid_map, journal_map):
print '%s%s %s' % (prefix, dev, ', '.join(desc))
-
def main_list(args):
partmap = list_all_partitions()
@@ -2203,18 +2375,35 @@ def main_list(args):
if part_uuid:
uuid_map[part_uuid] = dev
ptype = get_partition_type(dev)
- if ptype == 'ceph data':
+ if ptype == OSD_UUID:
fs_type = get_dev_fs(dev)
- try:
- tpath = mount(dev=dev, fstype=fs_type, options='')
+ if fs_type is not None:
try:
- journal_uuid = get_oneliner(tpath, 'journal_uuid')
- if journal_uuid:
- journal_map[journal_uuid.lower()] = dev
- finally:
- unmount(tpath)
- except MountError:
- pass
+ tpath = mount(dev=dev, fstype=fs_type, options='')
+ try:
+ journal_uuid = get_oneliner(tpath, 'journal_uuid')
+ if journal_uuid:
+ journal_map[journal_uuid.lower()] = dev
+ finally:
+ unmount(tpath)
+ except MountError:
+ pass
+ if ptype == DMCRYPT_OSD_UUID:
+ holders = is_held(dev)
+ if len(holders) == 1:
+ holder = '/dev/' + holders[0]
+ fs_type = get_dev_fs(holder)
+ if fs_type is not None:
+ try:
+ tpath = mount(dev=holder, fstype=fs_type, options='')
+ try:
+ journal_uuid = get_oneliner(tpath, 'journal_uuid')
+ if journal_uuid:
+ journal_map[journal_uuid.lower()] = dev
+ finally:
+ unmount(tpath)
+ except MountError:
+ pass
for base, parts in sorted(partmap.iteritems()):
if parts:
@@ -2244,12 +2433,13 @@ def is_suppressed(path):
return False
base = get_dev_name(disk)
while len(base):
- if os.path.exists(SUPPRESS_PREFIX + base):
+ if os.path.exists(SUPPRESS_PREFIX + base): # noqa
return True
base = base[:-1]
except:
return False
+
def set_suppress(path):
disk = os.path.realpath(path)
if not os.path.exists(disk):
@@ -2258,10 +2448,11 @@ def set_suppress(path):
raise Error('not a block device', path)
base = get_dev_name(disk)
- with file(SUPPRESS_PREFIX + base, 'w') as f:
+ with file(SUPPRESS_PREFIX + base, 'w') as f: # noqa
pass
LOG.info('set suppress flag on %s', base)
+
def unset_suppress(path):
disk = os.path.realpath(path)
if not os.path.exists(disk):
@@ -2271,7 +2462,7 @@ def unset_suppress(path):
assert disk.startswith('/dev/')
base = get_dev_name(disk)
- fn = SUPPRESS_PREFIX + base
+ fn = SUPPRESS_PREFIX + base # noqa
if not os.path.exists(fn):
raise Error('not marked as suppressed', path)
@@ -2285,16 +2476,22 @@ def unset_suppress(path):
def main_suppress(args):
set_suppress(args.path)
+
def main_unsuppress(args):
unset_suppress(args.path)
+
def main_zap(args):
for dev in args.dev:
zap(dev)
###########################
+
def setup_statedir(dir):
+ # XXX The following use of globals makes linting
+ # really hard. Global state in Python is iffy and
+ # should be avoided.
global STATEDIR
STATEDIR = dir
@@ -2312,10 +2509,12 @@ def setup_statedir(dir):
global SUPPRESS_PREFIX
SUPPRESS_PREFIX = STATEDIR + '/tmp/suppress-activate.'
+
def setup_sysconfdir(dir):
global SYSCONFDIR
SYSCONFDIR = dir
+
def parse_args():
parser = argparse.ArgumentParser(
'ceph-disk',
@@ -2589,3 +2788,4 @@ def main():
if __name__ == '__main__':
main()
+ warned_about = {}
diff --git a/src/ceph.in b/src/ceph.in
index 0978882..82c9085 100755
--- a/src/ceph.in
+++ b/src/ceph.in
@@ -106,6 +106,14 @@ def mdsids():
l.append(mdsdict['name'])
return l
+# these args must be passed to all child programs
+GLOBAL_ARGS = {
+ 'client_id': '--id',
+ 'client_name': '--name',
+ 'cluster': '--cluster',
+ 'cephconf': '--conf',
+}
+
def parse_cmdargs(args=None, target=''):
# alias: let the line-wrapping be sane
AP = argparse.ArgumentParser
@@ -339,15 +347,23 @@ def admin_socket(asok_path, cmd, format=''):
return ret
-def ceph_conf(field, name):
+def ceph_conf(parsed_args, field, name):
+ args=['ceph-conf']
+
+ if name:
+ args.extend(['--name', name])
+
+ # add any args in GLOBAL_ARGS
+ for key, val in GLOBAL_ARGS.iteritems():
+ # ignore name in favor of argument name, if any
+ if name and key == 'client_name':
+ continue
+ if getattr(parsed_args, key):
+ args.extend([val, getattr(parsed_args, key)])
+
+ args.extend(['--show-config-value', field])
p = subprocess.Popen(
- args=[
- 'ceph-conf',
- '--show-config-value',
- field,
- '-n',
- name,
- ],
+ args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
outdata, errdata = p.communicate()
@@ -538,7 +554,8 @@ def main():
else:
# try resolve daemon name
try:
- sockpath = ceph_conf('admin_socket', childargs[1])
+ sockpath = ceph_conf(parsed_args, 'admin_socket',
+ childargs[1])
except Exception as e:
print >> sys.stderr, \
'Can\'t get admin socket path: ' + str(e)
diff --git a/src/ceph_common.sh b/src/ceph_common.sh
index d78f831..07faddc 100644
--- a/src/ceph_common.sh
+++ b/src/ceph_common.sh
@@ -50,12 +50,13 @@ check_host() {
#echo host for $name is $host, i am $hostname
- if [ -e "/var/lib/ceph/$type/ceph-$id/upstart" ]; then
+ cluster=$1
+ if [ -e "/var/lib/ceph/$type/$cluster-$id/upstart" ]; then
return 1
fi
# sysvinit managed instance in standard location?
- if [ -e "/var/lib/ceph/$type/ceph-$id/sysvinit" ]; then
+ if [ -e "/var/lib/ceph/$type/$cluster-$id/sysvinit" ]; then
host="$hostname"
echo "=== $type.$id === "
return 0
diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc
index 4e84b4d..80b17a1 100644
--- a/src/ceph_mon.cc
+++ b/src/ceph_mon.cc
@@ -43,6 +43,8 @@ using namespace std;
#include "include/assert.h"
+#include "erasure-code/ErasureCodePlugin.h"
+
#define dout_subsys ceph_subsys_mon
Monitor *mon = NULL;
@@ -184,6 +186,21 @@ void usage()
generic_server_usage();
}
+int preload_erasure_code()
+{
+ string directory = g_conf->osd_pool_default_erasure_code_directory;
+ string plugins = g_conf->osd_erasure_code_plugins;
+ stringstream ss;
+ int r = ErasureCodePluginRegistry::instance().preload(plugins,
+ directory,
+ ss);
+ if (r)
+ derr << ss.str() << dendl;
+ else
+ dout(10) << ss.str() << dendl;
+ return r;
+}
+
int main(int argc, const char **argv)
{
int err;
@@ -406,8 +423,7 @@ int main(int argc, const char **argv)
// screwing us over
Preforker prefork;
if (!(flags & CINIT_FLAG_NO_DAEMON_ACTIONS)) {
- if (g_conf->daemonize) {
- global_init_prefork(g_ceph_context, 0);
+ if (global_init_prefork(g_ceph_context, 0) >= 0) {
prefork.prefork();
if (prefork.is_parent()) {
return prefork.parent_wait();
@@ -416,6 +432,8 @@ int main(int argc, const char **argv)
}
common_init_finish(g_ceph_context);
global_init_chdir(g_ceph_context);
+ if (preload_erasure_code() < -1)
+ prefork.exit(1);
}
MonitorDBStore *store = new MonitorDBStore(g_conf->mon_data);
diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc
index 029ef28..a2f4542 100644
--- a/src/ceph_osd.cc
+++ b/src/ceph_osd.cc
@@ -48,6 +48,8 @@ using namespace std;
#include "include/assert.h"
+#include "erasure-code/ErasureCodePlugin.h"
+
#define dout_subsys ceph_subsys_osd
OSD *osd = NULL;
@@ -66,6 +68,21 @@ void usage()
generic_server_usage();
}
+int preload_erasure_code()
+{
+ string directory = g_conf->osd_pool_default_erasure_code_directory;
+ string plugins = g_conf->osd_erasure_code_plugins;
+ stringstream ss;
+ int r = ErasureCodePluginRegistry::instance().preload(plugins,
+ directory,
+ ss);
+ if (r)
+ derr << ss.str() << dendl;
+ else
+ dout(10) << ss.str() << dendl;
+ return r;
+}
+
int main(int argc, const char **argv)
{
vector<const char*> args;
@@ -451,6 +468,9 @@ int main(int argc, const char **argv)
return -1;
global_init_chdir(g_ceph_context);
+ if (preload_erasure_code() < -1)
+ return -1;
+
osd = new OSD(g_ceph_context,
store,
whoami,
diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc
index cf301f7..7a15a90 100644
--- a/src/cls/rgw/cls_rgw.cc
+++ b/src/cls/rgw/cls_rgw.cc
@@ -670,7 +670,7 @@ int rgw_bucket_complete_op(cls_method_context_t hctx, bufferlist *in, bufferlist
unaccount_entry(header, remove_entry);
if (op.log_op) {
- rc = log_index_operation(hctx, op.name, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime,
+ rc = log_index_operation(hctx, remove_oid_name, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime,
remove_entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker);
if (rc < 0)
continue;
diff --git a/src/common/Finisher.h b/src/common/Finisher.h
index 173b924..610470e 100644
--- a/src/common/Finisher.h
+++ b/src/common/Finisher.h
@@ -77,6 +77,15 @@ class Finisher {
if (logger)
logger->inc(l_finisher_queue_len);
}
+ void queue(list<Context*>& ls) {
+ finisher_lock.Lock();
+ finisher_queue.insert(finisher_queue.end(), ls.begin(), ls.end());
+ finisher_cond.Signal();
+ finisher_lock.Unlock();
+ ls.clear();
+ if (logger)
+ logger->inc(l_finisher_queue_len);
+ }
void start();
void stop();
diff --git a/src/common/LogClient.cc b/src/common/LogClient.cc
index 1e290b1..e4536c7 100644
--- a/src/common/LogClient.cc
+++ b/src/common/LogClient.cc
@@ -124,6 +124,7 @@ bool LogClient::are_pending()
Message *LogClient::_get_mon_log_message()
{
+ assert(log_lock.is_locked());
if (log_queue.empty())
return NULL;
@@ -149,7 +150,7 @@ Message *LogClient::_get_mon_log_message()
assert(num_unsent <= log_queue.size());
std::deque<LogEntry>::iterator p = log_queue.begin();
std::deque<LogEntry> o;
- while (p->seq < last_log_sent) {
+ while (p->seq <= last_log_sent) {
++p;
assert(p != log_queue.end());
}
diff --git a/src/common/Makefile.am b/src/common/Makefile.am
index 9769e2f..69e5ad3 100644
--- a/src/common/Makefile.am
+++ b/src/common/Makefile.am
@@ -13,6 +13,7 @@ libcommon_la_SOURCES = \
common/admin_socket_client.cc \
common/cmdparse.cc \
common/escape.c \
+ common/io_priority.cc \
common/Clock.cc \
common/Throttle.cc \
common/Timer.cc \
@@ -156,6 +157,7 @@ noinst_HEADERS += \
common/OutputDataSocket.h \
common/admin_socket.h \
common/admin_socket_client.h \
+ common/random_cache.hpp \
common/shared_cache.hpp \
common/tracked_int_ptr.hpp \
common/simple_cache.hpp \
@@ -175,6 +177,7 @@ noinst_HEADERS += \
common/arch.h \
common/armor.h \
common/common_init.h \
+ common/io_priority.h \
common/pipe.h \
common/code_environment.h \
common/signal.h \
diff --git a/src/common/Thread.cc b/src/common/Thread.cc
index 0f4e322..7be0013 100644
--- a/src/common/Thread.cc
+++ b/src/common/Thread.cc
@@ -16,6 +16,7 @@
#include "common/code_environment.h"
#include "common/debug.h"
#include "common/signal.h"
+#include "common/io_priority.h"
#include <dirent.h>
#include <errno.h>
@@ -29,7 +30,10 @@
Thread::Thread()
- : thread_id(0)
+ : thread_id(0),
+ pid(0),
+ ioprio_class(-1),
+ ioprio_priority(-1)
{
}
@@ -38,10 +42,24 @@ Thread::~Thread()
}
void *Thread::_entry_func(void *arg) {
- void *r = ((Thread*)arg)->entry();
+ void *r = ((Thread*)arg)->entry_wrapper();
return r;
}
+void *Thread::entry_wrapper()
+{
+ int p = ceph_gettid(); // may return -ENOSYS on other platforms
+ if (p > 0)
+ pid = p;
+ if (ioprio_class >= 0 &&
+ ioprio_priority >= 0) {
+ ceph_ioprio_set(IOPRIO_WHO_PROCESS,
+ pid,
+ IOPRIO_PRIO_VALUE(ioprio_class, ioprio_priority));
+ }
+ return entry();
+}
+
const pthread_t &Thread::get_thread_id()
{
return thread_id;
@@ -128,3 +146,15 @@ int Thread::detach()
{
return pthread_detach(thread_id);
}
+
+int Thread::set_ioprio(int cls, int prio)
+{
+ // fixme, maybe: this can race with create()
+ ioprio_class = cls;
+ ioprio_priority = prio;
+ if (pid && cls >= 0 && prio >= 0)
+ return ceph_ioprio_set(IOPRIO_WHO_PROCESS,
+ pid,
+ IOPRIO_PRIO_VALUE(cls, prio));
+ return 0;
+}
diff --git a/src/common/Thread.h b/src/common/Thread.h
index 4bc0254..95f63b4 100644
--- a/src/common/Thread.h
+++ b/src/common/Thread.h
@@ -21,6 +21,10 @@
class Thread {
private:
pthread_t thread_id;
+ pid_t pid;
+ int ioprio_class, ioprio_priority;
+
+ void *entry_wrapper();
public:
Thread(const Thread& other);
@@ -44,6 +48,7 @@ class Thread {
void create(size_t stacksize = 0);
int join(void **prval = 0);
int detach();
+ int set_ioprio(int cls, int prio);
};
#endif
diff --git a/src/common/WorkQueue.cc b/src/common/WorkQueue.cc
index f47435b..42f402f 100644
--- a/src/common/WorkQueue.cc
+++ b/src/common/WorkQueue.cc
@@ -16,6 +16,7 @@
#include "include/types.h"
#include "include/utime.h"
+#include "common/errno.h"
#include "WorkQueue.h"
#include "common/config.h"
@@ -33,6 +34,8 @@ ThreadPool::ThreadPool(CephContext *cct_, string nm, int n, const char *option)
_stop(false),
_pause(0),
_draining(0),
+ ioprio_class(-1),
+ ioprio_priority(-1),
_num_threads(n),
last_work_queue(0),
processing(0)
@@ -156,6 +159,11 @@ void ThreadPool::start_threads()
WorkThread *wt = new WorkThread(this);
ldout(cct, 10) << "start_threads creating and starting " << wt << dendl;
_threads.insert(wt);
+
+ int r = wt->set_ioprio(ioprio_class, ioprio_priority);
+ if (r < 0)
+ lderr(cct) << " set_ioprio got " << cpp_strerror(r) << dendl;
+
wt->create();
}
}
@@ -255,3 +263,16 @@ void ThreadPool::drain(WorkQueue_* wq)
_lock.Unlock();
}
+void ThreadPool::set_ioprio(int cls, int priority)
+{
+ Mutex::Locker l(_lock);
+ ioprio_class = cls;
+ ioprio_priority = priority;
+ for (set<WorkThread*>::iterator p = _threads.begin();
+ p != _threads.end();
+ ++p) {
+ int r = (*p)->set_ioprio(cls, priority);
+ if (r < 0)
+ lderr(cct) << " set_ioprio got " << cpp_strerror(r) << dendl;
+ }
+}
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
index 794b577..cbf49a8 100644
--- a/src/common/WorkQueue.h
+++ b/src/common/WorkQueue.h
@@ -33,6 +33,7 @@ class ThreadPool : public md_config_obs_t {
int _pause;
int _draining;
Cond _wait_cond;
+ int ioprio_class, ioprio_priority;
public:
class TPHandle {
@@ -388,6 +389,9 @@ public:
void unpause();
/// wait for all work to complete
void drain(WorkQueue_* wq = 0);
+
+ /// set io priority
+ void set_ioprio(int cls, int priority);
};
class GenContextWQ :
diff --git a/src/common/blkdev.cc b/src/common/blkdev.cc
index 9c7240c..8b19abb 100644
--- a/src/common/blkdev.cc
+++ b/src/common/blkdev.cc
@@ -10,7 +10,7 @@ int get_block_device_size(int fd, int64_t *psize)
{
#ifdef BLKGETSIZE64
int ret = ::ioctl(fd, BLKGETSIZE64, psize);
-#elif BLKGETSIZE
+#elif defined(BLKGETSIZE)
unsigned long sectors = 0;
int ret = ::ioctl(fd, BLKGETSIZE, §ors);
*psize = sectors * 512ULL;
diff --git a/src/common/config.cc b/src/common/config.cc
index 0ee7f58..23bfe35 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -389,12 +389,10 @@ int md_config_t::parse_argv(std::vector<const char*>& args)
}
else if (ceph_argparse_flag(args, i, "--foreground", "-f", (char*)NULL)) {
set_val_or_die("daemonize", "false");
- set_val_or_die("pid_file", "");
}
else if (ceph_argparse_flag(args, i, "-d", (char*)NULL)) {
set_val_or_die("daemonize", "false");
set_val_or_die("log_file", "");
- set_val_or_die("pid_file", "");
set_val_or_die("log_to_stderr", "true");
set_val_or_die("err_to_stderr", "true");
set_val_or_die("log_to_syslog", "false");
@@ -879,7 +877,7 @@ int md_config_t::set_val_raw(const char *val, const config_option *opt)
switch (opt->type) {
case OPT_INT: {
std::string err;
- int f = strict_strtol(val, 10, &err);
+ int f = strict_sistrtoll(val, &err);
if (!err.empty())
return -EINVAL;
*(int*)opt->conf_ptr(this) = f;
@@ -887,7 +885,7 @@ int md_config_t::set_val_raw(const char *val, const config_option *opt)
}
case OPT_LONGLONG: {
std::string err;
- long long f = strict_strtoll(val, 10, &err);
+ long long f = strict_sistrtoll(val, &err);
if (!err.empty())
return -EINVAL;
*(long long*)opt->conf_ptr(this) = f;
@@ -917,7 +915,7 @@ int md_config_t::set_val_raw(const char *val, const config_option *opt)
return 0;
case OPT_U32: {
std::string err;
- int f = strict_strtol(val, 10, &err);
+ int f = strict_sistrtoll(val, &err);
if (!err.empty())
return -EINVAL;
*(uint32_t*)opt->conf_ptr(this) = f;
@@ -925,7 +923,7 @@ int md_config_t::set_val_raw(const char *val, const config_option *opt)
}
case OPT_U64: {
std::string err;
- long long f = strict_strtoll(val, 10, &err);
+ long long f = strict_sistrtoll(val, &err);
if (!err.empty())
return -EINVAL;
*(uint64_t*)opt->conf_ptr(this) = f;
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index f8dd5f0..fe00c76 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -177,6 +177,7 @@ OPTION(mon_force_standby_active, OPT_BOOL, true) // should mons force standby-re
OPTION(mon_warn_on_old_mons, OPT_BOOL, true) // should mons set health to WARN if part of quorum is old?
OPTION(mon_warn_on_legacy_crush_tunables, OPT_BOOL, true) // warn if crush tunables are not optimal
OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0'
+OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL, true)
OPTION(mon_min_osdmap_epochs, OPT_INT, 500)
OPTION(mon_max_pgmap_epochs, OPT_INT, 500)
OPTION(mon_max_log_epochs, OPT_INT, 500)
@@ -434,6 +435,7 @@ OPTION(osd_pool_default_erasure_code_profile,
"k=2 "
"m=1 "
) // default properties of osd pool create
+OPTION(osd_erasure_code_plugins, OPT_STR, "jerasure") // list of erasure code plugins
OPTION(osd_pool_default_flags, OPT_INT, 0) // default flags for new pools
OPTION(osd_pool_default_flag_hashpspool, OPT_BOOL, true) // use new pg hashing to prevent pool/pg overlap
OPTION(osd_pool_default_hit_set_bloom_fpp, OPT_FLOAT, .05)
@@ -442,6 +444,7 @@ OPTION(osd_pool_default_cache_target_full_ratio, OPT_FLOAT, .8)
OPTION(osd_pool_default_cache_min_flush_age, OPT_INT, 0) // seconds
OPTION(osd_pool_default_cache_min_evict_age, OPT_INT, 0) // seconds
OPTION(osd_hit_set_min_size, OPT_INT, 1000) // min target size for a HitSet
+OPTION(osd_hit_set_max_size, OPT_INT, 100000) // max target size for a HitSet
OPTION(osd_hit_set_namespace, OPT_STR, ".ceph-internal") // rados namespace for hit_set tracking
OPTION(osd_tier_default_cache_mode, OPT_STR, "writeback")
@@ -450,6 +453,7 @@ OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200)
OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom")
OPTION(osd_map_dedup, OPT_BOOL, true)
+OPTION(osd_map_max_advance, OPT_INT, 200) // make this < cache_size!
OPTION(osd_map_cache_size, OPT_INT, 500)
OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message
OPTION(osd_map_share_max_epochs, OPT_INT, 100) // cap on # of inc maps we send to peers, clients
@@ -458,6 +462,8 @@ OPTION(osd_peering_wq_batch_size, OPT_U64, 20)
OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64, 4194304)
OPTION(osd_op_pq_min_cost, OPT_U64, 65536)
OPTION(osd_disk_threads, OPT_INT, 1)
+OPTION(osd_disk_thread_ioprio_class, OPT_STR, "") // rt realtime be besteffort best effort idle
+OPTION(osd_disk_thread_ioprio_priority, OPT_INT, -1) // 0-7
OPTION(osd_recovery_threads, OPT_INT, 1)
OPTION(osd_recover_clone_overlap, OPT_BOOL, true) // preserve clone_overlap during recovery/migration
@@ -473,6 +479,7 @@ OPTION(osd_snap_trim_thread_timeout, OPT_INT, 60*60*1)
OPTION(osd_snap_trim_sleep, OPT_FLOAT, 0)
OPTION(osd_scrub_thread_timeout, OPT_INT, 60)
OPTION(osd_scrub_finalize_thread_timeout, OPT_INT, 60*10)
+OPTION(osd_scrub_invalid_stats, OPT_BOOL, true)
OPTION(osd_remove_thread_timeout, OPT_INT, 60*60)
OPTION(osd_command_thread_timeout, OPT_INT, 10*60)
OPTION(osd_age, OPT_FLOAT, .8)
@@ -509,6 +516,7 @@ OPTION(osd_scrub_min_interval, OPT_FLOAT, 60*60*24) // if load is low
OPTION(osd_scrub_max_interval, OPT_FLOAT, 7*60*60*24) // regardless of load
OPTION(osd_scrub_chunk_min, OPT_INT, 5)
OPTION(osd_scrub_chunk_max, OPT_INT, 25)
+OPTION(osd_scrub_sleep, OPT_FLOAT, 0) // sleep between [deep]scrub ops
OPTION(osd_deep_scrub_interval, OPT_FLOAT, 60*60*24*7) // once a week
OPTION(osd_deep_scrub_stride, OPT_INT, 524288)
OPTION(osd_scan_list_ping_tp_interval, OPT_U64, 100)
@@ -690,6 +698,9 @@ OPTION(keyvaluestore_debug_check_backend, OPT_BOOL, 0) // Expensive debugging ch
OPTION(keyvaluestore_op_threads, OPT_INT, 2)
OPTION(keyvaluestore_op_thread_timeout, OPT_INT, 60)
OPTION(keyvaluestore_op_thread_suicide_timeout, OPT_INT, 180)
+OPTION(keyvaluestore_default_strip_size, OPT_INT, 4096) // Only affect new object
+OPTION(keyvaluestore_max_expected_write_size, OPT_U64, 1ULL << 24) // bytes
+OPTION(keyvaluestore_header_cache_size, OPT_INT, 4096) // Header cache size
// max bytes to search ahead in journal searching for corruption
OPTION(journal_max_corrupt_search, OPT_U64, 10<<20)
@@ -713,6 +724,7 @@ OPTION(rbd_cache_size, OPT_LONGLONG, 32<<20) // cache size in bytes
OPTION(rbd_cache_max_dirty, OPT_LONGLONG, 24<<20) // dirty limit in bytes - set to 0 for write-through caching
OPTION(rbd_cache_target_dirty, OPT_LONGLONG, 16<<20) // target dirty limit in bytes
OPTION(rbd_cache_max_dirty_age, OPT_FLOAT, 1.0) // seconds in cache before writeback starts
+OPTION(rbd_cache_max_dirty_object, OPT_INT, 0) // dirty limit for objects - set to 0 for auto calculate from rbd_cache_size
OPTION(rbd_cache_block_writes_upfront, OPT_BOOL, false) // whether to block writes to the cache before the aio_write call completes (true), or block before the aio completion is called (false)
OPTION(rbd_concurrent_management_ops, OPT_INT, 10) // how many operations can be in flight for a management operation like deleting or resizing an image
OPTION(rbd_balance_snap_reads, OPT_BOOL, false)
diff --git a/src/common/io_priority.cc b/src/common/io_priority.cc
new file mode 100644
index 0000000..b9eeae8
--- /dev/null
+++ b/src/common/io_priority.cc
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/syscall.h> /* For SYS_xxx definitions */
+#include <algorithm>
+#include <errno.h>
+
+#include "common/errno.h"
+#include "io_priority.h"
+
+pid_t ceph_gettid(void)
+{
+#ifdef __linux__
+  return syscall(SYS_gettid);
+#else
+  return -ENOSYS;
+#endif
+}
+
+int ceph_ioprio_set(int whence, int who, int ioprio)
+{
+#ifdef __linux__
+  return syscall(SYS_ioprio_set, whence, who, ioprio);
+#else
+  return -ENOSYS;
+#endif
+}
+
+int ceph_ioprio_string_to_class(const std::string& s)
+{
+  std::string l(s); // lower-case a copy in place; transforming into an empty
+  std::transform(l.begin(), l.end(), l.begin(), ::tolower); // string's begin() is UB
+
+  if (l == "idle")
+    return IOPRIO_CLASS_IDLE;
+  if (l == "be" || l == "besteffort" || l == "best effort")
+    return IOPRIO_CLASS_BE;
+  if (l == "rt" || l == "realtime" || l == "real time")
+    return IOPRIO_CLASS_RT;
+  return -EINVAL;
+}
diff --git a/src/common/io_priority.h b/src/common/io_priority.h
new file mode 100644
index 0000000..91ebf42
--- /dev/null
+++ b/src/common/io_priority.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_IO_PRIORITY_H
+#define CEPH_COMMON_IO_PRIORITY_H
+
+#include <string>
+#include <sys/types.h> // pid_t, used below; keeps this header self-contained
+extern pid_t ceph_gettid();
+
+#ifndef IOPRIO_WHO_PROCESS
+# define IOPRIO_WHO_PROCESS 1
+#endif
+#ifndef IOPRIO_PRIO_VALUE
+# define IOPRIO_CLASS_SHIFT 13
+# define IOPRIO_PRIO_VALUE(class, data) \
+		(((class) << IOPRIO_CLASS_SHIFT) | (data))
+#endif
+#ifndef IOPRIO_CLASS_RT
+# define IOPRIO_CLASS_RT 1
+#endif
+#ifndef IOPRIO_CLASS_BE
+# define IOPRIO_CLASS_BE 2
+#endif
+#ifndef IOPRIO_CLASS_IDLE
+# define IOPRIO_CLASS_IDLE 3
+#endif
+
+extern int ceph_ioprio_set(int whence, int who, int ioprio);
+
+extern int ceph_ioprio_string_to_class(const std::string& s);
+
+#endif
diff --git a/src/common/random_cache.hpp b/src/common/random_cache.hpp
new file mode 100644
index 0000000..c627847
--- /dev/null
+++ b/src/common/random_cache.hpp
@@ -0,0 +1,113 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai at unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang at gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RANDOMCACHE_H
+#define CEPH_RANDOMCACHE_H
+
+#include "common/Mutex.h"
+#include "include/compat.h"
+#include "include/unordered_map.h"
+
+
+// Although this is a random cache implementation, trimming still tries to be
+// reasonable: each item tracks its lookup frequency, and when the cache is
+// full a handful of items are sampled and their frequencies compared.  The
+// least frequently used of the sampled items are evicted.
+template <class K, class V>
+class RandomCache {
+  // The first element of pair is the frequency of item, it's used to evict item
+  ceph::unordered_map<K, pair<uint64_t, V> > contents;
+  Mutex lock;
+  uint64_t max_size;
+  K last_trim_key;
+
+  // When cache reach full, consider to evict a certain number of items
+  static const uint64_t EVICT_COUNT = 5;
+  // Avoid too much overhead on comparing items's frequency, the number of
+  // compare items is expected to small.
+  static const uint64_t COMPARE_COUNT = 3;
+
+  // In order to make evict cache progress more lightweight and effective,
+  // several items are expected to evicted in one call
+  void trim_cache(uint64_t evict_count) {
+    if (contents.empty() || evict_count == 0)
+      return; // guard: begin() is dereferenced below, which is UB on an empty map
+    typename ceph::unordered_map<K, pair<uint64_t, V> >::iterator it = contents.find(last_trim_key);
+    uint64_t total_compare = evict_count * COMPARE_COUNT;
+    map<uint64_t, K> candidates;
+
+    while (total_compare--) {
+      if (it == contents.end()) {
+        it = contents.begin();
+      }
+
+      candidates[it->second.first] = it->first;
+      it++;
+    }
+    if (it != contents.end())
+      last_trim_key = it->first;
+    else
+      last_trim_key = contents.begin()->first;
+
+    for (typename map<uint64_t, K>::iterator j = candidates.begin(); j != candidates.end(); j++) {
+      contents.erase(j->second);
+      evict_count--;
+      if (!evict_count)
+        break;
+    }
+  }
+
+ public:
+  RandomCache(size_t max_size=20) : lock("RandomCache::lock"),
+                                    max_size(max_size) {}
+  ~RandomCache() {
+    contents.clear();
+  }
+
+  void clear(K key) {
+    Mutex::Locker l(lock);
+    contents.erase(key);
+  }
+
+  void set_size(size_t new_size) {
+    Mutex::Locker l(lock);
+    max_size = new_size;
+    if (max_size <= contents.size()) {
+      trim_cache(contents.size() - max_size);
+    }
+  }
+
+  bool lookup(K key, V *out) {
+    Mutex::Locker l(lock);
+    typename ceph::unordered_map<K, pair<uint64_t, V> >::iterator it = contents.find(key);
+    if (it != contents.end()) {
+      it->second.first++;
+      *out = it->second.second;
+      return true;
+    }
+    return false;
+  }
+
+  void add(K key, V value) {
+    Mutex::Locker l(lock);
+    if (max_size <= contents.size()) {
+      trim_cache(EVICT_COUNT);
+    }
+    contents[key] = make_pair(1, value);
+  }
+};
+
+#endif
diff --git a/src/common/str_map.cc b/src/common/str_map.cc
index e635159..ef9b7d4 100644
--- a/src/common/str_map.cc
+++ b/src/common/str_map.cc
@@ -24,7 +24,7 @@
using namespace std;
int get_str_map(const string &str,
- stringstream &ss,
+ ostream &ss,
map<string,string> *str_map)
{
json_spirit::mValue json;
diff --git a/src/common/strtol.cc b/src/common/strtol.cc
index 8f12f08..840b3d9 100644
--- a/src/common/strtol.cc
+++ b/src/common/strtol.cc
@@ -17,6 +17,9 @@
#include <sstream>
#include <stdlib.h>
#include <string>
+extern "C" {
+#include <stdint.h>
+}
using std::ostringstream;
@@ -124,3 +127,43 @@ float strict_strtof(const char *str, std::string *err)
*err = "";
return ret;
}
+
+uint64_t strict_sistrtoll(const char *str, std::string *err) // parse "<num>[B|K|M|G|T|P|E]" into bytes (suffixes are powers of 1024); on error sets *err and returns 0
+{
+  std::string s(str);
+  if (s.size() == 0) {
+    ostringstream oss;
+    oss << "strict_sistrtoll: value not specified";
+    *err = oss.str();
+    return 0;
+  }
+  const char &u = s.at(s.size()-1); //str[std::strlen(str)-1];
+  int m = 0; // shift amount: suffix multiplier is 2^m
+  if (u == 'B')
+    m = 0;
+  else if (u == 'K')
+    m = 10;
+  else if (u == 'M')
+    m = 20;
+  else if (u == 'G')
+    m = 30;
+  else if (u == 'T')
+    m = 40;
+  else if (u == 'P')
+    m = 50;
+  else if (u == 'E')
+    m = 60;
+  else
+    m = -1; // last char is not a recognized suffix; parse the whole string
+
+  const char *v = NULL;
+  if (m >= 0)
+    s = std::string(str, s.size()-1); // strip the suffix char ('B' included, since m==0)
+  v = s.c_str(); // note: intentionally unconditional despite the indentation
+
+  uint64_t r = strict_strtoll(v, 10, err); // strict_strtoll sets *err on bad digits
+  if (err->empty() && m > 0) {
+    r = (r << m); // NOTE(review): can overflow uint64_t for large values -- unchecked
+  }
+  return r;
+}
diff --git a/src/common/strtol.h b/src/common/strtol.h
index 80b5a3f..ea0a469 100644
--- a/src/common/strtol.h
+++ b/src/common/strtol.h
@@ -16,6 +16,9 @@
#define CEPH_COMMON_STRTOL_H
#include <string>
+extern "C" {
+#include <stdint.h>
+}
long long strict_strtoll(const char *str, int base, std::string *err);
@@ -25,4 +28,6 @@ double strict_strtod(const char *str, std::string *err);
float strict_strtof(const char *str, std::string *err);
+uint64_t strict_sistrtoll(const char *str, std::string *err);
+
#endif
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index 4ed3fa9..31da4f5 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -10,17 +10,28 @@
bool CrushWrapper::has_v2_rules() const
{
- // check rules for use of indep or new SET_* rule steps
for (unsigned i=0; i<crush->max_rules; i++) {
- crush_rule *r = crush->rules[i];
- if (!r)
- continue;
- for (unsigned j=0; j<r->len; j++) {
- if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
- r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
- r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
- r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES)
- return true;
+ if (is_v2_rule(i)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool CrushWrapper::is_v2_rule(unsigned ruleid) const
+{
+ // check rule for use of indep or new SET_* rule steps
+ if (ruleid >= crush->max_rules)
+ return false;
+ crush_rule *r = crush->rules[ruleid];
+ if (!r)
+ return false;
+ for (unsigned j=0; j<r->len; j++) {
+ if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
+ r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
+ r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
+ r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES) {
+ return true;
}
}
return false;
@@ -28,14 +39,25 @@ bool CrushWrapper::has_v2_rules() const
bool CrushWrapper::has_v3_rules() const
{
- // check rules for use of SET_CHOOSELEAF_VARY_R step
for (unsigned i=0; i<crush->max_rules; i++) {
- crush_rule *r = crush->rules[i];
- if (!r)
- continue;
- for (unsigned j=0; j<r->len; j++) {
- if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R)
- return true;
+ if (is_v3_rule(i)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool CrushWrapper::is_v3_rule(unsigned ruleid) const
+{
+ // check rule for use of SET_CHOOSELEAF_VARY_R step
+ if (ruleid >= crush->max_rules)
+ return false;
+ crush_rule *r = crush->rules[ruleid];
+ if (!r)
+ return false;
+ for (unsigned j=0; j<r->len; j++) {
+ if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R) {
+ return true;
}
}
return false;
@@ -794,6 +816,59 @@ int CrushWrapper::add_simple_ruleset(string name, string root_name,
return rno;
}
+int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
+{
+  if (ruleno >= crush->max_rules)
+    return -ENOENT;
+  if (crush->rules[ruleno] == NULL)
+    return -ENOENT;
+  crush_rule *rule = crush->rules[ruleno];
+
+  // build a weight map for each TAKE in the rule, and then merge them
+  for (unsigned i=0; i<rule->len; ++i) {
+    map<int,float> m;      // osd id -> raw weight for this TAKE subtree
+    float sum = 0;         // total raw weight, used to normalize below
+    if (rule->steps[i].op == CRUSH_RULE_TAKE) {
+      int n = rule->steps[i].arg1;
+      if (n >= 0) {
+	m[n] = 1.0;
+	sum = 1.0;
+      } else {
+	list<int> q;
+	q.push_back(n);
+	//breadth first iterate the OSD tree
+	while (!q.empty()) {
+	  int bno = q.front();
+	  q.pop_front();
+	  crush_bucket *b = crush->buckets[-1-bno];
+	  assert(b);
+	  for (unsigned j=0; j<b->size; ++j) {
+	    int item_id = b->items[j];
+	    if (item_id >= 0) //it's an OSD
+	    {
+	      float w = crush_get_bucket_item_weight(b, j);
+	      m[item_id] = w;
+	      sum += w;
+	    }
+	    else //not an OSD, expand the child later
+	      q.push_back(item_id);
+	  }
+	}
+      }
+    }
+    if (sum <= 0) continue; // non-TAKE step, or all-zero weights: avoid div by zero
+    for (map<int,float>::iterator p = m.begin(); p != m.end(); ++p) {
+      map<int,float>::iterator q = pmap->find(p->first);
+      if (q == pmap->end())
+	(*pmap)[p->first] = p->second / sum;
+      else
+	q->second += p->second / sum;
+    }
+  }
+
+  return 0;
+}
+
int CrushWrapper::remove_rule(int ruleno)
{
if (ruleno >= (int)crush->max_rules)
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
index 282cbeb..d5d4f4f 100644
--- a/src/crush/CrushWrapper.h
+++ b/src/crush/CrushWrapper.h
@@ -216,6 +216,8 @@ public:
bool has_v2_rules() const;
bool has_v3_rules() const;
+ bool is_v2_rule(unsigned ruleid) const;
+ bool is_v3_rule(unsigned ruleid) const;
// bucket types
int get_num_type_names() const {
@@ -631,6 +633,18 @@ public:
return s->arg2;
}
+ /**
+ * calculate a map of osds to weights for a given rule
+ *
+ * Generate a map of which OSDs get how much relative weight for a
+ * given rule.
+ *
+ * @param ruleno [in] rule id
+ * @param pmap [out] map of osd to weight
+ * @return 0 for success, or negative error code
+ */
+ int get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap);
+
/* modifiers */
int add_rule(int len, int ruleset, int type, int minsize, int maxsize, int ruleno) {
if (!crush) return -ENOENT;
diff --git a/src/erasure-code/ErasureCodeInterface.h b/src/erasure-code/ErasureCodeInterface.h
index f8e22d1..1dc12c5 100644
--- a/src/erasure-code/ErasureCodeInterface.h
+++ b/src/erasure-code/ErasureCodeInterface.h
@@ -167,7 +167,7 @@ namespace ceph {
* @param [in] name of the ruleset to create
* @param [in] crush crushmap in which the ruleset is created
* @param [out] ss contains informative messages when an error occurs
- * @return **0** on success or a negative errno on error.
+ * @return a ruleset on success or a negative errno on error.
*/
virtual int create_ruleset(const string &name,
CrushWrapper &crush,
diff --git a/src/erasure-code/ErasureCodePlugin.cc b/src/erasure-code/ErasureCodePlugin.cc
index da075d2..3ce0563 100644
--- a/src/erasure-code/ErasureCodePlugin.cc
+++ b/src/erasure-code/ErasureCodePlugin.cc
@@ -4,6 +4,7 @@
* Ceph - scalable distributed file system
*
* Copyright (C) 2013,2014 Cloudwatt <libre.licensing at cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact at redhat.com>
*
* Author: Loic Dachary <loic at dachary.org>
*
@@ -19,6 +20,7 @@
#include "ErasureCodePlugin.h"
#include "common/errno.h"
+#include "include/str_list.h"
#define PLUGIN_PREFIX "libec_"
#define PLUGIN_SUFFIX ".so"
@@ -130,6 +132,32 @@ int ErasureCodePluginRegistry::load(const std::string &plugin_name,
(*plugin)->library = library;
+ ss << __func__ << ": " << plugin_name << " ";
+
return 0;
}
+int ErasureCodePluginRegistry::preload(const std::string &plugins,
+ const std::string &directory,
+ ostream &ss)
+{
+ map<string,string> profile;
+ profile["directory"] = directory;
+ list<string> plugins_list;
+ get_str_list(plugins, plugins_list);
+ for (list<string>::iterator i = plugins_list.begin();
+ i != plugins_list.end();
+ i++) {
+ ErasureCodePlugin *plugin;
+ int r = load(*i, profile, &plugin, ss);
+ if (r)
+ return r;
+
+ ErasureCodeInterfaceRef erasure_code;
+ profile["technique"] = "reed_sol_van";
+ r = plugin->factory(profile, &erasure_code);
+ if (r)
+ return r;
+ }
+ return 0;
+}
diff --git a/src/erasure-code/ErasureCodePlugin.h b/src/erasure-code/ErasureCodePlugin.h
index e891079..7f0b1e9 100644
--- a/src/erasure-code/ErasureCodePlugin.h
+++ b/src/erasure-code/ErasureCodePlugin.h
@@ -67,6 +67,9 @@ namespace ceph {
ErasureCodePlugin **plugin,
ostream &ss);
+ int preload(const std::string &plugins,
+ const std::string &directory,
+ ostream &ss);
};
}
diff --git a/src/erasure-code/jerasure/ErasureCodeJerasure.cc b/src/erasure-code/jerasure/ErasureCodeJerasure.cc
index 6d0f653..06ccc58 100644
--- a/src/erasure-code/jerasure/ErasureCodeJerasure.cc
+++ b/src/erasure-code/jerasure/ErasureCodeJerasure.cc
@@ -44,8 +44,12 @@ int ErasureCodeJerasure::create_ruleset(const string &name,
CrushWrapper &crush,
ostream *ss) const
{
- return crush.add_simple_ruleset(name, ruleset_root, ruleset_failure_domain,
- "indep", pg_pool_t::TYPE_ERASURE, ss);
+ int ruleid = crush.add_simple_ruleset(name, ruleset_root, ruleset_failure_domain,
+ "indep", pg_pool_t::TYPE_ERASURE, ss);
+ if (ruleid < 0)
+ return ruleid;
+ else
+ return crush.get_rule_mask_ruleset(ruleid);
}
void ErasureCodeJerasure::init(const map<string,string> ¶meters)
diff --git a/src/global/global_init.cc b/src/global/global_init.cc
index 7b20343..f03677c 100644
--- a/src/global/global_init.cc
+++ b/src/global/global_init.cc
@@ -166,8 +166,16 @@ int global_init_prefork(CephContext *cct, int flags)
if (g_code_env != CODE_ENVIRONMENT_DAEMON)
return -1;
const md_config_t *conf = cct->_conf;
- if (!conf->daemonize)
+ if (!conf->daemonize) {
+ if (atexit(pidfile_remove_void)) {
+ derr << "global_init_daemonize: failed to set pidfile_remove function "
+ << "to run at exit." << dendl;
+ }
+
+ pidfile_write(g_conf);
+
return -1;
+ }
// stop log thread
g_ceph_context->_log->flush();
diff --git a/src/include/atomic.h b/src/include/atomic.h
index 537fa98..c1650be 100644
--- a/src/include/atomic.h
+++ b/src/include/atomic.h
@@ -21,10 +21,66 @@
#endif
#include <stdlib.h>
+#include "include/Spinlock.h"
+
+namespace ceph {
+ template <class T>
+ class atomic_spinlock_t {
+ mutable ceph_spinlock_t lock;
+ T val;
+ public:
+ atomic_spinlock_t(T i=0)
+ : val(i) {
+ ceph_spin_init(&lock);
+ }
+ ~atomic_spinlock_t() {
+ ceph_spin_destroy(&lock);
+ }
+ void set(T v) {
+ ceph_spin_lock(&lock);
+ val = v;
+ ceph_spin_unlock(&lock);
+ }
+ T inc() {
+ ceph_spin_lock(&lock);
+ T r = ++val;
+ ceph_spin_unlock(&lock);
+ return r;
+ }
+ T dec() {
+ ceph_spin_lock(&lock);
+ T r = --val;
+ ceph_spin_unlock(&lock);
+ return r;
+ }
+ void add(T d) {
+ ceph_spin_lock(&lock);
+ val += d;
+ ceph_spin_unlock(&lock);
+ }
+ void sub(T d) {
+ ceph_spin_lock(&lock);
+ val -= d;
+ ceph_spin_unlock(&lock);
+ }
+ T read() const {
+ T ret;
+ ceph_spin_lock(&lock);
+ ret = val;
+ ceph_spin_unlock(&lock);
+ return ret;
+ }
+ private:
+ // forbid copying
+ atomic_spinlock_t(const atomic_spinlock_t<T> &other);
+ atomic_spinlock_t &operator=(const atomic_spinlock_t<T> &rhs);
+ };
+}
#ifndef NO_ATOMIC_OPS
// libatomic_ops implementation
+#define AO_REQUIRE_CAS
#include <atomic_ops.h>
// reinclude our assert to clobber the system one
@@ -35,7 +91,7 @@ namespace ceph {
AO_t val;
public:
atomic_t(AO_t i=0) : val(i) {}
- void set(size_t v) {
+ void set(AO_t v) {
AO_store(&val, v);
}
AO_t inc() {
@@ -47,8 +103,8 @@ namespace ceph {
void add(AO_t add_me) {
AO_fetch_and_add(&val, add_me);
}
- void sub(int sub_me) {
- int negsub = 0 - sub_me;
+ void sub(AO_t sub_me) {
+ AO_t negsub = 0 - sub_me;
AO_fetch_and_add_write(&val, (AO_t)negsub);
}
AO_t read() const {
@@ -62,7 +118,15 @@ namespace ceph {
atomic_t(const atomic_t &other);
atomic_t &operator=(const atomic_t &rhs);
};
+
+#if SIZEOF_AO_T == 8
+ typedef atomic_t atomic64_t;
+#else
+ typedef atomic_spinlock_t<unsigned long long> atomic64_t;
+#endif
+
}
+
#else
/*
* crappy slow implementation that uses a pthreads spinlock.
@@ -70,56 +134,9 @@ namespace ceph {
#include "include/Spinlock.h"
namespace ceph {
- class atomic_t {
- mutable ceph_spinlock_t lock;
- signed long val;
- public:
- atomic_t(int i=0)
- : val(i) {
- ceph_spin_init(&lock);
- }
- ~atomic_t() {
- ceph_spin_destroy(&lock);
- }
- void set(size_t v) {
- ceph_spin_lock(&lock);
- val = v;
- ceph_spin_unlock(&lock);
- }
- int inc() {
- ceph_spin_lock(&lock);
- int r = ++val;
- ceph_spin_unlock(&lock);
- return r;
- }
- int dec() {
- ceph_spin_lock(&lock);
- int r = --val;
- ceph_spin_unlock(&lock);
- return r;
- }
- void add(int d) {
- ceph_spin_lock(&lock);
- val += d;
- ceph_spin_unlock(&lock);
- }
- void sub(int d) {
- ceph_spin_lock(&lock);
- val -= d;
- ceph_spin_unlock(&lock);
- }
- int read() const {
- signed long ret;
- ceph_spin_lock(&lock);
- ret = val;
- ceph_spin_unlock(&lock);
- return ret;
- }
- private:
- // forbid copying
- atomic_t(const atomic_t &other);
- atomic_t &operator=(const atomic_t &rhs);
- };
+ typedef atomic_spinlock_t<unsigned> atomic_t;
+ typedef atomic_spinlock_t<unsigned long long> atomic64_t;
}
+
#endif
#endif
diff --git a/src/include/intarith.h b/src/include/intarith.h
index 640129c..2c27cec 100644
--- a/src/include/intarith.h
+++ b/src/include/intarith.h
@@ -28,7 +28,7 @@
#endif
#ifndef ROUND_UP_TO
-# define ROUND_UP_TO(n, d) (((n)+(d)-1) & ~((d)-1))
+# define ROUND_UP_TO(n, d) ((n)%(d) ? ((n)+(d)-(n)%(d)) : (n))
#endif
#ifndef SHIFT_ROUND_UP
diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h
index 5be8203..1e87af9 100644
--- a/src/include/rbd/librbd.h
+++ b/src/include/rbd/librbd.h
@@ -39,6 +39,7 @@ extern "C" {
#define LIBRBD_SUPPORTS_WATCH 0
#define LIBRBD_SUPPORTS_AIO_FLUSH 1
+#define LIBRBD_SUPPORTS_INVALIDATE 1
typedef void *rbd_snap_t;
typedef void *rbd_image_t;
@@ -376,6 +377,14 @@ int rbd_flush(rbd_image_t image);
*/
int rbd_aio_flush(rbd_image_t image, rbd_completion_t c);
+/**
+ * Drop any cached data for an image
+ *
+ * @param image the image to invalidate cached data for
+ * @returns 0 on success, negative error code on failure
+ */
+int rbd_invalidate_cache(rbd_image_t image);
+
#ifdef __cplusplus
}
#endif
diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp
index 697fc6c..caf61a6 100644
--- a/src/include/rbd/librbd.hpp
+++ b/src/include/rbd/librbd.hpp
@@ -216,6 +216,14 @@ public:
*/
int aio_flush(RBD::AioCompletion *c);
+ /**
+ * Drop any cached data for an image
+ *
+ * @param image the image to invalidate cached data for
+ * @returns 0 on success, negative error code on failure
+ */
+ int invalidate_cache();
+
private:
friend class RBD;
diff --git a/src/include/str_map.h b/src/include/str_map.h
index efae903..eabe8d2 100644
--- a/src/include/str_map.h
+++ b/src/include/str_map.h
@@ -53,7 +53,7 @@
* @return **0** on success or a -EINVAL on error.
*/
extern int get_str_map(const std::string &str,
- std::stringstream &ss,
+ std::ostream &ss,
std::map<std::string,std::string> *str_map);
#endif
diff --git a/src/init-ceph.in b/src/init-ceph.in
index 95723b0..7276830 100644
--- a/src/init-ceph.in
+++ b/src/init-ceph.in
@@ -31,6 +31,7 @@ fi
usage_exit() {
echo "usage: $0 [options] {start|stop|restart|condrestart} [mon|osd|mds]..."
printf "\t-c ceph.conf\n"
+ printf "\t--cluster [cluster name]\tdefine the cluster name\n"
printf "\t--valgrind\trun via valgrind\n"
printf "\t--hostname [hostname]\toverride hostname lookup\n"
exit
@@ -113,6 +114,8 @@ monaddr=
dofsmount=1
dofsumount=0
verbose=0
+use_default_conf=1
+
while echo $1 | grep -q '^-'; do # FIXME: why not '^-'?
case $1 in
@@ -153,8 +156,15 @@ case $1 in
[ -z "$2" ] && usage_exit
options="$options $1"
shift
+ use_default_conf=0
conf=$1
;;
+ --cluster )
+ [ -z "$2" ] && usage_exit
+ options="$options $1"
+ shift
+ cluster=$1
+ ;;
--hostname )
[ -z "$2" ] && usage_exit
options="$options $1"
@@ -170,6 +180,20 @@ options="$options $1"
shift
done
+
+# if `--cluster` was not passed in, fallback to looking at the config name
+if [ -z "$cluster" ]; then
+ cluster=`echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1`
+else
+ # if we were told to use a given cluster name then $conf needs to be updated
+ # but just define it if `--conf` was not specified, otherwise we would be silently
+ # overriding $conf even if it was defined with `--conf`
+ if [ $use_default_conf -eq 1 ]; then
+ conf="/etc/ceph/$cluster.conf"
+ fi
+fi
+
+
verify_conf
command=$1
@@ -189,11 +213,10 @@ fi
for name in $what; do
type=`echo $name | cut -c 1-3` # e.g. 'mon', if $item is 'mon1'
id=`echo $name | cut -c 4- | sed 's/^\\.//'`
- cluster=`echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1`
num=$id
name="$type.$id"
- check_host || continue
+ check_host $cluster || continue
binary="$BINDIR/ceph-$type"
cmd="$binary -i $id"
@@ -235,7 +258,7 @@ for name in $what; do
cmd="$cmd -c $conf"
if echo $name | grep -q ^osd; then
- get_conf osd_data "/var/lib/ceph/osd/ceph-$id" "osd data"
+ get_conf osd_data "/var/lib/ceph/osd/$cluster-$id" "osd data"
get_conf fs_path "$osd_data" "fs path" # mount point defaults so osd data
get_conf fs_devs "" "devs"
if [ -z "$fs_devs" ]; then
@@ -335,7 +358,7 @@ for name in $what; do
if [ "${update_crush:-1}" = "1" -o "${update_crush:-1}" = "true" ]; then
# update location in crush
get_conf osd_location_hook "$BINDIR/ceph-crush-location" "osd crush location hook"
- osd_location=`$osd_location_hook --cluster ceph --id $id --type osd`
+ osd_location=`$osd_location_hook --cluster $cluster --id $id --type osd`
get_conf osd_weight "" "osd crush initial weight"
defaultweight="$(df -P -k $osd_data/. | tail -1 | awk '{ print sprintf("%.2f",$2/1073741824) }')"
get_conf osd_keyring "$osd_data/keyring" "keyring"
@@ -366,7 +389,7 @@ for name in $what; do
get_conf mon_data "/var/lib/ceph/mon/ceph-$id" "mon data"
if [ "$mon_data" = "/var/lib/ceph/mon/ceph-$id" -a "$asok" = "/var/run/ceph/ceph-mon.$id.asok" ]; then
echo Starting ceph-create-keys on $host...
- cmd2="$SBINDIR/ceph-create-keys -i $id 2> /dev/null &"
+ cmd2="$SBINDIR/ceph-create-keys --cluster $cluster -i $id 2> /dev/null &"
do_cmd "$cmd2"
fi
fi
diff --git a/src/init-radosgw.sysv b/src/init-radosgw.sysv
index ab6b250..dd3dbb0 100644
--- a/src/init-radosgw.sysv
+++ b/src/init-radosgw.sysv
@@ -15,6 +15,7 @@ PATH=/sbin:/bin:/usr/bin
daemon_is_running() {
daemon=$1
+ sleep 1
if pidof $daemon >/dev/null; then
echo "$daemon is running."
exit 0
@@ -44,6 +45,10 @@ if [ ! -x "$RADOSGW" ]; then
exit 1
fi
+# detect systemd
+SYSTEMD=0
+grep -qs systemd /proc/1/comm && SYSTEMD=1
+
case "$1" in
start)
echo "Starting radosgw instance(s)..."
@@ -79,8 +84,12 @@ case "$1" in
chown $user $log_file
fi
- #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
- daemon --user="$user" "ulimit -n 32768; $RADOSGW -n $name"
+ if [ $SYSTEMD -eq 1 ]; then
+ systemd-run -r bash -c "ulimit -n 32768; $RADOSGW -n $name"
+ else
+ #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
+ daemon --user="$user" "ulimit -n 32768; $RADOSGW -n $name"
+ fi
echo "Starting $name..."
done
daemon_is_running $RADOSGW
diff --git a/src/librados/RadosClient.cc b/src/librados/RadosClient.cc
index 249c34f..9330e65 100644
--- a/src/librados/RadosClient.cc
+++ b/src/librados/RadosClient.cc
@@ -103,8 +103,10 @@ int64_t librados::RadosClient::lookup_pool(const char *name)
lock.Lock();
int r = wait_for_osdmap();
- if (r < 0)
+ if (r < 0) {
+ lock.Unlock();
return r;
+ }
int64_t ret = osdmap.lookup_pg_pool_name(name);
pool_cache_rwl.get_write();
lock.Unlock();
@@ -582,8 +584,10 @@ int librados::RadosClient::pool_delete(const char *name)
{
lock.Lock();
int r = wait_for_osdmap();
- if (r < 0)
+ if (r < 0) {
+ lock.Unlock();
return r;
+ }
int tmp_pool_id = osdmap.lookup_pg_pool_name(name);
if (tmp_pool_id < 0) {
lock.Unlock();
diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc
index 6477e8d..b5c2db6 100644
--- a/src/librbd/ImageCtx.cc
+++ b/src/librbd/ImageCtx.cc
@@ -185,10 +185,14 @@ namespace librbd {
// size object cache appropriately
if (object_cacher) {
- uint64_t obj = cct->_conf->rbd_cache_size / (1ull << order);
+ uint64_t obj = cct->_conf->rbd_cache_max_dirty_object;
+ if (!obj) {
+ obj = cct->_conf->rbd_cache_size / (1ull << order);
+ obj = obj * 4 + 10;
+ }
ldout(cct, 10) << " cache bytes " << cct->_conf->rbd_cache_size << " order " << (int)order
<< " -> about " << obj << " objects" << dendl;
- object_cacher->set_max_objects(obj * 4 + 10);
+ object_cacher->set_max_objects(obj);
}
ldout(cct, 10) << "init_layout stripe_unit " << stripe_unit
@@ -573,9 +577,9 @@ namespace librbd {
object_cacher->stop();
}
- void ImageCtx::invalidate_cache() {
+ int ImageCtx::invalidate_cache() {
if (!object_cacher)
- return;
+ return 0;
cache_lock.Lock();
object_cacher->release_set(object_set);
cache_lock.Unlock();
@@ -585,8 +589,12 @@ namespace librbd {
cache_lock.Lock();
bool unclean = object_cacher->release_set(object_set);
cache_lock.Unlock();
- if (unclean)
- lderr(cct) << "could not release all objects from cache" << dendl;
+ if (unclean) {
+ lderr(cct) << "could not release all objects from cache: "
+ << unclean << " bytes remain" << dendl;
+ return -EBUSY;
+ }
+ return r;
}
void ImageCtx::clear_nonexistence_cache() {
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
index 026a3e0..83ed044 100644
--- a/src/librbd/ImageCtx.h
+++ b/src/librbd/ImageCtx.h
@@ -139,7 +139,7 @@ namespace librbd {
void flush_cache_aio(Context *onfinish);
int flush_cache();
void shutdown_cache();
- void invalidate_cache();
+ int invalidate_cache();
void clear_nonexistence_cache();
int register_watch();
void unregister_watch();
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index 127be38..afa4660 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -832,6 +832,9 @@ reprotect_and_return_err:
bool old_format, uint64_t features, int *order,
uint64_t stripe_unit, uint64_t stripe_count)
{
+ if (!order)
+ return -EINVAL;
+
CephContext *cct = (CephContext *)io_ctx.cct();
ldout(cct, 20) << "create " << &io_ctx << " name = " << imgname
<< " size = " << size << " old_format = " << old_format
@@ -857,9 +860,6 @@ reprotect_and_return_err:
return -EEXIST;
}
- if (!order)
- return -EINVAL;
-
if (!*order)
*order = cct->_conf->rbd_default_order;
if (!*order)
@@ -1275,6 +1275,19 @@ reprotect_and_return_err:
return r;
}
ictx->parent->snap_set(ictx->parent->snap_name);
+ ictx->parent->parent_lock.get_write();
+ r = refresh_parent(ictx->parent);
+ if (r < 0) {
+ lderr(ictx->cct) << "error refreshing parent snapshot "
+ << ictx->parent->id << " "
+ << ictx->parent->snap_name << dendl;
+ ictx->parent->parent_lock.put_write();
+ ictx->parent->snap_lock.put_write();
+ close_image(ictx->parent);
+ ictx->parent = NULL;
+ return r;
+ }
+ ictx->parent->parent_lock.put_write();
ictx->parent->snap_lock.put_write();
return 0;
@@ -1504,7 +1517,9 @@ reprotect_and_return_err:
if (size < ictx->size && ictx->object_cacher) {
// need to invalidate since we're deleting objects, and
// ObjectCacher doesn't track non-existent objects
- ictx->invalidate_cache();
+ r = ictx->invalidate_cache();
+ if (r < 0)
+ return r;
}
resize_helper(ictx, size, prog_ctx);
@@ -1847,7 +1862,9 @@ reprotect_and_return_err:
// need to flush any pending writes before resizing and rolling back -
// writes might create new snapshots. Rolling back will replace
// the current version, so we have to invalidate that too.
- ictx->invalidate_cache();
+ r = ictx->invalidate_cache();
+ if (r < 0)
+ return r;
ldout(cct, 2) << "resizing to snapshot size..." << dendl;
NoOpProgressContext no_op;
@@ -2071,7 +2088,7 @@ reprotect_and_return_err:
<< ictx->snap_name << "'" << dendl;
int r = ictx->init();
if (r < 0)
- return r;
+ goto err_close;
if (!ictx->read_only) {
r = ictx->register_watch();
@@ -2877,6 +2894,19 @@ reprotect_and_return_err:
return r;
}
+ int invalidate_cache(ImageCtx *ictx)
+ {
+ CephContext *cct = ictx->cct;
+ ldout(cct, 20) << "invalidate_cache " << ictx << dendl;
+
+ int r = ictx_check(ictx);
+ if (r < 0)
+ return r;
+
+ RWLock::WLocker l(ictx->md_lock);
+ return ictx->invalidate_cache();
+ }
+
int aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
AioCompletion *c)
{
diff --git a/src/librbd/internal.h b/src/librbd/internal.h
index 4345888..1e9fd9a 100644
--- a/src/librbd/internal.h
+++ b/src/librbd/internal.h
@@ -114,6 +114,7 @@ namespace librbd {
bool *is_protected);
int add_snap(ImageCtx *ictx, const char *snap_name);
int rm_snap(ImageCtx *ictx, const char *snap_name);
+ int refresh_parent(ImageCtx *ictx);
int ictx_check(ImageCtx *ictx);
int ictx_refresh(ImageCtx *ictx);
int copy(ImageCtx *ictx, IoCtx& dest_md_ctx, const char *destname,
@@ -188,6 +189,7 @@ namespace librbd {
int aio_flush(ImageCtx *ictx, AioCompletion *c);
int flush(ImageCtx *ictx);
int _flush(ImageCtx *ictx);
+ int invalidate_cache(ImageCtx *ictx);
ssize_t handle_sparse_read(CephContext *cct,
ceph::bufferlist data_bl,
diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc
index cad0c5e..658f24b 100644
--- a/src/librbd/librbd.cc
+++ b/src/librbd/librbd.cc
@@ -514,6 +514,12 @@ namespace librbd {
return librbd::aio_flush(ictx, (librbd::AioCompletion *)c->pc);
}
+ int Image::invalidate_cache()
+ {
+ ImageCtx *ictx = (ImageCtx *)ctx;
+ return librbd::invalidate_cache(ictx);
+ }
+
} // namespace librbd
extern "C" void rbd_version(int *major, int *minor, int *extra)
@@ -1130,6 +1136,12 @@ extern "C" int rbd_aio_flush(rbd_image_t image, rbd_completion_t c)
return librbd::aio_flush(ictx, (librbd::AioCompletion *)comp->pc);
}
+extern "C" int rbd_invalidate_cache(rbd_image_t image)
+{
+ librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+ return librbd::invalidate_cache(ictx);
+}
+
extern "C" int rbd_aio_is_complete(rbd_completion_t c)
{
librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index e5fe00c..74305b9 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -2062,7 +2062,13 @@ public:
void Locker::calc_new_client_ranges(CInode *in, uint64_t size, map<client_t,client_writeable_range_t>& new_ranges)
{
inode_t *latest = in->get_projected_inode();
- uint64_t ms = ROUND_UP_TO((size+1)<<1, latest->get_layout_size_increment());
+ uint64_t ms;
+ if(latest->has_layout()) {
+ ms = ROUND_UP_TO((size+1)<<1, latest->get_layout_size_increment());
+ } else {
+ // Layout-less directories like ~mds0/, have zero size
+ ms = 0;
+ }
// increase ranges as appropriate.
// shrink to 0 if no WR|BUFFER caps issued.
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 71a4b33..d6cfebd 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -349,6 +349,7 @@ void MDCache::create_empty_hierarchy(C_Gather *gather)
root->inode.dirstat = rootdir->fnode.fragstat;
root->inode.rstat = rootdir->fnode.rstat;
+ ++root->inode.rstat.rsubdirs;
root->inode.accounted_rstat = root->inode.rstat;
rootdir->mark_complete();
@@ -399,6 +400,7 @@ void MDCache::create_mydir_hierarchy(C_Gather *gather)
myin->inode.dirstat = mydir->fnode.fragstat;
myin->inode.rstat = mydir->fnode.rstat;
+ ++myin->inode.rstat.rsubdirs;
myin->inode.accounted_rstat = myin->inode.rstat;
diff --git a/src/messages/MOSDSubOp.h b/src/messages/MOSDSubOp.h
index 6a38186..7b40c0a 100644
--- a/src/messages/MOSDSubOp.h
+++ b/src/messages/MOSDSubOp.h
@@ -25,7 +25,7 @@
class MOSDSubOp : public Message {
- static const int HEAD_VERSION = 10;
+ static const int HEAD_VERSION = 11;
static const int COMPAT_VERSION = 1;
public:
@@ -63,6 +63,8 @@ public:
// piggybacked osd/og state
eversion_t pg_trim_to; // primary->replica: trim to here
+ eversion_t pg_trim_rollback_to; // primary->replica: trim rollback
+ // info to here
osd_peer_stat_t peer_stat;
map<string,bufferlist> attrset;
@@ -175,6 +177,11 @@ public:
if (header.version >= 10) {
::decode(updated_hit_set_history, p);
}
+ if (header.version >= 11) {
+ ::decode(pg_trim_rollback_to, p);
+ } else {
+ pg_trim_rollback_to = pg_trim_to;
+ }
}
virtual void encode_payload(uint64_t features) {
@@ -224,6 +231,7 @@ public:
::encode(from, payload);
::encode(pgid.shard, payload);
::encode(updated_hit_set_history, payload);
+ ::encode(pg_trim_rollback_to, payload);
}
MOSDSubOp()
diff --git a/src/mon/DataHealthService.cc b/src/mon/DataHealthService.cc
index 78732ac..6c6ed29 100644
--- a/src/mon/DataHealthService.cc
+++ b/src/mon/DataHealthService.cc
@@ -228,7 +228,7 @@ void DataHealthService::service_tick()
if (ours.latest_avail_percent != last_warned_percent)
mon->clog.warn()
<< "reached concerning levels of available space on local monitor storage"
- << " (" << ours.latest_avail_percent << "\% free)\n";
+ << " (" << ours.latest_avail_percent << "% free)\n";
last_warned_percent = ours.latest_avail_percent;
} else {
last_warned_percent = 0;
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index eb63303..bd9dd2e 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -552,7 +552,7 @@ COMMAND("osd pool rename " \
"rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
COMMAND("osd pool get " \
"name=pool,type=CephPoolname " \
- "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid", \
+ "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile", \
"get pool parameter <var>", "osd", "r", "cli,rest")
COMMAND("osd pool set " \
"name=pool,type=CephPoolname " \
@@ -568,6 +568,10 @@ COMMAND("osd pool set-quota " \
"name=field,type=CephChoices,strings=max_objects|max_bytes " \
"name=val,type=CephString",
"set object or byte limit on pool", "osd", "rw", "cli,rest")
+COMMAND("osd pool get-quota " \
+ "name=pool,type=CephPoolname ",
+ "obtain object or byte limits for pool",
+ "osd", "r", "cli,rest")
COMMAND("osd pool stats " \
"name=name,type=CephString,req=false",
"obtain stats from all pools, or from specified pool",
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index cd447e7..fd3a358 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -621,6 +621,21 @@ void Monitor::init_paxos()
void Monitor::refresh_from_paxos(bool *need_bootstrap)
{
dout(10) << __func__ << dendl;
+
+ bufferlist bl;
+ int r = store->get(MONITOR_NAME, "cluster_fingerprint", bl);
+ if (r >= 0) {
+ try {
+ bufferlist::iterator p = bl.begin();
+ ::decode(fingerprint, p);
+ }
+ catch (buffer::error& e) {
+ dout(10) << __func__ << " failed to decode cluster_fingerprint" << dendl;
+ }
+ } else {
+ dout(10) << __func__ << " no cluster_fingerprint" << dendl;
+ }
+
for (int i = 0; i < PAXOS_NUM; ++i) {
paxos_service[i]->refresh(need_bootstrap);
}
@@ -2393,6 +2408,7 @@ void Monitor::handle_command(MMonCommand *m)
if (!f)
f.reset(new_formatter("json-pretty"));
f->open_object_section("report");
+ f->dump_stream("cluster_fingerprint") << fingerprint;
f->dump_string("version", ceph_version_to_str());
f->dump_string("commit", git_version_to_str());
f->dump_stream("timestamp") << ceph_clock_now(NULL);
@@ -2866,8 +2882,9 @@ bool Monitor::_ms_dispatch(Message *m)
return dispatch(s, m, false);
}
dout(1) << __func__ << " dropping stray message " << *m
- << " from " << m->get_source_inst() << dendl;
- return false;
+ << " from " << m->get_source_inst() << dendl;
+ m->put();
+ return true;
}
if (!exited_quorum.is_zero() && !src_is_mon) {
@@ -3847,9 +3864,29 @@ void Monitor::tick()
finish_contexts(g_ceph_context, maybe_wait_for_quorum);
}
+ if (is_leader() && paxos->is_active() && fingerprint.is_zero()) {
+ // this is only necessary on upgraded clusters.
+ MonitorDBStore::Transaction t;
+ prepare_new_fingerprint(&t);
+ bufferlist tbl;
+ t.encode(tbl);
+ paxos->propose_new_value(tbl, new C_NoopContext);
+ }
+
new_tick();
}
+void Monitor::prepare_new_fingerprint(MonitorDBStore::Transaction *t)
+{
+ uuid_d nf;
+ nf.generate_random();
+ dout(10) << __func__ << " proposing cluster_fingerprint " << nf << dendl;
+
+ bufferlist bl;
+ ::encode(nf, bl);
+ t->put(MONITOR_NAME, "cluster_fingerprint", bl);
+}
+
int Monitor::check_fsid()
{
if (!store->exists(MONITOR_NAME, "cluster_uuid"))
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index 59292ec..42e148e 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -128,6 +128,7 @@ public:
void unregister_cluster_logger();
MonMap *monmap;
+ uuid_d fingerprint;
set<entity_addr_t> extra_probe_peers;
@@ -190,6 +191,8 @@ public:
const utime_t &get_leader_since() const;
+ void prepare_new_fingerprint(MonitorDBStore::Transaction *t);
+
// -- elector --
private:
Paxos *paxos;
diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc
index 5940724..3890704 100644
--- a/src/mon/MonmapMonitor.cc
+++ b/src/mon/MonmapMonitor.cc
@@ -97,6 +97,11 @@ void MonmapMonitor::encode_pending(MonitorDBStore::Transaction *t)
put_version(t, pending_map.epoch, bl);
put_last_committed(t, pending_map.epoch);
+
+ // generate a cluster fingerprint, too?
+ if (pending_map.epoch == 1) {
+ mon->prepare_new_fingerprint(t);
+ }
}
void MonmapMonitor::on_active()
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index eab5122..7e469b2 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -2067,6 +2067,32 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
}
}
+ // hit_set-less cache_mode?
+ if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
+ int problem_cache_pools = 0;
+ for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
+ p != osdmap.pools.end();
+ ++p) {
+ const pg_pool_t& info = p->second;
+ if (info.cache_mode_requires_hit_set() &&
+ info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
+ ++problem_cache_pools;
+ if (detail) {
+ ostringstream ss;
+ ss << "pool '" << osdmap.get_pool_name(p->first)
+ << "' with cache_mode " << info.get_cache_mode_name()
+ << " needs hit_set_type to be set but it is not";
+ detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+ }
+ }
+ }
+ if (problem_cache_pools) {
+ ostringstream ss;
+ ss << problem_cache_pools << " cache pools are missing hit_sets";
+ summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+ }
+ }
+
// Warn if 'mon_osd_down_out_interval' is set to zero.
// Having this option set to zero on the leader acts much like the
// 'noout' flag. It's hard to figure out what's going wrong with clusters
@@ -2453,6 +2479,26 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
string var;
cmd_getval(g_ceph_context, cmdmap, "var", var);
+ if (!p->is_tier() &&
+ (var == "hit_set_type" || var == "hit_set_period" ||
+ var == "hit_set_count" || var == "hit_set_fpp" ||
+ var == "target_max_objects" || var == "target_max_bytes" ||
+ var == "cache_target_full_ratio" ||
+ var == "cache_target_dirty_ratio" ||
+ var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
+ ss << "pool '" << poolstr
+ << "' is not a tier pool: variable not applicable";
+ r = -EACCES;
+ goto reply;
+ }
+
+ if (!p->is_erasure() && var == "erasure_code_profile") {
+ ss << "pool '" << poolstr
+ << "' is not a erasure pool: variable not applicable";
+ r = -EACCES;
+ goto reply;
+ }
+
if (f) {
f->open_object_section("pool");
f->dump_string("pool", poolstr);
@@ -2488,6 +2534,26 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
f->dump_float("hit_set_fpp", bloomp->get_fpp());
}
+ } else if (var == "target_max_objects") {
+ f->dump_unsigned("target_max_objects", p->target_max_objects);
+ } else if (var == "target_max_bytes") {
+ f->dump_unsigned("target_max_bytes", p->target_max_bytes);
+ } else if (var == "cache_target_dirty_ratio") {
+ f->dump_unsigned("cache_target_dirty_ratio_micro",
+ p->cache_target_dirty_ratio_micro);
+ f->dump_float("cache_target_dirty_ratio",
+ ((float)p->cache_target_dirty_ratio_micro/1000000));
+ } else if (var == "cache_target_full_ratio") {
+ f->dump_unsigned("cache_target_full_ratio_micro",
+ p->cache_target_full_ratio_micro);
+ f->dump_float("cache_target_full_ratio",
+ ((float)p->cache_target_full_ratio_micro/1000000));
+ } else if (var == "cache_min_flush_age") {
+ f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
+ } else if (var == "cache_min_evict_age") {
+ f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
+ } else if (var == "erasure_code_profile") {
+ f->dump_string("erasure_code_profile", p->erasure_code_profile);
}
f->close_section();
@@ -2521,7 +2587,24 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
}
BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
ss << "hit_set_fpp: " << bloomp->get_fpp();
+ } else if (var == "target_max_objects") {
+ ss << "target_max_objects: " << p->target_max_objects;
+ } else if (var == "target_max_bytes") {
+ ss << "target_max_bytes: " << p->target_max_bytes;
+ } else if (var == "cache_target_dirty_ratio") {
+ ss << "cache_target_dirty_ratio: "
+ << ((float)p->cache_target_dirty_ratio_micro/1000000);
+ } else if (var == "cache_target_full_ratio") {
+ ss << "cache_target_full_ratio: "
+ << ((float)p->cache_target_full_ratio_micro/1000000);
+ } else if (var == "cache_min_flush_age") {
+ ss << "cache_min_flush_age: " << p->cache_min_flush_age;
+ } else if (var == "cache_min_evict_age") {
+ ss << "cache_min_evict_age: " << p->cache_min_evict_age;
+ } else if (var == "erasure_code_profile") {
+ ss << "erasure_code_profile: " << p->erasure_code_profile;
}
+
rdata.append(ss);
ss.str("");
}
@@ -2626,6 +2709,45 @@ stats_out:
rdata.append("\n");
r = 0;
+ } else if (prefix == "osd pool get-quota") {
+ string pool_name;
+ cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
+
+ int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
+ if (poolid < 0) {
+ assert(poolid == -ENOENT);
+ ss << "unrecognized pool '" << pool_name << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ const pg_pool_t *p = osdmap.get_pg_pool(poolid);
+
+ if (f) {
+ f->open_object_section("pool_quotas");
+ f->dump_string("pool_name", pool_name);
+ f->dump_unsigned("pool_id", poolid);
+ f->dump_unsigned("quota_max_objects", p->quota_max_objects);
+ f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ stringstream rs;
+ rs << "quotas for pool '" << pool_name << "':\n"
+ << " max objects: ";
+ if (p->quota_max_objects == 0)
+ rs << "N/A";
+ else
+ rs << si_t(p->quota_max_objects) << " objects";
+ rs << "\n"
+ << " max bytes : ";
+ if (p->quota_max_bytes == 0)
+ rs << "N/A";
+ else
+ rs << si_t(p->quota_max_bytes) << "B";
+ rdata.append(rs.str());
+ }
+ rdata.append("\n");
+ r = 0;
} else if (prefix == "osd crush rule list" ||
prefix == "osd crush rule ls") {
string format;
@@ -2925,15 +3047,18 @@ int OSDMonitor::crush_ruleset_create_erasure(const string &name,
int *ruleset,
stringstream &ss)
{
- *ruleset = osdmap.crush->get_rule_id(name);
- if (*ruleset != -ENOENT)
+ int ruleid = osdmap.crush->get_rule_id(name);
+ if (ruleid != -ENOENT) {
+ *ruleset = osdmap.crush->get_rule_mask_ruleset(ruleid);
return -EEXIST;
+ }
CrushWrapper newcrush;
_get_pending_crush(newcrush);
- *ruleset = newcrush.get_rule_id(name);
- if (*ruleset != -ENOENT) {
+ ruleid = newcrush.get_rule_id(name);
+ if (ruleid != -ENOENT) {
+ *ruleset = newcrush.get_rule_mask_ruleset(ruleid);
return -EALREADY;
} else {
ErasureCodeInterfaceRef erasure_code;
@@ -3089,20 +3214,23 @@ int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_pr
int OSDMonitor::prepare_pool_size(const unsigned pool_type,
const string &erasure_code_profile,
- unsigned *size,
+ unsigned *size, unsigned *min_size,
stringstream &ss)
{
int err = 0;
switch (pool_type) {
case pg_pool_t::TYPE_REPLICATED:
*size = g_conf->osd_pool_default_size;
+ *min_size = g_conf->get_osd_pool_default_min_size();
break;
case pg_pool_t::TYPE_ERASURE:
{
ErasureCodeInterfaceRef erasure_code;
err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
- if (err == 0)
+ if (err == 0) {
*size = erasure_code->get_chunk_count();
+ *min_size = erasure_code->get_data_chunk_count();
+ }
}
break;
default:
@@ -3219,8 +3347,8 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
crush_ruleset_name, &crush_ruleset, ss);
if (r)
return r;
- unsigned size;
- r = prepare_pool_size(pool_type, erasure_code_profile, &size, ss);
+ unsigned size, min_size;
+ r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
if (r)
return r;
uint32_t stripe_width = 0;
@@ -3246,7 +3374,7 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
pi->flags |= pg_pool_t::FLAG_HASHPSPOOL;
pi->size = size;
- pi->min_size = g_conf->get_osd_pool_default_min_size();
+ pi->min_size = min_size;
pi->crush_ruleset = crush_ruleset;
pi->object_hash = CEPH_STR_HASH_RJENKINS;
pi->set_pg_num(pg_num ? pg_num : g_conf->osd_pool_default_pg_num);
@@ -3336,6 +3464,7 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
string interr, floaterr;
int64_t n = 0;
double f = 0;
+ int64_t uf = 0; // micro-f
if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
// wasn't a string; maybe an older mon forwarded json with an int?
if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
@@ -3345,6 +3474,17 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
n = strict_strtoll(val.c_str(), 10, &interr);
// or a float
f = strict_strtod(val.c_str(), &floaterr);
+ uf = llrintl(f * (double)1000000.0);
+ }
+
+ if (!p.is_tier() &&
+ (var == "hit_set_type" || var == "hit_set_period" ||
+ var == "hit_set_count" || var == "hit_set_fpp" ||
+ var == "target_max_objects" || var == "target_max_bytes" ||
+ var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
+ var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
+ ss << "pool '" << poolstr << "' is not a tier pool: variable not applicable";
+ return -EACCES;
}
if (var == "size") {
@@ -3399,7 +3539,7 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
return -EPERM;
}
- int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
+ int expected_osds = MAX(1, MIN(p.get_pg_num(), osdmap.get_num_osds()));
int64_t new_pgs = n - p.get_pg_num();
int64_t pgs_per_osd = new_pgs / expected_osds;
if (pgs_per_osd > g_conf->mon_osd_max_split_count) {
@@ -3487,6 +3627,7 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
}
p.hit_set_period = n;
} else if (var == "hit_set_count") {
+
if (interr.length()) {
ss << "error parsing integer value '" << val << "': " << interr;
return -EINVAL;
@@ -3528,7 +3669,7 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
ss << "value must be in the range 0..1";
return -ERANGE;
}
- p.cache_target_dirty_ratio_micro = f * 1000000;
+ p.cache_target_dirty_ratio_micro = uf;
} else if (var == "cache_target_full_ratio") {
if (floaterr.length()) {
ss << "error parsing float '" << val << "': " << floaterr;
@@ -3538,7 +3679,7 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
ss << "value must be in the range 0..1";
return -ERANGE;
}
- p.cache_target_full_ratio_micro = f * 1000000;
+ p.cache_target_full_ratio_micro = uf;
} else if (var == "cache_min_flush_age") {
if (interr.length()) {
ss << "error parsing int '" << val << "': " << interr;
@@ -4172,6 +4313,24 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
cmd_getval(g_ceph_context, cmdmap, "profile", profile);
if (profile == "")
profile = "default";
+ if (profile == "default") {
+ if (!osdmap.has_erasure_code_profile(profile)) {
+ if (pending_inc.has_erasure_code_profile(profile)) {
+ dout(20) << "erasure code profile " << profile << " already pending" << dendl;
+ goto wait;
+ }
+
+ map<string,string> profile_map;
+ err = osdmap.get_erasure_code_profile_default(g_ceph_context,
+ profile_map,
+ &ss);
+ if (err)
+ goto reply;
+ dout(20) << "erasure code profile " << profile << " set" << dendl;
+ pending_inc.set_erasure_code_profile(profile, profile_map);
+ goto wait;
+ }
+ }
int ruleset;
err = crush_ruleset_create_erasure(name, profile, &ruleset, ss);
@@ -4847,6 +5006,25 @@ done:
cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
if (erasure_code_profile == "")
erasure_code_profile = "default";
+ if (erasure_code_profile == "default") {
+ if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
+ if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
+ dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
+ goto wait;
+ }
+
+ map<string,string> profile_map;
+ err = osdmap.get_erasure_code_profile_default(g_ceph_context,
+ profile_map,
+ &ss);
+ if (err)
+ goto reply;
+ dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
+ pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
+ goto wait;
+ }
+ }
+
if (ruleset_name == "") {
if (erasure_code_profile == "default") {
ruleset_name = "erasure-code";
@@ -5054,7 +5232,10 @@ done:
goto reply;
}
if (tp->tier_of != pool_id) {
- ss << "tier pool '" << tierpoolstr << "' is a tier of '" << tp->tier_of << "'";
+ ss << "tier pool '" << tierpoolstr << "' is a tier of '"
+ << osdmap.get_pool_name(tp->tier_of) << "': "
+ // be scary about it; this is an inconsistency and bells must go off
+ << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
err = -EINVAL;
goto reply;
}
@@ -5182,8 +5363,67 @@ done:
err = -EINVAL;
goto reply;
}
+
+ // pool already has this cache-mode set and there are no pending changes
+ if (p->cache_mode == mode &&
+ (pending_inc.new_pools.count(pool_id) == 0 ||
+ pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
+ ss << "set cache-mode for pool '" << poolstr << "'"
+ << " to " << pg_pool_t::get_cache_mode_name(mode);
+ err = 0;
+ goto reply;
+ }
+
+ /* Mode description:
+ *
+ * none: No cache-mode defined
+ * forward: Forward all reads and writes to base pool
+ * writeback: Cache writes, promote reads from base pool
+ * readonly: Forward writes to base pool
+ *
+ * Hence, these are the allowed transitions:
+ *
+ * none -> any
+ * forward -> writeback || any IF num_objects_dirty == 0
+ * writeback -> forward
+ * readonly -> any
+ */
+
+ // We check if the transition is valid against the current pool mode, as
+ // it is the only committed state thus far. We will blantly squash
+ // whatever mode is on the pending state.
+
+ if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
+ mode != pg_pool_t::CACHEMODE_FORWARD) {
+ ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
+ << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
+ << "' pool; only '"
+ << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
+ << "' allowed.";
+ err = -EINVAL;
+ goto reply;
+ }
+ if (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
+ mode != pg_pool_t::CACHEMODE_WRITEBACK) {
+
+ const pool_stat_t& tier_stats =
+ mon->pgmon()->pg_map.get_pg_pool_sum_stat(pool_id);
+
+ if (tier_stats.stats.sum.num_objects_dirty > 0) {
+ ss << "unable to set cache-mode '"
+ << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
+ << "': dirty objects found";
+ err = -EBUSY;
+ goto reply;
+ }
+ }
+
// go
- pending_inc.get_new_pool(pool_id, p)->cache_mode = mode;
+ pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+ np->cache_mode = mode;
+ // set this both when moving to and from cache_mode NONE. this is to
+ // capture legacy pools that were set up before this flag existed.
+ np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
ss << "set cache-mode for pool '" << poolstr
<< "' to " << pg_pool_t::get_cache_mode_name(mode);
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
@@ -5623,8 +5863,12 @@ int OSDMonitor::_check_remove_pool(int64_t pool, const pg_pool_t *p,
return -EBUSY;
}
if (!p->tiers.empty()) {
- *ss << "pool '" << poolstr << "' includes tiers "
- << p->tiers;
+ *ss << "pool '" << poolstr << "' has tiers";
+ for(std::set<uint64_t>::iterator i = p->tiers.begin(); i != p->tiers.end(); ++i) {
+ const char *name = osdmap.get_pool_name(*i);
+ assert(name != NULL);
+ *ss << " " << name;
+ }
return -EBUSY;
}
*ss << "pool '" << poolstr << "' removed";
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index 2d4f379..fbce5fe 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -272,7 +272,7 @@ private:
stringstream &ss);
int prepare_pool_size(const unsigned pool_type,
const string &erasure_code_profile,
- unsigned *size,
+ unsigned *size, unsigned *min_size,
stringstream &ss);
int prepare_pool_stripe_width(const unsigned pool_type,
const string &erasure_code_profile,
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index ae8f6e7..15f6746 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -1215,11 +1215,13 @@ inline string percentify(const float& a) {
//void PGMonitor::dump_object_stat_sum(stringstream& ss, Formatter *f,
void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f,
- object_stat_sum_t &sum, bool verbose)
+ object_stat_sum_t &sum, uint64_t avail,
+ bool verbose)
{
if (f) {
f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
f->dump_int("bytes_used", sum.num_bytes);
+ f->dump_unsigned("max_avail", avail);
f->dump_int("objects", sum.num_objects);
if (verbose) {
f->dump_int("dirty", sum.num_objects_dirty);
@@ -1232,6 +1234,7 @@ void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f,
tbl << stringify(si_t(sum.num_bytes));
int64_t kb_used = SHIFT_ROUND_UP(sum.num_bytes, 10);
tbl << percentify(((float)kb_used / pg_map.osd_sum.kb)*100);
+ tbl << si_t(avail);
tbl << sum.num_objects;
if (verbose) {
tbl << stringify(si_t(sum.num_objects_dirty))
@@ -1241,6 +1244,24 @@ void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f,
}
}
+int64_t PGMonitor::get_rule_avail(OSDMap& osdmap, int ruleno)
+{
+ map<int,float> wm;
+ int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
+ if (r < 0)
+ return r;
+ if(wm.size() == 0)
+ return 0;
+ int64_t min = -1;
+ for (map<int,float>::iterator p = wm.begin(); p != wm.end(); ++p) {
+ int64_t proj = (float)(pg_map.osd_stat[p->first].kb_avail * 1024ull) /
+ (double)p->second;
+ if (min < 0 || proj < min)
+ min = proj;
+ }
+ return min;
+}
+
void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
{
TextTable tbl;
@@ -1252,16 +1273,18 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
if (verbose)
tbl.define_column("CATEGORY", TextTable::LEFT, TextTable::LEFT);
- tbl.define_column("USED", TextTable::LEFT, TextTable::LEFT);
- tbl.define_column("\%USED", TextTable::LEFT, TextTable::LEFT);
- tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
if (verbose) {
- tbl.define_column("DIRTY", TextTable::LEFT, TextTable::LEFT);
- tbl.define_column("READ", TextTable::LEFT, TextTable::LEFT);
- tbl.define_column("WRITE", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
}
}
+ map<int,uint64_t> avail_by_rule;
OSDMap &osdmap = mon->osdmon()->osdmap;
for (map<int64_t,pg_pool_t>::const_iterator p = osdmap.get_pools().begin();
p != osdmap.get_pools().end(); ++p) {
@@ -1271,6 +1294,38 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
string pool_name = osdmap.get_pool_name(pool_id);
pool_stat_t &stat = pg_map.pg_pool_sum[pool_id];
+ const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
+ int ruleno = osdmap.crush->find_rule(pool->get_crush_ruleset(),
+ pool->get_type(),
+ pool->get_size());
+ uint64_t avail;
+ if (avail_by_rule.count(ruleno) == 0) {
+ avail = get_rule_avail(osdmap, ruleno);
+ avail_by_rule[ruleno] = avail;
+ } else {
+ avail = avail_by_rule[ruleno];
+ }
+ switch (pool->get_type()) {
+ case pg_pool_t::TYPE_REPLICATED:
+ avail /= pool->get_size();
+ break;
+ case pg_pool_t::TYPE_ERASURE:
+ {
+ const map<string,string>& ecp =
+ osdmap.get_erasure_code_profile(pool->erasure_code_profile);
+ map<string,string>::const_iterator pm = ecp.find("m");
+ map<string,string>::const_iterator pk = ecp.find("k");
+ if (pm != ecp.end() && pk != ecp.end()) {
+ int k = atoi(pk->second.c_str());
+ int m = atoi(pm->second.c_str());
+ avail = avail * k / (m + k);
+ }
+ }
+ break;
+ default:
+ assert(0 == "unrecognized pool type");
+ }
+
if (f) {
f->open_object_section("pool");
f->dump_string("name", pool_name);
@@ -1282,7 +1337,7 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
if (verbose)
tbl << "-";
}
- dump_object_stat_sum(tbl, f, stat.stats.sum, verbose);
+ dump_object_stat_sum(tbl, f, stat.stats.sum, avail, verbose);
if (f)
f->close_section(); // stats
else
@@ -1301,7 +1356,7 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
<< ""
<< it->first;
}
- dump_object_stat_sum(tbl, f, it->second, verbose);
+ dump_object_stat_sum(tbl, f, it->second, avail, verbose);
if (f)
f->close_section(); // category name
else
@@ -1335,12 +1390,12 @@ void PGMonitor::dump_fs_stats(stringstream &ss, Formatter *f, bool verbose)
f->close_section();
} else {
TextTable tbl;
- tbl.define_column("SIZE", TextTable::LEFT, TextTable::LEFT);
- tbl.define_column("AVAIL", TextTable::LEFT, TextTable::LEFT);
- tbl.define_column("RAW USED", TextTable::LEFT, TextTable::LEFT);
- tbl.define_column("\%RAW USED", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
+ tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
if (verbose) {
- tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::LEFT);
+ tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
}
tbl << stringify(si_t(pg_map.osd_sum.kb*1024))
<< stringify(si_t(pg_map.osd_sum.kb_avail*1024))
diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h
index 09dd009..f007378 100644
--- a/src/mon/PGMonitor.h
+++ b/src/mon/PGMonitor.h
@@ -146,7 +146,11 @@ private:
vector<string>& args) const;
void dump_object_stat_sum(TextTable &tbl, Formatter *f,
- object_stat_sum_t &sum, bool verbose);
+ object_stat_sum_t &sum,
+ uint64_t avail,
+ bool verbose);
+
+ int64_t get_rule_avail(OSDMap& osdmap, int ruleno);
public:
PGMonitor(Monitor *mn, Paxos *p, const string& service_name)
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index 3883a32..b38b111 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -1264,7 +1264,8 @@ void Paxos::dispatch(PaxosServiceMessage *m)
bool Paxos::is_readable(version_t v)
{
- dout(1) << "is_readable now=" << ceph_clock_now(g_ceph_context) << " lease_expire=" << lease_expire
+ dout(5) << "is_readable now=" << ceph_clock_now(g_ceph_context)
+ << " lease_expire=" << lease_expire
<< " has v" << v << " lc " << last_committed << dendl;
if (v > last_committed)
return false;
diff --git a/src/msg/SimpleMessenger.cc b/src/msg/SimpleMessenger.cc
index 2070fe5..ce7f1fd 100644
--- a/src/msg/SimpleMessenger.cc
+++ b/src/msg/SimpleMessenger.cc
@@ -86,6 +86,9 @@ int SimpleMessenger::shutdown()
ldout(cct,10) << "shutdown " << get_myaddr() << dendl;
mark_down_all();
dispatch_queue.shutdown();
+
+ // break ref cycles on the loopback connection
+ local_connection->set_priv(NULL);
return 0;
}
diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc
index c6bd616..7eb7927 100644
--- a/src/os/FileJournal.cc
+++ b/src/os/FileJournal.cc
@@ -1758,7 +1758,12 @@ FileJournal::read_entry_result FileJournal::do_read_entry(
// ok!
if (seq)
*seq = h->seq;
- journalq.push_back(pair<uint64_t,off64_t>(h->seq, pos));
+
+ // works around an apparent GCC 4.8(?) compiler bug about unaligned
+ // bind by reference to (packed) h->seq
+ journalq.push_back(
+ pair<uint64_t,off64_t>(static_cast<uint64_t>(h->seq),
+ static_cast<off64_t>(pos)));
if (next_pos)
*next_pos = pos;
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index f73d930..1b3dd5e 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -126,7 +126,7 @@ void FileStore::FSPerfTracker::update_from_perfcounters(
{
os_commit_latency.consume_next(
logger.get_tavg_ms(
- l_os_commit_lat));
+ l_os_j_lat));
os_apply_latency.consume_next(
logger.get_tavg_ms(
l_os_apply_lat));
@@ -1558,6 +1558,8 @@ int FileStore::umount()
backend = generic_backend;
}
+ force_sync = false;
+
object_map.reset();
{
@@ -1711,7 +1713,8 @@ void FileStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle)
void FileStore::_finish_op(OpSequencer *osr)
{
- Op *o = osr->dequeue();
+ list<Context*> to_queue;
+ Op *o = osr->dequeue(&to_queue);
dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << dendl;
osr->apply_lock.Unlock(); // locked in _do_op
@@ -1729,6 +1732,7 @@ void FileStore::_finish_op(OpSequencer *osr)
if (o->onreadable) {
op_finisher.queue(o->onreadable);
}
+ op_finisher.queue(to_queue);
delete o;
}
@@ -1844,7 +1848,8 @@ void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk)
// this should queue in order because the journal does it's completions in order.
queue_op(osr, o);
- osr->dequeue_journal();
+ list<Context*> to_queue;
+ osr->dequeue_journal(&to_queue);
// do ondisk completions async, to prevent any onreadable_sync completions
// getting blocked behind an ondisk completion.
@@ -1852,6 +1857,7 @@ void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk)
dout(10) << " queueing ondisk " << ondisk << dendl;
ondisk_finisher.queue(ondisk);
}
+ ondisk_finisher.queue(to_queue);
}
int FileStore::_do_transactions(
@@ -2545,11 +2551,12 @@ unsigned FileStore::_do_transaction(
f.close_section();
f.flush(*_dout);
*_dout << dendl;
- assert(0 == "unexpected error");
if (r == -EMFILE) {
dump_open_fds(g_ceph_context);
}
+
+ assert(0 == "unexpected error");
}
}
diff --git a/src/os/FileStore.h b/src/os/FileStore.h
index 4c9ffdb..3fcd89a 100644
--- a/src/os/FileStore.h
+++ b/src/os/FileStore.h
@@ -193,19 +193,70 @@ private:
Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
list<Op*> q;
list<uint64_t> jq;
+ list<pair<uint64_t, Context*> > flush_commit_waiters;
Cond cond;
public:
Sequencer *parent;
Mutex apply_lock; // for apply mutual exclusion
+ /// get_max_uncompleted
+ bool _get_max_uncompleted(
+ uint64_t *seq ///< [out] max uncompleted seq
+ ) {
+ assert(qlock.is_locked());
+ assert(seq);
+ *seq = 0;
+ if (q.empty() && jq.empty())
+ return true;
+
+ if (!q.empty())
+ *seq = q.back()->op;
+ if (!jq.empty() && jq.back() > *seq)
+ *seq = jq.back();
+
+ return false;
+ } /// @returns true if both queues are empty
+
+ /// get_min_uncompleted
+ bool _get_min_uncompleted(
+ uint64_t *seq ///< [out] min uncompleted seq
+ ) {
+ assert(qlock.is_locked());
+ assert(seq);
+ *seq = 0;
+ if (q.empty() && jq.empty())
+ return true;
+
+ if (!q.empty())
+ *seq = q.front()->op;
+ if (!jq.empty() && jq.front() < *seq)
+ *seq = jq.front();
+
+ return false;
+ } /// @returns true if both queues are empty
+
+ void _wake_flush_waiters(list<Context*> *to_queue) {
+ uint64_t seq;
+ if (_get_min_uncompleted(&seq))
+ seq = -1;
+
+ for (list<pair<uint64_t, Context*> >::iterator i =
+ flush_commit_waiters.begin();
+ i != flush_commit_waiters.end() && i->first < seq;
+ flush_commit_waiters.erase(i++)) {
+ to_queue->push_back(i->second);
+ }
+ }
+
void queue_journal(uint64_t s) {
Mutex::Locker l(qlock);
jq.push_back(s);
}
- void dequeue_journal() {
+ void dequeue_journal(list<Context*> *to_queue) {
Mutex::Locker l(qlock);
jq.pop_front();
cond.Signal();
+ _wake_flush_waiters(to_queue);
}
void queue(Op *o) {
Mutex::Locker l(qlock);
@@ -215,20 +266,26 @@ private:
assert(apply_lock.is_locked());
return q.front();
}
- Op *dequeue() {
+
+ Op *dequeue(list<Context*> *to_queue) {
+ assert(to_queue);
assert(apply_lock.is_locked());
Mutex::Locker l(qlock);
Op *o = q.front();
q.pop_front();
cond.Signal();
+
+ _wake_flush_waiters(to_queue);
return o;
}
+
void flush() {
Mutex::Locker l(qlock);
while (g_conf->filestore_blackhole)
cond.Wait(qlock); // wait forever
+
// get max for journal _or_ op queues
uint64_t seq = 0;
if (!q.empty())
@@ -243,6 +300,17 @@ private:
cond.Wait(qlock);
}
}
+ bool flush_commit(Context *c) {
+ Mutex::Locker l(qlock);
+ uint64_t seq = 0;
+ if (_get_max_uncompleted(&seq)) {
+ delete c;
+ return true;
+ } else {
+ flush_commit_waiters.push_back(make_pair(seq, c));
+ return false;
+ }
+ }
OpSequencer()
: qlock("FileStore::OpSequencer::qlock", false, false),
diff --git a/src/os/GenericObjectMap.cc b/src/os/GenericObjectMap.cc
index 4d41c50..011c83b 100644
--- a/src/os/GenericObjectMap.cc
+++ b/src/os/GenericObjectMap.cc
@@ -689,8 +689,6 @@ void GenericObjectMap::rename(const Header old_header, const coll_t &cid,
old_header->cid = cid;
old_header->oid = target;
set_header(cid, target, *old_header, t);
-
- // "in_use" still hold the "seq"
}
int GenericObjectMap::init(bool do_upgrade)
@@ -926,35 +924,18 @@ GenericObjectMap::Header GenericObjectMap::_lookup_header(
to_get.insert(header_key(cid, oid));
_Header header;
- while (1) {
- map<string, bufferlist> out;
- bool try_again = false;
-
- int r = db->get(GHOBJECT_TO_SEQ_PREFIX, to_get, &out);
- if (r < 0)
- return Header();
- if (out.empty())
- return Header();
-
- bufferlist::iterator iter = out.begin()->second.begin();
- header.decode(iter);
-
- while (in_use.count(header.seq)) {
- header_cond.Wait(header_lock);
+ map<string, bufferlist> out;
- // Another thread is hold this header, wait for it.
- // Because the seq of this object may change, such as clone
- // and rename operation, here need to look up "seq" again
- try_again = true;
- }
+ int r = db->get(GHOBJECT_TO_SEQ_PREFIX, to_get, &out);
+ if (r < 0)
+ return Header();
+ if (out.empty())
+ return Header();
- if (!try_again) {
- break;
- }
- }
+ bufferlist::iterator iter = out.begin()->second.begin();
+ header.decode(iter);
- Header ret = Header(new _Header(header), RemoveOnDelete(this));
- in_use.insert(ret->seq);
+ Header ret = Header(new _Header(header));
return ret;
}
@@ -962,7 +943,7 @@ GenericObjectMap::Header GenericObjectMap::_generate_new_header(
const coll_t &cid, const ghobject_t &oid, Header parent,
KeyValueDB::Transaction t)
{
- Header header = Header(new _Header(), RemoveOnDelete(this));
+ Header header = Header(new _Header());
header->seq = state.seq++;
if (parent) {
header->parent = parent->seq;
@@ -970,8 +951,6 @@ GenericObjectMap::Header GenericObjectMap::_generate_new_header(
header->num_children = 1;
header->oid = oid;
header->cid = cid;
- assert(!in_use.count(header->seq));
- in_use.insert(header->seq);
write_state(t);
return header;
@@ -980,8 +959,6 @@ GenericObjectMap::Header GenericObjectMap::_generate_new_header(
GenericObjectMap::Header GenericObjectMap::lookup_parent(Header input)
{
Mutex::Locker l(header_lock);
- while (in_use.count(input->parent))
- header_cond.Wait(header_lock);
map<string, bufferlist> out;
set<string> keys;
keys.insert(PARENT_KEY);
@@ -999,13 +976,12 @@ GenericObjectMap::Header GenericObjectMap::lookup_parent(Header input)
return Header();
}
- Header header = Header(new _Header(), RemoveOnDelete(this));
+ Header header = Header(new _Header());
header->seq = input->parent;
bufferlist::iterator iter = out.begin()->second.begin();
header->decode(iter);
dout(20) << "lookup_parent: parent seq is " << header->seq << " with parent "
<< header->parent << dendl;
- in_use.insert(header->seq);
return header;
}
diff --git a/src/os/GenericObjectMap.h b/src/os/GenericObjectMap.h
index c9c64bc..3c5e3cb 100644
--- a/src/os/GenericObjectMap.h
+++ b/src/os/GenericObjectMap.h
@@ -74,12 +74,6 @@ class GenericObjectMap {
* Serializes access to next_seq as well as the in_use set
*/
Mutex header_lock;
- Cond header_cond;
-
- /**
- * Set of headers currently in use
- */
- set<uint64_t> in_use;
GenericObjectMap(KeyValueDB *db) : db(db), header_lock("GenericObjectMap") {}
@@ -371,6 +365,12 @@ protected:
return GenericObjectMapIterator(new GenericObjectMapIteratorImpl(this, header, prefix));
}
+ Header generate_new_header(const coll_t &cid, const ghobject_t &oid,
+ Header parent, KeyValueDB::Transaction t) {
+ Mutex::Locker l(header_lock);
+ return _generate_new_header(cid, oid, parent, t);
+ }
+
// Scan keys in header into out_keys and out_values (if nonnull)
int scan(Header header, const string &prefix, const set<string> &in_keys,
set<string> *out_keys, map<string, bufferlist> *out_values);
@@ -394,11 +394,6 @@ protected:
*/
Header _generate_new_header(const coll_t &cid, const ghobject_t &oid,
Header parent, KeyValueDB::Transaction t);
- Header generate_new_header(const coll_t &cid, const ghobject_t &oid,
- Header parent, KeyValueDB::Transaction t) {
- Mutex::Locker l(header_lock);
- return _generate_new_header(cid, oid, parent, t);
- }
// Lookup leaf header for c oid
Header _lookup_header(const coll_t &cid, const ghobject_t &oid);
@@ -425,26 +420,6 @@ protected:
// Sets header @see set_header
void _set_header(Header header, const bufferlist &bl,
KeyValueDB::Transaction t);
-
- /**
- * Removes header seq lock once Header is out of scope
- * @see _lookup_header
- * @see lookup_parent
- * @see generate_new_header
- */
- class RemoveOnDelete {
- public:
- GenericObjectMap *db;
- RemoveOnDelete(GenericObjectMap *db) :
- db(db) {}
- void operator() (_Header *header) {
- Mutex::Locker l(db->header_lock);
- db->in_use.erase(header->seq);
- db->header_cond.Signal();
- delete header;
- }
- };
- friend class RemoveOnDelete;
};
WRITE_CLASS_ENCODER(GenericObjectMap::_Header)
WRITE_CLASS_ENCODER(GenericObjectMap::State)
diff --git a/src/os/KeyValueStore.cc b/src/os/KeyValueStore.cc
index fb459b2..17c0c3b 100644
--- a/src/os/KeyValueStore.cc
+++ b/src/os/KeyValueStore.cc
@@ -69,68 +69,49 @@ const string KeyValueStore::COLLECTION_ATTR = "__COLL_ATTR__";
// ============== StripObjectMap Implementation =================
-void StripObjectMap::sync_wrap(StripObjectHeader &strip_header,
- KeyValueDB::Transaction t,
- const SequencerPosition &spos)
-{
- dout(10) << __func__ << " cid: " << strip_header.cid << "oid: "
- << strip_header.oid << " setting spos to " << strip_header.spos
- << dendl;
- strip_header.spos = spos;
- strip_header.header->data.clear();
- ::encode(strip_header, strip_header.header->data);
-
- sync(strip_header.header, t);
-}
-
-bool StripObjectMap::check_spos(const StripObjectHeader &header,
- const SequencerPosition &spos)
-{
- if (spos > header.spos) {
- stringstream out;
- dout(10) << "cid: " << "oid: " << header.oid
- << " not skipping op, *spos " << spos << dendl;
- dout(10) << " > header.spos " << header.spos << dendl;
- return false;
- } else {
- dout(10) << "cid: " << "oid: " << header.oid << " skipping op, spos "
- << spos << " <= header.spos " << header.spos << dendl;
- return true;
- }
-}
-
-int StripObjectMap::save_strip_header(StripObjectHeader &strip_header,
- const SequencerPosition &spos,
+int StripObjectMap::save_strip_header(StripObjectHeaderRef strip_header,
KeyValueDB::Transaction t)
{
- strip_header.spos = spos;
- strip_header.header->data.clear();
- ::encode(strip_header, strip_header.header->data);
+ strip_header->header->data.clear();
+ ::encode(*strip_header, strip_header->header->data);
- set_header(strip_header.cid, strip_header.oid, *(strip_header.header), t);
+ set_header(strip_header->cid, strip_header->oid, *(strip_header->header), t);
return 0;
}
int StripObjectMap::create_strip_header(const coll_t &cid,
const ghobject_t &oid,
- StripObjectHeader &strip_header,
+ StripObjectHeaderRef *strip_header,
KeyValueDB::Transaction t)
{
- Header header = lookup_create_header(cid, oid, t);
+ Header header = generate_new_header(cid, oid, Header(), t);
if (!header)
return -EINVAL;
- strip_header.oid = oid;
- strip_header.cid = cid;
- strip_header.header = header;
+ StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
+ tmp->oid = oid;
+ tmp->cid = cid;
+ tmp->header = header;
+ if (strip_header)
+ *strip_header = tmp;
return 0;
}
int StripObjectMap::lookup_strip_header(const coll_t &cid,
const ghobject_t &oid,
- StripObjectHeader &strip_header)
-{
+ StripObjectHeaderRef *strip_header)
+{
+ if (cid != coll_t()) {
+ Mutex::Locker l(lock);
+ pair<coll_t, StripObjectHeaderRef> p;
+ if (caches.lookup(oid, &p)) {
+ if (p.first == cid) {
+ *strip_header = p.second;
+ return 0;
+ }
+ }
+ }
Header header = lookup_header(cid, oid);
if (!header) {
@@ -139,18 +120,25 @@ int StripObjectMap::lookup_strip_header(const coll_t &cid,
return -ENOENT;
}
+
+ StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
if (header->data.length()) {
bufferlist::iterator bliter = header->data.begin();
- ::decode(strip_header, bliter);
+ ::decode(*tmp, bliter);
}
- if (strip_header.strip_size == 0)
- strip_header.strip_size = default_strip_size;
+ if (tmp->strip_size == 0)
+ tmp->strip_size = default_strip_size;
- strip_header.oid = oid;
- strip_header.cid = cid;
- strip_header.header = header;
+ tmp->oid = oid;
+ tmp->cid = cid;
+ tmp->header = header;
+ {
+ Mutex::Locker l(lock);
+ caches.add(oid, make_pair(cid, tmp));
+ }
+ *strip_header = tmp;
dout(10) << "lookup_strip_header done " << " cid " << cid << " oid "
<< oid << dendl;
return 0;
@@ -194,57 +182,62 @@ int StripObjectMap::file_to_extents(uint64_t offset, size_t len,
return 0;
}
-void StripObjectMap::clone_wrap(StripObjectHeader &old_header,
+void StripObjectMap::clone_wrap(StripObjectHeaderRef old_header,
const coll_t &cid, const ghobject_t &oid,
KeyValueDB::Transaction t,
- StripObjectHeader *origin_header,
- StripObjectHeader *target_header)
+ StripObjectHeaderRef *target_header)
{
Header new_origin_header;
+ StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
- if (target_header)
- *target_header = old_header;
- if (origin_header)
- *origin_header = old_header;
-
- clone(old_header.header, cid, oid, t, &new_origin_header,
- &target_header->header);
+ clone(old_header->header, cid, oid, t, &new_origin_header,
+ &tmp->header);
- if(origin_header)
- origin_header->header = new_origin_header;
+ tmp->oid = oid;
+ tmp->cid = cid;
+ tmp->strip_size = old_header->strip_size;
+ tmp->max_size = old_header->max_size;
+ tmp->bits = old_header->bits;
+ old_header->header = new_origin_header;
- if (target_header) {
- target_header->oid = oid;
- target_header->cid = cid;
- }
+ if (target_header)
+ *target_header = tmp;
}
-void StripObjectMap::rename_wrap(const coll_t &cid, const ghobject_t &oid,
+void StripObjectMap::rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid,
KeyValueDB::Transaction t,
- StripObjectHeader *header)
+ StripObjectHeaderRef *new_header)
{
- assert(header);
- rename(header->header, cid, oid, t);
+ rename(old_header->header, cid, oid, t);
- if (header) {
- header->oid = oid;
- header->cid = cid;
- }
+ StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
+ tmp->strip_size = old_header->strip_size;
+ tmp->max_size = old_header->max_size;
+ tmp->bits = old_header->bits;
+ tmp->header = old_header->header;
+ tmp->oid = oid;
+ tmp->cid = cid;
+
+ if (new_header)
+ *new_header = tmp;
+
+ old_header->header = Header();
+ old_header->deleted = true;
}
-int StripObjectMap::get_values_with_header(const StripObjectHeader &header,
+int StripObjectMap::get_values_with_header(const StripObjectHeaderRef header,
const string &prefix,
const set<string> &keys,
map<string, bufferlist> *out)
{
- return scan(header.header, prefix, keys, 0, out);
+ return scan(header->header, prefix, keys, 0, out);
}
-int StripObjectMap::get_keys_with_header(const StripObjectHeader &header,
+int StripObjectMap::get_keys_with_header(const StripObjectHeaderRef header,
const string &prefix,
set<string> *keys)
{
- ObjectMap::ObjectMapIterator iter = _get_iterator(header.header, prefix);
+ ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix);
for (; iter->valid(); iter->next()) {
if (iter->status())
return iter->status();
@@ -253,10 +246,10 @@ int StripObjectMap::get_keys_with_header(const StripObjectHeader &header,
return 0;
}
-int StripObjectMap::get_with_header(const StripObjectHeader &header,
+int StripObjectMap::get_with_header(const StripObjectHeaderRef header,
const string &prefix, map<string, bufferlist> *out)
{
- ObjectMap::ObjectMapIterator iter = _get_iterator(header.header, prefix);
+ ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix);
for (iter->seek_to_first(); iter->valid(); iter->next()) {
if (iter->status())
return iter->status();
@@ -265,52 +258,36 @@ int StripObjectMap::get_with_header(const StripObjectHeader &header,
return 0;
}
-// =========== KeyValueStore::SubmitManager Implementation ==============
-
-uint64_t KeyValueStore::SubmitManager::op_submit_start()
-{
- lock.Lock();
- uint64_t op = ++op_seq;
- dout(10) << "op_submit_start " << op << dendl;
- return op;
-}
-
-void KeyValueStore::SubmitManager::op_submit_finish(uint64_t op)
-{
- dout(10) << "op_submit_finish " << op << dendl;
- if (op != op_submitted + 1) {
- dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1)
- << ", OUT OF ORDER" << dendl;
- assert(0 == "out of order op_submit_finish");
- }
- op_submitted = op;
- lock.Unlock();
-}
-
// ========= KeyValueStore::BufferTransaction Implementation ============
int KeyValueStore::BufferTransaction::lookup_cached_header(
const coll_t &cid, const ghobject_t &oid,
- StripObjectMap::StripObjectHeader **strip_header,
+ StripObjectMap::StripObjectHeaderRef *strip_header,
bool create_if_missing)
{
- StripObjectMap::StripObjectHeader header;
+ StripObjectMap::StripObjectHeaderRef header;
int r = 0;
StripHeaderMap::iterator it = strip_headers.find(make_pair(cid, oid));
if (it != strip_headers.end()) {
- if (it->second.deleted)
+
+ if (!it->second->deleted) {
+ if (strip_header)
+ *strip_header = it->second;
+ return 0;
+ } else if (!create_if_missing) {
return -ENOENT;
+ }
- if (strip_header)
- *strip_header = &it->second;
- return 0;
+ // If (it->second.deleted && create_if_missing) go down
+ r = -ENOENT;
+ } else {
+ r = store->backend->lookup_strip_header(cid, oid, &header);
}
- r = store->backend->lookup_strip_header(cid, oid, header);
- if (r < 0 && create_if_missing) {
- r = store->backend->create_strip_header(cid, oid, header, t);
+ if (r == -ENOENT && create_if_missing) {
+ r = store->backend->create_strip_header(cid, oid, &header, t);
}
if (r < 0) {
@@ -321,21 +298,21 @@ int KeyValueStore::BufferTransaction::lookup_cached_header(
strip_headers[make_pair(cid, oid)] = header;
if (strip_header)
- *strip_header = &strip_headers[make_pair(cid, oid)];
+ *strip_header = strip_headers[make_pair(cid, oid)];
return r;
}
int KeyValueStore::BufferTransaction::get_buffer_keys(
- StripObjectMap::StripObjectHeader &strip_header, const string &prefix,
+ StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix,
const set<string> &keys, map<string, bufferlist> *out)
{
set<string> need_lookup;
for (set<string>::iterator it = keys.begin(); it != keys.end(); ++it) {
map<pair<string, string>, bufferlist>::iterator i =
- strip_header.buffers.find(make_pair(prefix, *it));
+ strip_header->buffers.find(make_pair(prefix, *it));
- if (i != strip_header.buffers.end()) {
+ if (i != strip_header->buffers.end()) {
(*out)[*it].swap(i->second);
} else {
need_lookup.insert(*it);
@@ -346,8 +323,8 @@ int KeyValueStore::BufferTransaction::get_buffer_keys(
int r = store->backend->get_values_with_header(strip_header, prefix,
need_lookup, out);
if (r < 0) {
- dout(10) << __func__ << " " << strip_header.cid << "/"
- << strip_header.oid << " " << " r = " << r << dendl;
+ dout(10) << __func__ << " " << strip_header->cid << "/"
+ << strip_header->oid << " " << " r = " << r << dendl;
return r;
}
}
@@ -356,78 +333,77 @@ int KeyValueStore::BufferTransaction::get_buffer_keys(
}
void KeyValueStore::BufferTransaction::set_buffer_keys(
- StripObjectMap::StripObjectHeader &strip_header,
+ StripObjectMap::StripObjectHeaderRef strip_header,
const string &prefix, map<string, bufferlist> &values)
{
- store->backend->set_keys(strip_header.header, prefix, values, t);
+ store->backend->set_keys(strip_header->header, prefix, values, t);
for (map<string, bufferlist>::iterator iter = values.begin();
iter != values.end(); ++iter) {
- strip_header.buffers[make_pair(prefix, iter->first)].swap(iter->second);
+ strip_header->buffers[make_pair(prefix, iter->first)].swap(iter->second);
}
}
int KeyValueStore::BufferTransaction::remove_buffer_keys(
- StripObjectMap::StripObjectHeader &strip_header, const string &prefix,
+ StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix,
const set<string> &keys)
{
for (set<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
- strip_header.buffers[make_pair(prefix, *iter)] = bufferlist();
+ strip_header->buffers[make_pair(prefix, *iter)] = bufferlist();
}
- return store->backend->rm_keys(strip_header.header, prefix, keys, t);
+ return store->backend->rm_keys(strip_header->header, prefix, keys, t);
}
void KeyValueStore::BufferTransaction::clear_buffer_keys(
- StripObjectMap::StripObjectHeader &strip_header, const string &prefix)
+ StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix)
{
- for (map<pair<string, string>, bufferlist>::iterator iter = strip_header.buffers.begin();
- iter != strip_header.buffers.end(); ++iter) {
+ for (map<pair<string, string>, bufferlist>::iterator iter = strip_header->buffers.begin();
+ iter != strip_header->buffers.end(); ++iter) {
if (iter->first.first == prefix)
iter->second = bufferlist();
}
}
int KeyValueStore::BufferTransaction::clear_buffer(
- StripObjectMap::StripObjectHeader &strip_header)
+ StripObjectMap::StripObjectHeaderRef strip_header)
{
- strip_header.deleted = true;
+ strip_header->deleted = true;
- return store->backend->clear(strip_header.header, t);
+ InvalidateCacheContext *c = new InvalidateCacheContext(store, strip_header->cid, strip_header->oid);
+ finishes.push_back(c);
+ return store->backend->clear(strip_header->header, t);
}
void KeyValueStore::BufferTransaction::clone_buffer(
- StripObjectMap::StripObjectHeader &old_header,
+ StripObjectMap::StripObjectHeaderRef old_header,
const coll_t &cid, const ghobject_t &oid)
{
// Remove target ahead to avoid dead lock
strip_headers.erase(make_pair(cid, oid));
- StripObjectMap::StripObjectHeader new_origin_header, new_target_header;
+ StripObjectMap::StripObjectHeaderRef new_target_header;
- store->backend->clone_wrap(old_header, cid, oid, t,
- &new_origin_header, &new_target_header);
+ store->backend->clone_wrap(old_header, cid, oid, t, &new_target_header);
// FIXME: Lacking of lock for origin header(now become parent), it will
// cause other operation can get the origin header while submitting
// transactions
- strip_headers[make_pair(cid, old_header.oid)] = new_origin_header;
strip_headers[make_pair(cid, oid)] = new_target_header;
}
void KeyValueStore::BufferTransaction::rename_buffer(
- StripObjectMap::StripObjectHeader &old_header,
+ StripObjectMap::StripObjectHeaderRef old_header,
const coll_t &cid, const ghobject_t &oid)
{
- if (store->backend->check_spos(old_header, spos))
- return ;
-
// FIXME: Lacking of lock for origin header, it will cause other operation
// can get the origin header while submitting transactions
- store->backend->rename_wrap(cid, oid, t, &old_header);
+ StripObjectMap::StripObjectHeaderRef new_header;
+ store->backend->rename_wrap(old_header, cid, oid, t, &new_header);
- strip_headers.erase(make_pair(old_header.cid, old_header.oid));
- strip_headers[make_pair(cid, oid)] = old_header;
+ InvalidateCacheContext *c = new InvalidateCacheContext(store, old_header->cid, old_header->oid);
+ finishes.push_back(c);
+ strip_headers[make_pair(cid, oid)] = new_header;
}
int KeyValueStore::BufferTransaction::submit_transaction()
@@ -436,25 +412,27 @@ int KeyValueStore::BufferTransaction::submit_transaction()
for (StripHeaderMap::iterator header_iter = strip_headers.begin();
header_iter != strip_headers.end(); ++header_iter) {
- StripObjectMap::StripObjectHeader header = header_iter->second;
+ StripObjectMap::StripObjectHeaderRef header = header_iter->second;
- if (store->backend->check_spos(header, spos))
+ if (header->deleted)
continue;
- if (header.deleted)
- continue;
+ r = store->backend->save_strip_header(header, t);
- r = store->backend->save_strip_header(header, spos, t);
if (r < 0) {
dout(10) << __func__ << " save strip header failed " << dendl;
goto out;
}
}
-out:
+ r = store->backend->submit_transaction(t);
+ for (list<Context*>::iterator it = finishes.begin(); it != finishes.end(); ++it) {
+ (*it)->complete(r);
+ }
+out:
dout(5) << __func__ << " r = " << r << dendl;
- return store->backend->submit_transaction(t);
+ return r;
}
// =========== KeyValueStore Intern Helper Implementation ==============
@@ -495,7 +473,7 @@ KeyValueStore::KeyValueStore(const std::string &base,
ObjectStore(base),
internal_name(name),
basedir(base),
- fsid_fd(-1), op_fd(-1), current_fd(-1),
+ fsid_fd(-1), current_fd(-1),
kv_type(KV_TYPE_NONE),
backend(NULL),
ondisk_finisher(g_ceph_context),
@@ -906,10 +884,6 @@ int KeyValueStore::umount()
VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
fsid_fd = -1;
}
- if (op_fd >= 0) {
- VOID_TEMP_FAILURE_RETRY(::close(op_fd));
- op_fd = -1;
- }
if (current_fd >= 0) {
VOID_TEMP_FAILURE_RETRY(::close(current_fd));
current_fd = -1;
@@ -963,14 +937,9 @@ int KeyValueStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
Op *o = build_op(tls, ondisk, onreadable, onreadable_sync, osd_op);
op_queue_reserve_throttle(o, handle);
- uint64_t op = submit_manager.op_submit_start();
- o->op = op;
- dout(5) << "queue_transactions (trailing journal) " << op << " "
- << tls <<dendl;
+ dout(5) << "queue_transactions (trailing journal) " << " " << tls <<dendl;
queue_op(osr, o);
- submit_manager.op_submit_finish(op);
-
return 0;
}
@@ -1088,7 +1057,8 @@ void KeyValueStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle)
void KeyValueStore::_finish_op(OpSequencer *osr)
{
- Op *o = osr->dequeue();
+ list<Context*> to_queue;
+ Op *o = osr->dequeue(&to_queue);
dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << dendl;
osr->apply_lock.Unlock(); // locked in _do_op
@@ -1102,6 +1072,7 @@ void KeyValueStore::_finish_op(OpSequencer *osr)
o->onreadable_sync->complete(0);
}
op_finisher.queue(o->onreadable);
+ op_finisher.queue(to_queue);
delete o;
}
@@ -1126,13 +1097,12 @@ int KeyValueStore::_do_transactions(list<Transaction*> &tls, uint64_t op_seq,
}
int trans_num = 0;
- SequencerPosition spos(op_seq, trans_num, 0);
- BufferTransaction bt(this, spos);
+ BufferTransaction bt(this);
for (list<Transaction*>::iterator p = tls.begin();
p != tls.end();
++p, trans_num++) {
- r = _do_transaction(**p, bt, spos, handle);
+ r = _do_transaction(**p, bt, handle);
if (r < 0)
break;
if (handle)
@@ -1149,12 +1119,12 @@ int KeyValueStore::_do_transactions(list<Transaction*> &tls, uint64_t op_seq,
unsigned KeyValueStore::_do_transaction(Transaction& transaction,
BufferTransaction &t,
- SequencerPosition& spos,
ThreadPool::TPHandle *handle)
{
dout(10) << "_do_transaction on " << &transaction << dendl;
Transaction::iterator i = transaction.begin();
+ uint64_t op_num = 0;
while (i.have_op()) {
if (handle)
@@ -1449,7 +1419,13 @@ unsigned KeyValueStore::_do_transaction(Transaction& transaction,
break;
case Transaction::OP_SETALLOCHINT:
- // TODO: can kvstore make use of the hint?
+ {
+ // TODO: can kvstore make use of the hint?
+ coll_t cid(i.get_cid());
+ ghobject_t oid = i.get_oid();
+ (void)i.get_length(); // discard result
+ (void)i.get_length(); // discard result
+ }
break;
default:
@@ -1487,8 +1463,7 @@ unsigned KeyValueStore::_do_transaction(Transaction& transaction,
}
dout(0) << " error " << cpp_strerror(r) << " not handled on operation "
- << op << " (" << spos << ", or op " << spos.op
- << ", counting from 0)" << dendl;
+ << op << " op " << op_num << ", counting from 0)" << dendl;
dout(0) << msg << dendl;
dout(0) << " transaction dump:\n";
JSONFormatter f(true);
@@ -1505,7 +1480,7 @@ unsigned KeyValueStore::_do_transaction(Transaction& transaction,
}
}
- spos.op++;
+ op_num++;
}
return 0; // FIXME count errors
@@ -1520,9 +1495,9 @@ bool KeyValueStore::exists(coll_t cid, const ghobject_t& oid)
dout(10) << __func__ << "collection: " << cid << " object: " << oid
<< dendl;
int r;
- StripObjectMap::StripObjectHeader header;
+ StripObjectMap::StripObjectHeaderRef header;
- r = backend->lookup_strip_header(cid, oid, header);
+ r = backend->lookup_strip_header(cid, oid, &header);
if (r < 0) {
return false;
}
@@ -1535,42 +1510,42 @@ int KeyValueStore::stat(coll_t cid, const ghobject_t& oid,
{
dout(10) << "stat " << cid << "/" << oid << dendl;
- StripObjectMap::StripObjectHeader header;
+ StripObjectMap::StripObjectHeaderRef header;
- int r = backend->lookup_strip_header(cid, oid, header);
+ int r = backend->lookup_strip_header(cid, oid, &header);
if (r < 0) {
dout(10) << "stat " << cid << "/" << oid << "=" << r << dendl;
return -ENOENT;
}
- st->st_blocks = header.max_size / header.strip_size;
- if (header.max_size % header.strip_size)
+ st->st_blocks = header->max_size / header->strip_size;
+ if (header->max_size % header->strip_size)
st->st_blocks++;
st->st_nlink = 1;
- st->st_size = header.max_size;
- st->st_blksize = header.strip_size;
+ st->st_size = header->max_size;
+ st->st_blksize = header->strip_size;
return r;
}
-int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeader &header,
+int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeaderRef header,
uint64_t offset, size_t len, bufferlist& bl,
bool allow_eio, BufferTransaction *bt)
{
- if (header.max_size < offset) {
- dout(10) << __func__ << " " << header.cid << "/" << header.oid << ")"
+ if (header->max_size < offset) {
+ dout(10) << __func__ << " " << header->cid << "/" << header->oid << ")"
<< " offset exceed the length of bl"<< dendl;
return 0;
}
if (len == 0)
- len = header.max_size - offset;
+ len = header->max_size - offset;
- if (offset + len > header.max_size)
- len = header.max_size - offset;
+ if (offset + len > header->max_size)
+ len = header->max_size - offset;
vector<StripObjectMap::StripExtent> extents;
- StripObjectMap::file_to_extents(offset, len, header.strip_size,
+ StripObjectMap::file_to_extents(offset, len, header->strip_size,
extents);
map<string, bufferlist> out;
set<string> keys;
@@ -1580,23 +1555,23 @@ int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeader &header,
bufferlist old;
string key = strip_object_key(iter->no);
- if (bt && header.buffers.count(make_pair(OBJECT_STRIP_PREFIX, key))) {
+ if (bt && header->buffers.count(make_pair(OBJECT_STRIP_PREFIX, key))) {
// use strip_header buffer
- assert(header.bits[iter->no]);
- out[key] = header.buffers[make_pair(OBJECT_STRIP_PREFIX, key)];
- } else if (header.bits[iter->no]) {
+ assert(header->bits[iter->no]);
+ out[key] = header->buffers[make_pair(OBJECT_STRIP_PREFIX, key)];
+ } else if (header->bits[iter->no]) {
keys.insert(key);
}
}
int r = backend->get_values_with_header(header, OBJECT_STRIP_PREFIX, keys, &out);
if (r < 0) {
- dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
+ dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
<< offset << "~" << len << " = " << r << dendl;
return r;
} else if (out.size() != keys.size()) {
dout(0) << __func__ << " broken header or missing data in backend "
- << header.cid << "/" << header.oid << " " << offset << "~"
+ << header->cid << "/" << header->oid << " " << offset << "~"
<< len << " = " << r << dendl;
return -EBADF;
}
@@ -1605,8 +1580,8 @@ int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeader &header,
iter != extents.end(); ++iter) {
string key = strip_object_key(iter->no);
- if (header.bits[iter->no]) {
- if (iter->len == header.strip_size) {
+ if (header->bits[iter->no]) {
+ if (iter->len == header->strip_size) {
bl.claim_append(out[key]);
} else {
out[key].copy(iter->offset, iter->len, bl);
@@ -1616,7 +1591,7 @@ int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeader &header,
}
}
- dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
+ dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
<< offset << "~" << bl.length() << "/" << len << " r = " << r
<< dendl;
@@ -1630,9 +1605,9 @@ int KeyValueStore::read(coll_t cid, const ghobject_t& oid, uint64_t offset,
dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
<< len << dendl;
- StripObjectMap::StripObjectHeader header;
+ StripObjectMap::StripObjectHeaderRef header;
- int r = backend->lookup_strip_header(cid, oid, header);
+ int r = backend->lookup_strip_header(cid, oid, &header);
if (r < 0) {
dout(10) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
@@ -1649,9 +1624,9 @@ int KeyValueStore::fiemap(coll_t cid, const ghobject_t& oid,
dout(10) << __func__ << " " << cid << " " << oid << " " << offset << "~"
<< len << dendl;
int r;
- StripObjectMap::StripObjectHeader header;
+ StripObjectMap::StripObjectHeaderRef header;
- r = backend->lookup_strip_header(cid, oid, header);
+ r = backend->lookup_strip_header(cid, oid, &header);
if (r < 0) {
dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len
<< " failed to get header: r = " << r << dendl;
@@ -1659,13 +1634,14 @@ int KeyValueStore::fiemap(coll_t cid, const ghobject_t& oid,
}
vector<StripObjectMap::StripExtent> extents;
- StripObjectMap::file_to_extents(offset, len, header.strip_size,
+ StripObjectMap::file_to_extents(offset, len, header->strip_size,
extents);
map<uint64_t, uint64_t> m;
for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
iter != extents.end(); ++iter) {
- m[iter->offset] = iter->len;
+ uint64_t off = iter->no * header->strip_size + iter->offset;
+ m[off] = iter->len;
}
::encode(m, bl);
return 0;
@@ -1677,7 +1653,7 @@ int KeyValueStore::_remove(coll_t cid, const ghobject_t& oid,
dout(15) << __func__ << " " << cid << "/" << oid << dendl;
int r;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
r = t.lookup_cached_header(cid, oid, &header, false);
if (r < 0) {
@@ -1686,7 +1662,9 @@ int KeyValueStore::_remove(coll_t cid, const ghobject_t& oid,
return r;
}
- r = t.clear_buffer(*header);
+ header->max_size = 0;
+ header->bits.clear();
+ r = t.clear_buffer(header);
dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
return r;
@@ -1699,7 +1677,7 @@ int KeyValueStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size,
<< dendl;
int r;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
r = t.lookup_cached_header(cid, oid, &header, false);
if (r < 0) {
@@ -1725,7 +1703,7 @@ int KeyValueStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size,
string key = strip_object_key(iter->no);
lookup_keys.insert(key);
- r = t.get_buffer_keys(*header, OBJECT_STRIP_PREFIX,
+ r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX,
lookup_keys, &values);
if (r < 0) {
dout(10) << __func__ << " " << cid << "/" << oid << " "
@@ -1743,7 +1721,7 @@ int KeyValueStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size,
assert(value.length() == header->strip_size);
value.swap(values[key]);
- t.set_buffer_keys(*header, OBJECT_STRIP_PREFIX, values);
+ t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values);
++iter;
}
@@ -1754,7 +1732,7 @@ int KeyValueStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size,
header->bits[iter->no] = 0;
}
}
- r = t.remove_buffer_keys(*header, OBJECT_STRIP_PREFIX, keys);
+ r = t.remove_buffer_keys(header, OBJECT_STRIP_PREFIX, keys);
if (r < 0) {
dout(10) << __func__ << " " << cid << "/" << oid << " "
<< size << " = " << r << dendl;
@@ -1776,7 +1754,7 @@ int KeyValueStore::_touch(coll_t cid, const ghobject_t& oid,
dout(15) << __func__ << " " << cid << "/" << oid << dendl;
int r;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
r = t.lookup_cached_header(cid, oid, &header, true);
if (r < 0) {
@@ -1790,7 +1768,7 @@ int KeyValueStore::_touch(coll_t cid, const ghobject_t& oid,
return r;
}
-int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeader &header,
+int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeaderRef header,
uint64_t offset, size_t len,
const bufferlist& bl, BufferTransaction &t,
bool replica)
@@ -1798,34 +1776,34 @@ int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeader &header,
if (len > bl.length())
len = bl.length();
- if (len + offset > header.max_size) {
- header.max_size = len + offset;
- header.bits.resize(header.max_size/header.strip_size+1);
+ if (len + offset > header->max_size) {
+ header->max_size = len + offset;
+ header->bits.resize(header->max_size/header->strip_size+1);
}
vector<StripObjectMap::StripExtent> extents;
- StripObjectMap::file_to_extents(offset, len, header.strip_size,
+ StripObjectMap::file_to_extents(offset, len, header->strip_size,
extents);
map<string, bufferlist> out;
set<string> keys;
for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
iter != extents.end(); ++iter) {
- if (header.bits[iter->no] && !(iter->offset == 0 &&
- iter->len == header.strip_size))
+ if (header->bits[iter->no] && !(iter->offset == 0 &&
+ iter->len == header->strip_size))
keys.insert(strip_object_key(iter->no));
}
int r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, keys, &out);
if (r < 0) {
- dout(10) << __func__ << " failed to get value " << header.cid << "/"
- << header.oid << " " << offset << "~" << len << " = " << r
+ dout(10) << __func__ << " failed to get value " << header->cid << "/"
+ << header->oid << " " << offset << "~" << len << " = " << r
<< dendl;
return r;
} else if (keys.size() != out.size()) {
// Error on header.bits or the corresponding key/value pair is missing
dout(0) << __func__ << " broken header or missing data in backend "
- << header.cid << "/" << header.oid << " " << offset << "~"
+ << header->cid << "/" << header->oid << " " << offset << "~"
<< len << " = " << r << dendl;
return -EBADF;
}
@@ -1836,19 +1814,19 @@ int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeader &header,
iter != extents.end(); ++iter) {
bufferlist value;
string key = strip_object_key(iter->no);
- if (header.bits[iter->no]) {
- if (iter->offset == 0 && iter->len == header.strip_size) {
+ if (header->bits[iter->no]) {
+ if (iter->offset == 0 && iter->len == header->strip_size) {
bl.copy(bl_offset, iter->len, value);
bl_offset += iter->len;
} else {
- assert(out[key].length() == header.strip_size);
+ assert(out[key].length() == header->strip_size);
out[key].copy(0, iter->offset, value);
bl.copy(bl_offset, iter->len, value);
bl_offset += iter->len;
- if (value.length() != header.strip_size)
- out[key].copy(value.length(), header.strip_size-value.length(),
+ if (value.length() != header->strip_size)
+ out[key].copy(value.length(), header->strip_size-value.length(),
value);
}
} else {
@@ -1857,18 +1835,18 @@ int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeader &header,
bl.copy(bl_offset, iter->len, value);
bl_offset += iter->len;
- if (value.length() < header.strip_size)
- value.append_zero(header.strip_size-value.length());
+ if (value.length() < header->strip_size)
+ value.append_zero(header->strip_size-value.length());
- header.bits[iter->no] = 1;
+ header->bits[iter->no] = 1;
}
- assert(value.length() == header.strip_size);
+ assert(value.length() == header->strip_size);
values[key].swap(value);
}
assert(bl_offset == len);
t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values);
- dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
+ dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
<< offset << "~" << len << " = " << r << dendl;
return r;
@@ -1882,7 +1860,7 @@ int KeyValueStore::_write(coll_t cid, const ghobject_t& oid,
<< len << dendl;
int r;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
r = t.lookup_cached_header(cid, oid, &header, true);
if (r < 0) {
@@ -1891,7 +1869,7 @@ int KeyValueStore::_write(coll_t cid, const ghobject_t& oid,
return r;
}
- return _generic_write(*header, offset, len, bl, t, replica);
+ return _generic_write(header, offset, len, bl, t, replica);
}
int KeyValueStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset,
@@ -1920,7 +1898,7 @@ int KeyValueStore::_clone(coll_t cid, const ghobject_t& oldoid,
return 0;
int r;
- StripObjectMap::StripObjectHeader *old_header;
+ StripObjectMap::StripObjectHeaderRef old_header;
r = t.lookup_cached_header(cid, oldoid, &old_header, false);
if (r < 0) {
@@ -1929,7 +1907,7 @@ int KeyValueStore::_clone(coll_t cid, const ghobject_t& oldoid,
return r;
}
- t.clone_buffer(*old_header, cid, newoid);
+ t.clone_buffer(old_header, cid, newoid);
dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
<< newoid << " = " << r << dendl;
@@ -1948,7 +1926,7 @@ int KeyValueStore::_clone_range(coll_t cid, const ghobject_t& oldoid,
int r;
bufferlist bl;
- StripObjectMap::StripObjectHeader *old_header, *new_header;
+ StripObjectMap::StripObjectHeaderRef old_header, new_header;
r = t.lookup_cached_header(cid, oldoid, &old_header, false);
if (r < 0) {
@@ -1966,11 +1944,11 @@ int KeyValueStore::_clone_range(coll_t cid, const ghobject_t& oldoid,
return r;
}
- r = _generic_read(*old_header, srcoff, len, bl, &t);
+ r = _generic_read(old_header, srcoff, len, bl, &t);
if (r < 0)
goto out;
- r = _generic_write(*new_header, dstoff, len, bl, t);
+ r = _generic_write(new_header, dstoff, len, bl, t);
out:
dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
@@ -1990,9 +1968,17 @@ int KeyValueStore::getattr(coll_t cid, const ghobject_t& oid, const char *name,
int r;
map<string, bufferlist> got;
set<string> to_get;
+ StripObjectMap::StripObjectHeaderRef header;
to_get.insert(string(name));
- r = backend->get_values(cid, oid, OBJECT_XATTR, to_get, &got);
+
+ r = backend->lookup_strip_header(cid, oid, &header);
+ if (r < 0) {
+ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
+ return r;
+ }
+
+ r = backend->get_values_with_header(header, OBJECT_XATTR, to_get, &got);
if (r < 0 && r != -ENOENT) {
dout(10) << __func__ << " get_xattrs err r =" << r << dendl;
goto out;
@@ -2056,7 +2042,7 @@ int KeyValueStore::_setattrs(coll_t cid, const ghobject_t& oid,
int r;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
map<string, bufferlist> attrs;
r = t.lookup_cached_header(cid, oid, &header, false);
@@ -2068,7 +2054,7 @@ int KeyValueStore::_setattrs(coll_t cid, const ghobject_t& oid,
attrs[it->first].push_back(it->second);
}
- t.set_buffer_keys(*header, OBJECT_XATTR, attrs);
+ t.set_buffer_keys(header, OBJECT_XATTR, attrs);
out:
dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
@@ -2084,7 +2070,7 @@ int KeyValueStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name,
int r;
set<string> to_remove;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
r = t.lookup_cached_header(cid, oid, &header, false);
if (r < 0) {
@@ -2094,7 +2080,7 @@ int KeyValueStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name,
}
to_remove.insert(string(name));
- r = t.remove_buffer_keys(*header, OBJECT_XATTR, to_remove);
+ r = t.remove_buffer_keys(header, OBJECT_XATTR, to_remove);
dout(10) << __func__ << " " << cid << "/" << oid << " '" << name << "' = "
<< r << dendl;
@@ -2109,7 +2095,7 @@ int KeyValueStore::_rmattrs(coll_t cid, const ghobject_t& oid,
int r;
set<string> attrs;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
r = t.lookup_cached_header(cid, oid, &header, false);
if (r < 0) {
@@ -2118,14 +2104,14 @@ int KeyValueStore::_rmattrs(coll_t cid, const ghobject_t& oid,
return r;
}
- r = backend->get_keys_with_header(*header, OBJECT_XATTR, &attrs);
+ r = backend->get_keys_with_header(header, OBJECT_XATTR, &attrs);
if (r < 0 && r != -ENOENT) {
dout(10) << __func__ << " could not get attrs r = " << r << dendl;
return r;
}
- r = t.remove_buffer_keys(*header, OBJECT_XATTR, attrs);
- t.clear_buffer_keys(*header, OBJECT_XATTR);
+ r = t.remove_buffer_keys(header, OBJECT_XATTR, attrs);
+ t.clear_buffer_keys(header, OBJECT_XATTR);
dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
return r;
@@ -2168,10 +2154,18 @@ int KeyValueStore::collection_getattr(coll_t c, const char *name,
set<string> keys;
map<string, bufferlist> out;
+ StripObjectMap::StripObjectHeaderRef header;
+
keys.insert(string(name));
- int r = backend->get_values(get_coll_for_coll(), make_ghobject_for_coll(c),
- COLLECTION_ATTR, keys, &out);
+ int r = backend->lookup_strip_header(get_coll_for_coll(),
+ make_ghobject_for_coll(c), &header);
+ if (r < 0) {
+ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
+ return r;
+ }
+
+ r = backend->get_values_with_header(header, COLLECTION_ATTR, keys, &out);
if (r < 0) {
dout(10) << __func__ << " could not get key" << string(name) << dendl;
r = -EINVAL;
@@ -2192,14 +2186,21 @@ int KeyValueStore::collection_getattrs(coll_t cid,
map<string, bufferlist> out;
set<string> keys;
+ StripObjectMap::StripObjectHeaderRef header;
for (map<string, bufferptr>::iterator it = aset.begin();
it != aset.end(); ++it) {
keys.insert(it->first);
}
- int r = backend->get_values(get_coll_for_coll(), make_ghobject_for_coll(cid),
- COLLECTION_ATTR, keys, &out);
+ int r = backend->lookup_strip_header(get_coll_for_coll(),
+ make_ghobject_for_coll(cid), &header);
+ if (r < 0) {
+ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
+ return r;
+ }
+
+ r = backend->get_values_with_header(header, COLLECTION_ATTR, keys, &out);
if (r < 0) {
dout(10) << __func__ << " could not get keys" << dendl;
r = -EINVAL;
@@ -2227,7 +2228,7 @@ int KeyValueStore::_collection_setattr(coll_t c, const char *name,
int r;
bufferlist bl;
map<string, bufferlist> out;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
r = t.lookup_cached_header(get_coll_for_coll(),
make_ghobject_for_coll(c),
@@ -2240,7 +2241,7 @@ int KeyValueStore::_collection_setattr(coll_t c, const char *name,
bl.append(reinterpret_cast<const char*>(value), size);
out.insert(make_pair(string(name), bl));
- t.set_buffer_keys(*header, COLLECTION_ATTR, out);
+ t.set_buffer_keys(header, COLLECTION_ATTR, out);
dout(10) << __func__ << " " << c << " '"
<< name << "' len " << size << " = " << r << dendl;
@@ -2254,7 +2255,7 @@ int KeyValueStore::_collection_rmattr(coll_t c, const char *name,
bufferlist bl;
set<string> out;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
int r = t.lookup_cached_header(get_coll_for_coll(),
make_ghobject_for_coll(c), &header, false);
@@ -2264,7 +2265,7 @@ int KeyValueStore::_collection_rmattr(coll_t c, const char *name,
}
out.insert(string(name));
- r = t.remove_buffer_keys(*header, COLLECTION_ATTR, out);
+ r = t.remove_buffer_keys(header, COLLECTION_ATTR, out);
dout(10) << __func__ << " " << c << " = " << r << dendl;
return r;
@@ -2277,7 +2278,7 @@ int KeyValueStore::_collection_setattrs(coll_t cid,
dout(15) << __func__ << " " << cid << dendl;
map<string, bufferlist> attrs;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
int r = t.lookup_cached_header(get_coll_for_coll(),
make_ghobject_for_coll(cid),
&header, false);
@@ -2291,7 +2292,7 @@ int KeyValueStore::_collection_setattrs(coll_t cid,
attrs[it->first].push_back(it->second);
}
- t.set_buffer_keys(*header, COLLECTION_ATTR, attrs);
+ t.set_buffer_keys(header, COLLECTION_ATTR, attrs);
dout(10) << __func__ << " " << cid << " = " << r << dendl;
return r;
@@ -2305,7 +2306,7 @@ int KeyValueStore::_create_collection(coll_t c, BufferTransaction &t)
dout(15) << __func__ << " " << c << dendl;
int r;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
bufferlist bl;
r = t.lookup_cached_header(get_coll_for_coll(),
@@ -2330,7 +2331,7 @@ int KeyValueStore::_destroy_collection(coll_t c, BufferTransaction &t)
int r;
uint64_t modified_object = 0;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
vector<ghobject_t> oids;
r = t.lookup_cached_header(get_coll_for_coll(), make_ghobject_for_coll(c),
@@ -2347,7 +2348,7 @@ int KeyValueStore::_destroy_collection(coll_t c, BufferTransaction &t)
continue;
modified_object++;
- if (!iter->second.deleted) {
+ if (!iter->second->deleted) {
r = -ENOTEMPTY;
goto out;
}
@@ -2369,7 +2370,7 @@ int KeyValueStore::_destroy_collection(coll_t c, BufferTransaction &t)
}
}
- r = t.clear_buffer(*header);
+ r = t.clear_buffer(header);
out:
dout(10) << __func__ << " " << c << " = " << r << dendl;
@@ -2385,7 +2386,7 @@ int KeyValueStore::_collection_add(coll_t c, coll_t oldcid,
<< o << dendl;
bufferlist bl;
- StripObjectMap::StripObjectHeader *header, *old_header;
+ StripObjectMap::StripObjectHeaderRef header, old_header;
int r = t.lookup_cached_header(oldcid, o, &old_header, false);
if (r < 0) {
@@ -2400,13 +2401,13 @@ int KeyValueStore::_collection_add(coll_t c, coll_t oldcid,
goto out;
}
- r = _generic_read(*old_header, 0, old_header->max_size, bl, &t);
+ r = _generic_read(old_header, 0, old_header->max_size, bl, &t);
if (r < 0) {
r = -EINVAL;
goto out;
}
- r = _generic_write(*header, 0, bl.length(), bl, t);
+ r = _generic_write(header, 0, bl.length(), bl, t);
if (r < 0) {
r = -EINVAL;
}
@@ -2425,7 +2426,7 @@ int KeyValueStore::_collection_move_rename(coll_t oldcid,
dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/"
<< oldoid << dendl;
int r;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
r = t.lookup_cached_header(c, o, &header, false);
if (r == 0) {
@@ -2441,7 +2442,7 @@ int KeyValueStore::_collection_move_rename(coll_t oldcid,
return r;
}
- t.rename_buffer(*header, c, o);
+ t.rename_buffer(header, c, o);
dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/"
<< oldoid << " = " << r << dendl;
@@ -2453,7 +2454,7 @@ int KeyValueStore::_collection_remove_recursive(const coll_t &cid,
{
dout(15) << __func__ << " " << cid << dendl;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
int r = t.lookup_cached_header(get_coll_for_coll(),
make_ghobject_for_coll(cid),
@@ -2478,7 +2479,7 @@ int KeyValueStore::_collection_remove_recursive(const coll_t &cid,
}
}
- r = t.clear_buffer(*header);
+ r = t.clear_buffer(header);
dout(10) << __func__ << " " << cid << " r = " << r << dendl;
return 0;
@@ -2490,7 +2491,7 @@ int KeyValueStore::_collection_rename(const coll_t &cid, const coll_t &ncid,
dout(10) << __func__ << " origin cid " << cid << " new cid " << ncid
<< dendl;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
int r = t.lookup_cached_header(get_coll_for_coll(),
make_ghobject_for_coll(ncid),
@@ -2532,7 +2533,7 @@ int KeyValueStore::_collection_rename(const coll_t &cid, const coll_t &ncid,
current = next;
}
- t.rename_buffer(*header, get_coll_for_coll(), make_ghobject_for_coll(ncid));
+ t.rename_buffer(header, get_coll_for_coll(), make_ghobject_for_coll(ncid));
dout(10) << __func__ << " origin cid " << cid << " new cid " << ncid
<< dendl;
@@ -2560,9 +2561,9 @@ bool KeyValueStore::collection_exists(coll_t c)
{
dout(10) << __func__ << " " << dendl;
- StripObjectMap::StripObjectHeader header;
+ StripObjectMap::StripObjectHeaderRef header;
int r = backend->lookup_strip_header(get_coll_for_coll(),
- make_ghobject_for_coll(c), header);
+ make_ghobject_for_coll(c), &header);
if (r < 0) {
return false;
}
@@ -2652,15 +2653,14 @@ int KeyValueStore::omap_get(coll_t c, const ghobject_t &hoid,
{
dout(15) << __func__ << " " << c << "/" << hoid << dendl;
- StripObjectMap::StripObjectHeader header;
+ StripObjectMap::StripObjectHeaderRef header;
- int r = backend->lookup_strip_header(c, hoid, header);
+ int r = backend->lookup_strip_header(c, hoid, &header);
if (r < 0) {
dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
return r;
}
-
r = backend->get_with_header(header, OBJECT_OMAP, out);
if (r < 0 && r != -ENOENT) {
dout(10) << __func__ << " err r =" << r << dendl;
@@ -2692,9 +2692,16 @@ int KeyValueStore::omap_get_header(coll_t c, const ghobject_t &hoid,
set<string> keys;
map<string, bufferlist> got;
+ StripObjectMap::StripObjectHeaderRef header;
+
+ int r = backend->lookup_strip_header(c, hoid, &header);
+ if (r < 0) {
+ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
+ return r;
+ }
keys.insert(OBJECT_OMAP_HEADER_KEY);
- int r = backend->get_values(c, hoid, OBJECT_OMAP_HEADER, keys, &got);
+ r = backend->get_values_with_header(header, OBJECT_OMAP_HEADER, keys, &got);
if (r < 0 && r != -ENOENT) {
dout(10) << __func__ << " err r =" << r << dendl;
return r;
@@ -2712,7 +2719,14 @@ int KeyValueStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set<string> *
{
dout(15) << __func__ << " " << c << "/" << hoid << dendl;
- int r = backend->get_keys(c, hoid, OBJECT_OMAP, keys);
+ StripObjectMap::StripObjectHeaderRef header;
+ int r = backend->lookup_strip_header(c, hoid, &header);
+ if (r < 0) {
+ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
+ return r;
+ }
+
+ r = backend->get_keys_with_header(header, OBJECT_OMAP, keys);
if (r < 0 && r != -ENOENT) {
return r;
}
@@ -2725,7 +2739,14 @@ int KeyValueStore::omap_get_values(coll_t c, const ghobject_t &hoid,
{
dout(15) << __func__ << " " << c << "/" << hoid << dendl;
- int r = backend->get_values(c, hoid, OBJECT_OMAP, keys, out);
+ StripObjectMap::StripObjectHeaderRef header;
+ int r = backend->lookup_strip_header(c, hoid, &header);
+ if (r < 0) {
+ dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
+ return r;
+ }
+
+ r = backend->get_values_with_header(header, OBJECT_OMAP, keys, out);
if (r < 0 && r != -ENOENT) {
return r;
}
@@ -2756,7 +2777,7 @@ int KeyValueStore::_omap_clear(coll_t cid, const ghobject_t &hoid,
{
dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
int r = t.lookup_cached_header(cid, hoid, &header, false);
if (r < 0) {
@@ -2766,13 +2787,13 @@ int KeyValueStore::_omap_clear(coll_t cid, const ghobject_t &hoid,
}
set<string> keys;
- r = backend->get_keys_with_header(*header, OBJECT_OMAP, &keys);
+ r = backend->get_keys_with_header(header, OBJECT_OMAP, &keys);
if (r < 0 && r != -ENOENT) {
dout(10) << __func__ << " could not get omap_keys r = " << r << dendl;
return r;
}
- r = t.remove_buffer_keys(*header, OBJECT_OMAP, keys);
+ r = t.remove_buffer_keys(header, OBJECT_OMAP, keys);
if (r < 0) {
dout(10) << __func__ << " could not remove keys r = " << r << dendl;
return r;
@@ -2780,13 +2801,13 @@ int KeyValueStore::_omap_clear(coll_t cid, const ghobject_t &hoid,
keys.clear();
keys.insert(OBJECT_OMAP_HEADER_KEY);
- r = t.remove_buffer_keys(*header, OBJECT_OMAP_HEADER, keys);
+ r = t.remove_buffer_keys(header, OBJECT_OMAP_HEADER, keys);
if (r < 0) {
dout(10) << __func__ << " could not remove keys r = " << r << dendl;
return r;
}
- t.clear_buffer_keys(*header, OBJECT_OMAP_HEADER);
+ t.clear_buffer_keys(header, OBJECT_OMAP_HEADER);
dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl;
return 0;
@@ -2798,7 +2819,7 @@ int KeyValueStore::_omap_setkeys(coll_t cid, const ghobject_t &hoid,
{
dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
int r = t.lookup_cached_header(cid, hoid, &header, false);
if (r < 0) {
@@ -2807,7 +2828,7 @@ int KeyValueStore::_omap_setkeys(coll_t cid, const ghobject_t &hoid,
return r;
}
- t.set_buffer_keys(*header, OBJECT_OMAP, aset);
+ t.set_buffer_keys(header, OBJECT_OMAP, aset);
return 0;
}
@@ -2818,7 +2839,7 @@ int KeyValueStore::_omap_rmkeys(coll_t cid, const ghobject_t &hoid,
{
dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
int r = t.lookup_cached_header(cid, hoid, &header, false);
if (r < 0) {
@@ -2827,7 +2848,7 @@ int KeyValueStore::_omap_rmkeys(coll_t cid, const ghobject_t &hoid,
return r;
}
- r = t.remove_buffer_keys(*header, OBJECT_OMAP, keys);
+ r = t.remove_buffer_keys(header, OBJECT_OMAP, keys);
dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl;
return r;
@@ -2861,7 +2882,7 @@ int KeyValueStore::_omap_setheader(coll_t cid, const ghobject_t &hoid,
dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
map<string, bufferlist> sets;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
int r = t.lookup_cached_header(cid, hoid, &header, false);
if (r < 0) {
@@ -2871,7 +2892,7 @@ int KeyValueStore::_omap_setheader(coll_t cid, const ghobject_t &hoid,
}
sets[OBJECT_OMAP_HEADER_KEY] = bl;
- t.set_buffer_keys(*header, OBJECT_OMAP_HEADER, sets);
+ t.set_buffer_keys(header, OBJECT_OMAP_HEADER, sets);
return 0;
}
@@ -2881,7 +2902,7 @@ int KeyValueStore::_split_collection(coll_t cid, uint32_t bits, uint32_t rem,
{
dout(15) << __func__ << " " << cid << " bits: " << bits << dendl;
- StripObjectMap::StripObjectHeader *header;
+ StripObjectMap::StripObjectHeaderRef header;
int r = t.lookup_cached_header(get_coll_for_coll(),
make_ghobject_for_coll(cid),
diff --git a/src/os/KeyValueStore.h b/src/os/KeyValueStore.h
index d7b9c0a..bc36103 100644
--- a/src/os/KeyValueStore.h
+++ b/src/os/KeyValueStore.h
@@ -36,8 +36,8 @@ using namespace std;
#include "common/Mutex.h"
#include "GenericObjectMap.h"
-#include "SequencerPosition.h"
#include "KeyValueDB.h"
+#include "common/random_cache.hpp"
#include "include/uuid.h"
@@ -48,6 +48,8 @@ enum kvstore_types {
};
+static uint64_t default_strip_size = 1024;
+
class StripObjectMap: public GenericObjectMap {
public:
@@ -65,7 +67,6 @@ class StripObjectMap: public GenericObjectMap {
uint64_t strip_size;
uint64_t max_size;
vector<char> bits;
- SequencerPosition spos;
// soft state
Header header; // FIXME: Hold lock to avoid concurrent operations, it will
@@ -82,7 +83,6 @@ class StripObjectMap: public GenericObjectMap {
::encode(strip_size, bl);
::encode(max_size, bl);
::encode(bits, bl);
- ::encode(spos, bl);
ENCODE_FINISH(bl);
}
@@ -91,56 +91,56 @@ class StripObjectMap: public GenericObjectMap {
::decode(strip_size, bl);
::decode(max_size, bl);
::decode(bits, bl);
- ::decode(spos, bl);
DECODE_FINISH(bl);
}
};
-
- bool check_spos(const StripObjectHeader &header,
- const SequencerPosition &spos);
- void sync_wrap(StripObjectHeader &strip_header, KeyValueDB::Transaction t,
- const SequencerPosition &spos);
+ typedef ceph::shared_ptr<StripObjectHeader> StripObjectHeaderRef;
static int file_to_extents(uint64_t offset, size_t len, uint64_t strip_size,
vector<StripExtent> &extents);
int lookup_strip_header(const coll_t & cid, const ghobject_t &oid,
- StripObjectHeader &header);
- int save_strip_header(StripObjectHeader &header,
- const SequencerPosition &spos,
- KeyValueDB::Transaction t);
+ StripObjectHeaderRef *header);
+ int save_strip_header(StripObjectHeaderRef header, KeyValueDB::Transaction t);
int create_strip_header(const coll_t &cid, const ghobject_t &oid,
- StripObjectHeader &strip_header,
+ StripObjectHeaderRef *strip_header,
KeyValueDB::Transaction t);
- void clone_wrap(StripObjectHeader &old_header,
+ void clone_wrap(StripObjectHeaderRef old_header,
const coll_t &cid, const ghobject_t &oid,
KeyValueDB::Transaction t,
- StripObjectHeader *origin_header,
- StripObjectHeader *target_header);
- void rename_wrap(const coll_t &cid, const ghobject_t &oid,
+ StripObjectHeaderRef *target_header);
+ void rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid,
KeyValueDB::Transaction t,
- StripObjectHeader *header);
+ StripObjectHeaderRef *new_header);
// Already hold header to avoid lock header seq again
int get_with_header(
- const StripObjectHeader &header,
+ const StripObjectHeaderRef header,
const string &prefix,
map<string, bufferlist> *out
);
int get_values_with_header(
- const StripObjectHeader &header,
+ const StripObjectHeaderRef header,
const string &prefix,
const set<string> &keys,
map<string, bufferlist> *out
);
int get_keys_with_header(
- const StripObjectHeader &header,
+ const StripObjectHeaderRef header,
const string &prefix,
set<string> *keys
);
- StripObjectMap(KeyValueDB *db): GenericObjectMap(db) {}
+ Mutex lock;
+ void invalidate_cache(const coll_t &c, const ghobject_t &oid) {
+ Mutex::Locker l(lock);
+ caches.clear(oid);
+ }
- static const uint64_t default_strip_size = 1024;
+ RandomCache<ghobject_t, pair<coll_t, StripObjectHeaderRef> > caches;
+ StripObjectMap(KeyValueDB *db): GenericObjectMap(db),
+ lock("StripObjectMap::lock"),
+ caches(g_conf->keyvaluestore_header_cache_size)
+ {}
};
@@ -161,7 +161,7 @@ class KeyValueStore : public ObjectStore,
std::string current_op_seq_fn;
uuid_d fsid;
- int fsid_fd, op_fd, current_fd;
+ int fsid_fd, current_fd;
enum kvstore_types kv_type;
@@ -210,39 +210,49 @@ class KeyValueStore : public ObjectStore,
// 4. Clone or rename
struct BufferTransaction {
typedef pair<coll_t, ghobject_t> uniq_id;
- typedef map<uniq_id, StripObjectMap::StripObjectHeader> StripHeaderMap;
+ typedef map<uniq_id, StripObjectMap::StripObjectHeaderRef> StripHeaderMap;
//Dirty records
StripHeaderMap strip_headers;
+ list<Context*> finishes;
KeyValueStore *store;
- SequencerPosition spos;
KeyValueDB::Transaction t;
int lookup_cached_header(const coll_t &cid, const ghobject_t &oid,
- StripObjectMap::StripObjectHeader **strip_header,
+ StripObjectMap::StripObjectHeaderRef *strip_header,
bool create_if_missing);
- int get_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
+ int get_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
const string &prefix, const set<string> &keys,
map<string, bufferlist> *out);
- void set_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
+ void set_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
const string &prefix, map<string, bufferlist> &bl);
- int remove_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
+ int remove_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
const string &prefix, const set<string> &keys);
- void clear_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
+ void clear_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
const string &prefix);
- int clear_buffer(StripObjectMap::StripObjectHeader &strip_header);
- void clone_buffer(StripObjectMap::StripObjectHeader &old_header,
+ int clear_buffer(StripObjectMap::StripObjectHeaderRef strip_header);
+ void clone_buffer(StripObjectMap::StripObjectHeaderRef old_header,
const coll_t &cid, const ghobject_t &oid);
- void rename_buffer(StripObjectMap::StripObjectHeader &old_header,
+ void rename_buffer(StripObjectMap::StripObjectHeaderRef old_header,
const coll_t &cid, const ghobject_t &oid);
int submit_transaction();
- BufferTransaction(KeyValueStore *store,
- SequencerPosition &spos): store(store), spos(spos) {
+ BufferTransaction(KeyValueStore *store): store(store) {
t = store->backend->get_transaction();
}
+
+ struct InvalidateCacheContext : public Context {
+ KeyValueStore *store;
+ const coll_t cid;
+ const ghobject_t oid;
+ InvalidateCacheContext(KeyValueStore *s, const coll_t &c, const ghobject_t &oid): store(s), cid(c), oid(oid) {}
+ void finish(int r) {
+ if (r == 0)
+ store->backend->invalidate_cache(cid, oid);
+ }
+ };
};
// -- op workqueue --
@@ -257,28 +267,79 @@ class KeyValueStore : public ObjectStore,
class OpSequencer : public Sequencer_impl {
Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
list<Op*> q;
- list<uint64_t> jq;
Cond cond;
+ list<pair<uint64_t, Context*> > flush_commit_waiters;
+ uint64_t op; // used by flush() to know the sequence of op
public:
Sequencer *parent;
Mutex apply_lock; // for apply mutual exclusion
+
+ /// get_max_uncompleted
+ bool _get_max_uncompleted(
+ uint64_t *seq ///< [out] max uncompleted seq
+ ) {
+ assert(qlock.is_locked());
+ assert(seq);
+ *seq = 0;
+ if (q.empty()) {
+ return true;
+ } else {
+ *seq = q.back()->op;
+ return false;
+ }
+ } /// @returns true if the queue is empty
+
+ /// get_min_uncompleted
+ bool _get_min_uncompleted(
+ uint64_t *seq ///< [out] min uncompleted seq
+ ) {
+ assert(qlock.is_locked());
+ assert(seq);
+ *seq = 0;
+ if (q.empty()) {
+ return true;
+ } else {
+ *seq = q.front()->op;
+ return false;
+ }
+  } /// @returns true if the queue is empty
+
+ void _wake_flush_waiters(list<Context*> *to_queue) {
+ uint64_t seq;
+ if (_get_min_uncompleted(&seq))
+ seq = -1;
+
+ for (list<pair<uint64_t, Context*> >::iterator i =
+ flush_commit_waiters.begin();
+ i != flush_commit_waiters.end() && i->first < seq;
+ flush_commit_waiters.erase(i++)) {
+ to_queue->push_back(i->second);
+ }
+ }
void queue(Op *o) {
Mutex::Locker l(qlock);
q.push_back(o);
+ op++;
+ o->op = op;
}
Op *peek_queue() {
assert(apply_lock.is_locked());
return q.front();
}
- Op *dequeue() {
+
+ Op *dequeue(list<Context*> *to_queue) {
+ assert(to_queue);
assert(apply_lock.is_locked());
Mutex::Locker l(qlock);
Op *o = q.front();
q.pop_front();
cond.Signal();
+
+ _wake_flush_waiters(to_queue);
return o;
}
+
void flush() {
Mutex::Locker l(qlock);
@@ -286,21 +347,29 @@ class KeyValueStore : public ObjectStore,
uint64_t seq = 0;
if (!q.empty())
seq = q.back()->op;
- if (!jq.empty() && jq.back() > seq)
- seq = jq.back();
if (seq) {
// everything prior to our watermark to drain through either/both
// queues
- while ((!q.empty() && q.front()->op <= seq) ||
- (!jq.empty() && jq.front() <= seq))
+ while (!q.empty() && q.front()->op <= seq)
cond.Wait(qlock);
}
}
+ bool flush_commit(Context *c) {
+ Mutex::Locker l(qlock);
+ uint64_t seq = 0;
+ if (_get_max_uncompleted(&seq)) {
+ delete c;
+ return true;
+ } else {
+ flush_commit_waiters.push_back(make_pair(seq, c));
+ return false;
+ }
+ }
OpSequencer()
: qlock("KeyValueStore::OpSequencer::qlock", false, false),
- parent(0),
+ op(0), parent(0),
apply_lock("KeyValueStore::OpSequencer::apply_lock", false, false) {}
~OpSequencer() {
assert(q.empty());
@@ -417,7 +486,6 @@ class KeyValueStore : public ObjectStore,
}
unsigned _do_transaction(Transaction& transaction,
BufferTransaction &bt,
- SequencerPosition& spos,
ThreadPool::TPHandle *handle);
int queue_transactions(Sequencer *osr, list<Transaction*>& tls,
@@ -428,10 +496,10 @@ class KeyValueStore : public ObjectStore,
// ------------------
// objects
- int _generic_read(StripObjectMap::StripObjectHeader &header,
+ int _generic_read(StripObjectMap::StripObjectHeaderRef header,
uint64_t offset, size_t len, bufferlist& bl,
bool allow_eio = false, BufferTransaction *bt = 0);
- int _generic_write(StripObjectMap::StripObjectHeader &header,
+ int _generic_write(StripObjectMap::StripObjectHeaderRef header,
uint64_t offset, size_t len, const bufferlist& bl,
BufferTransaction &t, bool replica = false);
@@ -572,26 +640,6 @@ class KeyValueStore : public ObjectStore,
static const string COLLECTION;
static const string COLLECTION_ATTR;
static const uint32_t COLLECTION_VERSION = 1;
-
- class SubmitManager {
- Mutex lock;
- uint64_t op_seq;
- uint64_t op_submitted;
- public:
- SubmitManager() :
- lock("JOS::SubmitManager::lock", false, true, false, g_ceph_context),
- op_seq(0), op_submitted(0)
- {}
- uint64_t op_submit_start();
- void op_submit_finish(uint64_t op);
- void set_op_seq(uint64_t seq) {
- Mutex::Locker l(lock);
- op_submitted = op_seq = seq;
- }
- uint64_t get_op_seq() {
- return op_seq;
- }
- } submit_manager;
};
WRITE_CLASS_ENCODER(StripObjectMap::StripObjectHeader)
diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc
index a460e5c..e017f83 100644
--- a/src/os/LFNIndex.cc
+++ b/src/os/LFNIndex.cc
@@ -61,6 +61,17 @@ void LFNIndex::maybe_inject_failure()
}
}
+// Helper to close fd's when we leave scope. This is useful when used
+// in combination with RetryException, thrown by the above.
+struct FDCloser {
+ int fd;
+ FDCloser(int f) : fd(f) {}
+ ~FDCloser() {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+};
+
+
/* Public methods */
void LFNIndex::set_ref(ceph::shared_ptr<CollectionIndex> ref)
@@ -160,9 +171,9 @@ int LFNIndex::fsync_dir(const vector<string> &path)
int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY);
if (fd < 0)
return -errno;
+ FDCloser f(fd);
maybe_inject_failure();
int r = ::fsync(fd);
- VOID_TEMP_FAILURE_RETRY(::close(fd));
maybe_inject_failure();
if (r < 0)
return -errno;
@@ -753,7 +764,8 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
for ( ; ; ++i) {
candidate = lfn_get_short_name(oid, i);
candidate_path = get_full_path(path, candidate);
- r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(), buf, sizeof(buf));
+ r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(),
+ buf, sizeof(buf));
if (r < 0) {
if (errno != ENODATA && errno != ENOENT)
return -errno;
@@ -784,6 +796,38 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
*exists = 1;
return 0;
}
+ r = chain_getxattr(candidate_path.c_str(), get_alt_lfn_attr().c_str(),
+ buf, sizeof(buf));
+ if (r > 0) {
+ // only consider alt name if nlink > 1
+ struct stat st;
+ int rc = ::stat(candidate_path.c_str(), &st);
+ if (rc < 0)
+ return -errno;
+ if (st.st_nlink <= 1) {
+ // left over from incomplete unlink, remove
+ maybe_inject_failure();
+ dout(20) << __func__ << " found extra alt attr for " << candidate_path
+ << ", long name " << string(buf, r) << dendl;
+ rc = chain_removexattr(candidate_path.c_str(),
+ get_alt_lfn_attr().c_str());
+ maybe_inject_failure();
+ if (rc < 0)
+ return rc;
+ continue;
+ }
+ buf[MIN((int)sizeof(buf) - 1, r)] = '\0';
+ if (!strcmp(buf, full_name.c_str())) {
+ dout(20) << __func__ << " used alt attr for " << full_name << dendl;
+ if (mangled_name)
+ *mangled_name = candidate;
+ if (out_path)
+ *out_path = candidate_path;
+ if (exists)
+ *exists = 1;
+ return 0;
+ }
+ }
}
assert(0); // Unreachable
return 0;
@@ -798,7 +842,24 @@ int LFNIndex::lfn_created(const vector<string> &path,
string full_path = get_full_path(path, mangled_name);
string full_name = lfn_generate_object_name(oid);
maybe_inject_failure();
- return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
+
+ // if the main attr exists and is different, move it to the alt attr.
+ char buf[FILENAME_MAX_LEN + 1];
+ int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(),
+ buf, sizeof(buf));
+ if (r >= 0 && (r != (int)full_name.length() ||
+ memcmp(buf, full_name.c_str(), full_name.length()))) {
+ dout(20) << __func__ << " " << mangled_name
+ << " moving old name to alt attr "
+ << string(buf, r)
+ << ", new name is " << full_name << dendl;
+ r = chain_setxattr(full_path.c_str(), get_alt_lfn_attr().c_str(),
+ buf, r);
+ if (r < 0)
+ return r;
+ }
+
+ return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
full_name.c_str(), full_name.size());
}
@@ -839,26 +900,35 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
}
}
}
+ string full_path = get_full_path(path, mangled_name);
+ int fd = ::open(full_path.c_str(), O_RDONLY);
+ if (fd < 0)
+ return -errno;
+ FDCloser f(fd);
if (i == removed_index + 1) {
- string full_path = get_full_path(path, mangled_name);
maybe_inject_failure();
int r = ::unlink(full_path.c_str());
maybe_inject_failure();
if (r < 0)
return -errno;
- else
- return 0;
} else {
- string rename_to = get_full_path(path, mangled_name);
+ string& rename_to = full_path;
string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1));
maybe_inject_failure();
int r = ::rename(rename_from.c_str(), rename_to.c_str());
maybe_inject_failure();
if (r < 0)
return -errno;
- else
- return 0;
}
+ struct stat st;
+ int r = ::fstat(fd, &st);
+ if (r == 0 && st.st_nlink > 0) {
+ // remove alt attr
+ dout(20) << __func__ << " removing alt attr from " << full_path << dendl;
+ fsync_dir(path);
+ chain_fremovexattr(fd, get_alt_lfn_attr().c_str());
+ }
+ return r;
}
int LFNIndex::lfn_translate(const vector<string> &path,
diff --git a/src/os/LFNIndex.h b/src/os/LFNIndex.h
index c9c7f5e..646e726 100644
--- a/src/os/LFNIndex.h
+++ b/src/os/LFNIndex.h
@@ -123,7 +123,7 @@ protected:
}
private:
- string lfn_attribute;
+ string lfn_attribute, lfn_alt_attribute;
coll_t collection;
public:
@@ -146,7 +146,8 @@ public:
char buf[100];
snprintf(buf, sizeof(buf), "%d", index_version);
lfn_attribute = LFN_ATTR + string(buf);
- }
+ lfn_alt_attribute = LFN_ATTR + string(buf) + "-alt";
+ }
}
coll_t coll() const { return collection; }
@@ -423,6 +424,9 @@ private:
const string &get_lfn_attr() const {
return lfn_attribute;
}
+ const string &get_alt_lfn_attr() const {
+ return lfn_alt_attribute;
+ }
/**
* Gets the filename corresponsing to oid in path.
diff --git a/src/os/MemStore.cc b/src/os/MemStore.cc
index 9e75b76..952866a 100644
--- a/src/os/MemStore.cc
+++ b/src/os/MemStore.cc
@@ -950,7 +950,12 @@ void MemStore::_do_transaction(Transaction& t)
break;
case Transaction::OP_SETALLOCHINT:
- // nop
+ {
+ coll_t cid(i.get_cid());
+ ghobject_t oid = i.get_oid();
+ (void)i.get_length(); // discard result
+ (void)i.get_length(); // discard result
+ }
break;
default:
diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc
index e4e2257..afa90b1 100644
--- a/src/os/ObjectStore.cc
+++ b/src/os/ObjectStore.cc
@@ -144,7 +144,11 @@ int ObjectStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
snapid_t seq, vector<hobject_t> *ls)
{
vector<ghobject_t> go;
- ghobject_t gstart(start), gend(end);
+ // Starts with the smallest shard id and generation to
+ // make sure the result list has the marker object
+ ghobject_t gstart(start, 0, shard_id_t(0));
+ // Exclusive end, choose the smallest end ghobject
+ ghobject_t gend(end, 0, shard_id_t(0));
int ret = collection_list_range(c, gstart, gend, seq, &go);
if (ret == 0) {
ls->reserve(go.size());
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
index 15ed31f..a5f5fcb 100644
--- a/src/os/ObjectStore.h
+++ b/src/os/ObjectStore.h
@@ -128,6 +128,22 @@ public:
*/
struct Sequencer_impl {
virtual void flush() = 0;
+
+ /**
+ * Async flush_commit
+ *
+ * There are two cases:
+ * 1) sequencer is currently idle: the method returns true and
+ * c is deleted
+ * 2) sequencer is not idle: the method returns false and c is
+   *    called asynchronously with a value of 0 once all transactions
+ * queued on this sequencer prior to the call have been applied
+ * and committed.
+ */
+ virtual bool flush_commit(
+ Context *c ///< [in] context to call upon flush/commit
+ ) = 0; ///< @return true if idle, false otherwise
+
virtual ~Sequencer_impl() {}
};
@@ -153,6 +169,16 @@ public:
if (p)
p->flush();
}
+
+ /// @see Sequencer_impl::flush_commit()
+ bool flush_commit(Context *c) {
+ if (!p) {
+ delete c;
+ return true;
+ } else {
+ return p->flush_commit(c);
+ }
+ }
};
/*********************************
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index b69d77a..aefbb5e 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -105,13 +105,13 @@ ostream &operator<<(ostream &lhs, const ECBackend::ReadOp &rhs)
void ECBackend::ReadOp::dump(Formatter *f) const
{
- f->dump_stream("tid") << tid;
+ f->dump_unsigned("tid", tid);
if (op && op->get_req()) {
f->dump_stream("op") << *(op->get_req());
}
f->dump_stream("to_read") << to_read;
f->dump_stream("complete") << complete;
- f->dump_stream("priority") << priority;
+ f->dump_int("priority", priority);
f->dump_stream("obj_to_source") << obj_to_source;
f->dump_stream("source_to_obj") << source_to_obj;
f->dump_stream("in_progress") << in_progress;
@@ -158,7 +158,7 @@ void ECBackend::RecoveryOp::dump(Formatter *f) const
f->dump_stream("missing_on_shards") << missing_on_shards;
f->dump_stream("recovery_info") << recovery_info;
f->dump_stream("recovery_progress") << recovery_progress;
- f->dump_stream("pending_read") << pending_read;
+ f->dump_bool("pending_read", pending_read);
f->dump_stream("state") << tostr(state);
f->dump_stream("waiting_on_pushes") << waiting_on_pushes;
f->dump_stream("extent_requested") << extent_requested;
@@ -829,6 +829,7 @@ void ECBackend::handle_sub_write(
op.log_entries,
op.updated_hit_set_history,
op.trim_to,
+ op.trim_rollback_to,
!(op.t.empty()),
localt);
localt->append(op.t);
@@ -1211,6 +1212,7 @@ void ECBackend::submit_transaction(
const eversion_t &at_version,
PGTransaction *_t,
const eversion_t &trim_to,
+ const eversion_t &trim_rollback_to,
vector<pg_log_entry_t> &log_entries,
boost::optional<pg_hit_set_history_t> &hset_history,
Context *on_local_applied_sync,
@@ -1226,6 +1228,7 @@ void ECBackend::submit_transaction(
op->hoid = hoid;
op->version = at_version;
op->trim_to = trim_to;
+ op->trim_rollback_to = trim_rollback_to;
op->log_entries.swap(log_entries);
std::swap(op->updated_hit_set_history, hset_history);
op->on_local_applied_sync = on_local_applied_sync;
@@ -1532,6 +1535,7 @@ void ECBackend::start_write(Op *op) {
should_send ? iter->second : ObjectStore::Transaction(),
op->version,
op->trim_to,
+ op->trim_rollback_to,
op->log_entries,
op->updated_hit_set_history,
op->temp_added,
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h
index c13f30f..28bcf8a 100644
--- a/src/osd/ECBackend.h
+++ b/src/osd/ECBackend.h
@@ -97,6 +97,7 @@ public:
const eversion_t &at_version,
PGTransaction *t,
const eversion_t &trim_to,
+ const eversion_t &trim_rollback_to,
vector<pg_log_entry_t> &log_entries,
boost::optional<pg_hit_set_history_t> &hset_history,
Context *on_local_applied_sync,
@@ -326,6 +327,7 @@ public:
hobject_t hoid;
eversion_t version;
eversion_t trim_to;
+ eversion_t trim_rollback_to;
vector<pg_log_entry_t> log_entries;
boost::optional<pg_hit_set_history_t> updated_hit_set_history;
Context *on_local_applied_sync;
diff --git a/src/osd/ECMsgTypes.cc b/src/osd/ECMsgTypes.cc
index 4e4c8e3..ba02d83 100644
--- a/src/osd/ECMsgTypes.cc
+++ b/src/osd/ECMsgTypes.cc
@@ -16,7 +16,7 @@
void ECSubWrite::encode(bufferlist &bl) const
{
- ENCODE_START(2, 1, bl);
+ ENCODE_START(3, 1, bl);
::encode(from, bl);
::encode(tid, bl);
::encode(reqid, bl);
@@ -29,12 +29,13 @@ void ECSubWrite::encode(bufferlist &bl) const
::encode(temp_added, bl);
::encode(temp_removed, bl);
::encode(updated_hit_set_history, bl);
+ ::encode(trim_rollback_to, bl);
ENCODE_FINISH(bl);
}
void ECSubWrite::decode(bufferlist::iterator &bl)
{
- DECODE_START(2, bl);
+ DECODE_START(3, bl);
::decode(from, bl);
::decode(tid, bl);
::decode(reqid, bl);
@@ -49,6 +50,11 @@ void ECSubWrite::decode(bufferlist::iterator &bl)
if (struct_v >= 2) {
::decode(updated_hit_set_history, bl);
}
+ if (struct_v >= 3) {
+ ::decode(trim_rollback_to, bl);
+ } else {
+ trim_rollback_to = trim_to;
+ }
DECODE_FINISH(bl);
}
@@ -58,7 +64,8 @@ std::ostream &operator<<(
lhs << "ECSubWrite(tid=" << rhs.tid
<< ", reqid=" << rhs.reqid
<< ", at_version=" << rhs.at_version
- << ", trim_to=" << rhs.trim_to;
+ << ", trim_to=" << rhs.trim_to
+ << ", trim_rollback_to=" << rhs.trim_rollback_to;
if (rhs.updated_hit_set_history)
lhs << ", has_updated_hit_set_history";
return lhs << ")";
@@ -66,10 +73,11 @@ std::ostream &operator<<(
void ECSubWrite::dump(Formatter *f) const
{
- f->dump_stream("tid") << tid;
+ f->dump_unsigned("tid", tid);
f->dump_stream("reqid") << reqid;
f->dump_stream("at_version") << at_version;
f->dump_stream("trim_to") << trim_to;
+ f->dump_stream("trim_rollback_to") << trim_rollback_to;
f->dump_stream("has_updated_hit_set_history")
<< static_cast<bool>(updated_hit_set_history);
}
@@ -85,6 +93,12 @@ void ECSubWrite::generate_test_instances(list<ECSubWrite*> &o)
o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
o.back()->at_version = eversion_t(10, 300);
o.back()->trim_to = eversion_t(5, 42);
+ o.push_back(new ECSubWrite());
+ o.back()->tid = 9;
+ o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
+ o.back()->at_version = eversion_t(10, 300);
+ o.back()->trim_to = eversion_t(5, 42);
+ o.back()->trim_rollback_to = eversion_t(8, 250);
}
void ECSubWriteReply::encode(bufferlist &bl) const
@@ -121,7 +135,7 @@ std::ostream &operator<<(
void ECSubWriteReply::dump(Formatter *f) const
{
- f->dump_stream("tid") << tid;
+ f->dump_unsigned("tid", tid);
f->dump_stream("last_complete") << last_complete;
f->dump_stream("committed") << committed;
f->dump_stream("applied") << applied;
@@ -171,7 +185,7 @@ std::ostream &operator<<(
void ECSubRead::dump(Formatter *f) const
{
f->dump_stream("from") << from;
- f->dump_stream("tid") << tid;
+ f->dump_unsigned("tid", tid);
f->open_array_section("objects");
for (map<hobject_t, list<pair<uint64_t, uint64_t> > >::const_iterator i =
to_read.begin();
@@ -259,7 +273,7 @@ std::ostream &operator<<(
void ECSubReadReply::dump(Formatter *f) const
{
f->dump_stream("from") << from;
- f->dump_stream("tid") << tid;
+ f->dump_unsigned("tid", tid);
f->open_array_section("buffers_read");
for (map<hobject_t, list<pair<uint64_t, bufferlist> > >::const_iterator i =
buffers_read.begin();
diff --git a/src/osd/ECMsgTypes.h b/src/osd/ECMsgTypes.h
index 11c519d..1cdfa57 100644
--- a/src/osd/ECMsgTypes.h
+++ b/src/osd/ECMsgTypes.h
@@ -28,6 +28,7 @@ struct ECSubWrite {
ObjectStore::Transaction t;
eversion_t at_version;
eversion_t trim_to;
+ eversion_t trim_rollback_to;
vector<pg_log_entry_t> log_entries;
set<hobject_t> temp_added;
set<hobject_t> temp_removed;
@@ -42,6 +43,7 @@ struct ECSubWrite {
const ObjectStore::Transaction &t,
eversion_t at_version,
eversion_t trim_to,
+ eversion_t trim_rollback_to,
vector<pg_log_entry_t> log_entries,
boost::optional<pg_hit_set_history_t> updated_hit_set_history,
const set<hobject_t> &temp_added,
@@ -49,7 +51,8 @@ struct ECSubWrite {
: from(from), tid(tid), reqid(reqid),
soid(soid), stats(stats), t(t),
at_version(at_version),
- trim_to(trim_to), log_entries(log_entries),
+ trim_to(trim_to), trim_rollback_to(trim_rollback_to),
+ log_entries(log_entries),
temp_added(temp_added),
temp_removed(temp_removed),
updated_hit_set_history(updated_hit_set_history) {}
diff --git a/src/osd/HitSet.h b/src/osd/HitSet.h
index 391dd63..476678e 100644
--- a/src/osd/HitSet.h
+++ b/src/osd/HitSet.h
@@ -369,7 +369,7 @@ public:
return (double)fpp_micro / 1000000.0;
}
void set_fpp(double f) {
- fpp_micro = (unsigned)(f * 1000000.0);
+ fpp_micro = (unsigned)(llrintl(f * (double)1000000.0));
}
void encode(bufferlist& bl) const {
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 5c8f0d6..dc67fdd 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -42,6 +42,7 @@
#include "common/ceph_argparse.h"
#include "common/version.h"
+#include "common/io_priority.h"
#include "os/ObjectStore.h"
@@ -191,6 +192,7 @@ OSDService::OSDService(OSD *osd) :
push_wq("push_wq", cct->_conf->osd_recovery_thread_timeout, &osd->recovery_tp),
gen_wq("gen_wq", cct->_conf->osd_recovery_thread_timeout, &osd->recovery_tp),
class_handler(osd->class_handler),
+ pg_epoch_lock("OSDService::pg_epoch_lock"),
publish_lock("OSDService::publish_lock"),
pre_publish_lock("OSDService::pre_publish_lock"),
sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
@@ -1277,6 +1279,8 @@ int OSD::init()
disk_tp.start();
command_tp.start();
+ set_disk_tp_priority();
+
// start the heartbeat
heartbeat_thread.create();
@@ -1305,6 +1309,8 @@ int OSD::init()
if (is_stopping())
return 0;
+ check_config();
+
dout(10) << "ensuring pgs have consumed prior maps" << dendl;
consume_map();
peering_wq.drain();
@@ -1663,8 +1669,10 @@ int OSD::shutdown()
dout(10) << "recovery tp stopped" << dendl;
op_tp.drain();
+ peering_wq.clear();
+ scrub_finalize_wq.clear();
op_tp.stop();
- dout(10) << "op tp stopped" << dendl;
+ dout(10) << "osd tp stopped" << dendl;
command_tp.drain();
command_tp.stop();
@@ -1708,7 +1716,6 @@ int OSD::shutdown()
assert(pg_stat_queue.empty());
}
- peering_wq.clear();
// Remove PGs
#ifdef PG_DEBUG_REFS
service.dump_live_pgids();
@@ -1854,6 +1861,8 @@ PG *OSD::_open_lock_pg(
pg_map[pgid] = pg;
+ service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
+
pg->lock(no_lockdep_check);
pg->get("PGMap"); // because it's in pg_map
return pg;
@@ -1885,6 +1894,7 @@ void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
epoch_t e(service.get_osdmap()->get_epoch());
pg->get("PGMap"); // For pg_map
pg_map[pg->info.pgid] = pg;
+ service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
dout(10) << "Adding newly split pg " << *pg << dendl;
vector<int> up, acting;
pg->get_osdmap()->pg_to_up_acting_osds(pg->info.pgid.pgid, up, acting);
@@ -4392,9 +4402,8 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
// However, to avoid the osd from getting hung on this and having
// timers being triggered, we are going to limit the count assuming
// a configurable throughput and duration.
- int64_t total_throughput =
+ int64_t max_count =
g_conf->osd_bench_large_size_max_throughput * duration;
- int64_t max_count = (int64_t) (total_throughput / bsize);
if (count > max_count) {
ss << "'count' values greater than " << max_count
<< " for a block size of " << prettybyte_t(bsize) << ", assuming "
@@ -5713,11 +5722,12 @@ void OSD::check_osdmap_features(ObjectStore *fs)
}
}
{
- Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_MON);
+ Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
uint64_t mask;
uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
if ((p.features_required & mask) != features) {
dout(0) << "crush map has features " << features
+ << " was " << p.features_required
<< ", adjusting msgr requires for mons" << dendl;
p.features_required = (p.features_required & ~mask) | features;
client_messenger->set_policy(entity_name_t::TYPE_MON, p);
@@ -5748,7 +5758,7 @@ void OSD::check_osdmap_features(ObjectStore *fs)
}
}
-void OSD::advance_pg(
+bool OSD::advance_pg(
epoch_t osd_epoch, PG *pg,
ThreadPool::TPHandle &handle,
PG::RecoveryCtx *rctx,
@@ -5759,11 +5769,19 @@ void OSD::advance_pg(
OSDMapRef lastmap = pg->get_osdmap();
if (lastmap->get_epoch() == osd_epoch)
- return;
+ return true;
assert(lastmap->get_epoch() < osd_epoch);
+ epoch_t min_epoch = service.get_min_pg_epoch();
+ epoch_t max;
+ if (min_epoch) {
+ max = min_epoch + g_conf->osd_map_max_advance;
+ } else {
+ max = next_epoch + g_conf->osd_map_max_advance;
+ }
+
for (;
- next_epoch <= osd_epoch;
+ next_epoch <= osd_epoch && next_epoch <= max;
++next_epoch) {
OSDMapRef nextmap = service.try_get_map(next_epoch);
if (!nextmap)
@@ -5795,7 +5813,15 @@ void OSD::advance_pg(
lastmap = nextmap;
handle.reset_tp_timeout();
}
+ service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
pg->handle_activate_map(rctx);
+ if (next_epoch <= osd_epoch) {
+ dout(10) << __func__ << " advanced by max " << g_conf->osd_map_max_advance
+ << " past min epoch " << min_epoch
+ << " ... will requeue " << *pg << dendl;
+ return false;
+ }
+ return true;
}
/**
@@ -6127,7 +6153,7 @@ bool OSD::require_mon_peer(Message *m)
return true;
}
-bool OSD::require_osd_peer(OpRequestRef op)
+bool OSD::require_osd_peer(OpRequestRef& op)
{
if (!op->get_req()->get_connection()->peer_is_osd()) {
dout(0) << "require_osd_peer received from non-osd " << op->get_req()->get_connection()->get_peer_addr()
@@ -6137,11 +6163,64 @@ bool OSD::require_osd_peer(OpRequestRef op)
return true;
}
+bool OSD::require_self_aliveness(OpRequestRef& op, epoch_t epoch)
+{
+ if (epoch < up_epoch) {
+ dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
+ return false;
+ }
+
+ if (!is_active()) {
+ dout(7) << "still in boot state, dropping message " << *op->get_req() << dendl;
+ return false;
+ }
+
+ return true;
+}
+
+bool OSD::require_same_peer_instance(OpRequestRef& op, OSDMapRef& map)
+{
+ Message *m = op->get_req();
+ int from = m->get_source().num();
+
+ if (!map->have_inst(from) ||
+ (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
+ dout(5) << "from dead osd." << from << ", marking down, "
+ << " msg was " << m->get_source_inst().addr
+ << " expected " << (map->have_inst(from) ?
+ map->get_cluster_addr(from) : entity_addr_t())
+ << dendl;
+ ConnectionRef con = m->get_connection();
+ cluster_messenger->mark_down(con.get());
+ Session *s = static_cast<Session*>(con->get_priv());
+ if (s) {
+ con->set_priv(NULL); // break ref <-> session cycle, if any
+ s->put();
+ }
+ return false;
+ }
+ return true;
+}
+
+bool OSD::require_up_osd_peer(OpRequestRef& op, OSDMapRef& map,
+ epoch_t their_epoch)
+{
+ if (!require_self_aliveness(op, their_epoch)) {
+ return false;
+ } else if (!require_osd_peer(op)) {
+ return false;
+ } else if (map->get_epoch() >= their_epoch &&
+ !require_same_peer_instance(op, map)) {
+ return false;
+ }
+ return true;
+}
+
/*
* require that we have same (or newer) map, and that
* the source is the pg primary.
*/
-bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch)
+bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch)
{
Message *m = op->get_req();
dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
@@ -6155,30 +6234,13 @@ bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch)
return false;
}
- if (epoch < up_epoch) {
- dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
+ if (!require_self_aliveness(op, epoch)) {
return false;
}
// ok, our map is same or newer.. do they still exist?
- if (m->get_connection()->get_messenger() == cluster_messenger) {
- int from = m->get_source().num();
- if (!osdmap->have_inst(from) ||
- osdmap->get_cluster_addr(from) != m->get_source_inst().addr) {
- dout(5) << "from dead osd." << from << ", marking down, "
- << " msg was " << m->get_source_inst().addr
- << " expected " << (osdmap->have_inst(from) ? osdmap->get_cluster_addr(from) : entity_addr_t())
- << dendl;
- ConnectionRef con = m->get_connection();
- con->set_priv(NULL); // break ref <-> session cycle, if any
- cluster_messenger->mark_down(con.get());
- return false;
- }
- }
-
- // ok, we have at least as new a map as they do. are we (re)booting?
- if (!is_active()) {
- dout(7) << "still in boot state, dropping message " << *m << dendl;
+ if (m->get_connection()->get_messenger() == cluster_messenger &&
+ !require_same_peer_instance(op, osdmap)) {
return false;
}
@@ -7142,6 +7204,8 @@ void OSD::_remove_pg(PG *pg)
);
remove_wq.queue(make_pair(PGRef(pg), deleting));
+ service.pg_remove_epoch(pg->info.pgid);
+
// remove from map
pg_map.erase(pg->info.pgid);
pg->put("PGMap"); // since we've taken it out of map
@@ -7555,7 +7619,7 @@ void OSD::handle_replica_op(OpRequestRef op)
return;
}
- if (!require_osd_peer(op))
+ if (!require_up_osd_peer(op, osdmap, m->map_epoch))
return;
// must be a rep op.
@@ -7770,8 +7834,9 @@ void OSD::process_peering_events(
pg->unlock();
continue;
}
- advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs);
- if (!pg->peering_queue.empty()) {
+ if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
+ pg->queue_null(curmap->get_epoch(), curmap->get_epoch());
+ } else if (!pg->peering_queue.empty()) {
PG::CephPeeringEvtRef evt = pg->peering_queue.front();
pg->peering_queue.pop_front();
pg->handle_peering_event(evt, &rctx);
@@ -7808,6 +7873,11 @@ const char** OSD::get_tracked_conf_keys() const
"osd_max_backfills",
"osd_op_complaint_time", "osd_op_log_threshold",
"osd_op_history_size", "osd_op_history_duration",
+ "osd_map_cache_size",
+ "osd_map_max_advance",
+ "osd_pg_epoch_persisted_max_stale",
+ "osd_disk_thread_ioprio_class",
+ "osd_disk_thread_ioprio_priority",
NULL
};
return KEYS;
@@ -7830,6 +7900,38 @@ void OSD::handle_conf_change(const struct md_config_t *conf,
op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
cct->_conf->osd_op_history_duration);
}
+ if (changed.count("osd_disk_thread_ioprio_class") ||
+ changed.count("osd_disk_thread_ioprio_priority")) {
+ set_disk_tp_priority();
+ }
+
+ check_config();
+}
+
+void OSD::check_config()
+{
+ // some sanity checks
+ if (g_conf->osd_map_cache_size <= g_conf->osd_map_max_advance + 2) {
+ clog.warn() << "osd_map_cache_size (" << g_conf->osd_map_cache_size << ")"
+ << " is not > osd_map_max_advance ("
+ << g_conf->osd_map_max_advance << ")";
+ }
+ if (g_conf->osd_map_cache_size <= (int)g_conf->osd_pg_epoch_persisted_max_stale + 2) {
+ clog.warn() << "osd_map_cache_size (" << g_conf->osd_map_cache_size << ")"
+ << " is not > osd_pg_epoch_persisted_max_stale ("
+ << g_conf->osd_pg_epoch_persisted_max_stale << ")";
+ }
+}
+
+void OSD::set_disk_tp_priority()
+{
+ dout(10) << __func__
+ << " class " << cct->_conf->osd_disk_thread_ioprio_class
+ << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
+ << dendl;
+ int cls =
+ ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
+ disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
}
// --------------------------------
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index ae8d74e..e2a3c8e 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -334,6 +334,42 @@ public:
void dequeue_pg(PG *pg, list<OpRequestRef> *dequeued);
+ // -- map epoch lower bound --
+ Mutex pg_epoch_lock;
+ multiset<epoch_t> pg_epochs;
+ map<spg_t,epoch_t> pg_epoch;
+
+ void pg_add_epoch(spg_t pgid, epoch_t epoch) {
+ Mutex::Locker l(pg_epoch_lock);
+ map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
+ assert(t == pg_epoch.end());
+ pg_epoch[pgid] = epoch;
+ pg_epochs.insert(epoch);
+ }
+ void pg_update_epoch(spg_t pgid, epoch_t epoch) {
+ Mutex::Locker l(pg_epoch_lock);
+ map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
+ assert(t != pg_epoch.end());
+ pg_epochs.erase(pg_epochs.find(t->second));
+ t->second = epoch;
+ pg_epochs.insert(epoch);
+ }
+ void pg_remove_epoch(spg_t pgid) {
+ Mutex::Locker l(pg_epoch_lock);
+ map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
+ if (t != pg_epoch.end()) {
+ pg_epochs.erase(pg_epochs.find(t->second));
+ pg_epoch.erase(t);
+ }
+ }
+ epoch_t get_min_pg_epoch() {
+ Mutex::Locker l(pg_epoch_lock);
+ if (pg_epochs.empty())
+ return 0;
+ else
+ return *pg_epochs.begin();
+ }
+
// -- superblock --
Mutex publish_lock, pre_publish_lock; // pre-publish orders before publish
OSDSuperblock superblock;
@@ -784,6 +820,7 @@ public:
virtual const char** get_tracked_conf_keys() const;
virtual void handle_conf_change(const struct md_config_t *conf,
const std::set <std::string> &changed);
+ void check_config();
protected:
Mutex osd_lock; // global lock
@@ -944,6 +981,8 @@ private:
bool paused_recovery;
+ void set_disk_tp_priority();
+
// -- sessions --
public:
struct Session : public RefCountedObject {
@@ -1255,7 +1294,7 @@ private:
void note_down_osd(int osd);
void note_up_osd(int osd);
- void advance_pg(
+ bool advance_pg(
epoch_t advance_to, PG *pg,
ThreadPool::TPHandle &handle,
PG::RecoveryCtx *rctx,
@@ -1513,9 +1552,22 @@ protected:
void repeer(PG *pg, map< int, map<spg_t,pg_query_t> >& query_map);
bool require_mon_peer(Message *m);
- bool require_osd_peer(OpRequestRef op);
+ bool require_osd_peer(OpRequestRef& op);
+ /***
+ * Verifies that we were alive in the given epoch, and that
+ * still are.
+ */
+ bool require_self_aliveness(OpRequestRef& op, epoch_t alive_since);
+ /**
+ * Verifies that the OSD who sent the given op has the same
+ * address as in the given map.
+ * @pre op was sent by an OSD using the cluster messenger
+ */
+ bool require_same_peer_instance(OpRequestRef& op, OSDMapRef& map);
+ bool require_up_osd_peer(OpRequestRef& Op, OSDMapRef& map,
+ epoch_t their_epoch);
- bool require_same_or_newer_map(OpRequestRef op, epoch_t e);
+ bool require_same_or_newer_map(OpRequestRef& op, epoch_t e);
void handle_pg_query(OpRequestRef op);
void handle_pg_notify(OpRequestRef op);
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 8c66c76..645a6f7 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -959,10 +959,7 @@ uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
features |= CEPH_FEATURE_CRUSH_TUNABLES;
if (crush->has_nondefault_tunables2())
features |= CEPH_FEATURE_CRUSH_TUNABLES2;
- if (crush->has_v2_rules())
- features |= CEPH_FEATURE_CRUSH_V2;
- if (crush->has_nondefault_tunables3() ||
- crush->has_v3_rules())
+ if (crush->has_nondefault_tunables3())
features |= CEPH_FEATURE_CRUSH_TUNABLES3;
mask |= CEPH_FEATURES_CRUSH;
@@ -978,6 +975,15 @@ uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
p->second.is_tier()) {
features |= CEPH_FEATURE_OSD_CACHEPOOL;
}
+ int ruleid = crush->find_rule(p->second.get_crush_ruleset(),
+ p->second.get_type(),
+ p->second.get_size());
+ if (ruleid >= 0) {
+ if (crush->is_v2_rule(ruleid))
+ features |= CEPH_FEATURE_CRUSH_V2;
+ if (crush->is_v3_rule(ruleid))
+ features |= CEPH_FEATURE_CRUSH_TUNABLES3;
+ }
}
mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
if (entity_type != CEPH_ENTITY_TYPE_CLIENT)
@@ -1801,7 +1807,15 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const
ENCODE_START(1, 1, bl); // extended, osd-only data
::encode(osd_addrs->hb_back_addr, bl);
::encode(osd_info, bl);
- ::encode(blacklist, bl);
+ {
+ // put this in a sorted, ordered map<> so that we encode in a
+ // deterministic order.
+ map<entity_addr_t,utime_t> blacklist_map;
+ for (ceph::unordered_map<entity_addr_t,utime_t>::const_iterator p =
+ blacklist.begin(); p != blacklist.end(); ++p)
+ blacklist_map.insert(make_pair(p->first, p->second));
+ ::encode(blacklist_map, bl);
+ }
::encode(osd_addrs->cluster_addr, bl);
::encode(cluster_snapshot_epoch, bl);
::encode(cluster_snapshot, bl);
@@ -2159,6 +2173,7 @@ void OSDMap::generate_test_instances(list<OSDMap*>& o)
uuid_d fsid;
o.back()->build_simple(cct, 1, fsid, 16, 7, 8);
o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
+ o.back()->blacklist[entity_addr_t()] = utime_t(5, 6);
cct->put();
}
@@ -2551,13 +2566,25 @@ int OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
set_weight(i, CEPH_OSD_OUT);
}
- map<string,string> erasure_code_profile_map;
- r = get_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
- ss,
- &erasure_code_profile_map);
- erasure_code_profile_map["directory"] =
+ map<string,string> profile_map;
+ r = get_erasure_code_profile_default(cct, profile_map, &ss);
+ if (r < 0) {
+ lderr(cct) << ss.str() << dendl;
+ return r;
+ }
+ set_erasure_code_profile("default", profile_map);
+ return 0;
+}
+
+int OSDMap::get_erasure_code_profile_default(CephContext *cct,
+ map<string,string> &profile_map,
+ ostream *ss)
+{
+ int r = get_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
+ *ss,
+ &profile_map);
+ profile_map["directory"] =
cct->_conf->osd_pool_default_erasure_code_directory;
- set_erasure_code_profile("default", erasure_code_profile_map);
return r;
}
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index 08064f8..a347583 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -380,6 +380,9 @@ public:
erasure_code_profiles.find(name);
return i != erasure_code_profiles.end();
}
+ int get_erasure_code_profile_default(CephContext *cct,
+ map<string,string> &profile_map,
+ ostream *ss);
void set_erasure_code_profile(const string &name,
const map<string,string> &profile) {
erasure_code_profiles[name] = profile;
diff --git a/src/osd/OpRequest.cc b/src/osd/OpRequest.cc
index 33e7fbd..bfa819d 100644
--- a/src/osd/OpRequest.cc
+++ b/src/osd/OpRequest.cc
@@ -33,7 +33,7 @@ void OpRequest::_dump(utime_t now, Formatter *f) const
stringstream client_name;
client_name << m->get_orig_source();
f->dump_string("client", client_name.str());
- f->dump_int("tid", m->get_tid());
+ f->dump_unsigned("tid", m->get_tid());
f->close_section(); // client_info
}
{
diff --git a/src/osd/OpRequest.h b/src/osd/OpRequest.h
index 569b6fc..b074bee 100644
--- a/src/osd/OpRequest.h
+++ b/src/osd/OpRequest.h
@@ -74,6 +74,10 @@ struct OpRequest : public TrackedOp {
void _dump(utime_t now, Formatter *f) const;
+ bool has_feature(uint64_t f) const {
+ return request->get_connection()->has_feature(f);
+ }
+
private:
osd_reqid_t reqid;
uint8_t hit_flag_points;
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 42099fb..11a34a2 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -868,6 +868,10 @@ map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
i != infos.end();
++i) {
+ if (max_last_epoch_started_found < i->second.history.last_epoch_started) {
+ min_last_update_acceptable = eversion_t::max();
+ max_last_epoch_started_found = i->second.history.last_epoch_started;
+ }
if (max_last_epoch_started_found < i->second.last_epoch_started) {
min_last_update_acceptable = eversion_t::max();
max_last_epoch_started_found = i->second.last_epoch_started;
@@ -877,7 +881,8 @@ map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
min_last_update_acceptable = i->second.last_update;
}
}
- assert(min_last_update_acceptable != eversion_t::max());
+ if (min_last_update_acceptable == eversion_t::max())
+ return infos.end();
map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
// find osd with newest last_update (oldest for ec_pool).
@@ -1269,11 +1274,19 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id)
ss);
dout(10) << ss.str() << dendl;
- // This might cause a problem if min_size is large
- // and we need to backfill more than 1 osd. Older
- // code would only include 1 backfill osd and now we
- // have the resize above.
- if (want_acting_backfill.size() < pool.info.min_size) {
+ unsigned num_want_acting = 0;
+ for (vector<int>::iterator i = want.begin();
+ i != want.end();
+ ++i) {
+ if (*i != CRUSH_ITEM_NONE)
+ ++num_want_acting;
+ }
+ assert(want_acting_backfill.size() - want_backfill.size() == num_want_acting);
+
+ // This is a bit of a problem, if we allow the pg to go active with
+ // want.size() < min_size, we won't consider the pg to have been
+ // maybe_went_rw in build_prior.
+ if (num_want_acting < pool.info.min_size) {
want_acting.clear();
return false;
}
@@ -1443,7 +1456,7 @@ void PG::activate(ObjectStore::Transaction& t,
min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
}
last_update_applied = info.last_update;
-
+ last_rollback_info_trimmed_to_applied = pg_log.get_rollback_trimmed_to();
need_up_thru = false;
@@ -1474,12 +1487,6 @@ void PG::activate(ObjectStore::Transaction& t,
} else {
dout(10) << "activate - not complete, " << missing << dendl;
pg_log.activate_not_complete(info);
- if (is_primary()) {
- dout(10) << "activate - starting recovery" << dendl;
- osd->queue_for_recovery(this);
- if (have_unfound())
- discover_all_missing(query_map);
- }
}
log_weirdness();
@@ -1642,6 +1649,11 @@ void PG::activate(ObjectStore::Transaction& t,
}
build_might_have_unfound();
+
+ dout(10) << "activate - starting recovery" << dendl;
+ osd->queue_for_recovery(this);
+ if (have_unfound())
+ discover_all_missing(query_map);
}
// degraded?
@@ -2347,6 +2359,7 @@ void PG::init(
dout(10) << __func__ << ": Setting backfill" << dendl;
info.last_backfill = hobject_t();
info.last_complete = info.last_update;
+ pg_log.mark_log_for_rewrite();
}
reg_next_scrub();
@@ -2641,7 +2654,10 @@ void PG::add_log_entry(pg_log_entry_t& e, bufferlist& log_bl)
void PG::append_log(
- vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t,
+ vector<pg_log_entry_t>& logv,
+ eversion_t trim_to,
+ eversion_t trim_rollback_to,
+ ObjectStore::Transaction &t,
bool transaction_applied)
{
if (transaction_applied)
@@ -2655,13 +2671,33 @@ void PG::append_log(
p->offset = 0;
add_log_entry(*p, keys[p->get_key_name()]);
}
- if (!transaction_applied)
- pg_log.clear_can_rollback_to();
+
+ PGLogEntryHandler handler;
+ if (!transaction_applied) {
+ pg_log.clear_can_rollback_to(&handler);
+ t.register_on_applied(
+ new C_UpdateLastRollbackInfoTrimmedToApplied(
+ this,
+ get_osdmap()->get_epoch(),
+ info.last_update));
+ } else if (trim_rollback_to > pg_log.get_rollback_trimmed_to()) {
+ pg_log.trim_rollback_info(
+ trim_rollback_to,
+ &handler);
+ t.register_on_applied(
+ new C_UpdateLastRollbackInfoTrimmedToApplied(
+ this,
+ get_osdmap()->get_epoch(),
+ trim_rollback_to));
+ }
dout(10) << "append_log adding " << keys.size() << " keys" << dendl;
t.omap_setkeys(coll_t::META_COLL, log_oid, keys);
- PGLogEntryHandler handler;
+
pg_log.trim(&handler, trim_to, info);
+
+ dout(10) << __func__ << ": trimming to " << trim_rollback_to
+ << " entries " << handler.to_trim << dendl;
handler.apply(this, &t);
// update the local pg, pg log
@@ -3004,7 +3040,8 @@ bool PG::sched_scrub()
void PG::reg_next_scrub()
{
- if (scrubber.must_scrub) {
+ if (scrubber.must_scrub ||
+ (info.stats.stats_invalid && g_conf->osd_scrub_invalid_stats)) {
scrubber.scrub_reg_stamp = utime_t();
} else {
scrubber.scrub_reg_stamp = info.history.last_scrub_stamp;
@@ -3262,6 +3299,34 @@ void PG::scrub_unreserve_replicas()
}
}
+void PG::_scan_rollback_obs(
+ const vector<ghobject_t> &rollback_obs,
+ ThreadPool::TPHandle &handle)
+{
+ ObjectStore::Transaction *t = NULL;
+ eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
+ for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
+ i != rollback_obs.end();
+ ++i) {
+ if (i->generation < trimmed_to.version) {
+ osd->clog.error() << "osd." << osd->whoami
+ << " pg " << info.pgid
+ << " found obsolete rollback obj "
+ << *i << " generation < trimmed_to "
+ << trimmed_to
+ << "...repaired";
+ if (!t)
+ t = new ObjectStore::Transaction;
+ t->remove(coll, *i);
+ }
+ }
+ if (t) {
+ derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
+ << dendl;
+ osd->store->queue_transaction_and_cleanup(osr.get(), t);
+ }
+}
+
void PG::_scan_snaps(ScrubMap &smap)
{
for (map<hobject_t, ScrubMap::object>::iterator i = smap.objects.begin();
@@ -3349,13 +3414,21 @@ int PG::build_scrub_map_chunk(
// objects
vector<hobject_t> ls;
- int ret = get_pgbackend()->objects_list_range(start, end, 0, &ls);
+ vector<ghobject_t> rollback_obs;
+ int ret = get_pgbackend()->objects_list_range(
+ start,
+ end,
+ 0,
+ &ls,
+ &rollback_obs);
if (ret < 0) {
dout(5) << "objects_list_range error: " << ret << dendl;
return ret;
}
+
get_pgbackend()->be_scan_list(map, ls, deep, handle);
+ _scan_rollback_obs(rollback_obs, handle);
_scan_snaps(map);
// pg attrs
@@ -3578,6 +3651,17 @@ void PG::replica_scrub(
void PG::scrub(ThreadPool::TPHandle &handle)
{
lock();
+ if (g_conf->osd_scrub_sleep > 0 &&
+ (scrubber.state == PG::Scrubber::NEW_CHUNK ||
+ scrubber.state == PG::Scrubber::INACTIVE)) {
+ dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
+ unlock();
+ utime_t t;
+ t.set_from_double(g_conf->osd_scrub_sleep);
+ t.sleep();
+ lock();
+ dout(20) << __func__ << " slept for " << t << dendl;
+ }
if (deleting) {
unlock();
return;
@@ -4631,6 +4715,21 @@ void PG::start_flush(ObjectStore::Transaction *t,
on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
}
+void PG::reset_interval_flush()
+{
+ dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
+ recovery_state.clear_blocked_outgoing();
+
+ if (!osr->flush_commit(
+ new QueuePeeringEvt<IntervalFlush>(
+ this, get_osdmap()->get_epoch(), IntervalFlush()))) {
+ dout(10) << "Beginning to block outgoing recovery messages" << dendl;
+ recovery_state.begin_block_outgoing();
+ } else {
+ dout(10) << "Not blocking outgoing recovery messages" << dendl;
+ }
+}
+
/* Called before initializing peering during advance_map */
void PG::start_peering_interval(
const OSDMapRef lastmap,
@@ -4641,6 +4740,7 @@ void PG::start_peering_interval(
const OSDMapRef osdmap = get_osdmap();
set_last_peering_reset();
+ reset_interval_flush();
vector<int> oldacting, oldup;
int oldrole = get_role();
@@ -5050,7 +5150,7 @@ bool PG::can_discard_request(OpRequestRef op)
case MSG_OSD_PG_PUSH_REPLY:
return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
case MSG_OSD_SUBOPREPLY:
- return false;
+ return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
case MSG_OSD_EC_WRITE:
return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
@@ -5386,6 +5486,15 @@ PG::RecoveryState::Started::Started(my_context ctx)
}
boost::statechart::result
+PG::RecoveryState::Started::react(const IntervalFlush&)
+{
+ dout(10) << "Ending blocked outgoing recovery messages" << dendl;
+ context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
+ return discard_event();
+}
+
+
+boost::statechart::result
PG::RecoveryState::Started::react(const FlushedEvt&)
{
PG *pg = context< RecoveryMachine >().pg;
@@ -5436,6 +5545,7 @@ PG::RecoveryState::Reset::Reset(my_context ctx)
{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
+
pg->flushes_in_progress = 0;
pg->set_last_peering_reset();
}
@@ -5448,6 +5558,14 @@ PG::RecoveryState::Reset::react(const FlushedEvt&)
return discard_event();
}
+boost::statechart::result
+PG::RecoveryState::Reset::react(const IntervalFlush&)
+{
+ dout(10) << "Ending blocked outgoing recovery messages" << dendl;
+ context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
+ return discard_event();
+}
+
boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
{
PG *pg = context< RecoveryMachine >().pg;
@@ -5715,7 +5833,7 @@ void PG::RecoveryState::Backfilling::exit()
PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
: my_base(ctx),
NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitRemoteBackfillReserved"),
- backfill_osd_it(context< Active >().sorted_backfill_set.begin())
+ backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -5728,7 +5846,7 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserve
{
PG *pg = context< RecoveryMachine >().pg;
- if (backfill_osd_it != context< Active >().sorted_backfill_set.end()) {
+ if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
//The primary never backfills itself
assert(*backfill_osd_it != pg->pg_whoami);
ConnectionRef con = pg->osd->get_con_osd_cluster(
@@ -5770,8 +5888,8 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationReje
// Send REJECT to all previously acquired reservations
set<pg_shard_t>::const_iterator it, begin, end, next;
- begin = context< Active >().sorted_backfill_set.begin();
- end = context< Active >().sorted_backfill_set.end();
+ begin = context< Active >().remote_shards_to_reserve_backfill.begin();
+ end = context< Active >().remote_shards_to_reserve_backfill.end();
assert(begin != end);
for (next = it = begin, ++next ; next != backfill_osd_it; ++it, ++next) {
//The primary never backfills itself
@@ -5830,6 +5948,18 @@ PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
context< RecoveryMachine >().log_enter(state_name);
}
+boost::statechart::result
+PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
+{
+ return discard_event();
+}
+
+boost::statechart::result
+PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
+{
+ return discard_event();
+}
+
void PG::RecoveryState::NotBackfilling::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
@@ -6021,7 +6151,7 @@ void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
: my_base(ctx),
NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
- acting_osd_it(context< Active >().sorted_actingbackfill_set.begin())
+ remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
{
context< RecoveryMachine >().log_enter(state_name);
post_event(RemoteRecoveryReserved());
@@ -6031,28 +6161,26 @@ boost::statechart::result
PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
PG *pg = context< RecoveryMachine >().pg;
- if (acting_osd_it != context< Active >().sorted_actingbackfill_set.end()) {
- // skip myself
- if (*acting_osd_it == pg->pg_whoami)
- ++acting_osd_it;
+ if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
+ assert(*remote_recovery_reservation_it != pg->pg_whoami);
}
- if (acting_osd_it != context< Active >().sorted_actingbackfill_set.end()) {
+ if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
ConnectionRef con = pg->osd->get_con_osd_cluster(
- acting_osd_it->osd, pg->get_osdmap()->get_epoch());
+ remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
if (con) {
if (con->has_feature(CEPH_FEATURE_RECOVERY_RESERVATION)) {
pg->osd->send_message_osd_cluster(
new MRecoveryReserve(
MRecoveryReserve::REQUEST,
- spg_t(pg->info.pgid.pgid, acting_osd_it->shard),
+ spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
pg->get_osdmap()->get_epoch()),
con.get());
} else {
post_event(RemoteRecoveryReserved());
}
}
- ++acting_osd_it;
+ ++remote_recovery_reservation_it;
} else {
post_event(AllRemotesReserved());
}
@@ -6086,8 +6214,8 @@ void PG::RecoveryState::Recovering::release_reservations()
// release remote reservations
for (set<pg_shard_t>::const_iterator i =
- context< Active >().sorted_actingbackfill_set.begin();
- i != context< Active >().sorted_actingbackfill_set.end();
+ context< Active >().remote_shards_to_reserve_recovery.begin();
+ i != context< Active >().remote_shards_to_reserve_recovery.end();
++i) {
if (*i == pg->pg_whoami) // skip myself
continue;
@@ -6196,16 +6324,34 @@ void PG::RecoveryState::Clean::exit()
pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
}
+template <typename T>
+set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
+{
+ set<int> osds_found;
+ set<pg_shard_t> out;
+ for (typename T::const_iterator i = in.begin();
+ i != in.end();
+ ++i) {
+ if (*i != skip && !osds_found.count(i->osd)) {
+ osds_found.insert(i->osd);
+ out.insert(*i);
+ }
+ }
+ return out;
+}
+
/*---------Active---------*/
PG::RecoveryState::Active::Active(my_context ctx)
: my_base(ctx),
NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active"),
- sorted_actingbackfill_set(
- context< RecoveryMachine >().pg->actingbackfill.begin(),
- context< RecoveryMachine >().pg->actingbackfill.end()),
- sorted_backfill_set(
- context< RecoveryMachine >().pg->backfill_targets.begin(),
- context< RecoveryMachine >().pg->backfill_targets.end()),
+ remote_shards_to_reserve_recovery(
+ unique_osd_shard_set(
+ context< RecoveryMachine >().pg->pg_whoami,
+ context< RecoveryMachine >().pg->actingbackfill)),
+ remote_shards_to_reserve_backfill(
+ unique_osd_shard_set(
+ context< RecoveryMachine >().pg->pg_whoami,
+ context< RecoveryMachine >().pg->backfill_targets)),
all_replicas_activated(false)
{
context< RecoveryMachine >().log_enter(state_name);
@@ -6588,6 +6734,7 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
MOSDPGLog *msg = logevt.msg.get();
dout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
+ ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
if (msg->info.last_backfill == hobject_t()) {
// restart backfill
pg->unreg_next_scrub();
@@ -6595,10 +6742,13 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
pg->reg_next_scrub();
pg->dirty_info = true;
pg->dirty_big_info = true; // maybe.
- pg->pg_log.claim_log(msg->log);
+
+ PGLogEntryHandler rollbacker;
+ pg->pg_log.claim_log_and_clear_rollback_info(msg->log, &rollbacker);
+ rollbacker.apply(pg, t);
+
pg->pg_log.reset_backfill();
} else {
- ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
pg->merge_log(*t, msg->info, msg->log, logevt.from);
}
@@ -7492,9 +7642,40 @@ bool PG::PriorSet::affected_by_map(const OSDMapRef osdmap, const PG *debug_pg) c
void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
assert(!rctx);
- rctx = new_ctx;
- if (rctx)
+ assert(!orig_ctx);
+ orig_ctx = new_ctx;
+ if (new_ctx) {
+ if (messages_pending_flush) {
+ rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
+ } else {
+ rctx = *new_ctx;
+ }
rctx->start_time = ceph_clock_now(pg->cct);
+ }
+}
+
+void PG::RecoveryState::begin_block_outgoing() {
+ assert(!messages_pending_flush);
+ assert(orig_ctx);
+ assert(rctx);
+ messages_pending_flush = BufferedRecoveryMessages();
+ rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
+}
+
+void PG::RecoveryState::clear_blocked_outgoing() {
+ assert(orig_ctx);
+ assert(rctx);
+ messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
+}
+
+void PG::RecoveryState::end_block_outgoing() {
+ assert(messages_pending_flush);
+ assert(orig_ctx);
+ assert(rctx);
+
+ rctx = RecoveryCtx(*orig_ctx);
+ rctx->accept_buffered_messages(*messages_pending_flush);
+ messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
}
void PG::RecoveryState::end_handle() {
@@ -7502,8 +7683,10 @@ void PG::RecoveryState::end_handle() {
utime_t dur = ceph_clock_now(pg->cct) - rctx->start_time;
machine.event_time += dur;
}
+
machine.event_count++;
- rctx = 0;
+ rctx = boost::optional<RecoveryCtx>();
+ orig_ctx = NULL;
}
void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
diff --git a/src/osd/PG.h b/src/osd/PG.h
index e9f3981..1aadaf0 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -447,6 +447,25 @@ public:
eversion_t last_complete_ondisk; // last_complete that has committed.
eversion_t last_update_applied;
+
+ struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
+ PGRef pg;
+ epoch_t e;
+ eversion_t v;
+ C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
+ : pg(pg), e(e), v(v) {}
+ void finish(int) {
+ pg->lock();
+ if (!pg->pg_has_reset_since(e)) {
+ pg->last_rollback_info_trimmed_to_applied = v;
+ }
+ pg->unlock();
+ }
+ };
+ // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
+ // and the transaction has applied
+ eversion_t last_rollback_info_trimmed_to_applied;
+
// primary state
public:
pg_shard_t primary;
@@ -487,6 +506,12 @@ public:
public:
+ struct BufferedRecoveryMessages {
+ map<int, map<spg_t, pg_query_t> > query_map;
+ map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > info_map;
+ map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > notify_list;
+ };
+
struct RecoveryCtx {
utime_t start_time;
map<int, map<spg_t, pg_query_t> > *query_map;
@@ -508,6 +533,48 @@ public:
on_applied(on_applied),
on_safe(on_safe),
transaction(transaction) {}
+
+ RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
+ : query_map(&(buf.query_map)),
+ info_map(&(buf.info_map)),
+ notify_list(&(buf.notify_list)),
+ on_applied(rctx.on_applied),
+ on_safe(rctx.on_safe),
+ transaction(rctx.transaction) {}
+
+ void accept_buffered_messages(BufferedRecoveryMessages &m) {
+ assert(query_map);
+ assert(info_map);
+ assert(notify_list);
+ for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
+ i != m.query_map.end();
+ ++i) {
+ map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
+ for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ omap[j->first] = j->second;
+ }
+ }
+ for (map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator i
+ = m.info_map.begin();
+ i != m.info_map.end();
+ ++i) {
+ vector<pair<pg_notify_t, pg_interval_map_t> > &ovec =
+ (*info_map)[i->first];
+ ovec.reserve(ovec.size() + i->second.size());
+ ovec.insert(ovec.end(), i->second.begin(), i->second.end());
+ }
+ for (map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator i
+ = m.notify_list.begin();
+ i != m.notify_list.end();
+ ++i) {
+ vector<pair<pg_notify_t, pg_interval_map_t> > &ovec =
+ (*notify_list)[i->first];
+ ovec.reserve(ovec.size() + i->second.size());
+ ovec.insert(ovec.end(), i->second.begin(), i->second.end());
+ }
+ }
};
struct NamedState {
@@ -1108,6 +1175,9 @@ public:
void scrub_clear_state();
bool scrub_gather_replica_maps();
void _scan_snaps(ScrubMap &map);
+ void _scan_rollback_obs(
+ const vector<ghobject_t> &rollback_obs,
+ ThreadPool::TPHandle &handle);
void _request_scrub_map_classic(pg_shard_t replica, eversion_t version);
void _request_scrub_map(pg_shard_t replica, eversion_t version,
hobject_t start, hobject_t end, bool deep);
@@ -1333,10 +1403,17 @@ public:
TrivialEvent(AllReplicasActivated)
+ TrivialEvent(IntervalFlush)
+
/* Encapsulates PG recovery process */
class RecoveryState {
void start_handle(RecoveryCtx *new_ctx);
void end_handle();
+ public:
+ void begin_block_outgoing();
+ void end_block_outgoing();
+ void clear_blocked_outgoing();
+ private:
/* States */
struct Initial;
@@ -1360,40 +1437,47 @@ public:
/* Accessor functions for state methods */
ObjectStore::Transaction* get_cur_transaction() {
+ assert(state->rctx);
assert(state->rctx->transaction);
return state->rctx->transaction;
}
void send_query(pg_shard_t to, const pg_query_t &query) {
+ assert(state->rctx);
assert(state->rctx->query_map);
(*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
query;
}
map<int, map<spg_t, pg_query_t> > *get_query_map() {
+ assert(state->rctx);
assert(state->rctx->query_map);
return state->rctx->query_map;
}
map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *get_info_map() {
+ assert(state->rctx);
assert(state->rctx->info_map);
return state->rctx->info_map;
}
list< Context* > *get_on_safe_context_list() {
+ assert(state->rctx);
assert(state->rctx->on_safe);
return &(state->rctx->on_safe->contexts);
}
list< Context * > *get_on_applied_context_list() {
+ assert(state->rctx);
assert(state->rctx->on_applied);
return &(state->rctx->on_applied->contexts);
}
- RecoveryCtx *get_recovery_ctx() { return state->rctx; }
+ RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
void send_notify(pg_shard_t to,
const pg_notify_t &info, const pg_interval_map_t &pi) {
+ assert(state->rctx);
assert(state->rctx->notify_list);
(*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
}
@@ -1439,12 +1523,14 @@ public:
boost::statechart::custom_reaction< ActMap >,
boost::statechart::custom_reaction< NullEvt >,
boost::statechart::custom_reaction< FlushedEvt >,
+ boost::statechart::custom_reaction< IntervalFlush >,
boost::statechart::transition< boost::statechart::event_base, Crashed >
> reactions;
boost::statechart::result react(const QueryState& q);
boost::statechart::result react(const AdvMap&);
boost::statechart::result react(const ActMap&);
boost::statechart::result react(const FlushedEvt&);
+ boost::statechart::result react(const IntervalFlush&);
boost::statechart::result react(const boost::statechart::event_base&) {
return discard_event();
}
@@ -1461,11 +1547,13 @@ public:
boost::statechart::custom_reaction< AdvMap >,
boost::statechart::custom_reaction< NullEvt >,
boost::statechart::custom_reaction< FlushedEvt >,
+ boost::statechart::custom_reaction< IntervalFlush >,
boost::statechart::transition< boost::statechart::event_base, Crashed >
> reactions;
boost::statechart::result react(const QueryState& q);
boost::statechart::result react(const AdvMap&);
boost::statechart::result react(const FlushedEvt&);
+ boost::statechart::result react(const IntervalFlush&);
boost::statechart::result react(const boost::statechart::event_base&) {
return discard_event();
}
@@ -1555,8 +1643,8 @@ public:
Active(my_context ctx);
void exit();
- const set<pg_shard_t> sorted_actingbackfill_set;
- const set<pg_shard_t> sorted_backfill_set;
+ const set<pg_shard_t> remote_shards_to_reserve_recovery;
+ const set<pg_shard_t> remote_shards_to_reserve_backfill;
bool all_replicas_activated;
typedef boost::mpl::list <
@@ -1635,10 +1723,14 @@ public:
struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
typedef boost::mpl::list<
- boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>
+ boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
+ boost::statechart::custom_reaction< RemoteBackfillReserved >,
+ boost::statechart::custom_reaction< RemoteReservationRejected >
> reactions;
NotBackfilling(my_context ctx);
void exit();
+ boost::statechart::result react(const RemoteBackfillReserved& evt);
+ boost::statechart::result react(const RemoteReservationRejected& evt);
};
struct RepNotRecovering;
@@ -1721,7 +1813,7 @@ public:
boost::statechart::custom_reaction< RemoteRecoveryReserved >,
boost::statechart::transition< AllRemotesReserved, Recovering >
> reactions;
- set<pg_shard_t>::const_iterator acting_osd_it;
+ set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
WaitRemoteRecoveryReserved(my_context ctx);
boost::statechart::result react(const RemoteRecoveryReserved &evt);
void exit();
@@ -1855,10 +1947,23 @@ public:
RecoveryMachine machine;
PG *pg;
- RecoveryCtx *rctx;
+
+ /// context passed in by state machine caller
+ RecoveryCtx *orig_ctx;
+
+ /// populated if we are buffering messages pending a flush
+ boost::optional<BufferedRecoveryMessages> messages_pending_flush;
+
+ /**
+ * populated between start_handle() and end_handle(), points into
+ * the message lists for messages_pending_flush while blocking messages
+ * or into orig_ctx otherwise
+ */
+ boost::optional<RecoveryCtx> rctx;
public:
- RecoveryState(PG *pg) : machine(this, pg), pg(pg), rctx(0) {
+ RecoveryState(PG *pg)
+ : machine(this, pg), pg(pg), orig_ctx(0) {
machine.initiate();
}
@@ -1996,7 +2101,10 @@ public:
void add_log_entry(pg_log_entry_t& e, bufferlist& log_bl);
void append_log(
- vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t,
+ vector<pg_log_entry_t>& logv,
+ eversion_t trim_to,
+ eversion_t trim_rollback_to,
+ ObjectStore::Transaction &t,
bool transaction_applied = true);
bool check_log_for_corruption(ObjectStore *store);
void trim_peers();
@@ -2026,6 +2134,7 @@ public:
/// share new pg log entries after a pg is active
void share_pg_log();
+ void reset_interval_flush();
void start_peering_interval(
const OSDMapRef lastmap,
const vector<int>& newup, int up_primary,
diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc
index e1aceef..57faadd 100644
--- a/src/osd/PGBackend.cc
+++ b/src/osd/PGBackend.cc
@@ -115,7 +115,11 @@ int PGBackend::objects_list_partial(
hobject_t *next)
{
assert(ls);
- ghobject_t _next(begin);
+ // Starts with the smallest shard id and generation to
+ // make sure the result list has the marker object (
+ // it might have multiple generations though, which would
+ // be filtered).
+ ghobject_t _next(begin, 0, shard_id_t(0));
ls->reserve(max);
int r = 0;
while (!_next.is_max() && ls->size() < (unsigned)min) {
@@ -147,7 +151,8 @@ int PGBackend::objects_list_range(
const hobject_t &start,
const hobject_t &end,
snapid_t seq,
- vector<hobject_t> *ls)
+ vector<hobject_t> *ls,
+ vector<ghobject_t> *gen_obs)
{
assert(ls);
vector<ghobject_t> objects;
@@ -163,6 +168,8 @@ int PGBackend::objects_list_range(
++i) {
if (i->is_no_gen()) {
ls->push_back(i->hobj);
+ } else if (gen_obs) {
+ gen_obs->push_back(*i);
}
}
return r;
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index e10201a..4070752 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -177,6 +177,7 @@
vector<pg_log_entry_t> &logv,
boost::optional<pg_hit_set_history_t> &hset_history,
const eversion_t &trim_to,
+ const eversion_t &trim_rollback_to,
bool transaction_applied,
ObjectStore::Transaction *t) = 0;
@@ -496,6 +497,7 @@
const eversion_t &at_version, ///< [in] version
PGTransaction *t, ///< [in] trans to execute
const eversion_t &trim_to, ///< [in] trim log to here
+ const eversion_t &trim_rollback_to, ///< [in] trim rollback info to here
vector<pg_log_entry_t> &log_entries, ///< [in] log entries for t
/// [in] hitset history (if updated with this transaction)
boost::optional<pg_hit_set_history_t> &hset_history,
@@ -555,7 +557,8 @@
const hobject_t &start,
const hobject_t &end,
snapid_t seq,
- vector<hobject_t> *ls);
+ vector<hobject_t> *ls,
+ vector<ghobject_t> *gen_obs=0);
int objects_get_attr(
const hobject_t &hoid,
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
index c3addd7..9523b12 100644
--- a/src/osd/PGLog.cc
+++ b/src/osd/PGLog.cc
@@ -24,6 +24,25 @@
//////////////////// PGLog::IndexedLog ////////////////////
+void PGLog::IndexedLog::advance_rollback_info_trimmed_to(
+ eversion_t to,
+ LogEntryHandler *h)
+{
+ assert(to <= can_rollback_to);
+
+ if (to > rollback_info_trimmed_to)
+ rollback_info_trimmed_to = to;
+
+ while (rollback_info_trimmed_to_riter != log.rbegin()) {
+ --rollback_info_trimmed_to_riter;
+ if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) {
+ ++rollback_info_trimmed_to_riter;
+ break;
+ }
+ h->trim(*rollback_info_trimmed_to_riter);
+ }
+}
+
void PGLog::IndexedLog::split_into(
pg_t child_pgid,
unsigned split_bits,
@@ -47,9 +66,11 @@ void PGLog::IndexedLog::split_into(
oldlog.erase(i++);
}
+
+ olog->can_rollback_to = can_rollback_to;
+
olog->index();
index();
- olog->can_rollback_to = can_rollback_to;
}
void PGLog::IndexedLog::trim(
@@ -59,10 +80,15 @@ void PGLog::IndexedLog::trim(
{
if (complete_to != log.end() &&
complete_to->version <= s) {
- generic_dout(0) << " bad trim to " << s << " when complete_to is " << complete_to->version
+ generic_dout(0) << " bad trim to " << s << " when complete_to is "
+ << complete_to->version
<< " on " << *this << dendl;
}
+ if (s > can_rollback_to)
+ can_rollback_to = s;
+ advance_rollback_info_trimmed_to(s, handler);
+
while (!log.empty()) {
pg_log_entry_t &e = *log.begin();
if (e.version > s)
@@ -70,9 +96,15 @@ void PGLog::IndexedLog::trim(
generic_dout(20) << "trim " << e << dendl;
if (trimmed)
trimmed->insert(e.version);
- handler->trim(e);
+
unindex(e); // remove from index,
- log.pop_front(); // from log
+
+ if (e.version == rollback_info_trimmed_to_riter->version) {
+ log.pop_front();
+ rollback_info_trimmed_to_riter = log.rend();
+ } else {
+ log.pop_front();
+ }
}
// raise tail?
@@ -104,7 +136,7 @@ void PGLog::reset_backfill()
void PGLog::clear() {
divergent_priors.clear();
missing.clear();
- log.zero();
+ log.clear();
log_keys_debug.clear();
undirty();
}
diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h
index f793cbd..1744cc8 100644
--- a/src/osd/PGLog.h
+++ b/src/osd/PGLog.h
@@ -62,12 +62,34 @@ struct PGLog {
list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item
version_t last_requested; // last object requested by primary
+ //
+ private:
+ /**
+ * rollback_info_trimmed_to_riter points to the first log entry <=
+ * rollback_info_trimmed_to
+ *
+ * It's a reverse_iterator because rend() is a natural representation for
+ * tail, and rbegin() works nicely for head.
+ */
+ list<pg_log_entry_t>::reverse_iterator rollback_info_trimmed_to_riter;
+ public:
+ void advance_rollback_info_trimmed_to(eversion_t to, LogEntryHandler *h);
+
/****/
- IndexedLog() : last_requested(0) {}
+ IndexedLog() :
+ complete_to(log.end()),
+ last_requested(0),
+ rollback_info_trimmed_to_riter(log.rbegin())
+ {}
+
+ void claim_log_and_clear_rollback_info(const pg_log_t& o) {
+ // we must have already trimmed the old entries
+ assert(rollback_info_trimmed_to == head);
+ assert(rollback_info_trimmed_to_riter == log.rbegin());
- void claim_log(const pg_log_t& o) {
log = o.log;
head = o.head;
+ rollback_info_trimmed_to = head;
tail = o.tail;
index();
}
@@ -78,10 +100,20 @@ struct PGLog {
IndexedLog *olog);
void zero() {
+ // we must have already trimmed the old entries
+ assert(rollback_info_trimmed_to == head);
+ assert(rollback_info_trimmed_to_riter == log.rbegin());
+
unindex();
pg_log_t::clear();
+ rollback_info_trimmed_to_riter = log.rbegin();
reset_recovery_pointers();
}
+ void clear() {
+ rollback_info_trimmed_to = head;
+ rollback_info_trimmed_to_riter = log.rbegin();
+ zero();
+ }
void reset_recovery_pointers() {
complete_to = log.end();
last_requested = 0;
@@ -112,6 +144,11 @@ struct PGLog {
caller_ops[i->reqid] = &(*i);
}
}
+
+ rollback_info_trimmed_to_riter = log.rbegin();
+ while (rollback_info_trimmed_to_riter != log.rend() &&
+ rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to)
+ rollback_info_trimmed_to_riter++;
}
void index(pg_log_entry_t& e) {
@@ -141,6 +178,11 @@ struct PGLog {
void add(pg_log_entry_t& e) {
// add to log
log.push_back(e);
+
+ // riter previously pointed to the previous entry
+ if (rollback_info_trimmed_to_riter == log.rbegin())
+ ++rollback_info_trimmed_to_riter;
+
assert(e.version > head);
assert(head.version == 0 || e.version.version > head.version);
head = e.version;
@@ -325,14 +367,33 @@ public:
eversion_t trim_to,
pg_info_t &info);
- void clear_can_rollback_to() {
+ void trim_rollback_info(
+ eversion_t trim_rollback_to,
+ LogEntryHandler *h) {
+ if (trim_rollback_to > log.can_rollback_to)
+ log.can_rollback_to = trim_rollback_to;
+ log.advance_rollback_info_trimmed_to(
+ trim_rollback_to,
+ h);
+ }
+
+ eversion_t get_rollback_trimmed_to() const {
+ return log.rollback_info_trimmed_to;
+ }
+
+ void clear_can_rollback_to(LogEntryHandler *h) {
log.can_rollback_to = log.head;
+ log.advance_rollback_info_trimmed_to(
+ log.head,
+ h);
}
//////////////////// get or set log & missing ////////////////////
- void claim_log(const pg_log_t &o) {
- log.claim_log(o);
+ void claim_log_and_clear_rollback_info(const pg_log_t &o, LogEntryHandler *h) {
+ log.can_rollback_to = log.head;
+ log.advance_rollback_info_trimmed_to(log.head, h);
+ log.claim_log_and_clear_rollback_info(o);
missing.clear();
mark_dirty_to(eversion_t::max());
}
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index 489aee4..4430b39 100644
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -494,6 +494,7 @@ void ReplicatedBackend::submit_transaction(
const eversion_t &at_version,
PGTransaction *_t,
const eversion_t &trim_to,
+ const eversion_t &trim_rollback_to,
vector<pg_log_entry_t> &log_entries,
boost::optional<pg_hit_set_history_t> &hset_history,
Context *on_local_applied_sync,
@@ -534,6 +535,7 @@ void ReplicatedBackend::submit_transaction(
tid,
reqid,
trim_to,
+ trim_rollback_to,
t->get_temp_added().size() ? *(t->get_temp_added().begin()) : hobject_t(),
t->get_temp_cleared().size() ?
*(t->get_temp_cleared().begin()) :hobject_t(),
@@ -549,7 +551,13 @@ void ReplicatedBackend::submit_transaction(
}
clear_temp_objs(t->get_temp_cleared());
- parent->log_operation(log_entries, hset_history, trim_to, true, &local_t);
+ parent->log_operation(
+ log_entries,
+ hset_history,
+ trim_to,
+ trim_rollback_to,
+ true,
+ &local_t);
local_t.append(*op_t);
local_t.swap(*op_t);
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
index 2d75d42..5e1f0ec 100644
--- a/src/osd/ReplicatedBackend.h
+++ b/src/osd/ReplicatedBackend.h
@@ -342,6 +342,7 @@ public:
const eversion_t &at_version,
PGTransaction *t,
const eversion_t &trim_to,
+ const eversion_t &trim_rollback_to,
vector<pg_log_entry_t> &log_entries,
boost::optional<pg_hit_set_history_t> &hset_history,
Context *on_local_applied_sync,
@@ -359,6 +360,7 @@ private:
ceph_tid_t tid,
osd_reqid_t reqid,
eversion_t pg_trim_to,
+ eversion_t pg_trim_rollback_to,
hobject_t new_temp_oid,
hobject_t discard_temp_oid,
vector<pg_log_entry_t> &log_entries,
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 15d2edf..5600466 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -1120,6 +1120,12 @@ void ReplicatedPG::do_request(
waiting_for_active.push_back(op);
return;
}
+ // verify client features
+ if ((pool.info.has_tiers() || pool.info.is_tier()) &&
+ !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
+ osd->reply_op_error(op, -EOPNOTSUPP);
+ return;
+ }
do_op(op); // do it now
break;
@@ -1352,9 +1358,10 @@ void ReplicatedPG::do_op(OpRequestRef op)
hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
hit_set_persist();
}
+ }
- if (agent_state)
- agent_choose_mode();
+ if (agent_state) {
+ agent_choose_mode();
}
if ((m->get_flags() & CEPH_OSD_FLAG_IGNORE_CACHE) == 0 &&
@@ -4854,8 +4861,9 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
if (pool.info.require_rollback())
ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
snap_oi = &ctx->clone_obc->obs.oi;
- bool got = ctx->clone_obc->get_write(ctx->op);
+ bool got = ctx->clone_obc->get_write_greedy(ctx->op);
assert(got);
+ dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
} else {
snap_oi = &static_snap_oi;
}
@@ -5160,8 +5168,9 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
0, osd_reqid_t(), ctx->mtime));
ctx->snapset_obc = get_object_context(snapoid, true);
- bool got = ctx->snapset_obc->get_write(ctx->op);
+ bool got = ctx->snapset_obc->get_write_greedy(ctx->op);
assert(got);
+ dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
ctx->release_snapset_obc = true;
if (pool.info.require_rollback() && !ctx->snapset_obc->obs.exists) {
ctx->log.back().mod_desc.create();
@@ -6026,6 +6035,11 @@ void ReplicatedPG::cancel_copy(CopyOpRef cop, bool requeue)
cop->results.should_requeue = requeue;
CopyCallbackResults result(-ECANCELED, &cop->results);
cop->cb->complete(result);
+
+ // There may still be an objecter callback referencing this copy op.
+ // That callback will not need the obc since it's been canceled, and
+ // we need the obc reference to go away prior to flush.
+ cop->obc = ObjectContextRef();
}
void ReplicatedPG::cancel_copy_ops(bool requeue)
@@ -6169,58 +6183,53 @@ int ReplicatedPG::start_flush(
cancel_flush(fop, false);
}
- // construct a SnapContext appropriate for this clone/head
- SnapContext dsnapc;
- dsnapc.seq = 0;
- SnapContext snapc;
- if (soid.snap == CEPH_NOSNAP) {
- snapc.seq = snapset.seq;
- snapc.snaps = snapset.snaps;
+ /**
+ * In general, we need to send two deletes and a copyfrom.
+ * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
+ * where 4 is marked as clean. To flush 10, we have to:
+ * 1) delete 4:[4,3,2] -- ensure head is created at cloneid 4
+ * 2) delete (8-1):[4,3,2] -- ensure that the object does not exist at 8
+ * 3) copyfrom 8:[8,4,3,2] -- flush object excluding snap 8
+ *
+ * The second delete is required in case at some point in the past
+ * there had been a clone 7(7,6), which we had flushed. Without
+ * the second delete, the object would appear in the base pool to
+ * have existed.
+ */
- if (!snapset.clones.empty() && snapset.clones.back() != snapset.seq) {
- dsnapc.seq = snapset.clones.back();
- vector<snapid_t>::iterator p = snapset.snaps.begin();
- while (p != snapset.snaps.end() && *p > dsnapc.seq)
- ++p;
- dsnapc.snaps = vector<snapid_t>(p, snapset.snaps.end());
+ SnapContext snapc, dsnapc, dsnapc2;
+ if (snapset.seq != 0) {
+ if (soid.snap == CEPH_NOSNAP) {
+ snapc.seq = snapset.seq;
+ snapc.snaps = snapset.snaps;
+ } else {
+ snapid_t min_included_snap = oi.snaps.back();
+ snapc = snapset.get_ssc_as_of(min_included_snap - 1);
}
- } else {
- vector<snapid_t>::iterator citer = std::find(
- snapset.clones.begin(),
- snapset.clones.end(),
- soid.snap);
- assert(citer != snapset.clones.end());
- snapid_t prev_snapc = (citer == snapset.clones.begin()) ?
- snapid_t(0) : *(citer - 1);
-
- vector<snapid_t>::iterator p = snapset.snaps.begin();
- while (p != snapset.snaps.end() && *p >= oi.snaps.back())
- ++p;
- snapc.snaps = vector<snapid_t>(p, snapset.snaps.end());
- while (p != snapset.snaps.end() && *p >= oi.snaps.back())
- ++p;
- vector<snapid_t>::iterator dnewest = p;
-
- // we may need to send a delete first
- while (p != snapset.snaps.end() && *p > prev_snapc)
- ++p;
- dsnapc.snaps = vector<snapid_t>(p, snapset.snaps.end());
+ snapid_t prev_snapc = 0;
+ for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
+ citer != snapset.clones.rend();
+ ++citer) {
+ if (*citer < soid.snap) {
+ prev_snapc = *citer;
+ break;
+ }
+ }
- if (p == dnewest) {
- // no snaps between the oldest in this clone and prev_snapc
- snapc.seq = prev_snapc;
- } else {
- // snaps between oldest in this clone and prev_snapc, send delete
- dsnapc.seq = prev_snapc;
- snapc.seq = oi.snaps.back() - 1;
+ if (prev_snapc != snapc.seq) {
+ dsnapc = snapset.get_ssc_as_of(prev_snapc);
+ snapid_t first_snap_after_prev_snapc =
+ snapset.get_first_snap_after(prev_snapc, snapc.seq);
+ dsnapc2 = snapset.get_ssc_as_of(
+ first_snap_after_prev_snapc - 1);
}
}
object_locator_t base_oloc(soid);
base_oloc.pool = pool.info.tier_of;
- if (dsnapc.seq > 0) {
+ if (dsnapc.seq > 0 && dsnapc.seq < snapc.seq) {
ObjectOperation o;
o.remove();
osd->objecter_lock.Lock();
@@ -6238,6 +6247,24 @@ int ReplicatedPG::start_flush(
osd->objecter_lock.Unlock();
}
+ if (dsnapc2.seq > dsnapc.seq && dsnapc2.seq < snapc.seq) {
+ ObjectOperation o;
+ o.remove();
+ osd->objecter_lock.Lock();
+ osd->objecter->mutate(
+ soid.oid,
+ base_oloc,
+ o,
+ dsnapc2,
+ oi.mtime,
+ (CEPH_OSD_FLAG_IGNORE_OVERLAY |
+ CEPH_OSD_FLAG_ORDERSNAP |
+ CEPH_OSD_FLAG_ENFORCE_SNAPC),
+ NULL,
+ NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
+ osd->objecter_lock.Unlock();
+ }
+
FlushOpRef fop(new FlushOp);
fop->obc = obc;
fop->flushed_version = oi.user_version;
@@ -6442,7 +6469,7 @@ void ReplicatedPG::cancel_flush_ops(bool requeue)
bool ReplicatedPG::is_present_clone(hobject_t coid)
{
- if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
+ if (!pool.info.allow_incomplete_clones())
return true;
if (is_missing_object(coid))
return true;
@@ -6735,6 +6762,7 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now)
repop->ctx->at_version,
repop->ctx->op_t,
pg_trim_to,
+ min_last_complete_ondisk,
repop->ctx->log,
repop->ctx->updated_hset_history,
onapplied_sync,
@@ -6752,6 +6780,7 @@ void ReplicatedBackend::issue_op(
ceph_tid_t tid,
osd_reqid_t reqid,
eversion_t pg_trim_to,
+ eversion_t pg_trim_rollback_to,
hobject_t new_temp_oid,
hobject_t discard_temp_oid,
vector<pg_log_entry_t> &log_entries,
@@ -6807,6 +6836,7 @@ void ReplicatedBackend::issue_op(
wr->pg_stats = get_info().stats;
wr->pg_trim_to = pg_trim_to;
+ wr->pg_trim_rollback_to = pg_trim_rollback_to;
wr->new_temp_oid = new_temp_oid;
wr->discard_temp_oid = discard_temp_oid;
@@ -6841,6 +6871,12 @@ ReplicatedPG::RepGather *ReplicatedPG::new_repop(OpContext *ctx, ObjectContextRe
void ReplicatedPG::remove_repop(RepGather *repop)
{
dout(20) << __func__ << " " << *repop << dendl;
+ if (repop->ctx->obc)
+ dout(20) << " obc " << *repop->ctx->obc << dendl;
+ if (repop->ctx->clone_obc)
+ dout(20) << " clone_obc " << *repop->ctx->clone_obc << dendl;
+ if (repop->ctx->snapset_obc)
+ dout(20) << " snapset_obc " << *repop->ctx->snapset_obc << dendl;
release_op_ctx_locks(repop->ctx);
repop->ctx->finish(0); // FIXME: return value here is sloppy
repop_map.erase(repop->rep_tid);
@@ -7607,6 +7643,7 @@ void ReplicatedBackend::sub_op_modify(OpRequestRef op)
log,
m->updated_hit_set_history,
m->pg_trim_to,
+ m->pg_trim_rollback_to,
update_snaps,
&(rm->localt));
@@ -7702,8 +7739,8 @@ void ReplicatedBackend::calc_head_subsets(
if (size)
data_subset.insert(0, size);
- if (get_parent()->get_pool().cache_mode != pg_pool_t::CACHEMODE_NONE) {
- dout(10) << __func__ << ": caching enabled, skipping clone subsets" << dendl;
+ if (get_parent()->get_pool().allow_incomplete_clones()) {
+ dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
return;
}
@@ -7762,8 +7799,8 @@ void ReplicatedBackend::calc_clone_subsets(
if (size)
data_subset.insert(0, size);
- if (get_parent()->get_pool().cache_mode != pg_pool_t::CACHEMODE_NONE) {
- dout(10) << __func__ << ": caching enabled, skipping clone subsets" << dendl;
+ if (get_parent()->get_pool().allow_incomplete_clones()) {
+ dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
return;
}
@@ -9465,6 +9502,17 @@ void ReplicatedPG::on_role_change()
void ReplicatedPG::on_pool_change()
{
dout(10) << __func__ << dendl;
+ // requeue cache full waiters just in case the cache_mode is
+ // changing away from writeback mode. note that if we are not
+ // active the normal requeuing machinery is sufficient (and properly
+ // ordered).
+ if (is_active() &&
+ pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+ !waiting_for_cache_not_full.empty()) {
+ dout(10) << __func__ << " requeuing full waiters (not in writeback) "
+ << dendl;
+ requeue_ops(waiting_for_cache_not_full);
+ }
hit_set_setup();
agent_setup();
}
@@ -10701,6 +10749,9 @@ void ReplicatedPG::hit_set_create()
if (p->target_size < static_cast<uint64_t>(g_conf->osd_hit_set_min_size))
p->target_size = g_conf->osd_hit_set_min_size;
+ if (p->target_size > static_cast<uint64_t>(g_conf->osd_hit_set_max_size))
+ p->target_size = g_conf->osd_hit_set_max_size;
+
p->seed = now.sec();
dout(10) << __func__ << " target_size " << p->target_size
@@ -11289,7 +11340,8 @@ bool ReplicatedPG::agent_maybe_evict(ObjectContextRef& obc)
}
}
- if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
+ if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL &&
+ hit_set) {
// is this object old and/or cold enough?
int atime = -1, temp = 0;
agent_estimate_atime_temp(soid, &atime, NULL /*FIXME &temp*/);
@@ -11421,7 +11473,11 @@ void ReplicatedPG::agent_choose_mode(bool restart)
num_dirty = 0;
}
- dout(10) << __func__ << ": "
+ dout(10) << __func__
+ << " flush_mode: "
+ << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
+ << " evict_mode: "
+ << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
<< " num_objects: " << info.stats.stats.sum.num_objects
<< " num_bytes: " << info.stats.stats.sum.num_bytes
<< " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
@@ -11435,7 +11491,7 @@ void ReplicatedPG::agent_choose_mode(bool restart)
// get dirty, full ratios
uint64_t dirty_micro = 0;
uint64_t full_micro = 0;
- if (pool.info.target_max_bytes && info.stats.stats.sum.num_objects) {
+ if (pool.info.target_max_bytes && info.stats.stats.sum.num_objects > 0) {
uint64_t avg_size = info.stats.stats.sum.num_bytes /
info.stats.stats.sum.num_objects;
dirty_micro =
@@ -11445,7 +11501,7 @@ void ReplicatedPG::agent_choose_mode(bool restart)
num_user_objects * avg_size * 1000000 /
MAX(pool.info.target_max_bytes / divisor, 1);
}
- if (pool.info.target_max_objects) {
+ if (pool.info.target_max_objects > 0) {
uint64_t dirty_objects_micro =
num_dirty * 1000000 /
MAX(pool.info.target_max_objects / divisor, 1);
@@ -11531,8 +11587,10 @@ void ReplicatedPG::agent_choose_mode(bool restart)
<< " -> "
<< TierAgentState::get_evict_mode_name(evict_mode)
<< dendl;
- if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
+ if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
+ is_active()) {
requeue_ops(waiting_for_cache_not_full);
+ requeue_ops(waiting_for_active);
}
agent_state->evict_mode = evict_mode;
}
@@ -11660,7 +11718,7 @@ void ReplicatedPG::_scrub(ScrubMap& scrubmap)
// did we finish the last oid?
if (head != hobject_t() &&
- pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
+ !pool.info.allow_incomplete_clones()) {
osd->clog.error() << mode << " " << info.pgid << " " << head
<< " missing clones";
++scrubber.shallow_errors;
@@ -11721,7 +11779,7 @@ void ReplicatedPG::_scrub(ScrubMap& scrubmap)
//
if (!next_clone.is_min() && next_clone != soid &&
- pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE) {
+ pool.info.allow_incomplete_clones()) {
// it is okay to be missing one or more clones in a cache tier.
// skip higher-numbered clones in the list.
while (curclone != snapset.clones.rend() &&
@@ -11809,7 +11867,7 @@ void ReplicatedPG::_scrub(ScrubMap& scrubmap)
}
if (!next_clone.is_min() &&
- pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
+ !pool.info.allow_incomplete_clones()) {
osd->clog.error() << mode << " " << info.pgid
<< " expected clone " << next_clone;
++scrubber.shallow_errors;
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 562cb06..9ef131c 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -347,13 +347,14 @@ public:
vector<pg_log_entry_t> &logv,
boost::optional<pg_hit_set_history_t> &hset_history,
const eversion_t &trim_to,
+ const eversion_t &trim_rollback_to,
bool transaction_applied,
ObjectStore::Transaction *t) {
if (hset_history) {
info.hit_set = *hset_history;
dirty_info = true;
}
- append_log(logv, trim_to, *t, transaction_applied);
+ append_log(logv, trim_to, trim_rollback_to, *t, transaction_applied);
}
void op_applied(
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index c57ee86..16bdbaf 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -2102,8 +2102,8 @@ void pg_notify_t::dump(Formatter *f) const
{
f->dump_int("from", from);
f->dump_int("to", to);
- f->dump_stream("query_epoch") << query_epoch;
- f->dump_stream("epoch_sent") << epoch_sent;
+ f->dump_unsigned("query_epoch", query_epoch);
+ f->dump_unsigned("epoch_sent", epoch_sent);
{
f->open_object_section("info");
info.dump(f);
@@ -2461,8 +2461,8 @@ struct DumpVisitor : public ObjectModDesc::Visitor {
void ObjectModDesc::dump(Formatter *f) const
{
f->open_object_section("object_mod_desc");
- f->dump_stream("can_local_rollback") << can_local_rollback;
- f->dump_stream("stashed") << stashed;
+ f->dump_bool("can_local_rollback", can_local_rollback);
+ f->dump_bool("rollback_info_completed", rollback_info_completed);
{
f->open_array_section("ops");
DumpVisitor vis(f);
@@ -2497,7 +2497,7 @@ void ObjectModDesc::encode(bufferlist &_bl) const
{
ENCODE_START(1, 1, _bl);
::encode(can_local_rollback, _bl);
- ::encode(stashed, _bl);
+ ::encode(rollback_info_completed, _bl);
::encode(bl, _bl);
ENCODE_FINISH(_bl);
}
@@ -2505,7 +2505,7 @@ void ObjectModDesc::decode(bufferlist::iterator &_bl)
{
DECODE_START(1, _bl);
::decode(can_local_rollback, _bl);
- ::decode(stashed, _bl);
+ ::decode(rollback_info_completed, _bl);
::decode(bl, _bl);
DECODE_FINISH(_bl);
}
@@ -2680,17 +2680,18 @@ ostream& operator<<(ostream& out, const pg_log_entry_t& e)
void pg_log_t::encode(bufferlist& bl) const
{
- ENCODE_START(5, 3, bl);
+ ENCODE_START(6, 3, bl);
::encode(head, bl);
::encode(tail, bl);
::encode(log, bl);
::encode(can_rollback_to, bl);
+ ::encode(rollback_info_trimmed_to, bl);
ENCODE_FINISH(bl);
}
void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
{
- DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
::decode(head, bl);
::decode(tail, bl);
if (struct_v < 2) {
@@ -2700,6 +2701,11 @@ void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
::decode(log, bl);
if (struct_v >= 5)
::decode(can_rollback_to, bl);
+
+ if (struct_v >= 6)
+ ::decode(rollback_info_trimmed_to, bl);
+ else
+ rollback_info_trimmed_to = tail;
DECODE_FINISH(bl);
// handle hobject_t format change
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index b70951c..8e9cf6f 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -811,9 +811,10 @@ struct pg_pool_t {
}
enum {
- FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding)
- FLAG_FULL = 2, // pool is full
+ FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
+ FLAG_FULL = 1<<1, // pool is full
FLAG_DEBUG_FAKE_EC_POOL = 1<<2, // require ReplicatedPG to act like an EC pg
+ FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
};
static const char *get_flag_name(int f) {
@@ -821,6 +822,7 @@ struct pg_pool_t {
case FLAG_HASHPSPOOL: return "hashpspool";
case FLAG_FULL: return "full";
case FLAG_DEBUG_FAKE_EC_POOL: return "require_local_rollback";
+ case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
default: return "???";
}
}
@@ -868,6 +870,18 @@ struct pg_pool_t {
const char *get_cache_mode_name() const {
return get_cache_mode_name(cache_mode);
}
+ bool cache_mode_requires_hit_set() const {
+ switch (cache_mode) {
+ case CACHEMODE_NONE:
+ case CACHEMODE_FORWARD:
+ case CACHEMODE_READONLY:
+ return false;
+ case CACHEMODE_WRITEBACK:
+ return true;
+ default:
+ assert(0 == "implement me");
+ }
+ }
uint64_t flags; ///< FLAG_*
__u8 type; ///< TYPE_*
@@ -916,11 +930,29 @@ public:
bool is_tier() const { return tier_of >= 0; }
bool has_tiers() const { return !tiers.empty(); }
- void clear_tier() { tier_of = -1; }
+ void clear_tier() {
+ tier_of = -1;
+ clear_read_tier();
+ clear_write_tier();
+ clear_tier_tunables();
+ }
bool has_read_tier() const { return read_tier >= 0; }
void clear_read_tier() { read_tier = -1; }
bool has_write_tier() const { return write_tier >= 0; }
void clear_write_tier() { write_tier = -1; }
+ void clear_tier_tunables() {
+ if (cache_mode != CACHEMODE_NONE)
+ flags |= FLAG_INCOMPLETE_CLONES;
+ cache_mode = CACHEMODE_NONE;
+
+ target_max_bytes = 0;
+ target_max_objects = 0;
+ cache_target_dirty_ratio_micro = 0;
+ cache_target_full_ratio_micro = 0;
+ hit_set_params = HitSet::Params();
+ hit_set_period = 0;
+ hit_set_count = 0;
+ }
uint64_t target_max_bytes; ///< tiering: target max pool size
uint64_t target_max_objects; ///< tiering: target max pool size
@@ -964,6 +996,7 @@ public:
void dump(Formatter *f) const;
uint64_t get_flags() const { return flags; }
+ bool has_flag(uint64_t f) const { return flags & f; }
/// This method will later return true for ec pools as well
bool ec_pool() const {
@@ -973,6 +1006,11 @@ public:
return ec_pool() || flags & FLAG_DEBUG_FAKE_EC_POOL;
}
+ /// true if incomplete clones may be present
+ bool allow_incomplete_clones() const {
+ return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
+ }
+
unsigned get_type() const { return type; }
unsigned get_size() const { return size; }
unsigned get_min_size() const { return min_size; }
@@ -1811,7 +1849,7 @@ inline ostream& operator<<(ostream& out, const pg_query_t& q) {
class PGBackend;
class ObjectModDesc {
bool can_local_rollback;
- bool stashed;
+ bool rollback_info_completed;
public:
class Visitor {
public:
@@ -1831,22 +1869,22 @@ public:
CREATE = 4,
UPDATE_SNAPS = 5
};
- ObjectModDesc() : can_local_rollback(true), stashed(false) {}
+ ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {}
void claim(ObjectModDesc &other) {
bl.clear();
bl.claim(other.bl);
can_local_rollback = other.can_local_rollback;
- stashed = other.stashed;
+ rollback_info_completed = other.rollback_info_completed;
}
void claim_append(ObjectModDesc &other) {
- if (!can_local_rollback || stashed)
+ if (!can_local_rollback || rollback_info_completed)
return;
if (!other.can_local_rollback) {
mark_unrollbackable();
return;
}
bl.claim_append(other.bl);
- stashed = other.stashed;
+ rollback_info_completed = other.rollback_info_completed;
}
void swap(ObjectModDesc &other) {
bl.swap(other.bl);
@@ -1855,16 +1893,16 @@ public:
other.can_local_rollback = can_local_rollback;
can_local_rollback = temp;
- temp = other.stashed;
- other.stashed = stashed;
- stashed = temp;
+ temp = other.rollback_info_completed;
+ other.rollback_info_completed = rollback_info_completed;
+ rollback_info_completed = temp;
}
void append_id(ModID id) {
uint8_t _id(id);
::encode(_id, bl);
}
void append(uint64_t old_size) {
- if (!can_local_rollback || stashed)
+ if (!can_local_rollback || rollback_info_completed)
return;
ENCODE_START(1, 1, bl);
append_id(APPEND);
@@ -1872,7 +1910,7 @@ public:
ENCODE_FINISH(bl);
}
void setattrs(map<string, boost::optional<bufferlist> > &old_attrs) {
- if (!can_local_rollback || stashed)
+ if (!can_local_rollback || rollback_info_completed)
return;
ENCODE_START(1, 1, bl);
append_id(SETATTRS);
@@ -1880,24 +1918,25 @@ public:
ENCODE_FINISH(bl);
}
bool rmobject(version_t deletion_version) {
- if (!can_local_rollback || stashed)
+ if (!can_local_rollback || rollback_info_completed)
return false;
ENCODE_START(1, 1, bl);
append_id(DELETE);
::encode(deletion_version, bl);
ENCODE_FINISH(bl);
- stashed = true;
+ rollback_info_completed = true;
return true;
}
void create() {
- if (!can_local_rollback || stashed)
+ if (!can_local_rollback || rollback_info_completed)
return;
+ rollback_info_completed = true;
ENCODE_START(1, 1, bl);
append_id(CREATE);
ENCODE_FINISH(bl);
}
void update_snaps(set<snapid_t> &old_snaps) {
- if (!can_local_rollback || stashed)
+ if (!can_local_rollback || rollback_info_completed)
return;
ENCODE_START(1, 1, bl);
append_id(UPDATE_SNAPS);
@@ -2061,6 +2100,10 @@ struct pg_log_t {
// We can rollback rollback-able entries > can_rollback_to
eversion_t can_rollback_to;
+ // always <= can_rollback_to, indicates how far stashed rollback
+ // data can be found
+ eversion_t rollback_info_trimmed_to;
+
list<pg_log_entry_t> log; // the actual log.
pg_log_t() {}
@@ -2492,6 +2535,29 @@ struct SnapSet {
void decode(bufferlist::iterator& bl);
void dump(Formatter *f) const;
static void generate_test_instances(list<SnapSet*>& o);
+
+ SnapContext get_ssc_as_of(snapid_t as_of) const {
+ SnapContext out;
+ out.seq = as_of;
+ for (vector<snapid_t>::const_iterator i = snaps.begin();
+ i != snaps.end();
+ ++i) {
+ if (*i <= as_of)
+ out.snaps.push_back(*i);
+ }
+ return out;
+ }
+
+ // return min element of snaps > after, return max if no such element
+ snapid_t get_first_snap_after(snapid_t after, snapid_t max) const {
+ for (vector<snapid_t>::const_reverse_iterator i = snaps.rbegin();
+ i != snaps.rend();
+ ++i) {
+ if (*i > after)
+ return *i;
+ }
+ return max;
+ }
};
WRITE_CLASS_ENCODER(SnapSet)
@@ -2762,19 +2828,21 @@ public:
}
}
- bool get_write(OpRequestRef op) {
- if (get_write_lock()) {
+ bool get_write(OpRequestRef op, bool greedy=false) {
+ if (get_write_lock(greedy)) {
return true;
} // else
if (op)
waiters.push_back(op);
return false;
}
- bool get_write_lock() {
- // don't starve anybody!
- if (!waiters.empty() ||
- backfill_read_marker) {
- return false;
+ bool get_write_lock(bool greedy=false) {
+ if (!greedy) {
+ // don't starve anybody!
+ if (!waiters.empty() ||
+ backfill_read_marker) {
+ return false;
+ }
}
switch (state) {
case RWNONE:
@@ -2823,7 +2891,10 @@ public:
return rwstate.get_read(op);
}
bool get_write(OpRequestRef op) {
- return rwstate.get_write(op);
+ return rwstate.get_write(op, false);
+ }
+ bool get_write_greedy(OpRequestRef op) {
+ return rwstate.get_write(op, true);
}
bool get_snaptrimmer_write() {
if (rwstate.get_write_lock()) {
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index e165266..d82b3e1 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -1364,6 +1364,11 @@ int Objecter::op_cancel(ceph_tid_t tid, int r)
ldout(cct, 10) << __func__ << " tid " << tid << dendl;
Op *op = p->second;
+ if (op->con) {
+ ldout(cct, 20) << " revoking rx buffer for " << tid
+ << " on " << op->con << dendl;
+ op->con->revoke_rx_buffer(tid);
+ }
if (op->onack) {
op->onack->complete(r);
op->onack = NULL;
@@ -1434,7 +1439,7 @@ int64_t Objecter::get_object_pg_hash_position(int64_t pool, const string& key,
return p->raw_hash_to_pg(p->hash_key(key, ns));
}
-int Objecter::calc_target(op_target_t *t)
+int Objecter::calc_target(op_target_t *t, bool any_change)
{
bool is_read = t->flags & CEPH_OSD_FLAG_READ;
bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
@@ -1491,7 +1496,8 @@ int Objecter::calc_target(op_target_t *t)
}
if (t->pgid != pgid ||
- is_pg_changed(t->primary, t->acting, primary, acting, t->used_replica) ||
+ is_pg_changed(
+ t->primary, t->acting, primary, acting, t->used_replica || any_change) ||
force_resend) {
t->pgid = pgid;
t->acting = acting;
@@ -1570,7 +1576,7 @@ int Objecter::recalc_op_target(Op *op)
bool Objecter::recalc_linger_op_target(LingerOp *linger_op)
{
- int r = calc_target(&linger_op->target);
+ int r = calc_target(&linger_op->target, true);
if (r == RECALC_OP_TARGET_NEED_RESEND) {
ldout(cct, 10) << "recalc_linger_op_target tid " << linger_op->linger_id
<< " pgid " << linger_op->target.pgid
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 1e6fcf3..2ede888 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -1480,7 +1480,7 @@ public:
bool osdmap_full_flag() const;
bool target_should_be_paused(op_target_t *op);
- int calc_target(op_target_t *t);
+ int calc_target(op_target_t *t, bool any_change=false);
int recalc_op_target(Op *op);
bool recalc_linger_op_target(LingerOp *op);
diff --git a/src/pybind/rados.py b/src/pybind/rados.py
index e5da077..0fbd10e 100644
--- a/src/pybind/rados.py
+++ b/src/pybind/rados.py
@@ -1089,8 +1089,11 @@ class Ioctx(object):
:returns: completion object
"""
buf = create_string_buffer(length)
- def oncomplete_(completion):
- return oncomplete(completion, buf.value)
+ def oncomplete_(completion_v):
+ return_value = completion_v.get_return_value()
+ return oncomplete(completion_v,
+ ctypes.string_at(buf, return_value) if return_value >= 0 else None)
+
completion = self.__get_completion(oncomplete_, None)
ret = run_in_thread(self.librados.rados_aio_read,
(self.io, c_char_p(object_name),
diff --git a/src/pybind/rbd.py b/src/pybind/rbd.py
index bf07576..ab093ce 100644
--- a/src/pybind/rbd.py
+++ b/src/pybind/rbd.py
@@ -750,6 +750,14 @@ written." % (self.name, ret, length))
if ret < 0:
raise make_ex(ret, 'error flushing image')
+ def invalidate_cache(self):
+ """
+ Drop any cached data for the image.
+ """
+ ret = self.librbd.rbd_invalidate_cache(self.image)
+ if ret < 0:
+ raise make_ex(ret, 'error invalidating cache')
+
def stripe_unit(self):
"""
Returns the stripe unit used for the image.
diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc
index 58913cc..5a1043f 100644
--- a/src/rgw/rgw_common.cc
+++ b/src/rgw/rgw_common.cc
@@ -697,13 +697,15 @@ bool url_decode(string& src_str, string& dest_str)
int pos = 0;
char c;
+ bool in_query = false;
while (*src) {
if (*src != '%') {
- if (*src != '+') {
- dest[pos++] = *src++;
+ if (!in_query || *src != '+') {
+ if (*src == '?') in_query = true;
+ dest[pos++] = *src++;
} else {
- dest[pos++] = ' ';
- ++src;
+ dest[pos++] = ' ';
+ ++src;
}
} else {
src++;
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 8979619..7694748 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -1380,7 +1380,10 @@ public:
int RGWPutObjProcessor_Multipart::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
{
- RGWPutObjProcessor::prepare(store, obj_ctx, NULL);
+ int r = prepare_init(store, obj_ctx, NULL);
+ if (r < 0) {
+ return r;
+ }
string oid = obj_str;
upload_id = s->info.args.get("uploadId");
@@ -1419,7 +1422,7 @@ int RGWPutObjProcessor_Multipart::prepare(RGWRados *store, void *obj_ctx, string
manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, num);
- int r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, target_obj);
+ r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, target_obj);
if (r < 0) {
return r;
}
@@ -1560,6 +1563,36 @@ int RGWPutObj::user_manifest_iterate_cb(rgw_bucket& bucket, RGWObjEnt& ent, RGWA
return 0;
}
+static int put_data_and_throttle(RGWPutObjProcessor *processor, bufferlist& data, off_t ofs,
+ MD5 *hash, bool need_to_wait)
+{
+ const unsigned char *data_ptr = (hash ? (const unsigned char *)data.c_str() : NULL);
+ bool again;
+ uint64_t len = data.length();
+
+ do {
+ void *handle;
+
+ int ret = processor->handle_data(data, ofs, &handle, &again);
+ if (ret < 0)
+ return ret;
+
+ if (hash) {
+ hash->Update(data_ptr, len);
+ hash = NULL; /* only calculate hash once */
+ }
+
+ ret = processor->throttle_data(handle, need_to_wait);
+ if (ret < 0)
+ return ret;
+
+ need_to_wait = false; /* the need to wait only applies to the first iteration */
+ } while (again);
+
+ return 0;
+}
+
+
void RGWPutObj::execute()
{
RGWPutObjProcessor *processor = NULL;
@@ -1633,23 +1666,12 @@ void RGWPutObj::execute()
if (!len)
break;
- void *handle;
- const unsigned char *data_ptr = (const unsigned char *)data.c_str();
-
- ret = processor->handle_data(data, ofs, &handle);
- if (ret < 0)
- goto done;
-
- if (need_calc_md5) {
- hash.Update(data_ptr, len);
- }
-
/* do we need this operation to be synchronous? if we're dealing with an object with immutable
* head, e.g., multipart object we need to make sure we're the first one writing to this object
*/
bool need_to_wait = (ofs == 0) && multipart;
- ret = processor->throttle_data(handle, need_to_wait);
+ ret = put_data_and_throttle(processor, data, ofs, (need_calc_md5 ? &hash : NULL), need_to_wait);
if (ret < 0) {
if (!need_to_wait || ret != -EEXIST) {
ldout(s->cct, 20) << "processor->thottle_data() returned ret=" << ret << dendl;
@@ -1674,15 +1696,8 @@ void RGWPutObj::execute()
goto done;
}
- ret = processor->handle_data(data, ofs, &handle);
+ ret = put_data_and_throttle(processor, data, ofs, NULL, false);
if (ret < 0) {
- ldout(s->cct, 0) << "ERROR: processor->handle_data() returned " << ret << dendl;
- goto done;
- }
-
- ret = processor->throttle_data(handle, false);
- if (ret < 0) {
- ldout(s->cct, 0) << "ERROR: processor->throttle_data() returned " << ret << dendl;
goto done;
}
}
@@ -1846,18 +1861,7 @@ void RGWPostObj::execute()
if (!len)
break;
- void *handle;
- const unsigned char *data_ptr = (const unsigned char *)data.c_str();
-
- ret = processor->handle_data(data, ofs, &handle);
- if (ret < 0)
- goto done;
-
- hash.Update(data_ptr, len);
-
- ret = processor->throttle_data(handle, false);
- if (ret < 0)
- goto done;
+ ret = put_data_and_throttle(processor, data, ofs, &hash, false);
ofs += len;
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 7ca4a9d..e22bef0 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -900,8 +900,10 @@ int RGWPutObjProcessor_Plain::prepare(RGWRados *store, void *obj_ctx, string *oi
return 0;
};
-int RGWPutObjProcessor_Plain::handle_data(bufferlist& bl, off_t _ofs, void **phandle)
+int RGWPutObjProcessor_Plain::handle_data(bufferlist& bl, off_t _ofs, void **phandle, bool *again)
{
+ *again = false;
+
if (ofs != _ofs)
return -EINVAL;
@@ -1026,8 +1028,10 @@ int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phan
return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
}
-int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle)
+int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again)
{
+ *again = false;
+
*phandle = NULL;
if (extra_data_len) {
size_t extra_len = bl.length();
@@ -1044,13 +1048,16 @@ int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **pha
}
}
- uint64_t max_chunk_size = store->get_max_chunk_size();
+ uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
pending_data_bl.claim_append(bl);
- if (pending_data_bl.length() < max_chunk_size)
+ if (pending_data_bl.length() < max_write_size)
return 0;
- pending_data_bl.splice(0, max_chunk_size, &bl);
+ pending_data_bl.splice(0, max_write_size, &bl);
+
+ /* do we have enough data pending accumulated that needs to be written? */
+ *again = (pending_data_bl.length() >= max_chunk_size);
if (!data_ofs && !immutable_head()) {
first_chunk.claim(bl);
@@ -1070,17 +1077,30 @@ int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **pha
return write_data(bl, write_ofs, phandle, exclusive);
}
-int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
+
+int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, void *obj_ctx, string *oid_rand)
{
RGWPutObjProcessor::prepare(store, obj_ctx, oid_rand);
- head_obj.init(bucket, obj_str);
+ int r = store->get_max_chunk_size(bucket, &max_chunk_size);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
- uint64_t max_chunk_size = store->get_max_chunk_size();
+int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
+{
+ int r = prepare_init(store, obj_ctx, oid_rand);
+ if (r < 0) {
+ return r;
+ }
+ head_obj.init(bucket, obj_str);
manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
- int r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, head_obj);
+ r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, head_obj);
if (r < 0) {
return r;
}
@@ -1201,6 +1221,44 @@ void RGWRadosCtx::set_prefetch_data(rgw_obj& obj) {
}
}
+int RGWRados::get_required_alignment(rgw_bucket& bucket, uint64_t *alignment)
+{
+ IoCtx ioctx;
+ int r = open_bucket_data_ctx(bucket, ioctx);
+ if (r < 0) {
+ ldout(cct, 0) << "ERROR: open_bucket_data_ctx() returned " << r << dendl;
+ return r;
+ }
+
+ *alignment = ioctx.pool_required_alignment();
+ return 0;
+}
+
+int RGWRados::get_max_chunk_size(rgw_bucket& bucket, uint64_t *max_chunk_size)
+{
+ uint64_t alignment;
+ int r = get_required_alignment(bucket, &alignment);
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
+
+ if (alignment == 0) {
+ *max_chunk_size = config_chunk_size;
+ return 0;
+ }
+
+ if (config_chunk_size <= alignment) {
+ *max_chunk_size = alignment;
+ return 0;
+ }
+
+ *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
+
+ return 0;
+}
+
void RGWRados::finalize()
{
if (need_watch_notify()) {
@@ -1236,8 +1294,6 @@ int RGWRados::init_rados()
{
int ret;
- max_chunk_size = cct->_conf->rgw_max_chunk_size;
-
rados = new Rados();
if (!rados)
return -ENOMEM;
@@ -2957,25 +3013,33 @@ public:
int handle_data(bufferlist& bl, off_t ofs, off_t len) {
progress_cb(ofs, progress_data);
- void *handle;
- int ret = processor->handle_data(bl, ofs, &handle);
- if (ret < 0)
- return ret;
+ bool again;
- if (opstate) {
- /* need to update opstate repository with new state. This is ratelimited, so we're not
- * really doing it every time
- */
- ret = opstate->renew_state();
- if (ret < 0) {
- /* could not renew state! might have been marked as cancelled */
+ bool need_opstate = true;
+
+ do {
+ void *handle;
+ int ret = processor->handle_data(bl, ofs, &handle, &again);
+ if (ret < 0)
return ret;
+
+ if (need_opstate && opstate) {
+ /* need to update opstate repository with new state. This is ratelimited, so we're not
+ * really doing it every time
+ */
+ ret = opstate->renew_state();
+ if (ret < 0) {
+ /* could not renew state! might have been marked as cancelled */
+ return ret;
+ }
+
+ need_opstate = false;
}
- }
- ret = processor->throttle_data(handle, false);
- if (ret < 0)
- return ret;
+ ret = processor->throttle_data(handle, false);
+ if (ret < 0)
+ return ret;
+ } while (again);
return 0;
}
@@ -3192,24 +3256,6 @@ set_err_state:
vector<rgw_obj> ref_objs;
- bool copy_data = !astate->has_manifest;
- bool copy_first = false;
- if (astate->has_manifest) {
- if (!astate->manifest.has_tail()) {
- copy_data = true;
- } else {
- uint64_t head_size = astate->manifest.get_head_size();
-
- if (head_size > 0) {
- if (head_size > max_chunk_size) // should never happen
- copy_data = true;
- else
- copy_first = true;
- }
- }
- }
-
-
if (remote_dest) {
/* dest is in a different region, copy it there */
@@ -3230,8 +3276,35 @@ set_err_state:
return ret;
return 0;
- } else if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
- return copy_obj_data(ctx, dest_bucket_info.owner, &handle, end, dest_obj, src_obj, mtime, src_attrs, category, ptag, err);
+ }
+
+ uint64_t max_chunk_size;
+
+ ret = get_max_chunk_size(dest_obj.bucket, &max_chunk_size);
+ if (ret < 0) {
+ ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
+ return ret;
+ }
+
+ bool copy_data = !astate->has_manifest;
+ bool copy_first = false;
+ if (astate->has_manifest) {
+ if (!astate->manifest.has_tail()) {
+ copy_data = true;
+ } else {
+ uint64_t head_size = astate->manifest.get_head_size();
+
+ if (head_size > 0) {
+ if (head_size > max_chunk_size)
+ copy_data = true;
+ else
+ copy_first = true;
+ }
+ }
+ }
+
+ if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
+ return copy_obj_data(ctx, dest_bucket_info.owner, &handle, end, dest_obj, src_obj, max_chunk_size, mtime, src_attrs, category, ptag, err);
}
RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
@@ -3341,6 +3414,7 @@ int RGWRados::copy_obj_data(void *ctx,
void **handle, off_t end,
rgw_obj& dest_obj,
rgw_obj& src_obj,
+ uint64_t max_chunk_size,
time_t *mtime,
map<string, bufferlist>& attrs,
RGWObjCategory category,
@@ -4473,6 +4547,8 @@ int RGWRados::get_obj(void *ctx, RGWObjVersionTracker *objv_tracker, void **hand
bool merge_bl = false;
bufferlist *pbl = &bl;
bufferlist read_bl;
+ uint64_t max_chunk_size;
+
get_obj_bucket_and_oid_key(obj, bucket, oid, key);
@@ -4505,6 +4581,12 @@ int RGWRados::get_obj(void *ctx, RGWObjVersionTracker *objv_tracker, void **hand
}
}
+ r = get_max_chunk_size(bucket, &max_chunk_size);
+ if (r < 0) {
+ ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << bucket << dendl;
+ goto done_ret;
+ }
+
if (len > max_chunk_size)
len = max_chunk_size;
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index d50fb59..d811b49 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -548,7 +548,7 @@ public:
obj_ctx = _o;
return 0;
};
- virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle) = 0;
+ virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again) = 0;
virtual int throttle_data(void *handle, bool need_to_wait) = 0;
virtual int complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs);
};
@@ -564,7 +564,7 @@ class RGWPutObjProcessor_Plain : public RGWPutObjProcessor
protected:
int prepare(RGWRados *store, void *obj_ctx, string *oid_rand);
- int handle_data(bufferlist& bl, off_t ofs, void **phandle);
+ int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again);
int do_complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs);
public:
@@ -613,6 +613,8 @@ class RGWPutObjProcessor_Atomic : public RGWPutObjProcessor_Aio
uint64_t extra_data_len;
bufferlist extra_data_bl;
bufferlist pending_data_bl;
+ uint64_t max_chunk_size;
+
protected:
rgw_bucket bucket;
string obj_str;
@@ -631,6 +633,8 @@ protected:
int complete_parts();
int complete_writing_data();
+ int prepare_init(RGWRados *store, void *obj_ctx, string *oid_rand);
+
public:
~RGWPutObjProcessor_Atomic() {}
RGWPutObjProcessor_Atomic(const string& bucket_owner, rgw_bucket& _b, const string& _o, uint64_t _p, const string& _t) :
@@ -641,6 +645,7 @@ public:
cur_part_id(0),
data_ofs(0),
extra_data_len(0),
+ max_chunk_size(0),
bucket(_b),
obj_str(_o),
unique_tag(_t) {}
@@ -649,7 +654,7 @@ public:
void set_extra_data_len(uint64_t len) {
extra_data_len = len;
}
- virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle);
+ virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again);
bufferlist& get_extra_data() { return extra_data_bl; }
};
@@ -1221,8 +1226,6 @@ class RGWRados
int get_obj_ref(const rgw_obj& obj, rgw_rados_ref *ref, rgw_bucket *bucket, bool ref_system_obj = false);
uint64_t max_bucket_id;
- uint64_t max_chunk_size;
-
int get_obj_state(RGWRadosCtx *rctx, rgw_obj& obj, RGWObjState **state, RGWObjVersionTracker *objv_tracker);
int append_atomic_test(RGWRadosCtx *rctx, rgw_obj& obj,
librados::ObjectOperation& op, RGWObjState **state);
@@ -1287,7 +1290,6 @@ public:
num_watchers(0), watchers(NULL), watch_handles(NULL),
watch_initialized(false),
bucket_id_lock("rados_bucket_id"), max_bucket_id(0),
- max_chunk_size(0),
cct(NULL), rados(NULL),
pools_initialized(false),
quota_handler(NULL),
@@ -1325,9 +1327,8 @@ public:
}
}
- uint64_t get_max_chunk_size() {
- return max_chunk_size;
- }
+ int get_required_alignment(rgw_bucket& bucket, uint64_t *alignment);
+ int get_max_chunk_size(rgw_bucket& bucket, uint64_t *max_chunk_size);
int list_raw_objects(rgw_bucket& pool, const string& prefix_filter, int max,
RGWListRawObjsCtx& ctx, list<string>& oids,
@@ -1563,6 +1564,7 @@ public:
void **handle, off_t end,
rgw_obj& dest_obj,
rgw_obj& src_obj,
+ uint64_t max_chunk_size,
time_t *mtime,
map<string, bufferlist>& attrs,
RGWObjCategory category,
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
index 69948a6..b74002d 100644
--- a/src/rgw/rgw_rest.cc
+++ b/src/rgw/rgw_rest.cc
@@ -180,7 +180,7 @@ void rgw_flush_formatter_and_reset(struct req_state *s, Formatter *formatter)
std::ostringstream oss;
formatter->flush(oss);
std::string outs(oss.str());
- if (!outs.empty()) {
+ if (!outs.empty() && s->op != OP_HEAD) {
s->cio->write(outs.c_str(), outs.size());
}
@@ -192,7 +192,7 @@ void rgw_flush_formatter(struct req_state *s, Formatter *formatter)
std::ostringstream oss;
formatter->flush(oss);
std::string outs(oss.str());
- if (!outs.empty()) {
+ if (!outs.empty() && s->op != OP_HEAD) {
s->cio->write(outs.c_str(), outs.size());
}
}
diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc
index 507a7ff..b562079 100644
--- a/src/rgw/rgw_rest_swift.cc
+++ b/src/rgw/rgw_rest_swift.cc
@@ -627,18 +627,16 @@ void RGWOptionsCORS_ObjStore_SWIFT::send_response()
uint32_t max_age = CORS_MAX_AGE_INVALID;
/*EACCES means, there is no CORS registered yet for the bucket
*ENOENT means, there is no match of the Origin in the list of CORSRule
- *ENOTSUPP means, the HTTP_METHOD is not supported
*/
if (ret == -ENOENT)
ret = -EACCES;
- if (ret != -EACCES) {
- get_response_params(hdrs, exp_hdrs, &max_age);
- } else {
+ if (ret < 0) {
set_req_state_err(s, ret);
dump_errno(s);
end_header(s, NULL);
return;
}
+ get_response_params(hdrs, exp_hdrs, &max_age);
dump_errno(s);
dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(), max_age);
end_header(s, NULL);
diff --git a/src/test/crush/TestCrushWrapper.cc b/src/test/crush/TestCrushWrapper.cc
index d70a525..34d6401 100644
--- a/src/test/crush/TestCrushWrapper.cc
+++ b/src/test/crush/TestCrushWrapper.cc
@@ -538,6 +538,11 @@ TEST(CrushWrapper, dump_rules) {
ss.str().find("<item_name>default</item_name></step>"));
}
+ map<int,float> wm;
+ c->get_rule_weight_osd_map(0, &wm);
+ ASSERT_TRUE(wm.size() == 1);
+ ASSERT_TRUE(wm[0] == 1.0);
+
delete c;
}
diff --git a/src/test/erasure-code/TestErasureCodeJerasure.cc b/src/test/erasure-code/TestErasureCodeJerasure.cc
index 4b768a8..5c637da 100644
--- a/src/test/erasure-code/TestErasureCodeJerasure.cc
+++ b/src/test/erasure-code/TestErasureCodeJerasure.cc
@@ -288,6 +288,36 @@ TEST(ErasureCodeTest, create_ruleset)
}
}
+ //
+ // The ruleid may be different from the ruleset when a crush rule is
+ // removed because the removed ruleid will be reused but the removed
+ // ruleset will not be reused.
+ //
+ // This also asserts that the create_ruleset() method returns a
+ // ruleset and not a ruleid http://tracker.ceph.com/issues/9044
+ //
+ {
+ stringstream ss;
+ ErasureCodeJerasureReedSolomonVandermonde jerasure;
+ map<std::string,std::string> parameters;
+ parameters["k"] = "2";
+ parameters["m"] = "2";
+ parameters["w"] = "8";
+ jerasure.init(parameters);
+ int FIRST = jerasure.create_ruleset("FIRST", *c, &ss);
+ int SECOND = jerasure.create_ruleset("SECOND", *c, &ss);
+ int FIRST_ruleid = c->get_rule_id("FIRST");
+ EXPECT_EQ(0, c->remove_rule(FIRST_ruleid));
+ int ruleset = jerasure.create_ruleset("myrule", *c, &ss);
+ EXPECT_NE(FIRST, ruleset);
+ EXPECT_NE(SECOND, ruleset);
+ EXPECT_NE(ruleset, c->get_rule_id("myrule"));
+ int SECOND_ruleid = c->get_rule_id("SECOND");
+ EXPECT_EQ(0, c->remove_rule(SECOND_ruleid));
+ int myrule_ruleid = c->get_rule_id("myrule");
+ EXPECT_EQ(0, c->remove_rule(myrule_ruleid));
+ }
+
{
stringstream ss;
ErasureCodeJerasureReedSolomonVandermonde jerasure;
diff --git a/src/test/librados/TestCase.cc b/src/test/librados/TestCase.cc
index 9f68af1..7f072fd 100644
--- a/src/test/librados/TestCase.cc
+++ b/src/test/librados/TestCase.cc
@@ -8,6 +8,7 @@
using namespace librados;
std::string RadosTest::pool_name;
+std::string RadosTest::nspace;
rados_t RadosTest::s_cluster = NULL;
void RadosTest::SetUpTestCase()
@@ -25,7 +26,7 @@ void RadosTest::SetUp()
{
cluster = RadosTest::s_cluster;
ASSERT_EQ(0, rados_ioctx_create(cluster, pool_name.c_str(), &ioctx));
- std::string nspace = get_temp_pool_name();
+ nspace = get_temp_pool_name();
rados_ioctx_set_namespace(ioctx, nspace.c_str());
ASSERT_FALSE(rados_ioctx_pool_requires_alignment(ioctx));
}
@@ -206,24 +207,6 @@ void RadosTestEC::TearDown()
rados_ioctx_destroy(ioctx);
}
-void RadosTestEC::cleanup_default_namespace(rados_ioctx_t ioctx)
-{
- // remove all objects from the default namespace to avoid polluting
- // other tests
- rados_ioctx_set_namespace(ioctx, "");
- rados_list_ctx_t list_ctx;
- ASSERT_EQ(0, rados_objects_list_open(ioctx, &list_ctx));
- int r;
- const char *entry = NULL;
- const char *key = NULL;
- while ((r = rados_objects_list_next(list_ctx, &entry, &key)) != -ENOENT) {
- ASSERT_EQ(0, r);
- rados_ioctx_locator_set_key(ioctx, key);
- ASSERT_EQ(0, rados_remove(ioctx, entry));
- }
- rados_objects_list_close(list_ctx);
-}
-
std::string RadosTestECPP::pool_name;
Rados RadosTestECPP::s_cluster;
@@ -254,14 +237,3 @@ void RadosTestECPP::TearDown()
ioctx.close();
}
-void RadosTestECPP::cleanup_default_namespace(librados::IoCtx ioctx)
-{
- // remove all objects from the default namespace to avoid polluting
- // other tests
- ioctx.set_namespace("");
- for (ObjectIterator it = ioctx.objects_begin();
- it != ioctx.objects_end(); ++it) {
- ioctx.locator_set_key(it->second);
- ASSERT_EQ(0, ioctx.remove(it->first));
- }
-}
diff --git a/src/test/librados/TestCase.h b/src/test/librados/TestCase.h
index 5bd084f..4ede5e9 100644
--- a/src/test/librados/TestCase.h
+++ b/src/test/librados/TestCase.h
@@ -28,6 +28,7 @@ protected:
static void cleanup_default_namespace(rados_ioctx_t ioctx);
static rados_t s_cluster;
static std::string pool_name;
+ static std::string nspace;
virtual void SetUp();
virtual void TearDown();
@@ -72,14 +73,13 @@ protected:
std::string ns;
};
-class RadosTestEC : public ::testing::Test {
+class RadosTestEC : public RadosTest {
public:
RadosTestEC() {}
virtual ~RadosTestEC() {}
protected:
static void SetUpTestCase();
static void TearDownTestCase();
- static void cleanup_default_namespace(rados_ioctx_t ioctx);
static rados_t s_cluster;
static std::string pool_name;
@@ -90,14 +90,13 @@ protected:
uint64_t alignment;
};
-class RadosTestECPP : public ::testing::Test {
+class RadosTestECPP : public RadosTestPP {
public:
RadosTestECPP() : cluster(s_cluster) {};
virtual ~RadosTestECPP() {};
protected:
static void SetUpTestCase();
static void TearDownTestCase();
- static void cleanup_default_namespace(librados::IoCtx ioctx);
static librados::Rados s_cluster;
static std::string pool_name;
diff --git a/src/test/librados/io.cc b/src/test/librados/io.cc
index 5daca3c..0bb805f 100644
--- a/src/test/librados/io.cc
+++ b/src/test/librados/io.cc
@@ -25,6 +25,58 @@ TEST_F(LibRadosIo, SimpleWrite) {
ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
}
+TEST_F(LibRadosIo, ReadTimeout) {
+ char buf[128];
+ memset(buf, 'a', sizeof(buf));
+ ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+
+ {
+ // set up a second client
+ rados_t cluster;
+ rados_ioctx_t ioctx;
+ rados_create(&cluster, "admin");
+ rados_conf_read_file(cluster, NULL);
+ rados_conf_parse_env(cluster, NULL);
+ rados_conf_set(cluster, "rados_osd_op_timeout", "0.00001"); // use any small value that will result in a timeout
+ rados_connect(cluster);
+ rados_ioctx_create(cluster, pool_name.c_str(), &ioctx);
+ rados_ioctx_set_namespace(ioctx, nspace.c_str());
+
+ // then we show that the buffer is changed after rados_read returned
+ // with a timeout
+ for (int i=0; i<5; i++) {
+ char buf2[sizeof(buf)];
+ memset(buf2, 0, sizeof(buf2));
+ int err = rados_read(ioctx, "foo", buf2, sizeof(buf2), 0);
+ if (err == -110) {
+ int startIndex = 0;
+ // find the index until which librados already read the object before the timeout occurred
+ for (unsigned b=0; b<sizeof(buf); b++) {
+ if (buf2[b] != buf[b]) {
+ startIndex = b;
+ break;
+ }
+ }
+
+ // wait some time to give librados a change to do something
+ sleep(1);
+
+ // then check if the buffer was changed after the call
+ if (buf2[startIndex] == 'a') {
+ printf("byte at index %d was changed after the timeout to %d\n",
+ startIndex, (int)buf[startIndex]);
+ ASSERT_TRUE(0);
+ break;
+ }
+ } else {
+ printf("no timeout :/\n");
+ }
+ }
+ rados_ioctx_destroy(ioctx);
+ rados_shutdown(cluster);
+ }
+}
+
TEST_F(LibRadosIoPP, SimpleWritePP) {
char buf[128];
memset(buf, 0xcc, sizeof(buf));
diff --git a/src/test/librados/tier.cc b/src/test/librados/tier.cc
index 611e17e..4267389 100644
--- a/src/test/librados/tier.cc
+++ b/src/test/librados/tier.cc
@@ -34,6 +34,38 @@ using std::string;
typedef RadosTestPP LibRadosTierPP;
typedef RadosTestECPP LibRadosTierECPP;
+void flush_evict_all(librados::Rados& cluster, librados::IoCtx& cache_ioctx)
+{
+ bufferlist inbl;
+ cache_ioctx.set_namespace("");
+ for (ObjectIterator it = cache_ioctx.objects_begin();
+ it != cache_ioctx.objects_end(); ++it) {
+ cache_ioctx.locator_set_key(it->second);
+ {
+ ObjectReadOperation op;
+ op.cache_flush();
+ librados::AioCompletion *completion = cluster.aio_create_completion();
+ cache_ioctx.aio_operate(
+ it->first, completion, &op,
+ librados::OPERATION_IGNORE_OVERLAY, NULL);
+ completion->wait_for_safe();
+ completion->get_return_value();
+ completion->release();
+ }
+ {
+ ObjectReadOperation op;
+ op.cache_evict();
+ librados::AioCompletion *completion = cluster.aio_create_completion();
+ cache_ioctx.aio_operate(
+ it->first, completion, &op,
+ librados::OPERATION_IGNORE_OVERLAY, NULL);
+ completion->wait_for_safe();
+ completion->get_return_value();
+ completion->release();
+ }
+ }
+}
+
class LibRadosTwoPoolsPP : public RadosTestPP
{
public:
@@ -59,7 +91,26 @@ protected:
}
virtual void TearDown() {
RadosTestPP::TearDown();
+
+ // flush + evict cache
+ flush_evict_all(cluster, cache_ioctx);
+
+ bufferlist inbl;
+ // tear down tiers
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+ "\"}",
+ inbl, NULL, NULL));
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+ "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+ inbl, NULL, NULL));
+
+ // wait for maps to settle before next test
+ cluster.wait_for_latest_osdmap();
+
cleanup_default_namespace(cache_ioctx);
+
cache_ioctx.close();
}
librados::IoCtx cache_ioctx;
@@ -180,19 +231,6 @@ TEST_F(LibRadosTwoPoolsPP, Overlay) {
completion->release();
ASSERT_EQ('b', bl[0]);
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsPP, Promote) {
@@ -247,19 +285,6 @@ TEST_F(LibRadosTwoPoolsPP, Promote) {
++it;
ASSERT_TRUE(it == cache_ioctx.objects_end());
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsPP, PromoteSnap) {
@@ -400,19 +425,6 @@ TEST_F(LibRadosTwoPoolsPP, PromoteSnap) {
bufferlist bl;
ASSERT_EQ(-ENOENT, ioctx.read("baz", bl, 1, 0));
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsPP, PromoteSnapScrub) {
@@ -509,19 +521,6 @@ TEST_F(LibRadosTwoPoolsPP, PromoteSnapScrub) {
}
ioctx.snap_set_read(librados::SNAP_HEAD);
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
@@ -577,19 +576,6 @@ TEST_F(LibRadosTwoPoolsPP, PromoteSnapTrimRace) {
bufferlist bl;
ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsPP, Whiteout) {
@@ -653,19 +639,6 @@ TEST_F(LibRadosTwoPoolsPP, Whiteout) {
ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
ASSERT_EQ('h', bl[0]);
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsPP, Evict) {
@@ -756,19 +729,6 @@ TEST_F(LibRadosTwoPoolsPP, Evict) {
ASSERT_EQ(-EBUSY, completion->get_return_value());
completion->release();
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsPP, EvictSnap) {
@@ -1004,19 +964,6 @@ TEST_F(LibRadosTwoPoolsPP, EvictSnap) {
ASSERT_EQ(0, completion->get_return_value());
completion->release();
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsPP, TryFlush) {
@@ -1125,19 +1072,6 @@ TEST_F(LibRadosTwoPoolsPP, TryFlush) {
ObjectIterator it = cache_ioctx.objects_begin();
ASSERT_TRUE(it == cache_ioctx.objects_end());
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsPP, Flush) {
@@ -1298,19 +1232,6 @@ TEST_F(LibRadosTwoPoolsPP, Flush) {
ObjectIterator it = ioctx.objects_begin();
ASSERT_TRUE(it == ioctx.objects_end());
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsPP, FlushSnap) {
@@ -1470,18 +1391,11 @@ TEST_F(LibRadosTwoPoolsPP, FlushSnap) {
ASSERT_EQ('a', bl[0]);
}
- // tear down tiers
+ // remove overlay
ASSERT_EQ(0, cluster.mon_command(
"{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
"\"}",
inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle
- cluster.wait_for_latest_osdmap();
// verify i can read the snaps from the base pool
ioctx.snap_set_read(librados::SNAP_HEAD);
@@ -1502,6 +1416,11 @@ TEST_F(LibRadosTwoPoolsPP, FlushSnap) {
ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
ASSERT_EQ('a', bl[0]);
}
+
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
+ "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
+ inbl, NULL, NULL));
}
TEST_F(LibRadosTierPP, FlushWriteRaces) {
@@ -1786,19 +1705,6 @@ TEST_F(LibRadosTwoPoolsPP, FlushTryFlushRaces) {
completion->release();
completion2->release();
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
@@ -1895,19 +1801,6 @@ TEST_F(LibRadosTwoPoolsPP, TryFlushReadRace) {
while (num_reads > 0)
cond.Wait(test_lock);
test_lock.Unlock();
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTierPP, HitSetNone) {
@@ -1944,21 +1837,28 @@ string set_pool_str(string pool, string var, int val)
+ stringify(val) + string("\"}");
}
-TEST_F(LibRadosTierPP, HitSetRead) {
- // enable hitset tracking for this pool
+TEST_F(LibRadosTwoPoolsPP, HitSetRead) {
+ // make it a tier
bufferlist inbl;
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 2),
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+ "\", \"tierpool\": \"" + cache_pool_name +
+ "\", \"force_nonempty\": \"--force-nonempty\" }",
+ inbl, NULL, NULL));
+
+ // enable hitset tracking for this pool
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 2),
inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
"explicit_object"),
inbl, NULL, NULL));
// wait for maps to settle
cluster.wait_for_latest_osdmap();
- ioctx.set_namespace("");
+ cache_ioctx.set_namespace("");
// keep reading until we see our object appear in the HitSet
utime_t start = ceph_clock_now(NULL);
@@ -1969,16 +1869,16 @@ TEST_F(LibRadosTierPP, HitSetRead) {
ASSERT_TRUE(now < hard_stop);
string name = "foo";
- uint32_t hash = ioctx.get_object_hash_position(name);
+ uint32_t hash = cache_ioctx.get_object_hash_position(name);
hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash,
- cluster.pool_lookup(pool_name.c_str()), "");
+ cluster.pool_lookup(cache_pool_name.c_str()), "");
bufferlist bl;
- ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
+ ASSERT_EQ(-ENOENT, cache_ioctx.read("foo", bl, 1, 0));
bufferlist hbl;
AioCompletion *c = librados::Rados::aio_create_completion();
- ASSERT_EQ(0, ioctx.hit_set_get(hash, c, now.sec(), &hbl));
+ ASSERT_EQ(0, cache_ioctx.hit_set_get(hash, c, now.sec(), &hbl));
c->wait_for_complete();
c->release();
@@ -2028,30 +1928,39 @@ static int _get_pg_num(Rados& cluster, string pool_name)
}
-TEST_F(LibRadosTierPP, HitSetWrite) {
+TEST_F(LibRadosTwoPoolsPP, HitSetWrite) {
int num_pg = _get_pg_num(cluster, pool_name);
assert(num_pg > 0);
- // enable hitset tracking for this pool
+ // make it a tier
bufferlist inbl;
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 8),
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+ "\", \"tierpool\": \"" + cache_pool_name +
+ "\", \"force_nonempty\": \"--force-nonempty\" }",
+ inbl, NULL, NULL));
+
+ // enable hitset tracking for this pool
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 8),
inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
"explicit_hash"),
inbl, NULL, NULL));
// wait for maps to settle
cluster.wait_for_latest_osdmap();
- ioctx.set_namespace("");
+ cache_ioctx.set_namespace("");
+
+ int num = 200;
// do a bunch of writes
- for (int i=0; i<1000; ++i) {
+ for (int i=0; i<num; ++i) {
bufferlist bl;
bl.append("a");
- ASSERT_EQ(0, ioctx.write(stringify(i), bl, 1, 0));
+ ASSERT_EQ(0, cache_ioctx.write(stringify(i), bl, 1, 0));
}
// get HitSets
@@ -2059,7 +1968,7 @@ TEST_F(LibRadosTierPP, HitSetWrite) {
for (int i=0; i<num_pg; ++i) {
list< pair<time_t,time_t> > ls;
AioCompletion *c = librados::Rados::aio_create_completion();
- ASSERT_EQ(0, ioctx.hit_set_list(i, c, &ls));
+ ASSERT_EQ(0, cache_ioctx.hit_set_list(i, c, &ls));
c->wait_for_complete();
c->release();
std::cout << "pg " << i << " ls " << ls << std::endl;
@@ -2068,7 +1977,7 @@ TEST_F(LibRadosTierPP, HitSetWrite) {
// get the latest
c = librados::Rados::aio_create_completion();
bufferlist bl;
- ASSERT_EQ(0, ioctx.hit_set_get(i, c, ls.back().first, &bl));
+ ASSERT_EQ(0, cache_ioctx.hit_set_get(i, c, ls.back().first, &bl));
c->wait_for_complete();
c->release();
@@ -2081,14 +1990,14 @@ TEST_F(LibRadosTierPP, HitSetWrite) {
// cope with racing splits by refreshing pg_num
if (i == num_pg - 1)
- num_pg = _get_pg_num(cluster, pool_name);
+ num_pg = _get_pg_num(cluster, cache_pool_name);
}
- for (int i=0; i<1000; ++i) {
+ for (int i=0; i<num; ++i) {
string n = stringify(i);
- uint32_t hash = ioctx.get_object_hash_position(n);
+ uint32_t hash = cache_ioctx.get_object_hash_position(n);
hobject_t oid(sobject_t(n, CEPH_NOSNAP), "", hash,
- cluster.pool_lookup(pool_name.c_str()), "");
+ cluster.pool_lookup(cache_pool_name.c_str()), "");
std::cout << "checking for " << oid << std::endl;
bool found = false;
for (int p=0; p<num_pg; ++p) {
@@ -2101,25 +2010,32 @@ TEST_F(LibRadosTierPP, HitSetWrite) {
}
}
-TEST_F(LibRadosTierPP, HitSetTrim) {
+TEST_F(LibRadosTwoPoolsPP, HitSetTrim) {
unsigned count = 3;
unsigned period = 3;
- // enable hitset tracking for this pool
+ // make it a tier
bufferlist inbl;
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", count),
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+ "\", \"tierpool\": \"" + cache_pool_name +
+ "\", \"force_nonempty\": \"--force-nonempty\" }",
+ inbl, NULL, NULL));
+
+ // enable hitset tracking for this pool
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", count),
inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", period),
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", period),
inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type", "bloom"),
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_fpp", ".01"),
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_fpp", ".01"),
inbl, NULL, NULL));
// wait for maps to settle
cluster.wait_for_latest_osdmap();
- ioctx.set_namespace("");
+ cache_ioctx.set_namespace("");
// do a bunch of writes and make sure the hitsets rotate
utime_t start = ceph_clock_now(NULL);
@@ -2128,16 +2044,16 @@ TEST_F(LibRadosTierPP, HitSetTrim) {
time_t first = 0;
while (true) {
string name = "foo";
- uint32_t hash = ioctx.get_object_hash_position(name);
+ uint32_t hash = cache_ioctx.get_object_hash_position(name);
hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash, -1, "");
bufferlist bl;
bl.append("f");
- ASSERT_EQ(0, ioctx.write("foo", bl, 1, 0));
+ ASSERT_EQ(0, cache_ioctx.write("foo", bl, 1, 0));
list<pair<time_t, time_t> > ls;
AioCompletion *c = librados::Rados::aio_create_completion();
- ASSERT_EQ(0, ioctx.hit_set_list(hash, c, &ls));
+ ASSERT_EQ(0, cache_ioctx.hit_set_list(hash, c, &ls));
c->wait_for_complete();
c->release();
@@ -2187,9 +2103,29 @@ protected:
}
virtual void TearDown() {
RadosTestECPP::TearDown();
+
+ // flush + evict cache
+ flush_evict_all(cluster, cache_ioctx);
+
+ bufferlist inbl;
+ // tear down tiers
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+ "\"}",
+ inbl, NULL, NULL));
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+ "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+ inbl, NULL, NULL));
+
+ // wait for maps to settle before next test
+ cluster.wait_for_latest_osdmap();
+
cleanup_default_namespace(cache_ioctx);
+
cache_ioctx.close();
}
+
librados::IoCtx cache_ioctx;
};
@@ -2308,19 +2244,6 @@ TEST_F(LibRadosTwoPoolsECPP, Overlay) {
completion->release();
ASSERT_EQ('b', bl[0]);
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsECPP, Promote) {
@@ -2375,19 +2298,6 @@ TEST_F(LibRadosTwoPoolsECPP, Promote) {
++it;
ASSERT_TRUE(it == cache_ioctx.objects_end());
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsECPP, PromoteSnap) {
@@ -2552,19 +2462,6 @@ TEST_F(LibRadosTwoPoolsECPP, PromoteSnap) {
bufferlist bl;
ASSERT_EQ(-ENOENT, ioctx.read("baz", bl, 1, 0));
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsECPP, PromoteSnapTrimRace) {
@@ -2619,19 +2516,6 @@ TEST_F(LibRadosTwoPoolsECPP, PromoteSnapTrimRace) {
bufferlist bl;
ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsECPP, Whiteout) {
@@ -2695,19 +2579,6 @@ TEST_F(LibRadosTwoPoolsECPP, Whiteout) {
ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
ASSERT_EQ('h', bl[0]);
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsECPP, Evict) {
@@ -2798,19 +2669,6 @@ TEST_F(LibRadosTwoPoolsECPP, Evict) {
ASSERT_EQ(-EBUSY, completion->get_return_value());
completion->release();
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsECPP, EvictSnap) {
@@ -3046,19 +2904,6 @@ TEST_F(LibRadosTwoPoolsECPP, EvictSnap) {
ASSERT_EQ(0, completion->get_return_value());
completion->release();
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsECPP, TryFlush) {
@@ -3167,19 +3012,6 @@ TEST_F(LibRadosTwoPoolsECPP, TryFlush) {
ObjectIterator it = cache_ioctx.objects_begin();
ASSERT_TRUE(it == cache_ioctx.objects_end());
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsECPP, Flush) {
@@ -3340,19 +3172,6 @@ TEST_F(LibRadosTwoPoolsECPP, Flush) {
ObjectIterator it = ioctx.objects_begin();
ASSERT_TRUE(it == ioctx.objects_end());
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsECPP, FlushSnap) {
@@ -3517,10 +3336,6 @@ TEST_F(LibRadosTwoPoolsECPP, FlushSnap) {
"{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
"\"}",
inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
// wait for maps to settle
cluster.wait_for_latest_osdmap();
@@ -3544,6 +3359,11 @@ TEST_F(LibRadosTwoPoolsECPP, FlushSnap) {
ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
ASSERT_EQ('a', bl[0]);
}
+
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
+ "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
+ inbl, NULL, NULL));
}
TEST_F(LibRadosTierECPP, FlushWriteRaces) {
@@ -3828,19 +3648,6 @@ TEST_F(LibRadosTwoPoolsECPP, FlushTryFlushRaces) {
completion->release();
completion2->release();
}
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTwoPoolsECPP, TryFlushReadRace) {
@@ -3903,19 +3710,6 @@ TEST_F(LibRadosTwoPoolsECPP, TryFlushReadRace) {
while (num_reads > 0)
cond.Wait(test_lock);
test_lock.Unlock();
-
- // tear down tiers
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
- "\"}",
- inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(
- "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
- "\", \"tierpool\": \"" + cache_pool_name + "\"}",
- inbl, NULL, NULL));
-
- // wait for maps to settle before next test
- cluster.wait_for_latest_osdmap();
}
TEST_F(LibRadosTierECPP, HitSetNone) {
@@ -3938,21 +3732,28 @@ TEST_F(LibRadosTierECPP, HitSetNone) {
}
}
-TEST_F(LibRadosTierECPP, HitSetRead) {
- // enable hitset tracking for this pool
+TEST_F(LibRadosTwoPoolsECPP, HitSetRead) {
+ // make it a tier
bufferlist inbl;
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 2),
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+ "\", \"tierpool\": \"" + cache_pool_name +
+ "\", \"force_nonempty\": \"--force-nonempty\" }",
+ inbl, NULL, NULL));
+
+ // enable hitset tracking for this pool
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 2),
inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
"explicit_object"),
inbl, NULL, NULL));
// wait for maps to settle
cluster.wait_for_latest_osdmap();
- ioctx.set_namespace("");
+ cache_ioctx.set_namespace("");
// keep reading until we see our object appear in the HitSet
utime_t start = ceph_clock_now(NULL);
@@ -3963,16 +3764,16 @@ TEST_F(LibRadosTierECPP, HitSetRead) {
ASSERT_TRUE(now < hard_stop);
string name = "foo";
- uint32_t hash = ioctx.get_object_hash_position(name);
+ uint32_t hash = cache_ioctx.get_object_hash_position(name);
hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash,
- cluster.pool_lookup(pool_name.c_str()), "");
+ cluster.pool_lookup(cache_pool_name.c_str()), "");
bufferlist bl;
- ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
+ ASSERT_EQ(-ENOENT, cache_ioctx.read("foo", bl, 1, 0));
bufferlist hbl;
AioCompletion *c = librados::Rados::aio_create_completion();
- ASSERT_EQ(0, ioctx.hit_set_get(hash, c, now.sec(), &hbl));
+ ASSERT_EQ(0, cache_ioctx.hit_set_get(hash, c, now.sec(), &hbl));
c->wait_for_complete();
c->release();
@@ -4069,25 +3870,32 @@ TEST_F(LibRadosTierECPP, HitSetWrite) {
}
#endif
-TEST_F(LibRadosTierECPP, HitSetTrim) {
+TEST_F(LibRadosTwoPoolsECPP, HitSetTrim) {
unsigned count = 3;
unsigned period = 3;
- // enable hitset tracking for this pool
+ // make it a tier
bufferlist inbl;
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", count),
+ ASSERT_EQ(0, cluster.mon_command(
+ "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+ "\", \"tierpool\": \"" + cache_pool_name +
+ "\", \"force_nonempty\": \"--force-nonempty\" }",
+ inbl, NULL, NULL));
+
+ // enable hitset tracking for this pool
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", count),
inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", period),
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", period),
inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type", "bloom"),
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
inbl, NULL, NULL));
- ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_fpp", ".01"),
+ ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_fpp", ".01"),
inbl, NULL, NULL));
// wait for maps to settle
cluster.wait_for_latest_osdmap();
- ioctx.set_namespace("");
+ cache_ioctx.set_namespace("");
// do a bunch of writes and make sure the hitsets rotate
utime_t start = ceph_clock_now(NULL);
@@ -4100,16 +3908,16 @@ TEST_F(LibRadosTierECPP, HitSetTrim) {
while (true) {
string name = "foo";
- uint32_t hash = ioctx.get_object_hash_position(name);
+ uint32_t hash = cache_ioctx.get_object_hash_position(name);
hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash, -1, "");
bufferlist bl;
bl.append(buf, bsize);
- ASSERT_EQ(0, ioctx.append("foo", bl, bsize));
+ ASSERT_EQ(0, cache_ioctx.append("foo", bl, bsize));
list<pair<time_t, time_t> > ls;
AioCompletion *c = librados::Rados::aio_create_completion();
- ASSERT_EQ(0, ioctx.hit_set_list(hash, c, &ls));
+ ASSERT_EQ(0, cache_ioctx.hit_set_list(hash, c, &ls));
c->wait_for_complete();
c->release();
diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc
index 8133d4d..7c5dc58 100644
--- a/src/test/objectstore/store_test.cc
+++ b/src/test/objectstore/store_test.cc
@@ -1115,6 +1115,111 @@ TEST_P(StoreTest, MoveRename) {
ASSERT_TRUE(newomap.count("omap_key"));
ASSERT_TRUE(newomap["omap_key"].contents_equal(omap["omap_key"]));
}
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, oid);
+ t.remove_collection(cid);
+ t.remove_collection(temp_cid);
+ r = store->apply_transaction(t);
+ ASSERT_EQ(r, 0);
+ }
+}
+
+TEST_P(StoreTest, BigRGWObjectName) {
+ store->set_allow_sharded_objects();
+ store->sync_and_flush();
+ coll_t temp_cid("mytemp");
+ hobject_t temp_oid("tmp_oid", "", CEPH_NOSNAP, 0, 0, "");
+ coll_t cid("dest");
+ ghobject_t oid(
+ hobject_t(
+ "default.4106.50_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa [...]
+ "",
+ CEPH_NOSNAP,
+ 0x81920472,
+ 3,
+ ""),
+ 15,
+ shard_id_t(1));
+ ghobject_t oid2(oid);
+ oid2.generation = 17;
+ ghobject_t oidhead(oid);
+ oidhead.generation = ghobject_t::NO_GEN;
+
+ int r;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid);
+ t.touch(cid, oidhead);
+ t.collection_move_rename(cid, oidhead, cid, oid);
+ t.touch(cid, oidhead);
+ t.collection_move_rename(cid, oidhead, cid, oid2);
+ r = store->apply_transaction(t);
+ ASSERT_EQ(r, 0);
+ }
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, oid);
+ r = store->apply_transaction(t);
+ ASSERT_EQ(r, 0);
+ }
+
+ {
+ vector<ghobject_t> objects;
+ r = store->collection_list(cid, objects);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(objects.size(), 1u);
+ ASSERT_EQ(objects[0], oid2);
+ }
+
+ ASSERT_FALSE(store->exists(cid, oid));
+
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, oid2);
+ t.remove_collection(cid);
+ r = store->apply_transaction(t);
+ ASSERT_EQ(r, 0);
+
+ }
+}
+
+TEST_P(StoreTest, SetAllocHint) {
+ coll_t cid("alloc_hint");
+ ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, 0, ""));
+ int r;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid);
+ t.touch(cid, hoid);
+ r = store->apply_transaction(t);
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
+ r = store->apply_transaction(t);
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove(cid, hoid);
+ r = store->apply_transaction(t);
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
+ r = store->apply_transaction(t);
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.remove_collection(cid);
+ r = store->apply_transaction(t);
+ ASSERT_EQ(r, 0);
+ }
}
INSTANTIATE_TEST_CASE_P(
diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc
index 0ff12c8..451b6b2 100644
--- a/src/test/osd/TestOSDMap.cc
+++ b/src/test/osd/TestOSDMap.cc
@@ -50,13 +50,24 @@ public:
}
osdmap.apply_incremental(pending_inc);
- // kludge to get an erasure coding rule and pool
+ // Create an EC ruleset and a pool using it
int r = osdmap.crush->add_simple_ruleset("erasure", "default", "osd",
"indep", pg_pool_t::TYPE_ERASURE,
&cerr);
- pg_pool_t *p = (pg_pool_t *)osdmap.get_pg_pool(2);
+
+ OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
+ new_pool_inc.new_pool_max = osdmap.get_pool_max();
+ new_pool_inc.fsid = osdmap.get_fsid();
+ pg_pool_t empty;
+ uint64_t pool_id = ++new_pool_inc.new_pool_max;
+ pg_pool_t *p = new_pool_inc.get_new_pool(pool_id, &empty);
+ p->size = 3;
+ p->set_pg_num(64);
+ p->set_pgp_num(64);
p->type = pg_pool_t::TYPE_ERASURE;
p->crush_ruleset = r;
+ new_pool_inc.new_pool_names[pool_id] = "ec";
+ osdmap.apply_incremental(new_pool_inc);
}
unsigned int get_num_osds() { return num_osds; }
@@ -86,6 +97,48 @@ TEST_F(OSDMapTest, Create) {
ASSERT_EQ(get_num_osds(), osdmap.get_num_in_osds());
}
+TEST_F(OSDMapTest, Features) {
+ // with EC pool
+ set_up_map();
+ uint64_t features = osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
+ ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
+ ASSERT_TRUE(features & CEPH_FEATURE_OSD_ERASURE_CODES);
+ ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
+ ASSERT_FALSE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
+
+ // clients have a slightly different view
+ features = osdmap.get_features(CEPH_ENTITY_TYPE_CLIENT, NULL);
+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
+ ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
+ ASSERT_FALSE(features & CEPH_FEATURE_OSD_ERASURE_CODES); // don't need this
+ ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
+ ASSERT_FALSE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
+
+ // remove the EC pool, but leave the rule. Add primary affinity.
+ {
+ OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
+ new_pool_inc.old_pools.insert(osdmap.lookup_pg_pool_name("ec"));
+ new_pool_inc.new_primary_affinity[0] = 0x8000;
+ osdmap.apply_incremental(new_pool_inc);
+ }
+
+ features = osdmap.get_features(CEPH_ENTITY_TYPE_MON, NULL);
+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
+ ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3); // shared bit with primary affinity
+ ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_V2);
+ ASSERT_FALSE(features & CEPH_FEATURE_OSD_ERASURE_CODES);
+ ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
+ ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
+
+ // FIXME: test tiering feature bits
+}
+
TEST_F(OSDMapTest, MapPG) {
set_up_map();
diff --git a/src/test/osd/osd-test-helpers.sh b/src/test/osd/osd-test-helpers.sh
index 5117ae3..1ea17dd 100644
--- a/src/test/osd/osd-test-helpers.sh
+++ b/src/test/osd/osd-test-helpers.sh
@@ -37,6 +37,7 @@ function run_osd() {
ceph_args+=" --osd-journal-size=100"
ceph_args+=" --osd-data=$osd_data"
ceph_args+=" --chdir="
+ ceph_args+=" --osd-pool-default-erasure-code-directory=.libs"
ceph_args+=" --run-dir=$dir"
ceph_args+=" --debug-osd=20"
ceph_args+=" --log-file=$dir/osd-\$id.log"
diff --git a/src/test/strtol.cc b/src/test/strtol.cc
index d3f0ae0..08ba081 100644
--- a/src/test/strtol.cc
+++ b/src/test/strtol.cc
@@ -14,6 +14,7 @@
#include "common/strtol.h"
#include <string>
+#include <map>
#include "gtest/gtest.h"
@@ -134,3 +135,77 @@ TEST(StrToL, Error1) {
test_strict_strtof_err("0.05.0");
}
+
+
+static void test_strict_sistrtoll(const char *str)
+{
+ std::string err;
+ strict_sistrtoll(str, &err);
+ ASSERT_EQ(err, "");
+}
+
+static void test_strict_sistrtoll_units(const std::string& foo,
+ char u, const int m)
+{
+ std::string s(foo);
+ s.push_back(u);
+ const char *str = s.c_str();
+ std::string err;
+ uint64_t r = strict_sistrtoll(str, &err);
+ ASSERT_EQ(err, "");
+
+ str = foo.c_str();
+ std::string err2;
+ long long tmp = strict_strtoll(str, 10, &err2);
+ ASSERT_EQ(err2, "");
+ tmp = (tmp << m);
+ ASSERT_EQ(tmp, (long long)r);
+}
+
+TEST(SIStrToLL, WithUnits) {
+ std::map<char,int> units;
+ units['B'] = 0;
+ units['K'] = 10;
+ units['M'] = 20;
+ units['G'] = 30;
+ units['T'] = 40;
+ units['P'] = 50;
+ units['E'] = 60;
+
+ for (std::map<char,int>::iterator p = units.begin();
+ p != units.end(); ++p) {
+ test_strict_sistrtoll_units("1024", p->first, p->second);
+ test_strict_sistrtoll_units("1", p->first, p->second);
+ test_strict_sistrtoll_units("0", p->first, p->second);
+ }
+}
+
+TEST(SIStrToLL, WithoutUnits) {
+ test_strict_sistrtoll("1024");
+ test_strict_sistrtoll("1152921504606846976");
+ test_strict_sistrtoll("0");
+}
+
+static void test_strict_sistrtoll_err(const char *str)
+{
+ std::string err;
+ strict_sistrtoll(str, &err);
+ ASSERT_NE(err, "");
+}
+
+TEST(SIStrToLL, Error) {
+ test_strict_sistrtoll_err("1024F");
+ test_strict_sistrtoll_err("QDDSA");
+ test_strict_sistrtoll_err("1b");
+ test_strict_sistrtoll_err("100k");
+ test_strict_sistrtoll_err("1000m");
+ test_strict_sistrtoll_err("1g");
+ test_strict_sistrtoll_err("20t");
+ test_strict_sistrtoll_err("100p");
+ test_strict_sistrtoll_err("1000e");
+ test_strict_sistrtoll_err("B");
+ test_strict_sistrtoll_err("M");
+ test_strict_sistrtoll_err("BM");
+ test_strict_sistrtoll_err("B0wef");
+ test_strict_sistrtoll_err("0m");
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git
More information about the Pkg-ceph-commits
mailing list