[Pkg-ceph-commits] [ceph] 01/01: Imported Upstream version 0.80.1

Dmitry Smirnov onlyjob at moszumanska.debian.org
Tue May 13 03:34:40 UTC 2014


This is an automated email from the git hooks/post-receive script.

onlyjob pushed a commit to branch upstream
in repository ceph.

commit 42624ce (upstream)
Author: Dmitry Smirnov <onlyjob at member.fsf.org>
Date:   Tue May 13 03:08:55 2014

    Imported Upstream version 0.80.1
---
 ceph.spec                 |   2 +-
 configure                 |  22 +++----
 configure.ac              |   2 +-
 src/.git_version          |   4 +-
 src/client/Client.cc      |  31 +++++++++-
 src/client/Client.h       |   1 +
 src/client/MetaSession.cc |   1 +
 src/client/MetaSession.h  |   1 +
 src/common/config_opts.h  |   2 +
 src/mon/MonClient.cc      |   4 --
 src/os/KeyValueStore.cc   |   5 +-
 src/osd/OSD.cc            |  30 +++++++++-
 src/osd/OSD.h             |   3 +
 src/osd/PG.cc             |  17 ++----
 src/osd/PG.h              |  11 +---
 src/osd/ReplicatedPG.cc   | 144 +++++++++++++++++++++++++++++++---------------
 src/osd/ReplicatedPG.h    |   8 +--
 src/osd/TierAgentState.h  |  13 ++++-
 src/osd/osd_types.h       |   3 +-
 src/rgw/rgw_rados.cc      |  14 ++++-
 src/rgw/rgw_rest.cc       |   1 +
 21 files changed, 219 insertions(+), 100 deletions(-)

diff --git a/ceph.spec b/ceph.spec
index 7105d13..fca0c34 100644
--- a/ceph.spec
+++ b/ceph.spec
@@ -9,7 +9,7 @@
 # common
 #################################################################################
 Name:		ceph
-Version:	0.80
+Version:	0.80.1
 Release:	0%{?dist}
 Summary:	User space components of the Ceph file system
 License:	GPL-2.0
diff --git a/configure b/configure
index b882d94..961026d 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.68 for ceph 0.80.
+# Generated by GNU Autoconf 2.68 for ceph 0.80.1.
 #
 # Report bugs to <ceph-devel at vger.kernel.org>.
 #
@@ -570,8 +570,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ceph'
 PACKAGE_TARNAME='ceph'
-PACKAGE_VERSION='0.80'
-PACKAGE_STRING='ceph 0.80'
+PACKAGE_VERSION='0.80.1'
+PACKAGE_STRING='ceph 0.80.1'
 PACKAGE_BUGREPORT='ceph-devel at vger.kernel.org'
 PACKAGE_URL=''
 
@@ -1441,7 +1441,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ceph 0.80 to adapt to many kinds of systems.
+\`configure' configures ceph 0.80.1 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1512,7 +1512,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ceph 0.80:";;
+     short | recursive ) echo "Configuration of ceph 0.80.1:";;
    esac
   cat <<\_ACEOF
 
@@ -1657,7 +1657,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ceph configure 0.80
+ceph configure 0.80.1
 generated by GNU Autoconf 2.68
 
 Copyright (C) 2010 Free Software Foundation, Inc.
@@ -2504,7 +2504,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ceph $as_me 0.80, which was
+It was created by ceph $as_me 0.80.1, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
 
   $ $0 $@
@@ -4504,7 +4504,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='0.80'
+ VERSION='0.80.1'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -12482,7 +12482,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='0.80'
+ VERSION='0.80.1'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -22258,7 +22258,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ceph $as_me 0.80, which was
+This file was extended by ceph $as_me 0.80.1, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -22324,7 +22324,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ceph config.status 0.80
+ceph config.status 0.80.1
 configured by $0, generated by GNU Autoconf 2.68,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index f9482d6..36312cf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
 # VERSION define is not used by the code.  It gets a version string
 # from 'git describe'; see src/ceph_ver.[ch]
 
-AC_INIT([ceph], [0.80], [ceph-devel at vger.kernel.org])
+AC_INIT([ceph], [0.80.1], [ceph-devel at vger.kernel.org])
 
 # Create release string.  Used with VERSION for RPMs.
 RPM_RELEASE=0
diff --git a/src/.git_version b/src/.git_version
index 8e07215..2ca1a25 100644
--- a/src/.git_version
+++ b/src/.git_version
@@ -1,2 +1,2 @@
-b78644e7dee100e48dfeca32c9270a6b210d3003
-v0.80
+a38fe1169b6d2ac98b427334c12d7cf81f809b74
+v0.80.1
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 03b6438..47d1c1d 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -120,6 +120,8 @@ bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
     m_client->dump_mds_sessions(f);
   else if (command == "dump_cache")
     m_client->dump_cache(f);
+  else if (command == "kick_stale_sessions")
+    m_client->_kick_stale_sessions();
   else
     assert(0 == "bad command registered");
   m_client->client_lock.Unlock();
@@ -404,6 +406,14 @@ int Client::init()
     lderr(cct) << "error registering admin socket command: "
 	       << cpp_strerror(-ret) << dendl;
   }
+  ret = admin_socket->register_command("kick_stale_sessions",
+				       "kick_stale_sessions",
+				       &m_command_hook,
+				       "kick sessions that were remote reset");
+  if (ret < 0) {
+    lderr(cct) << "error registering admin socket command: "
+	       << cpp_strerror(-ret) << dendl;
+  }
 
   client_lock.Lock();
   initialized = true;
@@ -419,6 +429,7 @@ void Client::shutdown()
   admin_socket->unregister_command("mds_requests");
   admin_socket->unregister_command("mds_sessions");
   admin_socket->unregister_command("dump_cache");
+  admin_socket->unregister_command("kick_stale_sessions");
 
   if (ino_invalidate_cb) {
     ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
@@ -1538,7 +1549,8 @@ bool Client::have_open_session(int mds)
 {
   return
     mds_sessions.count(mds) &&
-    mds_sessions[mds]->state == MetaSession::STATE_OPEN;
+    (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
+     mds_sessions[mds]->state == MetaSession::STATE_STALE);
 }
 
 MetaSession *Client::_get_mds_session(int mds, Connection *con)
@@ -1649,6 +1661,19 @@ void Client::handle_client_session(MClientSession *m)
   m->put();
 }
 
+void Client::_kick_stale_sessions()
+{
+  ldout(cct, 1) << "kick_stale_sessions" << dendl;
+
+  for (map<int,MetaSession*>::iterator p = mds_sessions.begin();
+       p != mds_sessions.end(); ) {
+    MetaSession *s = p->second;
+    ++p;
+    if (s->state == MetaSession::STATE_STALE)
+      _closed_mds_session(s);
+  }
+}
+
 void Client::send_request(MetaRequest *request, MetaSession *session)
 {
   // make the request
@@ -8960,6 +8985,10 @@ void Client::ms_handle_remote_reset(Connection *con)
 	  break;
 
 	case MetaSession::STATE_OPEN:
+	  ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
+	  s->state = MetaSession::STATE_STALE;
+	  break;
+
 	case MetaSession::STATE_NEW:
 	case MetaSession::STATE_CLOSED:
 	default:
diff --git a/src/client/Client.h b/src/client/Client.h
index 4a3d753..e31e90a 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -250,6 +250,7 @@ public:
   MetaSession *_open_mds_session(int mds);
   void _close_mds_session(MetaSession *s);
   void _closed_mds_session(MetaSession *s);
+  void _kick_stale_sessions();
   void handle_client_session(MClientSession *m);
   void send_reconnect(MetaSession *s);
   void resend_unsafe_requests(MetaSession *s);
diff --git a/src/client/MetaSession.cc b/src/client/MetaSession.cc
index ee38db3..9f2a136 100644
--- a/src/client/MetaSession.cc
+++ b/src/client/MetaSession.cc
@@ -15,6 +15,7 @@ const char *MetaSession::get_state_name() const
   case STATE_OPEN: return "open";
   case STATE_CLOSING: return "closing";
   case STATE_CLOSED: return "closed";
+  case STATE_STALE: return "stale";
   default: return "unknown";
   }
 }
diff --git a/src/client/MetaSession.h b/src/client/MetaSession.h
index d9fcbb8..5525f07 100644
--- a/src/client/MetaSession.h
+++ b/src/client/MetaSession.h
@@ -33,6 +33,7 @@ struct MetaSession {
     STATE_OPEN,
     STATE_CLOSING,
     STATE_CLOSED,
+    STATE_STALE,
   } state;
 
   list<Context*> waiting_for_open;
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index a065a77..2c65e6c 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -399,6 +399,7 @@ OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 10.0)
 OPTION(osd_agent_max_ops, OPT_INT, 4)
 OPTION(osd_agent_min_evict_effort, OPT_FLOAT, .1)
 OPTION(osd_agent_quantize_effort, OPT_FLOAT, .1)
+OPTION(osd_agent_delay_time, OPT_FLOAT, 5.0)
 
 // decay atime and hist histograms after how many objects go by
 OPTION(osd_agent_hist_halflife, OPT_INT, 1000)
@@ -469,6 +470,7 @@ OPTION(osd_backfill_scan_max, OPT_INT, 512)
 OPTION(osd_op_thread_timeout, OPT_INT, 15)
 OPTION(osd_recovery_thread_timeout, OPT_INT, 30)
 OPTION(osd_snap_trim_thread_timeout, OPT_INT, 60*60*1)
+OPTION(osd_snap_trim_sleep, OPT_FLOAT, 0)
 OPTION(osd_scrub_thread_timeout, OPT_INT, 60)
 OPTION(osd_scrub_finalize_thread_timeout, OPT_INT, 60*10)
 OPTION(osd_remove_thread_timeout, OPT_INT, 60*60)
diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc
index f30be1b..3a6dda4 100644
--- a/src/mon/MonClient.cc
+++ b/src/mon/MonClient.cc
@@ -327,8 +327,6 @@ void MonClient::handle_monmap(MMonMap *m)
   if (!monmap.get_addr_name(cur_con->get_peer_addr(), cur_mon)) {
     ldout(cct, 10) << "mon." << cur_mon << " went away" << dendl;
     _reopen_session();  // can't find the mon we were talking to (above)
-  } else {
-    _finish_hunting();
   }
 
   map_cond.Signal();
@@ -756,8 +754,6 @@ void MonClient::_renew_subs()
 
 void MonClient::handle_subscribe_ack(MMonSubscribeAck *m)
 {
-  _finish_hunting();
-
   if (sub_renew_sent != utime_t()) {
     sub_renew_after = sub_renew_sent;
     sub_renew_after += m->interval / 2.0;
diff --git a/src/os/KeyValueStore.cc b/src/os/KeyValueStore.cc
index cc117fa..fb459b2 100644
--- a/src/os/KeyValueStore.cc
+++ b/src/os/KeyValueStore.cc
@@ -204,11 +204,14 @@ void StripObjectMap::clone_wrap(StripObjectHeader &old_header,
 
   if (target_header)
     *target_header = old_header;
+  if (origin_header)
+    *origin_header = old_header;
 
   clone(old_header.header, cid, oid, t, &new_origin_header,
         &target_header->header);
 
-  old_header.header = new_origin_header;
+  if(origin_header)
+    origin_header->header = new_origin_header;
 
   if (target_header) {
     target_header->oid = oid;
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 8e71f47..4240ba8 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -201,6 +201,8 @@ OSDService::OSDService(OSD *osd) :
   agent_active(true),
   agent_thread(this),
   agent_stop_flag(false),
+  agent_timer_lock("OSD::agent_timer_lock"),
+  agent_timer(osd->client_messenger->cct, agent_timer_lock),
   objecter_lock("OSD::objecter_lock"),
   objecter_timer(osd->client_messenger->cct, objecter_lock),
   objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, &objecter_osdmap,
@@ -435,6 +437,10 @@ void OSDService::shutdown()
     Mutex::Locker l(backfill_request_lock);
     backfill_request_timer.shutdown();
   }
+  {
+    Mutex::Locker l(agent_timer_lock);
+    agent_timer.shutdown();
+  }
   osdmap = OSDMapRef();
   next_osdmap = OSDMapRef();
 }
@@ -451,6 +457,7 @@ void OSDService::init()
     objecter->init_locked();
   }
   watch_timer.init();
+  agent_timer.init();
 
   agent_thread.create();
 }
@@ -466,6 +473,15 @@ void OSDService::activate_map()
   agent_lock.Unlock();
 }
 
+class AgentTimeoutCB : public Context {
+  PGRef pg;
+public:
+  AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
+  void finish(int) {
+    pg->agent_choose_mode_restart();
+  }
+};
+
 void OSDService::agent_entry()
 {
   dout(10) << __func__ << " start" << dendl;
@@ -501,7 +517,18 @@ void OSDService::agent_entry()
     PGRef pg = *agent_queue_pos;
     int max = g_conf->osd_agent_max_ops - agent_ops;
     agent_lock.Unlock();
-    pg->agent_work(max);
+    if (!pg->agent_work(max)) {
+      dout(10) << __func__ << " " << *pg
+	<< " no agent_work, delay for " << g_conf->osd_agent_delay_time
+	<< " seconds" << dendl;
+
+      osd->logger->inc(l_osd_tier_delay);
+      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
+      agent_timer_lock.Lock();
+      Context *cb = new AgentTimeoutCB(pg);
+      agent_timer.add_event_after(g_conf->osd_agent_delay_time, cb);
+      agent_timer_lock.Unlock();
+    }
     agent_lock.Lock();
   }
   agent_lock.Unlock();
@@ -1478,6 +1505,7 @@ void OSD::create_logger()
   osd_plb.add_u64_counter(l_osd_tier_whiteout, "tier_whiteout");
   osd_plb.add_u64_counter(l_osd_tier_dirty, "tier_dirty");
   osd_plb.add_u64_counter(l_osd_tier_clean, "tier_clean");
+  osd_plb.add_u64_counter(l_osd_tier_delay, "tier_delay");
 
   osd_plb.add_u64_counter(l_osd_agent_wake, "agent_wake");
   osd_plb.add_u64_counter(l_osd_agent_skip, "agent_skip");
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 6b3c89d..ce8b74c 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -133,6 +133,7 @@ enum {
   l_osd_tier_whiteout,
   l_osd_tier_dirty,
   l_osd_tier_clean,
+  l_osd_tier_delay,
 
   l_osd_agent_wake,
   l_osd_agent_skip,
@@ -466,6 +467,8 @@ public:
     }
   } agent_thread;
   bool agent_stop_flag;
+  Mutex agent_timer_lock;
+  SafeTimer agent_timer;
 
   void agent_entry();
   void agent_stop();
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 2c86f3b..6deb099 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -3880,7 +3880,6 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
         scrubber.received_maps.clear();
 
         {
-	  hobject_t end;
 
           // get the start and end of our scrub chunk
           //
@@ -3899,11 +3898,11 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
 	      cct->_conf->osd_scrub_chunk_max,
 	      0,
 	      &objects,
-	      &end);
+	      &scrubber.end);
             assert(ret >= 0);
 
             // in case we don't find a boundary: start again at the end
-            start = end;
+            start = scrubber.end;
 
             // special case: reached end of file store, implicitly a boundary
             if (objects.empty()) {
@@ -3911,25 +3910,19 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
             }
 
             // search backward from the end looking for a boundary
-            objects.push_back(end);
+            objects.push_back(scrubber.end);
             while (!boundary_found && objects.size() > 1) {
               hobject_t end = objects.back().get_boundary();
               objects.pop_back();
 
               if (objects.back().get_filestore_key() != end.get_filestore_key()) {
-                end = end;
+                scrubber.end = end;
                 boundary_found = true;
               }
             }
           }
-
-	  if (!_range_available_for_scrub(scrubber.start, end)) {
-	    // we'll be requeued by whatever made us unavailable for scrub
-	    done = true;
-	    break;
-	  }
-	  scrubber.end = end;
         }
+
         scrubber.block_writes = true;
 
         // walk the log to find the latest update that affects our chunk
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 8967a56..1fce297 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -1118,13 +1118,6 @@ public:
   void build_scrub_map(ScrubMap &map, ThreadPool::TPHandle &handle);
   void build_inc_scrub_map(
     ScrubMap &map, eversion_t v, ThreadPool::TPHandle &handle);
-  /**
-   * returns true if [begin, end) is good to scrub at this time
-   * a false return value obliges the implementer to requeue scrub when the
-   * condition preventing scrub clears
-   */
-  virtual bool _range_available_for_scrub(
-    const hobject_t &begin, const hobject_t &end) = 0;
   virtual void _scrub(ScrubMap &map) { }
   virtual void _scrub_clear_state() { }
   virtual void _scrub_finish() { }
@@ -2131,9 +2124,11 @@ public:
   virtual void check_blacklisted_watchers() = 0;
   virtual void get_watchers(std::list<obj_watch_item_t>&) = 0;
 
-  virtual void agent_work(int max) = 0;
+  virtual bool agent_work(int max) = 0;
   virtual void agent_stop() = 0;
+  virtual void agent_delay() = 0;
   virtual void agent_clear() = 0;
+  virtual void agent_choose_mode_restart() = 0;
 };
 
 ostream& operator<<(ostream& out, const PG& pg);
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 95b61a3..94eec05 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -2516,7 +2516,15 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
 
 void ReplicatedPG::snap_trimmer()
 {
-  lock();
+  if (g_conf->osd_snap_trim_sleep > 0) {
+    utime_t t;
+    t.set_from_double(g_conf->osd_snap_trim_sleep);
+    t.sleep();
+    lock();
+    dout(20) << __func__ << " slept for " << t << dendl;
+  } else {
+    lock();
+  }
   if (deleting) {
     unlock();
     return;
@@ -4611,8 +4619,8 @@ inline int ReplicatedPG::_delete_oid(OpContext *ctx, bool no_whiteout)
   }
   oi.size = 0;
 
-  // cache: writeback: set whiteout on delete?
-  if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK && !no_whiteout) {
+  // cache: cache: set whiteout on delete?
+  if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE && !no_whiteout) {
     dout(20) << __func__ << " setting whiteout on " << soid << dendl;
     oi.set_flag(object_info_t::FLAG_WHITEOUT);
     ctx->delta_stats.num_whiteouts++;
@@ -5080,7 +5088,7 @@ int ReplicatedPG::prepare_transaction(OpContext *ctx)
   }
 
   // cache: clear whiteout?
-  if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
+  if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE) {
     if (ctx->user_modify &&
 	ctx->obc->obs.oi.is_whiteout()) {
       dout(10) << __func__ << " clearing whiteout on " << soid << dendl;
@@ -7431,9 +7439,6 @@ void ReplicatedPG::kick_object_context_blocked(ObjectContextRef obc)
   dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
   requeue_ops(ls);
   waiting_for_blocked_object.erase(p);
-
-  if (obc->requeue_scrub_on_unblock)
-    osd->queue_for_scrub(this);
 }
 
 SnapSetContext *ReplicatedPG::create_snapset_context(const hobject_t& oid)
@@ -10930,15 +10935,11 @@ void ReplicatedPG::hit_set_trim(RepGather *repop, unsigned max)
       agent_state->remove_oldest_hit_set();
     updated_hit_set_hist.history.pop_front();
 
-    struct stat st;
-    int r = osd->store->stat(
-      coll,
-      ghobject_t(oid, ghobject_t::NO_GEN, pg_whoami.shard),
-      &st);
-    assert(r == 0);
+    ObjectContextRef obc = get_object_context(oid, false);
+    assert(obc);
     --repop->ctx->delta_stats.num_objects;
     --repop->ctx->delta_stats.num_objects_hit_set_archive;
-    repop->ctx->delta_stats.num_bytes -= st.st_size;
+    repop->ctx->delta_stats.num_bytes -= obc->obs.oi.size;
   }
 }
 
@@ -10966,6 +10967,7 @@ void ReplicatedPG::agent_setup()
     agent_state->position.hash = pool.info.get_random_pg_position(
       info.pgid.pgid,
       rand());
+    agent_state->start = agent_state->position;
 
     dout(10) << __func__ << " allocated new state, position "
 	     << agent_state->position << dendl;
@@ -10986,13 +10988,14 @@ void ReplicatedPG::agent_clear()
   agent_state.reset(NULL);
 }
 
-void ReplicatedPG::agent_work(int start_max)
+// Return false if no objects operated on since start of object hash space
+bool ReplicatedPG::agent_work(int start_max)
 {
   lock();
   if (!agent_state) {
     dout(10) << __func__ << " no agent state, stopping" << dendl;
     unlock();
-    return;
+    return true;
   }
 
   assert(!deleting);
@@ -11000,7 +11003,7 @@ void ReplicatedPG::agent_work(int start_max)
   if (agent_state->is_idle()) {
     dout(10) << __func__ << " idle, stopping" << dendl;
     unlock();
-    return;
+    return true;
   }
 
   osd->logger->inc(l_osd_agent_wake);
@@ -11090,8 +11093,12 @@ void ReplicatedPG::agent_work(int start_max)
     if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
 	agent_maybe_evict(obc))
       ++started;
-    if (started >= start_max)
+    if (started >= start_max) {
+      // If finishing early, set "next" to the next object
+      if (++p != ls.end())
+	next = *p;
       break;
+    }
   }
 
   if (++agent_state->hist_age > g_conf->osd_agent_hist_halflife) {
@@ -11101,13 +11108,42 @@ void ReplicatedPG::agent_work(int start_max)
     agent_state->temp_hist.decay();
   }
 
+  // Total objects operated on so far
+  int total_started = agent_state->started + started;
+  bool need_delay = false;
+
+  dout(20) << __func__ << " start pos " << agent_state->position
+    << " next start pos " << next
+    << " started " << total_started << dendl;
+
+  // See if we've made a full pass over the object hash space
+  // This might check at most ls_max objects a second time to notice that
+  // we've checked every objects at least once.
+  if (agent_state->position < agent_state->start && next >= agent_state->start) {
+    dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
+    if (total_started == 0)
+      need_delay = true;
+    else
+      total_started = 0;
+    agent_state->start = next;
+  }
+  agent_state->started = total_started;
+
+  // See if we are starting from beginning
   if (next.is_max())
     agent_state->position = hobject_t();
   else
     agent_state->position = next;
-  dout(20) << __func__ << " final position " << agent_state->position << dendl;
+
+  if (need_delay) {
+    assert(agent_state->delaying == false);
+    agent_delay();
+    unlock();
+    return false;
+  }
   agent_choose_mode();
   unlock();
+  return true;
 }
 
 void ReplicatedPG::agent_load_hit_sets()
@@ -11309,9 +11345,37 @@ void ReplicatedPG::agent_stop()
   }
 }
 
-void ReplicatedPG::agent_choose_mode()
+void ReplicatedPG::agent_delay()
 {
+  dout(20) << __func__ << dendl;
+  if (agent_state && !agent_state->is_idle()) {
+    assert(agent_state->delaying == false);
+    agent_state->delaying = true;
+    osd->agent_disable_pg(this, agent_state->evict_effort);
+  }
+}
+
+void ReplicatedPG::agent_choose_mode_restart()
+{
+  dout(20) << __func__ << dendl;
+  lock();
+  if (agent_state && agent_state->delaying) {
+    agent_state->delaying = false;
+    agent_choose_mode(true);
+  }
+  unlock();
+}
+
+void ReplicatedPG::agent_choose_mode(bool restart)
+{
+  // Let delay play out
+  if (agent_state->delaying) {
+    dout(20) << __func__ << this << " delaying, ignored" << dendl;
+    return;
+  }
+
   uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
+  assert(divisor > 0);
 
   uint64_t num_user_objects = info.stats.stats.sum.num_objects;
 
@@ -11360,19 +11424,20 @@ void ReplicatedPG::agent_choose_mode()
       info.stats.stats.sum.num_objects;
     dirty_micro =
       num_dirty * avg_size * 1000000 /
-      (pool.info.target_max_bytes / divisor);
+      MAX(pool.info.target_max_bytes / divisor, 1);
     full_micro =
       num_user_objects * avg_size * 1000000 /
-      (pool.info.target_max_bytes / divisor);
+      MAX(pool.info.target_max_bytes / divisor, 1);
   }
   if (pool.info.target_max_objects) {
     uint64_t dirty_objects_micro =
       num_dirty * 1000000 /
-      (pool.info.target_max_objects / divisor);
+      MAX(pool.info.target_max_objects / divisor, 1);
     if (dirty_objects_micro > dirty_micro)
       dirty_micro = dirty_objects_micro;
     uint64_t full_objects_micro =
-      num_user_objects * 1000000 / (pool.info.target_max_objects / divisor);
+      num_user_objects * 1000000 /
+      MAX(pool.info.target_max_objects / divisor, 1);
     if (full_objects_micro > full_micro)
       full_micro = full_objects_micro;
   }
@@ -11384,7 +11449,7 @@ void ReplicatedPG::agent_choose_mode()
   TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
   uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
   uint64_t flush_slop = (float)flush_target * g_conf->osd_agent_slop;
-  if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE)
+  if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE)
     flush_target += flush_slop;
   else
     flush_target -= MIN(flush_target, flush_slop);
@@ -11401,7 +11466,7 @@ void ReplicatedPG::agent_choose_mode()
   unsigned evict_effort = 0;
   uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
   uint64_t evict_slop = (float)evict_target * g_conf->osd_agent_slop;
-  if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
+  if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
     evict_target += evict_slop;
   else
     evict_target -= MIN(evict_target, evict_slop);
@@ -11416,7 +11481,11 @@ void ReplicatedPG::agent_choose_mode()
     // set effort in [0..1] range based on where we are between
     evict_mode = TierAgentState::EVICT_MODE_SOME;
     uint64_t over = full_micro - evict_target;
-    uint64_t span = 1000000 - evict_target;
+    uint64_t span;
+    if (evict_target >= 1000000)
+      span = 1;
+    else
+      span = 1000000 - evict_target;
     evict_effort = MAX(over * 1000000 / span,
 		       (unsigned)(1000000.0 * g_conf->osd_agent_min_evict_effort));
 
@@ -11465,11 +11534,11 @@ void ReplicatedPG::agent_choose_mode()
   // (including flush).  This is probably fine (they should be
   // correlated) but it is not precisely correct.
   if (agent_state->is_idle()) {
-    if (!old_idle) {
+    if (!restart && !old_idle) {
       osd->agent_disable_pg(this, old_effort);
     }
   } else {
-    if (old_idle) {
+    if (restart || old_idle) {
       osd->agent_enable_pg(this, agent_state->evict_effort);
     } else if (old_effort != agent_state->evict_effort) {
       osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
@@ -11511,23 +11580,6 @@ void ReplicatedPG::agent_estimate_atime_temp(const hobject_t& oid,
 // SCRUB
 
 
-bool ReplicatedPG::_range_available_for_scrub(
-  const hobject_t &begin, const hobject_t &end)
-{
-  pair<hobject_t, ObjectContextRef> next;
-  next.second = object_contexts.lookup(begin);
-  next.first = begin;
-  bool more = true;
-  while (more && next.first < end) {
-    if (next.second && next.second->is_blocked()) {
-      next.second->requeue_scrub_on_unblock = true;
-      return true;
-    }
-    more = object_contexts.get_next(next.first, &next);
-  }
-  return false;
-}
-
 void ReplicatedPG::_scrub(ScrubMap& scrubmap)
 {
   dout(10) << "_scrub" << dendl;
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index abee8bf..3ea4721 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -809,7 +809,7 @@ protected:
   friend class C_HitSetFlushing;
 
   void agent_setup();       ///< initialize agent state
-  void agent_work(int max); ///< entry point to do some agent work
+  bool agent_work(int max); ///< entry point to do some agent work
   bool agent_maybe_flush(ObjectContextRef& obc);  ///< maybe flush
   bool agent_maybe_evict(ObjectContextRef& obc);  ///< maybe evict
 
@@ -825,11 +825,13 @@ protected:
 
   /// stop the agent
   void agent_stop();
+  void agent_delay();
 
   /// clear agent state
   void agent_clear();
 
-  void agent_choose_mode();  ///< choose (new) agent mode(s)
+  void agent_choose_mode(bool restart = false);  ///< choose (new) agent mode(s)
+  void agent_choose_mode_restart();
 
   /// true if we can send an ondisk/commit for v
   bool already_complete(eversion_t v) {
@@ -1241,8 +1243,6 @@ protected:
   friend struct C_Flush;
 
   // -- scrub --
-  virtual bool _range_available_for_scrub(
-    const hobject_t &begin, const hobject_t &end);
   virtual void _scrub(ScrubMap& map);
   virtual void _scrub_clear_state();
   virtual void _scrub_finish();
diff --git a/src/osd/TierAgentState.h b/src/osd/TierAgentState.h
index b5f7910..e9c22b2 100644
--- a/src/osd/TierAgentState.h
+++ b/src/osd/TierAgentState.h
@@ -17,6 +17,10 @@
 struct TierAgentState {
   /// current position iterating across pool
   hobject_t position;
+  /// Count of agent_work since "start" position of object hash space
+  int started;
+  hobject_t start;
+  bool delaying;
 
   /// histogram of ages we've encountered
   pow2_hist_t atime_hist;
@@ -66,7 +70,9 @@ struct TierAgentState {
   unsigned evict_effort;
 
   TierAgentState()
-    : hist_age(0),
+    : started(0),
+      delaying(false),
+      hist_age(0),
       flush_mode(FLUSH_MODE_IDLE),
       evict_mode(EVICT_MODE_IDLE),
       evict_effort(0)
@@ -75,8 +81,9 @@ struct TierAgentState {
   /// false if we have any work to do
   bool is_idle() const {
     return
-      flush_mode == FLUSH_MODE_IDLE &&
-      evict_mode == EVICT_MODE_IDLE;
+      delaying ||
+      (flush_mode == FLUSH_MODE_IDLE &&
+      evict_mode == EVICT_MODE_IDLE);
   }
 
   /// add archived HitSet
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 3289620..092d6cc 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -2690,7 +2690,6 @@ public:
   // set if writes for this object are blocked on another objects recovery
   ObjectContextRef blocked_by;      // object blocking our writes
   set<ObjectContextRef> blocking;   // objects whose writes we block
-  bool requeue_scrub_on_unblock;    // true if we need to requeue scrub on unblock
 
   // any entity in obs.oi.watchers MUST be in either watchers or unconnected_watchers.
   map<pair<uint64_t, entity_name_t>, WatchRef> watchers;
@@ -2863,7 +2862,7 @@ public:
       destructor_callback(0),
       lock("ReplicatedPG::ObjectContext::lock"),
       unstable_writes(0), readers(0), writers_waiting(0), readers_waiting(0),
-      blocked(false), requeue_scrub_on_unblock(false) {}
+      blocked(false) {}
 
   ~ObjectContext() {
     assert(rwstate.empty());
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 405f00f..20602d5 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -597,8 +597,8 @@ void RGWObjManifest::obj_iterator::seek(uint64_t o)
     stripe_size = rule.stripe_max_size;
     stripe_size = MIN(manifest->get_obj_size() - stripe_ofs, stripe_size);
   } else {
-    stripe_size = rule.part_size - (ofs - stripe_ofs);
-    stripe_size = MIN(stripe_size, rule.stripe_max_size);
+    uint64_t next = MIN(stripe_ofs + rule.stripe_max_size, part_ofs + rule.part_size);
+    stripe_size = next - stripe_ofs;
   }
 
   update_location();
@@ -4817,8 +4817,16 @@ void RGWRados::get_obj_aio_completion_cb(completion_t c, void *arg)
   ldout(cct, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs << " len=" << len << dendl;
   d->throttle.put(len);
 
-  if (d->is_cancelled())
+  r = rados_aio_get_return_value(c);
+  if (r < 0) {
+    ldout(cct, 0) << "ERROR: got unexpected error when trying to read object: " << r << dendl;
+    d->set_cancelled(r);
     goto done;
+  }
+
+  if (d->is_cancelled()) {
+    goto done;
+  }
 
   d->data_lock.Lock();
 
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
index 6473c61..69948a6 100644
--- a/src/rgw/rgw_rest.cc
+++ b/src/rgw/rgw_rest.cc
@@ -36,6 +36,7 @@ static struct rgw_http_attr rgw_to_http_attr_list[] = {
   { RGW_ATTR_CACHE_CONTROL, "Cache-Control"},
   { RGW_ATTR_CONTENT_DISP, "Content-Disposition"},
   { RGW_ATTR_CONTENT_ENC, "Content-Encoding"},
+  { RGW_ATTR_USER_MANIFEST, "X-Object-Manifest"},
   { NULL, NULL},
 };
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git



More information about the Pkg-ceph-commits mailing list