[Pkg-ceph-commits] [ceph] 01/01: Imported Upstream version 0.94.4
Gaudenz Steinlin
gaudenz at moszumanska.debian.org
Mon Dec 26 20:48:03 UTC 2016
This is an automated email from the git hooks/post-receive script.
gaudenz pushed a commit to annotated tag upstream/0.94.4
in repository ceph.
commit 9f49096fea280a7c19ed93c79fa0a71a3274d4c3
Author: Gaudenz Steinlin <gaudenz at debian.org>
Date: Tue Oct 20 10:39:23 2015 +0200
Imported Upstream version 0.94.4
---
AUTHORS | 14 +
ChangeLog | 227 +++++-
Makefile.in | 1 +
ceph.spec | 73 +-
ceph.spec.in | 71 +-
configure | 107 ++-
configure.ac | 15 +-
man/Makefile.in | 1 +
src/.git_version | 4 +-
src/Makefile.am | 1 -
src/Makefile.in | 48 +-
src/acconfig.h.in | 3 +
src/auth/cephx/CephxClientHandler.cc | 18 +-
src/ceph-disk | 8 +-
src/ceph.in | 12 +-
src/civetweb/civetweb.h | 3 +
src/civetweb/include/civetweb.h | 3 +
src/civetweb/src/civetweb.c | 5 +
src/client/Client.cc | 21 +-
src/common/Cycles.cc | 4 +
src/common/Makefile.am | 3 +-
src/common/Mutex.cc | 13 +-
src/common/Mutex.h | 12 +-
src/common/RWLock.h | 26 +-
src/common/WorkQueue.h | 23 +-
src/common/bit_vector.hpp | 5 +-
src/common/buffer.cc | 128 ++--
src/common/ceph_context.cc | 59 +-
src/common/ceph_context.h | 7 +
src/common/ceph_crypto.cc | 46 +-
src/common/common_init.cc | 7 +-
src/common/config.cc | 2 +-
src/common/config_opts.h | 6 +
src/common/hobject.cc | 18 +
src/common/hobject.h | 10 +
src/common/lockdep.cc | 80 ++-
src/common/lockdep.h | 3 +-
src/common/valgrind.h | 15 +
src/crush/CrushTester.cc | 15 +-
src/crush/CrushTester.h | 9 +-
src/crush/CrushWrapper.cc | 8 +-
src/erasure-code/shec/ErasureCodeShec.cc | 1 +
src/global/global_init.cc | 5 -
src/include/ceph_features.h | 4 +
src/init-radosgw | 81 ++-
src/init-radosgw.sysv | 114 ---
src/java/Makefile.in | 1 +
src/librados/RadosClient.cc | 4 +-
src/librados/RadosClient.h | 10 +-
src/librbd/AioCompletion.cc | 14 +
src/librbd/AioCompletion.h | 10 +-
src/librbd/AioRequest.cc | 449 ++++++------
src/librbd/AioRequest.h | 145 ++--
src/librbd/AsyncFlattenRequest.cc | 166 ++---
src/librbd/AsyncObjectThrottle.cc | 24 +-
src/librbd/AsyncObjectThrottle.h | 21 +-
src/librbd/AsyncRequest.cc | 10 +
src/librbd/AsyncRequest.h | 3 +
src/librbd/AsyncResizeRequest.cc | 176 ++---
src/librbd/AsyncTrimRequest.cc | 211 +++---
src/librbd/AsyncTrimRequest.h | 5 +-
src/librbd/CopyupRequest.cc | 119 ++--
src/librbd/CopyupRequest.h | 30 +-
src/librbd/ImageCtx.cc | 87 ++-
src/librbd/ImageCtx.h | 5 +-
src/librbd/ImageWatcher.cc | 191 ++---
src/librbd/ImageWatcher.h | 1 +
src/librbd/LibrbdWriteback.cc | 29 +-
src/librbd/LibrbdWriteback.h | 3 +
src/librbd/ObjectMap.cc | 18 +-
src/librbd/ObjectMap.h | 2 +
src/librbd/WatchNotifyTypes.cc | 6 +
src/librbd/WatchNotifyTypes.h | 2 +
src/librbd/internal.cc | 160 ++---
src/librbd/internal.h | 3 +-
src/log/Log.cc | 4 +
src/mon/Monitor.cc | 2 +-
src/mon/OSDMonitor.cc | 75 +-
src/mon/OSDMonitor.h | 10 +-
src/mon/PGMonitor.cc | 8 +-
src/mon/PaxosService.cc | 10 +
src/mon/PaxosService.h | 5 +-
src/msg/simple/Pipe.cc | 4 +-
src/ocf/Makefile.in | 1 +
src/os/WBThrottle.cc | 1 +
src/os/chain_xattr.cc | 8 +
src/osd/ECBackend.h | 8 +-
src/osd/OSD.cc | 15 +-
src/osd/OSD.h | 7 +
src/osd/OSDMap.cc | 14 +-
src/osd/PG.cc | 39 +-
src/osd/PG.h | 21 +-
src/osd/PGBackend.cc | 104 ++-
src/osd/PGBackend.h | 22 +-
src/osd/PGLog.cc | 67 +-
src/osd/PGLog.h | 6 +-
src/osd/ReplicatedBackend.h | 8 +-
src/osd/ReplicatedPG.cc | 80 ++-
src/osd/ReplicatedPG.h | 5 +-
src/osd/osd_types.cc | 33 +-
src/osd/osd_types.h | 34 +-
src/osdc/ObjectCacher.cc | 155 +++--
src/osdc/ObjectCacher.h | 11 +-
src/osdc/Objecter.cc | 8 +-
src/osdc/Objecter.h | 42 +-
src/osdc/WritebackHandler.h | 3 +
src/rgw/Makefile.am | 3 +-
src/rgw/rgw_admin.cc | 81 +++
src/rgw/rgw_civetweb.cc | 3 +
src/rgw/rgw_common.cc | 50 +-
src/rgw/rgw_common.h | 48 +-
src/rgw/rgw_gc.cc | 7 +-
src/rgw/rgw_main.cc | 5 +-
src/rgw/rgw_op.cc | 6 +
src/rgw/rgw_orphan.cc | 810 ++++++++++++++++++++++
src/rgw/rgw_orphan.h | 209 ++++++
src/rgw/rgw_rados.cc | 299 ++++++--
src/rgw/rgw_rados.h | 93 ++-
src/rgw/rgw_replica_log.cc | 4 +-
src/rgw/rgw_rest.cc | 62 +-
src/rgw/rgw_rest.h | 3 +-
src/rgw/rgw_rest_swift.cc | 37 +-
src/rgw/rgw_rest_user.cc | 5 +-
src/rgw/rgw_user.cc | 19 +-
src/test/Makefile-client.am | 3 +-
src/test/bufferlist.cc | 24 +
src/test/centos-6/ceph.spec.in | 71 +-
src/test/centos-7/ceph.spec.in | 71 +-
src/test/ceph-disk.sh | 10 +
src/test/cli/radosgw-admin/help.t | 3 +-
src/test/common/test_bit_vector.cc | 88 ++-
src/test/crush/CrushWrapper.cc | 103 +++
src/test/librados_test_stub/LibradosTestStub.cc | 39 ++
src/test/librados_test_stub/TestClassHandler.cc | 5 +-
src/test/librados_test_stub/TestIoCtxImpl.cc | 8 +-
src/test/librados_test_stub/TestMemRadosClient.cc | 1 +
src/test/librados_test_stub/TestWatchNotify.cc | 53 +-
src/test/librados_test_stub/TestWatchNotify.h | 7 +-
src/test/librbd/fsx.cc | 2 -
src/test/librbd/test_ImageWatcher.cc | 51 +-
src/test/librbd/test_internal.cc | 25 +
src/test/librbd/test_librbd.cc | 95 ++-
src/test/librbd/test_main.cc | 33 +-
src/test/mon/osd-crush.sh | 12 +
src/test/objectstore/chain_xattr.cc | 38 +
src/test/osd/TestPGLog.cc | 84 ++-
src/test/osd/types.cc | 14 +
src/test/osdc/object_cacher_stress.cc | 2 +-
src/tools/ceph_objectstore_tool.cc | 14 +-
src/tools/crushtool.cc | 6 +
src/tools/rest_bench.cc | 9 +-
src/tracing/Makefile.in | 1 +
src/upstart/ceph-mds.conf | 2 +-
src/upstart/ceph-mon.conf | 2 +-
src/upstart/ceph-osd.conf | 2 +-
src/vstart.sh | 5 +-
156 files changed, 4688 insertions(+), 1993 deletions(-)
diff --git a/AUTHORS b/AUTHORS
index c2e21b2..40a5316 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,4 +1,5 @@
9seconds <nineseconds at yandex.ru>
+Abhishek Dixit <dixitabhi at gmail.com>
Abhishek L <abhishekl.2006 at gmail.com>
Abhishek Lekshmanan <abhishek.lekshmanan at ril.com>
Accela Zhao <accelazh at gmail.com>
@@ -95,11 +96,13 @@ Derek Yarnell <derek at umiacs.umd.edu>
Derrick Schneider <derrick.schneider at opower.com>
Ding Dinghua <dingdinghua85 at gmail.com>
Dmitry Smirnov <onlyjob at member.fsf.org>
+Dmitry Yatsushkevich <dyatsushkevich at mirantis.com>
Dmytro Iurchenko <diurchenko at mirantis.com>
Dominik Hannen <cantares1+github at gmail.com>
Dongmao Zhang <deanraccoon at gmail.com>
Dongsu Park <dpark1978 at gmail.com>
Dong Yuan <yuandong1222 at gmail.com>
+dwj192 <duanweijun at h3c.com>
Eleanor Cawthon <eleanor.cawthon at inktank.com>
Emily Popper <emily.popper at dreamhost.com>
Eric Mourgaya <eric.mourgaya at arkea.com>
@@ -126,6 +129,7 @@ Gerben Meijer <gerben at daybyday.nl>
git-harry <git-harry at live.co.uk>
Greg Farnum <gfarnum at redhat.com>
Greg Farnum <greg at inktank.com>
+Guang G Yang <yguang at renownedground.corp.gq1.yahoo.com>
Guangliang Zhao <guangliang at unitedstack.com>
Guang Yang <yguang at yahoo-inc.com>
guce <guce at h3c.com>
@@ -154,6 +158,7 @@ Jan Harkes <jaharkes at cs.cmu.edu>
Janne Grunau <j at jannau.net>
Jason Dillaman <dillaman at redhat.com>
Javier M. Mellid <jmunhoz at igalia.com>
+Jenkins <jenkins at ceph.com>
Jenkins <jenkins at inktank.com>
Jens-Christian Fischer <jens-christian.fischer at switch.ch>
Jerry7X <875016668 at qq.com>
@@ -164,6 +169,7 @@ Jian Wen <wenjian at letv.com>
Jim Schutt <jaschut at sandia.gov>
João Eduardo Luís <joao.luis at inktank.com>
João Eduardo Luís <joao at redhat.com>
+Joao Eduardo Luis <joao at suse.de>
Joe Buck <jbbuck at gmail.com>
Johannes Erdfelt <johannes at erdfelt.com>
John Spray <john.spray at inktank.com>
@@ -191,6 +197,7 @@ Kefu Chai <kchai at redhat.com>
Kefu Chai <tchaikov at gmail.com>
Ken Dreyer <kdreyer at redhat.com>
Ken Dreyer <ken.dreyer at inktank.com>
+Ketor Meng <d.ketor at gmail.com>
Kevin Cox <kevincox at kevincox.ca>
Kevin Dalley <kevin at kelphead.org>
Kevin Jones <k.j.jonez at gmail.com>
@@ -271,8 +278,10 @@ Radoslaw Zarzynski <rzarzynski at mirantis.com>
Raju Kurunkad <raju.kurunkad at sandisk.com>
Ray Lv <xiangyulv at gmail.com>
rca <bertosmailbox at gmail.com>
+renhwztetecs <rhwlyw at 163.com>
riccardo80 <riccardo80 at 29311d96-e01e-0410-9327-a35deaab8ce9>
Riccardo Ferretti <rferrett at soe.ucsc.edu>
+ritz303 <ritz_303 at yahoo.com>
Roald J. van Loon <roald at roaldvanloon.nl>
RobertJansen1 <r.jansen86 at gmail.com>
Robin H. Johnson <robbat2 at gentoo.org>
@@ -284,6 +293,7 @@ root <root at phenom.dyweni.com>
Ross Turk <ross.turk at inktank.com>
Ross Turk <rturk at redhat.com>
Ruben Kerkhof <ruben at rubenkerkhof.com>
+Ruifeng Yang <149233652 at qq.com>
Rutger ter Borg <rutger at terborg.net>
Sage Weil <sage at inktank.com>
Sage Weil <sweil at redhat.com>
@@ -302,6 +312,7 @@ Sharif Olorin <sio at tesser.org>
Shawn Edwards <lesser.evil at gmail.com>
shishir gowda <shishir.gowda at sandisk.com>
Shu, Xinxin <xinxin.shu at intel.com>
+Shylesh Kumar <shmohan at redhat.com>
Simone Gotti <simone.gotti at gmail.com>
Simon Leinen <simon.leinen at switch.ch>
Somnath Roy <somnath.roy at sandisk.com>
@@ -317,6 +328,7 @@ Stratos Psomadakis <psomas at grnet.gr>
Stuart Longland <stuartl at vrt.com.au>
Sushma Gurram <sushma.gurram at sandisk.com>
Swami Reddy <swami.reddy at ril.com>
+Sylvain Baubeau <sbaubeau at redhat.com>
Sylvain Munaut <s.munaut at whatever-company.com>
Takeshi Miyamae <miyamae.takeshi at jp.fujitsu.com>
Takuya ASADA <syuu at dokukino.com>
@@ -343,6 +355,7 @@ Vangelis Koukis <vkoukis at cslab.ece.ntua.gr>
Ved-vampir <akiselyova at mirantis.com>
Venky Shankar <vshankar at redhat.com>
Vicente Cheng <freeze.bilsted at gmail.com>
+Vikhyat Umrao <vumrao at redhat.com>
Viktor Suprun <popsul1993 at gmail.com>
Volker Assmann <volker at twisted-nerve.de>
VRan Liu <gliuwr at gmail.com>
@@ -358,6 +371,7 @@ wuxingyi <wuxingyi2015 at outlook.com>
wuxingyi <wuxingyi at letv.com>
Wyllys Ingersoll <wyllys.ingersoll at keepertech.com>
Xan Peng <xanpeng at gmail.com>
+Xiaowei Chen <cxwshawn at gmail.com>
Xiaoxi Chen <xiaoxi.chen at intel.com>
Xihui He <xihuihe at gmail.com>
Xing Lin <xinglin at cs.utah.edu>
diff --git a/ChangeLog b/ChangeLog
index 2ad0178..e14f65f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,10 +1,214 @@
-95cefea (HEAD, tag: v0.94.3) 0.94.3
+9529269 (HEAD, tag: v0.94.4) 0.94.4
+b203979 use git://git.ceph.com
+0f4ef19 qa: http://ceph.com/qa -> http://download.ceph.com/qa
+294f016 (origin/wip-13227-hammer) init-radosgw.sysv: remove
+698d75c (origin/wip-13410-hammer) tests: robust test for the pool create crushmap test
+2a28114 (origin/wip-13401-hammer) crush/CrushTester: test fewer inputs when running crushtool
+abc5b5f tests: update to match crushmap validation message
+25bd277 mon/OSDMonitor: fix crush injection error message
+6635530 mon/OSDMonitor: only test crush ruleset for the newly created pool
+cc1fedd crush/CrushTester: allow testing by ruleset
+3228161 qa/workunits/cephtool/test.sh: don't assume crash_replay_interval=45
+ad83304 rgw:add --reset-regions for regionmap update
+7de65e7 rgw : setting max number of buckets for users via ceph.conf option
+297c04d rgw: init_rados failed leads to repeated delete
+4b0686f rgw: delete finisher only after finalizing watches
+6119b15 rgw: be more flexible with iso8601 timestamps
+607904e init-radosgw: specify pid file to start-stop-daemon
+f51ab26 rgw: fix radosgw start-up script.
+544a98f init-radosgw: unify init-radosgw[.sysv]
+2a733e9 init-radosgw: look in /var/lib/ceph/radosgw
+d00c52b doc: rgw: fix typo in comments
+eb001d3 rgw: init script waits until the radosgw stops
+9ab9c44 rgw: don't read actual data on user manifest HEAD
+9026c4a doc: remove mention of ceph-extra as a requirement
+45ed24d doc: remove ceph-extras
+faccdce doc: correct links to download.ceph.com
+e9f4aec doc: Added "Hammer" in the list of major releases.
+424fc1c rgw: set default value for env->get() call
+e72bdc3 osd/ReplicatedPG: tolerate promotion completion with stopped agent
+a3afb3f rgw: remove trailing :port from host for purposes of subdomain matching
+77cb503 (origin/wip-13015-hammer) rgw: preserve all attrs if intra-zone copy
+b9f2ed3 rgw: don't preserve acls when copying object
+b3822f1 upstart: limit respawn to 3 in 30 mins (instead of 5 in 30s)
+0d6a8c6 Pipe: Drop connect_seq increase line
+4be8a28 osd/PG: peek_map_epoch: skip legacy PGs if infos object is missing
+f237ed9 osd: allow peek_map_epoch to return an error
+3a50b90 crypto: fix unbalanced ceph::crypto::init/ceph::crypto:shutdown
+0a5b856 ReplicatedPG,Objecter: copy_get should include truncate_seq and size
+82ea02a rgw: fix assignment of copy obj attributes
+3b2affc rgw: add delimiter to prefix only when path is specified
+9f69660 tests: tiering agent and proxy read
+5656eec osd: trigger the cache agent after a promotion
+dc693fc lockdep: allow lockdep to be dynamically enabled/disabled
+805732b tests: librbd API test cannot use private md_config_t struct
+7ac0173 tests: ensure old-format RBD tests still work
+b68d757 librados_test_stub: implement conf get/set API methods
+f0fa637 crypto: use NSS_InitContext/NSS_ShutdownContex to avoid memory leak
+3f542aa auth: use crypto_init_mutex to protect NSS_Shutdown()
+e487e8e auth: reinitialize NSS modules after fork()
+00e73ad librbd: prevent race condition between resize requests
+6c4ccc8 librbd: Add a paramter:purge_on_error in ImageCtx::invalidate_cache().
+0573491 librbd: Remvoe unused func ImageCtx::read_from_cache.
+28838f2 osdc: clean up code in ObjectCacher::Object::map_write
+5c4f152 osdc: Don't pass mutex into ObjectCacher::_wait_for_write.
+86e7698 osdc: After write try merge bh.
+c96541a osdc: Make last missing bh to wake up the reader.
+4135b9a osdc: For trust_enoent is true, there is only one extent.
+81376b6 osdc: In _readx() only no error can tidy read result.
+e80bd0a (origin/wip-12859-hammer-loic) rgw: send Content-Length in response for GET on Swift account.
+2e54245 rgw: force content_type for swift bucket stats request
+5d57b63 rgw: we should not overide Swift sent content type
+b8aafbc rgw: enforce Content-Type in Swift responses.
+143cfc3 rgw: force content-type header for swift account responses without body
+b5420d6 rgw: shouldn't return content-type: application/xml if content length is 0
+836f763 OSD: break connection->session->waiting message->connection cycle
+77624af osd/PGLog: dirty_to is inclusive
+aa00373 common: fix code format
+aab35da test: add test case for insert empty ptr when buffer rebuild
+2b0b7ae common: fix insert empty ptr when bufferlist rebuild
+2348a5b osd: copy the RecoveryCtx::handle when creating a new RecoveryCtx instance from another one
+bf72785 config: skip lockdep for intentionally recursive md_config_t lock
+c94fd92 osd: Keep a reference count on Connection while calling send_message()
+059bf98 WBThrottle::clear_object: signal if we cleared an object
+a478385 ceph-disk: always check zap is applied on a full device
+e471c5d librados: Make librados pool_create respect default_crush_ruleset
+35fa47a (origin/wip-corpus-hammer) ceph-object-corpus: add 0.94.2-207-g88e7ee7 hammer objects
+b80859e (origin/wip-11455-hammer) rgw: init some manifest fields when handling explicit objs
+f47ba4b mon: test the crush ruleset when creating a pool
+b58cbba erasure-code: set max_size to chunk_count() instead of 20 for shec
+6f0af18 vstart.sh: set PATH to include pwd
+da00bed rgw: rework X-Trans-Id header to be conform with Swift API.
+9937c81 Transaction Id added in response
+f1c7c62 rgw: api adjustment following a rebase
+85911df rgw: orphans, fix check on number of shards
+c1cf7df rgw: orphans, change default number of shards
+bb1d4cc rgw: change error output related to orphans
+2e0f6fe rgw: orphan, fix truncated detection
+1bfebef radosgw-admin: simplify orphan command
+f244b15 radosgw-admin: stat orphan objects before reporting leakage
+f80e2b2 radosgw-admin: orphans finish command
+88d32c6 rgw: cannot re-init an orphan scan job
+80a4034 rgw: stat_async() sets the object locator appropriately
+0082036 rgw: list_objects() sets namespace appropriately
+1c37072 rgw: modify orphan search fingerprints
+ef81367 rgw: compare oids and dump leaked objects
+f4d0544 rgw: keep accurate state for linked objects orphan scan
+748ea57 rgw: iterate over linked objects, store them
+6c6aa5d rgw: add rgw_obj::parse_raw_oid()
+62d562d rgw: iterate asynchronously over linked objects
+00ecf2d rgw: async object stat functionality
+7d1cc48 rgw-admin: build index of bucket indexes
+c1b0e7a rgw: initial work of orphan detection tool implementation
+b16129c Avoid an extra read on the atomic variable
+1f6916d RGW: Make RADOS handles in RGW to be a configurable option
+a13c7fd rgw:the arguments 'domain' should not be assigned when return false
+6acf36f rgw:segmentation fault when rgw_gc_max_objs > HASH_PRIME
+6b36514 rgw: avoid using slashes for generated secret keys
+8ba6b2f rgw: url encode exposed bucket
+0bc909e (origin/wip-12638-hammer) mon: add a cache layer over MonitorDBStore
+bee8666 Objecter: pg_interval_t::is_new_interval needs pgid from previous pool
+b5418b9 osd_types::is_new_interval: size change triggers new interval
+f028389 (origin/liewegas-wip-hammer-feature-hammer) include/ceph_features: define HAMMER_0_94_4 feature
+95cefea (tag: v0.94.3) 0.94.3
81a311a (origin/hammer-12709) Workunits : fs/misc/chmod.sh : Include ACL characters in permission check.
+153744d (origin/wip-12682-hammer) tests: increase test coverage for partial encodes/decodes
+fca7876 common: bit_vector extent calculation incorrect for last page
+3396a96 osd/OSDMap: handle incrementals that modify+del pool
+3ab5d82 (origin/wip-12432-hammer) rgw: set http status in civetweb
+10a0383 civetweb: update submodule to support setting of http status
+00d802d hobject_t: fix get_boundary to work with new sorting regime
+9b91adc (origin/wip-osd-compat-hammer) mon: disallow post-hammer OSDs if there are up pre-hammer OSDs
+8a559c1 include/ceph_features: define MON_METADATA feature
+4faa8e0 (origin/wip-12577-hammer) osd: include newlines in scrub errors
+455eb2a osd: fix condition for loggin scrub errors
+67e7946 osd: fix fallback logic; move into be_select_auth_object
+0f57c70 osd: log a scrub error when we can't pick an auth object
+d4f4c5c osd: repair record digest if all replicas match but do not match
+acfed6b osd: move recorded vs on disk digest warning into be_compare_scrubmaps
+674029b osd: be slightly paranoid about value of okseed
+f2002b7 osd: be precise about "known" vs "best guess"
+4e5d146 osd: record digest if object is clean (vs entire scrub chunk)
+1357ed1 hobject_t: decode future hobject_t::get_min() properly
+6d01d6b OSDMonitor::preprocess_get_osdmap: send the last map as well
2ecb3b7 Fh ref count will leak if readahead does not need to do read from osd
4c199bf (origin/wip-11998-hammer) debian/control: ceph-common (>> 0.94.2) must be >= 0.94.2-2
+a785193 ceph.spec.in: drop SUSE-specific %py_requires macro
+8804b3f ceph.spec.in: remove SUSE-specific apache2-mod_fcgid dependency
+b575ecc (origin/wip-12236-hammer) tests: verify that image shrink properly handles flush op
+d4eb7bd librbd: invalidate cache outside cache callback context
+92272dd (origin/wip-12235-hammer) librbd: don't cancel request lock early
+58ae92f tests: new test for transitioning exclusive lock
+7b21ccb tests: verify that librbd will periodically resend lock request
+c95b37f common: Mutex shouldn't register w/ lockdep if disabled
+117205a librbd: improve debugging output for ImageWatcher
+08ae012 librados_test_stub: watcher id should be the instance id (gid)
+704c0e0 librbd: retry lock requests periodically until acquired
+dbaaed9 librbd: don't hold owner_lock for write during flush
+e971820 (origin/wip-12345-hammer) lockdep: do not automatically collect all backtraces
+27f7042 librbd: flush operations need to acquire owner lock
+5b39983 librbd: avoid infinite loop if copyup fails
+88b583b librbd: flush pending ops while not holding lock
+a88b180 tests: fix possible deadlock in librbd ImageWatcher tests
+321eb8d tests: enable lockdep for librbd unit tests
+bfe5b90 librbd: owner_lock should be held during flush request
+1e84fb0 osdc: ObjectCacher flusher might needs additional locks
+506a45a librbd: fix recursive locking issues
+acf5125 librbd: simplify state machine handling of exclusive lock
+9454f04 librbd: ObjectMap::aio_update can acquire snap_lock out-of-order
+3e0358e librbd: move copyup class method call to CopyupRequest
+2ee64a8 librbd: simplify AioRequest constructor parameters
+3e71a75 librbd/AioRequest.h: fix UNINIT_CTOR
+cb57fe5 librbd: add object state accessor to ObjectMap
+9249ab7 librbd: AsyncObjectThrottle should always hold owner_lock
+26902b9 librbd: execute flush completion outside of cache_lock
+571220d librbd: add AsyncRequest task enqueue helper method
+8e280f4 librbd: disable lockdep on AioCompletion
+b38da48 librbd: AioCompletion shouldn't hold its lock during callback
+6fdd3f1 librbd: give locks unique names to prevent false lockdep failures
+7004149 librbd: complete cache read in a new thread context
+65ef695 librbd: require callers to ObjectMap::aio_update to acquire lock
+58b8faf log: fix helgrind warnings regarding possible data race
+a5203d3 librados_test_stub: fix helgrind warnings
+b73e87e librados_test_stub: add support for flushing watches
+2fa35b1 common: lockdep now support unregistering once destructed
+7b85c7b common: add valgrind.h convenience wrapper
+6d3db5f librbd: add work queue for op completions
+64425e8 WorkQueue: ContextWQ can now accept a return code
+eccf369 packaging: RGW depends on /etc/mime.types
e19f928 (origin/wip-12502-hammer) rgw: conversion tool to fix broken multipart objects
28d32f6 rgw: only scan for objects not in namespace
e22e2b4 rgw_admin: add --remove-bad flag to bucket check
+7bddf5d rest_bench: bucketname is not mandatory as we have a default name
+6e7358b rest_bench: drain the work queue to fix a crash Fixes: #3896 Signed-off-by: huangjun <hjwsm1989 at gmail.com>
+1e05578 auth: check return value of keyring->get_secret
+256620e Client: check dir is still complete after dropping locks in _readdir_cache_cb
+8a2ad05 TestPGLog: fix invalid proc_replica_log test caes
+df71e6b TestPGLog: fix noop log proc_replica_log test case
+549ff9a TestPGLog: add test for 11358
+c224fc7 PGLog::proc_replica_log: handle split out overlapping entries
+b8176d0 Mutex: fix leak of pthread_mutexattr
+43a72e4 mon/PGMonitor: bug fix pg monitor get crush rule
+0ca93db mon: ceph osd map shows NONE when an osd is missing
+695f782 crush/CrushWrapper: fix adjust_subtree_weight debug
+0bd4c81 crush/CrushWrapper: return changed from adjust_subtree_weight
+05fc59b crush/CrushWrapper: adjust subtree base in adjust_subtree_weight
+d2f31ad unittest_crush_wrapper: test adjust_subtree_weight
+0ccdf34 unittest_crush_wrapper: attach buckets to root in adjust_item_weight test
+1e73753 unittest_crush_wrapper: parse env
+cd11b88 osd: pg_interval_t::check_new_interval should not rely on pool.min_size to determine if the PG was active
+c5f0e22 osd: Move IsRecoverablePredicate/IsReadablePredicate to osd_types.h
+42bff0b mon: OSDMonitor: fix hex output on 'osd reweight'
+e004941 ceph.in: print more detailed warning for 'ceph <type> tell'
+f18900f ceph.in: print more detailed error message for 'tell' command
+9916d37 mon/PGMonitor: avoid uint64_t overflow when checking pool 'target/max' status. Fixes: #12401
+4457d3e Update OSDMonitor.cc
+add0f1e ceph.in: do not throw on unknown errno
+fa19474 os/chain_xattr: handle read on chnk-aligned xattr
+931ffe3 common/Cycles.cc: skip initialization if rdtsc is not implemented
+0fde3a2 buffer: Fix bufferlist::zero bug with special case
+dabc611 UnittestBuffer: Add bufferlist zero test case
+d08db7a (origin/wip-11470.hammer) mon: PaxosService: call post_refresh() instead of post_paxos_update()
154f18c (origin/wip-12465-hammer) Log::reopen_log_file: take m_flush_mutex
b872882 (origin/wip-12237-hammer) librados_test_stub: read op should return number of bytes read
7d9fce3 tests: fixed TestObjectMap.InvalidateFlagInMemoryOnly
@@ -31,6 +235,8 @@ fe013e0 librbd: TaskFinisher should finish all queued tasks
13f926e librados_test_stub: cleanup singleton memory allocation
1063f52 PG::find_best_info: ignore info.les for incomplete peer
7132277 Conditional-compile against minimal tcmalloc.
+0818e9f ceph.spec.in: snappy-devel for all supported distros
+8b576bd ceph.spec.in: python-argparse only in Python 2.6
ad5745b OSD: add command_wq suicide timeout
059a579 OSD: add remove_wq suicide timeout
b8826bc OSD: add scrub_wq suicide timeout
@@ -46,8 +252,13 @@ ec70533 rgw: error out if frontend did not send all data
b1618a9 rgw: fix lack of account name in XML listing of Swift account.
e39dce7 rgw: generate the "Date" HTTP header for civetweb.
a5dbcbb Swift: Set Content-Length when requesting/checking Keystone tokens
+cdde626 ceph.spec.in: do not run fdupes, even on SLE/openSUSE
3c8cdea client: reference counting 'struct Fh'
c78cc00 rgw: rectify 202 Accepted in response for PUT on existing bucket.
+6417e8e rpm: add missing Java conditionals
+3728477 Add rpm conditionals : cephfs_java
+8f78001 ceph.spec.in: SUSE/openSUSE builds need libbz2-devel
+4eb58ad ceph.spec.in: use _udevrulesdir to eliminate conditionals
7f1c0cc crush/CrushTester: return EINVAL if crushtool returns non-zero
2aaeea1 tests: TEST_crush_reject_empty must not run a mon
80afb81 ceph-helpers: implement test_expect_failure
@@ -1860,7 +2071,7 @@ e8e27a8 (origin/wip-10296) unittest_blkdev: test an abbreviated /sys/block dir
5e454a8 common/blkdev: add simple sandboxing function for testing
9b26de3 ReplicatedPG: fail a non-blocking flush if the object is being scrubbed
dce6f28 ReplicatedPG::scan_range: an object can disappear between the list and the attr get
-6110220 (origin/wip-aarch64) debian: enable libgoogle-perftools-dev on arm64
+6110220 debian: enable libgoogle-perftools-dev on arm64
2246dca common/blkdev: fix block device discard check
25e3783 common/blkdev: get_block_device_base
beaa04e mon: MonitorDBStore: allow randomly injecting random delays on writes
@@ -4741,7 +4952,7 @@ f31e4c8 (origin/wip-da-update-libs3) libs3: update to latest git master of ceph/
23b657c Remove unused variables in KeyValueStore.cc
307ba48 Remove unused variables in MemStore.cc
5185a36 (origin/wip-autotools-dummy) automake: add dummy.cc to fix 'make tags'
-35509d2 bloom_filter, add test to validate assignement operator
+35509d274 bloom_filter, add test to validate assignement operator
c50f85e bloom_filter, remove unecessary operators
90cc6dd bloom_filter, add assertion to test validate element_count()
c323c5b Fix keyvaluestore fiemap bug
@@ -18599,7 +18810,7 @@ dd31ff2 doc: add short section on documenting code
590520c doc: fix rados_version todo formatting
50c9cb1 doc: add a prefix to group names in librados.h
d9d9e6d doc: Put rados_ioctx_locator_set_key in a group so it can be cross-referenced
-b464b75 doc: move rados_ioctx_get_id to the pool group
+b464b757 doc: move rados_ioctx_get_id to the pool group
b148bef doc: fix some typos in librados C API
c960641 doc: Switch doxygen integration from breathe to asphyxiate.
78cc07f librados: Avoid using "crush_rule" as name of function argument.
@@ -20812,7 +21023,7 @@ cbeedeb proflogger: Unlink our UNIX domain sockets on exit
adafec4 test/proflogger.cc: read length of message first
f8b4aa3 ProfLogger: write out length of message first
325951d test/proflogger: Add TeardownSetup and SimpleTest
-134a680 Add test/proflogger.cc, fix ProfLogger::init()
+134a680a Add test/proflogger.cc, fix ProfLogger::init()
5517b8f Rework ProfLogger
6424149 osd: remove unused variables
d07c480 mon: remove unused variables
@@ -22037,7 +22248,7 @@ e37878e mds: fix discover tid assignment
6025dee osd: move watch/notify effects out of do_osd_ops
0aeab99 obsync: implement RadosStore
ccf11fb osd: mention invalid snapc in log
-896de0ac osd: include (some) osd op flags in MOSDOp print method
+896de0a osd: include (some) osd op flags in MOSDOp print method
b08ee2c osd: add RWORDERED osd op flag
a44065d radostool: fix getxattr / setxattr return code
9c2f0f0 rbd: make showmapped output a bit prettier
@@ -22604,7 +22815,7 @@ cae43fc Makefile: drop libradosgw_a LDFLAGS
32fce3c rados_create: correctly handle null id
f06f4ee librados: always call keyring_init in connect
586fc66 librados: don't call keyring_init in init_internal
-9e1828a objecter: make response_data bufferlist static
+9e1828af objecter: make response_data bufferlist static
251fd50 rados_create_internal calls keyring_init
c548976 rados_create: set id based on parameter
b1c3321 librados: add rados_create_internal
@@ -26693,7 +26904,7 @@ ba515fe mkcephfs: generate cephx keys during mkfs
329178d mount: set flags when getting -o sync
6ea3030 mds: fix dumpcache
6285b61 authtool: only create keyring if --create-keyring (or -c)
-f40957e config: rename 'keys file' to 'keyring'
+f40957eb config: rename 'keys file' to 'keyring'
3ebf9a4 filestore: optionally checkpoint with snaps
5bdb348 journal: make sure max_size is multiple of block_size
54898b3 mds: print setattr'd values with MClientRequest
diff --git a/Makefile.in b/Makefile.in
index 6de298a..7f8b69d 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -252,6 +252,7 @@ GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
GIT_CHECK = @GIT_CHECK@
GREP = @GREP@
HAVE_CXX11 = @HAVE_CXX11@
+HAVE_VALGRIND = @HAVE_VALGRIND@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
diff --git a/ceph.spec b/ceph.spec
index c00f449..f52d569 100644
--- a/ceph.spec
+++ b/ceph.spec
@@ -1,15 +1,18 @@
%bcond_with ocf
+%bcond_without cephfs_java
%if ! (0%{?fedora} > 12 || 0%{?rhel} > 5)
%{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
%{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
%endif
+%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
+
#################################################################################
# common
#################################################################################
Name: ceph
-Version: 0.94.3
+Version: 0.94.4
Release: 0%{?dist}
Epoch: 1
Summary: User space components of the Ceph file system
@@ -28,7 +31,6 @@ Requires: python-rados = %{epoch}:%{version}-%{release}
Requires: python-rbd = %{epoch}:%{version}-%{release}
Requires: python-cephfs = %{epoch}:%{version}-%{release}
Requires: python
-Requires: python-argparse
Requires: python-requests
Requires: python-flask
Requires: xfsprogs
@@ -39,7 +41,9 @@ Requires: cryptsetup
Requires(post): binutils
BuildRequires: gcc-c++
BuildRequires: boost-devel
-%if ! 0%{defined suse_version}
+%if 0%{defined suse_version}
+BuildRequires: libbz2-devel
+%else
BuildRequires: bzip2-devel
%endif
BuildRequires: cryptsetup
@@ -59,18 +63,15 @@ BuildRequires: perl
BuildRequires: parted
BuildRequires: pkgconfig
BuildRequires: python
-BuildRequires: python-argparse
BuildRequires: python-nose
BuildRequires: python-requests
BuildRequires: python-virtualenv
+BuildRequires: snappy-devel
BuildRequires: util-linux
BuildRequires: xfsprogs
BuildRequires: xfsprogs-devel
BuildRequires: xmlstarlet
BuildRequires: yasm
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora} || 0%{?suse_version}
-BuildRequires: snappy-devel
-%endif
%if 0%{?suse_version}
BuildRequires: net-tools
%endif
@@ -95,7 +96,6 @@ BuildRequires: %insserv_prereq
BuildRequires: mozilla-nss-devel
BuildRequires: keyutils-devel
BuildRequires: libatomic-ops-devel
-BuildRequires: fdupes
%else
Requires: gdisk
BuildRequires: nss-devel
@@ -126,12 +126,14 @@ Requires: python-rados = %{epoch}:%{version}-%{release}
Requires: python-rbd = %{epoch}:%{version}-%{release}
Requires: python-cephfs = %{epoch}:%{version}-%{release}
Requires: python-requests
-%if 0%{defined suse_version}
-Requires: python-argparse
-%endif
%if 0%{?rhel} || 0%{?fedora}
Requires: redhat-lsb-core
%endif
+# python-argparse is only needed in distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+Requires: python-argparse
+BuildRequires: python-argparse
+%endif
%description -n ceph-common
Common utilities to mount and interact with a ceph storage cluster.
@@ -161,10 +163,10 @@ Requires: librados2 = %{epoch}:%{version}-%{release}
%if 0%{defined suse_version}
BuildRequires: libexpat-devel
BuildRequires: FastCGI-devel
-Requires: apache2-mod_fcgid
%else
BuildRequires: expat-devel
BuildRequires: fcgi-devel
+Requires: mailcap
%endif
%description radosgw
This package is an S3 HTTP REST gateway for the RADOS object store. It
@@ -213,9 +215,6 @@ Group: System Environment/Libraries
License: LGPL-2.0
Requires: librados2 = %{epoch}:%{version}-%{release}
Obsoletes: python-ceph < %{epoch}:%{version}-%{release}
-%if 0%{defined suse_version}
-%py_requires
-%endif
%description -n python-rados
This package contains Python libraries for interacting with Cephs RADOS
object store.
@@ -333,6 +332,8 @@ BuildRequires: libbabeltrace-devel
%description -n ceph-test
This package contains Ceph benchmarks and test tools.
+%if 0%{with cephfs_java}
+
%package -n libcephfs_jni1
Summary: Java Native Interface library for CephFS Java bindings.
Group: System Environment/Libraries
@@ -372,6 +373,8 @@ BuildRequires: junit
%description -n cephfs-java
This package contains the Java libraries for the Ceph File System.
+%endif
+
%package libs-compat
Summary: Meta package to include ceph libraries.
Group: System Environment/Libraries
@@ -399,7 +402,9 @@ Requires: librados2-devel = %{epoch}:%{version}-%{release}
Requires: libradosstriper1-devel = %{epoch}:%{version}-%{release}
Requires: librbd1-devel = %{epoch}:%{version}-%{release}
Requires: libcephfs1-devel = %{epoch}:%{version}-%{release}
+%if 0%{with cephfs_java}
Requires: libcephfs_jni1-devel = %{epoch}:%{version}-%{release}
+%endif
Provides: ceph-devel
%description devel-compat
This is a compatibility package to accommodate ceph-devel split into
@@ -436,10 +441,12 @@ python-cephfs instead.
%endif
%build
+%if 0%{with cephfs_java}
# Find jni.h
for i in /usr/{lib64,lib}/jvm/java/include{,/linux}; do
[ -d $i ] && java_inc="$java_inc -I$i"
done
+%endif
./autogen.sh
MY_CONF_OPT=""
@@ -457,7 +464,9 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
--without-cryptopp \
--with-rest-bench \
--with-debug \
+%if 0%{with cephfs_java}
--enable-cephfs-java \
+%endif
--with-librocksdb-static=check \
$MY_CONF_OPT \
%{?_with_ocf} \
@@ -479,7 +488,7 @@ make DESTDIR=$RPM_BUILD_ROOT install
find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
-install -D src/init-radosgw.sysv $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
mkdir -p $RPM_BUILD_ROOT%{_sbindir}
@@ -497,13 +506,8 @@ install -m 0644 -D etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds %{buildro
%endif
# udev rules
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%else
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%endif
+install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT%{_udevrulesdir}/50-rbd.rules
+install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
%if (0%{?rhel} && 0%{?rhel} < 7)
install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
@@ -529,12 +533,6 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
-%if %{defined suse_version}
-# Fedora seems to have some problems with this macro, use it only on SUSE
-%fdupes -s $RPM_BUILD_ROOT/%{python_sitelib}
-%fdupes %buildroot
-%endif
-
%clean
rm -rf $RPM_BUILD_ROOT
@@ -615,13 +613,8 @@ fi
%{_libdir}/rados-classes/libcls_version.so*
%dir %{_libdir}/ceph/erasure-code
%{_libdir}/ceph/erasure-code/libec_*.so*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/usr/lib/udev/rules.d/95-ceph-osd.rules
-%else
-/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/lib/udev/rules.d/95-ceph-osd.rules
-%endif
+%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+%{_udevrulesdir}/95-ceph-osd.rules
%config %{_sysconfdir}/bash_completion.d/ceph
%config(noreplace) %{_sysconfdir}/logrotate.d/ceph
%if 0%{?suse_version}
@@ -687,11 +680,7 @@ fi
%config(noreplace) %{_sysconfdir}/ceph/rbdmap
%{_initrddir}/rbdmap
%{python_sitelib}/ceph_argparse.py*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/50-rbd.rules
-%else
-/lib/udev/rules.d/50-rbd.rules
-%endif
+%{_udevrulesdir}/50-rbd.rules
%postun -n ceph-common
# Package removal cleanup
@@ -904,6 +893,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
%endif
#################################################################################
+%if 0%{with cephfs_java}
%files -n libcephfs_jni1
%defattr(-,root,root,-)
%{_libdir}/libcephfs_jni.so.*
@@ -918,6 +908,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
%defattr(-,root,root,-)
%{_javadir}/libcephfs.jar
%{_javadir}/libcephfs-test.jar
+%endif
#################################################################################
%files libs-compat
diff --git a/ceph.spec.in b/ceph.spec.in
index b36a0b9..140e0e3 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -1,10 +1,13 @@
%bcond_with ocf
+%bcond_without cephfs_java
%if ! (0%{?fedora} > 12 || 0%{?rhel} > 5)
%{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
%{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
%endif
+%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
+
#################################################################################
# common
#################################################################################
@@ -28,7 +31,6 @@ Requires: python-rados = %{epoch}:%{version}-%{release}
Requires: python-rbd = %{epoch}:%{version}-%{release}
Requires: python-cephfs = %{epoch}:%{version}-%{release}
Requires: python
-Requires: python-argparse
Requires: python-requests
Requires: python-flask
Requires: xfsprogs
@@ -39,7 +41,9 @@ Requires: cryptsetup
Requires(post): binutils
BuildRequires: gcc-c++
BuildRequires: boost-devel
-%if ! 0%{defined suse_version}
+%if 0%{defined suse_version}
+BuildRequires: libbz2-devel
+%else
BuildRequires: bzip2-devel
%endif
BuildRequires: cryptsetup
@@ -59,18 +63,15 @@ BuildRequires: perl
BuildRequires: parted
BuildRequires: pkgconfig
BuildRequires: python
-BuildRequires: python-argparse
BuildRequires: python-nose
BuildRequires: python-requests
BuildRequires: python-virtualenv
+BuildRequires: snappy-devel
BuildRequires: util-linux
BuildRequires: xfsprogs
BuildRequires: xfsprogs-devel
BuildRequires: xmlstarlet
BuildRequires: yasm
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora} || 0%{?suse_version}
-BuildRequires: snappy-devel
-%endif
%if 0%{?suse_version}
BuildRequires: net-tools
%endif
@@ -95,7 +96,6 @@ BuildRequires: %insserv_prereq
BuildRequires: mozilla-nss-devel
BuildRequires: keyutils-devel
BuildRequires: libatomic-ops-devel
-BuildRequires: fdupes
%else
Requires: gdisk
BuildRequires: nss-devel
@@ -126,12 +126,14 @@ Requires: python-rados = %{epoch}:%{version}-%{release}
Requires: python-rbd = %{epoch}:%{version}-%{release}
Requires: python-cephfs = %{epoch}:%{version}-%{release}
Requires: python-requests
-%if 0%{defined suse_version}
-Requires: python-argparse
-%endif
%if 0%{?rhel} || 0%{?fedora}
Requires: redhat-lsb-core
%endif
+# python-argparse is only needed in distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+Requires: python-argparse
+BuildRequires: python-argparse
+%endif
%description -n ceph-common
Common utilities to mount and interact with a ceph storage cluster.
@@ -161,10 +163,10 @@ Requires: librados2 = %{epoch}:%{version}-%{release}
%if 0%{defined suse_version}
BuildRequires: libexpat-devel
BuildRequires: FastCGI-devel
-Requires: apache2-mod_fcgid
%else
BuildRequires: expat-devel
BuildRequires: fcgi-devel
+Requires: mailcap
%endif
%description radosgw
This package is an S3 HTTP REST gateway for the RADOS object store. It
@@ -213,9 +215,6 @@ Group: System Environment/Libraries
License: LGPL-2.0
Requires: librados2 = %{epoch}:%{version}-%{release}
Obsoletes: python-ceph < %{epoch}:%{version}-%{release}
-%if 0%{defined suse_version}
-%py_requires
-%endif
%description -n python-rados
This package contains Python libraries for interacting with Cephs RADOS
object store.
@@ -333,6 +332,8 @@ BuildRequires: libbabeltrace-devel
%description -n ceph-test
This package contains Ceph benchmarks and test tools.
+%if 0%{with cephfs_java}
+
%package -n libcephfs_jni1
Summary: Java Native Interface library for CephFS Java bindings.
Group: System Environment/Libraries
@@ -372,6 +373,8 @@ BuildRequires: junit
%description -n cephfs-java
This package contains the Java libraries for the Ceph File System.
+%endif
+
%package libs-compat
Summary: Meta package to include ceph libraries.
Group: System Environment/Libraries
@@ -399,7 +402,9 @@ Requires: librados2-devel = %{epoch}:%{version}-%{release}
Requires: libradosstriper1-devel = %{epoch}:%{version}-%{release}
Requires: librbd1-devel = %{epoch}:%{version}-%{release}
Requires: libcephfs1-devel = %{epoch}:%{version}-%{release}
+%if 0%{with cephfs_java}
Requires: libcephfs_jni1-devel = %{epoch}:%{version}-%{release}
+%endif
Provides: ceph-devel
%description devel-compat
This is a compatibility package to accommodate ceph-devel split into
@@ -436,10 +441,12 @@ python-cephfs instead.
%endif
%build
+%if 0%{with cephfs_java}
# Find jni.h
for i in /usr/{lib64,lib}/jvm/java/include{,/linux}; do
[ -d $i ] && java_inc="$java_inc -I$i"
done
+%endif
./autogen.sh
MY_CONF_OPT=""
@@ -457,7 +464,9 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
--without-cryptopp \
--with-rest-bench \
--with-debug \
+%if 0%{with cephfs_java}
--enable-cephfs-java \
+%endif
--with-librocksdb-static=check \
$MY_CONF_OPT \
%{?_with_ocf} \
@@ -479,7 +488,7 @@ make DESTDIR=$RPM_BUILD_ROOT install
find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
-install -D src/init-radosgw.sysv $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
mkdir -p $RPM_BUILD_ROOT%{_sbindir}
@@ -497,13 +506,8 @@ install -m 0644 -D etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds %{buildro
%endif
# udev rules
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%else
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%endif
+install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT%{_udevrulesdir}/50-rbd.rules
+install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
%if (0%{?rhel} && 0%{?rhel} < 7)
install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
@@ -529,12 +533,6 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
-%if %{defined suse_version}
-# Fedora seems to have some problems with this macro, use it only on SUSE
-%fdupes -s $RPM_BUILD_ROOT/%{python_sitelib}
-%fdupes %buildroot
-%endif
-
%clean
rm -rf $RPM_BUILD_ROOT
@@ -615,13 +613,8 @@ fi
%{_libdir}/rados-classes/libcls_version.so*
%dir %{_libdir}/ceph/erasure-code
%{_libdir}/ceph/erasure-code/libec_*.so*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/usr/lib/udev/rules.d/95-ceph-osd.rules
-%else
-/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/lib/udev/rules.d/95-ceph-osd.rules
-%endif
+%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+%{_udevrulesdir}/95-ceph-osd.rules
%config %{_sysconfdir}/bash_completion.d/ceph
%config(noreplace) %{_sysconfdir}/logrotate.d/ceph
%if 0%{?suse_version}
@@ -687,11 +680,7 @@ fi
%config(noreplace) %{_sysconfdir}/ceph/rbdmap
%{_initrddir}/rbdmap
%{python_sitelib}/ceph_argparse.py*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/50-rbd.rules
-%else
-/lib/udev/rules.d/50-rbd.rules
-%endif
+%{_udevrulesdir}/50-rbd.rules
%postun -n ceph-common
# Package removal cleanup
@@ -904,6 +893,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
%endif
#################################################################################
+%if 0%{with cephfs_java}
%files -n libcephfs_jni1
%defattr(-,root,root,-)
%{_libdir}/libcephfs_jni.so.*
@@ -918,6 +908,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
%defattr(-,root,root,-)
%{_javadir}/libcephfs.jar
%{_javadir}/libcephfs-test.jar
+%endif
#################################################################################
%files libs-compat
diff --git a/configure b/configure
index 0efc087..4d5dc41 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for ceph 0.94.3.
+# Generated by GNU Autoconf 2.69 for ceph 0.94.4.
#
# Report bugs to <ceph-devel at vger.kernel.org>.
#
@@ -590,8 +590,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='ceph'
PACKAGE_TARNAME='ceph'
-PACKAGE_VERSION='0.94.3'
-PACKAGE_STRING='ceph 0.94.3'
+PACKAGE_VERSION='0.94.4'
+PACKAGE_STRING='ceph 0.94.4'
PACKAGE_BUGREPORT='ceph-devel at vger.kernel.org'
PACKAGE_URL=''
@@ -647,6 +647,9 @@ PYTHON_VERSION
PYTHON
WITH_BUILD_TESTS_FALSE
WITH_BUILD_TESTS_TRUE
+VALGRIND_ENABLED_FALSE
+VALGRIND_ENABLED_TRUE
+HAVE_VALGRIND
WITH_BABELTRACE_FALSE
WITH_BABELTRACE_TRUE
LTTNG_GEN_TP_PROG
@@ -967,6 +970,7 @@ with_libxfs
with_libzfs
with_lttng
with_babeltrace
+enable_valgrind
'
ac_precious_vars='build_alias
host_alias
@@ -1538,7 +1542,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures ceph 0.94.3 to adapt to many kinds of systems.
+\`configure' configures ceph 0.94.4 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1609,7 +1613,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of ceph 0.94.3:";;
+ short | recursive ) echo "Configuration of ceph 0.94.4:";;
esac
cat <<\_ACEOF
@@ -1636,6 +1640,7 @@ Optional Features:
--enable-pgrefdebugging enable pg ref debugging
--enable-cephfs-java build libcephfs Java bindings
--enable-xio build Ceph Accelio transport
+ --enable-valgrind enable valgrind unit tests
Optional Packages:
--with-PACKAGE[=ARG] use PACKAGE [ARG=yes]
@@ -1781,7 +1786,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-ceph configure 0.94.3
+ceph configure 0.94.4
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2857,7 +2862,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by ceph $as_me 0.94.3, which was
+It was created by ceph $as_me 0.94.4, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -4974,7 +4979,7 @@ fi
# Define the identity of the package.
PACKAGE='ceph'
- VERSION='0.94.3'
+ VERSION='0.94.4'
cat >>confdefs.h <<_ACEOF
@@ -12878,7 +12883,7 @@ fi
# Define the identity of the package.
PACKAGE='ceph'
- VERSION='0.94.3'
+ VERSION='0.94.4'
cat >>confdefs.h <<_ACEOF
@@ -20023,7 +20028,7 @@ else
JAVA_TEST=Test.java
CLASS_TEST=Test.class
cat << \EOF > $JAVA_TEST
-/* #line 20026 "configure" */
+/* #line 20031 "configure" */
public class Test {
}
EOF
@@ -23451,6 +23456,80 @@ fi
fi
+# Check whether --enable-valgrind was given.
+if test "${enable_valgrind+set}" = set; then :
+ enableval=$enable_valgrind; enable_valgrind=$enableval
+else
+ enable_valgrind=
+fi
+
+# Extract the first word of "valgrind", so it can be a program name with args.
+set dummy valgrind; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_HAVE_VALGRIND+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$HAVE_VALGRIND"; then
+ ac_cv_prog_HAVE_VALGRIND="$HAVE_VALGRIND" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+ ac_cv_prog_HAVE_VALGRIND="yes"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+fi
+fi
+HAVE_VALGRIND=$ac_cv_prog_HAVE_VALGRIND
+if test -n "$HAVE_VALGRIND"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $HAVE_VALGRIND" >&5
+$as_echo "$HAVE_VALGRIND" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+if test "x$HAVE_VALGRIND" = "x"; then :
+ if test "x$enable_valgrind" = "xyes"; then :
+ as_fn_error $? "valgrind not found" "$LINENO" 5
+fi
+elif test "x$enable_valgrind" = "x"; then :
+ enable_valgrind=yes
+fi
+
+ if test "x$enable_valgrind" = "xyes"; then
+ VALGRIND_ENABLED_TRUE=
+ VALGRIND_ENABLED_FALSE='#'
+else
+ VALGRIND_ENABLED_TRUE='#'
+ VALGRIND_ENABLED_FALSE=
+fi
+
+if test "x$enable_valgrind" = "xyes"; then
+ for ac_header in valgrind/helgrind.h
+do :
+ ac_fn_c_check_header_mongrel "$LINENO" "valgrind/helgrind.h" "ac_cv_header_valgrind_helgrind_h" "$ac_includes_default"
+if test "x$ac_cv_header_valgrind_helgrind_h" = xyes; then :
+ cat >>confdefs.h <<_ACEOF
+#define HAVE_VALGRIND_HELGRIND_H 1
+_ACEOF
+
+fi
+
+done
+
+fi
# Checks for typedefs, structures, and compiler characteristics.
@@ -24216,6 +24295,10 @@ if test -z "${WITH_BABELTRACE_TRUE}" && test -z "${WITH_BABELTRACE_FALSE}"; then
as_fn_error $? "conditional \"WITH_BABELTRACE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
+if test -z "${VALGRIND_ENABLED_TRUE}" && test -z "${VALGRIND_ENABLED_FALSE}"; then
+ as_fn_error $? "conditional \"VALGRIND_ENABLED\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
if test -z "${WITH_BUILD_TESTS_TRUE}" && test -z "${WITH_BUILD_TESTS_FALSE}"; then
as_fn_error $? "conditional \"WITH_BUILD_TESTS\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -24617,7 +24700,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by ceph $as_me 0.94.3, which was
+This file was extended by ceph $as_me 0.94.4, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -24683,7 +24766,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-ceph config.status 0.94.3
+ceph config.status 0.94.4
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index d90058d..13c087e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
# VERSION define is not used by the code. It gets a version string
# from 'git describe'; see src/ceph_ver.[ch]
-AC_INIT([ceph], [0.94.3], [ceph-devel at vger.kernel.org])
+AC_INIT([ceph], [0.94.4], [ceph-devel at vger.kernel.org])
# Create release string. Used with VERSION for RPMs.
RPM_RELEASE=0
@@ -1117,6 +1117,19 @@ AM_COND_IF([WITH_BABELTRACE], [
AC_MSG_ERROR([babeltrace/ctf/events.h not found (libbabeltrace-ctf-dev, libbabeltrace-devel)]))
])
+dnl check for valgrind
+AC_ARG_ENABLE([valgrind],
+ [AS_HELP_STRING([--enable-valgrind], [enable valgrind unit tests])],
+ [enable_valgrind=$enableval], [enable_valgrind=])
+AC_CHECK_PROG(HAVE_VALGRIND, valgrind, yes)
+AS_IF(
+ [test "x$HAVE_VALGRIND" = "x"], AS_IF([test "x$enable_valgrind" = "xyes"], [AC_MSG_ERROR([valgrind not found])]),
+ [test "x$enable_valgrind" = "x"], [enable_valgrind=yes])
+
+AM_CONDITIONAL([VALGRIND_ENABLED], [test "x$enable_valgrind" = "xyes"])
+if test "x$enable_valgrind" = "xyes"; then
+ AC_CHECK_HEADERS([valgrind/helgrind.h])
+fi
# Checks for typedefs, structures, and compiler characteristics.
diff --git a/man/Makefile.in b/man/Makefile.in
index 29a26d6..30f2088 100644
--- a/man/Makefile.in
+++ b/man/Makefile.in
@@ -254,6 +254,7 @@ GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
GIT_CHECK = @GIT_CHECK@
GREP = @GREP@
HAVE_CXX11 = @HAVE_CXX11@
+HAVE_VALGRIND = @HAVE_VALGRIND@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
diff --git a/src/.git_version b/src/.git_version
index 7a78c9a..1275375 100644
--- a/src/.git_version
+++ b/src/.git_version
@@ -1,2 +1,2 @@
-95cefea9fd9ab740263bf8bb4796fd864d9afe2b
-v0.94.3
+95292699291242794510b39ffde3f4df67898d3a
+v0.94.4
diff --git a/src/Makefile.am b/src/Makefile.am
index 6d686ee..b0f505a 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -70,7 +70,6 @@ EXTRA_DIST += \
$(srcdir)/ceph-osd-prestart.sh \
$(srcdir)/ceph_common.sh \
$(srcdir)/init-radosgw \
- $(srcdir)/init-radosgw.sysv \
$(srcdir)/init-rbdmap \
$(srcdir)/ceph-clsinfo \
$(srcdir)/make_version \
diff --git a/src/Makefile.in b/src/Makefile.in
index 503810d..3b60555 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -539,6 +539,7 @@ check_PROGRAMS = $(am__EXEEXT_44) $(am__EXEEXT_45) $(am__EXEEXT_46) \
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ rgw/rgw_metadata.h \
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ rgw/rgw_multi_del.h \
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ rgw/rgw_op.h \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ rgw/rgw_orphan.h \
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ rgw/rgw_http_client.h \
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ rgw/rgw_swift.h \
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ rgw/rgw_swift_auth.h \
@@ -3460,8 +3461,8 @@ ceph_test_librbd_api_OBJECTS = $(am_ceph_test_librbd_api_OBJECTS)
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_api_DEPENDENCIES = \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ $(LIBRBD) \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ $(LIBRADOS) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ $(LIBCOMMON) \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ $(am__DEPENDENCIES_15) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ $(am__DEPENDENCIES_10) \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ $(RADOS_TEST_LDADD) \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ $(am__append_163)
ceph_test_librbd_api_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
@@ -4041,8 +4042,9 @@ radosgw_OBJECTS = $(am_radosgw_OBJECTS)
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ $(am__DEPENDENCIES_18) \
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ $(am__DEPENDENCIES_1) \
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ $(am__DEPENDENCIES_10)
-am__radosgw_admin_SOURCES_DIST = rgw/rgw_admin.cc
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am_radosgw_admin_OBJECTS = rgw/rgw_admin.$(OBJEXT)
+am__radosgw_admin_SOURCES_DIST = rgw/rgw_admin.cc rgw/rgw_orphan.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at am_radosgw_admin_OBJECTS = rgw/rgw_admin.$(OBJEXT) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ rgw/rgw_orphan.$(OBJEXT)
radosgw_admin_OBJECTS = $(am_radosgw_admin_OBJECTS)
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_admin_DEPENDENCIES = $(am__DEPENDENCIES_16) \
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ $(am__DEPENDENCIES_18) \
@@ -5980,9 +5982,9 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
common/linux_version.h common/module.h common/Continuation.h \
common/Readahead.h common/Cycles.h common/Initialize.h \
common/ContextCompletion.h common/bit_vector.hpp \
- common/address_helper.h common/secret.h msg/Connection.h \
- msg/Dispatcher.h msg/Message.h msg/Messenger.h \
- msg/SimplePolicyMessenger.h msg/msg_types.h \
+ common/valgrind.h common/address_helper.h common/secret.h \
+ msg/Connection.h msg/Dispatcher.h msg/Message.h \
+ msg/Messenger.h msg/SimplePolicyMessenger.h msg/msg_types.h \
msg/simple/Accepter.h msg/simple/DispatchQueue.h \
msg/simple/Pipe.h msg/simple/PipeConnection.h \
msg/simple/SimpleMessenger.h msg/async/AsyncConnection.h \
@@ -6096,12 +6098,12 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
rgw/rgw_string.h rgw/rgw_formats.h rgw/rgw_http_errors.h \
rgw/rgw_log.h rgw/rgw_loadgen.h rgw/rgw_multi.h \
rgw/rgw_policy_s3.h rgw/rgw_gc.h rgw/rgw_metadata.h \
- rgw/rgw_multi_del.h rgw/rgw_op.h rgw/rgw_http_client.h \
- rgw/rgw_swift.h rgw/rgw_swift_auth.h rgw/rgw_quota.h \
- rgw/rgw_rados.h rgw/rgw_replica_log.h rgw/rgw_resolve.h \
- rgw/rgw_rest.h rgw/rgw_rest_swift.h rgw/rgw_rest_s3.h \
- rgw/rgw_auth_s3.h rgw/rgw_rest_admin.h rgw/rgw_rest_usage.h \
- rgw/rgw_rest_user.h rgw/rgw_rest_bucket.h \
+ rgw/rgw_multi_del.h rgw/rgw_op.h rgw/rgw_orphan.h \
+ rgw/rgw_http_client.h rgw/rgw_swift.h rgw/rgw_swift_auth.h \
+ rgw/rgw_quota.h rgw/rgw_rados.h rgw/rgw_replica_log.h \
+ rgw/rgw_resolve.h rgw/rgw_rest.h rgw/rgw_rest_swift.h \
+ rgw/rgw_rest_s3.h rgw/rgw_auth_s3.h rgw/rgw_rest_admin.h \
+ rgw/rgw_rest_usage.h rgw/rgw_rest_user.h rgw/rgw_rest_bucket.h \
rgw/rgw_rest_client.h rgw/rgw_rest_conn.h rgw/rgw_tools.h \
rgw/rgw_rest_metadata.h rgw/rgw_rest_log.h \
rgw/rgw_rest_opstate.h rgw/rgw_rest_replica_log.h \
@@ -6455,6 +6457,7 @@ GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
GIT_CHECK = @GIT_CHECK@
GREP = @GREP@
HAVE_CXX11 = @HAVE_CXX11@
+HAVE_VALGRIND = @HAVE_VALGRIND@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -6629,10 +6632,10 @@ EXTRA_DIST = $(am__append_21) \
$(srcdir)/verify-mds-journal.sh $(srcdir)/vstart.sh \
$(srcdir)/stop.sh ceph-run $(srcdir)/ceph-osd-prestart.sh \
$(srcdir)/ceph_common.sh $(srcdir)/init-radosgw \
- $(srcdir)/init-radosgw.sysv $(srcdir)/init-rbdmap \
- $(srcdir)/ceph-clsinfo $(srcdir)/make_version \
- $(srcdir)/check_version $(srcdir)/.git_version \
- $(srcdir)/ceph-rbdnamer $(srcdir)/test/encoding/readable.sh \
+ $(srcdir)/init-rbdmap $(srcdir)/ceph-clsinfo \
+ $(srcdir)/make_version $(srcdir)/check_version \
+ $(srcdir)/.git_version $(srcdir)/ceph-rbdnamer \
+ $(srcdir)/test/encoding/readable.sh \
$(srcdir)/upstart/ceph-all.conf \
$(srcdir)/upstart/ceph-mon.conf \
$(srcdir)/upstart/ceph-mon-all.conf \
@@ -6771,9 +6774,9 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
common/linux_version.h common/module.h common/Continuation.h \
common/Readahead.h common/Cycles.h common/Initialize.h \
common/ContextCompletion.h common/bit_vector.hpp \
- $(am__append_79) common/secret.h msg/Connection.h \
- msg/Dispatcher.h msg/Message.h msg/Messenger.h \
- msg/SimplePolicyMessenger.h msg/msg_types.h \
+ common/valgrind.h $(am__append_79) common/secret.h \
+ msg/Connection.h msg/Dispatcher.h msg/Message.h \
+ msg/Messenger.h msg/SimplePolicyMessenger.h msg/msg_types.h \
msg/simple/Accepter.h msg/simple/DispatchQueue.h \
msg/simple/Pipe.h msg/simple/PipeConnection.h \
msg/simple/SimpleMessenger.h msg/async/AsyncConnection.h \
@@ -7673,7 +7676,7 @@ librbd_types_la_SOURCES = \
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_CFLAGS = -I$(srcdir)/civetweb/include
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_LDADD = $(LIBRGW) $(LIBCIVETWEB) $(LIBRGW_DEPS) $(RESOLV_LIBS) $(CEPH_GLOBAL)
- at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_admin_SOURCES = rgw/rgw_admin.cc
+ at ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_admin_SOURCES = rgw/rgw_admin.cc rgw/rgw_orphan.cc
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at radosgw_admin_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_rgw_multiparser_SOURCES = rgw/rgw_multiparser.cc
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE at ceph_rgw_multiparser_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
@@ -8308,8 +8311,8 @@ librbd_types_la_SOURCES = \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_api_LDADD = \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ $(LIBRBD) \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ $(LIBRADOS) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ $(LIBCOMMON) \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ $(UNITTEST_LDADD) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ $(CEPH_GLOBAL) \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ $(RADOS_TEST_LDADD) \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ $(am__append_163)
@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE at ceph_test_librbd_fsx_SOURCES = test/librbd/fsx.cc
@@ -12135,6 +12138,8 @@ radosgw$(EXEEXT): $(radosgw_OBJECTS) $(radosgw_DEPENDENCIES) $(EXTRA_radosgw_DEP
$(AM_V_CXXLD)$(CXXLINK) $(radosgw_OBJECTS) $(radosgw_LDADD) $(LIBS)
rgw/rgw_admin.$(OBJEXT): rgw/$(am__dirstamp) \
rgw/$(DEPDIR)/$(am__dirstamp)
+rgw/rgw_orphan.$(OBJEXT): rgw/$(am__dirstamp) \
+ rgw/$(DEPDIR)/$(am__dirstamp)
radosgw-admin$(EXEEXT): $(radosgw_admin_OBJECTS) $(radosgw_admin_DEPENDENCIES) $(EXTRA_radosgw_admin_DEPENDENCIES)
@rm -f radosgw-admin$(EXEEXT)
@@ -14204,6 +14209,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_loadgen.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_main.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_multiparser.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_orphan.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_replica_log.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_resolve.Po at am__quote@
@AMDEP_TRUE@@am__include@ @am__quote at rgw/$(DEPDIR)/rgw_rest.Po at am__quote@
diff --git a/src/acconfig.h.in b/src/acconfig.h.in
index 8caa7ef..2e8dbfd 100644
--- a/src/acconfig.h.in
+++ b/src/acconfig.h.in
@@ -332,6 +332,9 @@
/* Define to 1 if you have the <utime.h> header file. */
#undef HAVE_UTIME_H
+/* Define to 1 if you have the <valgrind/helgrind.h> header file. */
+#undef HAVE_VALGRIND_HELGRIND_H
+
/* Accelio conditional compilation */
#undef HAVE_XIO
diff --git a/src/auth/cephx/CephxClientHandler.cc b/src/auth/cephx/CephxClientHandler.cc
index b6d3501..ff32a42 100644
--- a/src/auth/cephx/CephxClientHandler.cc
+++ b/src/auth/cephx/CephxClientHandler.cc
@@ -40,7 +40,11 @@ int CephxClientHandler::build_request(bufferlist& bl) const
::encode(header, bl);
CryptoKey secret;
- keyring->get_secret(cct->_conf->name, secret);
+ const bool got = keyring->get_secret(cct->_conf->name, secret);
+ if (!got) {
+ ldout(cct, 20) << "no secret found for entity: " << cct->_conf->name << dendl;
+ return -ENOENT;
+ }
CephXAuthenticate req;
get_random_bytes((char *)&req.client_challenge, sizeof(req.client_challenge));
@@ -113,7 +117,11 @@ int CephxClientHandler::handle_response(int ret, bufferlist::iterator& indata)
{
ldout(cct, 10) << " get_auth_session_key" << dendl;
CryptoKey secret;
- keyring->get_secret(cct->_conf->name, secret);
+ const bool got = keyring->get_secret(cct->_conf->name, secret);
+ if (!got) {
+ ldout(cct, 0) << "key not found for " << cct->_conf->name << dendl;
+ return -ENOENT;
+ }
if (!tickets.verify_service_ticket_reply(secret, indata)) {
ldout(cct, 0) << "could not verify service_ticket reply" << dendl;
@@ -150,7 +158,11 @@ int CephxClientHandler::handle_response(int ret, bufferlist::iterator& indata)
if (rotating_secrets) {
RotatingSecrets secrets;
CryptoKey secret_key;
- keyring->get_secret(cct->_conf->name, secret_key);
+ const bool got = keyring->get_secret(cct->_conf->name, secret_key);
+ if (!got) {
+ ldout(cct, 0) << "key not found for " << cct->_conf->name << dendl;
+ return -ENOENT;
+ }
std::string error;
if (decode_decrypt(cct, secrets, secret_key, indata, error)) {
ldout(cct, 0) << "could not set rotating key: decode_decrypt failed. error:"
diff --git a/src/ceph-disk b/src/ceph-disk
index 61a28fd..4a48520 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -1039,6 +1039,9 @@ def zap(dev):
"""
Destroy the partition table and content of a given disk.
"""
+ dmode = os.stat(dev).st_mode
+ if not stat.S_ISBLK(dmode) or is_partition(dev):
+ raise Error('not full block device; cannot zap', dev)
try:
LOG.debug('Zapping partition table on %s', dev)
@@ -1501,10 +1504,7 @@ def main_prepare(args):
verify_not_in_use(args.journal, False)
if args.zap_disk is not None:
- if stat.S_ISBLK(dmode) and not is_partition(args.data):
- zap(args.data)
- else:
- raise Error('not full block device; cannot zap', args.data)
+ zap(args.data)
if args.cluster_uuid is None:
args.cluster_uuid = get_fsid(cluster=args.cluster)
diff --git a/src/ceph.in b/src/ceph.in
index 2b6adf4..9f857ec 100755
--- a/src/ceph.in
+++ b/src/ceph.in
@@ -459,7 +459,7 @@ def new_style_command(parsed_args, cmdargs, target, sigdict, inbuf, verbose):
if ret:
ret = abs(ret)
print >> sys.stderr, \
- 'Error: {0} {1}'.format(ret, errno.errorcode[ret])
+ 'Error: {0} {1}'.format(ret, errno.errorcode.get(ret, 'Unknown'))
if outbuf:
print outbuf
if outs:
@@ -679,7 +679,7 @@ def main():
if len(childargs) >= 2 and \
childargs[0] in ['mon', 'osd'] and \
childargs[1] == 'tell':
- print >> sys.stderr, '"{0} tell" is deprecated; try "tell {0}.<id>" instead (id can be "*") '.format(childargs[0])
+ print >> sys.stderr, '"{0} tell" is deprecated; try "tell {0}.<id> <command> [options...]" instead (id can be "*") '.format(childargs[0])
return 1
if parsed_args.help:
@@ -794,7 +794,9 @@ def main():
childargs = injectargs
if not len(childargs):
print >> sys.stderr, \
- 'Cannot use \'tell\' with interactive mode'
+ 'Cannot use \'tell\' with interactive mode.', \
+ 'For an interactive shell,', \
+ 'please start "{0}" without non-option arguments.'.format(sys.argv[0])
return errno.EINVAL
# fetch JSON sigs from command
@@ -858,11 +860,11 @@ def main():
sigdict, inbuf, verbose)
if ret < 0:
ret = -ret
- print >> sys.stderr, prefix + 'Second attempt of previously successful command failed with {0}: {1}'.format(errno.errorcode[ret], outs)
+ print >> sys.stderr, prefix + 'Second attempt of previously successful command failed with {0}: {1}'.format(errno.errorcode.get(ret, 'Unknown'), outs)
if ret < 0:
ret = -ret
- print >> sys.stderr, prefix + 'Error {0}: {1}'.format(errno.errorcode[ret], outs)
+ print >> sys.stderr, prefix + 'Error {0}: {1}'.format(errno.errorcode.get(ret, 'Unknown'), outs)
if len(targets) > 1:
final_ret = ret
else:
diff --git a/src/civetweb/civetweb.h b/src/civetweb/civetweb.h
index 5da8a73..ea3ff0c 100644
--- a/src/civetweb/civetweb.h
+++ b/src/civetweb/civetweb.h
@@ -552,6 +552,9 @@ CIVETWEB_API char *mg_md5(char buf[33], ...);
CIVETWEB_API void mg_cry(struct mg_connection *conn,
PRINTF_FORMAT_STRING(const char *fmt), ...) PRINTF_ARGS(2, 3);
+/* set connection's http status */
+CIVETWEB_API void mg_set_http_status(struct mg_connection *conn, int status);
+
/* utility method to compare two buffers, case incensitive. */
CIVETWEB_API int mg_strncasecmp(const char *s1, const char *s2, size_t len);
diff --git a/src/civetweb/include/civetweb.h b/src/civetweb/include/civetweb.h
index 5da8a73..ea3ff0c 100644
--- a/src/civetweb/include/civetweb.h
+++ b/src/civetweb/include/civetweb.h
@@ -552,6 +552,9 @@ CIVETWEB_API char *mg_md5(char buf[33], ...);
CIVETWEB_API void mg_cry(struct mg_connection *conn,
PRINTF_FORMAT_STRING(const char *fmt), ...) PRINTF_ARGS(2, 3);
+/* set connection's http status */
+CIVETWEB_API void mg_set_http_status(struct mg_connection *conn, int status);
+
/* utility method to compare two buffers, case incensitive. */
CIVETWEB_API int mg_strncasecmp(const char *s1, const char *s2, size_t len);
diff --git a/src/civetweb/src/civetweb.c b/src/civetweb/src/civetweb.c
index c9dc3ff..967d853 100644
--- a/src/civetweb/src/civetweb.c
+++ b/src/civetweb/src/civetweb.c
@@ -1145,6 +1145,11 @@ void mg_cry(struct mg_connection *conn, const char *fmt, ...)
}
}
+void mg_set_http_status(struct mg_connection *conn, int status)
+{
+ conn->status_code = status;
+}
+
/* Return fake connection structure. Used for logging, if connection
is not applicable at the moment of logging. */
static struct mg_connection *fc(struct mg_context *ctx)
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 446f0d1..0d85db2 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -6041,8 +6041,12 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p)
++pd;
}
- string prev_name;
- while (!pd.end()) {
+ string dn_name;
+ while (true) {
+ if (!dirp->inode->is_complete_and_ordered())
+ return -EAGAIN;
+ if (pd.end())
+ break;
Dentry *dn = *pd;
if (dn->inode == NULL) {
ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
@@ -6065,6 +6069,8 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p)
if (pd.end())
next_off = dir_result_t::END;
+ dn_name = dn->name; // fill in name while we have lock
+
client_lock.Unlock();
int r = cb(p, &de, &st, stmask, next_off); // _next_ offset
client_lock.Lock();
@@ -6072,13 +6078,12 @@ int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p)
<< " = " << r
<< dendl;
if (r < 0) {
- dirp->next_offset = dn->offset;
- dirp->at_cache_name = prev_name;
+ dirp->next_offset = next_off - 1;
return r;
}
- prev_name = dn->name;
- dirp->offset = next_off;
+ dirp->next_offset = dirp->offset = next_off;
+ dirp->at_cache_name = dn_name; // we successfully returned this one; update!
if (r > 0)
return r;
}
@@ -7365,9 +7370,7 @@ int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf)
// async, caching, non-blocking.
r = objectcacher->file_write(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
- offset, size, bl, ceph_clock_now(cct), 0,
- client_lock);
-
+ offset, size, bl, ceph_clock_now(cct), 0);
put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
if (r < 0)
diff --git a/src/common/Cycles.cc b/src/common/Cycles.cc
index a2efcf3..b0b687e 100644
--- a/src/common/Cycles.cc
+++ b/src/common/Cycles.cc
@@ -52,6 +52,10 @@ void Cycles::init()
if (cycles_per_sec != 0)
return;
+ // Skip initialization if rdtsc is not implemented
+ if (rdtsc() == 0)
+ return;
+
// Compute the frequency of the fine-grained CPU timer: to do this,
// take parallel time readings using both rdtsc and gettimeofday.
// After 10ms have elapsed, take the ratio between these readings.
diff --git a/src/common/Makefile.am b/src/common/Makefile.am
index 78afd5e..620e550 100644
--- a/src/common/Makefile.am
+++ b/src/common/Makefile.am
@@ -233,7 +233,8 @@ noinst_HEADERS += \
common/Cycles.h \
common/Initialize.h \
common/ContextCompletion.h \
- common/bit_vector.hpp
+ common/bit_vector.hpp \
+ common/valgrind.h
if ENABLE_XIO
noinst_HEADERS += \
diff --git a/src/common/Mutex.cc b/src/common/Mutex.cc
index a0c1202..808513e 100644
--- a/src/common/Mutex.cc
+++ b/src/common/Mutex.cc
@@ -17,14 +17,15 @@
#include "common/perf_counters.h"
#include "common/ceph_context.h"
#include "common/config.h"
+#include "include/stringify.h"
#include "include/utime.h"
#include "common/Clock.h"
-Mutex::Mutex(const char *n, bool r, bool ld,
+Mutex::Mutex(const std::string &n, bool r, bool ld,
bool bt,
CephContext *cct) :
- name(n), id(-1), recursive(r), lockdep(ld), backtrace(bt),
- nlock(0), locked_by(0), cct(cct), logger(0)
+ name(n), id(-1), recursive(r), lockdep(ld), backtrace(bt), nlock(0),
+ locked_by(0), cct(cct), logger(0)
{
if (cct) {
PerfCountersBuilder b(cct, string("mutex-") + name,
@@ -42,7 +43,7 @@ Mutex::Mutex(const char *n, bool r, bool ld,
pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
pthread_mutex_init(&_m,&attr);
pthread_mutexattr_destroy(&attr);
- if (g_lockdep)
+ if (lockdep && g_lockdep)
_register();
}
else if (lockdep) {
@@ -55,6 +56,7 @@ Mutex::Mutex(const char *n, bool r, bool ld,
pthread_mutexattr_init(&attr);
pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
pthread_mutex_init(&_m, &attr);
+ pthread_mutexattr_destroy(&attr);
if (g_lockdep)
_register();
}
@@ -74,6 +76,9 @@ Mutex::~Mutex() {
cct->get_perfcounters_collection()->remove(logger);
delete logger;
}
+ if (lockdep && g_lockdep) {
+ lockdep_unregister(id);
+ }
}
void Mutex::Lock(bool no_lockdep) {
diff --git a/src/common/Mutex.h b/src/common/Mutex.h
index 7581575..6a4e6b3 100644
--- a/src/common/Mutex.h
+++ b/src/common/Mutex.h
@@ -33,7 +33,7 @@ enum {
class Mutex {
private:
- const char *name;
+ std::string name;
int id;
bool recursive;
bool lockdep;
@@ -50,20 +50,20 @@ private:
Mutex(const Mutex &M);
void _register() {
- id = lockdep_register(name);
+ id = lockdep_register(name.c_str());
}
void _will_lock() { // about to lock
- id = lockdep_will_lock(name, id);
+ id = lockdep_will_lock(name.c_str(), id, backtrace);
}
void _locked() { // just locked
- id = lockdep_locked(name, id, backtrace);
+ id = lockdep_locked(name.c_str(), id, backtrace);
}
void _will_unlock() { // about to unlock
- id = lockdep_will_unlock(name, id);
+ id = lockdep_will_unlock(name.c_str(), id);
}
public:
- Mutex(const char *n, bool r = false, bool ld=true, bool bt=false,
+ Mutex(const std::string &n, bool r = false, bool ld=true, bool bt=false,
CephContext *cct = 0);
~Mutex();
bool is_locked() const {
diff --git a/src/common/RWLock.h b/src/common/RWLock.h
index 6f0ab8e..c82a23c 100644
--- a/src/common/RWLock.h
+++ b/src/common/RWLock.h
@@ -18,6 +18,7 @@
#define CEPH_RWLock_Posix__H
#include <pthread.h>
+#include <string>
#include <include/assert.h>
#include "lockdep.h"
#include "include/atomic.h"
@@ -25,17 +26,19 @@
class RWLock
{
mutable pthread_rwlock_t L;
- const char *name;
+ std::string name;
mutable int id;
mutable atomic_t nrlock, nwlock;
+ std::string unique_name(const char* name) const;
+
public:
RWLock(const RWLock& other);
const RWLock& operator=(const RWLock& other);
- RWLock(const char *n) : name(n), id(-1), nrlock(0), nwlock(0) {
+ RWLock(const std::string &n) : name(n), id(-1), nrlock(0), nwlock(0) {
pthread_rwlock_init(&L, NULL);
- if (g_lockdep) id = lockdep_register(name);
+ if (g_lockdep) id = lockdep_register(name.c_str());
}
bool is_locked() const {
@@ -50,6 +53,9 @@ public:
// the object and we assume that there are no other users.
assert(!is_locked());
pthread_rwlock_destroy(&L);
+ if (g_lockdep) {
+ lockdep_unregister(id);
+ }
}
void unlock(bool lockdep=true) const {
@@ -59,23 +65,23 @@ public:
assert(nrlock.read() > 0);
nrlock.dec();
}
- if (lockdep && g_lockdep) id = lockdep_will_unlock(name, id);
+ if (lockdep && g_lockdep) id = lockdep_will_unlock(name.c_str(), id);
int r = pthread_rwlock_unlock(&L);
assert(r == 0);
}
// read
void get_read() const {
- if (g_lockdep) id = lockdep_will_lock(name, id);
+ if (g_lockdep) id = lockdep_will_lock(name.c_str(), id);
int r = pthread_rwlock_rdlock(&L);
assert(r == 0);
- if (g_lockdep) id = lockdep_locked(name, id);
+ if (g_lockdep) id = lockdep_locked(name.c_str(), id);
nrlock.inc();
}
bool try_get_read() const {
if (pthread_rwlock_tryrdlock(&L) == 0) {
nrlock.inc();
- if (g_lockdep) id = lockdep_locked(name, id);
+ if (g_lockdep) id = lockdep_locked(name.c_str(), id);
return true;
}
return false;
@@ -86,16 +92,16 @@ public:
// write
void get_write(bool lockdep=true) {
- if (lockdep && g_lockdep) id = lockdep_will_lock(name, id);
+ if (lockdep && g_lockdep) id = lockdep_will_lock(name.c_str(), id);
int r = pthread_rwlock_wrlock(&L);
assert(r == 0);
- if (g_lockdep) id = lockdep_locked(name, id);
+ if (g_lockdep) id = lockdep_locked(name.c_str(), id);
nwlock.inc();
}
bool try_get_write(bool lockdep=true) {
if (pthread_rwlock_trywrlock(&L) == 0) {
- if (lockdep && g_lockdep) id = lockdep_locked(name, id);
+ if (lockdep && g_lockdep) id = lockdep_locked(name.c_str(), id);
nwlock.inc();
return true;
}
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
index d1b11b6..300ae7d 100644
--- a/src/common/WorkQueue.h
+++ b/src/common/WorkQueue.h
@@ -433,35 +433,36 @@ public:
}
};
-class ContextWQ : public ThreadPool::WorkQueueVal<Context *> {
+class ContextWQ : public ThreadPool::WorkQueueVal<std::pair<Context *, int> > {
public:
ContextWQ(const string &name, time_t ti, ThreadPool *tp)
- : ThreadPool::WorkQueueVal<Context *>(name, ti, 0, tp) {}
+ : ThreadPool::WorkQueueVal<std::pair<Context *, int> >(name, ti, 0, tp) {}
- void queue(Context *ctx) {
- ThreadPool::WorkQueueVal<Context *>::queue(ctx);
+ void queue(Context *ctx, int result = 0) {
+ ThreadPool::WorkQueueVal<std::pair<Context *, int> >::queue(
+ std::make_pair(ctx, result));
}
protected:
- virtual void _enqueue(Context *item) {
+ virtual void _enqueue(std::pair<Context *, int> item) {
_queue.push_back(item);
}
- virtual void _enqueue_front(Context *item) {
+ virtual void _enqueue_front(std::pair<Context *, int> item) {
_queue.push_front(item);
}
virtual bool _empty() {
return _queue.empty();
}
- virtual Context *_dequeue() {
- Context *item = _queue.front();
+ virtual std::pair<Context *, int> _dequeue() {
+ std::pair<Context *, int> item = _queue.front();
_queue.pop_front();
return item;
}
- virtual void _process(Context *item) {
- item->complete(0);
+ virtual void _process(std::pair<Context *, int> item) {
+ item.first->complete(item.second);
}
private:
- list<Context *> _queue;
+ list<std::pair<Context *, int> > _queue;
};
class ShardedThreadPool {
diff --git a/src/common/bit_vector.hpp b/src/common/bit_vector.hpp
index 55403c5..f66294b 100644
--- a/src/common/bit_vector.hpp
+++ b/src/common/bit_vector.hpp
@@ -261,7 +261,10 @@ void BitVector<_b>::get_data_extents(uint64_t offset, uint64_t length,
end_offset += (CEPH_PAGE_SIZE - (end_offset % CEPH_PAGE_SIZE));
assert(*byte_offset <= end_offset);
- *byte_length = MIN(end_offset - *byte_offset, m_data.length());
+ *byte_length = end_offset - *byte_offset;
+ if (*byte_offset + *byte_length > m_data.length()) {
+ *byte_length = m_data.length() - *byte_offset;
+ }
}
template <uint8_t _b>
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index 88656e8..502163b 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -1165,12 +1165,23 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
it != _buffers.end();
++it) {
if (p + it->length() > o) {
- if (p >= o && p+it->length() <= o+l)
- it->zero(); // all
- else if (p >= o)
- it->zero(0, o+l-p); // head
- else
- it->zero(o-p, it->length()-(o-p)); // tail
+ if (p >= o && p+it->length() <= o+l) {
+ // 'o'------------- l -----------|
+ // 'p'-- it->length() --|
+ it->zero();
+ } else if (p >= o) {
+ // 'o'------------- l -----------|
+ // 'p'------- it->length() -------|
+ it->zero(0, o+l-p);
+ } else if (p + it->length() <= o+l) {
+ // 'o'------------- l -----------|
+ // 'p'------- it->length() -------|
+ it->zero(o-p, it->length()-(o-p));
+ } else {
+ // 'o'----------- l -----------|
+ // 'p'---------- it->length() ----------|
+ it->zero(o-p, l);
+ }
}
p += it->length();
if (o+l <= p)
@@ -1195,6 +1206,10 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
void buffer::list::rebuild()
{
+ if (_len == 0) {
+ _buffers.clear();
+ return;
+ }
ptr nb;
if ((_len & ~CEPH_PAGE_MASK) == 0)
nb = buffer::create_page_aligned(_len);
@@ -1214,60 +1229,61 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
}
_memcopy_count += pos;
_buffers.clear();
- _buffers.push_back(nb);
+ if (nb.length())
+ _buffers.push_back(nb);
}
-void buffer::list::rebuild_aligned(unsigned align)
-{
- rebuild_aligned_size_and_memory(align, align);
-}
-
-void buffer::list::rebuild_aligned_size_and_memory(unsigned align_size,
- unsigned align_memory)
-{
- std::list<ptr>::iterator p = _buffers.begin();
- while (p != _buffers.end()) {
- // keep anything that's already align and sized aligned
- if (p->is_aligned(align_memory) && p->is_n_align_sized(align_size)) {
- /*cout << " segment " << (void*)p->c_str()
- << " offset " << ((unsigned long)p->c_str() & (align - 1))
- << " length " << p->length()
- << " " << (p->length() & (align - 1)) << " ok" << std::endl;
- */
- ++p;
- continue;
+ void buffer::list::rebuild_aligned(unsigned align)
+ {
+ rebuild_aligned_size_and_memory(align, align);
+ }
+
+ void buffer::list::rebuild_aligned_size_and_memory(unsigned align_size,
+ unsigned align_memory)
+ {
+ std::list<ptr>::iterator p = _buffers.begin();
+ while (p != _buffers.end()) {
+ // keep anything that's already align and sized aligned
+ if (p->is_aligned(align_memory) && p->is_n_align_sized(align_size)) {
+ /*cout << " segment " << (void*)p->c_str()
+ << " offset " << ((unsigned long)p->c_str() & (align - 1))
+ << " length " << p->length()
+ << " " << (p->length() & (align - 1)) << " ok" << std::endl;
+ */
+ ++p;
+ continue;
+ }
+
+ // consolidate unaligned items, until we get something that is sized+aligned
+ list unaligned;
+ unsigned offset = 0;
+ do {
+ /*cout << " segment " << (void*)p->c_str()
+ << " offset " << ((unsigned long)p->c_str() & (align - 1))
+ << " length " << p->length() << " " << (p->length() & (align - 1))
+ << " overall offset " << offset << " " << (offset & (align - 1))
+ << " not ok" << std::endl;
+ */
+ offset += p->length();
+ unaligned.push_back(*p);
+ _buffers.erase(p++);
+ } while (p != _buffers.end() &&
+ (!p->is_aligned(align_memory) ||
+ !p->is_n_align_sized(align_size) ||
+ (offset % align_size)));
+ if (!(unaligned.is_contiguous() && unaligned._buffers.front().is_aligned(align_memory))) {
+ ptr nb(buffer::create_aligned(unaligned._len, align_memory));
+ unaligned.rebuild(nb);
+ _memcopy_count += unaligned._len;
+ }
+ _buffers.insert(p, unaligned._buffers.front());
}
-
- // consolidate unaligned items, until we get something that is sized+aligned
- list unaligned;
- unsigned offset = 0;
- do {
- /*cout << " segment " << (void*)p->c_str()
- << " offset " << ((unsigned long)p->c_str() & (align - 1))
- << " length " << p->length() << " " << (p->length() & (align - 1))
- << " overall offset " << offset << " " << (offset & (align - 1))
- << " not ok" << std::endl;
- */
- offset += p->length();
- unaligned.push_back(*p);
- _buffers.erase(p++);
- } while (p != _buffers.end() &&
- (!p->is_aligned(align_memory) ||
- !p->is_n_align_sized(align_size) ||
- (offset % align_size)));
- if (!(unaligned.is_contiguous() && unaligned._buffers.front().is_aligned(align_memory))) {
- ptr nb(buffer::create_aligned(unaligned._len, align_memory));
- unaligned.rebuild(nb);
- _memcopy_count += unaligned._len;
- }
- _buffers.insert(p, unaligned._buffers.front());
}
-}
-
-void buffer::list::rebuild_page_aligned()
-{
- rebuild_aligned(CEPH_PAGE_SIZE);
-}
+
+ void buffer::list::rebuild_page_aligned()
+ {
+ rebuild_aligned(CEPH_PAGE_SIZE);
+ }
// sort-of-like-assignment-op
void buffer::list::claim(list& bl, unsigned int flags)
diff --git a/src/common/ceph_context.cc b/src/common/ceph_context.cc
index 79aff8b..50346ed 100644
--- a/src/common/ceph_context.cc
+++ b/src/common/ceph_context.cc
@@ -20,6 +20,7 @@
#include "common/perf_counters.h"
#include "common/Thread.h"
#include "common/ceph_context.h"
+#include "common/ceph_crypto.h"
#include "common/config.h"
#include "common/debug.h"
#include "common/HeartbeatMap.h"
@@ -39,6 +40,41 @@
using ceph::HeartbeatMap;
+namespace {
+
+class LockdepObs : public md_config_obs_t {
+public:
+ LockdepObs(CephContext *cct) : m_cct(cct), m_registered(false) {
+ }
+ virtual ~LockdepObs() {
+ if (m_registered) {
+ lockdep_unregister_ceph_context(m_cct);
+ }
+ }
+
+ const char** get_tracked_conf_keys() const {
+ static const char *KEYS[] = {"lockdep", NULL};
+ return KEYS;
+ }
+
+ void handle_conf_change(const md_config_t *conf,
+ const std::set <std::string> &changed) {
+ if (conf->lockdep && !m_registered) {
+ lockdep_register_ceph_context(m_cct);
+ m_registered = true;
+ } else if (!conf->lockdep && m_registered) {
+ lockdep_unregister_ceph_context(m_cct);
+ m_registered = false;
+ }
+ }
+private:
+ CephContext *m_cct;
+ bool m_registered;
+};
+
+
+} // anonymous namespace
+
class CephContextServiceThread : public Thread
{
public:
@@ -363,6 +399,7 @@ CephContext::CephContext(uint32_t module_type_)
_conf(new md_config_t()),
_log(NULL),
_module_type(module_type_),
+ _crypto_inited(false),
_service_thread(NULL),
_log_obs(NULL),
_admin_socket(NULL),
@@ -370,7 +407,8 @@ CephContext::CephContext(uint32_t module_type_)
_perf_counters_conf_obs(NULL),
_heartbeat_map(NULL),
_crypto_none(NULL),
- _crypto_aes(NULL)
+ _crypto_aes(NULL),
+ _lockdep_obs(NULL)
{
ceph_spin_init(&_service_thread_lock);
ceph_spin_init(&_associated_objs_lock);
@@ -385,6 +423,9 @@ CephContext::CephContext(uint32_t module_type_)
_cct_obs = new CephContextObs(this);
_conf->add_observer(_cct_obs);
+ _lockdep_obs = new LockdepObs(this);
+ _conf->add_observer(_lockdep_obs);
+
_perf_counters_collection = new PerfCountersCollection(this);
_admin_socket = new AdminSocket(this);
_heartbeat_map = new HeartbeatMap(this);
@@ -419,10 +460,6 @@ CephContext::~CephContext()
it != _associated_objs.end(); ++it)
delete it->second;
- if (_conf->lockdep) {
- lockdep_unregister_ceph_context(this);
- }
-
_admin_socket->unregister_command("perfcounters_dump");
_admin_socket->unregister_command("perf dump");
_admin_socket->unregister_command("1");
@@ -456,6 +493,10 @@ CephContext::~CephContext()
delete _cct_obs;
_cct_obs = NULL;
+ _conf->remove_observer(_lockdep_obs);
+ delete _lockdep_obs;
+ _lockdep_obs = NULL;
+
_log->stop();
delete _log;
_log = NULL;
@@ -467,6 +508,14 @@ CephContext::~CephContext()
delete _crypto_none;
delete _crypto_aes;
+ if (_crypto_inited)
+ ceph::crypto::shutdown();
+}
+
+void CephContext::init_crypto()
+{
+ ceph::crypto::init(this);
+ _crypto_inited = true;
}
void CephContext::start_service_thread()
diff --git a/src/common/ceph_context.h b/src/common/ceph_context.h
index a8dfec5..a9ffde0 100644
--- a/src/common/ceph_context.h
+++ b/src/common/ceph_context.h
@@ -77,6 +77,9 @@ public:
md_config_t *_conf;
ceph::log::Log *_log;
+ /* init ceph::crypto */
+ void init_crypto();
+
/* Start the Ceph Context's service thread */
void start_service_thread();
@@ -139,6 +142,8 @@ private:
uint32_t _module_type;
+ bool _crypto_inited;
+
/* libcommon service thread.
* SIGHUP wakes this thread, which then reopens logfiles */
friend class CephContextServiceThread;
@@ -173,6 +178,8 @@ private:
ceph_spinlock_t _feature_lock;
std::set<std::string> _experimental_features;
+ md_config_obs_t *_lockdep_obs;
+
friend class CephContextObs;
};
diff --git a/src/common/ceph_crypto.cc b/src/common/ceph_crypto.cc
index b81ffdf..f15ef09 100644
--- a/src/common/ceph_crypto.cc
+++ b/src/common/ceph_crypto.cc
@@ -12,6 +12,7 @@
*
*/
+#include "include/int_types.h"
#include "common/config.h"
#include "common/ceph_context.h"
#include "ceph_crypto.h"
@@ -37,28 +38,51 @@ ceph::crypto::HMACSHA1::~HMACSHA1()
#elif USE_NSS
-// Initialization of NSS requires a mutex due to a race condition in
-// NSS_NoDB_Init.
+// for SECMOD_RestartModules()
+#include <secmod.h>
+
static pthread_mutex_t crypto_init_mutex = PTHREAD_MUTEX_INITIALIZER;
+static uint32_t crypto_refs = 0;
+static NSSInitContext *crypto_context = NULL;
+static pid_t crypto_init_pid = 0;
void ceph::crypto::init(CephContext *cct)
{
- SECStatus s;
+ pid_t pid = getpid();
pthread_mutex_lock(&crypto_init_mutex);
- if (cct->_conf->nss_db_path.empty()) {
- s = NSS_NoDB_Init(NULL);
- } else {
- s = NSS_Init(cct->_conf->nss_db_path.c_str());
+ if (crypto_init_pid != pid) {
+ if (crypto_init_pid > 0) {
+ SECMOD_RestartModules(PR_FALSE);
+ }
+ crypto_init_pid = pid;
+ }
+
+ if (++crypto_refs == 1) {
+ NSSInitParameters init_params;
+ memset(&init_params, 0, sizeof(init_params));
+ init_params.length = sizeof(init_params);
+
+ uint32_t flags = NSS_INIT_READONLY;
+ if (cct->_conf->nss_db_path.empty()) {
+ flags |= (NSS_INIT_NOCERTDB | NSS_INIT_NOMODDB);
+ }
+ crypto_context = NSS_InitContext(cct->_conf->nss_db_path.c_str(), "", "",
+ SECMOD_DB, &init_params, flags);
}
pthread_mutex_unlock(&crypto_init_mutex);
- assert(s == SECSuccess);
+ assert(crypto_context != NULL);
}
void ceph::crypto::shutdown()
{
- SECStatus s;
- s = NSS_Shutdown();
- assert(s == SECSuccess);
+ pthread_mutex_lock(&crypto_init_mutex);
+ assert(crypto_refs > 0);
+ if (--crypto_refs == 0) {
+ NSS_ShutdownContext(crypto_context);
+ crypto_context = NULL;
+ crypto_init_pid = 0;
+ }
+ pthread_mutex_unlock(&crypto_init_mutex);
}
ceph::crypto::HMACSHA1::~HMACSHA1()
diff --git a/src/common/common_init.cc b/src/common/common_init.cc
index dd0b0e7..a580309 100644
--- a/src/common/common_init.cc
+++ b/src/common/common_init.cc
@@ -115,13 +115,8 @@ void complain_about_parse_errors(CephContext *cct,
* same application. */
void common_init_finish(CephContext *cct, int flags)
{
- ceph::crypto::init(cct);
+ cct->init_crypto();
if (!(flags & CINIT_FLAG_NO_DAEMON_ACTIONS))
cct->start_service_thread();
-
- if (cct->_conf->lockdep) {
- g_lockdep = true;
- lockdep_register_ceph_context(cct);
- }
}
diff --git a/src/common/config.cc b/src/common/config.cc
index 5e923e6..3b0ed62 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -148,7 +148,7 @@ md_config_t::md_config_t()
#undef OPTION
#undef SUBSYS
#undef DEFAULT_SUBSYS
- lock("md_config_t", true)
+ lock("md_config_t", true, false)
{
init_subsys();
}
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index f2c34fe..95d3a4b 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -23,6 +23,7 @@ OPTION(num_client, OPT_INT, 1)
OPTION(monmap, OPT_STR, "")
OPTION(mon_host, OPT_STR, "")
OPTION(lockdep, OPT_BOOL, false)
+OPTION(lockdep_force_backtrace, OPT_BOOL, false) // always gather current backtrace at every lock
OPTION(run_dir, OPT_STR, "/var/run/ceph") // the "/var/run/ceph" dir, created on daemon startup
OPTION(admin_socket, OPT_STR, "$run_dir/$cluster-$name.asok") // default changed by common_preinit()
@@ -175,6 +176,8 @@ OPTION(mon_sync_fs_threshold, OPT_INT, 5) // sync() when writing this many obj
OPTION(mon_compact_on_start, OPT_BOOL, false) // compact leveldb on ceph-mon start
OPTION(mon_compact_on_bootstrap, OPT_BOOL, false) // trigger leveldb compaction on bootstrap
OPTION(mon_compact_on_trim, OPT_BOOL, true) // compact (a prefix) when we trim old states
+OPTION(mon_osd_cache_size, OPT_INT, 10) // the size of osdmaps cache, not to rely on underlying store's cache
+
OPTION(mon_tick_interval, OPT_INT, 5)
OPTION(mon_subscribe_interval, OPT_DOUBLE, 300)
OPTION(mon_delta_reset_interval, OPT_DOUBLE, 10) // seconds of inactivity before we reset the pg delta to 0
@@ -959,6 +962,7 @@ OPTION(rgw_swift_url_prefix, OPT_STR, "swift") // entry point for which a url is
OPTION(rgw_swift_auth_url, OPT_STR, "") // default URL to go and verify tokens for v1 auth (if not using internal swift auth)
OPTION(rgw_swift_auth_entry, OPT_STR, "auth") // entry point for which a url is considered a swift auth url
OPTION(rgw_swift_tenant_name, OPT_STR, "") // tenant name to use for swift access
+OPTION(rgw_swift_enforce_content_length, OPT_BOOL, false) // enforce generation of Content-Length even at the cost of performance or scalability
OPTION(rgw_keystone_url, OPT_STR, "") // url for keystone server
OPTION(rgw_keystone_admin_token, OPT_STR, "") // keystone admin token (shared secret)
OPTION(rgw_keystone_admin_user, OPT_STR, "") // keystone admin user name
@@ -978,6 +982,7 @@ OPTION(rgw_op_thread_timeout, OPT_INT, 10*60)
OPTION(rgw_op_thread_suicide_timeout, OPT_INT, 0)
OPTION(rgw_thread_pool_size, OPT_INT, 100)
OPTION(rgw_num_control_oids, OPT_INT, 8)
+OPTION(rgw_num_rados_handles, OPT_U32, 1)
OPTION(rgw_zone, OPT_STR, "") // zone name
OPTION(rgw_zone_root_pool, OPT_STR, ".rgw.root") // pool where zone specific info is stored
@@ -1043,6 +1048,7 @@ OPTION(rgw_user_quota_sync_wait_time, OPT_INT, 3600 * 24) // min time between tw
OPTION(rgw_multipart_min_part_size, OPT_INT, 5 * 1024 * 1024) // min size for each part (except for last one) in multipart upload
OPTION(rgw_olh_pending_timeout_sec, OPT_INT, 3600) // time until we retire a pending olh change
+OPTION(rgw_user_max_buckets, OPT_U32, 1000) // global option to set the max bucket count for all users
OPTION(mutex_perf_counter, OPT_BOOL, false) // enable/disable mutex perf counter
OPTION(throttler_perf_counter, OPT_BOOL, true) // enable/disable throttler perf counter
diff --git a/src/common/hobject.cc b/src/common/hobject.cc
index fda169b..866c992 100644
--- a/src/common/hobject.cc
+++ b/src/common/hobject.cc
@@ -130,6 +130,15 @@ void hobject_t::decode(bufferlist::iterator& bl)
if (struct_v >= 4) {
::decode(nspace, bl);
::decode(pool, bl);
+ // newer OSDs have a different hobject_t::get_min(); decode it properly.
+ if (pool == INT64_MIN &&
+ hash == 0 &&
+ snap == 0 &&
+ !max &&
+ oid.name.empty()) {
+ pool = -1;
+ assert(is_min());
+ }
}
DECODE_FINISH(bl);
build_filestore_key_cache();
@@ -226,6 +235,15 @@ void ghobject_t::decode(bufferlist::iterator& bl)
if (struct_v >= 4) {
::decode(hobj.nspace, bl);
::decode(hobj.pool, bl);
+ // newer OSDs have a different hobject_t::get_min(); decode it properly.
+ if (hobj.pool == INT64_MIN &&
+ hobj.hash == 0 &&
+ hobj.snap == 0 &&
+ !hobj.max &&
+ hobj.oid.name.empty()) {
+ hobj.pool = -1;
+ assert(hobj.is_min());
+ }
}
if (struct_v >= 5) {
::decode(generation, bl);
diff --git a/src/common/hobject.h b/src/common/hobject.h
index 94aa6bf..7495cc1 100644
--- a/src/common/hobject.h
+++ b/src/common/hobject.h
@@ -29,6 +29,13 @@ namespace ceph {
class Formatter;
}
+#ifndef UINT64_MAX
+#define UINT64_MAX (18446744073709551615ULL)
+#endif
+#ifndef INT64_MIN
+#define INT64_MIN ((int64_t)0x8000000000000000ll)
+#endif
+
struct hobject_t {
object_t oid;
snapid_t snap;
@@ -99,6 +106,7 @@ public:
return *this;
hobject_t ret;
ret.set_hash(hash);
+ ret.pool = pool;
return ret;
}
@@ -282,6 +290,8 @@ public:
return *this;
ghobject_t ret;
ret.hobj.set_hash(hobj.hash);
+ ret.shard_id = shard_id;
+ ret.hobj.pool = hobj.pool;
return ret;
}
filestore_hobject_key_t get_filestore_key_u32() const {
diff --git a/src/common/lockdep.cc b/src/common/lockdep.cc
index 6639d8a..5f9fa19 100644
--- a/src/common/lockdep.cc
+++ b/src/common/lockdep.cc
@@ -49,19 +49,31 @@ struct lockdep_stopper_t {
static pthread_mutex_t lockdep_mutex = PTHREAD_MUTEX_INITIALIZER;
static CephContext *g_lockdep_ceph_ctx = NULL;
static lockdep_stopper_t lockdep_stopper;
-static ceph::unordered_map<const char *, int> lock_ids;
-static map<int, const char *> lock_names;
-static int last_id = 0;
+static ceph::unordered_map<std::string, int> lock_ids;
+static map<int, std::string> lock_names;
+static map<int, int> lock_refs;
+static list<int> free_ids;
static ceph::unordered_map<pthread_t, map<int,BackTrace*> > held;
static BackTrace *follows[MAX_LOCKS][MAX_LOCKS]; // follows[a][b] means b taken after a
+static bool lockdep_force_backtrace()
+{
+ return (g_lockdep_ceph_ctx != NULL &&
+ g_lockdep_ceph_ctx->_conf->lockdep_force_backtrace);
+}
+
/******* Functions **********/
void lockdep_register_ceph_context(CephContext *cct)
{
pthread_mutex_lock(&lockdep_mutex);
if (g_lockdep_ceph_ctx == NULL) {
+ g_lockdep = true;
g_lockdep_ceph_ctx = cct;
lockdep_dout(0) << "lockdep start" << dendl;
+
+ for (int i=0; i<MAX_LOCKS; ++i) {
+ free_ids.push_back(i);
+ }
}
pthread_mutex_unlock(&lockdep_mutex);
}
@@ -82,7 +94,8 @@ void lockdep_unregister_ceph_context(CephContext *cct)
follows[i][j] = NULL;
lock_names.clear();
lock_ids.clear();
- last_id = 0;
+ lock_refs.clear();
+ free_ids.clear();
}
pthread_mutex_unlock(&lockdep_mutex);
}
@@ -115,15 +128,12 @@ int lockdep_register(const char *name)
int id;
pthread_mutex_lock(&lockdep_mutex);
- if (last_id == 0)
- for (int i=0; i<MAX_LOCKS; i++)
- for (int j=0; j<MAX_LOCKS; j++)
- follows[i][j] = NULL;
-
- ceph::unordered_map<const char *, int>::iterator p = lock_ids.find(name);
+ ceph::unordered_map<std::string, int>::iterator p = lock_ids.find(name);
if (p == lock_ids.end()) {
- assert(last_id < MAX_LOCKS);
- id = last_id++;
+ assert(!free_ids.empty());
+ id = free_ids.front();
+ free_ids.pop_front();
+
lock_ids[name] = id;
lock_names[id] = name;
lockdep_dout(10) << "registered '" << name << "' as " << id << dendl;
@@ -132,11 +142,47 @@ int lockdep_register(const char *name)
lockdep_dout(20) << "had '" << name << "' as " << id << dendl;
}
+ ++lock_refs[id];
pthread_mutex_unlock(&lockdep_mutex);
return id;
}
+void lockdep_unregister(int id)
+{
+ if (id < 0) {
+ return;
+ }
+
+ pthread_mutex_lock(&lockdep_mutex);
+
+ map<int, std::string>::iterator p = lock_names.find(id);
+ assert(p != lock_names.end());
+
+ int &refs = lock_refs[id];
+ if (--refs == 0) {
+ // reset dependency ordering
+ for (int i=0; i<MAX_LOCKS; ++i) {
+ delete follows[id][i];
+ follows[id][i] = NULL;
+
+ delete follows[i][id];
+ follows[i][id] = NULL;
+ }
+
+ lockdep_dout(10) << "unregistered '" << p->second << "' from " << id
+ << dendl;
+ lock_ids.erase(p->second);
+ lock_names.erase(id);
+ lock_refs.erase(id);
+ free_ids.push_back(id);
+ } else {
+ lockdep_dout(20) << "have " << refs << " of '" << p->second << "' "
+ << "from " << id << dendl;
+ }
+ pthread_mutex_unlock(&lockdep_mutex);
+}
+
// does b follow a?
static bool does_follow(int a, int b)
@@ -165,7 +211,7 @@ static bool does_follow(int a, int b)
return false;
}
-int lockdep_will_lock(const char *name, int id)
+int lockdep_will_lock(const char *name, int id, bool force_backtrace)
{
pthread_t p = pthread_self();
if (id < 0) id = lockdep_register(name);
@@ -195,8 +241,8 @@ int lockdep_will_lock(const char *name, int id)
// new dependency
// did we just create a cycle?
- BackTrace *bt = new BackTrace(BACKTRACE_SKIP);
if (does_follow(id, p->first)) {
+ BackTrace *bt = new BackTrace(BACKTRACE_SKIP);
lockdep_dout(0) << "new dependency " << lock_names[p->first]
<< " (" << p->first << ") -> " << name << " (" << id << ")"
<< " creates a cycle at\n";
@@ -222,6 +268,10 @@ int lockdep_will_lock(const char *name, int id)
assert(0); // actually, we should just die here.
} else {
+ BackTrace *bt = NULL;
+ if (force_backtrace || lockdep_force_backtrace()) {
+ bt = new BackTrace(BACKTRACE_SKIP);
+ }
follows[p->first][id] = bt;
lockdep_dout(10) << lock_names[p->first] << " -> " << name << " at" << dendl;
//bt->print(*_dout);
@@ -241,7 +291,7 @@ int lockdep_locked(const char *name, int id, bool force_backtrace)
pthread_mutex_lock(&lockdep_mutex);
lockdep_dout(20) << "_locked " << name << dendl;
- if (g_lockdep >= 2 || force_backtrace)
+ if (force_backtrace || lockdep_force_backtrace())
held[p][id] = new BackTrace(BACKTRACE_SKIP);
else
held[p][id] = 0;
diff --git a/src/common/lockdep.h b/src/common/lockdep.h
index 1dcf053..63d2f0d 100644
--- a/src/common/lockdep.h
+++ b/src/common/lockdep.h
@@ -22,7 +22,8 @@ extern int g_lockdep;
extern void lockdep_register_ceph_context(CephContext *cct);
extern void lockdep_unregister_ceph_context(CephContext *cct);
extern int lockdep_register(const char *n);
-extern int lockdep_will_lock(const char *n, int id);
+extern void lockdep_unregister(int id);
+extern int lockdep_will_lock(const char *n, int id, bool force_backtrace=false);
extern int lockdep_locked(const char *n, int id, bool force_backtrace=false);
extern int lockdep_will_unlock(const char *n, int id);
extern int lockdep_dump_locks();
diff --git a/src/common/valgrind.h b/src/common/valgrind.h
new file mode 100644
index 0000000..2aa3fb5
--- /dev/null
+++ b/src/common/valgrind.h
@@ -0,0 +1,15 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_VALGRIND_H
+#define CEPH_VALGRIND_H
+
+#ifdef HAVE_VALGRIND_HELGRIND_H
+ #include <valgrind/helgrind.h>
+#else
+ #define ANNOTATE_HAPPENS_AFTER(x) do {} while (0)
+ #define ANNOTATE_HAPPENS_BEFORE_FORGET_ALL(x) ANNOTATE_HAPPENS_AFTER(x)
+ #define ANNOTATE_HAPPENS_BEFORE(x) ANNOTATE_HAPPENS_AFTER(x)
+#endif
+
+#endif // CEPH_VALGRIND_H
diff --git a/src/crush/CrushTester.cc b/src/crush/CrushTester.cc
index 9aada7b..d2be1f0 100644
--- a/src/crush/CrushTester.cc
+++ b/src/crush/CrushTester.cc
@@ -359,7 +359,8 @@ void CrushTester::write_integer_indexed_scalar_data_string(vector<string> &dst,
int CrushTester::test_with_crushtool(const string& crushtool,
int max_id,
- int timeout)
+ int timeout,
+ int ruleset)
{
string timeout_string = stringify(timeout);
string opt_max_id = stringify(max_id);
@@ -372,6 +373,14 @@ int CrushTester::test_with_crushtool(const string& crushtool,
cmd_args.push_back("--test");
cmd_args.push_back("--check");
cmd_args.push_back(opt_max_id.c_str());
+ cmd_args.push_back("--min-x");
+ cmd_args.push_back("1");
+ cmd_args.push_back("--max-x");
+ cmd_args.push_back("50");
+ if (ruleset >= 0) {
+ cmd_args.push_back("--ruleset");
+ cmd_args.push_back(stringify(ruleset).c_str());
+ }
cmd_args.push_back(NULL);
int pipefds[2];
@@ -539,6 +548,10 @@ int CrushTester::test()
err << "rule " << r << " dne" << std::endl;
continue;
}
+ if (ruleset >= 0 &&
+ crush.get_rule_mask_ruleset(r) != ruleset) {
+ continue;
+ }
int minr = min_rep, maxr = max_rep;
if (min_rep < 0 || max_rep < 0) {
minr = crush.get_rule_mask_min_size(r);
diff --git a/src/crush/CrushTester.h b/src/crush/CrushTester.h
index 4f90aae..a9221c7 100644
--- a/src/crush/CrushTester.h
+++ b/src/crush/CrushTester.h
@@ -15,6 +15,7 @@ class CrushTester {
map<int, int> device_weight;
int min_rule, max_rule;
+ int ruleset;
int min_x, max_x;
int min_rep, max_rep;
@@ -168,6 +169,7 @@ public:
CrushTester(CrushWrapper& c, ostream& eo)
: crush(c), err(eo),
min_rule(-1), max_rule(-1),
+ ruleset(-1),
min_x(-1), max_x(-1),
min_rep(-1), max_rep(-1),
num_batches(1),
@@ -333,6 +335,10 @@ public:
min_rule = max_rule = rule;
}
+ void set_ruleset(int rs) {
+ ruleset = rs;
+ }
+
/**
* check if any bucket/nodes is referencing an unknown name or type
* @param max_id rejects any non-bucket items with id less than this number,
@@ -344,7 +350,8 @@ public:
int test();
int test_with_crushtool(const string& crushtool,
int max_id = -1,
- int timeout = 0);
+ int timeout = 0,
+ int ruleset = -1);
};
#endif
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index 4dcf6b8..0dac389 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -832,7 +832,7 @@ int CrushWrapper::adjust_item_weight_in_loc(CephContext *cct, int id, int weight
int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight)
{
- ldout(cct, 5) << "adjust_item_weight " << id << " weight " << weight << dendl;
+ ldout(cct, 5) << __func__ << " " << id << " weight " << weight << dendl;
crush_bucket *b = get_bucket(id);
if (IS_ERR(b))
return PTR_ERR(b);
@@ -842,10 +842,13 @@ int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight)
while (!q.empty()) {
b = q.front();
q.pop_front();
+ int local_changed = 0;
for (unsigned i=0; i<b->size; ++i) {
int n = b->items[i];
if (n >= 0) {
crush_bucket_adjust_item_weight(crush, b, n, weight);
+ ++changed;
+ ++local_changed;
} else {
crush_bucket *sub = get_bucket(n);
if (IS_ERR(sub))
@@ -853,6 +856,9 @@ int CrushWrapper::adjust_subtree_weight(CephContext *cct, int id, int weight)
q.push_back(sub);
}
}
+ if (local_changed) {
+ adjust_item_weight(cct, b->id, b->weight);
+ }
}
return changed;
}
diff --git a/src/erasure-code/shec/ErasureCodeShec.cc b/src/erasure-code/shec/ErasureCodeShec.cc
index f775715..b0437a5 100644
--- a/src/erasure-code/shec/ErasureCodeShec.cc
+++ b/src/erasure-code/shec/ErasureCodeShec.cc
@@ -50,6 +50,7 @@ int ErasureCodeShec::create_ruleset(const string &name,
if (ruleid < 0) {
return ruleid;
} else {
+ crush.set_rule_mask_max_size(ruleid, get_chunk_count());
return crush.get_rule_mask_ruleset(ruleid);
}
}
diff --git a/src/global/global_init.cc b/src/global/global_init.cc
index f03677c..3464b0a 100644
--- a/src/global/global_init.cc
+++ b/src/global/global_init.cc
@@ -116,8 +116,6 @@ void global_init(std::vector < const char * > *alt_def_args,
{
global_pre_init(alt_def_args, args, module_type, code_env, flags);
- g_lockdep = g_ceph_context->_conf->lockdep;
-
// signal stuff
int siglist[] = { SIGPIPE, 0 };
block_signals(siglist, NULL);
@@ -138,9 +136,6 @@ void global_init(std::vector < const char * > *alt_def_args,
}
}
- if (g_lockdep) {
- lockdep_register_ceph_context(g_ceph_context);
- }
register_assert_context(g_ceph_context);
// call all observers now. this has the side-effect of configuring
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
index 7f03616..781df1b 100644
--- a/src/include/ceph_features.h
+++ b/src/include/ceph_features.h
@@ -63,6 +63,9 @@
#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49)
// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */
+#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
+/* ... */
+#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
#define CEPH_FEATURE_RESERVED2 (1ULL<<61) /* slow down, we are almost out... */
#define CEPH_FEATURE_RESERVED (1ULL<<62) /* DO NOT USE THIS ... last bit! */
@@ -148,6 +151,7 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) {
CEPH_FEATURE_MDS_QUOTA | \
CEPH_FEATURE_CRUSH_V4 | \
CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY | \
+ CEPH_FEATURE_HAMMER_0_94_4 | \
0ULL)
#define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL
diff --git a/src/init-radosgw b/src/init-radosgw
index 914d6fd..b7569a0 100644
--- a/src/init-radosgw
+++ b/src/init-radosgw
@@ -11,7 +11,17 @@
PATH=/sbin:/bin:/usr/bin
-. /lib/lsb/init-functions
+if [ -x /sbin/start-stop-daemon ]; then
+ DEBIAN=1
+ . /lib/lsb/init-functions
+else
+ . /etc/rc.d/init.d/functions
+ DEBIAN=0
+
+ # detect systemd, also check whether the systemd-run binary exists
+ SYSTEMD_RUN=$(which systemd-run 2>/dev/null)
+ grep -qs systemd /proc/1/comm || SYSTEMD_RUN=""
+fi
daemon_is_running() {
daemon=$1
@@ -34,7 +44,7 @@ done
# prefix for radosgw instances in ceph.conf
PREFIX='client.radosgw.'
-# user to run radosgw as (it not specified in ceph.conf)
+# user to run radosgw as (if not specified in ceph.conf)
DEFAULT_USER='root'
RADOSGW=`which radosgw`
@@ -43,22 +53,37 @@ if [ ! -x "$RADOSGW" ]; then
exit 1
fi
+# list daemons, old-style and new-style
+# NOTE: no support for cluster names that aren't "ceph"
+dlist=`ceph-conf --list-sections $PREFIX`
+if [ -d "/var/lib/ceph/radosgw" ]; then
+ for d in `ls /var/lib/ceph/radosgw | grep ^ceph-`; do
+ if [ -e "/var/lib/ceph/radosgw/$d/sysvinit" ]; then
+ id=`echo $d | cut -c 6-`
+ dlist="client.$id $dlist"
+ fi
+ done
+fi
+
case "$1" in
start)
- for name in `ceph-conf --list-sections $PREFIX`;
+ for name in $dlist
do
auto_start=`ceph-conf -n $name 'auto start'`
if [ "$auto_start" = "no" ] || [ "$auto_start" = "false" ] || [ "$auto_start" = "0" ]; then
continue
fi
- # mapped to this host?
- host=`ceph-conf -n $name host`
- hostname=`hostname -s`
- if [ "$host" != "$hostname" ]; then
- [ $VERBOSE -eq 1 ] && echo "hostname $hostname could not be found in ceph.conf:[$name], not starting."
- continue
- fi
+ shortname=`echo $name | cut -c 8-`
+ if [ ! -e "/var/lib/ceph/radosgw/ceph-$shortname/sysvinit" ]; then
+ # mapped to this host?
+ host=`ceph-conf -n $name host`
+ hostname=`hostname -s`
+ if [ "$host" != "$hostname" ]; then
+ [ $VERBOSE -eq 1 ] && echo "hostname $hostname could not be found in ceph.conf:[$name], not starting."
+ continue
+ fi
+ fi
user=`ceph-conf -n $name user`
if [ -z "$user" ]; then
@@ -74,20 +99,46 @@ case "$1" in
fi
echo "Starting $name..."
- start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
+ if [ $DEBIAN -eq 1 ]; then
+ start-stop-daemon --start -u $user -x $RADOSGW -p /var/run/ceph/client-$name.pid -- -n $name
+ elif [ -n "$SYSTEMD_RUN" ]; then
+ $SYSTEMD_RUN -r su "$user" -c "ulimit -n 32768; $RADOSGW -n $name"
+ else
+ ulimit -n 32768
+ daemon --user="$user" "$RADOSGW -n $name"
+ fi
done
- daemon_is_running $RADOSGW
;;
reload)
echo "Reloading $name..."
- start-stop-daemon --stop --signal HUP -x $RADOSGW --oknodo
- ;;
+ if [ $DEBIAN -eq 1 ]; then
+ start-stop-daemon --stop --signal HUP -x $RADOSGW --oknodo
+ else
+ killproc $RADOSGW -SIGHUP
+ fi
+ ;;
restart|force-reload)
$0 stop
$0 start
;;
stop)
- start-stop-daemon --stop -x $RADOSGW --oknodo
+ timeout=0
+ for name in $dlist
+ do
+ t=`$RADOSGW -n $name --show-config-value rgw_exit_timeout_secs`
+ if [ $t -gt $timeout ]; then timeout=$t; fi
+ done
+
+ if [ $DEBIAN -eq 1 ]; then
+ if [ $timeout -gt 0 ]; then TIMEOUT="-R $timeout"; fi
+ start-stop-daemon --stop -x $RADOSGW --oknodo $TIMEOUT
+ else
+ killproc $RADOSGW
+ while pidof $RADOSGW >/dev/null && [ $timeout -gt 0 ] ; do
+ sleep 1
+ timeout=$(($timeout - 1))
+ done
+ fi
;;
status)
daemon_is_running $RADOSGW
diff --git a/src/init-radosgw.sysv b/src/init-radosgw.sysv
deleted file mode 100644
index 4ec891e..0000000
--- a/src/init-radosgw.sysv
+++ /dev/null
@@ -1,114 +0,0 @@
-#! /bin/bash
-### BEGIN INIT INFO
-# Provides: radosgw
-# Required-Start: $remote_fs $named $network $time
-# Required-Stop: $remote_fs $named $network $time
-# Default-Start: 2 3 4 5
-# Default-Stop: 0 1 6
-# Short-Description: radosgw RESTful rados gateway
-### END INIT INFO
-
-PATH=/sbin:/bin:/usr/bin
-
-#. /lib/lsb/init-functions
-. /etc/rc.d/init.d/functions
-
-daemon_is_running() {
- daemon=$1
- sleep 1
- if pidof $daemon >/dev/null; then
- echo "$daemon is running."
- exit 0
- else
- echo "$daemon is not running."
- exit 1
- fi
-}
-
-VERBOSE=0
-for opt in $*; do
- if [ "$opt" = "-v" ] || [ "$opt" = "--verbose" ]; then
- VERBOSE=1
- fi
-done
-
-# prefix for radosgw instances in ceph.conf
-PREFIX='client.radosgw.'
-
-# user to run radosgw as (it not specified in ceph.conf)
-#DEFAULT_USER='www-data'
-DEFAULT_USER='root'
-
-RADOSGW=`which radosgw`
-if [ ! -x "$RADOSGW" ]; then
- [ $VERBOSE -eq 1 ] && echo "$RADOSGW could not start, it is not executable."
- exit 1
-fi
-
-# detect systemd, also check whether the systemd-run binary exists
-SYSTEMD_RUN=$(which systemd-run 2>/dev/null)
-grep -qs systemd /proc/1/comm || SYSTEMD_RUN=""
-
-case "$1" in
- start)
- echo "Starting radosgw instance(s)..."
- for name in `ceph-conf --list-sections $PREFIX`;
- do
- auto_start=`ceph-conf -n $name 'auto start'`
- if [ "$auto_start" = "no" ] || [ "$auto_start" = "false" ] || [ "$auto_start" = "0" ]; then
- continue
- fi
-
- # mapped to this host?
- host=`ceph-conf -n $name host`
- hostname=`hostname -s`
- if [ "$host" != "$hostname" ]; then
- [ $VERBOSE -eq 1 ] && echo "hostname $hostname could not be found in ceph.conf:[$name], not starting."
- continue
- fi
-
- user=`ceph-conf -n $name user`
- if [ -z "$user" ]; then
- user="$DEFAULT_USER"
- fi
-
- log_file=`$RADOSGW -n $name --show-config-value log_file`
- if [ -n "$log_file" ]; then
- if [ ! -e "$log_file" ]; then
- touch "$log_file"
- fi
- chown $user $log_file
- fi
-
- if [ -n "$SYSTEMD_RUN" ]; then
- $SYSTEMD_RUN -r sudo -u "$user" bash -c "ulimit -n 32768; $RADOSGW -n $name"
- else
- ulimit -n 32768
- daemon --user="$user" "$RADOSGW -n $name"
- fi
- echo "Starting $name..."
- done
- daemon_is_running $RADOSGW
- ;;
- reload)
- #start-stop-daemon --signal HUP -x $RADOSGW --oknodo
- killproc $RADOSGW -SIGHUP
- echo "Reloading radosgw instance(s)..."
- ;;
- restart|force-reload)
- $0 stop
- $0 start
- ;;
- stop)
- #start-stop-daemon --stop -x $RADOSGW --oknodo
- killproc $RADOSGW
- echo "Stopping radosgw instance(s)..."
- ;;
- status)
- daemon_is_running $RADOSGW
- ;;
- *)
- echo "Usage: $0 {start|stop|restart|force-reload|reload|status} [-v|--verbose]" >&2
- exit 3
- ;;
-esac
diff --git a/src/java/Makefile.in b/src/java/Makefile.in
index 3a22050..b888f48 100644
--- a/src/java/Makefile.in
+++ b/src/java/Makefile.in
@@ -203,6 +203,7 @@ GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
GIT_CHECK = @GIT_CHECK@
GREP = @GREP@
HAVE_CXX11 = @HAVE_CXX11@
+HAVE_VALGRIND = @HAVE_VALGRIND@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
diff --git a/src/librados/RadosClient.cc b/src/librados/RadosClient.cc
index 8e63fce..3886b1e 100644
--- a/src/librados/RadosClient.cc
+++ b/src/librados/RadosClient.cc
@@ -551,7 +551,7 @@ bool librados::RadosClient::put() {
}
int librados::RadosClient::pool_create(string& name, unsigned long long auid,
- __u8 crush_rule)
+ int16_t crush_rule)
{
int r = wait_for_osdmap();
if (r < 0) {
@@ -578,7 +578,7 @@ int librados::RadosClient::pool_create(string& name, unsigned long long auid,
int librados::RadosClient::pool_create_async(string& name, PoolAsyncCompletionImpl *c,
unsigned long long auid,
- __u8 crush_rule)
+ int16_t crush_rule)
{
int r = wait_for_osdmap();
if (r < 0)
diff --git a/src/librados/RadosClient.h b/src/librados/RadosClient.h
index f4eb083..d44336f 100644
--- a/src/librados/RadosClient.h
+++ b/src/librados/RadosClient.h
@@ -101,9 +101,15 @@ public:
int get_pool_stats(std::list<string>& ls, map<string,::pool_stat_t>& result);
int get_fs_stats(ceph_statfs& result);
- int pool_create(string& name, unsigned long long auid=0, __u8 crush_rule=0);
+ /*
+ -1 was set as the default value and monitor will pickup the right crush rule with below order:
+ a) osd pool default crush replicated ruleset
+ b) the first ruleset in crush ruleset
+ c) error out if no value find
+ */
+ int pool_create(string& name, unsigned long long auid=0, int16_t crush_rule=-1);
int pool_create_async(string& name, PoolAsyncCompletionImpl *c, unsigned long long auid=0,
- __u8 crush_rule=0);
+ int16_t crush_rule=-1);
int pool_get_base_tier(int64_t pool_id, int64_t* base_tier);
int pool_delete(const char *name);
diff --git a/src/librbd/AioCompletion.cc b/src/librbd/AioCompletion.cc
index 2663e74..6222531 100644
--- a/src/librbd/AioCompletion.cc
+++ b/src/librbd/AioCompletion.cc
@@ -6,6 +6,7 @@
#include "common/ceph_context.h"
#include "common/dout.h"
#include "common/errno.h"
+#include "common/WorkQueue.h"
#include "librbd/AioRequest.h"
#include "librbd/internal.h"
@@ -89,7 +90,9 @@ namespace librbd {
}
if (complete_cb) {
+ lock.Unlock();
complete_cb(rbd_comp, complete_arg);
+ lock.Lock();
}
done = true;
cond.Signal();
@@ -171,6 +174,17 @@ namespace librbd {
m_completion->complete_request(m_cct, r);
}
+ void C_CacheRead::complete(int r) {
+ if (!m_enqueued) {
+ // cache_lock creates a lock ordering issue -- so re-execute this context
+ // outside the cache_lock
+ m_enqueued = true;
+ m_image_ctx.op_work_queue->queue(this, r);
+ return;
+ }
+ Context::complete(r);
+ }
+
void C_CacheRead::finish(int r)
{
m_req->complete(r);
diff --git a/src/librbd/AioCompletion.h b/src/librbd/AioCompletion.h
index bd527b1..4fe53eb 100644
--- a/src/librbd/AioCompletion.h
+++ b/src/librbd/AioCompletion.h
@@ -64,7 +64,7 @@ namespace librbd {
AsyncOperation async_op;
- AioCompletion() : lock("AioCompletion::lock", true),
+ AioCompletion() : lock("AioCompletion::lock", true, false),
done(false), rval(0), complete_cb(NULL),
complete_arg(NULL), rbd_comp(NULL),
pending_count(0), blockers(1),
@@ -183,11 +183,15 @@ namespace librbd {
class C_CacheRead : public Context {
public:
- explicit C_CacheRead(AioRead *req) : m_req(req) {}
- virtual ~C_CacheRead() {}
+ explicit C_CacheRead(ImageCtx *ictx, AioRead *req)
+ : m_image_ctx(*ictx), m_req(req), m_enqueued(false) {}
+ virtual void complete(int r);
+ protected:
virtual void finish(int r);
private:
+ ImageCtx &m_image_ctx;
AioRead *m_req;
+ bool m_enqueued;
};
}
diff --git a/src/librbd/AioRequest.cc b/src/librbd/AioRequest.cc
index d52cd5d..7dbec4a 100644
--- a/src/librbd/AioRequest.cc
+++ b/src/librbd/AioRequest.cc
@@ -24,28 +24,21 @@
namespace librbd {
- AioRequest::AioRequest() :
- m_ictx(NULL),
- m_object_no(0), m_object_off(0), m_object_len(0),
- m_snap_id(CEPH_NOSNAP), m_completion(NULL), m_parent_completion(NULL),
- m_hide_enoent(false) {}
AioRequest::AioRequest(ImageCtx *ictx, const std::string &oid,
uint64_t objectno, uint64_t off, uint64_t len,
- const ::SnapContext &snapc, librados::snap_t snap_id,
+ librados::snap_t snap_id,
Context *completion,
- bool hide_enoent) :
- m_ictx(ictx), m_oid(oid), m_object_no(objectno),
- m_object_off(off), m_object_len(len), m_snap_id(snap_id),
- m_completion(completion), m_parent_completion(NULL),
- m_hide_enoent(hide_enoent) {
- m_snaps.insert(m_snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
- }
+ bool hide_enoent)
+ : m_ictx(ictx), m_oid(oid), m_object_no(objectno), m_object_off(off),
+ m_object_len(len), m_snap_id(snap_id), m_completion(completion),
+ m_hide_enoent(hide_enoent) {
- AioRequest::~AioRequest() {
- if (m_parent_completion) {
- m_parent_completion->release();
- m_parent_completion = NULL;
- }
+ Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
+ 0, m_ictx->layout.fl_object_size, m_parent_extents);
+
+ RWLock::RLocker snap_locker(m_ictx->snap_lock);
+ RWLock::RLocker parent_locker(m_ictx->parent_lock);
+ compute_parent_extents();
}
void AioRequest::complete(int r)
@@ -60,24 +53,31 @@ namespace librbd {
}
}
- void AioRequest::read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents,
- bool block_completion)
- {
- assert(!m_parent_completion);
- m_parent_completion = aio_create_completion_internal(this, rbd_req_cb);
- if (block_completion) {
- // prevent the parent image from being deleted while this
- // request is still in-progress
- m_parent_completion->get();
- m_parent_completion->block();
+ bool AioRequest::compute_parent_extents() {
+ assert(m_ictx->snap_lock.is_locked());
+ assert(m_ictx->parent_lock.is_locked());
+
+ uint64_t parent_overlap;
+ int r = m_ictx->get_parent_overlap(m_snap_id, &parent_overlap);
+ if (r < 0) {
+ // NOTE: it's possible for a snapshot to be deleted while we are
+ // still reading from it
+ lderr(m_ictx->cct) << this << " compute_parent_extents: failed to "
+ << "retrieve parent overlap: " << cpp_strerror(r)
+ << dendl;
+ m_parent_extents.clear();
+ return false;
}
- ldout(m_ictx->cct, 20) << "read_from_parent this = " << this
- << " parent completion " << m_parent_completion
- << " extents " << image_extents
- << dendl;
- aio_read(m_ictx->parent, image_extents, NULL, &m_read_data,
- m_parent_completion, 0);
+ uint64_t object_overlap =
+ m_ictx->prune_parent_extents(m_parent_extents, parent_overlap);
+ if (object_overlap > 0) {
+ ldout(m_ictx->cct, 20) << this << " compute_parent_extents: "
+ << "overlap " << parent_overlap << " "
+ << "extents " << m_parent_extents << dendl;
+ return true;
+ }
+ return false;
}
static inline bool is_copy_on_read(ImageCtx *ictx, librados::snap_t snap_id) {
@@ -91,32 +91,30 @@ namespace librbd {
AioRead::AioRead(ImageCtx *ictx, const std::string &oid,
uint64_t objectno, uint64_t offset, uint64_t len,
vector<pair<uint64_t,uint64_t> >& be,
- const ::SnapContext &snapc,
librados::snap_t snap_id, bool sparse,
Context *completion, int op_flags)
- : AioRequest(ictx, oid, objectno, offset, len, snapc, snap_id, completion,
- false),
- m_buffer_extents(be), m_tried_parent(false),
- m_sparse(sparse), m_op_flags(op_flags), m_state(LIBRBD_AIO_READ_FLAT) {
- RWLock::RLocker l(m_ictx->snap_lock);
- RWLock::RLocker l2(m_ictx->parent_lock);
-
- Striper::extent_to_file(m_ictx->cct, &m_ictx->layout,
- m_object_no, 0, m_ictx->layout.fl_object_size,
- m_image_extents);
+ : AioRequest(ictx, oid, objectno, offset, len, snap_id, completion, false),
+ m_buffer_extents(be), m_tried_parent(false), m_sparse(sparse),
+ m_op_flags(op_flags), m_parent_completion(NULL),
+ m_state(LIBRBD_AIO_READ_FLAT) {
guard_read();
}
+ AioRead::~AioRead()
+ {
+ if (m_parent_completion) {
+ m_parent_completion->release();
+ m_parent_completion = NULL;
+ }
+ }
+
void AioRead::guard_read()
{
- assert(m_ictx->snap_lock.is_locked());
+ RWLock::RLocker snap_locker(m_ictx->snap_lock);
+ RWLock::RLocker parent_locker(m_ictx->parent_lock);
- uint64_t image_overlap = 0;
- m_ictx->get_parent_overlap(m_snap_id, &image_overlap);
- uint64_t object_overlap =
- m_ictx->prune_parent_extents(m_image_extents, image_overlap);
- if (object_overlap) {
+ if (has_parent()) {
ldout(m_ictx->cct, 20) << __func__ << " guarding read" << dendl;
m_state = LIBRBD_AIO_READ_GUARD;
}
@@ -124,7 +122,8 @@ namespace librbd {
bool AioRead::should_complete(int r)
{
- ldout(m_ictx->cct, 20) << "should_complete " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len
+ ldout(m_ictx->cct, 20) << "should_complete " << this << " " << m_oid << " "
+ << m_object_off << "~" << m_object_len
<< " r = " << r << dendl;
bool finished = true;
@@ -147,25 +146,25 @@ namespace librbd {
}
// calculate reverse mapping onto the image
- vector<pair<uint64_t,uint64_t> > image_extents;
- Striper::extent_to_file(m_ictx->cct, &m_ictx->layout,
- m_object_no, m_object_off, m_object_len,
- image_extents);
-
- uint64_t image_overlap = 0;
- r = m_ictx->get_parent_overlap(m_snap_id, &image_overlap);
- if (r < 0) {
- assert(0 == "FIXME");
+ vector<pair<uint64_t,uint64_t> > parent_extents;
+ Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
+ m_object_off, m_object_len, parent_extents);
+
+ uint64_t parent_overlap = 0;
+ uint64_t object_overlap = 0;
+ r = m_ictx->get_parent_overlap(m_snap_id, &parent_overlap);
+ if (r == 0) {
+ object_overlap = m_ictx->prune_parent_extents(parent_extents,
+ parent_overlap);
}
- uint64_t object_overlap = m_ictx->prune_parent_extents(image_extents,
- image_overlap);
- if (object_overlap) {
+
+ if (object_overlap > 0) {
m_tried_parent = true;
if (is_copy_on_read(m_ictx, m_snap_id)) {
m_state = LIBRBD_AIO_READ_COPYUP;
}
- read_from_parent(image_extents, true);
+ read_from_parent(parent_extents);
finished = false;
}
}
@@ -180,7 +179,8 @@ namespace librbd {
}
break;
case LIBRBD_AIO_READ_COPYUP:
- ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_COPYUP" << dendl;
+ ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_COPYUP"
+ << dendl;
// This is the extra step for copy-on-read: kick off an asynchronous copyup.
// It is different from copy-on-write as asynchronous copyup will finish
// by itself so state won't go back to LIBRBD_AIO_READ_GUARD.
@@ -190,37 +190,12 @@ namespace librbd {
// If read entire object from parent success and CoR is possible, kick
// off a asynchronous copyup. This approach minimizes the latency
// impact.
- Mutex::Locker copyup_locker(m_ictx->copyup_list_lock);
- map<uint64_t, CopyupRequest*>::iterator it =
- m_ictx->copyup_list.find(m_object_no);
- if (it == m_ictx->copyup_list.end()) {
- RWLock::RLocker l(m_ictx->snap_lock);
- RWLock::RLocker l2(m_ictx->parent_lock);
- if (m_ictx->parent == NULL) {
- ldout(m_ictx->cct, 20) << "parent is gone; do nothing" << dendl;
- break;
- }
-
- // If parent still exists, overlap might also have changed.
- uint64_t parent_overlap;
- r = m_ictx->get_parent_overlap(CEPH_NOSNAP, &parent_overlap);
- assert(r == 0);
-
- uint64_t newlen = m_ictx->prune_parent_extents(
- m_image_extents, parent_overlap);
- if (newlen != 0) {
- // create and kick off a CopyupRequest
- CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
- m_object_no,
- m_image_extents);
- m_ictx->copyup_list[m_object_no] = new_req;
- new_req->queue_send();
- }
- }
+ send_copyup();
}
break;
case LIBRBD_AIO_READ_FLAT:
- ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_FLAT" << dendl;
+ ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_FLAT"
+ << dendl;
// The read content should be deposit in m_read_data
break;
default:
@@ -260,26 +235,57 @@ namespace librbd {
rados_completion->release();
}
+ void AioRead::send_copyup()
+ {
+ {
+ RWLock::RLocker snap_locker(m_ictx->snap_lock);
+ RWLock::RLocker parent_locker(m_ictx->parent_lock);
+ if (!compute_parent_extents()) {
+ return;
+ }
+ }
+
+ Mutex::Locker copyup_locker(m_ictx->copyup_list_lock);
+ map<uint64_t, CopyupRequest*>::iterator it =
+ m_ictx->copyup_list.find(m_object_no);
+ if (it == m_ictx->copyup_list.end()) {
+ // create and kick off a CopyupRequest
+ CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid, m_object_no,
+ m_parent_extents);
+ m_ictx->copyup_list[m_object_no] = new_req;
+ new_req->queue_send();
+ }
+ }
+
+ void AioRead::read_from_parent(const vector<pair<uint64_t,uint64_t> >& parent_extents)
+ {
+ assert(!m_parent_completion);
+ m_parent_completion = aio_create_completion_internal(this, rbd_req_cb);
+
+ // prevent the parent image from being deleted while this
+ // request is still in-progress
+ m_parent_completion->get();
+ m_parent_completion->block();
+
+ ldout(m_ictx->cct, 20) << "read_from_parent this = " << this
+ << " parent completion " << m_parent_completion
+ << " extents " << parent_extents
+ << dendl;
+ aio_read(m_ictx->parent, parent_extents, NULL, &m_read_data,
+ m_parent_completion, 0);
+ }
+
/** write **/
- AbstractWrite::AbstractWrite()
- : m_state(LIBRBD_AIO_WRITE_FLAT),
- m_parent_overlap(0),
- m_snap_seq(0) {}
AbstractWrite::AbstractWrite(ImageCtx *ictx, const std::string &oid,
- uint64_t object_no, uint64_t object_off, uint64_t len,
- vector<pair<uint64_t,uint64_t> >& objectx,
- uint64_t object_overlap,
- const ::SnapContext &snapc, librados::snap_t snap_id,
- Context *completion,
- bool hide_enoent)
- : AioRequest(ictx, oid, object_no, object_off, len, snapc, snap_id,
- completion, hide_enoent),
- m_state(LIBRBD_AIO_WRITE_FLAT), m_snap_seq(snapc.seq.val),
- m_entire_object(NULL)
+ uint64_t object_no, uint64_t object_off,
+ uint64_t len, const ::SnapContext &snapc,
+ Context *completion, bool hide_enoent)
+ : AioRequest(ictx, oid, object_no, object_off, len, CEPH_NOSNAP, completion,
+ hide_enoent),
+ m_state(LIBRBD_AIO_WRITE_FLAT), m_snap_seq(snapc.seq.val)
{
- m_object_image_extents = objectx;
- m_parent_overlap = object_overlap;
+ m_snaps.insert(m_snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
}
void AbstractWrite::guard_write()
@@ -293,10 +299,10 @@ namespace librbd {
bool AbstractWrite::should_complete(int r)
{
- ldout(m_ictx->cct, 20) << "write " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len
+ ldout(m_ictx->cct, 20) << "write " << this << " " << m_oid << " "
+ << m_object_off << "~" << m_object_len
<< " should_complete: r = " << r << dendl;
- map<uint64_t, CopyupRequest*>::iterator it;
bool finished = true;
switch (m_state) {
case LIBRBD_AIO_WRITE_PRE:
@@ -318,72 +324,21 @@ namespace librbd {
ldout(m_ictx->cct, 20) << "WRITE_CHECK_GUARD" << dendl;
if (r == -ENOENT) {
- RWLock::RLocker l(m_ictx->snap_lock);
- RWLock::RLocker l2(m_ictx->parent_lock);
-
- /*
- * Parent may have disappeared; if so, recover by using
- * send_copyup() to send the original write req (the copyup
- * operation itself will be a no-op, since someone must have
- * populated the child object while we weren't looking).
- * Move to WRITE_FLAT state as we'll be done with the
- * operation once the null copyup completes.
- */
-
- if (m_ictx->parent == NULL) {
- ldout(m_ictx->cct, 20) << "parent is gone; do null copyup " << dendl;
- m_state = LIBRBD_AIO_WRITE_FLAT;
- send_copyup();
- finished = false;
- break;
- }
+ bool has_parent;
+ {
+ RWLock::RLocker snap_locker(m_ictx->snap_lock);
+ RWLock::RLocker parent_locker(m_ictx->parent_lock);
+ has_parent = compute_parent_extents();
+ }
// If parent still exists, overlap might also have changed.
- uint64_t parent_overlap;
- r = m_ictx->get_parent_overlap(CEPH_NOSNAP, &parent_overlap);
- assert(r == 0);
-
- uint64_t newlen = m_ictx->prune_parent_extents(
- m_object_image_extents, parent_overlap);
-
- // copyup the entire object up to the overlap point, if any
- if (newlen != 0) {
- ldout(m_ictx->cct, 20) << "should_complete(" << this << ") overlap "
- << parent_overlap << " newlen "
- << newlen << " image_extents"
- << m_object_image_extents << dendl;
-
- m_state = LIBRBD_AIO_WRITE_COPYUP;
-
- if (is_copy_on_read(m_ictx, m_snap_id)) {
- m_ictx->copyup_list_lock.Lock();
- it = m_ictx->copyup_list.find(m_object_no);
- if (it == m_ictx->copyup_list.end()) {
- // If it is not in the list, create a CopyupRequest and wait for it.
- CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
- m_object_no,
- m_object_image_extents);
- // make sure to wait on this CopyupRequest
- new_req->append_request(this);
- m_ictx->copyup_list[m_object_no] = new_req;
-
- m_entire_object = &(new_req->get_copyup_data());
- m_ictx->copyup_list_lock.Unlock();
- new_req->send();
- } else {
- it->second->append_request(this);
- m_entire_object = &it->second->get_copyup_data();
- m_ictx->copyup_list_lock.Unlock();
- }
- } else {
- read_from_parent(m_object_image_extents, false);
- }
+ if (has_parent) {
+ send_copyup();
} else {
+ // parent may have disappeared -- send original write again
ldout(m_ictx->cct, 20) << "should_complete(" << this
<< "): parent overlap now 0" << dendl;
- m_object_image_extents.clear();
- m_state = LIBRBD_AIO_WRITE_FLAT;
- send_copyup();
+ send_write();
}
finished = false;
break;
@@ -400,20 +355,13 @@ namespace librbd {
case LIBRBD_AIO_WRITE_COPYUP:
ldout(m_ictx->cct, 20) << "WRITE_COPYUP" << dendl;
- m_state = LIBRBD_AIO_WRITE_GUARD;
if (r < 0) {
- return should_complete(r);
- }
-
- // Read data from waiting list safely. If this AioWrite created a
- // CopyupRequest, m_read_data should be empty.
- if (m_entire_object != NULL) {
- assert(m_read_data.length() == 0);
- m_read_data.append(*m_entire_object);
+ m_state = LIBRBD_AIO_WRITE_ERROR;
+ complete(r);
+ finished = false;
+ } else {
+ finished = send_post();
}
-
- send_copyup();
- finished = false;
break;
case LIBRBD_AIO_WRITE_FLAT:
@@ -425,7 +373,7 @@ namespace librbd {
case LIBRBD_AIO_WRITE_ERROR:
assert(r < 0);
lderr(m_ictx->cct) << "WRITE_ERROR: " << cpp_strerror(r)
- << dendl;
+ << dendl;
break;
default:
@@ -437,76 +385,71 @@ namespace librbd {
}
void AbstractWrite::send() {
+ assert(m_ictx->owner_lock.is_locked());
ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " "
<< m_object_off << "~" << m_object_len << dendl;
+ send_pre();
+ }
- if (!send_pre()) {
+ void AbstractWrite::send_pre() {
+ assert(m_ictx->owner_lock.is_locked());
+ RWLock::RLocker snap_lock(m_ictx->snap_lock);
+ if (!m_ictx->object_map.enabled()) {
send_write();
+ return;
}
- }
- bool AbstractWrite::send_pre() {
- bool lost_exclusive_lock = false;
- {
- RWLock::RLocker l(m_ictx->owner_lock);
- if (!m_ictx->object_map.enabled()) {
- return false;
- }
+ // should have been flushed prior to releasing lock
+ assert(m_ictx->image_watcher->is_lock_owner());
- if (!m_ictx->image_watcher->is_lock_owner()) {
- ldout(m_ictx->cct, 1) << "lost exclusive lock during write" << dendl;
- lost_exclusive_lock = true;
- } else {
- ldout(m_ictx->cct, 20) << "send_pre " << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len << dendl;
-
- uint8_t new_state;
- boost::optional<uint8_t> current_state;
- pre_object_map_update(&new_state);
-
- m_state = LIBRBD_AIO_WRITE_PRE;
- FunctionContext *ctx = new FunctionContext(
- boost::bind(&AioRequest::complete, this, _1));
- if (!m_ictx->object_map.aio_update(m_object_no, new_state,
- current_state, ctx)) {
- // no object map update required
- delete ctx;
- return false;
- }
- }
- }
+ ldout(m_ictx->cct, 20) << "send_pre " << this << " " << m_oid << " "
+ << m_object_off << "~" << m_object_len << dendl;
+ m_state = LIBRBD_AIO_WRITE_PRE;
- if (lost_exclusive_lock) {
- complete(-ERESTART);
+ uint8_t new_state;
+ boost::optional<uint8_t> current_state;
+ pre_object_map_update(&new_state);
+
+ RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
+ if (m_ictx->object_map[m_object_no] == new_state) {
+ send_write();
+ return;
}
- return true;
+
+ FunctionContext *ctx = new FunctionContext(
+ boost::bind(&AioRequest::complete, this, _1));
+ bool updated = m_ictx->object_map.aio_update(m_object_no, new_state,
+ current_state, ctx);
+ assert(updated);
}
bool AbstractWrite::send_post() {
- ldout(m_ictx->cct, 20) << "send_post " << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len << dendl;
-
- RWLock::RLocker l(m_ictx->owner_lock);
+ RWLock::RLocker owner_locker(m_ictx->owner_lock);
+ RWLock::RLocker snap_locker(m_ictx->snap_lock);
if (!m_ictx->object_map.enabled() || !post_object_map_update()) {
return true;
}
- if (m_ictx->image_watcher->is_lock_supported() &&
- !m_ictx->image_watcher->is_lock_owner()) {
- // leave the object flagged as pending
- ldout(m_ictx->cct, 1) << "lost exclusive lock during write" << dendl;
+ // should have been flushed prior to releasing lock
+ assert(m_ictx->image_watcher->is_lock_owner());
+
+ ldout(m_ictx->cct, 20) << "send_post " << this << " " << m_oid << " "
+ << m_object_off << "~" << m_object_len << dendl;
+ m_state = LIBRBD_AIO_WRITE_POST;
+
+ RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
+ uint8_t current_state = m_ictx->object_map[m_object_no];
+ if (current_state != OBJECT_PENDING ||
+ current_state == OBJECT_NONEXISTENT) {
return true;
}
- m_state = LIBRBD_AIO_WRITE_POST;
FunctionContext *ctx = new FunctionContext(
boost::bind(&AioRequest::complete, this, _1));
- if (!m_ictx->object_map.aio_update(m_object_no, OBJECT_NONEXISTENT,
- OBJECT_PENDING, ctx)) {
- // no object map update required
- delete ctx;
- return true;
- }
+ bool updated = m_ictx->object_map.aio_update(m_object_no,
+ OBJECT_NONEXISTENT,
+ OBJECT_PENDING, ctx);
+ assert(updated);
return false;
}
@@ -527,20 +470,30 @@ namespace librbd {
rados_completion->release();
}
- void AbstractWrite::send_copyup() {
- ldout(m_ictx->cct, 20) << "send_copyup " << this << " " << m_oid << " " << m_object_off << "~" << m_object_len << dendl;
- librados::ObjectWriteOperation op;
- if (!m_read_data.is_zero()) {
- op.exec("rbd", "copyup", m_read_data);
+ void AbstractWrite::send_copyup()
+ {
+ ldout(m_ictx->cct, 20) << "send_copyup " << this << " " << m_oid << " "
+ << m_object_off << "~" << m_object_len << dendl;
+ m_state = LIBRBD_AIO_WRITE_COPYUP;
+
+ m_ictx->copyup_list_lock.Lock();
+ map<uint64_t, CopyupRequest*>::iterator it =
+ m_ictx->copyup_list.find(m_object_no);
+ if (it == m_ictx->copyup_list.end()) {
+ CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
+ m_object_no,
+ m_parent_extents);
+
+ // make sure to wait on this CopyupRequest
+ new_req->append_request(this);
+ m_ictx->copyup_list[m_object_no] = new_req;
+
+ m_ictx->copyup_list_lock.Unlock();
+ new_req->send();
+ } else {
+ it->second->append_request(this);
+ m_ictx->copyup_list_lock.Unlock();
}
- add_write_ops(&op);
- assert(op.size() != 0);
-
- librados::AioCompletion *rados_completion =
- librados::Rados::aio_create_completion(this, NULL, rados_req_cb);
- m_ictx->md_ctx.aio_operate(m_oid, rados_completion, &op,
- m_snap_seq, m_snaps);
- rados_completion->release();
}
void AioWrite::add_write_ops(librados::ObjectWriteOperation *wr) {
diff --git a/src/librbd/AioRequest.h b/src/librbd/AioRequest.h
index bac3b47..4fff5ef 100644
--- a/src/librbd/AioRequest.h
+++ b/src/librbd/AioRequest.h
@@ -27,41 +27,44 @@ namespace librbd {
class AioRequest
{
public:
- AioRequest();
AioRequest(ImageCtx *ictx, const std::string &oid,
uint64_t objectno, uint64_t off, uint64_t len,
- const ::SnapContext &snapc, librados::snap_t snap_id,
+ librados::snap_t snap_id,
Context *completion, bool hide_enoent);
- virtual ~AioRequest();
+ virtual ~AioRequest() {}
+
+ virtual void add_copyup_ops(librados::ObjectWriteOperation *wr) {};
void complete(int r);
virtual bool should_complete(int r) = 0;
virtual void send() = 0;
+ bool has_parent() const {
+ return !m_parent_extents.empty();
+ }
+
protected:
- void read_from_parent(vector<pair<uint64_t,uint64_t> >& image_extents,
- bool block_completion);
+ bool compute_parent_extents();
ImageCtx *m_ictx;
std::string m_oid;
uint64_t m_object_no, m_object_off, m_object_len;
librados::snap_t m_snap_id;
Context *m_completion;
- AioCompletion *m_parent_completion;
- ceph::bufferlist m_read_data;
+ std::vector<std::pair<uint64_t,uint64_t> > m_parent_extents;
bool m_hide_enoent;
- std::vector<librados::snap_t> m_snaps;
};
class AioRead : public AioRequest {
public:
AioRead(ImageCtx *ictx, const std::string &oid,
uint64_t objectno, uint64_t offset, uint64_t len,
- vector<pair<uint64_t,uint64_t> >& be, const ::SnapContext &snapc,
+ vector<pair<uint64_t,uint64_t> >& be,
librados::snap_t snap_id, bool sparse,
Context *completion, int op_flags);
- virtual ~AioRead() {}
+ virtual ~AioRead();
+
virtual bool should_complete(int r);
virtual void send();
void guard_read();
@@ -79,7 +82,8 @@ namespace librbd {
bool m_tried_parent;
bool m_sparse;
int m_op_flags;
- vector<pair<uint64_t,uint64_t> > m_image_extents;
+ ceph::bufferlist m_read_data;
+ AioCompletion *m_parent_completion;
/**
* Reads go through the following state machine to deal with
@@ -104,26 +108,26 @@ namespace librbd {
};
read_state_d m_state;
+
+ void send_copyup();
+ void read_from_parent(const vector<pair<uint64_t,uint64_t> >& image_extents);
};
class AbstractWrite : public AioRequest {
public:
- AbstractWrite();
- AbstractWrite(ImageCtx *ictx, const std::string &oid,
- uint64_t object_no, uint64_t object_off, uint64_t len,
- vector<pair<uint64_t,uint64_t> >& objectx, uint64_t object_overlap,
- const ::SnapContext &snapc,
- librados::snap_t snap_id,
- Context *completion,
- bool hide_enoent);
+ AbstractWrite(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+ uint64_t object_off, uint64_t len, const ::SnapContext &snapc,
+ Context *completion, bool hide_enoent);
virtual ~AbstractWrite() {}
- virtual bool should_complete(int r);
- virtual void send();
- bool has_parent() const {
- return !m_object_image_extents.empty();
+ virtual void add_copyup_ops(librados::ObjectWriteOperation *wr)
+ {
+ add_write_ops(wr);
}
+ virtual bool should_complete(int r);
+ virtual void send();
+
private:
/**
* Writes go through the following state machine to deal with
@@ -134,27 +138,30 @@ namespace librbd {
* . |
* . \---> LIBRBD_AIO_WRITE_PRE
* . | |
- * . . . . . . | . . . . | . . . . . . . . . . .
+ * . . . . . . | . . . . | . . . . . . . . . . .
* . | -or- | .
* . | | v
* . | \----------------> LIBRBD_AIO_WRITE_FLAT . . .
* . | | .
* v v need copyup | .
* LIBRBD_AIO_WRITE_GUARD -----------> LIBRBD_AIO_WRITE_COPYUP | .
- * . | ^ | | .
- * . | | | | .
- * . | \---------------------------/ | .
- * . | | .
- * . \-------------------\ /-------------------/ .
- * . | | .
- * . LIBRBD_AIO_WRITE_POST .
- * . | .
- * . v .
- * . . . . . . . . . . . . . . > <finish> < . . . . . . . . . . . . . .
+ * . | | . | .
+ * . | | . | .
+ * . | /-----/ . | .
+ * . | | . | .
+ * . \-------------------\ | /-------------------/ .
+ * . | | | . .
+ * . v v v . .
+ * . LIBRBD_AIO_WRITE_POST . .
+ * . | . .
+ * . | . . . . . . . . .
+ * . | . .
+ * . v v .
+ * . . . . . . . . . . . . . . > <finish> < . . . . . . . . . . . . . .
*
- * The _PRE_REMOVE/_POST_REMOVE states are skipped if the object map
- * is disabled. The write starts in _WRITE_GUARD or _FLAT depending on
- * whether or not there is a parent overlap.
+ * The _PRE/_POST states are skipped if the object map is disabled.
+ * The write starts in _WRITE_GUARD or _FLAT depending on whether or not
+ * there is a parent overlap.
*/
enum write_state_d {
LIBRBD_AIO_WRITE_GUARD,
@@ -167,11 +174,9 @@ namespace librbd {
protected:
write_state_d m_state;
- vector<pair<uint64_t,uint64_t> > m_object_image_extents;
- uint64_t m_parent_overlap;
librados::ObjectWriteOperation m_write;
uint64_t m_snap_seq;
- ceph::bufferlist *m_entire_object;
+ std::vector<librados::snap_t> m_snaps;
virtual void add_write_ops(librados::ObjectWriteOperation *wr) = 0;
virtual void guard_write();
@@ -181,7 +186,7 @@ namespace librbd {
}
private:
- bool send_pre();
+ void send_pre();
bool send_post();
void send_write();
void send_copyup();
@@ -189,16 +194,10 @@ namespace librbd {
class AioWrite : public AbstractWrite {
public:
- AioWrite(ImageCtx *ictx, const std::string &oid,
- uint64_t object_no, uint64_t object_off,
- vector<pair<uint64_t,uint64_t> >& objectx, uint64_t object_overlap,
- const ceph::bufferlist &data, const ::SnapContext &snapc,
- librados::snap_t snap_id,
- Context *completion)
- : AbstractWrite(ictx, oid,
- object_no, object_off, data.length(),
- objectx, object_overlap,
- snapc, snap_id,
+ AioWrite(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+ uint64_t object_off, const ceph::bufferlist &data,
+ const ::SnapContext &snapc, Context *completion)
+ : AbstractWrite(ictx, oid, object_no, object_off, data.length(), snapc,
completion, false),
m_write_data(data), m_op_flags(0) {
}
@@ -220,16 +219,10 @@ namespace librbd {
class AioRemove : public AbstractWrite {
public:
- AioRemove(ImageCtx *ictx, const std::string &oid,
- uint64_t object_no,
- vector<pair<uint64_t,uint64_t> >& objectx, uint64_t object_overlap,
- const ::SnapContext &snapc, librados::snap_t snap_id,
- Context *completion)
- : AbstractWrite(ictx, oid,
- object_no, 0, 0,
- objectx, object_overlap,
- snapc, snap_id, completion,
- true) {
+ AioRemove(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+ const ::SnapContext &snapc, Context *completion)
+ : AbstractWrite(ictx, oid, object_no, 0, 0, snapc, completion, true),
+ m_object_state(OBJECT_NONEXISTENT) {
}
virtual ~AioRemove() {}
@@ -268,16 +261,11 @@ namespace librbd {
class AioTruncate : public AbstractWrite {
public:
- AioTruncate(ImageCtx *ictx, const std::string &oid,
- uint64_t object_no, uint64_t object_off,
- vector<pair<uint64_t,uint64_t> >& objectx, uint64_t object_overlap,
- const ::SnapContext &snapc, librados::snap_t snap_id,
- Context *completion)
- : AbstractWrite(ictx, oid,
- object_no, object_off, 0,
- objectx, object_overlap,
- snapc, snap_id, completion,
- true) {
+ AioTruncate(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+ uint64_t object_off, const ::SnapContext &snapc,
+ Context *completion)
+ : AbstractWrite(ictx, oid, object_no, object_off, 0, snapc, completion,
+ true) {
}
virtual ~AioTruncate() {}
@@ -293,16 +281,11 @@ namespace librbd {
class AioZero : public AbstractWrite {
public:
- AioZero(ImageCtx *ictx, const std::string &oid,
- uint64_t object_no, uint64_t object_off, uint64_t object_len,
- vector<pair<uint64_t,uint64_t> >& objectx, uint64_t object_overlap,
- const ::SnapContext &snapc, librados::snap_t snap_id,
- Context *completion)
- : AbstractWrite(ictx, oid,
- object_no, object_off, object_len,
- objectx, object_overlap,
- snapc, snap_id, completion,
- true) {
+ AioZero(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+ uint64_t object_off, uint64_t object_len,
+ const ::SnapContext &snapc, Context *completion)
+ : AbstractWrite(ictx, oid, object_no, object_off, object_len, snapc,
+ completion, true) {
}
virtual ~AioZero() {}
diff --git a/src/librbd/AsyncFlattenRequest.cc b/src/librbd/AsyncFlattenRequest.cc
index ebaf511..bd1875c 100644
--- a/src/librbd/AsyncFlattenRequest.cc
+++ b/src/librbd/AsyncFlattenRequest.cc
@@ -9,11 +9,11 @@
#include "librbd/ObjectMap.h"
#include "common/dout.h"
#include "common/errno.h"
-#include <boost/lambda/bind.hpp>
-#include <boost/lambda/construct.hpp>
+#include <boost/lambda/bind.hpp>
+#include <boost/lambda/construct.hpp>
#define dout_subsys ceph_subsys_rbd
-#undef dout_prefix
+#undef dout_prefix
#define dout_prefix *_dout << "librbd::AsyncFlattenRequest: "
namespace librbd {
@@ -23,60 +23,37 @@ public:
AsyncFlattenObjectContext(AsyncObjectThrottle &throttle, ImageCtx *image_ctx,
uint64_t object_size, ::SnapContext snapc,
uint64_t object_no)
- : C_AsyncObjectThrottle(throttle), m_image_ctx(*image_ctx),
- m_object_size(object_size), m_snapc(snapc), m_object_no(object_no)
+ : C_AsyncObjectThrottle(throttle, *image_ctx), m_object_size(object_size),
+ m_snapc(snapc), m_object_no(object_no)
{
}
virtual int send() {
+ assert(m_image_ctx.owner_lock.is_locked());
CephContext *cct = m_image_ctx.cct;
- RWLock::RLocker l(m_image_ctx.owner_lock);
if (m_image_ctx.image_watcher->is_lock_supported() &&
!m_image_ctx.image_watcher->is_lock_owner()) {
ldout(cct, 1) << "lost exclusive lock during flatten" << dendl;
return -ERESTART;
}
- RWLock::RLocker l2(m_image_ctx.snap_lock);
- uint64_t overlap;
- {
- RWLock::RLocker l3(m_image_ctx.parent_lock);
+ bufferlist bl;
+ string oid = m_image_ctx.get_object_name(m_object_no);
+ AioWrite *req = new AioWrite(&m_image_ctx, oid, m_object_no, 0, bl, m_snapc,
+ this);
+ if (!req->has_parent()) {
// stop early if the parent went away - it just means
- // another flatten finished first, so this one is useless.
- if (!m_image_ctx.parent) {
- return 1;
- }
-
- // resize might have occurred while flatten is running
- uint64_t parent_overlap;
- int r = m_image_ctx.get_parent_overlap(CEPH_NOSNAP, &parent_overlap);
- assert(r == 0);
- overlap = min(m_image_ctx.size, parent_overlap);
- }
-
- // map child object onto the parent
- vector<pair<uint64_t,uint64_t> > objectx;
- Striper::extent_to_file(cct, &m_image_ctx.layout, m_object_no,
- 0, m_object_size, objectx);
- uint64_t object_overlap = m_image_ctx.prune_parent_extents(objectx, overlap);
- assert(object_overlap <= m_object_size);
- if (object_overlap == 0) {
- // resize shrunk image while flattening
+ // another flatten finished first or the image was resized
+ delete req;
return 1;
}
- bufferlist bl;
- string oid = m_image_ctx.get_object_name(m_object_no);
- AioWrite *req = new AioWrite(&m_image_ctx, oid, m_object_no, 0, objectx,
- object_overlap, bl, m_snapc, CEPH_NOSNAP,
- this);
req->send();
return 0;
}
private:
- ImageCtx &m_image_ctx;
uint64_t m_object_size;
::SnapContext m_snapc;
uint64_t m_object_no;
@@ -112,6 +89,7 @@ bool AsyncFlattenRequest::should_complete(int r) {
}
void AsyncFlattenRequest::send() {
+ assert(m_image_ctx.owner_lock.is_locked());
CephContext *cct = m_image_ctx.cct;
ldout(cct, 5) << this << " send" << dendl;
@@ -121,91 +99,77 @@ void AsyncFlattenRequest::send() {
boost::lambda::_1, &m_image_ctx, m_object_size, m_snapc,
boost::lambda::_2));
AsyncObjectThrottle *throttle = new AsyncObjectThrottle(
- *this, context_factory, create_callback_context(), m_prog_ctx, 0,
- m_overlap_objects);
+ this, m_image_ctx, context_factory, create_callback_context(), m_prog_ctx,
+ 0, m_overlap_objects);
throttle->start_ops(cct->_conf->rbd_concurrent_management_ops);
}
bool AsyncFlattenRequest::send_update_header() {
+ assert(m_image_ctx.owner_lock.is_locked());
CephContext *cct = m_image_ctx.cct;
- bool lost_exclusive_lock = false;
+ ldout(cct, 5) << this << " send_update_header" << dendl;
m_state = STATE_UPDATE_HEADER;
- {
- RWLock::RLocker l(m_image_ctx.owner_lock);
- if (m_image_ctx.image_watcher->is_lock_supported() &&
- !m_image_ctx.image_watcher->is_lock_owner()) {
- ldout(cct, 1) << "lost exclusive lock during header update" << dendl;
- lost_exclusive_lock = true;
- } else {
- ldout(cct, 5) << this << " send_update_header" << dendl;
- RWLock::RLocker l2(m_image_ctx.parent_lock);
- // stop early if the parent went away - it just means
- // another flatten finished first, so this one is useless.
- if (!m_image_ctx.parent) {
- ldout(cct, 5) << "image already flattened" << dendl;
- return true;
- }
- m_ignore_enoent = true;
- m_parent_spec = m_image_ctx.parent_md.spec;
-
- // remove parent from this (base) image
- librados::ObjectWriteOperation op;
- if (m_image_ctx.image_watcher->is_lock_supported()) {
- m_image_ctx.image_watcher->assert_header_locked(&op);
- }
- cls_client::remove_parent(&op);
-
- librados::AioCompletion *rados_completion = create_callback_completion();
- int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid,
- rados_completion, &op);
- assert(r == 0);
- rados_completion->release();
+ // should have been canceled prior to releasing lock
+ assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+ m_image_ctx.image_watcher->is_lock_owner());
+
+ {
+ RWLock::RLocker parent_locker(m_image_ctx.parent_lock);
+ // stop early if the parent went away - it just means
+ // another flatten finished first, so this one is useless.
+ if (!m_image_ctx.parent) {
+ ldout(cct, 5) << "image already flattened" << dendl;
+ return true;
}
+ m_parent_spec = m_image_ctx.parent_md.spec;
}
+ m_ignore_enoent = true;
- if (lost_exclusive_lock) {
- complete(-ERESTART);
+ // remove parent from this (base) image
+ librados::ObjectWriteOperation op;
+ if (m_image_ctx.image_watcher->is_lock_supported()) {
+ m_image_ctx.image_watcher->assert_header_locked(&op);
}
+ cls_client::remove_parent(&op);
+
+ librados::AioCompletion *rados_completion = create_callback_completion();
+ int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid,
+ rados_completion, &op);
+ assert(r == 0);
+ rados_completion->release();
return false;
}
bool AsyncFlattenRequest::send_update_children() {
CephContext *cct = m_image_ctx.cct;
- bool lost_exclusive_lock = false;
- m_state = STATE_UPDATE_CHILDREN;
- {
- RWLock::RLocker l(m_image_ctx.owner_lock);
- if (m_image_ctx.image_watcher->is_lock_supported() &&
- !m_image_ctx.image_watcher->is_lock_owner()) {
- ldout(cct, 1) << "lost exclusive lock during children update" << dendl;
- lost_exclusive_lock = true;
- } else {
- // if there are no snaps, remove from the children object as well
- // (if snapshots remain, they have their own parent info, and the child
- // will be removed when the last snap goes away)
- RWLock::RLocker l2(m_image_ctx.snap_lock);
- if (!m_image_ctx.snaps.empty()) {
- return true;
- }
-
- ldout(cct, 2) << "removing child from children list..." << dendl;
- librados::ObjectWriteOperation op;
- cls_client::remove_child(&op, m_parent_spec, m_image_ctx.id);
-
- librados::AioCompletion *rados_completion = create_callback_completion();
- int r = m_image_ctx.md_ctx.aio_operate(RBD_CHILDREN, rados_completion,
- &op);
- assert(r == 0);
- rados_completion->release();
- }
- }
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
- if (lost_exclusive_lock) {
- complete(-ERESTART);
+ // should have been canceled prior to releasing lock
+ assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+ m_image_ctx.image_watcher->is_lock_owner());
+
+ // if there are no snaps, remove from the children object as well
+ // (if snapshots remain, they have their own parent info, and the child
+ // will be removed when the last snap goes away)
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+ if (!m_image_ctx.snaps.empty()) {
+ return true;
}
+
+ ldout(cct, 2) << "removing child from children list..." << dendl;
+ m_state = STATE_UPDATE_CHILDREN;
+
+ librados::ObjectWriteOperation op;
+ cls_client::remove_child(&op, m_parent_spec, m_image_ctx.id);
+
+ librados::AioCompletion *rados_completion = create_callback_completion();
+ int r = m_image_ctx.md_ctx.aio_operate(RBD_CHILDREN, rados_completion,
+ &op);
+ assert(r == 0);
+ rados_completion->release();
return false;
}
diff --git a/src/librbd/AsyncObjectThrottle.cc b/src/librbd/AsyncObjectThrottle.cc
index 4290eb8..2c7ccd1 100644
--- a/src/librbd/AsyncObjectThrottle.cc
+++ b/src/librbd/AsyncObjectThrottle.cc
@@ -2,24 +2,35 @@
// vim: ts=8 sw=2 smarttab
#include "librbd/AsyncObjectThrottle.h"
#include "include/rbd/librbd.hpp"
+#include "common/RWLock.h"
#include "librbd/AsyncRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
namespace librbd
{
-AsyncObjectThrottle::AsyncObjectThrottle(const AsyncRequest& async_request,
+void C_AsyncObjectThrottle::finish(int r) {
+ RWLock::RLocker l(m_image_ctx.owner_lock);
+ m_finisher.finish_op(r);
+}
+
+AsyncObjectThrottle::AsyncObjectThrottle(const AsyncRequest* async_request,
+ ImageCtx &image_ctx,
const ContextFactory& context_factory,
Context *ctx, ProgressContext &prog_ctx,
uint64_t object_no,
uint64_t end_object_no)
- : m_lock("librbd::AsyncThrottle::m_lock"),
- m_async_request(async_request), m_context_factory(context_factory),
- m_ctx(ctx), m_prog_ctx(prog_ctx), m_object_no(object_no),
- m_end_object_no(end_object_no), m_current_ops(0), m_ret(0)
+ : m_lock(unique_lock_name("librbd::AsyncThrottle::m_lock", this)),
+ m_async_request(async_request), m_image_ctx(image_ctx),
+ m_context_factory(context_factory), m_ctx(ctx), m_prog_ctx(prog_ctx),
+ m_object_no(object_no), m_end_object_no(end_object_no), m_current_ops(0),
+ m_ret(0)
{
}
void AsyncObjectThrottle::start_ops(uint64_t max_concurrent) {
+ assert(m_image_ctx.owner_lock.is_locked());
bool complete;
{
Mutex::Locker l(m_lock);
@@ -38,6 +49,7 @@ void AsyncObjectThrottle::start_ops(uint64_t max_concurrent) {
}
void AsyncObjectThrottle::finish_op(int r) {
+ assert(m_image_ctx.owner_lock.is_locked());
bool complete;
{
Mutex::Locker l(m_lock);
@@ -58,7 +70,7 @@ void AsyncObjectThrottle::finish_op(int r) {
void AsyncObjectThrottle::start_next_op() {
bool done = false;
while (!done) {
- if (m_async_request.is_canceled() && m_ret == 0) {
+ if (m_async_request->is_canceled() && m_ret == 0) {
// allow in-flight ops to complete, but don't start new ops
m_ret = -ERESTART;
return;
diff --git a/src/librbd/AsyncObjectThrottle.h b/src/librbd/AsyncObjectThrottle.h
index 83d69d8..f7f254fb 100644
--- a/src/librbd/AsyncObjectThrottle.h
+++ b/src/librbd/AsyncObjectThrottle.h
@@ -13,6 +13,7 @@ namespace librbd
{
class AsyncRequest;
class ProgressContext;
+struct ImageCtx;
class AsyncObjectThrottleFinisher {
public:
@@ -22,18 +23,19 @@ public:
class C_AsyncObjectThrottle : public Context {
public:
- C_AsyncObjectThrottle(AsyncObjectThrottleFinisher &finisher)
- : m_finisher(finisher)
+ C_AsyncObjectThrottle(AsyncObjectThrottleFinisher &finisher,
+ ImageCtx &image_ctx)
+ : m_image_ctx(image_ctx), m_finisher(finisher)
{
}
- virtual void finish(int r)
- {
- m_finisher.finish_op(r);
- }
-
virtual int send() = 0;
+protected:
+ ImageCtx &m_image_ctx;
+
+ virtual void finish(int r);
+
private:
AsyncObjectThrottleFinisher &m_finisher;
};
@@ -43,7 +45,7 @@ public:
typedef boost::function<C_AsyncObjectThrottle*(AsyncObjectThrottle&,
uint64_t)> ContextFactory;
- AsyncObjectThrottle(const AsyncRequest &async_request,
+ AsyncObjectThrottle(const AsyncRequest *async_request, ImageCtx &image_ctx,
const ContextFactory& context_factory, Context *ctx,
ProgressContext &prog_ctx, uint64_t object_no,
uint64_t end_object_no);
@@ -53,7 +55,8 @@ public:
private:
Mutex m_lock;
- const AsyncRequest &m_async_request;
+ const AsyncRequest *m_async_request;
+ ImageCtx &m_image_ctx;
ContextFactory m_context_factory;
Context *m_ctx;
ProgressContext &m_prog_ctx;
diff --git a/src/librbd/AsyncRequest.cc b/src/librbd/AsyncRequest.cc
index 825c8c4..2f0c2d9 100644
--- a/src/librbd/AsyncRequest.cc
+++ b/src/librbd/AsyncRequest.cc
@@ -1,6 +1,7 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#include "librbd/AsyncRequest.h"
+#include "common/WorkQueue.h"
#include "librbd/ImageCtx.h"
#include "librbd/internal.h"
#include <boost/bind.hpp>
@@ -21,6 +22,10 @@ AsyncRequest::~AsyncRequest() {
m_image_ctx.async_requests_cond.Signal();
}
+void AsyncRequest::async_complete(int r) {
+ m_image_ctx.op_work_queue->queue(create_callback_context(), r);
+}
+
librados::AioCompletion *AsyncRequest::create_callback_completion() {
return librados::Rados::aio_create_completion(create_callback_context(),
NULL, rados_ctx_cb);
@@ -30,4 +35,9 @@ Context *AsyncRequest::create_callback_context() {
return new FunctionContext(boost::bind(&AsyncRequest::complete, this, _1));
}
+Context *AsyncRequest::create_async_callback_context() {
+ return new FunctionContext(boost::bind(&AsyncRequest::async_complete, this,
+ _1));;
+}
+
} // namespace librbd
diff --git a/src/librbd/AsyncRequest.h b/src/librbd/AsyncRequest.h
index fd260a9..7324a22 100644
--- a/src/librbd/AsyncRequest.h
+++ b/src/librbd/AsyncRequest.h
@@ -43,6 +43,9 @@ protected:
librados::AioCompletion *create_callback_completion();
Context *create_callback_context();
+ Context *create_async_callback_context();
+
+ void async_complete(int r);
virtual bool safely_cancel(int r) {
return true;
diff --git a/src/librbd/AsyncResizeRequest.cc b/src/librbd/AsyncResizeRequest.cc
index 621d59d..8ddf967 100644
--- a/src/librbd/AsyncResizeRequest.cc
+++ b/src/librbd/AsyncResizeRequest.cc
@@ -24,25 +24,20 @@ AsyncResizeRequest::AsyncResizeRequest(ImageCtx &image_ctx, Context *on_finish,
m_prog_ctx(prog_ctx), m_new_parent_overlap(0),
m_xlist_item(this)
{
- RWLock::WLocker l(m_image_ctx.snap_lock);
- m_image_ctx.async_resize_reqs.push_back(&m_xlist_item);
- m_original_size = m_image_ctx.size;
- compute_parent_overlap();
}
AsyncResizeRequest::~AsyncResizeRequest() {
AsyncResizeRequest *next_req = NULL;
{
- RWLock::WLocker l(m_image_ctx.snap_lock);
+ RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
assert(m_xlist_item.remove_myself());
if (!m_image_ctx.async_resize_reqs.empty()) {
next_req = m_image_ctx.async_resize_reqs.front();
- next_req->m_original_size = m_image_ctx.size;
- next_req->compute_parent_overlap();
}
}
if (next_req != NULL) {
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
next_req->send();
}
}
@@ -72,7 +67,12 @@ bool AsyncResizeRequest::should_complete(int r) {
lderr(cct) << "resize encountered an error: " << cpp_strerror(r) << dendl;
return true;
}
+ if (m_state == STATE_FINISHED) {
+ ldout(cct, 5) << "FINISHED" << dendl;
+ return true;
+ }
+ RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
switch (m_state) {
case STATE_FLUSH:
ldout(cct, 5) << "FLUSH" << dendl;
@@ -109,10 +109,6 @@ bool AsyncResizeRequest::should_complete(int r) {
increment_refresh_seq();
return true;
- case STATE_FINISHED:
- ldout(cct, 5) << "FINISHED" << dendl;
- return true;
-
default:
lderr(cct) << "invalid state: " << m_state << dendl;
assert(false);
@@ -122,14 +118,20 @@ bool AsyncResizeRequest::should_complete(int r) {
}
void AsyncResizeRequest::send() {
- {
- RWLock::RLocker l(m_image_ctx.snap_lock);
- assert(!m_image_ctx.async_resize_reqs.empty());
+ assert(m_image_ctx.owner_lock.is_locked());
- // only allow a single concurrent resize request
- if (m_image_ctx.async_resize_reqs.front() != this) {
- return;
+ {
+ RWLock::WLocker snap_locker(m_image_ctx.snap_lock);
+ if (!m_xlist_item.is_on_list()) {
+ m_image_ctx.async_resize_reqs.push_back(&m_xlist_item);
+ if (m_image_ctx.async_resize_reqs.front() != this) {
+ return;
+ }
}
+
+ assert(m_image_ctx.async_resize_reqs.front() == this);
+ m_original_size = m_image_ctx.size;
+ compute_parent_overlap();
}
CephContext *cct = m_image_ctx.cct;
@@ -158,11 +160,13 @@ void AsyncResizeRequest::send_flush() {
m_state = STATE_FLUSH;
// with clipping adjusted, ensure that write / copy-on-read operations won't
- // (re-)create objects that we just removed
- m_image_ctx.flush_async_operations(create_callback_context());
+ // (re-)create objects that we just removed. need async callback to ensure
+ // we don't have cache_lock already held
+ m_image_ctx.flush_async_operations(create_async_callback_context());
}
void AsyncResizeRequest::send_invalidate_cache() {
+ assert(m_image_ctx.owner_lock.is_locked());
ldout(m_image_ctx.cct, 5) << this << " send_invalidate_cache: "
<< " original_size=" << m_original_size
<< " new_size=" << m_new_size << dendl;
@@ -174,6 +178,7 @@ void AsyncResizeRequest::send_invalidate_cache() {
}
void AsyncResizeRequest::send_trim_image() {
+ assert(m_image_ctx.owner_lock.is_locked());
ldout(m_image_ctx.cct, 5) << this << " send_trim_image: "
<< " original_size=" << m_original_size
<< " new_size=" << m_new_size << dendl;
@@ -187,109 +192,76 @@ void AsyncResizeRequest::send_trim_image() {
}
void AsyncResizeRequest::send_grow_object_map() {
- bool lost_exclusive_lock = false;
- bool object_map_enabled = true;
- {
- RWLock::RLocker l(m_image_ctx.owner_lock);
- if (!m_image_ctx.object_map.enabled()) {
- object_map_enabled = false;
- } else {
- ldout(m_image_ctx.cct, 5) << this << " send_grow_object_map: "
- << " original_size=" << m_original_size
- << " new_size=" << m_new_size << dendl;
- m_state = STATE_GROW_OBJECT_MAP;
-
- if (m_image_ctx.image_watcher->is_lock_supported() &&
- !m_image_ctx.image_watcher->is_lock_owner()) {
- ldout(m_image_ctx.cct, 1) << "lost exclusive lock during grow object map" << dendl;
- lost_exclusive_lock = true;
- } else {
- m_image_ctx.object_map.aio_resize(m_new_size, OBJECT_NONEXISTENT,
- create_callback_context());
- object_map_enabled = true;
- }
- }
- }
-
- // avoid possible recursive lock attempts
- if (!object_map_enabled) {
+ assert(m_image_ctx.owner_lock.is_locked());
+ if (!m_image_ctx.object_map.enabled()) {
send_update_header();
- } else if (lost_exclusive_lock) {
- complete(-ERESTART);
+ return;
}
+
+ ldout(m_image_ctx.cct, 5) << this << " send_grow_object_map: "
+ << " original_size=" << m_original_size
+ << " new_size=" << m_new_size << dendl;
+ m_state = STATE_GROW_OBJECT_MAP;
+
+ // should have been canceled prior to releasing lock
+ assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+ m_image_ctx.image_watcher->is_lock_owner());
+
+ m_image_ctx.object_map.aio_resize(m_new_size, OBJECT_NONEXISTENT,
+ create_callback_context());
}
bool AsyncResizeRequest::send_shrink_object_map() {
- bool lost_exclusive_lock = false;
- {
- RWLock::RLocker l(m_image_ctx.owner_lock);
- if (!m_image_ctx.object_map.enabled() || m_new_size > m_original_size) {
- return true;
- }
-
- ldout(m_image_ctx.cct, 5) << this << " send_shrink_object_map: "
- << " original_size=" << m_original_size
- << " new_size=" << m_new_size << dendl;
- m_state = STATE_SHRINK_OBJECT_MAP;
-
- if (m_image_ctx.image_watcher->is_lock_supported() &&
- !m_image_ctx.image_watcher->is_lock_owner()) {
- ldout(m_image_ctx.cct, 1) << "lost exclusive lock during shrink object map" << dendl;
- lost_exclusive_lock = true;
- } else {
- m_image_ctx.object_map.aio_resize(m_new_size, OBJECT_NONEXISTENT,
- create_callback_context());
- }
+ assert(m_image_ctx.owner_lock.is_locked());
+ if (!m_image_ctx.object_map.enabled() || m_new_size > m_original_size) {
+ return true;
}
- // avoid possible recursive lock attempts
- if (lost_exclusive_lock) {
- complete(-ERESTART);
- }
+ ldout(m_image_ctx.cct, 5) << this << " send_shrink_object_map: "
+ << " original_size=" << m_original_size
+ << " new_size=" << m_new_size << dendl;
+ m_state = STATE_SHRINK_OBJECT_MAP;
+
+ // should have been canceled prior to releasing lock
+ assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+ m_image_ctx.image_watcher->is_lock_owner());
+
+ m_image_ctx.object_map.aio_resize(m_new_size, OBJECT_NONEXISTENT,
+ create_callback_context());
return false;
}
void AsyncResizeRequest::send_update_header() {
- bool lost_exclusive_lock = false;
+ assert(m_image_ctx.owner_lock.is_locked());
ldout(m_image_ctx.cct, 5) << this << " send_update_header: "
<< " original_size=" << m_original_size
<< " new_size=" << m_new_size << dendl;
m_state = STATE_UPDATE_HEADER;
- {
- RWLock::RLocker l(m_image_ctx.owner_lock);
- if (m_image_ctx.image_watcher->is_lock_supported() &&
- !m_image_ctx.image_watcher->is_lock_owner()) {
- ldout(m_image_ctx.cct, 1) << "lost exclusive lock during header update" << dendl;
- lost_exclusive_lock = true;
- } else {
- librados::ObjectWriteOperation op;
- if (m_image_ctx.old_format) {
- // rewrite header
- bufferlist bl;
- m_image_ctx.header.image_size = m_new_size;
- bl.append((const char *)&m_image_ctx.header, sizeof(m_image_ctx.header));
- op.write(0, bl);
- } else {
- if (m_image_ctx.image_watcher->is_lock_supported()) {
- m_image_ctx.image_watcher->assert_header_locked(&op);
- }
- cls_client::set_size(&op, m_new_size);
- }
-
- librados::AioCompletion *rados_completion = create_callback_completion();
- int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid,
- rados_completion, &op);
- assert(r == 0);
- rados_completion->release();
+ // should have been canceled prior to releasing lock
+ assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+ m_image_ctx.image_watcher->is_lock_owner());
+
+ librados::ObjectWriteOperation op;
+ if (m_image_ctx.old_format) {
+ // rewrite header
+ bufferlist bl;
+ m_image_ctx.header.image_size = m_new_size;
+ bl.append((const char *)&m_image_ctx.header, sizeof(m_image_ctx.header));
+ op.write(0, bl);
+ } else {
+ if (m_image_ctx.image_watcher->is_lock_supported()) {
+ m_image_ctx.image_watcher->assert_header_locked(&op);
}
+ cls_client::set_size(&op, m_new_size);
}
- // avoid possible recursive lock attempts
- if (lost_exclusive_lock) {
- complete(-ERESTART);
- }
+ librados::AioCompletion *rados_completion = create_callback_completion();
+ int r = m_image_ctx.md_ctx.aio_operate(m_image_ctx.header_oid,
+ rados_completion, &op);
+ assert(r == 0);
+ rados_completion->release();
}
void AsyncResizeRequest::compute_parent_overlap() {
diff --git a/src/librbd/AsyncTrimRequest.cc b/src/librbd/AsyncTrimRequest.cc
index cb4764a..20f7102 100644
--- a/src/librbd/AsyncTrimRequest.cc
+++ b/src/librbd/AsyncTrimRequest.cc
@@ -28,22 +28,18 @@ class AsyncTrimObjectContext : public C_AsyncObjectThrottle {
public:
AsyncTrimObjectContext(AsyncObjectThrottle &throttle, ImageCtx *image_ctx,
uint64_t object_no)
- : C_AsyncObjectThrottle(throttle), m_image_ctx(*image_ctx),
- m_object_no(object_no)
+ : C_AsyncObjectThrottle(throttle, *image_ctx), m_object_no(object_no)
{
}
virtual int send() {
+ assert(m_image_ctx.owner_lock.is_locked());
+ assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+ m_image_ctx.image_watcher->is_lock_owner());
if (!m_image_ctx.object_map.object_may_exist(m_object_no)) {
return 1;
}
- RWLock::RLocker l(m_image_ctx.owner_lock);
- if (m_image_ctx.image_watcher->is_lock_supported() &&
- !m_image_ctx.image_watcher->is_lock_owner()) {
- return -ERESTART;
- }
-
string oid = m_image_ctx.get_object_name(m_object_no);
ldout(m_image_ctx.cct, 10) << "removing " << oid << dendl;
@@ -56,7 +52,6 @@ public:
}
private:
- ImageCtx &m_image_ctx;
uint64_t m_object_no;
};
@@ -93,26 +88,29 @@ bool AsyncTrimRequest::should_complete(int r)
switch (m_state) {
case STATE_PRE_REMOVE:
ldout(cct, 5) << " PRE_REMOVE" << dendl;
- send_remove_objects();
- break;
+ {
+ RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
+ send_remove_objects();
+ }
+ break;
case STATE_REMOVE_OBJECTS:
ldout(cct, 5) << " REMOVE_OBJECTS" << dendl;
- if (send_post_remove()) {
- return true;
- }
+ send_post_remove();
break;
case STATE_POST_REMOVE:
ldout(cct, 5) << " POST_OBJECTS" << dendl;
- if (send_clean_boundary()) {
- return true;
+ {
+ RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
+ send_clean_boundary();
}
break;
case STATE_CLEAN_BOUNDARY:
ldout(cct, 5) << "CLEAN_BOUNDARY" << dendl;
- return true;
+ finish();
+ break;
case STATE_FINISHED:
ldout(cct, 5) << "FINISHED" << dendl;
@@ -127,19 +125,18 @@ bool AsyncTrimRequest::should_complete(int r)
}
void AsyncTrimRequest::send() {
+ assert(m_image_ctx.owner_lock.is_locked());
if (m_delete_start < m_num_objects) {
send_pre_remove();
} else {
- bool finished = send_clean_boundary();
- if (finished) {
- m_state = STATE_FINISHED;
- complete(0);
- }
+ send_clean_boundary();
}
}
void AsyncTrimRequest::send_remove_objects() {
+ assert(m_image_ctx.owner_lock.is_locked());
CephContext *cct = m_image_ctx.cct;
+
ldout(m_image_ctx.cct, 5) << this << " send_remove_objects: "
<< " delete_start=" << m_delete_start
<< " num_objects=" << m_num_objects << dendl;
@@ -150,15 +147,17 @@ void AsyncTrimRequest::send_remove_objects() {
boost::lambda::bind(boost::lambda::new_ptr<AsyncTrimObjectContext>(),
boost::lambda::_1, &m_image_ctx, boost::lambda::_2));
AsyncObjectThrottle *throttle = new AsyncObjectThrottle(
- *this, context_factory, ctx, m_prog_ctx, m_delete_start, m_num_objects);
+ this, m_image_ctx, context_factory, ctx, m_prog_ctx, m_delete_start,
+ m_num_objects);
throttle->start_ops(cct->_conf->rbd_concurrent_management_ops);
}
void AsyncTrimRequest::send_pre_remove() {
+ assert(m_image_ctx.owner_lock.is_locked());
+
bool remove_objects = false;
- bool lost_exclusive_lock = false;
{
- RWLock::RLocker l(m_image_ctx.owner_lock);
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
if (!m_image_ctx.object_map.enabled()) {
remove_objects = true;
} else {
@@ -167,18 +166,16 @@ void AsyncTrimRequest::send_pre_remove() {
<< " num_objects=" << m_num_objects << dendl;
m_state = STATE_PRE_REMOVE;
- if (!m_image_ctx.image_watcher->is_lock_owner()) {
- ldout(m_image_ctx.cct, 1) << "lost exclusive lock during trim" << dendl;
- lost_exclusive_lock = true;
- } else {
- // flag the objects as pending deletion
- Context *ctx = create_callback_context();
- if (!m_image_ctx.object_map.aio_update(m_delete_start, m_num_objects,
- OBJECT_PENDING, OBJECT_EXISTS,
- ctx)) {
- delete ctx;
- remove_objects = true;
- }
+ assert(m_image_ctx.image_watcher->is_lock_owner());
+
+ // flag the objects as pending deletion
+ Context *ctx = create_callback_context();
+ RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
+ if (!m_image_ctx.object_map.aio_update(m_delete_start, m_num_objects,
+ OBJECT_PENDING, OBJECT_EXISTS,
+ ctx)) {
+ delete ctx;
+ remove_objects = true;
}
}
}
@@ -187,16 +184,15 @@ void AsyncTrimRequest::send_pre_remove() {
if (remove_objects) {
// no object map update required
send_remove_objects();
- } else if (lost_exclusive_lock) {
- complete(-ERESTART);
}
}
-bool AsyncTrimRequest::send_post_remove() {
+void AsyncTrimRequest::send_post_remove() {
+ assert(m_image_ctx.owner_lock.is_locked());
+
bool clean_boundary = false;
- bool lost_exclusive_lock = false;
{
- RWLock::RLocker l(m_image_ctx.owner_lock);
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
if (!m_image_ctx.object_map.enabled()) {
clean_boundary = true;
} else {
@@ -205,17 +201,16 @@ bool AsyncTrimRequest::send_post_remove() {
<< " num_objects=" << m_num_objects << dendl;
m_state = STATE_POST_REMOVE;
- if (!m_image_ctx.image_watcher->is_lock_owner()) {
- ldout(m_image_ctx.cct, 1) << "lost exclusive lock during trim" << dendl;
- } else {
- // flag the pending objects as removed
- Context *ctx = create_callback_context();
- if (!m_image_ctx.object_map.aio_update(m_delete_start, m_num_objects,
- OBJECT_NONEXISTENT,
- OBJECT_PENDING, ctx)) {
- delete ctx;
- clean_boundary = true;
- }
+ assert(m_image_ctx.image_watcher->is_lock_owner());
+
+ // flag the pending objects as removed
+ Context *ctx = create_callback_context();
+ RWLock::WLocker object_map_locker(m_image_ctx.object_map_lock);
+ if (!m_image_ctx.object_map.aio_update(m_delete_start, m_num_objects,
+ OBJECT_NONEXISTENT,
+ OBJECT_PENDING, ctx)) {
+ delete ctx;
+ clean_boundary = true;
}
}
}
@@ -223,85 +218,61 @@ bool AsyncTrimRequest::send_post_remove() {
// avoid possible recursive lock attempts
if (clean_boundary) {
// no object map update required
- return send_clean_boundary();
- } else if (lost_exclusive_lock) {
- complete(-ERESTART);
+ send_clean_boundary();
}
- return false;
}
-bool AsyncTrimRequest::send_clean_boundary() {
+void AsyncTrimRequest::send_clean_boundary() {
+ assert(m_image_ctx.owner_lock.is_locked());
CephContext *cct = m_image_ctx.cct;
if (m_delete_off <= m_new_size) {
- return true;
+ finish();
+ return;
}
- bool lost_exclusive_lock = false;
- ContextCompletion *completion = NULL;
+ // should have been canceled prior to releasing lock
+ assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+ m_image_ctx.image_watcher->is_lock_owner());
+ ldout(m_image_ctx.cct, 5) << this << " send_clean_boundary: "
+ << " delete_start=" << m_delete_start
+ << " num_objects=" << m_num_objects << dendl;
+ m_state = STATE_CLEAN_BOUNDARY;
+
+ ::SnapContext snapc;
{
- ldout(m_image_ctx.cct, 5) << this << " send_clean_boundary: "
- << " delete_start=" << m_delete_start
- << " num_objects=" << m_num_objects << dendl;
- m_state = STATE_CLEAN_BOUNDARY;
-
- RWLock::RLocker l(m_image_ctx.owner_lock);
- if (m_image_ctx.image_watcher->is_lock_supported() &&
- !m_image_ctx.image_watcher->is_lock_owner()) {
- ldout(m_image_ctx.cct, 1) << "lost exclusive lock during trim" << dendl;
- lost_exclusive_lock = true;
- } else {
- ::SnapContext snapc;
- uint64_t parent_overlap;
- {
- RWLock::RLocker l2(m_image_ctx.snap_lock);
- snapc = m_image_ctx.snapc;
-
- RWLock::RLocker l3(m_image_ctx.parent_lock);
- int r = m_image_ctx.get_parent_overlap(CEPH_NOSNAP, &parent_overlap);
- assert(r == 0);
- }
+ RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+ snapc = m_image_ctx.snapc;
+ }
- // discard the weird boundary, if any
- vector<ObjectExtent> extents;
- Striper::file_to_extents(cct, m_image_ctx.format_string,
- &m_image_ctx.layout, m_new_size,
- m_delete_off - m_new_size, 0, extents);
-
- completion = new ContextCompletion(create_callback_context(), true);
- for (vector<ObjectExtent>::iterator p = extents.begin();
- p != extents.end(); ++p) {
- ldout(cct, 20) << " ex " << *p << dendl;
- Context *req_comp = new C_ContextCompletion(*completion);
-
- // reverse map this object extent onto the parent
- vector<pair<uint64_t,uint64_t> > objectx;
- Striper::extent_to_file(cct, &m_image_ctx.layout, p->objectno, 0,
- m_image_ctx.layout.fl_object_size, objectx);
- uint64_t object_overlap =
- m_image_ctx.prune_parent_extents(objectx, parent_overlap);
-
- AbstractWrite *req;
- if (p->offset == 0) {
- req = new AioRemove(&m_image_ctx, p->oid.name, p->objectno, objectx,
- object_overlap, snapc, CEPH_NOSNAP, req_comp);
- } else {
- req = new AioTruncate(&m_image_ctx, p->oid.name, p->objectno, p->offset,
- objectx, object_overlap, snapc, CEPH_NOSNAP,
- req_comp);
- }
- req->send();
- }
+ // discard the weird boundary
+ std::vector<ObjectExtent> extents;
+ Striper::file_to_extents(cct, m_image_ctx.format_string,
+ &m_image_ctx.layout, m_new_size,
+ m_delete_off - m_new_size, 0, extents);
+
+ ContextCompletion *completion =
+ new ContextCompletion(create_callback_context(), true);
+ for (vector<ObjectExtent>::iterator p = extents.begin();
+ p != extents.end(); ++p) {
+ ldout(cct, 20) << " ex " << *p << dendl;
+ Context *req_comp = new C_ContextCompletion(*completion);
+
+ AbstractWrite *req;
+ if (p->offset == 0) {
+ req = new AioRemove(&m_image_ctx, p->oid.name, p->objectno, snapc,
+ req_comp);
+ } else {
+ req = new AioTruncate(&m_image_ctx, p->oid.name, p->objectno,
+ p->offset, snapc, req_comp);
}
-
+ req->send();
}
+ completion->finish_adding_requests();
+}
- // avoid possible recursive lock attempts
- if (lost_exclusive_lock) {
- complete(-ERESTART);
- } else if (completion != NULL) {
- completion->finish_adding_requests();
- }
- return false;
+void AsyncTrimRequest::finish() {
+ m_state = STATE_FINISHED;
+ async_complete(0);
}
} // namespace librbd
diff --git a/src/librbd/AsyncTrimRequest.h b/src/librbd/AsyncTrimRequest.h
index 7a89a11..d4d6af9 100644
--- a/src/librbd/AsyncTrimRequest.h
+++ b/src/librbd/AsyncTrimRequest.h
@@ -68,8 +68,9 @@ private:
void send_remove_objects();
void send_pre_remove();
- bool send_post_remove();
- bool send_clean_boundary();
+ void send_post_remove();
+ void send_clean_boundary();
+ void finish();
};
} // namespace librbd
diff --git a/src/librbd/CopyupRequest.cc b/src/librbd/CopyupRequest.cc
index 3d780c6..1535cde 100644
--- a/src/librbd/CopyupRequest.cc
+++ b/src/librbd/CopyupRequest.cc
@@ -35,20 +35,12 @@ namespace librbd {
m_async_op.finish_op();
}
- ceph::bufferlist& CopyupRequest::get_copyup_data() {
- return m_copyup_data;
- }
-
void CopyupRequest::append_request(AioRequest *req) {
ldout(m_ictx->cct, 20) << __func__ << " " << this << ": " << req << dendl;
m_pending_requests.push_back(req);
}
- bool CopyupRequest::complete_requests(int r) {
- if (m_pending_requests.empty()) {
- return false;
- }
-
+ void CopyupRequest::complete_requests(int r) {
while (!m_pending_requests.empty()) {
vector<AioRequest *>::iterator it = m_pending_requests.begin();
AioRequest *req = *it;
@@ -57,13 +49,9 @@ namespace librbd {
req->complete(r);
m_pending_requests.erase(it);
}
- return true;
}
- void CopyupRequest::send_copyup() {
- ldout(m_ictx->cct, 20) << __func__ << " " << this
- << ": oid " << m_oid << dendl;
-
+ bool CopyupRequest::send_copyup() {
m_ictx->snap_lock.get_read();
::SnapContext snapc = m_ictx->snapc;
m_ictx->snap_lock.put_read();
@@ -72,12 +60,33 @@ namespace librbd {
snaps.insert(snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
librados::ObjectWriteOperation copyup_op;
- copyup_op.exec("rbd", "copyup", m_copyup_data);
+ if (!m_copyup_data.is_zero()) {
+ copyup_op.exec("rbd", "copyup", m_copyup_data);
+ }
+
+ // merge all pending write ops into this single RADOS op
+ for (size_t i=0; i<m_pending_requests.size(); ++i) {
+ AioRequest *req = m_pending_requests[i];
+ ldout(m_ictx->cct, 20) << __func__ << " add_copyup_ops " << req << dendl;
+ req->add_copyup_ops(©up_op);
+ }
+
+ if (copyup_op.size() == 0) {
+ return true;
+ }
+
+ ldout(m_ictx->cct, 20) << __func__ << " " << this
+ << ": oid " << m_oid << dendl;
+ m_state = STATE_COPYUP;
librados::AioCompletion *comp =
- librados::Rados::aio_create_completion(NULL, NULL, NULL);
- m_ictx->md_ctx.aio_operate(m_oid, comp, ©up_op, snapc.seq.val, snaps);
+ librados::Rados::aio_create_completion(create_callback_context(), NULL,
+ rados_ctx_cb);
+ int r = m_ictx->md_ctx.aio_operate(m_oid, comp, ©up_op, snapc.seq.val,
+ snaps);
+ assert(r == 0);
comp->release();
+ return false;
}
void CopyupRequest::send()
@@ -116,7 +125,7 @@ namespace librbd {
bool CopyupRequest::should_complete(int r)
{
CephContext *cct = m_ictx->cct;
- ldout(cct, 20) << __func__ << " "
+ ldout(cct, 20) << __func__ << " " << this
<< ": oid " << m_oid
<< ", extents " << m_image_extents
<< ", r " << r << dendl;
@@ -125,22 +134,23 @@ namespace librbd {
case STATE_READ_FROM_PARENT:
ldout(cct, 20) << "READ_FROM_PARENT" << dendl;
remove_from_list();
- if (complete_requests(r)) {
- // pending write operation: it will handle object map / copyup
- return true;
- } else if (r < 0) {
- // nothing to copyup
- return true;
- } else if (send_object_map()) {
- return true;
+ if (r >= 0) {
+ return send_object_map();
+ } else if (r == -ENOENT) {
+ return send_copyup();
}
break;
case STATE_OBJECT_MAP:
ldout(cct, 20) << "OBJECT_MAP" << dendl;
if (r == 0) {
- send_copyup();
+ return send_copyup();
}
+ break;
+
+ case STATE_COPYUP:
+ ldout(cct, 20) << "COPYUP" << dendl;
+ complete_requests(r);
return true;
default:
@@ -148,6 +158,11 @@ namespace librbd {
assert(false);
break;
}
+
+ if (r < 0) {
+ complete_requests(r);
+ return true;
+ }
return false;
}
@@ -162,36 +177,40 @@ namespace librbd {
}
bool CopyupRequest::send_object_map() {
- ldout(m_ictx->cct, 20) << __func__ << " " << this
- << ": oid " << m_oid
- << ", extents " << m_image_extents
- << dendl;
-
- bool copyup = false;
+ bool copyup = true;
{
- RWLock::RLocker l(m_ictx->owner_lock);
- if (!m_ictx->object_map.enabled()) {
- copyup = true;
- } else if (!m_ictx->image_watcher->is_lock_owner()) {
- ldout(m_ictx->cct, 20) << "exclusive lock not held for copy-on-read"
- << dendl;
- return true;
- } else {
- m_state = STATE_OBJECT_MAP;
- Context *ctx = create_callback_context();
- if (!m_ictx->object_map.aio_update(m_object_no, OBJECT_EXISTS,
- boost::optional<uint8_t>(), ctx)) {
- delete ctx;
- copyup = true;
- }
+ RWLock::RLocker owner_locker(m_ictx->owner_lock);
+ RWLock::RLocker snap_locker(m_ictx->snap_lock);
+ if (m_ictx->object_map.enabled()) {
+ if (!m_ictx->image_watcher->is_lock_owner()) {
+ ldout(m_ictx->cct, 20) << "exclusive lock not held for copyup request"
+ << dendl;
+ assert(m_pending_requests.empty());
+ return true;
+ }
+
+ RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
+ if (m_ictx->object_map[m_object_no] != OBJECT_EXISTS) {
+ ldout(m_ictx->cct, 20) << __func__ << " " << this
+ << ": oid " << m_oid
+ << ", extents " << m_image_extents
+ << dendl;
+ m_state = STATE_OBJECT_MAP;
+
+ Context *ctx = create_callback_context();
+ bool sent = m_ictx->object_map.aio_update(m_object_no, OBJECT_EXISTS,
+ boost::optional<uint8_t>(),
+ ctx);
+ assert(sent);
+ copyup = false;
+ }
}
}
// avoid possible recursive lock attempts
if (copyup) {
// no object map update required
- send_copyup();
- return true;
+ return send_copyup();
}
return false;
}
diff --git a/src/librbd/CopyupRequest.h b/src/librbd/CopyupRequest.h
index 92714c2..f8d2e6b 100644
--- a/src/librbd/CopyupRequest.h
+++ b/src/librbd/CopyupRequest.h
@@ -20,7 +20,6 @@ namespace librbd {
vector<pair<uint64_t,uint64_t> >& image_extents);
~CopyupRequest();
- ceph::bufferlist& get_copyup_data();
void append_request(AioRequest *req);
void send();
@@ -34,17 +33,24 @@ namespace librbd {
* <start>
* |
* v
- * STATE_READ_FROM_PARENT ---> STATE_OBJECT_MAP
- * . |
- * . . . . . . . . . . . . . |
- * . |
- * v v
- * <finish>
- * The _OBJECT_MAP state is skipped if the object map isn't enabled.
+ * STATE_READ_FROM_PARENT ----> STATE_OBJECT_MAP . . .
+ * . . | .
+ * . . v .
+ * . . . . . > STATE_COPYUP .
+ * . | .
+ * . v .
+ * . . . . . . . . . . . . > <finish> < . . . . . .
+ *
+ * @endverbatim
+ *
+ * The _OBJECT_MAP state is skipped if the object map isn't enabled or if
+ * an object map update isn't required. The _COPYUP state is skipped if
+ * no data was read from the parent *and* there are no additional ops.
*/
enum State {
STATE_READ_FROM_PARENT,
- STATE_OBJECT_MAP
+ STATE_OBJECT_MAP,
+ STATE_COPYUP
};
ImageCtx *m_ictx;
@@ -57,15 +63,15 @@ namespace librbd {
AsyncOperation m_async_op;
- bool complete_requests(int r);
+ void complete_requests(int r);
void complete(int r);
bool should_complete(int r);
void remove_from_list();
- bool send_object_map();
- void send_copyup();
+ bool send_object_map();
+ bool send_copyup();
Context *create_callback_context();
};
diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc
index a68a906..0f5d46a 100644
--- a/src/librbd/ImageCtx.cc
+++ b/src/librbd/ImageCtx.cc
@@ -63,15 +63,15 @@ public:
image_watcher(NULL),
refresh_seq(0),
last_refresh(0),
- owner_lock("librbd::ImageCtx::owner_lock"),
- md_lock("librbd::ImageCtx::md_lock"),
- cache_lock("librbd::ImageCtx::cache_lock"),
- snap_lock("librbd::ImageCtx::snap_lock"),
- parent_lock("librbd::ImageCtx::parent_lock"),
- refresh_lock("librbd::ImageCtx::refresh_lock"),
- object_map_lock("librbd::ImageCtx::object_map_lock"),
- async_ops_lock("librbd::ImageCtx::async_ops_lock"),
- copyup_list_lock("librbd::ImageCtx::copyup_list_lock"),
+ owner_lock(unique_lock_name("librbd::ImageCtx::owner_lock", this)),
+ md_lock(unique_lock_name("librbd::ImageCtx::md_lock", this)),
+ cache_lock(unique_lock_name("librbd::ImageCtx::cache_lock", this)),
+ snap_lock(unique_lock_name("librbd::ImageCtx::snap_lock", this)),
+ parent_lock(unique_lock_name("librbd::ImageCtx::parent_lock", this)),
+ refresh_lock(unique_lock_name("librbd::ImageCtx::refresh_lock", this)),
+ object_map_lock(unique_lock_name("librbd::ImageCtx::object_map_lock", this)),
+ async_ops_lock(unique_lock_name("librbd::ImageCtx::async_ops_lock", this)),
+ copyup_list_lock(unique_lock_name("librbd::ImageCtx::copyup_list_lock", this)),
extra_read_flags(0),
old_format(true),
order(0), size(0), features(0),
@@ -81,7 +81,7 @@ public:
object_cacher(NULL), writeback_handler(NULL), object_set(NULL),
readahead(),
total_bytes_read(0), copyup_finisher(NULL),
- object_map(*this), aio_work_queue(NULL)
+ object_map(*this), aio_work_queue(NULL), op_work_queue(NULL)
{
md_ctx.dup(p);
data_ctx.dup(p);
@@ -138,6 +138,9 @@ public:
aio_work_queue = new ContextWQ("librbd::aio_work_queue",
cct->_conf->rbd_op_thread_timeout,
thread_pool_singleton);
+ op_work_queue = new ContextWQ("librbd::op_work_queue",
+ cct->_conf->rbd_op_thread_timeout,
+ thread_pool_singleton);
}
ImageCtx::~ImageCtx() {
@@ -160,6 +163,7 @@ public:
}
delete[] format_string;
+ delete op_work_queue;
delete aio_work_queue;
}
@@ -628,25 +632,10 @@ public:
wr->extents.push_back(extent);
{
Mutex::Locker l(cache_lock);
- object_cacher->writex(wr, object_set, cache_lock, onfinish);
+ object_cacher->writex(wr, object_set, onfinish);
}
}
- int ImageCtx::read_from_cache(object_t o, uint64_t object_no, bufferlist *bl,
- size_t len, uint64_t off) {
- int r;
- Mutex mylock("librbd::ImageCtx::read_from_cache");
- Cond cond;
- bool done;
- Context *onfinish = new C_SafeCond(&mylock, &cond, &done, &r);
- aio_read_from_cache(o, object_no, bl, len, off, onfinish, 0);
- mylock.Lock();
- while (!done)
- cond.Wait(mylock);
- mylock.Unlock();
- return r;
- }
-
void ImageCtx::user_flushed() {
if (object_cacher && cct->_conf->rbd_cache_writethrough_until_flush) {
md_lock.get_read();
@@ -667,6 +656,7 @@ public:
}
void ImageCtx::flush_cache_aio(Context *onfinish) {
+ assert(owner_lock.is_locked());
cache_lock.Lock();
object_cacher->flush_set(object_set, onfinish);
cache_lock.Unlock();
@@ -691,19 +681,33 @@ public:
void ImageCtx::shutdown_cache() {
flush_async_operations();
- invalidate_cache();
+
+ RWLock::RLocker owner_locker(owner_lock);
+ invalidate_cache(true);
object_cacher->stop();
}
- int ImageCtx::invalidate_cache() {
+ int ImageCtx::invalidate_cache(bool purge_on_error) {
+ int result;
C_SaferCond ctx;
invalidate_cache(&ctx);
- return ctx.wait();
+ result = ctx.wait();
+
+ if (result && purge_on_error) {
+ cache_lock.Lock();
+ if (object_cacher != NULL) {
+ lderr(cct) << "invalidate cache met error " << cpp_strerror(result) << " !Purging cache..." << dendl;
+ object_cacher->purge_set(object_set);
+ }
+ cache_lock.Unlock();
+ }
+
+ return result;
}
void ImageCtx::invalidate_cache(Context *on_finish) {
if (object_cacher == NULL) {
- on_finish->complete(0);
+ op_work_queue->queue(on_finish, 0);
return;
}
@@ -732,7 +736,8 @@ public:
<< unclean << " bytes remain" << dendl;
r = -EBUSY;
}
- on_finish->complete(r);
+
+ op_work_queue->queue(on_finish, r);
}
void ImageCtx::clear_nonexistence_cache() {
@@ -800,21 +805,15 @@ public:
}
void ImageCtx::flush_async_operations(Context *on_finish) {
- bool complete = false;
- {
- Mutex::Locker l(async_ops_lock);
- if (async_ops.empty()) {
- complete = true;
- } else {
- ldout(cct, 20) << "flush async operations: " << on_finish << " "
- << "count=" << async_ops.size() << dendl;
- async_ops.front()->add_flush_context(on_finish);
- }
+ Mutex::Locker l(async_ops_lock);
+ if (async_ops.empty()) {
+ op_work_queue->queue(on_finish, 0);
+ return;
}
- if (complete) {
- on_finish->complete(0);
- }
+ ldout(cct, 20) << "flush async operations: " << on_finish << " "
+ << "count=" << async_ops.size() << dendl;
+ async_ops.front()->add_flush_context(on_finish);
}
void ImageCtx::cancel_async_requests() {
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
index 47134e2..238b0ab 100644
--- a/src/librbd/ImageCtx.h
+++ b/src/librbd/ImageCtx.h
@@ -132,6 +132,7 @@ namespace librbd {
xlist<AsyncResizeRequest*> async_resize_reqs;
ContextWQ *aio_work_queue;
+ ContextWQ *op_work_queue;
/**
* Either image_name or image_id must be set.
@@ -190,13 +191,11 @@ namespace librbd {
int fadvise_flags);
void write_to_cache(object_t o, const bufferlist& bl, size_t len,
uint64_t off, Context *onfinish, int fadvise_flags);
- int read_from_cache(object_t o, uint64_t object_no, bufferlist *bl,
- size_t len, uint64_t off);
void user_flushed();
void flush_cache_aio(Context *onfinish);
int flush_cache();
void shutdown_cache();
- int invalidate_cache();
+ int invalidate_cache(bool purge_on_error=false);
void invalidate_cache(Context *on_finish);
void invalidate_cache_completion(int r, Context *on_finish);
void clear_nonexistence_cache();
diff --git a/src/librbd/ImageWatcher.cc b/src/librbd/ImageWatcher.cc
index 9962f48..71b4c86 100644
--- a/src/librbd/ImageWatcher.cc
+++ b/src/librbd/ImageWatcher.cc
@@ -3,6 +3,7 @@
#include "librbd/ImageWatcher.h"
#include "librbd/AioCompletion.h"
#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
#include "librbd/ObjectMap.h"
#include "librbd/TaskFinisher.h"
#include "cls/lock/cls_lock_client.h"
@@ -31,14 +32,14 @@ static const double RETRY_DELAY_SECONDS = 1.0;
ImageWatcher::ImageWatcher(ImageCtx &image_ctx)
: m_image_ctx(image_ctx),
- m_watch_lock("librbd::ImageWatcher::m_watch_lock"),
+ m_watch_lock(unique_lock_name("librbd::ImageWatcher::m_watch_lock", this)),
m_watch_ctx(*this), m_watch_handle(0),
m_watch_state(WATCH_STATE_UNREGISTERED),
m_lock_owner_state(LOCK_OWNER_STATE_NOT_LOCKED),
m_task_finisher(new TaskFinisher<Task>(*m_image_ctx.cct)),
- m_async_request_lock("librbd::ImageWatcher::m_async_request_lock"),
- m_aio_request_lock("librbd::ImageWatcher::m_aio_request_lock"),
- m_owner_client_id_lock("librbd::ImageWatcher::m_owner_client_id_lock")
+ m_async_request_lock(unique_lock_name("librbd::ImageWatcher::m_async_request_lock", this)),
+ m_aio_request_lock(unique_lock_name("librbd::ImageWatcher::m_aio_request_lock", this)),
+ m_owner_client_id_lock(unique_lock_name("librbd::ImageWatcher::m_owner_client_id_lock", this))
{
}
@@ -63,9 +64,7 @@ bool ImageWatcher::is_lock_supported() const {
bool ImageWatcher::is_lock_supported(const RWLock &) const {
assert(m_image_ctx.owner_lock.is_locked());
assert(m_image_ctx.snap_lock.is_locked());
- uint64_t snap_features;
- m_image_ctx.get_features(m_image_ctx.snap_id, &snap_features);
- return ((snap_features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0 &&
+ return ((m_image_ctx.features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0 &&
!m_image_ctx.read_only && m_image_ctx.snap_id == CEPH_NOSNAP);
}
@@ -76,7 +75,7 @@ bool ImageWatcher::is_lock_owner() const {
}
int ImageWatcher::register_watch() {
- ldout(m_image_ctx.cct, 10) << "registering image watcher" << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " registering image watcher" << dendl;
RWLock::WLocker l(m_watch_lock);
assert(m_watch_state == WATCH_STATE_UNREGISTERED);
@@ -92,7 +91,7 @@ int ImageWatcher::register_watch() {
}
int ImageWatcher::unregister_watch() {
- ldout(m_image_ctx.cct, 10) << "unregistering image watcher" << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " unregistering image watcher" << dendl;
{
Mutex::Locker l(m_aio_request_lock);
@@ -153,26 +152,27 @@ int ImageWatcher::try_lock() {
iter->addr, sizeof(iter->addr)) == 0) &&
(locker_handle == iter->cookie)) {
Mutex::Locker l(m_owner_client_id_lock);
- m_owner_client_id = ClientId(iter->watcher_id, locker_handle);
+ set_owner_client_id(ClientId(iter->watcher_id, locker_handle));
return 0;
}
}
md_config_t *conf = m_image_ctx.cct->_conf;
if (conf->rbd_blacklist_on_break_lock) {
- ldout(m_image_ctx.cct, 1) << "blacklisting client: " << locker << "@"
- << locker_address << dendl;
+ ldout(m_image_ctx.cct, 1) << this << " blacklisting client: " << locker
+ << "@" << locker_address << dendl;
librados::Rados rados(m_image_ctx.md_ctx);
r = rados.blacklist_add(locker_address,
conf->rbd_blacklist_expire_seconds);
if (r < 0) {
- lderr(m_image_ctx.cct) << "unable to blacklist client: "
+ lderr(m_image_ctx.cct) << this << " unable to blacklist client: "
<< cpp_strerror(r) << dendl;
return r;
}
}
- ldout(m_image_ctx.cct, 5) << "breaking exclusive lock: " << locker << dendl;
+ ldout(m_image_ctx.cct, 5) << this << " breaking exclusive lock: " << locker
+ << dendl;
r = rados::cls::lock::break_lock(&m_image_ctx.md_ctx,
m_image_ctx.header_oid, RBD_LOCK_NAME,
locker_cookie, locker);
@@ -191,7 +191,7 @@ void ImageWatcher::request_lock(
{
Mutex::Locker l(m_aio_request_lock);
bool request_pending = !m_aio_requests.empty();
- ldout(m_image_ctx.cct, 15) << "queuing aio request: " << c
+ ldout(m_image_ctx.cct, 15) << this << " queuing aio request: " << c
<< dendl;
c->get();
@@ -203,7 +203,7 @@ void ImageWatcher::request_lock(
RWLock::RLocker l(m_watch_lock);
if (m_watch_state == WATCH_STATE_REGISTERED) {
- ldout(m_image_ctx.cct, 10) << "requesting exclusive lock" << dendl;
+ ldout(m_image_ctx.cct, 15) << this << " requesting exclusive lock" << dendl;
// run notify request in finisher to avoid blocking aio path
FunctionContext *ctx = new FunctionContext(
@@ -229,17 +229,18 @@ bool ImageWatcher::try_request_lock() {
m_image_ctx.owner_lock.get_read();
if (r < 0) {
- ldout(m_image_ctx.cct, 5) << "failed to acquire exclusive lock:"
+ ldout(m_image_ctx.cct, 5) << this << " failed to acquire exclusive lock:"
<< cpp_strerror(r) << dendl;
return false;
}
if (is_lock_owner()) {
- ldout(m_image_ctx.cct, 15) << "successfully acquired exclusive lock"
+ ldout(m_image_ctx.cct, 15) << this << " successfully acquired exclusive lock"
<< dendl;
} else {
- ldout(m_image_ctx.cct, 15) << "unable to acquire exclusive lock, retrying"
- << dendl;
+ ldout(m_image_ctx.cct, 15) << this
+ << " unable to acquire exclusive lock, retrying"
+ << dendl;
}
return is_lock_owner();
}
@@ -259,34 +260,34 @@ int ImageWatcher::get_lock_owner_info(entity_name_t *locker, std::string *cookie
}
if (lockers.empty()) {
- ldout(m_image_ctx.cct, 20) << "no lockers detected" << dendl;
+ ldout(m_image_ctx.cct, 20) << this << " no lockers detected" << dendl;
return 0;
}
if (lock_tag != WATCHER_LOCK_TAG) {
- ldout(m_image_ctx.cct, 5) << "locked by external mechanism: tag="
+ ldout(m_image_ctx.cct, 5) << this << " locked by external mechanism: tag="
<< lock_tag << dendl;
return -EBUSY;
}
if (lock_type == LOCK_SHARED) {
- ldout(m_image_ctx.cct, 5) << "shared lock type detected" << dendl;
+ ldout(m_image_ctx.cct, 5) << this << " shared lock type detected" << dendl;
return -EBUSY;
}
std::map<rados::cls::lock::locker_id_t,
rados::cls::lock::locker_info_t>::iterator iter = lockers.begin();
if (!decode_lock_cookie(iter->first.cookie, handle)) {
- ldout(m_image_ctx.cct, 5) << "locked by external mechanism: cookie="
- << iter->first.cookie << dendl;
+ ldout(m_image_ctx.cct, 5) << this << " locked by external mechanism: "
+ << "cookie=" << iter->first.cookie << dendl;
return -EBUSY;
}
*locker = iter->first.locker;
*cookie = iter->first.cookie;
*address = stringify(iter->second.addr);
- ldout(m_image_ctx.cct, 10) << "retrieved exclusive locker: " << *locker
- << "@" << *address << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " retrieved exclusive locker: "
+ << *locker << "@" << *address << dendl;
return 0;
}
@@ -299,12 +300,13 @@ int ImageWatcher::lock() {
return r;
}
- ldout(m_image_ctx.cct, 10) << "acquired exclusive lock" << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " acquired exclusive lock" << dendl;
m_lock_owner_state = LOCK_OWNER_STATE_LOCKED;
+ ClientId owner_client_id = get_client_id();
{
Mutex::Locker l(m_owner_client_id_lock);
- m_owner_client_id = get_client_id();
+ set_owner_client_id(owner_client_id);
}
if (m_image_ctx.object_map.enabled()) {
@@ -349,12 +351,12 @@ int ImageWatcher::unlock()
return 0;
}
- ldout(m_image_ctx.cct, 10) << "releasing exclusive lock" << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " releasing exclusive lock" << dendl;
m_lock_owner_state = LOCK_OWNER_STATE_NOT_LOCKED;
int r = rados::cls::lock::unlock(&m_image_ctx.md_ctx, m_image_ctx.header_oid,
RBD_LOCK_NAME, encode_lock_cookie());
if (r < 0 && r != -ENOENT) {
- lderr(m_image_ctx.cct) << "failed to release exclusive lock: "
+ lderr(m_image_ctx.cct) << this << " failed to release exclusive lock: "
<< cpp_strerror(r) << dendl;
return r;
}
@@ -363,6 +365,9 @@ int ImageWatcher::unlock()
m_image_ctx.object_map.unlock();
}
+ Mutex::Locker l(m_owner_client_id_lock);
+ set_owner_client_id(ClientId());
+
FunctionContext *ctx = new FunctionContext(
boost::bind(&ImageWatcher::notify_released_lock, this));
m_task_finisher->queue(TASK_CODE_RELEASED_LOCK, ctx);
@@ -372,25 +377,27 @@ int ImageWatcher::unlock()
bool ImageWatcher::release_lock()
{
assert(m_image_ctx.owner_lock.is_wlocked());
- ldout(m_image_ctx.cct, 10) << "releasing exclusive lock by request" << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " releasing exclusive lock by request"
+ << dendl;
if (!is_lock_owner()) {
return false;
}
prepare_unlock();
-
m_image_ctx.owner_lock.put_write();
m_image_ctx.cancel_async_requests();
- m_image_ctx.owner_lock.get_write();
-
- if (!is_lock_owner()) {
- return false;
- }
+ m_image_ctx.flush_async_operations();
{
- RWLock::WLocker l2(m_image_ctx.md_lock);
+ RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+ RWLock::WLocker md_locker(m_image_ctx.md_lock);
librbd::_flush(&m_image_ctx);
}
+ m_image_ctx.owner_lock.get_write();
+ if (!is_lock_owner()) {
+ return false;
+ }
+
unlock();
return true;
}
@@ -410,7 +417,7 @@ void ImageWatcher::schedule_async_progress(const AsyncRequestId &request,
int ImageWatcher::notify_async_progress(const AsyncRequestId &request,
uint64_t offset, uint64_t total) {
- ldout(m_image_ctx.cct, 20) << "remote async request progress: "
+ ldout(m_image_ctx.cct, 20) << this << " remote async request progress: "
<< request << " @ " << offset
<< "/" << total << dendl;
@@ -430,7 +437,7 @@ void ImageWatcher::schedule_async_complete(const AsyncRequestId &request,
int ImageWatcher::notify_async_complete(const AsyncRequestId &request,
int r) {
- ldout(m_image_ctx.cct, 20) << "remote async request finished: "
+ ldout(m_image_ctx.cct, 20) << this << " remote async request finished: "
<< request << " = " << r << dendl;
bufferlist bl;
@@ -441,7 +448,7 @@ int ImageWatcher::notify_async_complete(const AsyncRequestId &request,
int ret = m_image_ctx.md_ctx.notify2(m_image_ctx.header_oid, bl,
NOTIFY_TIMEOUT, NULL);
if (ret < 0) {
- lderr(m_image_ctx.cct) << "failed to notify async complete: "
+ lderr(m_image_ctx.cct) << this << " failed to notify async complete: "
<< cpp_strerror(ret) << dendl;
if (ret == -ETIMEDOUT) {
schedule_async_complete(request, r);
@@ -516,6 +523,7 @@ bool ImageWatcher::decode_lock_cookie(const std::string &tag,
}
void ImageWatcher::schedule_retry_aio_requests(bool use_timer) {
+ m_task_finisher->cancel(TASK_CODE_REQUEST_LOCK);
Context *ctx = new FunctionContext(boost::bind(
&ImageWatcher::retry_aio_requests, this));
if (use_timer) {
@@ -534,11 +542,12 @@ void ImageWatcher::retry_aio_requests() {
lock_request_restarts.swap(m_aio_requests);
}
- ldout(m_image_ctx.cct, 15) << "retrying pending aio requests" << dendl;
+ ldout(m_image_ctx.cct, 15) << this << " retrying pending aio requests"
+ << dendl;
for (std::vector<AioRequest>::iterator iter = lock_request_restarts.begin();
iter != lock_request_restarts.end(); ++iter) {
- ldout(m_image_ctx.cct, 20) << "retrying aio request: " << iter->second
- << dendl;
+ ldout(m_image_ctx.cct, 20) << this << " retrying aio request: "
+ << iter->second << dendl;
iter->first(iter->second);
iter->second->put();
}
@@ -560,6 +569,13 @@ void ImageWatcher::cancel_async_requests() {
m_async_requests.clear();
}
+void ImageWatcher::set_owner_client_id(const WatchNotify::ClientId& client_id) {
+ assert(m_owner_client_id_lock.is_locked());
+ m_owner_client_id = client_id;
+ ldout(m_image_ctx.cct, 10) << this << " current lock owner: "
+ << m_owner_client_id << dendl;
+}
+
ClientId ImageWatcher::get_client_id() {
RWLock::RLocker l(m_watch_lock);
return ClientId(m_image_ctx.md_ctx.get_instance_id(), m_watch_handle);
@@ -571,14 +587,14 @@ void ImageWatcher::notify_release_lock() {
}
void ImageWatcher::notify_released_lock() {
- ldout(m_image_ctx.cct, 10) << "notify released lock" << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " notify released lock" << dendl;
bufferlist bl;
::encode(NotifyMessage(ReleasedLockPayload(get_client_id())), bl);
m_image_ctx.md_ctx.notify2(m_image_ctx.header_oid, bl, NOTIFY_TIMEOUT, NULL);
}
void ImageWatcher::notify_request_lock() {
- ldout(m_image_ctx.cct, 10) << "notify request lock" << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " notify request lock" << dendl;
m_task_finisher->cancel(TASK_CODE_RETRY_AIO_REQUESTS);
m_image_ctx.owner_lock.get_read();
@@ -595,12 +611,20 @@ void ImageWatcher::notify_request_lock() {
m_image_ctx.owner_lock.put_read();
if (r == -ETIMEDOUT) {
- ldout(m_image_ctx.cct, 5) << "timed out requesting lock: retrying" << dendl;
+ ldout(m_image_ctx.cct, 5) << this << "timed out requesting lock: retrying"
+ << dendl;
retry_aio_requests();
} else if (r < 0) {
- lderr(m_image_ctx.cct) << "error requesting lock: " << cpp_strerror(r)
- << dendl;
+ lderr(m_image_ctx.cct) << this << " error requesting lock: "
+ << cpp_strerror(r) << dendl;
schedule_retry_aio_requests(true);
+ } else {
+ // lock owner acked -- but resend if we don't see them release the lock
+ int retry_timeout = m_image_ctx.cct->_conf->client_notify_timeout;
+ FunctionContext *ctx = new FunctionContext(
+ boost::bind(&ImageWatcher::notify_request_lock, this));
+ m_task_finisher->add_event_after(TASK_CODE_REQUEST_LOCK,
+ retry_timeout, ctx);
}
}
@@ -615,7 +639,7 @@ int ImageWatcher::notify_lock_owner(bufferlist &bl) {
&response_bl);
m_image_ctx.owner_lock.get_read();
if (r < 0 && r != -ETIMEDOUT) {
- lderr(m_image_ctx.cct) << "lock owner notification failed: "
+ lderr(m_image_ctx.cct) << this << " lock owner notification failed: "
<< cpp_strerror(r) << dendl;
return r;
}
@@ -627,7 +651,7 @@ int ImageWatcher::notify_lock_owner(bufferlist &bl) {
bufferlist::iterator iter = response_bl.begin();
::decode(responses, iter);
} catch (const buffer::error &err) {
- lderr(m_image_ctx.cct) << "failed to decode response" << dendl;
+ lderr(m_image_ctx.cct) << this << " failed to decode response" << dendl;
return -EINVAL;
}
}
@@ -637,7 +661,8 @@ int ImageWatcher::notify_lock_owner(bufferlist &bl) {
for (responses_t::iterator i = responses.begin(); i != responses.end(); ++i) {
if (i->second.length() > 0) {
if (lock_owner_responded) {
- lderr(m_image_ctx.cct) << "duplicate lock owners detected" << dendl;
+ lderr(m_image_ctx.cct) << this << " duplicate lock owners detected"
+ << dendl;
return -EIO;
}
lock_owner_responded = true;
@@ -646,7 +671,7 @@ int ImageWatcher::notify_lock_owner(bufferlist &bl) {
}
if (!lock_owner_responded) {
- lderr(m_image_ctx.cct) << "no lock owners detected" << dendl;
+ lderr(m_image_ctx.cct) << this << " no lock owners detected" << dendl;
return -ETIMEDOUT;
}
@@ -680,7 +705,7 @@ void ImageWatcher::async_request_timed_out(const AsyncRequestId &id) {
std::map<AsyncRequestId, AsyncRequest>::iterator it =
m_async_requests.find(id);
if (it != m_async_requests.end()) {
- ldout(m_image_ctx.cct, 10) << "request timed-out: " << id << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " request timed-out: " << id << dendl;
it->second.first->complete(-ERESTART);
}
}
@@ -690,7 +715,8 @@ int ImageWatcher::notify_async_request(const AsyncRequestId &async_request_id,
ProgressContext& prog_ctx) {
assert(m_image_ctx.owner_lock.is_locked());
- ldout(m_image_ctx.cct, 10) << "async request: " << async_request_id << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " async request: " << async_request_id
+ << dendl;
C_SaferCond ctx;
@@ -717,7 +743,7 @@ int ImageWatcher::notify_async_request(const AsyncRequestId &async_request_id,
void ImageWatcher::handle_payload(const HeaderUpdatePayload &payload,
bufferlist *out) {
- ldout(m_image_ctx.cct, 10) << "image header updated" << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " image header updated" << dendl;
Mutex::Locker lictx(m_image_ctx.refresh_lock);
++m_image_ctx.refresh_seq;
@@ -726,14 +752,15 @@ void ImageWatcher::handle_payload(const HeaderUpdatePayload &payload,
void ImageWatcher::handle_payload(const AcquiredLockPayload &payload,
bufferlist *out) {
- ldout(m_image_ctx.cct, 10) << "image exclusively locked announcement" << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " image exclusively locked announcement"
+ << dendl;
if (payload.client_id.is_valid()) {
Mutex::Locker l(m_owner_client_id_lock);
if (payload.client_id == m_owner_client_id) {
// we already know that the remote client is the owner
return;
}
- m_owner_client_id = payload.client_id;
+ set_owner_client_id(payload.client_id);
}
RWLock::RLocker l(m_image_ctx.owner_lock);
@@ -745,13 +772,16 @@ void ImageWatcher::handle_payload(const AcquiredLockPayload &payload,
void ImageWatcher::handle_payload(const ReleasedLockPayload &payload,
bufferlist *out) {
- ldout(m_image_ctx.cct, 10) << "exclusive lock released" << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " exclusive lock released" << dendl;
if (payload.client_id.is_valid()) {
Mutex::Locker l(m_owner_client_id_lock);
if (payload.client_id != m_owner_client_id) {
+ ldout(m_image_ctx.cct, 10) << this << " unexpected owner: "
+ << payload.client_id << " != "
+ << m_owner_client_id << dendl;
return;
}
- m_owner_client_id = ClientId();
+ set_owner_client_id(ClientId());
}
RWLock::RLocker l(m_image_ctx.owner_lock);
@@ -763,7 +793,7 @@ void ImageWatcher::handle_payload(const ReleasedLockPayload &payload,
void ImageWatcher::handle_payload(const RequestLockPayload &payload,
bufferlist *out) {
- ldout(m_image_ctx.cct, 10) << "exclusive lock requested" << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " exclusive lock requested" << dendl;
if (payload.client_id == get_client_id()) {
return;
}
@@ -778,10 +808,10 @@ void ImageWatcher::handle_payload(const RequestLockPayload &payload,
if (!m_owner_client_id.is_valid()) {
return;
}
- m_owner_client_id = ClientId();
}
- ldout(m_image_ctx.cct, 10) << "queuing release of exclusive lock" << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " queuing release of exclusive lock"
+ << dendl;
FunctionContext *ctx = new FunctionContext(
boost::bind(&ImageWatcher::notify_release_lock, this));
m_task_finisher->queue(TASK_CODE_RELEASING_LOCK, ctx);
@@ -794,7 +824,7 @@ void ImageWatcher::handle_payload(const AsyncProgressPayload &payload,
std::map<AsyncRequestId, AsyncRequest>::iterator req_it =
m_async_requests.find(payload.async_request_id);
if (req_it != m_async_requests.end()) {
- ldout(m_image_ctx.cct, 20) << "request progress: "
+ ldout(m_image_ctx.cct, 20) << this << " request progress: "
<< payload.async_request_id << " @ "
<< payload.offset << "/" << payload.total
<< dendl;
@@ -809,7 +839,7 @@ void ImageWatcher::handle_payload(const AsyncCompletePayload &payload,
std::map<AsyncRequestId, AsyncRequest>::iterator req_it =
m_async_requests.find(payload.async_request_id);
if (req_it != m_async_requests.end()) {
- ldout(m_image_ctx.cct, 10) << "request finished: "
+ ldout(m_image_ctx.cct, 10) << this << " request finished: "
<< payload.async_request_id << "="
<< payload.result << dendl;
req_it->second.first->complete(payload.result);
@@ -839,12 +869,12 @@ void ImageWatcher::handle_payload(const FlattenPayload &payload,
RemoteContext *ctx = new RemoteContext(*this, payload.async_request_id,
prog_ctx);
- ldout(m_image_ctx.cct, 10) << "remote flatten request: "
+ ldout(m_image_ctx.cct, 10) << this << " remote flatten request: "
<< payload.async_request_id << dendl;
r = librbd::async_flatten(&m_image_ctx, ctx, *prog_ctx);
if (r < 0) {
delete ctx;
- lderr(m_image_ctx.cct) << "remove flatten request failed: "
+ lderr(m_image_ctx.cct) << this << " remove flatten request failed: "
<< cpp_strerror(r) << dendl;
RWLock::WLocker l(m_async_request_lock);
@@ -878,12 +908,12 @@ void ImageWatcher::handle_payload(const ResizePayload &payload,
RemoteContext *ctx = new RemoteContext(*this, payload.async_request_id,
prog_ctx);
- ldout(m_image_ctx.cct, 10) << "remote resize request: "
+ ldout(m_image_ctx.cct, 10) << this << " remote resize request: "
<< payload.async_request_id << " "
<< payload.size << dendl;
r = librbd::async_resize(&m_image_ctx, ctx, payload.size, *prog_ctx);
if (r < 0) {
- lderr(m_image_ctx.cct) << "remove resize request failed: "
+ lderr(m_image_ctx.cct) << this << " remove resize request failed: "
<< cpp_strerror(r) << dendl;
delete ctx;
@@ -900,7 +930,7 @@ void ImageWatcher::handle_payload(const SnapCreatePayload &payload,
bufferlist *out) {
RWLock::RLocker l(m_image_ctx.owner_lock);
if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
- ldout(m_image_ctx.cct, 10) << "remote snap_create request: "
+ ldout(m_image_ctx.cct, 10) << this << " remote snap_create request: "
<< payload.snap_name << dendl;
int r = librbd::snap_create_helper(&m_image_ctx, NULL,
payload.snap_name.c_str());
@@ -928,23 +958,23 @@ void ImageWatcher::handle_notify(uint64_t notify_id, uint64_t handle,
bufferlist::iterator iter = bl.begin();
::decode(notify_message, iter);
} catch (const buffer::error &err) {
- lderr(m_image_ctx.cct) << "error decoding image notification: "
+ lderr(m_image_ctx.cct) << this << " error decoding image notification: "
<< err.what() << dendl;
return;
}
}
apply_visitor(HandlePayloadVisitor(this, notify_id, handle),
- notify_message.payload);
+ notify_message.payload);
}
void ImageWatcher::handle_error(uint64_t handle, int err) {
- lderr(m_image_ctx.cct) << "image watch failed: " << handle << ", "
+ lderr(m_image_ctx.cct) << this << " image watch failed: " << handle << ", "
<< cpp_strerror(err) << dendl;
{
Mutex::Locker l(m_owner_client_id_lock);
- m_owner_client_id = ClientId();
+ set_owner_client_id(ClientId());
}
RWLock::WLocker l(m_watch_lock);
@@ -964,7 +994,7 @@ void ImageWatcher::acknowledge_notify(uint64_t notify_id, uint64_t handle,
}
void ImageWatcher::reregister_watch() {
- ldout(m_image_ctx.cct, 10) << "re-registering image watch" << dendl;
+ ldout(m_image_ctx.cct, 10) << this << " re-registering image watch" << dendl;
{
RWLock::WLocker l(m_image_ctx.owner_lock);
@@ -984,7 +1014,7 @@ void ImageWatcher::reregister_watch() {
r = m_image_ctx.md_ctx.watch2(m_image_ctx.header_oid,
&m_watch_handle, &m_watch_ctx);
if (r < 0) {
- lderr(m_image_ctx.cct) << "failed to re-register image watch: "
+ lderr(m_image_ctx.cct) << this << " failed to re-register image watch: "
<< cpp_strerror(r) << dendl;
if (r != -ESHUTDOWN) {
FunctionContext *ctx = new FunctionContext(boost::bind(
@@ -1002,10 +1032,11 @@ void ImageWatcher::reregister_watch() {
if (was_lock_owner) {
r = try_lock();
if (r == -EBUSY) {
- ldout(m_image_ctx.cct, 5) << "lost image lock while re-registering "
- << "image watch" << dendl;
+ ldout(m_image_ctx.cct, 5) << this << "lost image lock while "
+ << "re-registering image watch" << dendl;
} else if (r < 0) {
- lderr(m_image_ctx.cct) << "failed to lock image while re-registering "
+ lderr(m_image_ctx.cct) << this
+ << "failed to lock image while re-registering "
<< "image watch" << cpp_strerror(r) << dendl;
}
}
diff --git a/src/librbd/ImageWatcher.h b/src/librbd/ImageWatcher.h
index 19155ae..760a698 100644
--- a/src/librbd/ImageWatcher.h
+++ b/src/librbd/ImageWatcher.h
@@ -219,6 +219,7 @@ namespace librbd {
void schedule_cancel_async_requests();
void cancel_async_requests();
+ void set_owner_client_id(const WatchNotify::ClientId &client_id);
WatchNotify::ClientId get_client_id();
void notify_release_lock();
diff --git a/src/librbd/LibrbdWriteback.cc b/src/librbd/LibrbdWriteback.cc
index 694f2c7..058760e 100644
--- a/src/librbd/LibrbdWriteback.cc
+++ b/src/librbd/LibrbdWriteback.cc
@@ -157,34 +157,27 @@ namespace librbd {
uint64_t trunc_size, __u32 trunc_seq,
Context *oncommit)
{
- m_ictx->snap_lock.get_read();
- librados::snap_t snap_id = m_ictx->snap_id;
- m_ictx->parent_lock.get_read();
- uint64_t overlap = 0;
- m_ictx->get_parent_overlap(snap_id, &overlap);
- m_ictx->parent_lock.put_read();
- m_ictx->snap_lock.put_read();
-
+ assert(m_ictx->owner_lock.is_locked());
uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix);
- // reverse map this object extent onto the parent
- vector<pair<uint64_t,uint64_t> > objectx;
- Striper::extent_to_file(m_ictx->cct, &m_ictx->layout,
- object_no, 0, m_ictx->layout.fl_object_size,
- objectx);
- uint64_t object_overlap = m_ictx->prune_parent_extents(objectx, overlap);
write_result_d *result = new write_result_d(oid.name, oncommit);
m_writes[oid.name].push(result);
ldout(m_ictx->cct, 20) << "write will wait for result " << result << dendl;
C_OrderedWrite *req_comp = new C_OrderedWrite(m_ictx->cct, result, this);
- AioWrite *req = new AioWrite(m_ictx, oid.name,
- object_no, off, objectx, object_overlap,
- bl, snapc, snap_id,
- req_comp);
+ AioWrite *req = new AioWrite(m_ictx, oid.name, object_no, off, bl, snapc,
+ req_comp);
req->send();
return ++m_tid;
}
+ void LibrbdWriteback::get_client_lock() {
+ m_ictx->owner_lock.get_read();
+ }
+
+ void LibrbdWriteback::put_client_lock() {
+ m_ictx->owner_lock.put_read();
+ }
+
void LibrbdWriteback::complete_writes(const std::string& oid)
{
assert(m_lock.is_locked());
diff --git a/src/librbd/LibrbdWriteback.h b/src/librbd/LibrbdWriteback.h
index 2c71e84..b5578ae 100644
--- a/src/librbd/LibrbdWriteback.h
+++ b/src/librbd/LibrbdWriteback.h
@@ -38,6 +38,9 @@ namespace librbd {
const bufferlist &bl, utime_t mtime, uint64_t trunc_size,
__u32 trunc_seq, Context *oncommit);
+ virtual void get_client_lock();
+ virtual void put_client_lock();
+
struct write_result_d {
bool done;
int ret;
diff --git a/src/librbd/ObjectMap.cc b/src/librbd/ObjectMap.cc
index c2ca798..9e7aae2 100644
--- a/src/librbd/ObjectMap.cc
+++ b/src/librbd/ObjectMap.cc
@@ -33,6 +33,13 @@ std::string ObjectMap::object_map_name(const std::string &image_id,
return oid;
}
+uint8_t ObjectMap::operator[](uint64_t object_no) const
+{
+ assert(m_image_ctx.object_map_lock.is_locked());
+ assert(object_no < m_object_map.size());
+ return m_object_map[object_no];
+}
+
bool ObjectMap::enabled() const
{
RWLock::RLocker l(m_image_ctx.object_map_lock);
@@ -137,8 +144,8 @@ bool ObjectMap::object_may_exist(uint64_t object_no) const
}
assert(object_no < m_object_map.size());
- bool exists = (m_object_map[object_no] == OBJECT_EXISTS ||
- m_object_map[object_no] == OBJECT_PENDING);
+ uint8_t state = (*this)[object_no];
+ bool exists = (state == OBJECT_EXISTS || state == OBJECT_PENDING);
ldout(m_image_ctx.cct, 20) << &m_image_ctx << " object_may_exist: "
<< "object_no=" << object_no << " r=" << exists
<< dendl;
@@ -295,13 +302,12 @@ bool ObjectMap::aio_update(uint64_t start_object_no, uint64_t end_object_no,
const boost::optional<uint8_t> ¤t_state,
Context *on_finish)
{
- assert(m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP));
+ assert(m_image_ctx.snap_lock.is_locked());
+ assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0);
assert(m_image_ctx.owner_lock.is_locked());
assert(m_image_ctx.image_watcher != NULL);
assert(m_image_ctx.image_watcher->is_lock_owner());
- assert(start_object_no < end_object_no);
-
- RWLock::WLocker l(m_image_ctx.object_map_lock);
+ assert(m_image_ctx.object_map_lock.is_wlocked());
assert(start_object_no < end_object_no);
CephContext *cct = m_image_ctx.cct;
diff --git a/src/librbd/ObjectMap.h b/src/librbd/ObjectMap.h
index 44709b8..4104636 100644
--- a/src/librbd/ObjectMap.h
+++ b/src/librbd/ObjectMap.h
@@ -24,6 +24,8 @@ public:
static std::string object_map_name(const std::string &image_id,
uint64_t snap_id);
+ uint8_t operator[](uint64_t object_no) const;
+
int lock();
int unlock();
diff --git a/src/librbd/WatchNotifyTypes.cc b/src/librbd/WatchNotifyTypes.cc
index 1f02ac1..e7dde46 100644
--- a/src/librbd/WatchNotifyTypes.cc
+++ b/src/librbd/WatchNotifyTypes.cc
@@ -380,6 +380,12 @@ std::ostream &operator<<(std::ostream &out,
}
std::ostream &operator<<(std::ostream &out,
+ const librbd::WatchNotify::ClientId &client_id) {
+ out << "[" << client_id.gid << "," << client_id.handle << "]";
+ return out;
+}
+
+std::ostream &operator<<(std::ostream &out,
const librbd::WatchNotify::AsyncRequestId &request) {
out << "[" << request.client_id.gid << "," << request.client_id.handle << ","
<< request.request_id << "]";
diff --git a/src/librbd/WatchNotifyTypes.h b/src/librbd/WatchNotifyTypes.h
index 2b3c34b..270f25d 100644
--- a/src/librbd/WatchNotifyTypes.h
+++ b/src/librbd/WatchNotifyTypes.h
@@ -234,6 +234,8 @@ struct ResponseMessage {
std::ostream &operator<<(std::ostream &out,
const librbd::WatchNotify::NotifyOp &op);
std::ostream &operator<<(std::ostream &out,
+ const librbd::WatchNotify::ClientId &client);
+std::ostream &operator<<(std::ostream &out,
const librbd::WatchNotify::AsyncRequestId &request);
WRITE_CLASS_ENCODER(librbd::WatchNotify::ClientId);
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index 15ea416..447e9d0 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -152,6 +152,10 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
return image_name + RBD_SUFFIX;
}
+ std::string unique_lock_name(const std::string &name, void *address) {
+ return name + " (" + stringify(address) + ")";
+ }
+
int detect_format(IoCtx &io_ctx, const string &name,
bool *old_format, uint64_t *size)
{
@@ -567,7 +571,7 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
ldout(ictx->cct, 20) << "snap_create_helper " << ictx << " " << snap_name
<< dendl;
- int r = ictx_check(ictx);
+ int r = ictx_check(ictx, true);
if (r < 0) {
return r;
}
@@ -622,7 +626,8 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
if (r < 0)
return r;
- RWLock::RLocker l(ictx->md_lock);
+ RWLock::RLocker owner_locker(ictx->owner_lock);
+ RWLock::RLocker md_locker(ictx->md_lock);
snap_t snap_id;
{
@@ -1206,10 +1211,10 @@ reprotect_and_return_err:
goto err_close_child;
}
- p_imctx->md_lock.get_write();
- r = ictx_refresh(p_imctx);
- p_imctx->md_lock.put_write();
-
+ {
+ RWLock::RLocker owner_locker(p_imctx->owner_lock);
+ r = ictx_refresh(p_imctx);
+ }
if (r == 0) {
p_imctx->snap_lock.get_read();
r = p_imctx->is_snap_protected(p_imctx->snap_id, &snap_protected);
@@ -1621,9 +1626,7 @@ reprotect_and_return_err:
return -EBUSY;
}
- ictx->md_lock.get_read();
trim_image(ictx, 0, prog_ctx);
- ictx->md_lock.put_read();
ictx->parent_lock.get_read();
// struct assignment
@@ -1733,7 +1736,7 @@ reprotect_and_return_err:
<< size << dendl;
ictx->snap_lock.put_read();
- int r = ictx_check(ictx);
+ int r = ictx_check(ictx, true);
if (r < 0) {
return r;
}
@@ -1872,7 +1875,7 @@ reprotect_and_return_err:
return 0;
}
- int ictx_check(ImageCtx *ictx)
+ int ictx_check(ImageCtx *ictx, bool owner_locked)
{
CephContext *cct = ictx->cct;
ldout(cct, 20) << "ictx_check " << ictx << dendl;
@@ -1882,9 +1885,13 @@ reprotect_and_return_err:
ictx->refresh_lock.Unlock();
if (needs_refresh) {
- RWLock::WLocker l(ictx->md_lock);
-
- int r = ictx_refresh(ictx);
+ int r;
+ if (owner_locked) {
+ r = ictx_refresh(ictx);
+ } else {
+ RWLock::RLocker owner_lock(ictx->owner_lock);
+ r = ictx_refresh(ictx);
+ }
if (r < 0) {
lderr(cct) << "Error re-reading rbd header: " << cpp_strerror(-r)
<< dendl;
@@ -1932,6 +1939,9 @@ reprotect_and_return_err:
int ictx_refresh(ImageCtx *ictx)
{
+ assert(ictx->owner_lock.is_locked());
+ RWLock::WLocker md_locker(ictx->md_lock);
+
CephContext *cct = ictx->cct;
bufferlist bl, bl2;
@@ -2129,14 +2139,13 @@ reprotect_and_return_err:
if (r < 0)
return r;
- RWLock::RLocker l(ictx->owner_lock);
+ RWLock::RLocker owner_locker(ictx->owner_lock);
snap_t snap_id;
uint64_t new_size;
{
- RWLock::WLocker l2(ictx->md_lock);
{
// need to drop snap_lock before invalidating cache
- RWLock::RLocker l3(ictx->snap_lock);
+ RWLock::RLocker snap_locker(ictx->snap_lock);
if (!ictx->snap_exists) {
return -ENOENT;
}
@@ -2168,6 +2177,7 @@ reprotect_and_return_err:
// need to flush any pending writes before resizing and rolling back -
// writes might create new snapshots. Rolling back will replace
// the current version, so we have to invalidate that too.
+ RWLock::WLocker md_locker(ictx->md_lock);
ictx->flush_async_operations();
r = ictx->invalidate_cache();
if (r < 0) {
@@ -2388,7 +2398,8 @@ reprotect_and_return_err:
if (ictx->object_cacher) {
// complete pending writes before we're set to a snapshot and
// get -EROFS for writes
- RWLock::WLocker l(ictx->md_lock);
+ RWLock::RLocker owner_locker(ictx->owner_lock);
+ RWLock::WLocker md_locker(ictx->md_lock);
ictx->flush_cache();
}
int r = _snap_set(ictx, snap_name);
@@ -2433,9 +2444,10 @@ reprotect_and_return_err:
}
}
- ictx->md_lock.get_write();
- r = ictx_refresh(ictx);
- ictx->md_lock.put_write();
+ {
+ RWLock::RLocker owner_locker(ictx->owner_lock);
+ r = ictx_refresh(ictx);
+ }
if (r < 0)
goto err_close;
@@ -2462,15 +2474,18 @@ reprotect_and_return_err:
}
ictx->aio_work_queue->drain();
-
ictx->cancel_async_requests();
+ ictx->flush_async_operations();
ictx->readahead.wait_for_pending();
+
if (ictx->object_cacher) {
ictx->shutdown_cache(); // implicitly flushes
} else {
flush(ictx);
}
+ ictx->op_work_queue->drain();
+
if (ictx->copyup_finisher != NULL) {
ictx->copyup_finisher->wait_for_empty();
ictx->copyup_finisher->stop();
@@ -2549,7 +2564,7 @@ reprotect_and_return_err:
int r;
// ictx_check also updates parent data
- if ((r = ictx_check(ictx)) < 0) {
+ if ((r = ictx_check(ictx, true)) < 0) {
lderr(cct) << "ictx_check failed" << dendl;
return r;
}
@@ -2830,7 +2845,10 @@ reprotect_and_return_err:
<< " len = " << len << dendl;
// ensure previous writes are visible to listsnaps
- _flush(ictx);
+ {
+ RWLock::RLocker owner_locker(ictx->owner_lock);
+ _flush(ictx);
+ }
int r = ictx_check(ictx);
if (r < 0)
@@ -3206,6 +3224,7 @@ reprotect_and_return_err:
return;
}
+ RWLock::RLocker owner_locker(ictx->owner_lock);
ictx->user_flushed();
C_AioWrite *flush_ctx = new C_AioWrite(cct, c);
@@ -3239,13 +3258,17 @@ reprotect_and_return_err:
}
ictx->user_flushed();
- r = _flush(ictx);
+ {
+ RWLock::RLocker owner_locker(ictx->owner_lock);
+ r = _flush(ictx);
+ }
ictx->perfcounter->inc(l_librbd_flush);
return r;
}
int _flush(ImageCtx *ictx)
{
+ assert(ictx->owner_lock.is_locked());
CephContext *cct = ictx->cct;
int r;
// flush any outstanding writes
@@ -3274,7 +3297,8 @@ reprotect_and_return_err:
ictx->flush_async_operations();
- RWLock::WLocker l(ictx->md_lock);
+ RWLock::RLocker owner_locker(ictx->owner_lock);
+ RWLock::WLocker md_locker(ictx->md_lock);
r = ictx->invalidate_cache();
return r;
}
@@ -3297,31 +3321,23 @@ reprotect_and_return_err:
RWLock::RLocker md_locker(ictx->md_lock);
uint64_t clip_len = len;
- snapid_t snap_id;
::SnapContext snapc;
- uint64_t overlap = 0;
{
// prevent image size from changing between computing clip and recording
// pending async operation
RWLock::RLocker snap_locker(ictx->snap_lock);
+ if (ictx->snap_id != CEPH_NOSNAP || ictx->read_only) {
+ c->fail(cct, -EROFS);
+ return;
+ }
+
r = clip_io(ictx, off, &clip_len);
if (r < 0) {
c->fail(cct, r);
return;
}
- snap_id = ictx->snap_id;
snapc = ictx->snapc;
- ictx->parent_lock.get_read();
- ictx->get_parent_overlap(ictx->snap_id, &overlap);
- ictx->parent_lock.put_read();
-
- if (snap_id != CEPH_NOSNAP || ictx->read_only) {
- c->fail(cct, -EROFS);
- return;
- }
-
- ldout(cct, 20) << " parent overlap " << overlap << dendl;
c->init_time(ictx, AIO_TYPE_WRITE);
}
@@ -3357,16 +3373,8 @@ reprotect_and_return_err:
c->add_request();
ictx->write_to_cache(p->oid, bl, p->length, p->offset, req_comp, op_flags);
} else {
- // reverse map this object extent onto the parent
- vector<pair<uint64_t,uint64_t> > objectx;
- Striper::extent_to_file(ictx->cct, &ictx->layout,
- p->objectno, 0, ictx->layout.fl_object_size,
- objectx);
- uint64_t object_overlap = ictx->prune_parent_extents(objectx, overlap);
-
AioWrite *req = new AioWrite(ictx, p->oid.name, p->objectno, p->offset,
- objectx, object_overlap,
- bl, snapc, snap_id, req_comp);
+ bl, snapc, req_comp);
c->add_request();
req->set_op_flags(op_flags);
@@ -3398,13 +3406,16 @@ reprotect_and_return_err:
RWLock::RLocker md_locker(ictx->md_lock);
uint64_t clip_len = len;
- snapid_t snap_id;
::SnapContext snapc;
- uint64_t overlap;
{
// prevent image size from changing between computing clip and recording
// pending async operation
RWLock::RLocker snap_locker(ictx->snap_lock);
+ if (ictx->snap_id != CEPH_NOSNAP || ictx->read_only) {
+ c->fail(cct, -EROFS);
+ return;
+ }
+
r = clip_io(ictx, off, &clip_len);
if (r < 0) {
c->fail(cct, r);
@@ -3412,16 +3423,7 @@ reprotect_and_return_err:
}
// TODO: check for snap
- snap_id = ictx->snap_id;
snapc = ictx->snapc;
- ictx->parent_lock.get_read();
- ictx->get_parent_overlap(ictx->snap_id, &overlap);
- ictx->parent_lock.put_read();
-
- if (snap_id != CEPH_NOSNAP || ictx->read_only) {
- c->fail(cct, -EROFS);
- return;
- }
c->init_time(ictx, AIO_TYPE_DISCARD);
}
@@ -3448,26 +3450,14 @@ reprotect_and_return_err:
AbstractWrite *req;
c->add_request();
- // reverse map this object extent onto the parent
- vector<pair<uint64_t,uint64_t> > objectx;
- uint64_t object_overlap = 0;
- if (off < overlap) { // we might overlap...
- Striper::extent_to_file(ictx->cct, &ictx->layout,
- p->objectno, 0, ictx->layout.fl_object_size,
- objectx);
- object_overlap = ictx->prune_parent_extents(objectx, overlap);
- }
-
if (p->offset == 0 && p->length == ictx->layout.fl_object_size) {
- req = new AioRemove(ictx, p->oid.name, p->objectno, objectx, object_overlap,
- snapc, snap_id, req_comp);
+ req = new AioRemove(ictx, p->oid.name, p->objectno, snapc, req_comp);
} else if (p->offset + p->length == ictx->layout.fl_object_size) {
- req = new AioTruncate(ictx, p->oid.name, p->objectno, p->offset, objectx, object_overlap,
- snapc, snap_id, req_comp);
+ req = new AioTruncate(ictx, p->oid.name, p->objectno, p->offset, snapc,
+ req_comp);
} else {
req = new AioZero(ictx, p->oid.name, p->objectno, p->offset, p->length,
- objectx, object_overlap,
- snapc, snap_id, req_comp);
+ snapc, req_comp);
}
req->send();
@@ -3581,7 +3571,6 @@ reprotect_and_return_err:
}
snap_t snap_id;
- ::SnapContext snapc;
map<object_t,vector<ObjectExtent> > object_extents;
uint64_t buffer_ofs = 0;
{
@@ -3589,7 +3578,6 @@ reprotect_and_return_err:
// pending async operation
RWLock::RLocker snap_locker(ictx->snap_lock);
snap_id = ictx->snap_id;
- snapc = ictx->snapc;
// map
for (vector<pair<uint64_t,uint64_t> >::const_iterator p =
@@ -3617,21 +3605,23 @@ reprotect_and_return_err:
c->read_buf_len = buffer_ofs;
c->read_bl = pbl;
- for (map<object_t,vector<ObjectExtent> >::iterator p = object_extents.begin(); p != object_extents.end(); ++p) {
- for (vector<ObjectExtent>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
- ldout(ictx->cct, 20) << " oid " << q->oid << " " << q->offset << "~" << q->length
- << " from " << q->buffer_extents << dendl;
+ for (map<object_t,vector<ObjectExtent> >::iterator p = object_extents.begin();
+ p != object_extents.end(); ++p) {
+ for (vector<ObjectExtent>::iterator q = p->second.begin();
+ q != p->second.end(); ++q) {
+ ldout(ictx->cct, 20) << " oid " << q->oid << " " << q->offset << "~"
+ << q->length << " from " << q->buffer_extents
+ << dendl;
C_AioRead *req_comp = new C_AioRead(ictx->cct, c);
- AioRead *req = new AioRead(ictx, q->oid.name,
- q->objectno, q->offset, q->length,
- q->buffer_extents, snapc,
- snap_id, true, req_comp, op_flags);
+ AioRead *req = new AioRead(ictx, q->oid.name, q->objectno, q->offset,
+ q->length, q->buffer_extents, snap_id, true,
+ req_comp, op_flags);
req_comp->set_req(req);
c->add_request();
if (ictx->object_cacher) {
- C_CacheRead *cache_comp = new C_CacheRead(req);
+ C_CacheRead *cache_comp = new C_CacheRead(ictx, req);
ictx->aio_read_from_cache(q->oid, q->objectno, &req->data(),
q->length, q->offset,
cache_comp, op_flags);
diff --git a/src/librbd/internal.h b/src/librbd/internal.h
index 419f929..a633c9d 100644
--- a/src/librbd/internal.h
+++ b/src/librbd/internal.h
@@ -76,6 +76,7 @@ namespace librbd {
const std::string id_obj_name(const std::string &name);
const std::string header_name(const std::string &image_id);
const std::string old_header_name(const std::string &image_name);
+ std::string unique_lock_name(const std::string &name, void *address);
int detect_format(librados::IoCtx &io_ctx, const std::string &name,
bool *old_format, uint64_t *size);
@@ -123,7 +124,7 @@ namespace librbd {
int add_snap(ImageCtx *ictx, const char *snap_name);
int rm_snap(ImageCtx *ictx, const char *snap_name);
int refresh_parent(ImageCtx *ictx);
- int ictx_check(ImageCtx *ictx);
+ int ictx_check(ImageCtx *ictx, bool owner_locked=false);
int ictx_refresh(ImageCtx *ictx);
int copy(ImageCtx *ictx, IoCtx& dest_md_ctx, const char *destname,
ProgressContext &prog_ctx);
diff --git a/src/log/Log.cc b/src/log/Log.cc
index ce97494..3dc6c63 100644
--- a/src/log/Log.cc
+++ b/src/log/Log.cc
@@ -106,7 +106,11 @@ void Log::set_max_new(int n)
void Log::set_max_recent(int n)
{
+ pthread_mutex_lock(&m_flush_mutex);
+ m_flush_mutex_holder = pthread_self();
m_max_recent = n;
+ m_flush_mutex_holder = 0;
+ pthread_mutex_unlock(&m_flush_mutex);
}
void Log::set_log_file(string fn)
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 83098f0..ba2aecf 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -788,7 +788,7 @@ void Monitor::refresh_from_paxos(bool *need_bootstrap)
paxos_service[i]->refresh(need_bootstrap);
}
for (int i = 0; i < PAXOS_NUM; ++i) {
- paxos_service[i]->post_paxos_update();
+ paxos_service[i]->post_refresh();
}
}
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 730702e..cdbb6c7 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -70,6 +70,12 @@ static ostream& _prefix(std::ostream *_dout, Monitor *mon, OSDMap& osdmap) {
<< ").osd e" << osdmap.get_epoch() << " ";
}
+OSDMonitor::OSDMonitor(Monitor *mn, Paxos *p, string service_name)
+ : PaxosService(mn, p, service_name),
+ inc_osd_cache(g_conf->mon_osd_cache_size),
+ full_osd_cache(g_conf->mon_osd_cache_size),
+ thrash_map(0), thrash_last_up_osd(-1) { }
+
bool OSDMonitor::_have_pending_crush()
{
return pending_inc.crush.length();
@@ -1153,13 +1159,13 @@ bool OSDMonitor::preprocess_get_osdmap(MMonGetOSDMap *m)
epoch_t last = osdmap.get_epoch();
int max = g_conf->osd_map_message_max;
for (epoch_t e = MAX(first, m->get_full_first());
- e < MIN(last, m->get_full_last()) && max > 0;
+ e <= MIN(last, m->get_full_last()) && max > 0;
++e, --max) {
int r = get_version_full(e, reply->maps[e]);
assert(r >= 0);
}
for (epoch_t e = MAX(first, m->get_inc_first());
- e < MIN(last, m->get_inc_last()) && max > 0;
+ e <= MIN(last, m->get_inc_last()) && max > 0;
++e, --max) {
int r = get_version(e, reply->incremental_maps[e]);
assert(r >= 0);
@@ -1614,7 +1620,27 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
<< " doesn't announce support -- ignore" << dendl;
goto ignore;
}
-
+
+ // make sure upgrades stop at hammer
+ // * OSD_PROXY_FEATURES is the last pre-hammer feature
+ // * MON_METADATA is the first post-hammer feature
+ if (osdmap.get_num_up_osds() > 0) {
+ if ((m->osd_features & CEPH_FEATURE_MON_METADATA) &&
+ !(osdmap.get_up_osd_features() & CEPH_FEATURE_OSD_PROXY_FEATURES)) {
+ mon->clog->info() << "disallowing boot of post-hammer OSD "
+ << m->get_orig_source_inst()
+ << " because one or more up OSDs is pre-hammer\n";
+ goto ignore;
+ }
+ if (!(m->osd_features & CEPH_FEATURE_OSD_PROXY_FEATURES) &&
+ (osdmap.get_up_osd_features() & CEPH_FEATURE_MON_METADATA)) {
+ mon->clog->info() << "disallowing boot of pre-hammer OSD "
+ << m->get_orig_source_inst()
+ << " because all up OSDs are post-hammer\n";
+ goto ignore;
+ }
+ }
+
// already booted?
if (osdmap.is_up(from) &&
osdmap.get_inst(from) == m->get_orig_source_inst()) {
@@ -2226,6 +2252,29 @@ void OSDMonitor::send_incremental(epoch_t first, MonSession *session,
}
}
+int OSDMonitor::get_version(version_t ver, bufferlist& bl)
+{
+ if (inc_osd_cache.lookup(ver, &bl)) {
+ return 0;
+ }
+ int ret = PaxosService::get_version(ver, bl);
+ if (!ret) {
+ inc_osd_cache.add(ver, bl);
+ }
+ return ret;
+}
+
+int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
+{
+ if (full_osd_cache.lookup(ver, &bl)) {
+ return 0;
+ }
+ int ret = PaxosService::get_version_full(ver, bl);
+ if (!ret) {
+ full_osd_cache.add(ver, bl);
+ }
+ return ret;
+}
@@ -2872,8 +2921,8 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
<< " pool '" << poolstr << "' (" << pool << ")"
<< " object '" << fullobjname << "' ->"
<< " pg " << pgid << " (" << mpgid << ")"
- << " -> up (" << up << ", p" << up_p << ") acting ("
- << acting << ", p" << acting_p << ")";
+ << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
+ << pg_vector_string(acting) << ", p" << acting_p << ")";
rdata.append(ds);
}
} else if ((prefix == "osd scrub" ||
@@ -3562,7 +3611,7 @@ void OSDMonitor::get_pools_health(
} else if (warn_threshold > 0 &&
sum.num_bytes >= pool.quota_max_bytes*warn_threshold) {
ss << "pool '" << pool_name
- << "' has " << si_t(sum.num_bytes) << " objects"
+ << "' has " << si_t(sum.num_bytes) << " bytes"
<< " (max " << si_t(pool.quota_max_bytes) << ")";
status = HEALTH_WARN;
}
@@ -3874,6 +3923,7 @@ int OSDMonitor::prepare_pool_crush_ruleset(const unsigned pool_type,
int *crush_ruleset,
stringstream &ss)
{
+
if (*crush_ruleset < 0) {
switch (pool_type) {
case pg_pool_t::TYPE_REPLICATED:
@@ -3985,6 +4035,15 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
crush_ruleset_name, &crush_ruleset, ss);
if (r)
return r;
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ CrushTester tester(newcrush, ss);
+ r = tester.test_with_crushtool(g_conf->crushtool.c_str(),
+ osdmap.get_max_osd(),
+ g_conf->mon_lease,
+ crush_ruleset);
+ if (r)
+ return r;
unsigned size, min_size;
r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
if (r)
@@ -4542,7 +4601,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
ss << "(note: crushtool tests not run because they took too long) ";
} else {
derr << "error on crush map: " << ess.str() << dendl;
- ss << "Failed to parse crushmap: " << ess.str();
+ ss << "Failed crushmap test: " << ess.str();
err = r;
goto reply;
}
@@ -5631,7 +5690,7 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
}
if (osdmap.exists(id)) {
pending_inc.new_weight[id] = ww;
- ss << "reweighted osd." << id << " to " << w << " (" << ios::hex << ww << ios::dec << ")";
+ ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
getline(ss, rs);
wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
get_last_committed() + 1));
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index afeacde..414bf08 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -26,6 +26,7 @@
using namespace std;
#include "include/types.h"
+#include "common/simple_cache.hpp"
#include "msg/Messenger.h"
#include "osd/OSDMap.h"
@@ -139,6 +140,8 @@ private:
* optimization to try to avoid sending the same inc maps twice.
*/
map<int,epoch_t> osd_epoch;
+ SimpleLRU<version_t, bufferlist> inc_osd_cache;
+ SimpleLRU<version_t, bufferlist> full_osd_cache;
void note_osd_has_epoch(int osd, epoch_t epoch);
@@ -380,9 +383,7 @@ private:
bool prepare_remove_snaps(struct MRemoveSnaps *m);
public:
- OSDMonitor(Monitor *mn, Paxos *p, string service_name)
- : PaxosService(mn, p, service_name),
- thrash_map(0), thrash_last_up_osd(-1) { }
+ OSDMonitor(Monitor *mn, Paxos *p, string service_name);
void tick(); // check state, take actions
@@ -407,6 +408,9 @@ private:
send_incremental(m, start);
}
+ int get_version(version_t ver, bufferlist& bl);
+ int get_version_full(version_t ver, bufferlist& bl);
+
epoch_t blacklist(const entity_addr_t& a, utime_t until);
void dump_info(Formatter *f);
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index e699efb..07e6305 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -1023,8 +1023,8 @@ bool PGMonitor::register_new_pgs()
++p) {
int64_t poolid = p->first;
pg_pool_t &pool = p->second;
- int ruleno = pool.get_crush_ruleset();
- if (!osdmap->crush->rule_exists(ruleno))
+ int ruleno = osdmap->crush->find_rule(pool.get_crush_ruleset(), pool.get_type(), pool.get_size());
+ if (ruleno < 0 || !osdmap->crush->rule_exists(ruleno))
continue;
if (pool.get_last_change() <= pg_map.last_pg_scan ||
@@ -2113,7 +2113,7 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
((1000000 - p->second.cache_target_full_ratio_micro) *
g_conf->mon_cache_target_full_warn_ratio);
if (p->second.target_max_objects && (uint64_t)st.stats.sum.num_objects >
- p->second.target_max_objects * ratio / 1000000) {
+ p->second.target_max_objects * (ratio / 1000000.0)) {
nearfull = true;
if (detail) {
ostringstream ss;
@@ -2125,7 +2125,7 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
}
}
if (p->second.target_max_bytes && (uint64_t)st.stats.sum.num_bytes >
- p->second.target_max_bytes * ratio / 1000000) {
+ p->second.target_max_bytes * (ratio / 1000000.0)) {
nearfull = true;
if (detail) {
ostringstream ss;
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
index 8d06b0b..4bdffc2 100644
--- a/src/mon/PaxosService.cc
+++ b/src/mon/PaxosService.cc
@@ -128,6 +128,16 @@ void PaxosService::refresh(bool *need_bootstrap)
update_from_paxos(need_bootstrap);
}
+void PaxosService::post_refresh()
+{
+ dout(10) << __func__ << dendl;
+
+ post_paxos_update();
+
+ if (mon->is_peon() && !waiting_for_finished_proposal.empty()) {
+ finish_contexts(g_ceph_context, waiting_for_finished_proposal, -EAGAIN);
+ }
+}
void PaxosService::remove_legacy_versions()
{
diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h
index 7c22592..d2f6285 100644
--- a/src/mon/PaxosService.h
+++ b/src/mon/PaxosService.h
@@ -322,6 +322,7 @@ public:
bool dispatch(PaxosServiceMessage *m);
void refresh(bool *need_bootstrap);
+ void post_refresh();
/**
* @defgroup PaxosService_h_override_funcs Functions that should be
@@ -858,7 +859,7 @@ public:
* @param bl The bufferlist to be populated
* @return 0 on success; <0 otherwise
*/
- int get_version(version_t ver, bufferlist& bl) {
+ virtual int get_version(version_t ver, bufferlist& bl) {
return mon->store->get(get_service_name(), ver, bl);
}
/**
@@ -868,7 +869,7 @@ public:
* @param bl The bufferlist to be populated
* @returns 0 on success; <0 otherwise
*/
- int get_version_full(version_t ver, bufferlist& bl) {
+ virtual int get_version_full(version_t ver, bufferlist& bl) {
string key = mon->store->combine_strings(full_prefix_name, ver);
return mon->store->get(get_service_name(), key, bl);
}
diff --git a/src/msg/simple/Pipe.cc b/src/msg/simple/Pipe.cc
index f5d8a36..ab277e0 100644
--- a/src/msg/simple/Pipe.cc
+++ b/src/msg/simple/Pipe.cc
@@ -1694,10 +1694,8 @@ void Pipe::writer()
<< " policy.server=" << policy.server << dendl;
// standby?
- if (is_queued() && state == STATE_STANDBY && !policy.server) {
- connect_seq++;
+ if (is_queued() && state == STATE_STANDBY && !policy.server)
state = STATE_CONNECTING;
- }
// connect?
if (state == STATE_CONNECTING) {
diff --git a/src/ocf/Makefile.in b/src/ocf/Makefile.in
index 9d49aa0..19267ac 100644
--- a/src/ocf/Makefile.in
+++ b/src/ocf/Makefile.in
@@ -200,6 +200,7 @@ GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
GIT_CHECK = @GIT_CHECK@
GREP = @GREP@
HAVE_CXX11 = @HAVE_CXX11@
+HAVE_VALGRIND = @HAVE_VALGRIND@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
diff --git a/src/os/WBThrottle.cc b/src/os/WBThrottle.cc
index 85bca32..f472a23 100644
--- a/src/os/WBThrottle.cc
+++ b/src/os/WBThrottle.cc
@@ -259,6 +259,7 @@ void WBThrottle::clear_object(const ghobject_t &hoid)
pending_wbs.erase(i);
remove_object(hoid);
+ cond.Signal();
}
void WBThrottle::throttle()
diff --git a/src/os/chain_xattr.cc b/src/os/chain_xattr.cc
index c0e64ea..c08acdb 100644
--- a/src/os/chain_xattr.cc
+++ b/src/os/chain_xattr.cc
@@ -138,6 +138,10 @@ int chain_getxattr(const char *fn, const char *name, void *val, size_t size)
size -= chunk_size;
r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size);
+ if (i && r == -ENODATA) {
+ ret = pos;
+ break;
+ }
if (r < 0) {
ret = r;
break;
@@ -201,6 +205,10 @@ int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
size -= chunk_size;
r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size);
+ if (i && r == -ENODATA) {
+ ret = pos;
+ break;
+ }
if (r < 0) {
ret = r;
break;
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h
index 4290de8..d6e710d 100644
--- a/src/osd/ECBackend.h
+++ b/src/osd/ECBackend.h
@@ -385,7 +385,7 @@ public:
*
* Determines the whether _have is suffient to recover an object
*/
- class ECRecPred : public IsRecoverablePredicate {
+ class ECRecPred : public IsPGRecoverablePredicate {
set<int> want;
ErasureCodeInterfaceRef ec_impl;
public:
@@ -405,7 +405,7 @@ public:
return ec_impl->minimum_to_decode(want, have, &min) == 0;
}
};
- IsRecoverablePredicate *get_is_recoverable_predicate() {
+ IsPGRecoverablePredicate *get_is_recoverable_predicate() {
return new ECRecPred(ec_impl);
}
@@ -414,7 +414,7 @@ public:
*
* Determines the whether _have is suffient to read an object
*/
- class ECReadPred : public IsReadablePredicate {
+ class ECReadPred : public IsPGReadablePredicate {
pg_shard_t whoami;
ECRecPred rec_pred;
public:
@@ -425,7 +425,7 @@ public:
return _have.count(whoami) && rec_pred(_have);
}
};
- IsReadablePredicate *get_is_readable_predicate() {
+ IsPGReadablePredicate *get_is_readable_predicate() {
return new ECReadPred(get_parent()->whoami_shard(), ec_impl);
}
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 7dbcfc5..0c01ba6 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -684,8 +684,8 @@ void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epo
return;
}
const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer);
- Connection *peer_con = osd->cluster_messenger->get_connection(peer_inst).get();
- share_map_peer(peer, peer_con, next_map);
+ ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
+ share_map_peer(peer, peer_con.get(), next_map);
peer_con->send_message(m);
release_map(next_map);
}
@@ -2819,7 +2819,13 @@ void OSD::load_pgs()
dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
bufferlist bl;
- epoch_t map_epoch = PG::peek_map_epoch(store, pgid, &bl);
+ epoch_t map_epoch = 0;
+ int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
+ if (r < 0) {
+ derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
+ << dendl;
+ continue;
+ }
PG *pg = NULL;
if (map_epoch > 0) {
@@ -3012,6 +3018,8 @@ void OSD::build_past_intervals_parallel()
}
assert(last_map);
+ boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
+ pg->get_is_recoverable_predicate());
std::stringstream debug;
bool new_interval = pg_interval_t::check_new_interval(
p.primary,
@@ -3024,6 +3032,7 @@ void OSD::build_past_intervals_parallel()
pg->info.history.last_epoch_clean,
cur_map, last_map,
pgid,
+ recoverable.get(),
&pg->past_intervals,
&debug);
if (new_interval) {
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index f5021ef..a3b636e 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -1244,6 +1244,13 @@ public:
++i) {
clear_session_waiting_on_pg(session, *i);
}
+ /* Messages have connection refs, we need to clear the
+ * connection->session->message->connection
+ * cycles which result.
+ * Bug #12338
+ */
+ session->waiting_on_map.clear();
+ session->waiting_for_pg.clear();
}
void register_session_waiting_on_pg(Session *session, spg_t pgid) {
Mutex::Locker l(session_waiting_lock);
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 1700f6b..a9154d4 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -1283,13 +1283,6 @@ int OSDMap::apply_incremental(const Incremental &inc)
if (inc.new_pool_max != -1)
pool_max = inc.new_pool_max;
- for (set<int64_t>::const_iterator p = inc.old_pools.begin();
- p != inc.old_pools.end();
- ++p) {
- pools.erase(*p);
- name_pool.erase(pool_name[*p]);
- pool_name.erase(*p);
- }
for (map<int64_t,pg_pool_t>::const_iterator p = inc.new_pools.begin();
p != inc.new_pools.end();
++p) {
@@ -1304,6 +1297,13 @@ int OSDMap::apply_incremental(const Incremental &inc)
pool_name[p->first] = p->second;
name_pool[p->second] = p->first;
}
+ for (set<int64_t>::const_iterator p = inc.old_pools.begin();
+ p != inc.old_pools.end();
+ ++p) {
+ pools.erase(*p);
+ name_pool.erase(pool_name[*p]);
+ pool_name.erase(*p);
+ }
for (map<int32_t,uint32_t>::const_iterator i = inc.new_weight.begin();
i != inc.new_weight.end();
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index bfe59b7..7b91bf8 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -694,6 +694,8 @@ void PG::generate_past_intervals()
pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
cur_map->pg_to_up_acting_osds(pgid, &up, &up_primary, &acting, &primary);
+ boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
+ get_is_recoverable_predicate());
std::stringstream debug;
bool new_interval = pg_interval_t::check_new_interval(
old_primary,
@@ -709,6 +711,7 @@ void PG::generate_past_intervals()
cur_map,
last_map,
pgid,
+ recoverable.get(),
&past_intervals,
&debug);
if (new_interval) {
@@ -1336,7 +1339,7 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id)
}
/* Check whether we have enough acting shards to later perform recovery */
- boost::scoped_ptr<PGBackend::IsRecoverablePredicate> recoverable_predicate(
+ boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
get_pgbackend()->get_is_recoverable_predicate());
set<pg_shard_t> have;
for (int i = 0; i < (int)want.size(); ++i) {
@@ -2805,9 +2808,10 @@ bool PG::_has_removal_flag(ObjectStore *store,
return false;
}
-epoch_t PG::peek_map_epoch(ObjectStore *store,
- spg_t pgid,
- bufferlist *bl)
+int PG::peek_map_epoch(ObjectStore *store,
+ spg_t pgid,
+ epoch_t *pepoch,
+ bufferlist *bl)
{
coll_t coll(pgid);
hobject_t legacy_infos_oid(OSD::make_infos_oid());
@@ -2852,7 +2856,8 @@ epoch_t PG::peek_map_epoch(ObjectStore *store,
return 0;
if (struct_v < 6) {
::decode(cur_epoch, bp);
- return cur_epoch;
+ *pepoch = cur_epoch;
+ return 0;
}
// get epoch out of leveldb
@@ -2861,13 +2866,19 @@ epoch_t PG::peek_map_epoch(ObjectStore *store,
values.clear();
keys.insert(ek);
store->omap_get_values(META_COLL, legacy_infos_oid, keys, &values);
- assert(values.size() == 1);
+ if (values.size() < 1) {
+ // see #13060: this suggests we failed to upgrade this pg
+ // because it was a zombie and then removed the legacy infos
+ // object. skip it.
+ return -1;
+ }
bufferlist::iterator p = values[ek].begin();
::decode(cur_epoch, p);
} else {
assert(0 == "unable to open pg metadata");
}
- return cur_epoch;
+ *pepoch = cur_epoch;
+ return 0;
}
#pragma GCC diagnostic pop
@@ -4189,9 +4200,14 @@ void PG::scrub_compare_maps()
maps[*i] = &scrubber.received_maps[*i];
}
+ // can we relate scrub digests to oi digests?
+ bool okseed = (get_min_peer_features() & CEPH_FEATURE_OSD_OBJECT_DIGEST);
+ assert(okseed == (scrubber.seed == 0xffffffff));
+
get_pgbackend()->be_compare_scrubmaps(
maps,
- scrubber.seed == 0xffffffff, // can we relate scrub digests to oi digests?
+ okseed,
+ state_test(PG_STATE_REPAIR),
scrubber.missing,
scrubber.inconsistent,
authoritative,
@@ -4202,7 +4218,7 @@ void PG::scrub_compare_maps()
ss);
dout(2) << ss.str() << dendl;
- if (!authoritative.empty()) {
+ if (!ss.str().empty()) {
osd->clog->error(ss);
}
@@ -4737,6 +4753,8 @@ void PG::start_peering_interval(
info.history.same_interval_since = osdmap->get_epoch();
} else {
std::stringstream debug;
+ boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
+ get_is_recoverable_predicate());
bool new_interval = pg_interval_t::check_new_interval(
old_acting_primary.osd,
new_acting_primary,
@@ -4749,6 +4767,7 @@ void PG::start_peering_interval(
osdmap,
lastmap,
info.pgid.pgid,
+ recoverable.get(),
&past_intervals,
&debug);
dout(10) << __func__ << ": check_new_interval output: "
@@ -7466,7 +7485,7 @@ void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_
#define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
PG::PriorSet::PriorSet(bool ec_pool,
- PGBackend::IsRecoverablePredicate *c,
+ IsPGRecoverablePredicate *c,
const OSDMap &osdmap,
const map<epoch_t, pg_interval_t> &past_intervals,
const vector<int> &up,
diff --git a/src/osd/PG.h b/src/osd/PG.h
index f69d431..41de9d6 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -197,6 +197,10 @@ public:
void update_snap_mapper_bits(uint32_t bits) {
snap_mapper.update_bits(bits);
}
+ /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
+ IsPGRecoverablePredicate *get_is_recoverable_predicate() {
+ return get_pgbackend()->get_is_recoverable_predicate();
+ }
protected:
// Ops waiting for map, should be queued at back
Mutex map_lock;
@@ -315,13 +319,13 @@ public:
PG *pg;
set<pg_shard_t> empty_set;
public:
- boost::scoped_ptr<PGBackend::IsReadablePredicate> is_readable;
- boost::scoped_ptr<PGBackend::IsRecoverablePredicate> is_recoverable;
+ boost::scoped_ptr<IsPGReadablePredicate> is_readable;
+ boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
MissingLoc(PG *pg)
: pg(pg) {}
void set_backend_predicates(
- PGBackend::IsReadablePredicate *_is_readable,
- PGBackend::IsRecoverablePredicate *_is_recoverable) {
+ IsPGReadablePredicate *_is_readable,
+ IsPGRecoverablePredicate *_is_recoverable) {
is_readable.reset(_is_readable);
is_recoverable.reset(_is_recoverable);
}
@@ -492,9 +496,9 @@ public:
map<int, epoch_t> blocked_by; /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
bool pg_down; /// some down osds are included in @a cur; the DOWN pg state bit should be set.
- boost::scoped_ptr<PGBackend::IsRecoverablePredicate> pcontdec;
+ boost::scoped_ptr<IsPGRecoverablePredicate> pcontdec;
PriorSet(bool ec_pool,
- PGBackend::IsRecoverablePredicate *c,
+ IsPGRecoverablePredicate *c,
const OSDMap &osdmap,
const map<epoch_t, pg_interval_t> &past_intervals,
const vector<int> &up,
@@ -549,7 +553,7 @@ public:
on_applied(rctx.on_applied),
on_safe(rctx.on_safe),
transaction(rctx.transaction),
- handle(NULL) {}
+ handle(rctx.handle) {}
void accept_buffered_messages(BufferedRecoveryMessages &m) {
assert(query_map);
@@ -2142,7 +2146,8 @@ public:
__u8 &);
void read_state(ObjectStore *store, bufferlist &bl);
static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
- static epoch_t peek_map_epoch(ObjectStore *store, spg_t pgid, bufferlist *bl);
+ static int peek_map_epoch(ObjectStore *store, spg_t pgid,
+ epoch_t *pepoch, bufferlist *bl);
void update_snap_map(
const vector<pg_log_entry_t> &log_entries,
ObjectStore::Transaction& t);
diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc
index cb856eb..7fc56d1 100644
--- a/src/osd/PGBackend.cc
+++ b/src/osd/PGBackend.cc
@@ -390,9 +390,11 @@ enum scrub_error_type PGBackend::be_compare_scrub_objects(
if (error != CLEAN)
errorstream << ", ";
error = DEEP_ERROR;
+ bool known = okseed && auth_oi.is_data_digest() &&
+ auth.digest == auth_oi.data_digest;
errorstream << "data_digest 0x" << std::hex << candidate.digest
<< " != "
- << (auth_oi.is_data_digest() && okseed ? "known" : "best guess")
+ << (known ? "known" : "best guess")
<< " data_digest 0x" << auth.digest << std::dec
<< " from auth shard " << auth_shard;
}
@@ -402,9 +404,11 @@ enum scrub_error_type PGBackend::be_compare_scrub_objects(
if (error != CLEAN)
errorstream << ", ";
error = DEEP_ERROR;
+ bool known = okseed && auth_oi.is_omap_digest() &&
+ auth.digest == auth_oi.omap_digest;
errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
<< " != "
- << (auth_oi.is_omap_digest() && okseed ? "known" : "best guess")
+ << (known ? "known" : "best guess")
<< " omap_digest 0x" << auth.omap_digest << std::dec
<< " from auth shard " << auth_shard;
}
@@ -494,6 +498,12 @@ map<pg_shard_t, ScrubMap *>::const_iterator
// invalid object info, probably corrupt
continue;
}
+
+ // note candidate in case we can't find anything better, because
+ // something is better than nothing. FIXME.
+ auth = j;
+ *auth_oi = oi;
+
uint64_t correct_size = be_get_ondisk_size(oi.size);
if (correct_size != i->second.size) {
// invalid size, probably corrupt
@@ -524,18 +534,19 @@ map<pg_shard_t, ScrubMap *>::const_iterator
continue;
}
}
- dout(10) << __func__ << ": selecting osd " << j->first
- << " for obj " << obj
- << dendl;
- auth = j;
- *auth_oi = oi;
+ break;
}
+ dout(10) << __func__ << ": selecting osd " << auth->first
+ << " for obj " << obj
+ << " with oi " << *auth_oi
+ << dendl;
return auth;
}
void PGBackend::be_compare_scrubmaps(
const map<pg_shard_t,ScrubMap*> &maps,
bool okseed,
+ bool repair,
map<hobject_t, set<pg_shard_t> > &missing,
map<hobject_t, set<pg_shard_t> > &inconsistent,
map<hobject_t, list<pg_shard_t> > &authoritative,
@@ -566,14 +577,10 @@ void PGBackend::be_compare_scrubmaps(
be_select_auth_object(*k, maps, okseed, &auth_oi);
list<pg_shard_t> auth_list;
if (auth == maps.end()) {
- // Something is better than nothing
- // TODO: something is NOT better than nothing, do something like
- // unfound_lost if no valid copies can be found, or just mark unfound
- auth = maps.begin();
- dout(10) << __func__ << ": selecting osd " << auth->first
- << " for obj " << *k
- << ", something is better than nothing, FIXME"
- << dendl;
+ dout(10) << __func__ << ": unable to find any auth object" << dendl;
+ ++shallow_errors;
+ errorstream << pgid << " shard " << j->first
+ << ": soid failed to pick suitable auth object\n";
continue;
}
auth_list.push_back(auth->first);
@@ -581,6 +588,7 @@ void PGBackend::be_compare_scrubmaps(
ScrubMap::object& auth_object = auth->second->objects[*k];
set<pg_shard_t> cur_missing;
set<pg_shard_t> cur_inconsistent;
+ bool clean = true;
for (j = maps.begin(); j != maps.end(); ++j) {
if (j == auth)
continue;
@@ -595,21 +603,23 @@ void PGBackend::be_compare_scrubmaps(
j->second->objects[*k],
ss);
if (error != CLEAN) {
+ clean = false;
cur_inconsistent.insert(j->first);
if (error == SHALLOW_ERROR)
++shallow_errors;
else
++deep_errors;
- errorstream << __func__ << ": " << pgid << " shard " << j->first
- << ": soid " << *k << " " << ss.str();
+ errorstream << pgid << " shard " << j->first << ": soid " << *k
+ << " " << ss.str() << "\n";
} else {
auth_list.push_back(j->first);
}
} else {
+ clean = false;
cur_missing.insert(j->first);
++shallow_errors;
- errorstream << __func__ << ": " << pgid << " shard " << j->first
- << " missing " << *k;
+ errorstream << pgid << " shard " << j->first << " missing " << *k
+ << "\n";
}
}
if (!cur_missing.empty()) {
@@ -621,20 +631,54 @@ void PGBackend::be_compare_scrubmaps(
if (!cur_inconsistent.empty() || !cur_missing.empty()) {
authoritative[*k] = auth_list;
}
+
if (okseed &&
- parent->get_pool().is_replicated() &&
- auth_object.digest_present && auth_object.omap_digest_present &&
- (!auth_oi.is_data_digest() || !auth_oi.is_omap_digest() ||
- (g_conf->osd_debug_scrub_chance_rewrite_digest &&
+ clean &&
+ parent->get_pool().is_replicated()) {
+ enum {
+ NO = 0,
+ MAYBE = 1,
+ FORCE = 2,
+ } update = NO;
+
+ // recorded digest != actual digest?
+ if (auth_oi.is_data_digest() && auth_object.digest_present &&
+ auth_oi.data_digest != auth_object.digest) {
+ ++deep_errors;
+ errorstream << pgid << " recorded data digest 0x"
+ << std::hex << auth_oi.data_digest << " != on disk 0x"
+ << auth_object.digest << std::dec << " on " << auth_oi.soid
+ << "\n";
+ if (repair)
+ update = FORCE;
+ }
+ if (auth_oi.is_omap_digest() && auth_object.omap_digest_present &&
+ auth_oi.omap_digest != auth_object.omap_digest) {
+ ++deep_errors;
+ errorstream << pgid << " recorded omap digest 0x"
+ << std::hex << auth_oi.data_digest << " != on disk 0x"
+ << auth_object.digest << std::dec << " on " << auth_oi.soid
+ << "\n";
+ if (repair)
+ update = FORCE;
+ }
+
+ if (auth_object.digest_present && auth_object.omap_digest_present &&
+ (!auth_oi.is_data_digest() || !auth_oi.is_omap_digest())) {
+ dout(20) << __func__ << " missing digest on " << *k << dendl;
+ update = MAYBE;
+ }
+ if (g_conf->osd_debug_scrub_chance_rewrite_digest &&
(((unsigned)rand() % 100) >
- g_conf->osd_debug_scrub_chance_rewrite_digest)))) {
- if (!cur_inconsistent.empty() || !cur_missing.empty()) {
- dout(20) << __func__ << " not updating oi digest on "
- << *k << " since it is inconsistent" << dendl;
- } else {
+ g_conf->osd_debug_scrub_chance_rewrite_digest)) {
+ dout(20) << __func__ << " randomly updating digest on " << *k << dendl;
+ update = MAYBE;
+ }
+ if (update != NO) {
utime_t age = now - auth_oi.local_mtime;
- if (age > g_conf->osd_deep_scrub_update_digest_min_age) {
- dout(20) << __func__ << " noting missing digest on " << *k << dendl;
+ if (update == FORCE ||
+ age > g_conf->osd_deep_scrub_update_digest_min_age) {
+ dout(20) << __func__ << " will update digest on " << *k << dendl;
missing_digest[*k] = make_pair(auth_object.digest,
auth_object.omap_digest);
} else {
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index 91b4d10..1e93641 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -318,25 +318,8 @@
virtual void on_flushed() = 0;
- class IsRecoverablePredicate {
- public:
- /**
- * have encodes the shards available
- */
- virtual bool operator()(const set<pg_shard_t> &have) const = 0;
- virtual ~IsRecoverablePredicate() {}
- };
- virtual IsRecoverablePredicate *get_is_recoverable_predicate() = 0;
-
- class IsReadablePredicate {
- public:
- /**
- * have encodes the shards available
- */
- virtual bool operator()(const set<pg_shard_t> &have) const = 0;
- virtual ~IsReadablePredicate() {}
- };
- virtual IsReadablePredicate *get_is_readable_predicate() = 0;
+ virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() = 0;
+ virtual IsPGReadablePredicate *get_is_readable_predicate() = 0;
void temp_colls(list<coll_t> *out) {
if (temp_created)
@@ -606,6 +589,7 @@
void be_compare_scrubmaps(
const map<pg_shard_t,ScrubMap*> &maps,
bool okseed, ///< true if scrub digests have same seed our oi digests
+ bool repair,
map<hobject_t, set<pg_shard_t> > &missing,
map<hobject_t, set<pg_shard_t> > &inconsistent,
map<hobject_t, list<pg_shard_t> > &authoritative,
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
index 8c02309..b619bcd 100644
--- a/src/osd/PGLog.cc
+++ b/src/osd/PGLog.cc
@@ -190,6 +190,18 @@ void PGLog::proc_replica_log(
dout(10) << "proc_replica_log for osd." << from << ": "
<< oinfo << " " << olog << " " << omissing << dendl;
+ if (olog.head < log.tail) {
+ dout(10) << __func__ << ": osd." << from << " does not overlap, not looking "
+ << "for divergent objects" << dendl;
+ return;
+ }
+ if (olog.head == log.head) {
+ dout(10) << __func__ << ": osd." << from << " same log head, not looking "
+ << "for divergent objects" << dendl;
+ return;
+ }
+ assert(olog.head >= log.tail);
+
/*
basically what we're doing here is rewinding the remote log,
dropping divergent entries, until we find something that matches
@@ -207,48 +219,54 @@ void PGLog::proc_replica_log(
<< " have " << i->second.have << dendl;
}
- list<pg_log_entry_t>::const_iterator fromiter = log.log.end();
- eversion_t lower_bound = log.tail;
+ list<pg_log_entry_t>::const_reverse_iterator first_non_divergent =
+ log.log.rbegin();
while (1) {
- if (fromiter == log.log.begin())
+ if (first_non_divergent == log.log.rend())
break;
- --fromiter;
- if (fromiter->version <= olog.head) {
- dout(20) << "merge_log cut point (usually last shared) is "
- << *fromiter << dendl;
- lower_bound = fromiter->version;
- ++fromiter;
+ if (first_non_divergent->version <= olog.head) {
+ dout(20) << "merge_log point (usually last shared) is "
+ << *first_non_divergent << dendl;
break;
}
- }
+ ++first_non_divergent;
+ }
+
+ /* Because olog.head >= log.tail, we know that both pgs must at least have
+ * the event represented by log.tail. Thus, lower_bound >= log.tail. It's
+ * possible that olog/log contain no actual events between olog.head and
+ * log.tail, however, since they might have been split out. Thus, if
+ * we cannot find an event e such that log.tail <= e.version <= log.head,
+ * the last_update must actually be log.tail.
+ */
+ eversion_t lu =
+ (first_non_divergent == log.log.rend() ||
+ first_non_divergent->version < log.tail) ?
+ log.tail :
+ first_non_divergent->version;
list<pg_log_entry_t> divergent;
list<pg_log_entry_t>::const_iterator pp = olog.log.end();
- eversion_t lu(oinfo.last_update);
while (true) {
- if (pp == olog.log.begin()) {
- if (pp != olog.log.end()) // no last_update adjustment if we discard nothing!
- lu = olog.tail;
+ if (pp == olog.log.begin())
break;
- }
+
--pp;
const pg_log_entry_t& oe = *pp;
// don't continue past the tail of our log.
if (oe.version <= log.tail) {
- lu = oe.version;
++pp;
break;
}
- if (oe.version <= lower_bound) {
- lu = oe.version;
+ if (oe.version <= lu) {
++pp;
break;
}
divergent.push_front(oe);
- }
+ }
IndexedLog folog;
@@ -565,6 +583,7 @@ void PGLog::merge_log(ObjectStore::Transaction& t,
dout(10) << "merge_log extending tail to " << olog.tail << dendl;
list<pg_log_entry_t>::iterator from = olog.log.begin();
list<pg_log_entry_t>::iterator to;
+ eversion_t last;
for (to = from;
to != olog.log.end();
++to) {
@@ -572,12 +591,10 @@ void PGLog::merge_log(ObjectStore::Transaction& t,
break;
log.index(*to);
dout(15) << *to << dendl;
+ last = to->version;
}
-
- if (to == olog.log.end())
- mark_dirty_to(oinfo.last_update);
- else
- mark_dirty_to(to->version);
+ mark_dirty_to(last);
+
// splice into our log.
log.log.splice(log.log.begin(),
olog.log, from, to);
@@ -801,7 +818,7 @@ void PGLog::_write_log(
map<string,bufferlist> keys;
for (list<pg_log_entry_t>::iterator p = log.log.begin();
- p != log.log.end() && p->version < dirty_to;
+ p != log.log.end() && p->version <= dirty_to;
++p) {
bufferlist bl(sizeof(*p) * 2);
p->encode_with_checksum(bl);
diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h
index dcb966b..7029e90 100644
--- a/src/osd/PGLog.h
+++ b/src/osd/PGLog.h
@@ -307,9 +307,9 @@ protected:
/// Log is clean on [dirty_to, dirty_from)
bool touched_log;
- eversion_t dirty_to; ///< must clear/writeout all keys up to dirty_to
- eversion_t dirty_from; ///< must clear/writeout all keys past dirty_from
- eversion_t writeout_from; ///< must writout keys past writeout_from
+ eversion_t dirty_to; ///< must clear/writeout all keys <= dirty_to
+ eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from
+ eversion_t writeout_from; ///< must writout keys >= writeout_from
set<eversion_t> trimmed; ///< must clear keys in trimmed
bool dirty_divergent_priors;
CephContext *cct;
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
index 5ad22bf..5090657 100644
--- a/src/osd/ReplicatedBackend.h
+++ b/src/osd/ReplicatedBackend.h
@@ -73,17 +73,17 @@ public:
void clear_recovery_state();
void on_flushed();
- class RPCRecPred : public IsRecoverablePredicate {
+ class RPCRecPred : public IsPGRecoverablePredicate {
public:
bool operator()(const set<pg_shard_t> &have) const {
return !have.empty();
}
};
- IsRecoverablePredicate *get_is_recoverable_predicate() {
+ IsPGRecoverablePredicate *get_is_recoverable_predicate() {
return new RPCRecPred;
}
- class RPCReadPred : public IsReadablePredicate {
+ class RPCReadPred : public IsPGReadablePredicate {
pg_shard_t whoami;
public:
RPCReadPred(pg_shard_t whoami) : whoami(whoami) {}
@@ -91,7 +91,7 @@ public:
return have.count(whoami);
}
};
- IsReadablePredicate *get_is_readable_predicate() {
+ IsPGReadablePredicate *get_is_readable_predicate() {
return new RPCReadPred(get_parent()->whoami_shard());
}
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 66fd948..e1d0acf 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -5999,6 +5999,8 @@ int ReplicatedPG::fill_in_copy_get(
reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
reply_obj.omap_digest = oi.omap_digest;
}
+ reply_obj.truncate_seq = oi.truncate_seq;
+ reply_obj.truncate_size = oi.truncate_size;
// attrs
map<string,bufferlist>& out_attrs = reply_obj.attrs;
@@ -6188,6 +6190,8 @@ void ReplicatedPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
&cop->results.source_data_digest,
&cop->results.source_omap_digest,
&cop->results.reqids,
+ &cop->results.truncate_seq,
+ &cop->results.truncate_size,
&cop->rval);
C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
@@ -6285,6 +6289,8 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
cop->results.final_tx = pgbackend->get_transaction();
_build_finish_copy_transaction(cop, cop->results.final_tx);
+ derr << __func__ << " got truncate_seq " << cop->results.truncate_seq
+ << " " << cop->results.truncate_size << dendl;
// verify digests?
dout(20) << __func__ << std::hex
<< " got digest: rx data 0x" << cop->results.data_digest
@@ -6485,6 +6491,9 @@ void ReplicatedPG::finish_copyfrom(OpContext *ctx)
obs.oi.set_data_digest(cb->results->data_digest);
obs.oi.set_omap_digest(cb->results->omap_digest);
+ obs.oi.truncate_seq = cb->results->truncate_seq;
+ obs.oi.truncate_size = cb->results->truncate_size;
+
ctx->extra_reqids = cb->results->reqids;
// cache: clear whiteout?
@@ -6660,6 +6669,13 @@ void ReplicatedPG::finish_promote(int r, CopyResults *results,
}
tctx->new_obs.oi.size = results->object_size;
tctx->new_obs.oi.user_version = results->user_version;
+ // Don't care src object whether have data or omap digest
+ if (results->object_size)
+ tctx->new_obs.oi.set_data_digest(results->data_digest);
+ if (results->has_omap)
+ tctx->new_obs.oi.set_omap_digest(results->omap_digest);
+ tctx->new_obs.oi.truncate_seq = results->truncate_seq;
+ tctx->new_obs.oi.truncate_size = results->truncate_size;
if (soid.snap != CEPH_NOSNAP) {
tctx->new_obs.oi.snaps = results->snaps;
@@ -6694,6 +6710,10 @@ void ReplicatedPG::finish_promote(int r, CopyResults *results,
simple_repop_submit(repop);
osd->logger->inc(l_osd_tier_promote);
+
+ if (agent_state &&
+ agent_state->is_idle())
+ agent_choose_mode();
}
void ReplicatedPG::cancel_copy(CopyOpRef cop, bool requeue)
@@ -11251,26 +11271,6 @@ void ReplicatedPG::_scrub(
dout(20) << mode << " " << soid << " " << oi << dendl;
- if (pool.info.is_replicated() &&
- (get_min_peer_features() & CEPH_FEATURE_OSD_OBJECT_DIGEST)) {
- if (oi.is_data_digest() && p->second.digest_present &&
- oi.data_digest != p->second.digest) {
- osd->clog->error() << mode << " " << info.pgid << " " << soid
- << " on disk data digest 0x" << std::hex
- << p->second.digest << " != 0x"
- << oi.data_digest << std::dec;
- ++scrubber.deep_errors;
- }
- if (oi.is_omap_digest() && p->second.omap_digest_present &&
- oi.omap_digest != p->second.omap_digest) {
- osd->clog->error() << mode << " " << info.pgid << " " << soid
- << " on disk omap digest 0x" << std::hex
- << p->second.omap_digest << " != 0x"
- << oi.omap_digest << std::dec;
- ++scrubber.deep_errors;
- }
- }
-
if (soid.is_snap()) {
stat.num_bytes += snapset.get_clone_bytes(soid.snap);
} else {
@@ -11382,27 +11382,25 @@ void ReplicatedPG::_scrub(
++scrubber.shallow_errors;
}
- if (scrubber.shallow_errors == 0) {
- for (map<hobject_t,pair<uint32_t,uint32_t> >::const_iterator p =
- missing_digest.begin();
- p != missing_digest.end();
- ++p) {
- if (p->first.is_snapdir())
- continue;
- dout(10) << __func__ << " recording digests for " << p->first << dendl;
- ObjectContextRef obc = get_object_context(p->first, false);
- assert(obc);
- RepGather *repop = simple_repop_create(obc);
- OpContext *ctx = repop->ctx;
- ctx->at_version = get_next_version();
- ctx->mtime = utime_t(); // do not update mtime
- ctx->new_obs.oi.set_data_digest(p->second.first);
- ctx->new_obs.oi.set_omap_digest(p->second.second);
- finish_ctx(ctx, pg_log_entry_t::MODIFY, true, true);
- ctx->on_finish = new C_ScrubDigestUpdated(this);
- simple_repop_submit(repop);
- ++scrubber.num_digest_updates_pending;
- }
+ for (map<hobject_t,pair<uint32_t,uint32_t> >::const_iterator p =
+ missing_digest.begin();
+ p != missing_digest.end();
+ ++p) {
+ if (p->first.is_snapdir())
+ continue;
+ dout(10) << __func__ << " recording digests for " << p->first << dendl;
+ ObjectContextRef obc = get_object_context(p->first, false);
+ assert(obc);
+ RepGather *repop = simple_repop_create(obc);
+ OpContext *ctx = repop->ctx;
+ ctx->at_version = get_next_version();
+ ctx->mtime = utime_t(); // do not update mtime
+ ctx->new_obs.oi.set_data_digest(p->second.first);
+ ctx->new_obs.oi.set_omap_digest(p->second.second);
+ finish_ctx(ctx, pg_log_entry_t::MODIFY, true, true);
+ ctx->on_finish = new C_ScrubDigestUpdated(this);
+ simple_repop_submit(repop);
+ ++scrubber.num_digest_updates_pending;
}
dout(10) << "_scrub (" << mode << ") finish" << dendl;
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 5b5bc23..48e0def 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -127,6 +127,8 @@ public:
uint32_t source_data_digest, source_omap_digest;
uint32_t data_digest, omap_digest;
vector<pair<osd_reqid_t, version_t> > reqids; // [(reqid, user_version)]
+ uint64_t truncate_seq;
+ uint64_t truncate_size;
bool is_data_digest() {
return flags & object_copy_data_t::FLAG_DATA_DIGEST;
}
@@ -140,7 +142,8 @@ public:
has_omap(false),
flags(0),
source_data_digest(-1), source_omap_digest(-1),
- data_digest(-1), omap_digest(-1)
+ data_digest(-1), omap_digest(-1),
+ truncate_seq(0), truncate_size(0)
{}
};
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 3774452..b2bea5b 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -931,6 +931,16 @@ void pg_pool_t::dump(Formatter *f) const
f->dump_unsigned("expected_num_objects", expected_num_objects);
}
+void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
+ for (size_t i = 0; i < from.size(); ++i) {
+ if (from[i] != CRUSH_ITEM_NONE) {
+ to->insert(
+ pg_shard_t(
+ from[i],
+ ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ }
+}
int pg_pool_t::calc_bits_of(int t)
{
@@ -2547,6 +2557,8 @@ bool pg_interval_t::is_new_interval(
int new_up_primary,
const vector<int> &old_up,
const vector<int> &new_up,
+ int old_size,
+ int new_size,
int old_min_size,
int new_min_size,
unsigned old_pg_num,
@@ -2557,6 +2569,7 @@ bool pg_interval_t::is_new_interval(
old_up_primary != new_up_primary ||
new_up != old_up ||
old_min_size != new_min_size ||
+ old_size != new_size ||
pgid.is_split(old_pg_num, new_pg_num, 0);
}
@@ -2581,6 +2594,8 @@ bool pg_interval_t::is_new_interval(
new_up_primary,
old_up,
new_up,
+ lastmap->get_pools().find(pgid.pool())->second.size,
+ osdmap->get_pools().find(pgid.pool())->second.size,
lastmap->get_pools().find(pgid.pool())->second.min_size,
osdmap->get_pools().find(pgid.pool())->second.min_size,
lastmap->get_pg_num(pgid.pool()),
@@ -2602,6 +2617,7 @@ bool pg_interval_t::check_new_interval(
OSDMapRef osdmap,
OSDMapRef lastmap,
pg_t pgid,
+ IsPGRecoverablePredicate *could_have_gone_active,
map<epoch_t, pg_interval_t> *past_intervals,
std::ostream *out)
{
@@ -2635,9 +2651,14 @@ bool pg_interval_t::check_new_interval(
if (*p != CRUSH_ITEM_NONE)
++num_acting;
+ const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
+ set<pg_shard_t> old_acting_shards;
+ old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);
+
if (num_acting &&
i.primary != -1 &&
- num_acting >= lastmap->get_pools().find(pgid.pool())->second.min_size) {
+ num_acting >= old_pg_pool.min_size &&
+ (*could_have_gone_active)(old_acting_shards)) {
if (out)
*out << "generate_past_intervals " << i
<< ": not rw,"
@@ -3578,7 +3599,7 @@ void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const
return;
}
- ENCODE_START(6, 5, bl);
+ ENCODE_START(7, 5, bl);
::encode(size, bl);
::encode(mtime, bl);
::encode(attrs, bl);
@@ -3592,12 +3613,14 @@ void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const
::encode(data_digest, bl);
::encode(omap_digest, bl);
::encode(reqids, bl);
+ ::encode(truncate_seq, bl);
+ ::encode(truncate_size, bl);
ENCODE_FINISH(bl);
}
void object_copy_data_t::decode(bufferlist::iterator& bl)
{
- DECODE_START(6, bl);
+ DECODE_START(7, bl);
if (struct_v < 5) {
// old
::decode(size, bl);
@@ -3655,6 +3678,10 @@ void object_copy_data_t::decode(bufferlist::iterator& bl)
if (struct_v >= 6) {
::decode(reqids, bl);
}
+ if (struct_v >= 7) {
+ ::decode(truncate_seq, bl);
+ ::decode(truncate_size, bl);
+ }
}
DECODE_FINISH(bl);
}
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 6525a0c..b9b3b81 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -95,6 +95,24 @@ WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
ostream &operator<<(ostream &lhs, const pg_shard_t &rhs);
+class IsPGRecoverablePredicate {
+public:
+ /**
+ * have encodes the shards available
+ */
+ virtual bool operator()(const set<pg_shard_t> &have) const = 0;
+ virtual ~IsPGRecoverablePredicate() {}
+};
+
+class IsPGReadablePredicate {
+public:
+ /**
+ * have encodes the shards available
+ */
+ virtual bool operator()(const set<pg_shard_t> &have) const = 0;
+ virtual ~IsPGReadablePredicate() {}
+};
+
inline ostream& operator<<(ostream& out, const osd_reqid_t& r) {
return out << r.name << "." << r.inc << ":" << r.tid;
}
@@ -879,6 +897,9 @@ struct pg_pool_t {
return 0;
}
+ /// converts the acting/up vector to a set of pg shards
+ void convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const;
+
typedef enum {
CACHEMODE_NONE = 0, ///< no caching
CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later
@@ -1845,6 +1866,8 @@ struct pg_interval_t {
int new_up_primary,
const vector<int> &old_up,
const vector<int> &new_up,
+ int old_size,
+ int new_size,
int old_min_size,
int new_min_size,
unsigned old_pg_num,
@@ -1887,6 +1910,7 @@ struct pg_interval_t {
ceph::shared_ptr<const OSDMap> osdmap, ///< [in] current map
ceph::shared_ptr<const OSDMap> lastmap, ///< [in] last map
pg_t pgid, ///< [in] pgid for pg
+ IsPGRecoverablePredicate *could_have_gone_active, /// [in] predicate whether the pg can be active
map<epoch_t, pg_interval_t> *past_intervals,///< [out] intervals
ostream *out = 0 ///< [out] debug ostream
);
@@ -2582,9 +2606,15 @@ struct object_copy_data_t {
///< recent reqids on this object
vector<pair<osd_reqid_t, version_t> > reqids;
+ uint64_t truncate_seq;
+ uint64_t truncate_size;
+
public:
- object_copy_data_t() : size((uint64_t)-1), data_digest(-1),
- omap_digest(-1), flags(0) {}
+ object_copy_data_t() :
+ size((uint64_t)-1), data_digest(-1),
+ omap_digest(-1), flags(0),
+ truncate_seq(0),
+ truncate_size(0) {}
static void generate_test_instances(list<object_copy_data_t*>& o);
void encode_classic(bufferlist& bl) const;
diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc
index 95f4b8f..d21292e 100644
--- a/src/osdc/ObjectCacher.cc
+++ b/src/osdc/ObjectCacher.cc
@@ -379,7 +379,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(OSDWrite *wr)
if (p->first < cur) {
assert(final == 0);
- if (cur + max >= p->first + p->second->length()) {
+ if (cur + max >= bh->end()) {
// we want right bit (one splice)
final = split(bh, cur); // just split it, take right half.
++p;
@@ -393,7 +393,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(OSDWrite *wr)
}
} else {
assert(p->first == cur);
- if (p->second->length() <= max) {
+ if (bh->length() <= max) {
// whole bufferhead, piece of cake.
} else {
// we want left bit (one splice)
@@ -886,6 +886,7 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start,
}
}
+ list <BufferHead*> hit;
// apply to bh's!
for (map<loff_t, BufferHead*>::iterator p = ob->data_lower_bound(start);
p != ob->data.end();
@@ -917,6 +918,7 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start,
if (r >= 0) {
// ok! mark bh clean and error-free
mark_clean(bh);
+ hit.push_back(bh);
ldout(cct, 10) << "bh_write_commit clean " << *bh << dendl;
} else {
mark_dirty(bh);
@@ -926,6 +928,13 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start,
}
}
+ for (list<BufferHead*>::iterator bh = hit.begin();
+ bh != hit.end();
+ ++bh) {
+ assert(*bh);
+ ob->try_merge_bh(*bh);
+ }
+
// update last_commit.
assert(ob->last_commit_tid < tid);
ob->last_commit_tid = tid;
@@ -1060,6 +1069,13 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
map<uint64_t, bufferlist> stripe_map; // final buffer offset -> substring
bool dontneed = rd->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
+ /*
+ * WARNING: we can only meaningfully return ENOENT if the read request
+ * passed in a single ObjectExtent. Any caller who wants ENOENT instead of
+ * zeroed buffers needs to feed single extents into readx().
+ */
+ assert(!oset->return_enoent || rd->extents.size() == 1);
+
for (vector<ObjectExtent>::iterator ex_it = rd->extents.begin();
ex_it != rd->extents.end();
++ex_it) {
@@ -1075,10 +1091,6 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
// does not exist and no hits?
if (oset->return_enoent && !o->exists) {
- // WARNING: we can only meaningfully return ENOENT if the read request
- // passed in a single ObjectExtent. Any caller who wants ENOENT instead of
- // zeroed buffers needs to feed single extents into readx().
- assert(rd->extents.size() == 1);
ldout(cct, 10) << "readx object !exists, 1 extent..." << dendl;
// should we worry about COW underneaeth us?
@@ -1139,6 +1151,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
if (!missing.empty() || !rx.empty()) {
// read missing
+ map<loff_t, BufferHead*>::iterator last = missing.end();
for (map<loff_t, BufferHead*>::iterator bh_it = missing.begin();
bh_it != missing.end();
++bh_it) {
@@ -1160,15 +1173,20 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
delete bh_it->second;
} else {
bh_read(bh_it->second, rd->fadvise_flags);
- if (success && onfinish) {
- ldout(cct, 10) << "readx missed, waiting on " << *bh_it->second
- << " off " << bh_it->first << dendl;
- bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, oset, onfinish) );
- }
+ if ((success && onfinish) || last != missing.end())
+ last = bh_it;
}
success = false;
}
+ //add wait in last bh avoid wakeup early. Because read is order
+ if (last != missing.end()) {
+ ldout(cct, 10) << "readx missed, waiting on " << *last->second
+ << " off " << last->first << dendl;
+ last->second->waitfor_read[last->first].push_back( new C_RetryRead(this, rd, oset, onfinish) );
+
+ }
+
// bump rx
for (map<loff_t, BufferHead*>::iterator bh_it = rx.begin();
bh_it != rx.end();
@@ -1210,56 +1228,58 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
}
}
- // create reverse map of buffer offset -> object for the eventual result.
- // this is over a single ObjectExtent, so we know that
- // - the bh's are contiguous
- // - the buffer frags need not be (and almost certainly aren't)
- loff_t opos = ex_it->offset;
- map<loff_t, BufferHead*>::iterator bh_it = hits.begin();
- assert(bh_it->second->start() <= opos);
- uint64_t bhoff = opos - bh_it->second->start();
- vector<pair<uint64_t,uint64_t> >::iterator f_it = ex_it->buffer_extents.begin();
- uint64_t foff = 0;
- while (1) {
- BufferHead *bh = bh_it->second;
- assert(opos == (loff_t)(bh->start() + bhoff));
-
- uint64_t len = MIN(f_it->second - foff, bh->length() - bhoff);
- ldout(cct, 10) << "readx rmap opos " << opos
- << ": " << *bh << " +" << bhoff
- << " frag " << f_it->first << "~" << f_it->second << " +" << foff << "~" << len
- << dendl;
+ if (!error) {
+ // create reverse map of buffer offset -> object for the eventual result.
+ // this is over a single ObjectExtent, so we know that
+ // - the bh's are contiguous
+ // - the buffer frags need not be (and almost certainly aren't)
+ loff_t opos = ex_it->offset;
+ map<loff_t, BufferHead*>::iterator bh_it = hits.begin();
+ assert(bh_it->second->start() <= opos);
+ uint64_t bhoff = opos - bh_it->second->start();
+ vector<pair<uint64_t,uint64_t> >::iterator f_it = ex_it->buffer_extents.begin();
+ uint64_t foff = 0;
+ while (1) {
+ BufferHead *bh = bh_it->second;
+ assert(opos == (loff_t)(bh->start() + bhoff));
+
+ uint64_t len = MIN(f_it->second - foff, bh->length() - bhoff);
+ ldout(cct, 10) << "readx rmap opos " << opos
+ << ": " << *bh << " +" << bhoff
+ << " frag " << f_it->first << "~" << f_it->second << " +" << foff << "~" << len
+ << dendl;
+
+ bufferlist bit; // put substr here first, since substr_of clobbers, and
+ // we may get multiple bh's at this stripe_map position
+ if (bh->is_zero()) {
+ bufferptr bp(len);
+ bp.zero();
+ stripe_map[f_it->first].push_back(bp);
+ } else {
+ bit.substr_of(bh->bl,
+ opos - bh->start(),
+ len);
+ stripe_map[f_it->first].claim_append(bit);
+ }
- bufferlist bit; // put substr here first, since substr_of clobbers, and
- // we may get multiple bh's at this stripe_map position
- if (bh->is_zero()) {
- bufferptr bp(len);
- bp.zero();
- stripe_map[f_it->first].push_back(bp);
- } else {
- bit.substr_of(bh->bl,
- opos - bh->start(),
- len);
- stripe_map[f_it->first].claim_append(bit);
+ opos += len;
+ bhoff += len;
+ foff += len;
+ if (opos == bh->end()) {
+ ++bh_it;
+ bhoff = 0;
+ }
+ if (foff == f_it->second) {
+ ++f_it;
+ foff = 0;
+ }
+ if (bh_it == hits.end()) break;
+ if (f_it == ex_it->buffer_extents.end())
+ break;
}
-
- opos += len;
- bhoff += len;
- foff += len;
- if (opos == bh->end()) {
- ++bh_it;
- bhoff = 0;
- }
- if (foff == f_it->second) {
- ++f_it;
- foff = 0;
- }
- if (bh_it == hits.end()) break;
- if (f_it == ex_it->buffer_extents.end())
- break;
+ assert(f_it == ex_it->buffer_extents.end());
+ assert(opos == (loff_t)ex_it->offset + (loff_t)ex_it->length);
}
- assert(f_it == ex_it->buffer_extents.end());
- assert(opos == (loff_t)ex_it->offset + (loff_t)ex_it->length);
if (dontneed && o->include_all_cached_data(ex_it->offset, ex_it->length))
bottouch_ob(o);
@@ -1303,7 +1323,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
assert(rd->bl->length() == pos);
}
ldout(cct, 10) << "readx result is " << rd->bl->length() << dendl;
- } else {
+ } else if (!error) {
ldout(cct, 10) << "readx no bufferlist ptr (readahead?), done." << dendl;
map<uint64_t,bufferlist>::reverse_iterator i = stripe_map.rbegin();
pos = i->first + i->second.length();
@@ -1334,8 +1354,7 @@ void ObjectCacher::retry_waiting_reads()
waitfor_read.splice(waitfor_read.end(), ls);
}
-int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Mutex& wait_on_lock,
- Context *onfreespace)
+int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace)
{
assert(lock.is_locked());
utime_t now = ceph_clock_now(cct);
@@ -1408,7 +1427,7 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Mutex& wait_on_lock,
}
}
- int r = _wait_for_write(wr, bytes_written, oset, wait_on_lock, onfreespace);
+ int r = _wait_for_write(wr, bytes_written, oset, onfreespace);
delete wr;
//verify_stats();
@@ -1456,7 +1475,7 @@ void ObjectCacher::maybe_wait_for_writeback(uint64_t len)
}
// blocking wait for write.
-int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, Mutex& lock, Context *onfreespace)
+int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, Context *onfreespace)
{
assert(lock.is_locked());
int ret = 0;
@@ -1501,6 +1520,7 @@ int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, M
void ObjectCacher::flusher_entry()
{
ldout(cct, 10) << "flusher start" << dendl;
+ writeback_handler.get_client_lock();
lock.Lock();
while (!flusher_stop) {
loff_t all = get_stat_tx() + get_stat_rx() + get_stat_clean() + get_stat_dirty();
@@ -1537,13 +1557,21 @@ void ObjectCacher::flusher_entry()
if (!max) {
// back off the lock to avoid starving other threads
lock.Unlock();
+ writeback_handler.put_client_lock();
+ writeback_handler.get_client_lock();
lock.Lock();
continue;
}
}
if (flusher_stop)
break;
+
+ writeback_handler.put_client_lock();
flusher_cond.WaitInterval(cct, lock, utime_t(1,0));
+ lock.Unlock();
+
+ writeback_handler.get_client_lock();
+ lock.Lock();
}
/* Wait for reads to finish. This is only possible if handling
@@ -1559,6 +1587,7 @@ void ObjectCacher::flusher_entry()
}
lock.Unlock();
+ writeback_handler.put_client_lock();
ldout(cct, 10) << "flusher finish" << dendl;
}
diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h
index ca23549..0bef597 100644
--- a/src/osdc/ObjectCacher.h
+++ b/src/osdc/ObjectCacher.h
@@ -602,14 +602,12 @@ class ObjectCacher {
* the return value is total bytes read
*/
int readx(OSDRead *rd, ObjectSet *oset, Context *onfinish);
- int writex(OSDWrite *wr, ObjectSet *oset, Mutex& wait_on_lock,
- Context *onfreespace);
+ int writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace);
bool is_cached(ObjectSet *oset, vector<ObjectExtent>& extents, snapid_t snapid);
private:
// write blocking
- int _wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, Mutex& lock,
- Context *onfreespace);
+ int _wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, Context *onfreespace);
void maybe_wait_for_writeback(uint64_t len);
bool _flush_set_finish(C_GatherBuilder *gather, Context *onfinish);
@@ -678,11 +676,10 @@ public:
int file_write(ObjectSet *oset, ceph_file_layout *layout, const SnapContext& snapc,
loff_t offset, uint64_t len,
- bufferlist& bl, utime_t mtime, int flags,
- Mutex& wait_on_lock) {
+ bufferlist& bl, utime_t mtime, int flags) {
OSDWrite *wr = prepare_write(snapc, bl, mtime, flags);
Striper::file_to_extents(cct, oset->ino, layout, offset, len, oset->truncate_size, wr->extents);
- return writex(wr, oset, wait_on_lock, NULL);
+ return writex(wr, oset, NULL);
}
bool file_flush(ObjectSet *oset, ceph_file_layout *layout, const SnapContext& snapc,
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index 6818feb..856425a 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -2428,12 +2428,14 @@ int Objecter::_calc_target(op_target_t *t, epoch_t *last_force_resend, bool any
}
}
+ int size = pi->size;
int min_size = pi->min_size;
unsigned pg_num = pi->get_pg_num();
int up_primary, acting_primary;
vector<int> up, acting;
osdmap->pg_to_up_acting_osds(pgid, &up, &up_primary,
&acting, &acting_primary);
+ unsigned prev_seed = ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask);
if (any_change && pg_interval_t::is_new_interval(
t->acting_primary,
acting_primary,
@@ -2443,11 +2445,13 @@ int Objecter::_calc_target(op_target_t *t, epoch_t *last_force_resend, bool any
up_primary,
t->up,
up,
+ t->size,
+ size,
t->min_size,
min_size,
t->pg_num,
pg_num,
- pi->raw_pg_to_pg(pgid))) {
+ pg_t(prev_seed, pgid.pool(), pgid.preferred()))) {
force_resend = true;
}
@@ -2469,8 +2473,10 @@ int Objecter::_calc_target(op_target_t *t, epoch_t *last_force_resend, bool any
t->acting_primary = acting_primary;
t->up_primary = up_primary;
t->up = up;
+ t->size = size;
t->min_size = min_size;
t->pg_num = pg_num;
+ t->pg_num_mask = pi->get_pg_num_mask();
ldout(cct, 10) << __func__ << " "
<< " pgid " << pgid << " acting " << acting << dendl;
t->used_replica = false;
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 3466d43..b9fd0cd 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -633,6 +633,8 @@ struct ObjectOperation {
uint32_t *out_data_digest;
uint32_t *out_omap_digest;
vector<pair<osd_reqid_t, version_t> > *out_reqids;
+ uint64_t *out_truncate_seq;
+ uint64_t *out_truncate_size;
int *prval;
C_ObjectOperation_copyget(object_copy_cursor_t *c,
uint64_t *s,
@@ -646,13 +648,18 @@ struct ObjectOperation {
uint32_t *dd,
uint32_t *od,
vector<pair<osd_reqid_t, version_t> > *oreqids,
+ uint64_t *otseq,
+ uint64_t *otsize,
int *r)
: cursor(c),
out_size(s), out_mtime(m),
out_attrs(a), out_data(d), out_omap_header(oh),
out_omap_data(o), out_snaps(osnaps), out_snap_seq(osnap_seq),
out_flags(flags), out_data_digest(dd), out_omap_digest(od),
- out_reqids(oreqids), prval(r) {}
+ out_reqids(oreqids),
+ out_truncate_seq(otseq),
+ out_truncate_size(otsize),
+ prval(r) {}
void finish(int r) {
if (r < 0)
return;
@@ -684,6 +691,10 @@ struct ObjectOperation {
*out_omap_digest = copy_reply.omap_digest;
if (out_reqids)
*out_reqids = copy_reply.reqids;
+ if (out_truncate_seq)
+ *out_truncate_seq = copy_reply.truncate_seq;
+ if (out_truncate_size)
+ *out_truncate_size = copy_reply.truncate_size;
*cursor = copy_reply.cursor;
} catch (buffer::error& e) {
if (prval)
@@ -706,6 +717,8 @@ struct ObjectOperation {
uint32_t *out_data_digest,
uint32_t *out_omap_digest,
vector<pair<osd_reqid_t, version_t> > *out_reqids,
+ uint64_t *truncate_seq,
+ uint64_t *truncate_size,
int *prval) {
OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_GET);
osd_op.op.copy_get.max = max;
@@ -718,7 +731,8 @@ struct ObjectOperation {
out_attrs, out_data, out_omap_header,
out_omap_data, out_snaps, out_snap_seq,
out_flags, out_data_digest, out_omap_digest,
- out_reqids, prval);
+ out_reqids, truncate_seq, truncate_size,
+ prval);
out_bl[p] = &h->bl;
out_handler[p] = h;
}
@@ -1113,16 +1127,18 @@ public:
object_t target_oid;
object_locator_t target_oloc;
- bool precalc_pgid; ///< true if we are directed at base_pgid, not base_oid
- pg_t base_pgid; ///< explciti pg target, if any
+ bool precalc_pgid; ///< true if we are directed at base_pgid, not base_oid
+ pg_t base_pgid; ///< explciti pg target, if any
- pg_t pgid; ///< last pg we mapped to
- unsigned pg_num; ///< last pg_num we mapped to
- vector<int> up; ///< set of up osds for last pg we mapped to
- vector<int> acting; ///< set of acting osds for last pg we mapped to
- int up_primary; ///< primary for last pg we mapped to based on the up set
- int acting_primary; ///< primary for last pg we mapped to based on the acting set
- int min_size; ///< the min size of the pool when were were last mapped
+ pg_t pgid; ///< last pg we mapped to
+ unsigned pg_num; ///< last pg_num we mapped to
+ unsigned pg_num_mask; ///< last pg_num_mask we mapped to
+ vector<int> up; ///< set of up osds for last pg we mapped to
+ vector<int> acting; ///< set of acting osds for last pg we mapped to
+ int up_primary; ///< primary for last pg we mapped to based on the up set
+ int acting_primary; ///< primary for last pg we mapped to based on the acting set
+ int size; ///< the size of the pool when were were last mapped
+ int min_size; ///< the min size of the pool when were were last mapped
bool used_replica;
bool paused;
@@ -1135,8 +1151,10 @@ public:
base_oloc(oloc),
precalc_pgid(false),
pg_num(0),
+ pg_num_mask(0),
up_primary(-1),
acting_primary(-1),
+ size(-1),
min_size(-1),
used_replica(false),
paused(false),
@@ -1456,7 +1474,7 @@ public:
Context *onfinish, *ontimeout;
int pool_op;
uint64_t auid;
- __u8 crush_rule;
+ int16_t crush_rule;
snapid_t snapid;
bufferlist *blp;
diff --git a/src/osdc/WritebackHandler.h b/src/osdc/WritebackHandler.h
index 466f84e..fe7d977 100644
--- a/src/osdc/WritebackHandler.h
+++ b/src/osdc/WritebackHandler.h
@@ -37,6 +37,9 @@ class WritebackHandler {
int op, int flags, Context *onack, Context *oncommit) {
assert(0 == "this WritebackHandler does not support the lock operation");
}
+
+ virtual void get_client_lock() {}
+ virtual void put_client_lock() {}
};
#endif
diff --git a/src/rgw/Makefile.am b/src/rgw/Makefile.am
index 316ae76..7620d73 100644
--- a/src/rgw/Makefile.am
+++ b/src/rgw/Makefile.am
@@ -100,7 +100,7 @@ radosgw_CFLAGS = -I$(srcdir)/civetweb/include
radosgw_LDADD = $(LIBRGW) $(LIBCIVETWEB) $(LIBRGW_DEPS) $(RESOLV_LIBS) $(CEPH_GLOBAL)
bin_PROGRAMS += radosgw
-radosgw_admin_SOURCES = rgw/rgw_admin.cc
+radosgw_admin_SOURCES = rgw/rgw_admin.cc rgw/rgw_orphan.cc
radosgw_admin_LDADD = $(LIBRGW) $(LIBRGW_DEPS) $(CEPH_GLOBAL)
bin_PROGRAMS += radosgw-admin
@@ -141,6 +141,7 @@ noinst_HEADERS += \
rgw/rgw_metadata.h \
rgw/rgw_multi_del.h \
rgw/rgw_op.h \
+ rgw/rgw_orphan.h \
rgw/rgw_http_client.h \
rgw/rgw_swift.h \
rgw/rgw_swift_auth.h \
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index 5debefb..45cb2e1 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -31,6 +31,7 @@ using namespace std;
#include "rgw_formats.h"
#include "rgw_usage.h"
#include "rgw_replica_log.h"
+#include "rgw_orphan.h"
#define dout_subsys ceph_subsys_rgw
@@ -124,6 +125,7 @@ void _usage()
cerr << " --access=<access> Set access permissions for sub-user, should be one\n";
cerr << " of read, write, readwrite, full\n";
cerr << " --display-name=<name>\n";
+ cerr << " --max_buckets max number of buckets for a user\n";
cerr << " --system set the system flag on the user\n";
cerr << " --bucket=<bucket>\n";
cerr << " --pool=<pool>\n";
@@ -164,6 +166,7 @@ void _usage()
cerr << " --categories=<list> comma separated list of categories, used in usage show\n";
cerr << " --caps=<caps> list of caps (e.g., \"usage=read, write; user=read\"\n";
cerr << " --yes-i-really-mean-it required for certain operations\n";
+ cerr << " --reset-regions reset regionmap when regionmap update";
cerr << "\n";
cerr << "<date> := \"YYYY-MM-DD[ hh:mm:ss]\"\n";
cerr << "\nQuota options:\n";
@@ -232,6 +235,8 @@ enum {
OPT_QUOTA_DISABLE,
OPT_GC_LIST,
OPT_GC_PROCESS,
+ OPT_ORPHANS_FIND,
+ OPT_ORPHANS_FINISH,
OPT_REGION_GET,
OPT_REGION_LIST,
OPT_REGION_SET,
@@ -281,6 +286,7 @@ static int get_cmd(const char *cmd, const char *prev_cmd, bool *need_more)
strcmp(cmd, "object") == 0 ||
strcmp(cmd, "olh") == 0 ||
strcmp(cmd, "opstate") == 0 ||
+ strcmp(cmd, "orphans") == 0 ||
strcmp(cmd, "pool") == 0 ||
strcmp(cmd, "pools") == 0 ||
strcmp(cmd, "quota") == 0 ||
@@ -441,6 +447,11 @@ static int get_cmd(const char *cmd, const char *prev_cmd, bool *need_more)
return OPT_GC_LIST;
if (strcmp(cmd, "process") == 0)
return OPT_GC_PROCESS;
+ } else if (strcmp(prev_cmd, "orphans") == 0) {
+ if (strcmp(cmd, "find") == 0)
+ return OPT_ORPHANS_FIND;
+ if (strcmp(cmd, "finish") == 0)
+ return OPT_ORPHANS_FINISH;
} else if (strcmp(prev_cmd, "metadata") == 0) {
if (strcmp(cmd, "get") == 0)
return OPT_METADATA_GET;
@@ -1059,6 +1070,7 @@ int do_check_object_locator(const string& bucket_name, bool fix, bool remove_bad
return 0;
}
+
int main(int argc, char **argv)
{
vector<const char*> args;
@@ -1133,6 +1145,7 @@ int main(int argc, char **argv)
int include_all = false;
int sync_stats = false;
+ int reset_regions = false;
uint64_t min_rewrite_size = 4 * 1024 * 1024;
uint64_t max_rewrite_size = ULLONG_MAX;
@@ -1140,6 +1153,11 @@ int main(int argc, char **argv)
BIIndexType bi_index_type = PlainIdx;
+ string job_id;
+ int num_shards = 0;
+ int max_concurrent_ios = 32;
+ uint64_t orphan_stale_secs = (24 * 3600);
+
std::string val;
std::ostringstream errs;
string err;
@@ -1189,6 +1207,8 @@ int main(int argc, char **argv)
cerr << "bad key type: " << key_type_str << std::endl;
return usage();
}
+ } else if (ceph_argparse_witharg(args, i, &val, "--job-id", (char*)NULL)) {
+ job_id = val;
} else if (ceph_argparse_binary_flag(args, i, &gen_access_key, NULL, "--gen-access-key", (char*)NULL)) {
// do nothing
} else if (ceph_argparse_binary_flag(args, i, &gen_secret_key, NULL, "--gen-secret", (char*)NULL)) {
@@ -1238,6 +1258,12 @@ int main(int argc, char **argv)
start_date = val;
} else if (ceph_argparse_witharg(args, i, &val, "--end-date", "--end-time", (char*)NULL)) {
end_date = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--num-shards", (char*)NULL)) {
+ num_shards = atoi(val.c_str());
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-concurrent-ios", (char*)NULL)) {
+ max_concurrent_ios = atoi(val.c_str());
+ } else if (ceph_argparse_witharg(args, i, &val, "--orphan-stale-secs", (char*)NULL)) {
+ orphan_stale_secs = (uint64_t)atoi(val.c_str());
} else if (ceph_argparse_witharg(args, i, &val, "--shard-id", (char*)NULL)) {
shard_id = atoi(val.c_str());
specified_shard_id = true;
@@ -1292,6 +1318,8 @@ int main(int argc, char **argv)
// do nothing
} else if (ceph_argparse_binary_flag(args, i, &include_all, NULL, "--include-all", (char*)NULL)) {
// do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &reset_regions, NULL, "--reset-regions", (char*)NULL)) {
+ // do nothing
} else if (ceph_argparse_witharg(args, i, &val, "--caps", (char*)NULL)) {
caps = val;
} else if (ceph_argparse_witharg(args, i, &val, "-i", "--infile", (char*)NULL)) {
@@ -1530,6 +1558,10 @@ int main(int argc, char **argv)
return -ret;
}
+ if (reset_regions) {
+ regionmap.regions.clear();
+ }
+
for (list<string>::iterator iter = regions.begin(); iter != regions.end(); ++iter) {
ret = region.read_info(*iter);
if (ret < 0) {
@@ -2557,6 +2589,55 @@ next:
}
}
+ if (opt_cmd == OPT_ORPHANS_FIND) {
+ RGWOrphanSearch search(store, max_concurrent_ios, orphan_stale_secs);
+
+ if (job_id.empty()) {
+ cerr << "ERROR: --job-id not specified" << std::endl;
+ return EINVAL;
+ }
+ if (pool_name.empty()) {
+ cerr << "ERROR: --pool not specified" << std::endl;
+ return EINVAL;
+ }
+
+ RGWOrphanSearchInfo info;
+
+ info.pool = pool_name;
+ info.job_name = job_id;
+ info.num_shards = num_shards;
+
+ int ret = search.init(job_id, &info);
+ if (ret < 0) {
+ cerr << "could not init search, ret=" << ret << std::endl;
+ return -ret;
+ }
+ ret = search.run();
+ if (ret < 0) {
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT_ORPHANS_FINISH) {
+ RGWOrphanSearch search(store, max_concurrent_ios, orphan_stale_secs);
+
+ if (job_id.empty()) {
+ cerr << "ERROR: --job-id not specified" << std::endl;
+ return EINVAL;
+ }
+ int ret = search.init(job_id, NULL);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ cerr << "job not found" << std::endl;
+ }
+ return -ret;
+ }
+ ret = search.finish();
+ if (ret < 0) {
+ return -ret;
+ }
+ }
+
if (opt_cmd == OPT_USER_CHECK) {
check_bad_user_bucket_mapping(store, user_id, fix);
}
diff --git a/src/rgw/rgw_civetweb.cc b/src/rgw/rgw_civetweb.cc
index eea3b14..81e504c 100644
--- a/src/rgw/rgw_civetweb.cc
+++ b/src/rgw/rgw_civetweb.cc
@@ -143,6 +143,9 @@ int RGWMongoose::send_status(const char *status, const char *status_name)
bl.append(header_data);
header_data = bl;
+ int status_num = atoi(status);
+ mg_set_http_status(conn, status_num);
+
return 0;
}
diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc
index f3988cf..8d9ebf0 100644
--- a/src/rgw/rgw_common.cc
+++ b/src/rgw/rgw_common.cc
@@ -99,7 +99,7 @@ is_err() const
req_info::req_info(CephContext *cct, class RGWEnv *e) : env(e) {
- method = env->get("REQUEST_METHOD");
+ method = env->get("REQUEST_METHOD", "");
script_uri = env->get("SCRIPT_URI", cct->_conf->rgw_script_uri.c_str());
request_uri = env->get("REQUEST_URI", cct->_conf->rgw_request_uri.c_str());
int pos = request_uri.find('?');
@@ -109,7 +109,22 @@ req_info::req_info(CephContext *cct, class RGWEnv *e) : env(e) {
} else {
request_params = env->get("QUERY_STRING", "");
}
- host = env->get("HTTP_HOST");
+ host = env->get("HTTP_HOST", "");
+
+ // strip off any trailing :port from host (added by CrossFTP and maybe others)
+ size_t colon_offset = host.find_last_of(':');
+ if (colon_offset != string::npos) {
+ bool all_digits = true;
+ for (unsigned i = colon_offset + 1; i < host.size(); ++i) {
+ if (!isdigit(host[i])) {
+ all_digits = false;
+ break;
+ }
+ }
+ if (all_digits) {
+ host.resize(colon_offset);
+ }
+ }
}
void req_info::rebuild_from(req_info& src)
@@ -341,18 +356,17 @@ bool parse_iso8601(const char *s, struct tm *t)
}
string str;
trim_whitespace(p, str);
- if (str.size() == 1 && str[0] == 'Z')
+ int len = str.size();
+
+ if (len == 1 && str[0] == 'Z')
return true;
- if (str.size() != 5) {
- return false;
- }
if (str[0] != '.' ||
- str[str.size() - 1] != 'Z')
+ str[len - 1] != 'Z')
return false;
uint32_t ms;
- int r = stringtoul(str.substr(1, 3), &ms);
+ int r = stringtoul(str.substr(1, len - 2), &ms);
if (r < 0)
return false;
@@ -525,6 +539,26 @@ int gen_rand_alphanumeric_no_underscore(CephContext *cct, char *dest, int size)
return 0;
}
+static const char alphanum_plain_table[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
+
+int gen_rand_alphanumeric_plain(CephContext *cct, char *dest, int size) /* size should be the required string size + 1 */
+{
+ int ret = get_random_bytes(dest, size);
+ if (ret < 0) {
+ lderr(cct) << "cannot get random bytes: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ int i;
+ for (i=0; i<size - 1; i++) {
+ int pos = (unsigned)dest[i];
+ dest[i] = alphanum_plain_table[pos % (sizeof(alphanum_plain_table) - 1)];
+ }
+ dest[i] = '\0';
+
+ return 0;
+}
+
int NameVal::parse()
{
int delim_pos = str.find('=');
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index 6c7912b..5b4e39b 100644
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -194,6 +194,7 @@ extern int gen_rand_alphanumeric(CephContext *cct, char *dest, int size);
extern int gen_rand_alphanumeric_lower(CephContext *cct, char *dest, int size);
extern int gen_rand_alphanumeric_upper(CephContext *cct, char *dest, int size);
extern int gen_rand_alphanumeric_no_underscore(CephContext *cct, char *dest, int size);
+extern int gen_rand_alphanumeric_plain(CephContext *cct, char *dest, int size);
extern int gen_rand_alphanumeric_lower(CephContext *cct, string *str, int length);
@@ -908,7 +909,7 @@ struct req_info {
RGWHTTPArgs args;
map<string, string> x_meta_map;
- const char *host;
+ string host;
const char *method;
string script_uri;
string request_uri;
@@ -1057,6 +1058,8 @@ struct req_state {
string req_id;
+ string trans_id;
+
req_info info;
req_state(CephContext *_cct, class RGWEnv *e);
@@ -1316,31 +1319,46 @@ public:
* part of the given namespace, it returns false.
*/
static bool translate_raw_obj_to_obj_in_ns(string& obj, string& instance, string& ns) {
- if (ns.empty()) {
- if (obj[0] != '_')
- return true;
-
- if (obj.size() >= 2 && obj[1] == '_') {
- obj = obj.substr(1);
+ if (obj[0] != '_') {
+ if (ns.empty()) {
return true;
}
-
return false;
}
- if (obj[0] != '_' || obj.size() < 3) // for namespace, min size would be 3: _x_
+ string obj_ns;
+ bool ret = parse_raw_oid(obj, &obj, &instance, &obj_ns);
+ if (!ret) {
+ return ret;
+ }
+
+ return (ns == obj_ns);
+ }
+
+ static bool parse_raw_oid(const string& oid, string *obj_name, string *obj_instance, string *obj_ns) {
+ obj_instance->clear();
+ obj_ns->clear();
+ if (oid[0] != '_') {
+ *obj_name = oid;
+ return true;
+ }
+
+ if (oid.size() >= 2 && oid[1] == '_') {
+ *obj_name = oid.substr(1);
+ return true;
+ }
+
+ if (oid[0] != '_' || oid.size() < 3) // for namespace, min size would be 3: _x_
return false;
- int pos = obj.find('_', 1);
+ int pos = oid.find('_', 1);
if (pos <= 1) // if it starts with __, it's not in our namespace
return false;
- string obj_ns = obj.substr(1, pos - 1);
- parse_ns_field(obj_ns, instance);
- if (obj_ns.compare(ns) != 0)
- return false;
+ *obj_ns = oid.substr(1, pos - 1);
+ parse_ns_field(*obj_ns, *obj_instance);
- obj = obj.substr(pos + 1);
+ *obj_name = oid.substr(pos + 1);
return true;
}
diff --git a/src/rgw/rgw_gc.cc b/src/rgw/rgw_gc.cc
index c536fa9..3a949b6 100644
--- a/src/rgw/rgw_gc.cc
+++ b/src/rgw/rgw_gc.cc
@@ -95,7 +95,7 @@ int RGWGC::list(int *index, string& marker, uint32_t max, bool expired_only, std
{
result.clear();
- for (; *index < cct->_conf->rgw_gc_max_objs && result.size() < max; (*index)++, marker.clear()) {
+ for (; *index < max_objs && result.size() < max; (*index)++, marker.clear()) {
std::list<cls_rgw_gc_obj_info> entries;
int ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[*index], marker, max - result.size(), expired_only, entries, truncated);
if (ret == -ENOENT)
@@ -108,7 +108,7 @@ int RGWGC::list(int *index, string& marker, uint32_t max, bool expired_only, std
result.push_back(*iter);
}
- if (*index == cct->_conf->rgw_gc_max_objs - 1) {
+ if (*index == max_objs - 1) {
/* we cut short here, truncated will hold the correct value */
return 0;
}
@@ -186,7 +186,7 @@ int RGWGC::process(int index, int max_secs)
if (obj.pool != last_pool) {
delete ctx;
ctx = new IoCtx;
- ret = store->rados->ioctx_create(obj.pool.c_str(), *ctx);
+ ret = store->get_rados_handle()->ioctx_create(obj.pool.c_str(), *ctx);
if (ret < 0) {
dout(0) << "ERROR: failed to create ioctx pool=" << obj.pool << dendl;
continue;
@@ -234,7 +234,6 @@ done:
int RGWGC::process()
{
- int max_objs = cct->_conf->rgw_gc_max_objs;
int max_secs = cct->_conf->rgw_gc_processor_max_time;
unsigned start;
diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc
index 2a247e4..0ddd9de 100644
--- a/src/rgw/rgw_main.cc
+++ b/src/rgw/rgw_main.cc
@@ -555,8 +555,9 @@ static int process_request(RGWRados *store, RGWREST *rest, RGWRequest *req, RGWC
s->obj_ctx = &rados_ctx;
s->req_id = store->unique_id(req->id);
+ s->trans_id = store->unique_trans_id(req->id);
- req->log(s, "initializing");
+ req->log_format(s, "initializing for trans_id = %s", s->trans_id.c_str());
RGWOp *op = NULL;
int init_error = 0;
@@ -1261,8 +1262,6 @@ int main(int argc, const char **argv)
dout(1) << "final shutdown" << dendl;
g_ceph_context->put();
- ceph::crypto::shutdown();
-
signal_fd_finalize();
return 0;
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index bf6c3e7..4301bdd 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -845,6 +845,12 @@ int RGWGetObj::handle_user_manifest(const char *prefix)
s->obj_size = total_len;
+ if (!get_data) {
+ bufferlist bl;
+ send_response_data(bl, 0, 0);
+ return 0;
+ }
+
r = iterate_user_manifest_parts(s->cct, store, ofs, end, bucket, obj_prefix, bucket_policy, NULL, get_obj_user_manifest_iterate_cb, (void *)this);
if (r < 0)
return r;
diff --git a/src/rgw/rgw_orphan.cc b/src/rgw/rgw_orphan.cc
new file mode 100644
index 0000000..2818d79
--- /dev/null
+++ b/src/rgw/rgw_orphan.cc
@@ -0,0 +1,810 @@
+
+
+#include <string>
+
+using namespace std;
+
+#include "common/config.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "rgw_rados.h"
+#include "rgw_orphan.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#define DEFAULT_NUM_SHARDS 64
+
+static string obj_fingerprint(const string& oid, const char *force_ns = NULL)
+{
+ ssize_t pos = oid.find('_');
+ if (pos < 0) {
+ cerr << "ERROR: object does not have a bucket marker: " << oid << std::endl;
+ }
+
+ string obj_marker = oid.substr(0, pos);
+
+ string obj_name;
+ string obj_instance;
+ string obj_ns;
+
+ rgw_obj::parse_raw_oid(oid.substr(pos + 1), &obj_name, &obj_instance, &obj_ns);
+
+ if (obj_ns.empty()) {
+ return oid;
+ }
+
+ string s = oid;
+
+ if (force_ns) {
+ rgw_bucket b;
+ rgw_obj new_obj(b, obj_name);
+ new_obj.set_ns(force_ns);
+ new_obj.set_instance(obj_instance);
+ s = obj_marker + "_" + new_obj.get_object();
+ }
+
+ /* cut out suffix */
+ size_t i = s.size() - 1;
+ for (; i >= s.size() - 10; --i) {
+ char c = s[i];
+ if (!isdigit(c) && c != '.' && c != '_') {
+ break;
+ }
+ }
+
+ return s.substr(0, i + 1);
+}
+
+int RGWOrphanStore::read_job(const string& job_name, RGWOrphanSearchState & state)
+{
+ set<string> keys;
+ map<string, bufferlist> vals;
+ keys.insert(job_name);
+ int r = ioctx.omap_get_vals_by_keys(oid, keys, &vals);
+ if (r < 0) {
+ return r;
+ }
+
+ map<string, bufferlist>::iterator iter = vals.find(job_name);
+ if (iter == vals.end()) {
+ return -ENOENT;
+ }
+
+ try {
+ bufferlist& bl = iter->second;
+ ::decode(state, bl);
+ } catch (buffer::error& err) {
+ lderr(store->ctx()) << "ERROR: could not decode buffer" << dendl;
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int RGWOrphanStore::write_job(const string& job_name, const RGWOrphanSearchState& state)
+{
+ map<string, bufferlist> vals;
+ bufferlist bl;
+ ::encode(state, bl);
+ vals[job_name] = bl;
+ int r = ioctx.omap_set(oid, vals);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWOrphanStore::remove_job(const string& job_name)
+{
+ set<string> keys;
+ keys.insert(job_name);
+
+ int r = ioctx.omap_rm_keys(oid, keys);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWOrphanStore::init()
+{
+ const char *log_pool = store->get_zone_params().log_pool.name.c_str();
+ librados::Rados *rados = store->get_rados_handle();
+ int r = rados->ioctx_create(log_pool, ioctx);
+ if (r < 0) {
+ cerr << "ERROR: failed to open log pool (" << store->get_zone_params().log_pool.name << " ret=" << r << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWOrphanStore::store_entries(const string& oid, const map<string, bufferlist>& entries)
+{
+ librados::ObjectWriteOperation op;
+ op.omap_set(entries);
+ cout << "storing " << entries.size() << " entries at " << oid << std::endl;
+ ldout(store->ctx(), 20) << "storing " << entries.size() << " entries at " << oid << ": " << dendl;
+ for (map<string, bufferlist>::const_iterator iter = entries.begin(); iter != entries.end(); ++iter) {
+ ldout(store->ctx(), 20) << " > " << iter->first << dendl;
+ }
+ int ret = ioctx.operate(oid, &op);
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: " << __func__ << "(" << oid << ") returned ret=" << ret << dendl;
+ }
+
+ return 0;
+}
+
+int RGWOrphanStore::read_entries(const string& oid, const string& marker, map<string, bufferlist> *entries, bool *truncated)
+{
+#define MAX_OMAP_GET 100
+ int ret = ioctx.omap_get_vals(oid, marker, MAX_OMAP_GET, entries);
+ if (ret < 0) {
+ cerr << "ERROR: " << __func__ << "(" << oid << ") returned ret=" << ret << std::endl;
+ }
+
+ *truncated = (entries->size() == MAX_OMAP_GET);
+
+ return 0;
+}
+
+int RGWOrphanSearch::init(const string& job_name, RGWOrphanSearchInfo *info) {
+ int r = orphan_store.init();
+ if (r < 0) {
+ return r;
+ }
+
+ RGWOrphanSearchState state;
+ r = orphan_store.read_job(job_name, state);
+ if (r < 0 && r != -ENOENT) {
+ lderr(store->ctx()) << "ERROR: failed to read state ret=" << r << dendl;
+ return r;
+ }
+
+ uint64_t num_shards = (info->num_shards ? info->num_shards : DEFAULT_NUM_SHARDS);
+ if (r == 0) {
+ if (num_shards != state.info.num_shards) {
+ return -EINVAL;
+ }
+ search_info = state.info;
+ search_stage = state.stage;
+ } else { /* r == -ENOENT */
+ search_info = *info;
+ search_info.job_name = job_name;
+ search_info.num_shards = num_shards;
+ search_info.start_time = ceph_clock_now(store->ctx());
+ search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_INIT);
+
+ r = save_state();
+ if (r < 0) {
+ lderr(store->ctx()) << "ERROR: failed to write state ret=" << r << dendl;
+ return r;
+ }
+ }
+
+ index_objs_prefix = RGW_ORPHAN_INDEX_PREFIX + string(".");
+ index_objs_prefix += job_name;
+
+ for (int i = 0; i < search_info.num_shards; i++) {
+ char buf[128];
+
+ snprintf(buf, sizeof(buf), "%s.rados.%d", index_objs_prefix.c_str(), i);
+ all_objs_index[i] = buf;
+
+ snprintf(buf, sizeof(buf), "%s.buckets.%d", index_objs_prefix.c_str(), i);
+ buckets_instance_index[i] = buf;
+
+ snprintf(buf, sizeof(buf), "%s.linked.%d", index_objs_prefix.c_str(), i);
+ linked_objs_index[i] = buf;
+ }
+ return 0;
+}
+
+int RGWOrphanSearch::log_oids(map<int, string>& log_shards, map<int, list<string> >& oids)
+{
+ map<int, list<string> >::iterator miter = oids.begin();
+
+ list<log_iter_info> liters; /* a list of iterator pairs for begin and end */
+
+ for (; miter != oids.end(); ++miter) {
+ log_iter_info info;
+ info.oid = log_shards[miter->first];
+ info.cur = miter->second.begin();
+ info.end = miter->second.end();
+ liters.push_back(info);
+ }
+
+ list<log_iter_info>::iterator list_iter;
+ while (!liters.empty()) {
+ list_iter = liters.begin();
+
+ while (list_iter != liters.end()) {
+ log_iter_info& cur_info = *list_iter;
+
+ list<string>::iterator& cur = cur_info.cur;
+ list<string>::iterator& end = cur_info.end;
+
+ map<string, bufferlist> entries;
+#define MAX_OMAP_SET_ENTRIES 100
+ for (int j = 0; cur != end && j != MAX_OMAP_SET_ENTRIES; ++cur, ++j) {
+ ldout(store->ctx(), 20) << "adding obj: " << *cur << dendl;
+ entries[*cur] = bufferlist();
+ }
+
+ int ret = orphan_store.store_entries(cur_info.oid, entries);
+ if (ret < 0) {
+ return ret;
+ }
+ list<log_iter_info>::iterator tmp = list_iter;
+ ++list_iter;
+ if (cur == end) {
+ liters.erase(tmp);
+ }
+ }
+ }
+ return 0;
+}
+
+int RGWOrphanSearch::build_all_oids_index()
+{
+ librados::Rados *rados = store->get_rados_handle();
+
+ librados::IoCtx ioctx;
+
+ int ret = rados->ioctx_create(search_info.pool.c_str(), ioctx);
+ if (ret < 0) {
+ lderr(store->ctx()) << __func__ << ": ioctx_create() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ ioctx.set_namespace(librados::all_nspaces);
+ librados::NObjectIterator i = ioctx.nobjects_begin();
+ librados::NObjectIterator i_end = ioctx.nobjects_end();
+
+ map<int, list<string> > oids;
+
+ int count = 0;
+ uint64_t total = 0;
+
+ cout << "logging all objects in the pool" << std::endl;
+
+ for (; i != i_end; ++i) {
+ string nspace = i->get_nspace();
+ string oid = i->get_oid();
+ string locator = i->get_locator();
+
+ string name = oid;
+ if (locator.size())
+ name += " (@" + locator + ")";
+
+ string oid_fp = obj_fingerprint(oid);
+
+ ldout(store->ctx(), 20) << "oid_fp=" << oid_fp << dendl;
+
+ int shard = orphan_shard(oid_fp);
+ oids[shard].push_back(oid);
+
+#define COUNT_BEFORE_FLUSH 1000
+ ++total;
+ if (++count >= COUNT_BEFORE_FLUSH) {
+ ldout(store->ctx(), 1) << "iterated through " << total << " objects" << dendl;
+ ret = log_oids(all_objs_index, oids);
+ if (ret < 0) {
+ cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl;
+ return ret;
+ }
+ count = 0;
+ oids.clear();
+ }
+ }
+ ret = log_oids(all_objs_index, oids);
+ if (ret < 0) {
+ cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWOrphanSearch::build_buckets_instance_index()
+{
+ void *handle;
+ int max = 1000;
+ string section = "bucket.instance";
+ int ret = store->meta_mgr->list_keys_init(section, &handle);
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: can't get key: " << cpp_strerror(-ret) << dendl;
+ return -ret;
+ }
+
+ map<int, list<string> > instances;
+
+ bool truncated;
+
+ RGWObjectCtx obj_ctx(store);
+
+ int count = 0;
+ uint64_t total = 0;
+
+ do {
+ list<string> keys;
+ ret = store->meta_mgr->list_keys_next(handle, max, keys, &truncated);
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << dendl;
+ return -ret;
+ }
+
+ for (list<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
+ ++total;
+ ldout(store->ctx(), 10) << "bucket_instance=" << *iter << " total=" << total << dendl;
+ int shard = orphan_shard(*iter);
+ instances[shard].push_back(*iter);
+
+ if (++count >= COUNT_BEFORE_FLUSH) {
+ ret = log_oids(buckets_instance_index, instances);
+ if (ret < 0) {
+ lderr(store->ctx()) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl;
+ return ret;
+ }
+ count = 0;
+ instances.clear();
+ }
+ }
+
+ } while (truncated);
+
+ ret = log_oids(buckets_instance_index, instances);
+ if (ret < 0) {
+ lderr(store->ctx()) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl;
+ return ret;
+ }
+ store->meta_mgr->list_keys_complete(handle);
+
+ return 0;
+}
+
+int RGWOrphanSearch::handle_stat_result(map<int, list<string> >& oids, RGWRados::Object::Stat::Result& result)
+{
+ set<string> obj_oids;
+ rgw_bucket& bucket = result.obj.bucket;
+ if (!result.has_manifest) { /* a very very old object, or part of a multipart upload during upload */
+ const string loc = bucket.bucket_id + "_" + result.obj.get_object();
+ obj_oids.insert(obj_fingerprint(loc));
+
+ /*
+ * multipart parts don't have manifest on them, it's in the meta object. Instead of reading the
+ * meta object, just add a "shadow" object to the mix
+ */
+ obj_oids.insert(obj_fingerprint(loc, "shadow"));
+ } else {
+ RGWObjManifest& manifest = result.manifest;
+
+ RGWObjManifest::obj_iterator miter;
+ for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
+ const rgw_obj& loc = miter.get_location();
+ string s = bucket.bucket_id + "_" + loc.get_object();
+ obj_oids.insert(obj_fingerprint(s));
+ }
+ }
+
+ for (set<string>::iterator iter = obj_oids.begin(); iter != obj_oids.end(); ++iter) {
+ ldout(store->ctx(), 20) << __func__ << ": oid for obj=" << result.obj << ": " << *iter << dendl;
+
+ int shard = orphan_shard(*iter);
+ oids[shard].push_back(*iter);
+ }
+
+ return 0;
+}
+
+int RGWOrphanSearch::pop_and_handle_stat_op(map<int, list<string> >& oids, std::deque<RGWRados::Object::Stat>& ops)
+{
+ RGWRados::Object::Stat& front_op = ops.front();
+
+ int ret = front_op.wait();
+ if (ret < 0) {
+ if (ret != -ENOENT) {
+ lderr(store->ctx()) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl;
+ }
+ goto done;
+ }
+ ret = handle_stat_result(oids, front_op.result);
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: handle_stat_response() returned error: " << cpp_strerror(-ret) << dendl;
+ }
+done:
+ ops.pop_front();
+ return ret;
+}
+
+int RGWOrphanSearch::build_linked_oids_for_bucket(const string& bucket_instance_id, map<int, list<string> >& oids)
+{
+ ldout(store->ctx(), 10) << "building linked oids for bucket instance: " << bucket_instance_id << dendl;
+ RGWBucketInfo bucket_info;
+ RGWObjectCtx obj_ctx(store);
+ int ret = store->get_bucket_instance_info(obj_ctx, bucket_instance_id, bucket_info, NULL, NULL);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ /* probably raced with bucket removal */
+ return 0;
+ }
+ lderr(store->ctx()) << __func__ << ": ERROR: RGWRados::get_bucket_instance_info() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ RGWRados::Bucket target(store, bucket_info.bucket);
+ RGWRados::Bucket::List list_op(&target);
+
+ string marker;
+ list_op.params.marker = rgw_obj_key(marker);
+ list_op.params.list_versions = true;
+ list_op.params.enforce_ns = false;
+
+ bool truncated;
+
+ deque<RGWRados::Object::Stat> stat_ops;
+
+ int count = 0;
+
+ do {
+ vector<RGWObjEnt> result;
+
+#define MAX_LIST_OBJS_ENTRIES 100
+ ret = list_op.list_objects(MAX_LIST_OBJS_ENTRIES, &result, NULL, &truncated);
+ if (ret < 0) {
+ cerr << "ERROR: store->list_objects(): " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ for (vector<RGWObjEnt>::iterator iter = result.begin(); iter != result.end(); ++iter) {
+ RGWObjEnt& entry = *iter;
+ if (entry.key.instance.empty()) {
+ ldout(store->ctx(), 20) << "obj entry: " << entry.key.name << dendl;
+ } else {
+ ldout(store->ctx(), 20) << "obj entry: " << entry.key.name << " [" << entry.key.instance << "]" << dendl;
+ }
+
+ ldout(store->ctx(), 20) << __func__ << ": entry.key.name=" << entry.key.name << " entry.key.instance=" << entry.key.instance << " entry.ns=" << entry.ns << dendl;
+ rgw_obj obj(bucket_info.bucket, entry.key);
+ obj.set_ns(entry.ns);
+
+ RGWRados::Object op_target(store, bucket_info, obj_ctx, obj);
+
+ stat_ops.push_back(RGWRados::Object::Stat(&op_target));
+ RGWRados::Object::Stat& op = stat_ops.back();
+
+
+ ret = op.stat_async();
+ if (ret < 0) {
+ lderr(store->ctx()) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ if (stat_ops.size() >= max_concurrent_ios) {
+ ret = pop_and_handle_stat_op(oids, stat_ops);
+ if (ret < 0) {
+ if (ret != -ENOENT) {
+ lderr(store->ctx()) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl;
+ }
+ }
+ }
+ if (++count >= COUNT_BEFORE_FLUSH) {
+ ret = log_oids(linked_objs_index, oids);
+ if (ret < 0) {
+ cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl;
+ return ret;
+ }
+ count = 0;
+ oids.clear();
+ }
+ }
+ } while (truncated);
+
+ while (!stat_ops.empty()) {
+ ret = pop_and_handle_stat_op(oids, stat_ops);
+ if (ret < 0) {
+ if (ret != -ENOENT) {
+ lderr(store->ctx()) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl;
+ }
+ }
+ }
+
+ return 0;
+}
+
+int RGWOrphanSearch::build_linked_oids_index()
+{
+ map<int, list<string> > oids;
+ map<int, string>::iterator iter = buckets_instance_index.find(search_stage.shard);
+ for (; iter != buckets_instance_index.end(); ++iter) {
+ ldout(store->ctx(), 0) << "building linked oids index: " << iter->first << "/" << buckets_instance_index.size() << dendl;
+ bool truncated;
+
+ string oid = iter->second;
+
+ do {
+ map<string, bufferlist> entries;
+ int ret = orphan_store.read_entries(oid, search_stage.marker, &entries, &truncated);
+ if (ret == -ENOENT) {
+ truncated = false;
+ ret = 0;
+ }
+
+ if (ret < 0) {
+ lderr(store->ctx()) << __func__ << ": ERROR: read_entries() oid=" << oid << " returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ if (entries.empty()) {
+ break;
+ }
+
+ for (map<string, bufferlist>::iterator eiter = entries.begin(); eiter != entries.end(); ++eiter) {
+ ldout(store->ctx(), 20) << " indexed entry: " << eiter->first << dendl;
+ ret = build_linked_oids_for_bucket(eiter->first, oids);
+ }
+
+ search_stage.shard = iter->first;
+ search_stage.marker = entries.rbegin()->first; /* last entry */
+ } while (truncated);
+
+ search_stage.marker.clear();
+ }
+
+ int ret = log_oids(linked_objs_index, oids);
+ if (ret < 0) {
+ cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl;
+ return ret;
+ }
+
+ save_state();
+
+ return 0;
+}
+
+class OMAPReader {
+ librados::IoCtx ioctx;
+ string oid;
+
+ map<string, bufferlist> entries;
+ map<string, bufferlist>::iterator iter;
+ string marker;
+ bool truncated;
+
+public:
+ OMAPReader(librados::IoCtx& _ioctx, const string& _oid) : ioctx(_ioctx), oid(_oid), truncated(true) {
+ iter = entries.end();
+ }
+
+ int get_next(string *key, bufferlist *pbl, bool *done);
+};
+
+int OMAPReader::get_next(string *key, bufferlist *pbl, bool *done)
+{
+ if (iter != entries.end()) {
+ *key = iter->first;
+ if (pbl) {
+ *pbl = iter->second;
+ }
+ ++iter;
+ *done = false;
+ marker = *key;
+ return 0;
+ }
+
+ if (!truncated) {
+ *done = true;
+ return 0;
+ }
+
+#define MAX_OMAP_GET_ENTRIES 100
+ int ret = ioctx.omap_get_vals(oid, marker, MAX_OMAP_GET_ENTRIES, &entries);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ *done = true;
+ return 0;
+ }
+ return ret;
+ }
+
+ truncated = (entries.size() == MAX_OMAP_GET_ENTRIES);
+ iter = entries.begin();
+ return get_next(key, pbl, done);
+}
+
+int RGWOrphanSearch::compare_oid_indexes()
+{
+ assert(linked_objs_index.size() == all_objs_index.size());
+
+ librados::IoCtx& ioctx = orphan_store.get_ioctx();
+
+ librados::IoCtx data_ioctx;
+
+ librados::Rados *rados = store->get_rados_handle();
+
+ int ret = rados->ioctx_create(search_info.pool.c_str(), data_ioctx);
+ if (ret < 0) {
+ lderr(store->ctx()) << __func__ << ": ioctx_create() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ uint64_t time_threshold = search_info.start_time.sec() - stale_secs;
+
+ map<int, string>::iterator liter = linked_objs_index.begin();
+ map<int, string>::iterator aiter = all_objs_index.begin();
+
+ for (; liter != linked_objs_index.end(); ++liter, ++aiter) {
+ OMAPReader linked_entries(ioctx, liter->second);
+ OMAPReader all_entries(ioctx, aiter->second);
+
+ bool done;
+
+ string cur_linked;
+ bool linked_done = false;
+
+
+ do {
+ string key;
+ int r = all_entries.get_next(&key, NULL, &done);
+ if (r < 0) {
+ return r;
+ }
+ if (done) {
+ break;
+ }
+
+ string key_fp = obj_fingerprint(key);
+
+ while (cur_linked < key_fp && !linked_done) {
+ r = linked_entries.get_next(&cur_linked, NULL, &linked_done);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ if (cur_linked == key_fp) {
+ ldout(store->ctx(), 20) << "linked: " << key << dendl;
+ continue;
+ }
+
+ time_t mtime;
+ r = data_ioctx.stat(key, NULL, &mtime);
+ if (r < 0) {
+ if (r != -ENOENT) {
+ lderr(store->ctx()) << "ERROR: ioctx.stat(" << key << ") returned ret=" << r << dendl;
+ }
+ continue;
+ }
+ if (stale_secs && (uint64_t)mtime >= time_threshold) {
+ ldout(store->ctx(), 20) << "skipping: " << key << " (mtime=" << mtime << " threshold=" << time_threshold << ")" << dendl;
+ continue;
+ }
+ ldout(store->ctx(), 20) << "leaked: " << key << dendl;
+ cout << "leaked: " << key << std::endl;
+ } while (!done);
+ }
+
+ return 0;
+}
+
+int RGWOrphanSearch::run()
+{
+ int r;
+
+ switch (search_stage.stage) {
+
+ case ORPHAN_SEARCH_STAGE_INIT:
+ ldout(store->ctx(), 0) << __func__ << "(): initializing state" << dendl;
+ search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_LSPOOL);
+ r = save_state();
+ if (r < 0) {
+ lderr(store->ctx()) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl;
+ return r;
+ }
+ // fall through
+ case ORPHAN_SEARCH_STAGE_LSPOOL:
+ ldout(store->ctx(), 0) << __func__ << "(): building index of all objects in pool" << dendl;
+ r = build_all_oids_index();
+ if (r < 0) {
+ lderr(store->ctx()) << __func__ << ": ERROR: build_all_oids_index() returned ret=" << r << dendl;
+ return r;
+ }
+
+ search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_LSBUCKETS);
+ r = save_state();
+ if (r < 0) {
+ lderr(store->ctx()) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl;
+ return r;
+ }
+ // fall through
+
+ case ORPHAN_SEARCH_STAGE_LSBUCKETS:
+ ldout(store->ctx(), 0) << __func__ << "(): building index of all bucket indexes" << dendl;
+ r = build_buckets_instance_index();
+ if (r < 0) {
+ lderr(store->ctx()) << __func__ << ": ERROR: build_buckets_instance_index() returned ret=" << r << dendl;
+ return r;
+ }
+
+ search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_ITERATE_BI);
+ r = save_state();
+ if (r < 0) {
+ lderr(store->ctx()) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl;
+ return r;
+ }
+ // fall through
+
+
+ case ORPHAN_SEARCH_STAGE_ITERATE_BI:
+ ldout(store->ctx(), 0) << __func__ << "(): building index of all linked objects" << dendl;
+ r = build_linked_oids_index();
+ if (r < 0) {
+ lderr(store->ctx()) << __func__ << ": ERROR: build_linked_oids_index() returned ret=" << r << dendl;
+ return r;
+ }
+
+ search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_COMPARE);
+ r = save_state();
+ if (r < 0) {
+ lderr(store->ctx()) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl;
+ return r;
+ }
+ // fall through
+
+ case ORPHAN_SEARCH_STAGE_COMPARE:
+ r = compare_oid_indexes();
+ if (r < 0) {
+ lderr(store->ctx()) << __func__ << ": ERROR: compare_oid_indexes() returned ret=" << r << dendl;
+ return r;
+ }
+
+ break;
+
+ default:
+ assert(0);
+ };
+
+ return 0;
+}
+
+
+int RGWOrphanSearch::remove_index(map<int, string>& index)
+{
+ librados::IoCtx& ioctx = orphan_store.get_ioctx();
+
+ for (map<int, string>::iterator iter = index.begin(); iter != index.end(); ++iter) {
+ int r = ioctx.remove(iter->second);
+ if (r < 0) {
+ if (r != -ENOENT) {
+ ldout(store->ctx(), 0) << "ERROR: couldn't remove " << iter->second << ": ret=" << r << dendl;
+ }
+ }
+ }
+ return 0;
+}
+
+int RGWOrphanSearch::finish()
+{
+ int r = remove_index(all_objs_index);
+ if (r < 0) {
+ ldout(store->ctx(), 0) << "ERROR: remove_index(" << all_objs_index << ") returned ret=" << r << dendl;
+ }
+ r = remove_index(buckets_instance_index);
+ if (r < 0) {
+ ldout(store->ctx(), 0) << "ERROR: remove_index(" << buckets_instance_index << ") returned ret=" << r << dendl;
+ }
+ r = remove_index(linked_objs_index);
+ if (r < 0) {
+ ldout(store->ctx(), 0) << "ERROR: remove_index(" << linked_objs_index << ") returned ret=" << r << dendl;
+ }
+
+ r = orphan_store.remove_job(search_info.job_name);
+ if (r < 0) {
+ ldout(store->ctx(), 0) << "ERROR: could not remove job name (" << search_info.job_name << ") ret=" << r << dendl;
+ }
+
+ return r;
+}
diff --git a/src/rgw/rgw_orphan.h b/src/rgw/rgw_orphan.h
new file mode 100644
index 0000000..ad539b2
--- /dev/null
+++ b/src/rgw/rgw_orphan.h
@@ -0,0 +1,209 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RGW_ORPHAN_H
+#define CEPH_RGW_ORPHAN_H
+
+#include "common/config.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "rgw_rados.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#define RGW_ORPHAN_INDEX_OID "orphan.index"
+#define RGW_ORPHAN_INDEX_PREFIX "orphan.scan"
+
+
+enum RGWOrphanSearchStageId {
+ ORPHAN_SEARCH_STAGE_UNKNOWN = 0,
+ ORPHAN_SEARCH_STAGE_INIT = 1,
+ ORPHAN_SEARCH_STAGE_LSPOOL = 2,
+ ORPHAN_SEARCH_STAGE_LSBUCKETS = 3,
+ ORPHAN_SEARCH_STAGE_ITERATE_BI = 4,
+ ORPHAN_SEARCH_STAGE_COMPARE = 5,
+};
+
+
+struct RGWOrphanSearchStage {
+ RGWOrphanSearchStageId stage;
+ int shard;
+ string marker;
+
+ RGWOrphanSearchStage() : stage(ORPHAN_SEARCH_STAGE_UNKNOWN), shard(0) {}
+ RGWOrphanSearchStage(RGWOrphanSearchStageId _stage) : stage(_stage), shard(0) {}
+ RGWOrphanSearchStage(RGWOrphanSearchStageId _stage, int _shard, const string& _marker) : stage(_stage), shard(_shard), marker(_marker) {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ ::encode((int)stage, bl);
+ ::encode(shard, bl);
+ ::encode(marker, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::iterator& bl) {
+ DECODE_START(1, bl);
+ int s;
+ ::decode(s, bl);
+ stage = (RGWOrphanSearchStageId)s;
+ ::decode(shard, bl);
+ ::decode(marker, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOrphanSearchStage)
+
+struct RGWOrphanSearchInfo {
+ string job_name;
+ string pool;
+ uint16_t num_shards;
+ utime_t start_time;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ ::encode(job_name, bl);
+ ::encode(pool, bl);
+ ::encode(num_shards, bl);
+ ::encode(start_time, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::iterator& bl) {
+ DECODE_START(1, bl);
+ ::decode(job_name, bl);
+ ::decode(pool, bl);
+ ::decode(num_shards, bl);
+ ::decode(start_time, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOrphanSearchInfo)
+
+struct RGWOrphanSearchState {
+ RGWOrphanSearchInfo info;
+ RGWOrphanSearchStage stage;
+
+ RGWOrphanSearchState() : stage(ORPHAN_SEARCH_STAGE_UNKNOWN) {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ ::encode(info, bl);
+ ::encode(stage, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::iterator& bl) {
+ DECODE_START(1, bl);
+ ::decode(info, bl);
+ ::decode(stage, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOrphanSearchState)
+
+class RGWOrphanStore {
+ RGWRados *store;
+ librados::IoCtx ioctx;
+
+ string oid;
+
+public:
+ RGWOrphanStore(RGWRados *_store) : store(_store) {
+ oid = RGW_ORPHAN_INDEX_OID;
+ }
+
+ librados::IoCtx& get_ioctx() { return ioctx; }
+
+ int init();
+
+ int read_job(const string& job_name, RGWOrphanSearchState& state);
+ int write_job(const string& job_name, const RGWOrphanSearchState& state);
+ int remove_job(const string& job_name);
+
+
+ int store_entries(const string& oid, const map<string, bufferlist>& entries);
+ int read_entries(const string& oid, const string& marker, map<string, bufferlist> *entries, bool *truncated);
+};
+
+
+class RGWOrphanSearch {
+ RGWRados *store;
+
+ RGWOrphanStore orphan_store;
+
+ RGWOrphanSearchInfo search_info;
+ RGWOrphanSearchStage search_stage;
+
+ map<int, string> all_objs_index;
+ map<int, string> buckets_instance_index;
+ map<int, string> linked_objs_index;
+
+ string index_objs_prefix;
+
+ uint16_t max_concurrent_ios;
+ uint64_t stale_secs;
+
+ struct log_iter_info {
+ string oid;
+ list<string>::iterator cur;
+ list<string>::iterator end;
+ };
+
+ int log_oids(map<int, string>& log_shards, map<int, list<string> >& oids);
+
+#define RGW_ORPHANSEARCH_HASH_PRIME 7877
+ int orphan_shard(const string& str) {
+ return ceph_str_hash_linux(str.c_str(), str.size()) % RGW_ORPHANSEARCH_HASH_PRIME % search_info.num_shards;
+ }
+
+ int handle_stat_result(map<int, list<string> >& oids, RGWRados::Object::Stat::Result& result);
+ int pop_and_handle_stat_op(map<int, list<string> >& oids, std::deque<RGWRados::Object::Stat>& ops);
+
+
+ int remove_index(map<int, string>& index);
+public:
+ RGWOrphanSearch(RGWRados *_store, int _max_ios, uint64_t _stale_secs) : store(_store), orphan_store(store), max_concurrent_ios(_max_ios), stale_secs(_stale_secs) {}
+
+ int save_state() {
+ RGWOrphanSearchState state;
+ state.info = search_info;
+ state.stage = search_stage;
+ return orphan_store.write_job(search_info.job_name, state);
+ }
+
+ int init(const string& job_name, RGWOrphanSearchInfo *info);
+
+ int create(const string& job_name, int num_shards);
+
+ int build_all_oids_index();
+ int build_buckets_instance_index();
+ int build_linked_oids_for_bucket(const string& bucket_instance_id, map<int, list<string> >& oids);
+ int build_linked_oids_index();
+ int compare_oid_indexes();
+
+ int run();
+ int finish();
+};
+
+
+
+#endif
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 6717c54..05c41ef 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -1258,7 +1258,7 @@ int RGWRados::unwatch(uint64_t watch_handle)
ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl;
return r;
}
- r = rados->watch_flush();
+ r = rados[0]->watch_flush();
if (r < 0) {
ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl;
return r;
@@ -1433,11 +1433,17 @@ void RGWRados::finalize()
{
if (finisher) {
finisher->stop();
- delete finisher;
}
if (need_watch_notify()) {
finalize_watch();
}
+ if (finisher) {
+ /* delete finisher only after cleaning up watches, as watch error path might call
+ * into finisher. We stop finisher before finalizing watch to make sure we don't
+ * actually handle any racing work
+ */
+ delete finisher;
+ }
delete meta_mgr;
delete data_log;
if (use_gc_thread) {
@@ -1466,24 +1472,54 @@ void RGWRados::finalize()
*/
int RGWRados::init_rados()
{
- int ret;
+ int ret = 0;
- rados = new Rados();
- if (!rados)
- return -ENOMEM;
+ num_rados_handles = cct->_conf->rgw_num_rados_handles;
- ret = rados->init_with_context(cct);
- if (ret < 0)
- return ret;
+ rados = new librados::Rados *[num_rados_handles];
+ if (!rados) {
+ ret = -ENOMEM;
+ return ret;
+ }
- ret = rados->connect();
- if (ret < 0)
- return ret;
+ for (uint32_t i=0; i < num_rados_handles; i++) {
+
+ rados[i] = new Rados();
+ if (!rados[i]) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ ret = rados[i]->init_with_context(cct);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ ret = rados[i]->connect();
+ if (ret < 0) {
+ goto fail;
+ }
+ }
meta_mgr = new RGWMetadataManager(cct, this);
data_log = new RGWDataChangesLog(cct, this);
return ret;
+
+fail:
+ for (uint32_t i=0; i < num_rados_handles; i++) {
+ if (rados[i]) {
+ delete rados[i];
+ rados[i] = NULL;
+ }
+ }
+ num_rados_handles = 0;
+ if (rados) {
+ delete[] rados;
+ rados = NULL;
+ }
+
+ return ret;
}
/**
@@ -1520,6 +1556,8 @@ int RGWRados::init_complete()
if (ret < 0)
return ret;
+ init_unique_trans_id_deps();
+
ret = region_map.read(cct, this);
if (ret < 0) {
if (ret != -ENOENT) {
@@ -1689,15 +1727,16 @@ int RGWRados::open_root_pool_ctx()
{
const string& pool = zone.domain_root.name;
const char *pool_str = pool.c_str();
- int r = rados->ioctx_create(pool_str, root_pool_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(pool_str, root_pool_ctx);
if (r == -ENOENT) {
- r = rados->pool_create(pool_str);
+ r = rad->pool_create(pool_str);
if (r == -EEXIST)
r = 0;
if (r < 0)
return r;
- r = rados->ioctx_create(pool_str, root_pool_ctx);
+ r = rad->ioctx_create(pool_str, root_pool_ctx);
}
return r;
@@ -1706,15 +1745,16 @@ int RGWRados::open_root_pool_ctx()
int RGWRados::open_gc_pool_ctx()
{
const char *gc_pool = zone.gc_pool.name.c_str();
- int r = rados->ioctx_create(gc_pool, gc_pool_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(gc_pool, gc_pool_ctx);
if (r == -ENOENT) {
- r = rados->pool_create(gc_pool);
+ r = rad->pool_create(gc_pool);
if (r == -EEXIST)
r = 0;
if (r < 0)
return r;
- r = rados->ioctx_create(gc_pool, gc_pool_ctx);
+ r = rad->ioctx_create(gc_pool, gc_pool_ctx);
}
return r;
@@ -1723,15 +1763,16 @@ int RGWRados::open_gc_pool_ctx()
int RGWRados::init_watch()
{
const char *control_pool = zone.control_pool.name.c_str();
- int r = rados->ioctx_create(control_pool, control_pool_ctx);
+ librados::Rados *rad = rados[0];
+ int r = rad->ioctx_create(control_pool, control_pool_ctx);
if (r == -ENOENT) {
- r = rados->pool_create(control_pool);
+ r = rad->pool_create(control_pool);
if (r == -EEXIST)
r = 0;
if (r < 0)
return r;
- r = rados->ioctx_create(control_pool, control_pool_ctx);
+ r = rad->ioctx_create(control_pool, control_pool_ctx);
if (r < 0)
return r;
}
@@ -1787,18 +1828,19 @@ void RGWRados::pick_control_oid(const string& key, string& notify_oid)
int RGWRados::open_bucket_pool_ctx(const string& bucket_name, const string& pool, librados::IoCtx& io_ctx)
{
- int r = rados->ioctx_create(pool.c_str(), io_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(pool.c_str(), io_ctx);
if (r != -ENOENT)
return r;
if (!pools_initialized)
return r;
- r = rados->pool_create(pool.c_str());
+ r = rad->pool_create(pool.c_str());
if (r < 0 && r != -EEXIST)
return r;
- r = rados->ioctx_create(pool.c_str(), io_ctx);
+ r = rad->ioctx_create(pool.c_str(), io_ctx);
return r;
}
@@ -1888,7 +1930,8 @@ int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
{
log_list_state *state = new log_list_state;
const char *log_pool = zone.log_pool.name.c_str();
- int r = rados->ioctx_create(log_pool, state->io_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(log_pool, state->io_ctx);
if (r < 0) {
delete state;
return r;
@@ -1923,7 +1966,8 @@ int RGWRados::log_remove(const string& name)
{
librados::IoCtx io_ctx;
const char *log_pool = zone.log_pool.name.c_str();
- int r = rados->ioctx_create(log_pool, io_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(log_pool, io_ctx);
if (r < 0)
return r;
return io_ctx.remove(name);
@@ -1943,7 +1987,8 @@ int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
{
log_show_state *state = new log_show_state;
const char *log_pool = zone.log_pool.name.c_str();
- int r = rados->ioctx_create(log_pool, state->io_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(log_pool, state->io_ctx);
if (r < 0) {
delete state;
return r;
@@ -2163,7 +2208,8 @@ int RGWRados::time_log_add(const string& oid, const utime_t& ut, const string& s
librados::IoCtx io_ctx;
const char *log_pool = zone.log_pool.name.c_str();
- int r = rados->ioctx_create(log_pool, io_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(log_pool, io_ctx);
if (r == -ENOENT) {
rgw_bucket pool(log_pool);
r = create_pool(pool);
@@ -2171,7 +2217,7 @@ int RGWRados::time_log_add(const string& oid, const utime_t& ut, const string& s
return r;
// retry
- r = rados->ioctx_create(log_pool, io_ctx);
+ r = rad->ioctx_create(log_pool, io_ctx);
}
if (r < 0)
return r;
@@ -2188,7 +2234,8 @@ int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries)
librados::IoCtx io_ctx;
const char *log_pool = zone.log_pool.name.c_str();
- int r = rados->ioctx_create(log_pool, io_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(log_pool, io_ctx);
if (r == -ENOENT) {
rgw_bucket pool(log_pool);
r = create_pool(pool);
@@ -2196,7 +2243,7 @@ int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries)
return r;
// retry
- r = rados->ioctx_create(log_pool, io_ctx);
+ r = rad->ioctx_create(log_pool, io_ctx);
}
if (r < 0)
return r;
@@ -2217,7 +2264,8 @@ int RGWRados::time_log_list(const string& oid, utime_t& start_time, utime_t& end
librados::IoCtx io_ctx;
const char *log_pool = zone.log_pool.name.c_str();
- int r = rados->ioctx_create(log_pool, io_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(log_pool, io_ctx);
if (r < 0)
return r;
librados::ObjectReadOperation op;
@@ -2239,7 +2287,8 @@ int RGWRados::time_log_info(const string& oid, cls_log_header *header)
librados::IoCtx io_ctx;
const char *log_pool = zone.log_pool.name.c_str();
- int r = rados->ioctx_create(log_pool, io_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(log_pool, io_ctx);
if (r < 0)
return r;
librados::ObjectReadOperation op;
@@ -2261,7 +2310,8 @@ int RGWRados::time_log_trim(const string& oid, const utime_t& start_time, const
librados::IoCtx io_ctx;
const char *log_pool = zone.log_pool.name.c_str();
- int r = rados->ioctx_create(log_pool, io_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(log_pool, io_ctx);
if (r < 0)
return r;
@@ -2275,7 +2325,8 @@ int RGWRados::lock_exclusive(rgw_bucket& pool, const string& oid, utime_t& durat
const char *pool_name = pool.name.c_str();
- int r = rados->ioctx_create(pool_name, io_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(pool_name, io_ctx);
if (r < 0)
return r;
@@ -2293,7 +2344,8 @@ int RGWRados::unlock(rgw_bucket& pool, const string& oid, string& zone_id, strin
const char *pool_name = pool.name.c_str();
- int r = rados->ioctx_create(pool_name, io_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(pool_name, io_ctx);
if (r < 0)
return r;
@@ -2425,8 +2477,14 @@ int RGWRados::Bucket::List::list_objects(int max, vector<RGWObjEnt> *result,
RGWObjEnt& entry = eiter->second;
rgw_obj_key key = obj;
string instance;
+ string ns;
- bool check_ns = rgw_obj::translate_raw_obj_to_obj_in_ns(obj.name, instance, params.ns);
+ bool valid = rgw_obj::parse_raw_oid(obj.name, &obj.name, &instance, &ns);
+ if (!valid) {
+ ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
+ continue;
+ }
+ bool check_ns = (ns == params.ns);
if (!params.list_versions && !entry.is_visible()) {
continue;
}
@@ -2487,7 +2545,7 @@ int RGWRados::Bucket::List::list_objects(int max, vector<RGWObjEnt> *result,
RGWObjEnt ent = eiter->second;
ent.key = obj;
- ent.ns = params.ns;
+ ent.ns = ns;
result->push_back(ent);
count++;
}
@@ -2514,14 +2572,15 @@ int RGWRados::create_pool(rgw_bucket& bucket)
string pool = bucket.index_pool;
- ret = rados->pool_create(pool.c_str(), 0);
+ librados::Rados *rad = get_rados_handle();
+ ret = rad->pool_create(pool.c_str(), 0);
if (ret == -EEXIST)
ret = 0;
if (ret < 0)
return ret;
if (bucket.data_pool != pool) {
- ret = rados->pool_create(bucket.data_pool.c_str(), 0);
+ ret = rad->pool_create(bucket.data_pool.c_str(), 0);
if (ret == -EEXIST)
ret = 0;
if (ret < 0)
@@ -2577,7 +2636,8 @@ int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
const string& pool = zone.domain_root.name;
const char *pool_str = pool.c_str();
librados::IoCtx id_io_ctx;
- int r = rados->ioctx_create(pool_str, id_io_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(pool_str, id_io_ctx);
if (r < 0)
return r;
@@ -2863,7 +2923,8 @@ int RGWRados::update_placement_map()
int RGWRados::add_bucket_placement(std::string& new_pool)
{
- int ret = rados->pool_lookup(new_pool.c_str());
+ librados::Rados *rad = get_rados_handle();
+ int ret = rad->pool_lookup(new_pool.c_str());
if (ret < 0) // DNE, or something
return ret;
@@ -2913,11 +2974,12 @@ int RGWRados::create_pools(vector<string>& names, vector<int>& retcodes)
vector<librados::PoolAsyncCompletion *> completions;
vector<int> rets;
+ librados::Rados *rad = get_rados_handle();
for (iter = names.begin(); iter != names.end(); ++iter) {
librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion();
completions.push_back(c);
string& name = *iter;
- int ret = rados->pool_create_async(name.c_str(), c);
+ int ret = rad->pool_create_async(name.c_str(), c);
rets.push_back(ret);
}
@@ -3645,17 +3707,18 @@ static void set_copy_attrs(map<string, bufferlist>& src_attrs,
{
switch (attrs_mod) {
case RGWRados::ATTRSMOD_NONE:
- src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
+ attrs = src_attrs;
break;
case RGWRados::ATTRSMOD_REPLACE:
if (!attrs[RGW_ATTR_ETAG].length()) {
attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
}
- src_attrs = attrs;
break;
case RGWRados::ATTRSMOD_MERGE:
- for (map<string, bufferlist>::iterator it = attrs.begin(); it != attrs.end(); ++it) {
- src_attrs[it->first] = it->second;
+ for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
+ if (attrs.find(it->first) == attrs.end()) {
+ attrs[it->first] = it->second;
+ }
}
break;
}
@@ -3805,8 +3868,8 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
}
if (petag) {
- map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
- if (iter != src_attrs.end()) {
+ map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_ETAG);
+ if (iter != attrs.end()) {
bufferlist& etagbl = iter->second;
*petag = string(etagbl.c_str(), etagbl.length());
}
@@ -3814,9 +3877,11 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
if (source_zone.empty()) {
set_copy_attrs(src_attrs, attrs, attrs_mod);
+ } else {
+ attrs = src_attrs;
}
- ret = cb.complete(etag, mtime, set_mtime, src_attrs);
+ ret = cb.complete(etag, mtime, set_mtime, attrs);
if (ret < 0) {
goto set_err_state;
}
@@ -3953,8 +4018,10 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
return ret;
}
+ src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
+
set_copy_attrs(src_attrs, attrs, attrs_mod);
- src_attrs.erase(RGW_ATTR_ID_TAG);
+ attrs.erase(RGW_ATTR_ID_TAG);
RGWObjManifest manifest;
RGWObjState *astate = NULL;
@@ -3967,7 +4034,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
if (remote_dest) {
/* dest is in a different region, copy it there */
- return copy_obj_to_remote_dest(astate, src_attrs, read_op, user_id, dest_obj, mtime);
+ return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
}
uint64_t max_chunk_size;
@@ -4005,7 +4072,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
return copy_obj_data(obj_ctx, dest_bucket_info, read_op, end, dest_obj, src_obj,
- max_chunk_size, mtime, 0, src_attrs, category, olh_epoch,
+ max_chunk_size, mtime, 0, attrs, category, olh_epoch,
version_id, ptag, petag, err);
}
@@ -4957,6 +5024,86 @@ int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
return 0;
}
+
+int RGWRados::Object::Stat::stat_async()
+{
+ RGWObjectCtx& ctx = source->get_ctx();
+ rgw_obj& obj = source->get_obj();
+ RGWRados *store = source->get_store();
+
+ RGWObjState *s = ctx.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
+ result.obj = obj;
+ if (s->has_attrs) {
+ state.ret = 0;
+ result.size = s->size;
+ result.mtime = s->mtime;
+ result.attrs = s->attrset;
+ result.has_manifest = s->has_manifest;
+ result.manifest = s->manifest;
+ return 0;
+ }
+
+ string oid;
+ string loc;
+ rgw_bucket bucket;
+ get_obj_bucket_and_oid_loc(obj, bucket, oid, loc);
+
+ int r = store->get_obj_ioctx(obj, &state.io_ctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectReadOperation op;
+ op.stat(&result.size, &result.mtime, NULL);
+ op.getxattrs(&result.attrs, NULL);
+ state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
+ state.io_ctx.locator_set_key(loc);
+ r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
+ if (r < 0) {
+ ldout(store->ctx(), 5) << __func__ << ": ERROR: aio_operate() returned ret=" << r << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+
+int RGWRados::Object::Stat::wait()
+{
+ if (!state.completion) {
+ return state.ret;
+ }
+
+ state.completion->wait_for_complete();
+ state.ret = state.completion->get_return_value();
+ state.completion->release();
+
+ if (state.ret != 0) {
+ return state.ret;
+ }
+
+ return finish();
+}
+
+int RGWRados::Object::Stat::finish()
+{
+ map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
+ if (iter != result.attrs.end()) {
+ bufferlist& bl = iter->second;
+ bufferlist::iterator biter = bl.begin();
+ try {
+ ::decode(result.manifest, biter);
+ } catch (buffer::error& err) {
+ RGWRados *store = source->get_store();
+ ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
+ return -EIO;
+ }
+ result.has_manifest = true;
+ }
+
+ return 0;
+}
+
/**
* Get the attributes for an object.
* bucket: name of the bucket holding the object.
@@ -7450,7 +7597,8 @@ int RGWRados::append_async(rgw_obj& obj, size_t size, bufferlist& bl)
if (r < 0) {
return r;
}
- librados::AioCompletion *completion = rados->aio_create_completion(NULL, NULL, NULL);
+ librados::Rados *rad = get_rados_handle();
+ librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
r = ref.ioctx.aio_append(ref.oid, completion, bl, size);
completion->release();
@@ -8002,7 +8150,8 @@ int RGWRados::cls_obj_usage_log_add(const string& oid, rgw_usage_log_info& info)
librados::IoCtx io_ctx;
const char *usage_log_pool = zone.usage_log_pool.name.c_str();
- int r = rados->ioctx_create(usage_log_pool, io_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(usage_log_pool, io_ctx);
if (r == -ENOENT) {
rgw_bucket pool(usage_log_pool);
r = create_pool(pool);
@@ -8010,7 +8159,7 @@ int RGWRados::cls_obj_usage_log_add(const string& oid, rgw_usage_log_info& info)
return r;
// retry
- r = rados->ioctx_create(usage_log_pool, io_ctx);
+ r = rad->ioctx_create(usage_log_pool, io_ctx);
}
if (r < 0)
return r;
@@ -8030,7 +8179,8 @@ int RGWRados::cls_obj_usage_log_read(string& oid, string& user, uint64_t start_e
*is_truncated = false;
const char *usage_log_pool = zone.usage_log_pool.name.c_str();
- int r = rados->ioctx_create(usage_log_pool, io_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(usage_log_pool, io_ctx);
if (r < 0)
return r;
@@ -8045,7 +8195,8 @@ int RGWRados::cls_obj_usage_log_trim(string& oid, string& user, uint64_t start_e
librados::IoCtx io_ctx;
const char *usage_log_pool = zone.usage_log_pool.name.c_str();
- int r = rados->ioctx_create(usage_log_pool, io_ctx);
+ librados::Rados *rad = get_rados_handle();
+ int r = rad->ioctx_create(usage_log_pool, io_ctx);
if (r < 0)
return r;
@@ -8538,7 +8689,7 @@ string RGWStateLog::get_oid(const string& object) {
int RGWStateLog::open_ioctx(librados::IoCtx& ioctx) {
string pool_name;
store->get_log_pool_name(pool_name);
- int r = store->rados->ioctx_create(pool_name.c_str(), ioctx);
+ int r = store->get_rados_handle()->ioctx_create(pool_name.c_str(), ioctx);
if (r < 0) {
lderr(store->ctx()) << "ERROR: could not open rados pool" << dendl;
return r;
@@ -8782,7 +8933,7 @@ int RGWOpStateSingleOp::renew_state() {
uint64_t RGWRados::instance_id()
{
- return rados->get_instance_id();
+ return get_rados_handle()->get_instance_id();
}
uint64_t RGWRados::next_bucket_id()
@@ -8834,3 +8985,31 @@ void RGWStoreManager::close_storage(RGWRados *store)
delete store;
}
+librados::Rados* RGWRados::get_rados_handle()
+{
+ if (num_rados_handles == 1) {
+ return rados[0];
+ } else {
+ handle_lock.get_read();
+ pthread_t id = pthread_self();
+ std::map<pthread_t, int>:: iterator it = rados_map.find(id);
+
+ if (it != rados_map.end()) {
+ handle_lock.put_read();
+ return rados[it->second];
+ } else {
+ handle_lock.put_read();
+ handle_lock.get_write();
+ uint32_t handle = next_rados_handle.read();
+ if (handle == num_rados_handles) {
+ next_rados_handle.set(0);
+ handle = 0;
+ }
+ rados_map[id] = handle;
+ next_rados_handle.inc();
+ handle_lock.put_write();
+ return rados[handle];
+ }
+ }
+}
+
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index 18213cf..37c7e8a 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -327,6 +327,12 @@ public:
::decode(rules, bl);
} else {
explicit_objs = true;
+ if (!objs.empty()) {
+ map<uint64_t, RGWObjManifestPart>::iterator iter = objs.begin();
+ head_obj = iter->second.loc;
+ head_size = iter->second.size;
+ max_head_size = head_size;
+ }
}
if (struct_v >= 4) {
@@ -1237,13 +1243,20 @@ class RGWRados
void cls_obj_check_prefix_exist(librados::ObjectOperation& op, const string& prefix, bool fail_if_exist);
protected:
CephContext *cct;
- librados::Rados *rados;
+
+ librados::Rados **rados;
+ atomic_t next_rados_handle;
+ uint32_t num_rados_handles;
+ RWLock handle_lock;
+ std::map<pthread_t, int> rados_map;
+
librados::IoCtx gc_pool_ctx; // .rgw.gc
bool pools_initialized;
string region_name;
string zone_name;
+ string trans_id_suffix;
RGWQuotaHandler *quota_handler;
@@ -1256,8 +1269,9 @@ public:
watch_initialized(false),
bucket_id_lock("rados_bucket_id"),
bucket_index_max_shards(0),
- max_bucket_id(0),
- cct(NULL), rados(NULL),
+ max_bucket_id(0), cct(NULL),
+ rados(NULL), next_rados_handle(0),
+ num_rados_handles(0), handle_lock("rados_handle_lock"),
pools_initialized(false),
quota_handler(NULL),
finisher(NULL),
@@ -1288,14 +1302,21 @@ public:
map<string, RGWRESTConn *> zone_conn_map;
map<string, RGWRESTConn *> region_conn_map;
+ RGWZoneParams& get_zone_params() { return zone; }
+
RGWMetadataManager *meta_mgr;
RGWDataChangesLog *data_log;
virtual ~RGWRados() {
+ for (uint32_t i=0; i < num_rados_handles; i++) {
+ if (rados[i]) {
+ rados[i]->shutdown();
+ delete rados[i];
+ }
+ }
if (rados) {
- rados->shutdown();
- delete rados;
+ delete[] rados;
}
}
@@ -1587,6 +1608,38 @@ public:
int delete_obj();
};
+
+ struct Stat {
+ RGWRados::Object *source;
+
+ struct Result {
+ rgw_obj obj;
+ RGWObjManifest manifest;
+ bool has_manifest;
+ uint64_t size;
+ time_t mtime;
+ map<string, bufferlist> attrs;
+
+ Result() : has_manifest(false), size(0), mtime(0) {}
+ } result;
+
+ struct State {
+ librados::IoCtx io_ctx;
+ librados::AioCompletion *completion;
+ int ret;
+
+ State() : completion(NULL), ret(0) {}
+ } state;
+
+
+ Stat(RGWRados::Object *_source) : source(_source) {}
+
+ int stat_async();
+ int wait();
+ int stat();
+ private:
+ int finish();
+ };
};
class Bucket {
@@ -2063,6 +2116,34 @@ public:
return s;
}
+ void init_unique_trans_id_deps() {
+ char buf[16 + 2 + 1]; /* uint64_t needs 16, 2 hyphens add further 2 */
+
+ snprintf(buf, sizeof(buf), "-%llx-", (unsigned long long)instance_id());
+ url_encode(string(buf) + zone.name, trans_id_suffix);
+ }
+
+ /* In order to preserve compability with Swift API, transaction ID
+ * should contain at least 32 characters satisfying following spec:
+ * - first 21 chars must be in range [0-9a-f]. Swift uses this
+ * space for storing fragment of UUID obtained through a call to
+ * uuid4() function of Python's uuid module;
+ * - char no. 22 must be a hyphen;
+ * - at least 10 next characters constitute hex-formatted timestamp
+ * padded with zeroes if necessary. All bytes must be in [0-9a-f]
+ * range;
+ * - last, optional part of transaction ID is any url-encoded string
+ * without restriction on length. */
+ string unique_trans_id(const uint64_t unique_num) {
+ char buf[41]; /* 2 + 21 + 1 + 16 (timestamp can consume up to 16) + 1 */
+ time_t timestamp = time(NULL);
+
+ snprintf(buf, sizeof(buf), "tx%021llx-%010llx",
+ (unsigned long long)unique_num,
+ (unsigned long long)timestamp);
+
+ return string(buf) + trans_id_suffix;
+ }
void get_log_pool_name(string& name) {
name = zone.log_pool.name;
@@ -2076,6 +2157,8 @@ public:
return zone_public_config.log_meta;
}
+ librados::Rados* get_rados_handle();
+
private:
/**
* This is a helper method, it generates a list of bucket index objects with the given
diff --git a/src/rgw/rgw_replica_log.cc b/src/rgw/rgw_replica_log.cc
index 6d8ed09..b56a90b 100644
--- a/src/rgw/rgw_replica_log.cc
+++ b/src/rgw/rgw_replica_log.cc
@@ -37,7 +37,7 @@ RGWReplicaLogger::RGWReplicaLogger(RGWRados *_store) :
int RGWReplicaLogger::open_ioctx(librados::IoCtx& ctx, const string& pool)
{
- int r = store->rados->ioctx_create(pool.c_str(), ctx);
+ int r = store->get_rados_handle()->ioctx_create(pool.c_str(), ctx);
if (r == -ENOENT) {
rgw_bucket p(pool.c_str());
r = store->create_pool(p);
@@ -45,7 +45,7 @@ int RGWReplicaLogger::open_ioctx(librados::IoCtx& ctx, const string& pool)
return r;
// retry
- r = store->rados->ioctx_create(pool.c_str(), ctx);
+ r = store->get_rados_handle()->ioctx_create(pool.c_str(), ctx);
}
if (r < 0) {
lderr(cct) << "ERROR: could not open rados pool " << pool << dendl;
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
index d385d62..45eba58 100644
--- a/src/rgw/rgw_rest.cc
+++ b/src/rgw/rgw_rest.cc
@@ -230,8 +230,8 @@ static bool rgw_find_host_in_domains(const string& host, string *domain, string
if (!str_ends_with(host, *iter, &pos))
continue;
- *domain = host.substr(pos);
if (pos == 0) {
+ *domain = host;
subdomain->clear();
} else {
if (host[pos - 1] != '.') {
@@ -359,8 +359,11 @@ void dump_bucket_from_state(struct req_state *s)
{
int expose_bucket = g_conf->rgw_expose_bucket;
if (expose_bucket) {
- if (!s->bucket_name_str.empty())
- s->cio->print("Bucket: %s\r\n", s->bucket_name_str.c_str());
+ if (!s->bucket_name_str.empty()) {
+ string b;
+ url_encode(s->bucket_name_str, b);
+ s->cio->print("Bucket: %s\r\n", b.c_str());
+ }
}
}
@@ -492,15 +495,34 @@ void dump_start(struct req_state *s)
}
}
-void end_header(struct req_state *s, RGWOp *op, const char *content_type, const int64_t proposed_content_length)
+void dump_trans_id(req_state *s)
+{
+ if (s->prot_flags & RGW_REST_SWIFT) {
+ s->cio->print("X-Trans-Id: %s\r\n", s->trans_id.c_str());
+ }
+ else {
+ s->cio->print("x-amz-request-id: %s\r\n", s->trans_id.c_str());
+ }
+}
+
+void end_header(struct req_state *s, RGWOp *op, const char *content_type, const int64_t proposed_content_length,
+ bool force_content_type)
{
string ctype;
+ dump_trans_id(s);
+
if (op) {
dump_access_control(s, op);
}
- if (!content_type || s->err.is_err()) {
+ if (s->prot_flags & RGW_REST_SWIFT && !content_type) {
+ force_content_type = true;
+ }
+
+ /* do not send content type if content length is zero
+ and the content type was not set by the user */
+ if (force_content_type || (!content_type && s->formatter->get_len() != 0) || s->err.is_err()){
switch (s->format) {
case RGW_FORMAT_XML:
ctype = "application/xml";
@@ -530,9 +552,13 @@ void end_header(struct req_state *s, RGWOp *op, const char *content_type, const
dump_content_length(s, proposed_content_length);
}
}
- int r = s->cio->print("Content-type: %s\r\n", content_type);
- if (r < 0) {
- ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
+
+ int r;
+ if (content_type) {
+ r = s->cio->print("Content-type: %s\r\n", content_type);
+ if (r < 0) {
+ ldout(s->cct, 0) << "ERROR: s->cio->print() returned err=" << r << dendl;
+ }
}
r = s->cio->complete_header();
if (r < 0) {
@@ -1322,26 +1348,28 @@ int RGWREST::preprocess(struct req_state *s, RGWClientIO *cio)
req_info& info = s->info;
s->cio = cio;
- if (info.host) {
- string h(s->info.host);
-
- ldout(s->cct, 10) << "host=" << s->info.host << dendl;
+ if (info.host.size()) {
+ ldout(s->cct, 10) << "host=" << info.host << dendl;
string domain;
string subdomain;
- bool in_hosted_domain = rgw_find_host_in_domains(h, &domain, &subdomain);
- ldout(s->cct, 20) << "subdomain=" << subdomain << " domain=" << domain << " in_hosted_domain=" << in_hosted_domain << dendl;
+ bool in_hosted_domain = rgw_find_host_in_domains(info.host, &domain,
+ &subdomain);
+ ldout(s->cct, 20) << "subdomain=" << subdomain << " domain=" << domain
+ << " in_hosted_domain=" << in_hosted_domain << dendl;
if (g_conf->rgw_resolve_cname && !in_hosted_domain) {
string cname;
bool found;
- int r = rgw_resolver->resolve_cname(h, cname, &found);
+ int r = rgw_resolver->resolve_cname(info.host, cname, &found);
if (r < 0) {
ldout(s->cct, 0) << "WARNING: rgw_resolver->resolve_cname() returned r=" << r << dendl;
}
if (found) {
- ldout(s->cct, 5) << "resolved host cname " << h << " -> " << cname << dendl;
+ ldout(s->cct, 5) << "resolved host cname " << info.host << " -> "
+ << cname << dendl;
in_hosted_domain = rgw_find_host_in_domains(cname, &domain, &subdomain);
- ldout(s->cct, 20) << "subdomain=" << subdomain << " domain=" << domain << " in_hosted_domain=" << in_hosted_domain << dendl;
+ ldout(s->cct, 20) << "subdomain=" << subdomain << " domain=" << domain
+ << " in_hosted_domain=" << in_hosted_domain << dendl;
}
}
diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h
index c92a59a..02ae790 100644
--- a/src/rgw/rgw_rest.h
+++ b/src/rgw/rgw_rest.h
@@ -365,7 +365,8 @@ extern void dump_errno(struct req_state *s, int ret);
extern void end_header(struct req_state *s,
RGWOp *op = NULL,
const char *content_type = NULL,
- const int64_t proposed_content_length = NO_CONTENT_LENGTH);
+ const int64_t proposed_content_length = NO_CONTENT_LENGTH,
+ bool force_content_type = false);
extern void dump_start(struct req_state *s);
extern void list_all_buckets_start(struct req_state *s);
extern void dump_owner(struct req_state *s, string& id, string& name, const char *section = NULL);
diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc
index 9513cd6..f549364 100644
--- a/src/rgw/rgw_rest_swift.cc
+++ b/src/rgw/rgw_rest_swift.cc
@@ -53,8 +53,11 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_begin(bool has_buckets)
ret = STATUS_NO_CONTENT;
set_req_state_err(s, ret);
}
- dump_errno(s);
- end_header(s, NULL);
+
+ if (!g_conf->rgw_swift_enforce_content_length) {
+ dump_errno(s);
+ end_header(s, NULL, NULL, NO_CONTENT_LENGTH, true);
+ }
if (!ret) {
dump_start(s);
@@ -82,7 +85,9 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_data(RGWUserBuckets& buckets)
s->formatter->dump_int("bytes", obj.size);
}
s->formatter->close_section();
- rgw_flush_formatter(s, s->formatter);
+ if (!g_conf->rgw_swift_enforce_content_length) {
+ rgw_flush_formatter(s, s->formatter);
+ }
}
}
@@ -90,6 +95,14 @@ void RGWListBuckets_ObjStore_SWIFT::send_response_end()
{
if (sent_data) {
s->formatter->close_section();
+ }
+
+ if (g_conf->rgw_swift_enforce_content_length) {
+ dump_errno(s);
+ end_header(s, NULL, NULL, s->formatter->get_len(), true);
+ }
+
+ if (sent_data || g_conf->rgw_swift_enforce_content_length) {
rgw_flush_formatter_and_reset(s, s->formatter);
}
}
@@ -120,13 +133,14 @@ int RGWListBucket_ObjStore_SWIFT::get_params()
path = prefix;
if (path.size() && path[path.size() - 1] != '/')
path.append("/");
- }
- int len = prefix.size();
- int delim_size = delimiter.size();
- if (len >= delim_size) {
- if (prefix.substr(len - delim_size).compare(delimiter) != 0)
- prefix.append(delimiter);
+ int len = prefix.size();
+ int delim_size = delimiter.size();
+
+ if (len >= delim_size) {
+ if (prefix.substr(len - delim_size).compare(delimiter) != 0)
+ prefix.append(delimiter);
+ }
}
return 0;
@@ -295,7 +309,8 @@ void RGWStatAccount_ObjStore_SWIFT::send_response()
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s, NULL, NULL, 0);
+ end_header(s, NULL, NULL, 0, true);
+
dump_start(s);
}
@@ -309,7 +324,7 @@ void RGWStatBucket_ObjStore_SWIFT::send_response()
set_req_state_err(s, ret);
dump_errno(s);
- end_header(s, this);
+ end_header(s, this,NULL,0, true);
dump_start(s);
}
diff --git a/src/rgw/rgw_rest_user.cc b/src/rgw/rgw_rest_user.cc
index fc46ff6..5e618c4 100644
--- a/src/rgw/rgw_rest_user.cc
+++ b/src/rgw/rgw_rest_user.cc
@@ -71,6 +71,7 @@ void RGWOp_User_Create::execute()
bool exclusive;
uint32_t max_buckets;
+ uint32_t default_max_buckets = s->cct->_conf->rgw_user_max_buckets;
RGWUserAdminOpState op_state;
@@ -83,7 +84,7 @@ void RGWOp_User_Create::execute()
RESTArgs::get_string(s, "user-caps", caps, &caps);
RESTArgs::get_bool(s, "generate-key", true, &gen_key);
RESTArgs::get_bool(s, "suspended", false, &suspended);
- RESTArgs::get_uint32(s, "max-buckets", RGW_DEFAULT_MAX_BUCKETS, &max_buckets);
+ RESTArgs::get_uint32(s, "max-buckets", default_max_buckets, &max_buckets);
RESTArgs::get_bool(s, "system", false, &system);
RESTArgs::get_bool(s, "exclusive", false, &exclusive);
@@ -122,7 +123,7 @@ void RGWOp_User_Create::execute()
op_state.set_key_type(key_type);
}
- if (max_buckets != RGW_DEFAULT_MAX_BUCKETS)
+ if (max_buckets != default_max_buckets)
op_state.set_max_buckets(max_buckets);
if (s->info.args.exists("suspended"))
diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc
index 24b72fb..1e122df 100644
--- a/src/rgw/rgw_user.cc
+++ b/src/rgw/rgw_user.cc
@@ -848,7 +848,7 @@ int RGWAccessKeyPool::generate_key(RGWUserAdminOpState& op_state, std::string *e
} else if (gen_secret) {
char secret_key_buf[SECRET_KEY_LEN + 1];
- ret = gen_rand_base64(g_ceph_context, secret_key_buf, sizeof(secret_key_buf));
+ ret = gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, sizeof(secret_key_buf));
if (ret < 0) {
set_err_msg(err_msg, "unable to generate secret key");
return ret;
@@ -962,7 +962,7 @@ int RGWAccessKeyPool::modify_key(RGWUserAdminOpState& op_state, std::string *err
int ret;
int key_buf_size = sizeof(secret_key_buf);
- ret = gen_rand_base64(g_ceph_context, secret_key_buf, key_buf_size);
+ ret = gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, key_buf_size);
if (ret < 0) {
set_err_msg(err_msg, "unable to generate secret key");
return ret;
@@ -1767,7 +1767,13 @@ int RGWUser::execute_add(RGWUserAdminOpState& op_state, std::string *err_msg)
if (!user_email.empty())
user_info.user_email = user_email;
- user_info.max_buckets = op_state.get_max_buckets();
+ CephContext *cct = store->ctx();
+ if (op_state.max_buckets_specified) {
+ user_info.max_buckets = op_state.get_max_buckets();
+ } else {
+ user_info.max_buckets = cct->_conf->rgw_user_max_buckets;
+ }
+
user_info.suspended = op_state.get_suspension_status();
user_info.system = op_state.system;
@@ -1973,13 +1979,8 @@ int RGWUser::execute_modify(RGWUserAdminOpState& op_state, std::string *err_msg)
if (!display_name.empty())
user_info.display_name = display_name;
- // will be set to RGW_DEFAULT_MAX_BUCKETS by default
- uint32_t max_buckets = op_state.get_max_buckets();
-
- ldout(store->ctx(), 0) << "max_buckets=" << max_buckets << " specified=" << op_state.max_buckets_specified << dendl;
-
if (op_state.max_buckets_specified)
- user_info.max_buckets = max_buckets;
+ user_info.max_buckets = op_state.get_max_buckets();
if (op_state.system_specified)
user_info.system = op_state.system;
diff --git a/src/test/Makefile-client.am b/src/test/Makefile-client.am
index bc4a86a..b55ad4e 100644
--- a/src/test/Makefile-client.am
+++ b/src/test/Makefile-client.am
@@ -335,8 +335,7 @@ ceph_test_librbd_api_SOURCES = \
test/librbd/test_main.cc
ceph_test_librbd_api_CXXFLAGS = $(UNITTEST_CXXFLAGS)
ceph_test_librbd_api_LDADD = \
- $(LIBRBD) $(LIBRADOS) $(UNITTEST_LDADD) \
- $(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
+ $(LIBRBD) $(LIBRADOS) $(LIBCOMMON) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD)
bin_DEBUGPROGRAMS += ceph_test_librbd_api
if WITH_LTTNG
diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc
index 787ce7e..09ee473 100644
--- a/src/test/bufferlist.cc
+++ b/src/test/bufferlist.cc
@@ -1421,6 +1421,19 @@ TEST(BufferList, rebuild) {
EXPECT_TRUE(bl.is_page_aligned());
EXPECT_EQ((unsigned)1, bl.buffers().size());
}
+ {
+ bufferlist bl;
+ char t1[] = "X";
+ bufferlist a2;
+ a2.append(t1, 1);
+ bl.rebuild();
+ bl.append(a2);
+ EXPECT_EQ((unsigned)1, bl.length());
+ bufferlist::iterator p = bl.begin();
+ char dst[1];
+ p.copy(1, dst);
+ EXPECT_EQ(0, memcmp(dst, "X", 1));
+ }
}
TEST(BufferList, rebuild_page_aligned) {
@@ -2221,6 +2234,17 @@ TEST(BufferList, zero) {
bl.zero((unsigned)3, (unsigned)3);
EXPECT_EQ(0, ::memcmp("ABC\0\0\0GHIKLM", bl.c_str(), 9));
}
+ {
+ bufferlist bl;
+ bufferptr ptr1(4);
+ bufferptr ptr2(4);
+ memset(ptr1.c_str(), 'a', 4);
+ memset(ptr2.c_str(), 'b', 4);
+ bl.append(ptr1);
+ bl.append(ptr2);
+ bl.zero((unsigned)2, (unsigned)4);
+ EXPECT_EQ(0, ::memcmp("aa\0\0\0\0bb", bl.c_str(), 8));
+ }
}
TEST(BufferList, EmptyAppend) {
diff --git a/src/test/centos-6/ceph.spec.in b/src/test/centos-6/ceph.spec.in
index b36a0b9..140e0e3 100644
--- a/src/test/centos-6/ceph.spec.in
+++ b/src/test/centos-6/ceph.spec.in
@@ -1,10 +1,13 @@
%bcond_with ocf
+%bcond_without cephfs_java
%if ! (0%{?fedora} > 12 || 0%{?rhel} > 5)
%{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
%{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
%endif
+%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
+
#################################################################################
# common
#################################################################################
@@ -28,7 +31,6 @@ Requires: python-rados = %{epoch}:%{version}-%{release}
Requires: python-rbd = %{epoch}:%{version}-%{release}
Requires: python-cephfs = %{epoch}:%{version}-%{release}
Requires: python
-Requires: python-argparse
Requires: python-requests
Requires: python-flask
Requires: xfsprogs
@@ -39,7 +41,9 @@ Requires: cryptsetup
Requires(post): binutils
BuildRequires: gcc-c++
BuildRequires: boost-devel
-%if ! 0%{defined suse_version}
+%if 0%{defined suse_version}
+BuildRequires: libbz2-devel
+%else
BuildRequires: bzip2-devel
%endif
BuildRequires: cryptsetup
@@ -59,18 +63,15 @@ BuildRequires: perl
BuildRequires: parted
BuildRequires: pkgconfig
BuildRequires: python
-BuildRequires: python-argparse
BuildRequires: python-nose
BuildRequires: python-requests
BuildRequires: python-virtualenv
+BuildRequires: snappy-devel
BuildRequires: util-linux
BuildRequires: xfsprogs
BuildRequires: xfsprogs-devel
BuildRequires: xmlstarlet
BuildRequires: yasm
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora} || 0%{?suse_version}
-BuildRequires: snappy-devel
-%endif
%if 0%{?suse_version}
BuildRequires: net-tools
%endif
@@ -95,7 +96,6 @@ BuildRequires: %insserv_prereq
BuildRequires: mozilla-nss-devel
BuildRequires: keyutils-devel
BuildRequires: libatomic-ops-devel
-BuildRequires: fdupes
%else
Requires: gdisk
BuildRequires: nss-devel
@@ -126,12 +126,14 @@ Requires: python-rados = %{epoch}:%{version}-%{release}
Requires: python-rbd = %{epoch}:%{version}-%{release}
Requires: python-cephfs = %{epoch}:%{version}-%{release}
Requires: python-requests
-%if 0%{defined suse_version}
-Requires: python-argparse
-%endif
%if 0%{?rhel} || 0%{?fedora}
Requires: redhat-lsb-core
%endif
+# python-argparse is only needed in distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+Requires: python-argparse
+BuildRequires: python-argparse
+%endif
%description -n ceph-common
Common utilities to mount and interact with a ceph storage cluster.
@@ -161,10 +163,10 @@ Requires: librados2 = %{epoch}:%{version}-%{release}
%if 0%{defined suse_version}
BuildRequires: libexpat-devel
BuildRequires: FastCGI-devel
-Requires: apache2-mod_fcgid
%else
BuildRequires: expat-devel
BuildRequires: fcgi-devel
+Requires: mailcap
%endif
%description radosgw
This package is an S3 HTTP REST gateway for the RADOS object store. It
@@ -213,9 +215,6 @@ Group: System Environment/Libraries
License: LGPL-2.0
Requires: librados2 = %{epoch}:%{version}-%{release}
Obsoletes: python-ceph < %{epoch}:%{version}-%{release}
-%if 0%{defined suse_version}
-%py_requires
-%endif
%description -n python-rados
This package contains Python libraries for interacting with Cephs RADOS
object store.
@@ -333,6 +332,8 @@ BuildRequires: libbabeltrace-devel
%description -n ceph-test
This package contains Ceph benchmarks and test tools.
+%if 0%{with cephfs_java}
+
%package -n libcephfs_jni1
Summary: Java Native Interface library for CephFS Java bindings.
Group: System Environment/Libraries
@@ -372,6 +373,8 @@ BuildRequires: junit
%description -n cephfs-java
This package contains the Java libraries for the Ceph File System.
+%endif
+
%package libs-compat
Summary: Meta package to include ceph libraries.
Group: System Environment/Libraries
@@ -399,7 +402,9 @@ Requires: librados2-devel = %{epoch}:%{version}-%{release}
Requires: libradosstriper1-devel = %{epoch}:%{version}-%{release}
Requires: librbd1-devel = %{epoch}:%{version}-%{release}
Requires: libcephfs1-devel = %{epoch}:%{version}-%{release}
+%if 0%{with cephfs_java}
Requires: libcephfs_jni1-devel = %{epoch}:%{version}-%{release}
+%endif
Provides: ceph-devel
%description devel-compat
This is a compatibility package to accommodate ceph-devel split into
@@ -436,10 +441,12 @@ python-cephfs instead.
%endif
%build
+%if 0%{with cephfs_java}
# Find jni.h
for i in /usr/{lib64,lib}/jvm/java/include{,/linux}; do
[ -d $i ] && java_inc="$java_inc -I$i"
done
+%endif
./autogen.sh
MY_CONF_OPT=""
@@ -457,7 +464,9 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
--without-cryptopp \
--with-rest-bench \
--with-debug \
+%if 0%{with cephfs_java}
--enable-cephfs-java \
+%endif
--with-librocksdb-static=check \
$MY_CONF_OPT \
%{?_with_ocf} \
@@ -479,7 +488,7 @@ make DESTDIR=$RPM_BUILD_ROOT install
find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
-install -D src/init-radosgw.sysv $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
mkdir -p $RPM_BUILD_ROOT%{_sbindir}
@@ -497,13 +506,8 @@ install -m 0644 -D etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds %{buildro
%endif
# udev rules
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%else
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%endif
+install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT%{_udevrulesdir}/50-rbd.rules
+install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
%if (0%{?rhel} && 0%{?rhel} < 7)
install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
@@ -529,12 +533,6 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
-%if %{defined suse_version}
-# Fedora seems to have some problems with this macro, use it only on SUSE
-%fdupes -s $RPM_BUILD_ROOT/%{python_sitelib}
-%fdupes %buildroot
-%endif
-
%clean
rm -rf $RPM_BUILD_ROOT
@@ -615,13 +613,8 @@ fi
%{_libdir}/rados-classes/libcls_version.so*
%dir %{_libdir}/ceph/erasure-code
%{_libdir}/ceph/erasure-code/libec_*.so*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/usr/lib/udev/rules.d/95-ceph-osd.rules
-%else
-/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/lib/udev/rules.d/95-ceph-osd.rules
-%endif
+%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+%{_udevrulesdir}/95-ceph-osd.rules
%config %{_sysconfdir}/bash_completion.d/ceph
%config(noreplace) %{_sysconfdir}/logrotate.d/ceph
%if 0%{?suse_version}
@@ -687,11 +680,7 @@ fi
%config(noreplace) %{_sysconfdir}/ceph/rbdmap
%{_initrddir}/rbdmap
%{python_sitelib}/ceph_argparse.py*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/50-rbd.rules
-%else
-/lib/udev/rules.d/50-rbd.rules
-%endif
+%{_udevrulesdir}/50-rbd.rules
%postun -n ceph-common
# Package removal cleanup
@@ -904,6 +893,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
%endif
#################################################################################
+%if 0%{with cephfs_java}
%files -n libcephfs_jni1
%defattr(-,root,root,-)
%{_libdir}/libcephfs_jni.so.*
@@ -918,6 +908,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
%defattr(-,root,root,-)
%{_javadir}/libcephfs.jar
%{_javadir}/libcephfs-test.jar
+%endif
#################################################################################
%files libs-compat
diff --git a/src/test/centos-7/ceph.spec.in b/src/test/centos-7/ceph.spec.in
index b36a0b9..140e0e3 100644
--- a/src/test/centos-7/ceph.spec.in
+++ b/src/test/centos-7/ceph.spec.in
@@ -1,10 +1,13 @@
%bcond_with ocf
+%bcond_without cephfs_java
%if ! (0%{?fedora} > 12 || 0%{?rhel} > 5)
%{!?python_sitelib: %global python_sitelib %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")}
%{!?python_sitearch: %global python_sitearch %(%{__python} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(1))")}
%endif
+%{!?_udevrulesdir: %global _udevrulesdir /lib/udev/rules.d}
+
#################################################################################
# common
#################################################################################
@@ -28,7 +31,6 @@ Requires: python-rados = %{epoch}:%{version}-%{release}
Requires: python-rbd = %{epoch}:%{version}-%{release}
Requires: python-cephfs = %{epoch}:%{version}-%{release}
Requires: python
-Requires: python-argparse
Requires: python-requests
Requires: python-flask
Requires: xfsprogs
@@ -39,7 +41,9 @@ Requires: cryptsetup
Requires(post): binutils
BuildRequires: gcc-c++
BuildRequires: boost-devel
-%if ! 0%{defined suse_version}
+%if 0%{defined suse_version}
+BuildRequires: libbz2-devel
+%else
BuildRequires: bzip2-devel
%endif
BuildRequires: cryptsetup
@@ -59,18 +63,15 @@ BuildRequires: perl
BuildRequires: parted
BuildRequires: pkgconfig
BuildRequires: python
-BuildRequires: python-argparse
BuildRequires: python-nose
BuildRequires: python-requests
BuildRequires: python-virtualenv
+BuildRequires: snappy-devel
BuildRequires: util-linux
BuildRequires: xfsprogs
BuildRequires: xfsprogs-devel
BuildRequires: xmlstarlet
BuildRequires: yasm
-%if 0%{?rhel} || 0%{?centos} || 0%{?fedora} || 0%{?suse_version}
-BuildRequires: snappy-devel
-%endif
%if 0%{?suse_version}
BuildRequires: net-tools
%endif
@@ -95,7 +96,6 @@ BuildRequires: %insserv_prereq
BuildRequires: mozilla-nss-devel
BuildRequires: keyutils-devel
BuildRequires: libatomic-ops-devel
-BuildRequires: fdupes
%else
Requires: gdisk
BuildRequires: nss-devel
@@ -126,12 +126,14 @@ Requires: python-rados = %{epoch}:%{version}-%{release}
Requires: python-rbd = %{epoch}:%{version}-%{release}
Requires: python-cephfs = %{epoch}:%{version}-%{release}
Requires: python-requests
-%if 0%{defined suse_version}
-Requires: python-argparse
-%endif
%if 0%{?rhel} || 0%{?fedora}
Requires: redhat-lsb-core
%endif
+# python-argparse is only needed in distros with Python 2.6 or lower
+%if (0%{?rhel} && 0%{?rhel} <= 6) || (0%{?suse_version} && 0%{?suse_version} <= 1110)
+Requires: python-argparse
+BuildRequires: python-argparse
+%endif
%description -n ceph-common
Common utilities to mount and interact with a ceph storage cluster.
@@ -161,10 +163,10 @@ Requires: librados2 = %{epoch}:%{version}-%{release}
%if 0%{defined suse_version}
BuildRequires: libexpat-devel
BuildRequires: FastCGI-devel
-Requires: apache2-mod_fcgid
%else
BuildRequires: expat-devel
BuildRequires: fcgi-devel
+Requires: mailcap
%endif
%description radosgw
This package is an S3 HTTP REST gateway for the RADOS object store. It
@@ -213,9 +215,6 @@ Group: System Environment/Libraries
License: LGPL-2.0
Requires: librados2 = %{epoch}:%{version}-%{release}
Obsoletes: python-ceph < %{epoch}:%{version}-%{release}
-%if 0%{defined suse_version}
-%py_requires
-%endif
%description -n python-rados
This package contains Python libraries for interacting with Cephs RADOS
object store.
@@ -333,6 +332,8 @@ BuildRequires: libbabeltrace-devel
%description -n ceph-test
This package contains Ceph benchmarks and test tools.
+%if 0%{with cephfs_java}
+
%package -n libcephfs_jni1
Summary: Java Native Interface library for CephFS Java bindings.
Group: System Environment/Libraries
@@ -372,6 +373,8 @@ BuildRequires: junit
%description -n cephfs-java
This package contains the Java libraries for the Ceph File System.
+%endif
+
%package libs-compat
Summary: Meta package to include ceph libraries.
Group: System Environment/Libraries
@@ -399,7 +402,9 @@ Requires: librados2-devel = %{epoch}:%{version}-%{release}
Requires: libradosstriper1-devel = %{epoch}:%{version}-%{release}
Requires: librbd1-devel = %{epoch}:%{version}-%{release}
Requires: libcephfs1-devel = %{epoch}:%{version}-%{release}
+%if 0%{with cephfs_java}
Requires: libcephfs_jni1-devel = %{epoch}:%{version}-%{release}
+%endif
Provides: ceph-devel
%description devel-compat
This is a compatibility package to accommodate ceph-devel split into
@@ -436,10 +441,12 @@ python-cephfs instead.
%endif
%build
+%if 0%{with cephfs_java}
# Find jni.h
for i in /usr/{lib64,lib}/jvm/java/include{,/linux}; do
[ -d $i ] && java_inc="$java_inc -I$i"
done
+%endif
./autogen.sh
MY_CONF_OPT=""
@@ -457,7 +464,9 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
--without-cryptopp \
--with-rest-bench \
--with-debug \
+%if 0%{with cephfs_java}
--enable-cephfs-java \
+%endif
--with-librocksdb-static=check \
$MY_CONF_OPT \
%{?_with_ocf} \
@@ -479,7 +488,7 @@ make DESTDIR=$RPM_BUILD_ROOT install
find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
-install -D src/init-radosgw.sysv $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
+install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
mkdir -p $RPM_BUILD_ROOT%{_sbindir}
@@ -497,13 +506,8 @@ install -m 0644 -D etc/sysconfig/SuSEfirewall2.d/services/ceph-osd-mds %{buildro
%endif
# udev rules
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%else
-install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT/lib/udev/rules.d/50-rbd.rules
-install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-%endif
+install -m 0644 -D udev/50-rbd.rules $RPM_BUILD_ROOT%{_udevrulesdir}/50-rbd.rules
+install -m 0644 -D udev/60-ceph-partuuid-workaround.rules $RPM_BUILD_ROOT%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
%if (0%{?rhel} && 0%{?rhel} < 7)
install -m 0644 -D udev/95-ceph-osd-alt.rules $RPM_BUILD_ROOT/lib/udev/rules.d/95-ceph-osd.rules
@@ -529,12 +533,6 @@ mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-osd
mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/lib/ceph/bootstrap-mds
mkdir -p $RPM_BUILD_ROOT%{_localstatedir}/log/radosgw
-%if %{defined suse_version}
-# Fedora seems to have some problems with this macro, use it only on SUSE
-%fdupes -s $RPM_BUILD_ROOT/%{python_sitelib}
-%fdupes %buildroot
-%endif
-
%clean
rm -rf $RPM_BUILD_ROOT
@@ -615,13 +613,8 @@ fi
%{_libdir}/rados-classes/libcls_version.so*
%dir %{_libdir}/ceph/erasure-code
%{_libdir}/ceph/erasure-code/libec_*.so*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/usr/lib/udev/rules.d/95-ceph-osd.rules
-%else
-/lib/udev/rules.d/60-ceph-partuuid-workaround.rules
-/lib/udev/rules.d/95-ceph-osd.rules
-%endif
+%{_udevrulesdir}/60-ceph-partuuid-workaround.rules
+%{_udevrulesdir}/95-ceph-osd.rules
%config %{_sysconfdir}/bash_completion.d/ceph
%config(noreplace) %{_sysconfdir}/logrotate.d/ceph
%if 0%{?suse_version}
@@ -687,11 +680,7 @@ fi
%config(noreplace) %{_sysconfdir}/ceph/rbdmap
%{_initrddir}/rbdmap
%{python_sitelib}/ceph_argparse.py*
-%if 0%{?rhel} >= 7 || 0%{?fedora}
-/usr/lib/udev/rules.d/50-rbd.rules
-%else
-/lib/udev/rules.d/50-rbd.rules
-%endif
+%{_udevrulesdir}/50-rbd.rules
%postun -n ceph-common
# Package removal cleanup
@@ -904,6 +893,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
%endif
#################################################################################
+%if 0%{with cephfs_java}
%files -n libcephfs_jni1
%defattr(-,root,root,-)
%{_libdir}/libcephfs_jni.so.*
@@ -918,6 +908,7 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
%defattr(-,root,root,-)
%{_javadir}/libcephfs.jar
%{_javadir}/libcephfs-test.jar
+%endif
#################################################################################
%files libs-compat
diff --git a/src/test/ceph-disk.sh b/src/test/ceph-disk.sh
index d265a57..8f36a58 100755
--- a/src/test/ceph-disk.sh
+++ b/src/test/ceph-disk.sh
@@ -167,6 +167,15 @@ function test_no_path() {
( unset PATH ; test_activate_dir ) || return 1
}
+function test_zap() {
+ local osd_data=$DIR/dir
+ $mkdir -p $osd_data
+
+ ./ceph-disk $CEPH_DISK_ARGS zap $osd_data 2>&1 | grep 'not full block device' || return 1
+
+ $rm -fr $osd_data
+}
+
# ceph-disk prepare returns immediately on success if the magic file
# exists in the --osd-data directory.
function test_activate_dir_magic() {
@@ -470,6 +479,7 @@ function run() {
default_actions+="test_activate_dir_magic "
default_actions+="test_activate_dir "
default_actions+="test_keyring_path "
+ default_actions+="test_zap "
local actions=${@:-$default_actions}
for action in $actions ; do
setup
diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t
index 33aee1d..bdb7324 100644
--- a/src/test/cli/radosgw-admin/help.t
+++ b/src/test/cli/radosgw-admin/help.t
@@ -82,6 +82,7 @@
--access=<access> Set access permissions for sub-user, should be one
of read, write, readwrite, full
--display-name=<name>
+ --max_buckets max number of buckets for a user
--system set the system flag on the user
--bucket=<bucket>
--pool=<pool>
@@ -122,7 +123,7 @@
--categories=<list> comma separated list of categories, used in usage show
--caps=<caps> list of caps (e.g., "usage=read, write; user=read"
--yes-i-really-mean-it required for certain operations
-
+ --reset-regions reset regionmap when regionmap update
<date> := "YYYY-MM-DD[ hh:mm:ss]"
Quota options:
diff --git a/src/test/common/test_bit_vector.cc b/src/test/common/test_bit_vector.cc
index be31d25..c58583c 100644
--- a/src/test/common/test_bit_vector.cc
+++ b/src/test/common/test_bit_vector.cc
@@ -11,6 +11,7 @@
#include <gtest/gtest.h>
#include <cmath>
#include "common/bit_vector.hpp"
+#include <boost/assign/list_of.hpp>
using namespace ceph;
@@ -87,8 +88,9 @@ TYPED_TEST(BitVectorTest, get_set) {
TYPED_TEST(BitVectorTest, get_buffer_extents) {
typename TestFixture::bit_vector_t bit_vector;
+ uint64_t element_count = 2 * CEPH_PAGE_SIZE + 51;
uint64_t elements_per_byte = 8 / bit_vector.BIT_COUNT;
- bit_vector.resize((2 * CEPH_PAGE_SIZE + 51) * elements_per_byte);
+ bit_vector.resize(element_count * elements_per_byte);
uint64_t offset = (CEPH_PAGE_SIZE + 11) * elements_per_byte;
uint64_t length = (CEPH_PAGE_SIZE + 31) * elements_per_byte;
@@ -96,7 +98,7 @@ TYPED_TEST(BitVectorTest, get_buffer_extents) {
uint64_t byte_length;
bit_vector.get_data_extents(offset, length, &byte_offset, &byte_length);
ASSERT_EQ(CEPH_PAGE_SIZE, byte_offset);
- ASSERT_EQ(2 * CEPH_PAGE_SIZE, byte_length);
+ ASSERT_EQ(CEPH_PAGE_SIZE + (element_count % CEPH_PAGE_SIZE), byte_length);
bit_vector.get_data_extents(1, 1, &byte_offset, &byte_length);
ASSERT_EQ(0U, byte_offset);
@@ -128,7 +130,7 @@ TYPED_TEST(BitVectorTest, partial_decode_encode) {
typename TestFixture::bit_vector_t bit_vector;
uint64_t elements_per_byte = 8 / bit_vector.BIT_COUNT;
- bit_vector.resize(5111 * elements_per_byte);
+ bit_vector.resize(9161 * elements_per_byte);
for (uint64_t i = 0; i < bit_vector.size(); ++i) {
bit_vector[i] = i % 4;
}
@@ -148,38 +150,54 @@ TYPED_TEST(BitVectorTest, partial_decode_encode) {
bufferlist::iterator footer_it = footer_bl.begin();
bit_vector.decode_footer(footer_it);
- uint64_t byte_offset;
- uint64_t byte_length;
- bit_vector.get_data_extents(0, 1, &byte_offset, &byte_length);
-
- bufferlist data_bl;
- data_bl.substr_of(bl, bit_vector.get_header_length() + byte_offset,
- byte_length);
- bufferlist::iterator data_it = data_bl.begin();
- bit_vector.decode_data(data_it, byte_offset);
-
- bit_vector[0] = 3;
-
- data_bl.clear();
- bit_vector.encode_data(data_bl, byte_offset, byte_length);
-
- footer_bl.clear();
- bit_vector.encode_footer(footer_bl);
-
- bufferlist updated_bl;
- updated_bl.substr_of(bl, 0, bit_vector.get_header_length() + byte_offset);
- updated_bl.append(data_bl);
-
- uint64_t tail_data_offset = bit_vector.get_header_length() + byte_offset +
- byte_length;
- data_bl.substr_of(bl, tail_data_offset,
- bit_vector.get_footer_offset() - tail_data_offset);
- updated_bl.append(data_bl);
- updated_bl.append(footer_bl);
- ASSERT_EQ(bl.length(), updated_bl.length());
-
- bufferlist::iterator updated_it = updated_bl.begin();
- ::decode(bit_vector, updated_it);
+ typedef std::pair<uint64_t, uint64_t> Extent;
+ typedef std::list<Extent> Extents;
+
+ Extents extents = boost::assign::list_of(
+ std::make_pair(0, 1))(
+ std::make_pair((CEPH_PAGE_SIZE * elements_per_byte) - 2, 4))(
+ std::make_pair((CEPH_PAGE_SIZE * elements_per_byte) + 2, 2))(
+ std::make_pair((2 * CEPH_PAGE_SIZE * elements_per_byte) - 2, 4))(
+ std::make_pair((2 * CEPH_PAGE_SIZE * elements_per_byte) + 2, 2))(
+ std::make_pair(2, 2 * CEPH_PAGE_SIZE));
+ for (Extents::iterator it = extents.begin(); it != extents.end(); ++it) {
+ uint64_t element_offset = it->first;
+ uint64_t element_length = it->second;
+ uint64_t byte_offset;
+ uint64_t byte_length;
+ bit_vector.get_data_extents(element_offset, element_length, &byte_offset,
+ &byte_length);
+
+ bufferlist data_bl;
+ data_bl.substr_of(bl, bit_vector.get_header_length() + byte_offset,
+ byte_length);
+ bufferlist::iterator data_it = data_bl.begin();
+ bit_vector.decode_data(data_it, byte_offset);
+
+ data_bl.clear();
+ bit_vector.encode_data(data_bl, byte_offset, byte_length);
+
+ footer_bl.clear();
+ bit_vector.encode_footer(footer_bl);
+
+ bufferlist updated_bl;
+ updated_bl.substr_of(bl, 0, bit_vector.get_header_length() + byte_offset);
+ updated_bl.append(data_bl);
+
+ if (byte_offset + byte_length < bit_vector.get_footer_offset()) {
+ uint64_t tail_data_offset = bit_vector.get_header_length() + byte_offset +
+ byte_length;
+ data_bl.substr_of(bl, tail_data_offset,
+ bit_vector.get_footer_offset() - tail_data_offset);
+ updated_bl.append(data_bl);
+ }
+
+ updated_bl.append(footer_bl);
+ ASSERT_EQ(bl, updated_bl);
+
+ bufferlist::iterator updated_it = updated_bl.begin();
+ ::decode(bit_vector, updated_it);
+ }
}
TYPED_TEST(BitVectorTest, header_crc) {
diff --git a/src/test/crush/CrushWrapper.cc b/src/test/crush/CrushWrapper.cc
index c690ada..ddfa0f0 100644
--- a/src/test/crush/CrushWrapper.cc
+++ b/src/test/crush/CrushWrapper.cc
@@ -408,6 +408,10 @@ TEST(CrushWrapper, adjust_item_weight) {
EXPECT_EQ(true, c->bucket_exists(bucket_id));
EXPECT_EQ(host_weight, c->get_bucket_weightf(bucket_id));
+ map<string,string> bloc;
+ bloc["root"] = "default";
+ EXPECT_EQ(0, c->insert_item(g_ceph_context, host0, host_weight,
+ HOST0, bloc));
}
{
@@ -426,6 +430,11 @@ TEST(CrushWrapper, adjust_item_weight) {
bucket_id = c->get_item_id("fake");
EXPECT_EQ(true, c->bucket_exists(bucket_id));
EXPECT_EQ(host_weight, c->get_bucket_weightf(bucket_id));
+
+ map<string,string> bloc;
+ bloc["root"] = "default";
+ EXPECT_EQ(0, c->insert_item(g_ceph_context, hostfake, host_weight,
+ FAKE, bloc));
}
//
@@ -470,6 +479,99 @@ TEST(CrushWrapper, adjust_item_weight) {
EXPECT_EQ(modified_weight, c->get_item_weightf_in_loc(item, loc_two));
}
+TEST(CrushWrapper, adjust_subtree_weight) {
+ CrushWrapper *c = new CrushWrapper;
+
+ const int ROOT_TYPE = 2;
+ c->set_type_name(ROOT_TYPE, "root");
+ const int HOST_TYPE = 1;
+ c->set_type_name(HOST_TYPE, "host");
+ const int OSD_TYPE = 0;
+ c->set_type_name(OSD_TYPE, "osd");
+
+ int rootno;
+ c->add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1,
+ ROOT_TYPE, 0, NULL, NULL, &rootno);
+ c->set_item_name(rootno, "default");
+
+ const string HOST0("host0");
+ int host0;
+ c->add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1,
+ HOST_TYPE, 0, NULL, NULL, &host0);
+ c->set_item_name(host0, HOST0);
+
+ const string FAKE("fake");
+ int hostfake;
+ c->add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1,
+ HOST_TYPE, 0, NULL, NULL, &hostfake);
+ c->set_item_name(hostfake, FAKE);
+
+ int item = 0;
+
+ // construct crush map
+
+ {
+ map<string,string> loc;
+ loc["host"] = "host0";
+ float host_weight = 2.0;
+ int bucket_id = 0;
+
+ item = 0;
+ EXPECT_EQ(0, c->insert_item(g_ceph_context, item, 1.0,
+ "osd." + stringify(item), loc));
+ item = 1;
+ EXPECT_EQ(0, c->insert_item(g_ceph_context, item, 1.0,
+ "osd." + stringify(item), loc));
+
+ bucket_id = c->get_item_id("host0");
+ EXPECT_EQ(true, c->bucket_exists(bucket_id));
+ EXPECT_EQ(host_weight, c->get_bucket_weightf(bucket_id));
+
+ map<string,string> bloc;
+ bloc["root"] = "default";
+ EXPECT_EQ(0, c->insert_item(g_ceph_context, host0, host_weight,
+ HOST0, bloc));
+ }
+
+ {
+ map<string,string> loc;
+ loc["host"] = "fake";
+ float host_weight = 2.0;
+ int bucket_id = 0;
+
+ item = 0;
+ EXPECT_EQ(0, c->insert_item(g_ceph_context, item, 1.0,
+ "osd." + stringify(item), loc));
+ item = 1;
+ EXPECT_EQ(0, c->insert_item(g_ceph_context, item, 1.0,
+ "osd." + stringify(item), loc));
+
+ bucket_id = c->get_item_id("fake");
+ EXPECT_EQ(true, c->bucket_exists(bucket_id));
+ EXPECT_EQ(host_weight, c->get_bucket_weightf(bucket_id));
+
+ map<string,string> bloc;
+ bloc["root"] = "default";
+ EXPECT_EQ(0, c->insert_item(g_ceph_context, hostfake, host_weight,
+ FAKE, bloc));
+ }
+
+ //cout << "--------before---------" << std::endl;
+ //c->dump_tree(&cout, NULL);
+ ASSERT_EQ(c->get_bucket_weight(host0), 131072);
+ ASSERT_EQ(c->get_bucket_weight(rootno), 262144);
+
+ int r = c->adjust_subtree_weightf(g_ceph_context, host0, 2.0);
+ ASSERT_EQ(r, 2); // 2 items changed
+
+ //cout << "--------after---------" << std::endl;
+ //c->dump_tree(&cout, NULL);
+
+ ASSERT_EQ(c->get_bucket_weight(host0), 262144);
+ ASSERT_EQ(c->get_item_weight(host0), 262144);
+ ASSERT_EQ(c->get_bucket_weight(rootno), 262144 + 131072);
+}
+
TEST(CrushWrapper, insert_item) {
CrushWrapper *c = new CrushWrapper;
@@ -798,6 +900,7 @@ TEST(CrushWrapper, distance) {
int main(int argc, char **argv) {
vector<const char*> args;
argv_to_vec(argc, (const char **)argv, args);
+ env_to_vec(args);
vector<const char*> def_args;
def_args.push_back("--debug-crush=0");
diff --git a/src/test/librados_test_stub/LibradosTestStub.cc b/src/test/librados_test_stub/LibradosTestStub.cc
index f7f597c..8efd6ac 100644
--- a/src/test/librados_test_stub/LibradosTestStub.cc
+++ b/src/test/librados_test_stub/LibradosTestStub.cc
@@ -113,6 +113,14 @@ extern "C" rados_config_t rados_cct(rados_t cluster)
return reinterpret_cast<rados_config_t>(client->cct());
}
+extern "C" int rados_conf_set(rados_t cluster, const char *option,
+ const char *value) {
+ librados::TestRadosClient *impl =
+ reinterpret_cast<librados::TestRadosClient*>(cluster);
+ CephContext *cct = impl->cct();
+ return cct->_conf->set_val(option, value);
+}
+
extern "C" int rados_conf_parse_env(rados_t cluster, const char *var) {
librados::TestRadosClient *client =
reinterpret_cast<librados::TestRadosClient*>(cluster);
@@ -198,6 +206,12 @@ extern "C" void rados_ioctx_destroy(rados_ioctx_t io) {
ctx->put();
}
+extern "C" rados_t rados_ioctx_get_cluster(rados_ioctx_t io) {
+ librados::TestIoCtxImpl *ctx =
+ reinterpret_cast<librados::TestIoCtxImpl*>(io);
+ return reinterpret_cast<rados_t>(ctx->get_rados_client());
+}
+
extern "C" int rados_mon_command(rados_t cluster, const char **cmd,
size_t cmdlen, const char *inbuf,
size_t inbuflen, char **outbuf,
@@ -704,6 +718,31 @@ int Rados::blacklist_add(const std::string& client_address,
return impl->blacklist_add(client_address, expire_seconds);
}
+config_t Rados::cct() {
+ TestRadosClient *impl = reinterpret_cast<TestRadosClient*>(client);
+ return reinterpret_cast<config_t>(impl->cct());
+}
+
+int Rados::conf_set(const char *option, const char *value) {
+ return rados_conf_set(reinterpret_cast<rados_t>(client), option, value);
+}
+
+int Rados::conf_get(const char *option, std::string &val) {
+ TestRadosClient *impl = reinterpret_cast<TestRadosClient*>(client);
+ CephContext *cct = impl->cct();
+
+ char *str = NULL;
+ int ret = cct->_conf->get_val(option, &str, -1);
+ if (ret != 0) {
+ free(str);
+ return ret;
+ }
+
+ val = str;
+ free(str);
+ return 0;
+}
+
int Rados::conf_parse_env(const char *env) const {
return rados_conf_parse_env(reinterpret_cast<rados_t>(client), env);
}
diff --git a/src/test/librados_test_stub/TestClassHandler.cc b/src/test/librados_test_stub/TestClassHandler.cc
index c7a2e96..2732552 100644
--- a/src/test/librados_test_stub/TestClassHandler.cc
+++ b/src/test/librados_test_stub/TestClassHandler.cc
@@ -5,6 +5,9 @@
#include <boost/algorithm/string/predicate.hpp>
#include <dlfcn.h>
#include <errno.h>
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_rados
namespace librados {
@@ -22,7 +25,7 @@ void TestClassHandler::open_class(const std::string& name,
const std::string& path) {
void *handle = dlopen(path.c_str(), RTLD_NOW);
if (handle == NULL) {
- std::cerr << "Failed to load class: " << dlerror() << std::endl;
+ derr << "Failed to load class: " << dlerror() << dendl;
return;
}
m_class_handles.push_back(handle);
diff --git a/src/test/librados_test_stub/TestIoCtxImpl.cc b/src/test/librados_test_stub/TestIoCtxImpl.cc
index f810906..e376e63 100644
--- a/src/test/librados_test_stub/TestIoCtxImpl.cc
+++ b/src/test/librados_test_stub/TestIoCtxImpl.cc
@@ -7,6 +7,7 @@
#include "test/librados_test_stub/TestWatchNotify.h"
#include "librados/AioCompletionImpl.h"
#include "include/assert.h"
+#include "common/valgrind.h"
#include "objclass/objclass.h"
#include <boost/bind.hpp>
#include <errno.h>
@@ -45,7 +46,11 @@ void TestObjectOperationImpl::get() {
void TestObjectOperationImpl::put() {
if (m_refcount.dec() == 0) {
+ ANNOTATE_HAPPENS_AFTER(&m_refcount);
+ ANNOTATE_HAPPENS_BEFORE_FORGET_ALL(&m_refcount);
delete this;
+ } else {
+ ANNOTATE_HAPPENS_BEFORE(&m_refcount);
}
}
@@ -239,7 +244,8 @@ int TestIoCtxImpl::unwatch(uint64_t handle) {
int TestIoCtxImpl::watch(const std::string& o, uint64_t *handle,
librados::WatchCtx *ctx, librados::WatchCtx2 *ctx2) {
- return m_client->get_watch_notify().watch(o, handle, ctx, ctx2);
+ return m_client->get_watch_notify().watch(o, get_instance_id(), handle, ctx,
+ ctx2);
}
int TestIoCtxImpl::execute_aio_operations(const std::string& oid,
diff --git a/src/test/librados_test_stub/TestMemRadosClient.cc b/src/test/librados_test_stub/TestMemRadosClient.cc
index 73abfa9..b89f4eb 100644
--- a/src/test/librados_test_stub/TestMemRadosClient.cc
+++ b/src/test/librados_test_stub/TestMemRadosClient.cc
@@ -116,6 +116,7 @@ int TestMemRadosClient::pool_reverse_lookup(int64_t id, std::string *name) {
}
int TestMemRadosClient::watch_flush() {
+ get_watch_notify().flush();
return 0;
}
diff --git a/src/test/librados_test_stub/TestWatchNotify.cc b/src/test/librados_test_stub/TestWatchNotify.cc
index 6fd7748..14a43bc 100644
--- a/src/test/librados_test_stub/TestWatchNotify.cc
+++ b/src/test/librados_test_stub/TestWatchNotify.cc
@@ -11,7 +11,8 @@ namespace librados {
TestWatchNotify::TestWatchNotify(CephContext *cct)
: m_cct(cct), m_finisher(new Finisher(cct)), m_handle(), m_notify_id(),
- m_file_watcher_lock("librados::TestWatchNotify::m_file_watcher_lock") {
+ m_file_watcher_lock("librados::TestWatchNotify::m_file_watcher_lock"),
+ m_pending_notifies(0) {
m_cct->get();
m_finisher->start();
}
@@ -31,6 +32,13 @@ TestWatchNotify::Watcher::Watcher()
: lock("TestWatchNotify::Watcher::lock") {
}
+void TestWatchNotify::flush() {
+ Mutex::Locker file_watcher_locker(m_file_watcher_lock);
+ while (m_pending_notifies > 0) {
+ m_file_watcher_cond.Wait(m_file_watcher_lock);
+ }
+}
+
int TestWatchNotify::list_watchers(const std::string& o,
std::list<obj_watch_t> *out_watchers) {
SharedWatcher watcher = get_watcher(o);
@@ -42,7 +50,7 @@ int TestWatchNotify::list_watchers(const std::string& o,
it != watcher->watch_handles.end(); ++it) {
obj_watch_t obj;
strcpy(obj.addr, ":/0");
- obj.watcher_id = static_cast<int64_t>(it->second.handle);
+ obj.watcher_id = static_cast<int64_t>(it->second.instance_id);
obj.cookie = it->second.handle;
obj.timeout_seconds = 30;
out_watchers->push_back(obj);
@@ -61,6 +69,7 @@ int TestWatchNotify::notify(const std::string& oid, bufferlist& bl,
RWLock::WLocker l(watcher->lock);
{
Mutex::Locker l2(m_file_watcher_lock);
+ ++m_pending_notifies;
uint64_t notify_id = ++m_notify_id;
SharedNotifyHandle notify_handle(new NotifyHandle());
@@ -104,12 +113,14 @@ void TestWatchNotify::notify_ack(const std::string& o, uint64_t notify_id,
notify_handle->cond.Signal();
}
-int TestWatchNotify::watch(const std::string& o, uint64_t *handle,
- librados::WatchCtx *ctx, librados::WatchCtx2 *ctx2) {
+int TestWatchNotify::watch(const std::string& o, uint64_t instance_id,
+ uint64_t *handle, librados::WatchCtx *ctx,
+ librados::WatchCtx2 *ctx2) {
SharedWatcher watcher = get_watcher(o);
RWLock::WLocker l(watcher->lock);
WatchHandle watch_handle;
+ watch_handle.instance_id = instance_id;
watch_handle.handle = ++m_handle;
watch_handle.watch_ctx = ctx;
watch_handle.watch_ctx2 = ctx2;
@@ -160,20 +171,27 @@ void TestWatchNotify::execute_notify(const std::string &oid,
bufferlist &bl, uint64_t notify_id,
Mutex *lock, Cond *cond,
bool *done) {
- SharedWatcher watcher = get_watcher(oid);
- RWLock::RLocker l(watcher->lock);
+ WatchHandles watch_handles;
+ SharedNotifyHandle notify_handle;
- utime_t timeout;
- timeout.set_from_double(ceph_clock_now(m_cct) + 15);
+ {
+ SharedWatcher watcher = get_watcher(oid);
+ RWLock::RLocker l(watcher->lock);
- NotifyHandles::iterator n_it = watcher->notify_handles.find(notify_id);
- if (n_it == watcher->notify_handles.end()) {
- return;
+ NotifyHandles::iterator n_it = watcher->notify_handles.find(notify_id);
+ if (n_it == watcher->notify_handles.end()) {
+ return;
+ }
+
+ watch_handles = watcher->watch_handles;
+ notify_handle = n_it->second;
}
- SharedNotifyHandle notify_handle = n_it->second;
- for (WatchHandles::iterator w_it = watcher->watch_handles.begin();
- w_it != watcher->watch_handles.end(); ++w_it) {
+ utime_t timeout;
+ timeout.set_from_double(ceph_clock_now(m_cct) + 15);
+
+ for (WatchHandles::iterator w_it = watch_handles.begin();
+ w_it != watch_handles.end(); ++w_it) {
WatchHandle &watch_handle = w_it->second;
bufferlist notify_bl;
@@ -203,6 +221,13 @@ void TestWatchNotify::execute_notify(const std::string &oid,
Mutex::Locker l3(*lock);
*done = true;
cond->Signal();
+
+ {
+ Mutex::Locker file_watcher_locker(m_file_watcher_lock);
+ if (--m_pending_notifies == 0) {
+ m_file_watcher_cond.Signal();
+ }
+ }
}
} // namespace librados
diff --git a/src/test/librados_test_stub/TestWatchNotify.h b/src/test/librados_test_stub/TestWatchNotify.h
index f73ee3a..1761302 100644
--- a/src/test/librados_test_stub/TestWatchNotify.h
+++ b/src/test/librados_test_stub/TestWatchNotify.h
@@ -35,6 +35,7 @@ public:
typedef std::map<uint64_t, SharedNotifyHandle> NotifyHandles;
struct WatchHandle {
+ uint64_t instance_id;
uint64_t handle;
librados::WatchCtx* watch_ctx;
librados::WatchCtx2* watch_ctx2;
@@ -53,13 +54,14 @@ public:
TestWatchNotify(CephContext *cct);
~TestWatchNotify();
+ void flush();
int list_watchers(const std::string& o,
std::list<obj_watch_t> *out_watchers);
int notify(const std::string& o, bufferlist& bl,
uint64_t timeout_ms, bufferlist *pbl);
void notify_ack(const std::string& o, uint64_t notify_id,
uint64_t handle, uint64_t gid, bufferlist& bl);
- int watch(const std::string& o, uint64_t *handle,
+ int watch(const std::string& o, uint64_t instance_id, uint64_t *handle,
librados::WatchCtx *ctx, librados::WatchCtx2 *ctx2);
int unwatch(uint64_t handle);
@@ -74,6 +76,9 @@ private:
uint64_t m_notify_id;
Mutex m_file_watcher_lock;
+ Cond m_file_watcher_cond;
+ uint64_t m_pending_notifies;
+
FileWatchers m_file_watchers;
SharedWatcher get_watcher(const std::string& oid);
diff --git a/src/test/librbd/fsx.cc b/src/test/librbd/fsx.cc
index c5ed1e6..2465417 100644
--- a/src/test/librbd/fsx.cc
+++ b/src/test/librbd/fsx.cc
@@ -42,7 +42,6 @@
#include "include/krbd.h"
#include "include/rados/librados.h"
#include "include/rbd/librbd.h"
-#include "common/ceph_crypto.h"
#define NUMPRINTCOLUMNS 32 /* # columns of data to print on each line */
@@ -2312,7 +2311,6 @@ main(int argc, char **argv)
krbd_destroy(krbd);
rados_shutdown(cluster);
- ceph::crypto::shutdown();
free(original_buf);
free(good_buf);
free(temp_buf);
diff --git a/src/test/librbd/test_ImageWatcher.cc b/src/test/librbd/test_ImageWatcher.cc
index 99a1002..adf087e 100644
--- a/src/test/librbd/test_ImageWatcher.cc
+++ b/src/test/librbd/test_ImageWatcher.cc
@@ -164,10 +164,10 @@ public:
int handle_restart_aio(librbd::ImageCtx *ictx,
librbd::AioCompletion *aio_completion) {
- Mutex::Locker l1(m_callback_lock);
+ Mutex::Locker callback_locker(m_callback_lock);
++m_aio_completion_restarts;
- RWLock::WLocker l2(ictx->owner_lock);
+ RWLock::RLocker owner_locker(ictx->owner_lock);
if (!ictx->image_watcher->is_lock_owner() &&
(m_expected_aio_restarts == 0 ||
m_aio_completion_restarts < m_expected_aio_restarts)) {
@@ -176,7 +176,7 @@ public:
aio_completion);
} else {
{
- Mutex::Locker l2(aio_completion->lock);
+ Mutex::Locker completion_locker(aio_completion->lock);
aio_completion->complete(ictx->cct);
}
@@ -192,7 +192,8 @@ public:
Mutex::Locker l(m_callback_lock);
int r = 0;
while (!m_aio_completions.empty() &&
- m_aio_completion_restarts < m_expected_aio_restarts) {
+ (m_expected_aio_restarts == 0 ||
+ m_aio_completion_restarts < m_expected_aio_restarts)) {
r = m_callback_cond.WaitInterval(ictx.cct, m_callback_lock,
utime_t(10, 0));
if (r != 0) {
@@ -580,6 +581,7 @@ TEST_F(TestImageWatcher, RequestLockTimedOut) {
m_notify_acks = boost::assign::list_of(
std::make_pair(NOTIFY_OP_REQUEST_LOCK, bufferlist()));
+ m_expected_aio_restarts = 1;
{
RWLock::WLocker l(ictx->owner_lock);
ictx->image_watcher->request_lock(
@@ -595,6 +597,45 @@ TEST_F(TestImageWatcher, RequestLockTimedOut) {
ASSERT_TRUE(wait_for_aio_completions(*ictx));
}
+TEST_F(TestImageWatcher, RequestLockIgnored) {
+ REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+ ASSERT_EQ(0, register_image_watch(*ictx));
+ ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
+ "auto " + stringify(m_watch_ctx->get_handle())));
+
+ m_notify_acks = boost::assign::list_of(
+ std::make_pair(NOTIFY_OP_REQUEST_LOCK, create_response_message(0)));
+
+ int orig_notify_timeout = ictx->cct->_conf->client_notify_timeout;
+ ictx->cct->_conf->set_val("client_notify_timeout", "0");
+ BOOST_SCOPE_EXIT( (ictx)(orig_notify_timeout) ) {
+ ictx->cct->_conf->set_val("client_notify_timeout",
+ stringify(orig_notify_timeout));
+ } BOOST_SCOPE_EXIT_END;
+
+ {
+ RWLock::WLocker l(ictx->owner_lock);
+ ictx->image_watcher->request_lock(
+ boost::bind(&TestImageWatcher::handle_restart_aio, this, ictx, _1),
+ create_aio_completion(*ictx));
+ }
+
+ ASSERT_TRUE(wait_for_notifies(*ictx));
+ NotifyOps expected_notify_ops;
+ expected_notify_ops += NOTIFY_OP_REQUEST_LOCK;
+ ASSERT_EQ(expected_notify_ops, m_notifies);
+
+ // after the request times out -- it will be resent
+ ASSERT_TRUE(wait_for_notifies(*ictx));
+ ASSERT_EQ(expected_notify_ops, m_notifies);
+
+ ASSERT_EQ(0, unlock_image());
+ ASSERT_TRUE(wait_for_aio_completions(*ictx));
+}
+
TEST_F(TestImageWatcher, RequestLockTryLockRace) {
REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
@@ -607,6 +648,7 @@ TEST_F(TestImageWatcher, RequestLockTryLockRace) {
m_notify_acks = boost::assign::list_of(
std::make_pair(NOTIFY_OP_REQUEST_LOCK, create_response_message(0)));
+ m_expected_aio_restarts = 1;
{
RWLock::WLocker l(ictx->owner_lock);
ictx->image_watcher->request_lock(
@@ -642,6 +684,7 @@ TEST_F(TestImageWatcher, RequestLockPreTryLockFailed) {
ASSERT_EQ(0, open_image(m_image_name, &ictx));
ASSERT_EQ(0, lock_image(*ictx, LOCK_SHARED, "manually 1234"));
+ m_expected_aio_restarts = 1;
{
RWLock::WLocker l(ictx->owner_lock);
ictx->image_watcher->request_lock(
diff --git a/src/test/librbd/test_internal.cc b/src/test/librbd/test_internal.cc
index 2df917d..4aef7ae 100644
--- a/src/test/librbd/test_internal.cc
+++ b/src/test/librbd/test_internal.cc
@@ -365,3 +365,28 @@ TEST_F(TestInternal, MultipleResize) {
ASSERT_EQ(0, librbd::get_size(ictx, &size));
ASSERT_EQ(0U, size);
}
+
+TEST_F(TestInternal, ShrinkFlushesCache) {
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ {
+ RWLock::WLocker owner_locker(ictx->owner_lock);
+ ASSERT_EQ(0, ictx->image_watcher->try_lock());
+ }
+
+ std::string buffer(4096, '1');
+ C_SaferCond cond_ctx;
+ librbd::AioCompletion *c =
+ librbd::aio_create_completion_internal(&cond_ctx, librbd::rbd_ctx_cb);
+ c->get();
+ aio_write(ictx, 0, buffer.size(), buffer.c_str(), c, 0);
+
+ librbd::NoOpProgressContext no_op;
+ ASSERT_EQ(0, librbd::resize(ictx, m_image_size >> 1, no_op));
+
+ ASSERT_TRUE(c->is_complete());
+ ASSERT_EQ(0, c->wait_for_complete());
+ ASSERT_EQ(0, cond_ctx.wait());
+ c->put();
+}
diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc
index b3c53c1..14f89ea 100644
--- a/src/test/librbd/test_librbd.cc
+++ b/src/test/librbd/test_librbd.cc
@@ -18,10 +18,6 @@
#include "include/rbd/librbd.h"
#include "include/rbd/librbd.hpp"
-#include "global/global_context.h"
-#include "global/global_init.h"
-#include "common/ceph_argparse.h"
-#include "common/config.h"
#include "common/Thread.h"
#include "gtest/gtest.h"
@@ -83,6 +79,12 @@ static int create_image_full(rados_ioctx_t ioctx, const char *name,
uint64_t features)
{
if (old_format) {
+ // ensure old-format tests actually use the old format
+ int r = rados_conf_set(rados_ioctx_get_cluster(ioctx),
+ "rbd_default_format", "1");
+ if (r < 0) {
+ return r;
+ }
return rbd_create(ioctx, name, size, order);
} else if ((features & RBD_FEATURE_STRIPINGV2) != 0) {
return rbd_create3(ioctx, name, size, features, order, 65536, 16);
@@ -113,6 +115,11 @@ static int create_image_pp(librbd::RBD &rbd,
if (r < 0)
return r;
if (old_format) {
+ librados::Rados rados(ioctx);
+ int r = rados.conf_set("rbd_default_format", "1");
+ if (r < 0) {
+ return r;
+ }
return rbd.create(ioctx, name, size, order);
} else {
return rbd.create2(ioctx, name, size, features, order);
@@ -1441,7 +1448,9 @@ TEST_F(TestLibRBD, TestClone2)
TEST_F(TestLibRBD, TestCoR)
{
- if (!g_conf->rbd_clone_copy_on_read) {
+ std::string config_value;
+ ASSERT_EQ(0, _rados.conf_get("rbd_clone_copy_on_read", config_value));
+ if (config_value == "false") {
std::cout << "SKIPPING due to disabled rbd_copy_on_read" << std::endl;
return;
}
@@ -2341,7 +2350,9 @@ TEST_F(TestLibRBD, ZeroLengthRead)
TEST_F(TestLibRBD, LargeCacheRead)
{
- if (!g_conf->rbd_cache) {
+ std::string config_value;
+ ASSERT_EQ(0, _rados.conf_get("rbd_cache", config_value));
+ if (config_value == "false") {
std::cout << "SKIPPING due to disabled cache" << std::endl;
return;
}
@@ -2349,17 +2360,21 @@ TEST_F(TestLibRBD, LargeCacheRead)
rados_ioctx_t ioctx;
rados_ioctx_create(_cluster, m_pool_name.c_str(), &ioctx);
- uint64_t orig_cache_size = g_conf->rbd_cache_size;
- g_conf->set_val("rbd_cache_size", "16777216");
+ uint32_t new_cache_size = 16777216;
+ std::string orig_cache_size;
+ ASSERT_EQ(0, _rados.conf_get("rbd_cache_size", orig_cache_size));
+ ASSERT_EQ(0, _rados.conf_set("rbd_cache_size",
+ stringify(new_cache_size).c_str()));
+ ASSERT_EQ(0, _rados.conf_get("rbd_cache_size", config_value));
+ ASSERT_EQ(stringify(new_cache_size), config_value);
BOOST_SCOPE_EXIT( (orig_cache_size) ) {
- g_conf->set_val("rbd_cache_size", stringify(orig_cache_size).c_str());
+ ASSERT_EQ(0, _rados.conf_set("rbd_cache_size", orig_cache_size.c_str()));
} BOOST_SCOPE_EXIT_END;
- ASSERT_EQ(16777216, g_conf->rbd_cache_size);
rbd_image_t image;
int order = 0;
const char *name = "testimg";
- uint64_t size = g_conf->rbd_cache_size + 1;
+ uint64_t size = new_cache_size + 1;
ASSERT_EQ(0, create_image(ioctx, name, size, &order));
ASSERT_EQ(0, rbd_open(ioctx, name, &image, NULL));
@@ -2622,8 +2637,7 @@ TEST_F(TestLibRBD, BlockingAIO)
int order = 18;
ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
- CephContext *cct = reinterpret_cast<CephContext*>(ioctx.cct());
- cct->_conf->set_val_or_die("rbd_non_blocking_aio", "0");
+ ASSERT_EQ(0, _rados.conf_set("rbd_non_blocking_aio", "0"));
librbd::Image image;
ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
@@ -2665,3 +2679,58 @@ TEST_F(TestLibRBD, BlockingAIO)
expected_bl.append(std::string(128, '\0'));
ASSERT_TRUE(expected_bl.contents_equal(read_bl));
}
+
+TEST_F(TestLibRBD, ExclusiveLockTransition)
+{
+ REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+ librados::IoCtx ioctx;
+ ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+ librbd::RBD rbd;
+ std::string name = get_temp_image_name();
+
+ uint64_t size = 1 << 18;
+ int order = 12;
+ ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+ librbd::Image image1;
+ ASSERT_EQ(0, rbd.open(ioctx, image1, name.c_str(), NULL));
+
+ librbd::Image image2;
+ ASSERT_EQ(0, rbd.open(ioctx, image2, name.c_str(), NULL));
+
+ std::list<librbd::RBD::AioCompletion *> comps;
+ ceph::bufferlist bl;
+ bl.append(std::string(1 << order, '1'));
+ for (size_t object_no = 0; object_no < (size >> 12); ++object_no) {
+ librbd::RBD::AioCompletion *comp = new librbd::RBD::AioCompletion(NULL,
+ NULL);
+ comps.push_back(comp);
+ if (object_no % 2 == 0) {
+ ASSERT_EQ(0, image1.aio_write(object_no << order, bl.length(), bl, comp));
+ } else {
+ ASSERT_EQ(0, image2.aio_write(object_no << order, bl.length(), bl, comp));
+ }
+ }
+
+ while (!comps.empty()) {
+ librbd::RBD::AioCompletion *comp = comps.front();
+ comps.pop_front();
+ ASSERT_EQ(0, comp->wait_for_complete());
+ ASSERT_EQ(1, comp->is_complete());
+ }
+
+ librbd::Image image3;
+ ASSERT_EQ(0, rbd.open(ioctx, image3, name.c_str(), NULL));
+ for (size_t object_no = 0; object_no < (size >> 12); ++object_no) {
+ bufferlist read_bl;
+ ASSERT_EQ(bl.length(), image3.read(object_no << order, bl.length(),
+ read_bl));
+ ASSERT_TRUE(bl.contents_equal(read_bl));
+ }
+
+ ASSERT_PASSED(validate_object_map, image1);
+ ASSERT_PASSED(validate_object_map, image2);
+ ASSERT_PASSED(validate_object_map, image3);
+}
diff --git a/src/test/librbd/test_main.cc b/src/test/librbd/test_main.cc
index 4c80fba..e71a5af 100644
--- a/src/test/librbd/test_main.cc
+++ b/src/test/librbd/test_main.cc
@@ -1,12 +1,12 @@
// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-#include "gtest/gtest.h"
-#include "common/ceph_argparse.h"
-#include "common/ceph_crypto.h"
+#include "include/rados/librados.hpp"
#include "global/global_context.h"
-#include "global/global_init.h"
-#include <vector>
+#include "test/librados/test.h"
+#include "gtest/gtest.h"
+#include <iostream>
+#include <string>
extern void register_test_librbd();
#ifdef TEST_LIBRBD_INTERNALS
@@ -26,14 +26,21 @@ int main(int argc, char **argv)
::testing::InitGoogleTest(&argc, argv);
- vector<const char*> args;
- argv_to_vec(argc, (const char **)argv, args);
+ librados::Rados rados;
+ std::string result = connect_cluster_pp(rados);
+ if (result != "" ) {
+ std::cerr << result << std::endl;
+ return 1;
+ }
- global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
- common_init_finish(g_ceph_context);
+#ifdef TEST_LIBRBD_INTERNALS
+ g_ceph_context = reinterpret_cast<CephContext*>(rados.cct());
+#endif // TEST_LIBRBD_INTERNALS
- int r = RUN_ALL_TESTS();
- g_ceph_context->put();
- ceph::crypto::shutdown();
- return r;
+ int r = rados.conf_set("lockdep", "true");
+ if (r < 0) {
+ std::cerr << "failed to enable lockdep" << std::endl;
+ return -r;
+ }
+ return RUN_ALL_TESTS();
}
diff --git a/src/test/mon/osd-crush.sh b/src/test/mon/osd-crush.sh
index 2242e9c..2bf477f 100755
--- a/src/test/mon/osd-crush.sh
+++ b/src/test/mon/osd-crush.sh
@@ -78,6 +78,9 @@ function TEST_crush_rule_rm() {
function TEST_crush_rule_create_erasure() {
local dir=$1
+ # should have at least one OSD
+ run_osd $dir 0 || return 1
+
local ruleset=ruleset3
#
# create a new ruleset with the default profile, implicitly
@@ -108,6 +111,15 @@ function TEST_crush_rule_create_erasure() {
./ceph osd erasure-code-profile ls | grep default || return 1
./ceph osd crush rule rm $ruleset || return 1
! ./ceph osd crush rule ls | grep $ruleset || return 1
+ #
+ # verify that if the crushmap contains a bugous ruleset,
+ # it will prevent the creation of a pool.
+ #
+ local crushtool_path_old=`ceph-conf --show-config-value crushtool`
+ ceph tell mon.* injectargs --crushtool "false"
+
+ expect_failure $dir "Error EINVAL" \
+ ./ceph osd pool create mypool 1 1 erasure || return 1
}
function check_ruleset_id_match_rule_id() {
diff --git a/src/test/objectstore/chain_xattr.cc b/src/test/objectstore/chain_xattr.cc
index 7e2e693..c2e33f7 100644
--- a/src/test/objectstore/chain_xattr.cc
+++ b/src/test/objectstore/chain_xattr.cc
@@ -148,6 +148,44 @@ TEST(chain_xattr, get_and_set) {
::unlink(file);
}
+TEST(chain_xattr, chunk_aligned) {
+ const char* file = FILENAME;
+ ::unlink(file);
+ int fd = ::open(file, O_CREAT|O_WRONLY|O_TRUNC, 0700);
+ const string user("user.");
+
+ // set N* chunk size
+ const string name = "user.foo";
+ const string name2 = "user.bar";
+
+ for (int len = CHAIN_XATTR_MAX_BLOCK_LEN - 10;
+ len < CHAIN_XATTR_MAX_BLOCK_LEN + 10;
+ ++len) {
+ cout << len << std::endl;
+ const string x(len, 'x');
+ char buf[len*2];
+ ASSERT_EQ(len, chain_setxattr(file, name.c_str(), x.c_str(), len));
+ char attrbuf[4096];
+ int l = ceph_os_listxattr(file, attrbuf, sizeof(attrbuf));
+ for (char *p = attrbuf; p - attrbuf < l; p += strlen(p) + 1) {
+ cout << " attr " << p << std::endl;
+ }
+ ASSERT_EQ(len, chain_getxattr(file, name.c_str(), buf, len*2));
+ ASSERT_EQ(0, chain_removexattr(file, name.c_str()));
+
+ ASSERT_EQ(len, chain_fsetxattr(fd, name2.c_str(), x.c_str(), len));
+ l = ceph_os_flistxattr(fd, attrbuf, sizeof(attrbuf));
+ for (char *p = attrbuf; p - attrbuf < l; p += strlen(p) + 1) {
+ cout << " attr " << p << std::endl;
+ }
+ ASSERT_EQ(len, chain_fgetxattr(fd, name2.c_str(), buf, len*2));
+ ASSERT_EQ(0, chain_fremovexattr(fd, name2.c_str()));
+ }
+
+ ::close(fd);
+ ::unlink(file);
+}
+
TEST(chain_xattr, listxattr) {
const char* file = FILENAME;
::unlink(file);
diff --git a/src/test/osd/TestPGLog.cc b/src/test/osd/TestPGLog.cc
index c88bc98..cc9733a 100644
--- a/src/test/osd/TestPGLog.cc
+++ b/src/test/osd/TestPGLog.cc
@@ -139,6 +139,14 @@ public:
fullauth.index();
fulldiv.index();
}
+ void set_div_bounds(eversion_t head, eversion_t tail) {
+ fulldiv.tail = divinfo.log_tail = tail;
+ fulldiv.head = divinfo.last_update = head;
+ }
+ void set_auth_bounds(eversion_t head, eversion_t tail) {
+ fullauth.tail = authinfo.log_tail = tail;
+ fullauth.head = authinfo.last_update = head;
+ }
const IndexedLog &get_fullauth() const { return fullauth; }
const IndexedLog &get_fulldiv() const { return fulldiv; }
const pg_info_t &get_authinfo() const { return authinfo; }
@@ -236,6 +244,8 @@ public:
proc_replica_log(
t, oinfo, olog, omissing, pg_shard_t(1, shard_id_t(0)));
+ assert(oinfo.last_update >= log.tail);
+
if (!tcase.base.empty()) {
ASSERT_EQ(tcase.base.rbegin()->version, oinfo.last_update);
}
@@ -1271,8 +1281,8 @@ TEST_F(PGLogTest, proc_replica_log) {
pg_shard_t from;
eversion_t last_update(1, 1);
- oinfo.last_update = last_update;
- eversion_t last_complete(2, 1);
+ log.head = olog.head = oinfo.last_update = last_update;
+ eversion_t last_complete(1, 1);
oinfo.last_complete = last_complete;
EXPECT_TRUE(t.empty());
@@ -1471,12 +1481,12 @@ TEST_F(PGLogTest, proc_replica_log) {
}
/* +--------------------------+
- | log olog |
+ | olog log |
+--------+-------+---------+
| |object | |
|version | hash | version |
| | | |
- tail > (1,1) | x5 | (1,1) < tail
+ tail > (1,1) | x9 | (1,1) < tail
| | | |
| | | |
| (1,2) | x3 | (1,2) |
@@ -1504,34 +1514,38 @@ TEST_F(PGLogTest, proc_replica_log) {
pg_shard_t from;
eversion_t last_update(1, 2);
+ hobject_t divergent_object;
+ divergent_object.set_hash(0x9);
{
pg_log_entry_t e;
e.mod_desc.mark_unrollbackable();
e.version = eversion_t(1, 1);
- e.soid.set_hash(0x5);
+ e.soid = divergent_object;
log.tail = e.version;
log.log.push_back(e);
e.version = last_update;
e.soid.set_hash(0x3);
log.log.push_back(e);
- e.version = eversion_t(1,3);
- e.soid.set_hash(0x9);
+ e.version = eversion_t(2, 3);
+ e.prior_version = eversion_t(1, 1);
+ e.soid = divergent_object;
e.op = pg_log_entry_t::DELETE;
log.log.push_back(e);
log.head = e.version;
log.index();
e.version = eversion_t(1, 1);
- e.soid.set_hash(0x5);
+ e.soid = divergent_object;
olog.tail = e.version;
olog.log.push_back(e);
e.version = last_update;
e.soid.set_hash(0x3);
olog.log.push_back(e);
- e.version = eversion_t(2, 3);
- e.soid.set_hash(0x9);
+ e.version = eversion_t(1, 3);
+ e.prior_version = eversion_t(1, 1);
+ e.soid = divergent_object;
e.op = pg_log_entry_t::DELETE;
olog.log.push_back(e);
olog.head = e.version;
@@ -1548,28 +1562,30 @@ TEST_F(PGLogTest, proc_replica_log) {
proc_replica_log(t, oinfo, olog, omissing, from);
EXPECT_TRUE(t.empty());
- EXPECT_FALSE(omissing.have_missing());
+ EXPECT_TRUE(omissing.have_missing());
+ EXPECT_TRUE(omissing.is_missing(divergent_object));
+ EXPECT_EQ(omissing.missing[divergent_object].have, eversion_t(0, 0));
+ EXPECT_EQ(omissing.missing[divergent_object].need, eversion_t(1, 1));
EXPECT_EQ(last_update, oinfo.last_update);
- EXPECT_EQ(last_update, oinfo.last_complete);
}
/* +--------------------------+
- | log olog |
+ | olog log |
+--------+-------+---------+
| |object | |
|version | hash | version |
| | | |
- tail > (1,1) | x5 | (1,1) < tail
+ tail > (1,1) | x9 | (1,1) < tail
| | | |
| | | |
| (1,2) | x3 | (1,2) |
| | | |
| | | |
head > (1,3) | x9 | |
- | DELETE | | |
+ | MODIFY | | |
| | | |
| | x9 | (2,3) < head
- | | | MODIFY |
+ | | | DELETE |
| | | |
+--------+-------+---------+
@@ -1594,28 +1610,30 @@ TEST_F(PGLogTest, proc_replica_log) {
e.mod_desc.mark_unrollbackable();
e.version = eversion_t(1, 1);
- e.soid.set_hash(0x5);
+ e.soid = divergent_object;
log.tail = e.version;
log.log.push_back(e);
e.version = last_update;
e.soid.set_hash(0x3);
log.log.push_back(e);
- e.version = eversion_t(1, 3);
- e.soid.set_hash(0x9);
+ e.version = eversion_t(2, 3);
+ e.prior_version = eversion_t(1, 1);
+ e.soid = divergent_object;
e.op = pg_log_entry_t::DELETE;
log.log.push_back(e);
log.head = e.version;
log.index();
e.version = eversion_t(1, 1);
- e.soid.set_hash(0x5);
+ e.soid = divergent_object;
olog.tail = e.version;
olog.log.push_back(e);
e.version = last_update;
e.soid.set_hash(0x3);
olog.log.push_back(e);
- e.version = eversion_t(2, 3);
- e.soid.set_hash(0x9);
+ e.version = eversion_t(1, 3);
+ e.prior_version = eversion_t(1, 1);
+ e.soid = divergent_object;
divergent_object = e.soid;
omissing.add(divergent_object, e.version, eversion_t());
e.op = pg_log_entry_t::MODIFY;
@@ -1629,16 +1647,18 @@ TEST_F(PGLogTest, proc_replica_log) {
EXPECT_TRUE(t.empty());
EXPECT_TRUE(omissing.have_missing());
EXPECT_TRUE(omissing.is_missing(divergent_object));
- EXPECT_EQ(eversion_t(2, 3), omissing.missing[divergent_object].need);
+ EXPECT_EQ(eversion_t(1, 3), omissing.missing[divergent_object].need);
EXPECT_EQ(olog.head, oinfo.last_update);
EXPECT_EQ(olog.head, oinfo.last_complete);
proc_replica_log(t, oinfo, olog, omissing, from);
EXPECT_TRUE(t.empty());
- EXPECT_FALSE(omissing.have_missing());
+ EXPECT_TRUE(omissing.have_missing());
+ EXPECT_TRUE(omissing.is_missing(divergent_object));
+ EXPECT_EQ(omissing.missing[divergent_object].have, eversion_t(0, 0));
+ EXPECT_EQ(omissing.missing[divergent_object].need, eversion_t(1, 1));
EXPECT_EQ(last_update, oinfo.last_update);
- EXPECT_EQ(last_update, oinfo.last_complete);
}
/* +--------------------------+
@@ -1863,6 +1883,20 @@ TEST_F(PGLogTest, merge_log_prior_version_have) {
run_test_case(t);
}
+TEST_F(PGLogTest, merge_log_split_missing_entries_at_head) {
+ TestCase t;
+ t.auth.push_back(mk_ple_mod_rb(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+ t.auth.push_back(mk_ple_mod_rb(mk_obj(1), mk_evt(15, 150), mk_evt(10, 100)));
+
+ t.div.push_back(mk_ple_mod(mk_obj(1), mk_evt(8, 70), mk_evt(8, 65)));
+
+ t.setup();
+ t.set_div_bounds(mk_evt(9, 79), mk_evt(8, 69));
+ t.set_auth_bounds(mk_evt(10, 160), mk_evt(9, 77));
+ t.final.add(mk_obj(1), mk_evt(15, 150), mk_evt(8, 70));
+ run_test_case(t);
+}
+
TEST_F(PGLogTest, filter_log_1) {
{
clear();
diff --git a/src/test/osd/types.cc b/src/test/osd/types.cc
index 83d9c0f..33324b2 100644
--- a/src/test/osd/types.cc
+++ b/src/test/osd/types.cc
@@ -20,6 +20,7 @@
#include "osd/OSDMap.h"
#include "gtest/gtest.h"
#include "common/Thread.h"
+#include "osd/ReplicatedBackend.h"
#include <sstream>
@@ -139,6 +140,7 @@ TEST(pg_interval_t, check_new_interval)
int64_t pool_id = 200;
int pg_num = 4;
__u8 min_size = 2;
+ boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(new ReplicatedBackend::RPCRecPred());
{
OSDMap::Incremental inc(epoch + 1);
inc.new_pools[pool_id].min_size = min_size;
@@ -183,6 +185,7 @@ TEST(pg_interval_t, check_new_interval)
osdmap,
lastmap,
pgid,
+ recoverable.get(),
&past_intervals));
ASSERT_TRUE(past_intervals.empty());
}
@@ -212,6 +215,7 @@ TEST(pg_interval_t, check_new_interval)
osdmap,
lastmap,
pgid,
+ recoverable.get(),
&past_intervals));
ASSERT_EQ((unsigned int)1, past_intervals.size());
ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
@@ -244,6 +248,7 @@ TEST(pg_interval_t, check_new_interval)
osdmap,
lastmap,
pgid,
+ recoverable.get(),
&past_intervals));
old_primary = new_primary;
ASSERT_EQ((unsigned int)1, past_intervals.size());
@@ -277,6 +282,7 @@ TEST(pg_interval_t, check_new_interval)
osdmap,
lastmap,
pgid,
+ recoverable.get(),
&past_intervals));
ASSERT_EQ((unsigned int)1, past_intervals.size());
ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
@@ -308,6 +314,7 @@ TEST(pg_interval_t, check_new_interval)
osdmap,
lastmap,
pgid,
+ recoverable.get(),
&past_intervals));
ASSERT_EQ((unsigned int)1, past_intervals.size());
ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
@@ -346,6 +353,7 @@ TEST(pg_interval_t, check_new_interval)
osdmap,
lastmap,
pgid,
+ recoverable.get(),
&past_intervals));
ASSERT_EQ((unsigned int)1, past_intervals.size());
ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
@@ -384,6 +392,7 @@ TEST(pg_interval_t, check_new_interval)
osdmap,
lastmap,
pgid,
+ recoverable.get(),
&past_intervals));
ASSERT_EQ((unsigned int)1, past_intervals.size());
ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
@@ -417,6 +426,7 @@ TEST(pg_interval_t, check_new_interval)
osdmap,
lastmap,
pgid,
+ recoverable.get(),
&past_intervals,
&out));
ASSERT_EQ((unsigned int)1, past_intervals.size());
@@ -468,6 +478,7 @@ TEST(pg_interval_t, check_new_interval)
osdmap,
lastmap,
pgid,
+ recoverable.get(),
&past_intervals,
&out));
ASSERT_EQ((unsigned int)1, past_intervals.size());
@@ -502,6 +513,7 @@ TEST(pg_interval_t, check_new_interval)
osdmap,
lastmap,
pgid,
+ recoverable.get(),
&past_intervals,
&out));
ASSERT_EQ((unsigned int)1, past_intervals.size());
@@ -546,6 +558,7 @@ TEST(pg_interval_t, check_new_interval)
osdmap,
lastmap,
pgid,
+ recoverable.get(),
&past_intervals,
&out));
ASSERT_EQ((unsigned int)1, past_intervals.size());
@@ -594,6 +607,7 @@ TEST(pg_interval_t, check_new_interval)
osdmap,
lastmap,
pgid,
+ recoverable.get(),
&past_intervals,
&out));
ASSERT_EQ((unsigned int)1, past_intervals.size());
diff --git a/src/test/osdc/object_cacher_stress.cc b/src/test/osdc/object_cacher_stress.cc
index 4f6fffe..ec5f926 100644
--- a/src/test/osdc/object_cacher_stress.cc
+++ b/src/test/osdc/object_cacher_stress.cc
@@ -112,7 +112,7 @@ int stress_test(uint64_t num_ops, uint64_t num_objs,
ObjectCacher::OSDWrite *wr = obc.prepare_write(snapc, bl, utime_t(), 0);
wr->extents.push_back(op->extent);
lock.Lock();
- obc.writex(wr, &object_set, lock, NULL);
+ obc.writex(wr, &object_set, NULL);
lock.Unlock();
}
}
diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc
index f9d5473..9e68946 100644
--- a/src/tools/ceph_objectstore_tool.cc
+++ b/src/tools/ceph_objectstore_tool.cc
@@ -738,10 +738,14 @@ int mark_pg_for_removal(ObjectStore *fs, spg_t pgid, ObjectStore::Transaction *t
ghobject_t pgmeta_oid(info.pgid.make_pgmeta_oid());
bufferlist bl;
- PG::peek_map_epoch(fs, pgid, &bl);
+ epoch_t pg_epoch = 0;
+ int r = PG::peek_map_epoch(fs, pgid, &pg_epoch, &bl);
+ if (r < 0)
+ cerr << __func__ << " warning: peek_map_epoch fails" << std::endl;
+
map<epoch_t,pg_interval_t> past_intervals;
__u8 struct_v;
- int r = PG::read_info(fs, pgid, coll, bl, info, past_intervals, struct_v);
+ r = PG::read_info(fs, pgid, coll, bl, info, past_intervals, struct_v);
if (r < 0) {
cerr << __func__ << " error on read_info " << cpp_strerror(-r) << std::endl;
return r;
@@ -3058,7 +3062,11 @@ int main(int argc, char **argv)
}
bufferlist bl;
- map_epoch = PG::peek_map_epoch(fs, pgid, &bl);
+ map_epoch = 0;
+ r = PG::peek_map_epoch(fs, pgid, &map_epoch, &bl);
+ if (r < 0)
+ cerr << "peek_map_epoch returns an error" << std::endl;
+
if (debug)
cerr << "map_epoch " << map_epoch << std::endl;
diff --git a/src/tools/crushtool.cc b/src/tools/crushtool.cc
index 3e36ac9..d576b5e 100644
--- a/src/tools/crushtool.cc
+++ b/src/tools/crushtool.cc
@@ -433,6 +433,12 @@ int main(int argc, const char **argv)
exit(EXIT_FAILURE);
}
tester.set_rule(x);
+ } else if (ceph_argparse_withint(args, i, &x, &err, "--ruleset", (char*)NULL)) {
+ if (!err.str().empty()) {
+ cerr << err.str() << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ tester.set_ruleset(x);
} else if (ceph_argparse_withint(args, i, &x, &err, "--batches", (char*)NULL)) {
if (!err.str().empty()) {
cerr << err.str() << std::endl;
diff --git a/src/tools/rest_bench.cc b/src/tools/rest_bench.cc
index 50340d9..6da5cf8 100644
--- a/src/tools/rest_bench.cc
+++ b/src/tools/rest_bench.cc
@@ -281,6 +281,11 @@ public:
list_bucket_handler.listBucketCallback = list_bucket_callback;
}
+ ~RESTDispatcher()
+ {
+ req_wq.drain();
+ m_tp.stop();
+ }
void process_context(req_context *ctx);
void get_obj(req_context *ctx);
void put_obj(req_context *ctx);
@@ -738,10 +743,6 @@ int main(int argc, const char **argv)
}
}
- if (bucket.empty()) {
- cerr << "rest-bench: bucket not specified" << std::endl;
- usage_exit();
- }
if (args.empty())
usage_exit();
int operation = 0;
diff --git a/src/tracing/Makefile.in b/src/tracing/Makefile.in
index 5f45778..13c3458 100644
--- a/src/tracing/Makefile.in
+++ b/src/tracing/Makefile.in
@@ -253,6 +253,7 @@ GCOV_PREFIX_STRIP = @GCOV_PREFIX_STRIP@
GIT_CHECK = @GIT_CHECK@
GREP = @GREP@
HAVE_CXX11 = @HAVE_CXX11@
+HAVE_VALGRIND = @HAVE_VALGRIND@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
diff --git a/src/upstart/ceph-mds.conf b/src/upstart/ceph-mds.conf
index 77841cd..4063d91 100644
--- a/src/upstart/ceph-mds.conf
+++ b/src/upstart/ceph-mds.conf
@@ -4,7 +4,7 @@ start on ceph-mds
stop on runlevel [!2345] or stopping ceph-mds-all
respawn
-respawn limit 5 30
+respawn limit 3 1800
limit nofile 16384 16384
diff --git a/src/upstart/ceph-mon.conf b/src/upstart/ceph-mon.conf
index 0279f15..83c9858 100644
--- a/src/upstart/ceph-mon.conf
+++ b/src/upstart/ceph-mon.conf
@@ -4,7 +4,7 @@ start on ceph-mon
stop on runlevel [!2345] or stopping ceph-mon-all
respawn
-respawn limit 5 30
+respawn limit 3 1800
limit nofile 16384 16384
diff --git a/src/upstart/ceph-osd.conf b/src/upstart/ceph-osd.conf
index d0205ee..2438c20 100644
--- a/src/upstart/ceph-osd.conf
+++ b/src/upstart/ceph-osd.conf
@@ -4,7 +4,7 @@ start on ceph-osd
stop on runlevel [!2345] or stopping ceph-osd-all
respawn
-respawn limit 5 30
+respawn limit 3 1800
limit nofile 327680 327680
diff --git a/src/vstart.sh b/src/vstart.sh
index bf863dc..87b4a57 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -12,11 +12,14 @@ else
[ -z $OBJCLASS_PATH ] && OBJCLASS_PATH=$CEPH_LIB/rados-classes
fi
+if [ -z "${CEPH_VSTART_WRAPPER}" ]; then
+ PATH=$(pwd):$PATH
+fi
+
export PYTHONPATH=./pybind
export LD_LIBRARY_PATH=$CEPH_LIB
export DYLD_LIBRARY_PATH=$LD_LIBRARY_PATH
-
# abort on failure
set -e
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git
More information about the Pkg-ceph-commits
mailing list