[Pkg-bazaar-commits] ./bzr/unstable r846: - start adding refactored/simplified hash cache
Martin Pool
mbp at sourcefrog.net
Fri Apr 10 08:21:14 UTC 2009
------------------------------------------------------------
revno: 846
committer: Martin Pool <mbp at sourcefrog.net>
timestamp: Wed 2005-07-06 20:07:31 +1000
message:
- start adding refactored/simplified hash cache
not used yet
added:
bzrlib/hashcache.py
bzrlib/selftest/testhashcache.py
modified:
bzrlib/selftest/__init__.py
bzrlib/statcache.py
-------------- next part --------------
=== added file 'bzrlib/hashcache.py'
--- a/bzrlib/hashcache.py 1970-01-01 00:00:00 +0000
+++ b/bzrlib/hashcache.py 2005-07-06 10:07:31 +0000
@@ -0,0 +1,127 @@
+# (C) 2005 Canonical Ltd
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+
+
+def _fingerprint(abspath):
+ import os, stat
+
+ try:
+ fs = os.lstat(abspath)
+ except OSError:
+ # might be missing, etc
+ return None
+
+ if stat.S_ISDIR(fs.st_mode):
+ return None
+
+ return (fs.st_size, fs.st_mtime,
+ fs.st_ctime, fs.st_ino, fs.st_dev)
+
+
+class HashCache(object):
+ """Cache for looking up file SHA-1.
+
+ Files are considered to match the cached value if the fingerprint
+ of the file has not changed. This includes its mtime, ctime,
+ device number, inode number, and size. This should catch
+ modifications or replacement of the file by a new one.
+
+ This may not catch modifications that do not change the file's
+ size and that occur within the resolution window of the
+ timestamps. To handle this we specifically do not cache files
+ which have changed since the start of the present second, since
+ they could undetectably change again.
+
+ This scheme may fail if the machine's clock steps backwards.
+ Don't do that.
+
+ This does not canonicalize the paths passed in; that should be
+ done by the caller.
+
+ cache_sha1
+ Indexed by path, gives the SHA-1 of the file.
+
+ validator
+ Indexed by path, gives the fingerprint of the file last time it was read.
+
+ stat_count
+ number of times files have been statted
+
+ hit_count
+ number of times files have been retrieved from the cache, avoiding a
+ re-read
+
+ miss_count
+ number of misses (times files have been completely re-read)
+ """
+ def __init__(self, basedir):
+ self.basedir = basedir
+ self.hit_count = 0
+ self.miss_count = 0
+ self.stat_count = 0
+ self.danger_count = 0
+ self.cache_sha1 = {}
+ self.validator = {}
+
+
+ def clear(self):
+ """Discard all cached information."""
+ self.validator = {}
+ self.cache_sha1 = {}
+
+
+ def get_sha1(self, path):
+ """Return the hex SHA-1 of the contents of the file at path.
+
+ Returns None if the file cannot be statted (e.g. it is missing) or is a directory.
+ """
+
+ import os, time
+ from bzrlib.osutils import sha_file
+
+ abspath = os.path.join(self.basedir, path)
+ fp = _fingerprint(abspath)
+ cache_fp = self.validator.get(path)
+
+ self.stat_count += 1
+
+ if not fp:
+ # could not be statted (e.g. missing), or is a directory
+ return None
+ elif cache_fp and (cache_fp == fp):
+ self.hit_count += 1
+ return self.cache_sha1[path]
+ else:
+ self.miss_count += 1
+ digest = sha_file(file(abspath, 'rb'))
+
+ now = int(time.time())
+ if fp[1] >= now or fp[2] >= now:
+ # changed too recently; can't be cached. we can
+ # return the result and it could possibly be cached
+ # next time.
+ self.danger_count += 1
+ if cache_fp:
+ del self.validator[path]
+ del self.cache_sha1[path]
+ else:
+ self.validator[path] = fp
+ self.cache_sha1[path] = digest
+
+ return digest
+
=== modified file 'bzrlib/selftest/__init__.py'
--- a/bzrlib/selftest/__init__.py 2005-07-06 05:24:29 +0000
+++ b/bzrlib/selftest/__init__.py 2005-07-06 10:07:31 +0000
@@ -26,6 +26,7 @@
import bzrlib.selftest.blackbox
import bzrlib.selftest.versioning
import bzrlib.selftest.testmerge3
+ import bzrlib.selftest.testhashcache
import bzrlib.merge_core
from doctest import DocTestSuite
import os
@@ -57,10 +58,11 @@
for cl in (bzrlib.selftest.whitebox.TEST_CLASSES
+ bzrlib.selftest.versioning.TEST_CLASSES
+ bzrlib.selftest.testmerge3.TEST_CLASSES
+ + bzrlib.selftest.testhashcache.TEST_CLASSES
+ bzrlib.selftest.blackbox.TEST_CLASSES):
suite.addTest(cl())
- return run_suite(suite)
+ return run_suite(suite, 'testbzr')
=== added file 'bzrlib/selftest/testhashcache.py'
--- a/bzrlib/selftest/testhashcache.py 1970-01-01 00:00:00 +0000
+++ b/bzrlib/selftest/testhashcache.py 2005-07-06 10:07:31 +0000
@@ -0,0 +1,89 @@
+# (C) 2005 Canonical Ltd
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+from bzrlib.selftest import InTempDir
+
+
+
+def sha1(t):
+ import sha
+ return sha.new(t).hexdigest()
+
+
+def pause():
+ import time
+ # wait until the clock ticks over to the next whole second
+ start = int(time.time())
+ while int(time.time()) == start:
+ time.sleep(0.2)
+
+
+
+class TestStatCache(InTempDir):
+ """Functional tests for statcache"""
+ def runTest(self):
+ from bzrlib.hashcache import HashCache
+ import os
+ import time
+
+ hc = HashCache('.')
+
+ file('foo', 'wb').write('hello')
+ os.mkdir('subdir')
+ pause()
+
+ self.assertEquals(hc.get_sha1('foo'),
+ 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d')
+ self.assertEquals(hc.miss_count, 1)
+ self.assertEquals(hc.hit_count, 0)
+
+ # check we hit without re-reading
+ self.assertEquals(hc.get_sha1('foo'),
+ 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d')
+ self.assertEquals(hc.miss_count, 1)
+ self.assertEquals(hc.hit_count, 1)
+
+ # check again without re-reading
+ self.assertEquals(hc.get_sha1('foo'),
+ 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d')
+ self.assertEquals(hc.miss_count, 1)
+ self.assertEquals(hc.hit_count, 2)
+
+ # write new file and make sure it is seen
+ file('foo', 'wb').write('goodbye')
+ pause()
+ self.assertEquals(hc.get_sha1('foo'),
+ '3c8ec4874488f6090a157b014ce3397ca8e06d4f')
+ self.assertEquals(hc.miss_count, 2)
+
+ # quickly write new file of same size and make sure it is seen
+ # this may rely on detection of timestamps that are too close
+ # together to be safe
+ file('foo', 'wb').write('g00dbye')
+ self.assertEquals(hc.get_sha1('foo'),
+ sha1('g00dbye'))
+
+ # this is not quite guaranteed to be true; we might have
+ # crossed a 1s boundary before
+ self.assertEquals(hc.danger_count, 1)
+
+ self.assertEquals(hc.get_sha1('subdir'), None)
+
+
+
+TEST_CLASSES = [
+ TestStatCache,
+ ]
=== modified file 'bzrlib/statcache.py'
--- a/bzrlib/statcache.py 2005-07-06 05:33:53 +0000
+++ b/bzrlib/statcache.py 2005-07-06 10:07:31 +0000
@@ -40,6 +40,14 @@
implemented, and in particular should not depend on the particular
data which is stored or its format.
+The cache maintains a mapping from filename to the SHA-1 of the
+content of the file.
+
+The cache also stores a fingerprint of (size, mtime, ctime, ino, dev)
+which is used to validate that the entry is up-to-date.
+
+
+
This is done by maintaining a cache indexed by a file fingerprint of
(path, size, mtime, ctime, ino, dev) pointing to the SHA-1. If the
fingerprint has not changed, we assume the file content has not changed
More information about the Pkg-bazaar-commits
mailing list