[Pkg-bazaar-commits] ./bzr/unstable r846: - start adding refactored/simplified hash cache

Martin Pool mbp at sourcefrog.net
Fri Apr 10 08:21:14 UTC 2009


------------------------------------------------------------
revno: 846
committer: Martin Pool <mbp at sourcefrog.net>
timestamp: Wed 2005-07-06 20:07:31 +1000
message:
  - start adding refactored/simplified hash cache
    not used yet
added:
  bzrlib/hashcache.py
  bzrlib/selftest/testhashcache.py
modified:
  bzrlib/selftest/__init__.py
  bzrlib/statcache.py
-------------- next part --------------
=== added file 'bzrlib/hashcache.py'
--- a/bzrlib/hashcache.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/hashcache.py	2005-07-06 10:07:31 +0000
@@ -0,0 +1,127 @@
+# (C) 2005 Canonical Ltd
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+
+
+def _fingerprint(abspath):
+    """Return (size, mtime, ctime, ino, dev) for abspath, or None."""
+    import os, stat
+    try:
+        fs = os.lstat(abspath)  # lstat does not follow symlinks
+    except OSError:
+        # might be missing, etc
+        return None
+
+    if stat.S_ISDIR(fs.st_mode):
+        return None  # directories are not fingerprinted
+
+    return (fs.st_size, fs.st_mtime,
+            fs.st_ctime, fs.st_ino, fs.st_dev)
+
+
+class HashCache(object):
+    """Cache for looking up file SHA-1.
+
+    Files are considered to match the cached value if the fingerprint
+    of the file has not changed.  This includes its mtime, ctime,
+    device number, inode number, and size.  This should catch
+    modifications or replacement of the file by a new one.
+
+    This may not catch modifications that do not change the file's size and
+    that occur within the resolution window of the timestamps.  To handle this
+    we specifically do not cache files which have changed since the start of
+    the present second, since they could undetectably change again.
+
+    This scheme may fail if the machine's clock steps backwards.
+    Don't do that.
+
+    This does not canonicalize the paths passed in; that should be
+    done by the caller.
+
+    cache_sha1
+        Indexed by path, gives the SHA-1 of the file.
+
+    validator
+        Indexed by path, gives the fingerprint of the file last time it was read.
+
+    stat_count
+        number of times files have been statted
+
+    hit_count
+        number of times the SHA-1 was served from the cache without a re-read
+
+    miss_count
+        number of misses (times files have been completely re-read)
+
+    danger_count
+        number of files read whose mtime or ctime was too recent to cache
+    """
+    def __init__(self, basedir):
+        """Create an empty cache for files under basedir."""
+        self.basedir = basedir
+        self.hit_count = 0
+        self.miss_count = 0
+        self.stat_count = 0
+        self.danger_count = 0
+        self.cache_sha1 = {}
+        self.validator = {}
+
+    def clear(self):
+        """Discard all cached information."""
+        self.validator = {}
+        self.cache_sha1 = {}
+
+    def get_sha1(self, path):
+        """Return the hex SHA-1 of the contents of the file at path.
+
+        path is joined onto basedir.  Returns None if the file cannot be
+        statted or is a directory.
+        """
+        import os, time
+        from bzrlib.osutils import sha_file
+
+        abspath = os.path.join(self.basedir, path)
+        fp = _fingerprint(abspath)
+        cache_fp = self.validator.get(path)
+
+        self.stat_count += 1
+
+        if not fp:
+            # no fingerprint: the file could not be statted or is a directory
+            return None
+        elif cache_fp and (cache_fp == fp):
+            self.hit_count += 1
+            return self.cache_sha1[path]
+        else:
+            self.miss_count += 1
+            digest = sha_file(file(abspath, 'rb'))
+
+            now = int(time.time())
+            if fp[1] >= now or fp[2] >= now:
+                # changed too recently; can't be cached.  we can
+                # return the result and it could possibly be cached
+                # next time.
+                self.danger_count += 1
+                if cache_fp:
+                    del self.validator[path]
+                    del self.cache_sha1[path]
+            else:
+                self.validator[path] = fp
+                self.cache_sha1[path] = digest
+
+            return digest
+

=== modified file 'bzrlib/selftest/__init__.py'
--- a/bzrlib/selftest/__init__.py	2005-07-06 05:24:29 +0000
+++ b/bzrlib/selftest/__init__.py	2005-07-06 10:07:31 +0000
@@ -26,6 +26,7 @@
     import bzrlib.selftest.blackbox
     import bzrlib.selftest.versioning
     import bzrlib.selftest.testmerge3
+    import bzrlib.selftest.testhashcache
     import bzrlib.merge_core
     from doctest import DocTestSuite
     import os
@@ -57,10 +58,11 @@
     for cl in (bzrlib.selftest.whitebox.TEST_CLASSES 
                + bzrlib.selftest.versioning.TEST_CLASSES
                + bzrlib.selftest.testmerge3.TEST_CLASSES
+               + bzrlib.selftest.testhashcache.TEST_CLASSES
                + bzrlib.selftest.blackbox.TEST_CLASSES):
         suite.addTest(cl())
 
-    return run_suite(suite)
+    return run_suite(suite, 'testbzr')
 
 
 

=== added file 'bzrlib/selftest/testhashcache.py'
--- a/bzrlib/selftest/testhashcache.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/selftest/testhashcache.py	2005-07-06 10:07:31 +0000
@@ -0,0 +1,89 @@
+# (C) 2005 Canonical Ltd
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+from bzrlib.selftest import InTempDir
+
+
+def sha1(t):
+    """Return the hex SHA-1 digest of the string t."""
+    import sha
+    return sha.new(t).hexdigest()
+
+
+def pause():
+    """Block until the clock reaches the next whole second."""
+    import time
+    # poll until int(time.time()) moves past the second we started in
+    start = int(time.time())
+    while int(time.time()) == start:
+        time.sleep(0.2)
+
+
+class TestStatCache(InTempDir):
+    """Functional tests for HashCache (bzrlib.hashcache)."""
+    def runTest(self):
+        from bzrlib.hashcache import HashCache
+        import os
+        import time
+
+        hc = HashCache('.')
+
+        file('foo', 'wb').write('hello')
+        os.mkdir('subdir')
+        pause()  # age the files past the current second so they can be cached
+
+        self.assertEquals(hc.get_sha1('foo'),
+                          'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d')
+        self.assertEquals(hc.miss_count, 1)
+        self.assertEquals(hc.hit_count, 0)
+
+        # check we hit without re-reading
+        self.assertEquals(hc.get_sha1('foo'),
+                          'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d')
+        self.assertEquals(hc.miss_count, 1)
+        self.assertEquals(hc.hit_count, 1)
+
+        # check again without re-reading
+        self.assertEquals(hc.get_sha1('foo'),
+                          'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d')
+        self.assertEquals(hc.miss_count, 1)
+        self.assertEquals(hc.hit_count, 2)
+
+        # write new file and make sure it is seen
+        file('foo', 'wb').write('goodbye')
+        pause()
+        self.assertEquals(hc.get_sha1('foo'),
+                          '3c8ec4874488f6090a157b014ce3397ca8e06d4f')
+        self.assertEquals(hc.miss_count, 2)
+
+        # quickly write new file of same size and make sure it is seen
+        # this may rely on detection of timestamps that are too close
+        # together to be safe
+        file('foo', 'wb').write('g00dbye')
+        self.assertEquals(hc.get_sha1('foo'),
+                          sha1('g00dbye'))
+
+        # this is not quite guaranteed: if a 1s boundary was crossed between
+        # the write and the lookup, the result is cached and the count stays 0
+        self.assertEquals(hc.danger_count, 1)
+
+        self.assertEquals(hc.get_sha1('subdir'), None)
+        
+
+# test classes from this module that bzrlib.selftest adds to its suite
+TEST_CLASSES = [
+    TestStatCache,
+    ]

=== modified file 'bzrlib/statcache.py'
--- a/bzrlib/statcache.py	2005-07-06 05:33:53 +0000
+++ b/bzrlib/statcache.py	2005-07-06 10:07:31 +0000
@@ -40,6 +40,14 @@
 implemented, and in particular should not depend on the particular
 data which is stored or its format.
 
+The cache maintains a mapping from filename to the SHA-1 of the
+content of the file.
+
+The cache also stores a fingerprint of (size, mtime, ctime, ino, dev)
+which is used to validate that the entry is up-to-date.
+
+
+
 This is done by maintaining a cache indexed by a file fingerprint of
 (path, size, mtime, ctime, ino, dev) pointing to the SHA-1.  If the
 fingerprint has changed, we assume the file content has not changed



More information about the Pkg-bazaar-commits mailing list