[Pkg-bazaar-commits] ./bzr/unstable r362: - Import stat-cache code

Fri Apr 10 07:52:03 UTC 2009

------------------------------------------------------------
revno: 362
committer: Martin Pool <mbp at sourcefrog.net>
timestamp: Thu 2005-05-05 16:24:20 +1000
message:
  - Import stat-cache code
added:
  bzrlib/cache.py
  bzrlib/status.py
-------------- next part --------------
=== added file 'bzrlib/cache.py'

--- a/bzrlib/cache.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/cache.py	2005-05-05 06:24:20 +0000
@@ -0,0 +1,156 @@
+# (C) 2005 Canonical Ltd
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+import stat, os, sha, time
+from binascii import b2a_qp, a2b_qp
+
+from trace import mutter
+
+
+# file fingerprints are: (path, size, mtime, ctime, ino, dev).
+#
+# if this is the same for this file as in the previous revision, we
+# assume the content is the same and the SHA-1 is the same.
+
+# This is stored in a fingerprint file that also contains the file-id
+# and the content SHA-1.
+
+# Thus for any given file we can quickly get the SHA-1, either from
+# the cache or, if the cache is out of date, by rehashing the file.
+
+# At the moment this is stored in a simple textfile; it might be nice
+# to use a tdb instead.
+
+
+# What we need:
+
+# build a new cache from scratch
+# load cache, incrementally update it
+
+# TODO: Have a paranoid mode where we always compare the texts and
+# always recalculate the digest, to trap modification without stat
+# change and SHA collisions.
+
+
+
+def fingerprint(path, abspath):
+    """Return (size, mtime, ctime, ino, dev) for abspath, or None.
+
+    None means the file could not be lstat'd or is a directory.
+    """
+    try:
+        st = os.lstat(abspath)
+    except OSError:
+        return None                     # might be missing, etc
+    if stat.S_ISDIR(st.st_mode):
+        return None
+    return (st.st_size, st.st_mtime, st.st_ctime, st.st_ino, st.st_dev)
+
+
+def write_cache(branch, entry_iter):
+    """Write entries to 'work-cache.tmp', then rename it to 'work-cache'."""
+    tmp = branch.controlfile('work-cache.tmp', 'wt')
+    for ent in entry_iter:
+        tmp.write(ent[0] + ' ' + ent[1] + ' ')
+        tmp.write(b2a_qp(ent[2], True))
+        tmp.write(' %d %d %d %d %d\n' % ent[3:])
+    tmp.close()
+    name_of = branch.controlfilename
+    os.rename(name_of('work-cache.tmp'), name_of('work-cache'))
+
+        
+        
+def load_cache(branch):
+    """Return {file_id: entry} from 'work-cache'; {} if it cannot be opened."""
+    # BzrError was not imported anywhere in this module (NameError on raise).
+    from errors import BzrError         # assumed home of BzrError -- confirm
+    cache = {}
+    try:
+        cachefile = branch.controlfile('work-cache', 'rt')
+    except IOError:
+        return cache
+    for l in cachefile:
+        f = l.split(' ')
+        if f[0] in cache:
+            raise BzrError("duplicated file_id in cache: {%s}" % f[0])
+        cache[f[0]] = (f[0], f[1], a2b_qp(f[2])) + tuple([long(x) for x in f[3:]])
+    return cache
+
+
+
+
+def _files_from_inventory(inv):
+    """Yield (file_id, path) for every entry in inv whose kind is 'file'."""
+    for path, ie in inv.iter_entries():
+        if ie.kind == 'file':
+            yield ie.file_id, path
+    
+
+def build_cache(branch):
+    """Build the cache from scratch from the working inventory; return it."""
+    inv = branch.read_working_inventory()
+    # Return the cache like update_cache does (this used to return None).
+    return _update_cache_from_list(branch, {}, _files_from_inventory(inv))
+    
+
+
+def update_cache(branch, inv):
+    """Load the saved cache, refresh it for the files in inv, return it."""
+    # TODO: It's supposed to be faster to stat the files in order by inum.
+    # We don't directly know the inum of the files of course but we do
+    # know where they were last sighted, so we can sort by that.
+    wanted = _files_from_inventory(inv)
+    return _update_cache_from_list(branch, load_cache(branch), wanted)
+
+
+
+def _update_cache_from_list(branch, cache, to_update):
+    """Update the cache to have info on the named files.
+
+    to_update is a sequence of (file_id, path) pairs.  cache is
+    modified in place, written out if anything changed, and returned.
+    """
+    hardcheck = dirty = 0
+    for file_id, path in to_update:
+        fap = branch.abspath(path)
+        # fingerprint() takes (path, abspath); these were passed swapped,
+        # so the relative path was being lstat'd instead of fap.
+        fp = fingerprint(path, fap)
+        cacheentry = cache.get(file_id)
+
+        if fp is None: # not here
+            if cacheentry:
+                del cache[file_id]
+                dirty += 1
+            continue
+
+        if cacheentry and (cacheentry[3:] == fp):
+            continue                    # all stat fields unchanged
+
+        hardcheck += 1
+        dig = sha.new(file(fap, 'rb').read()).hexdigest()
+
+        if cacheentry is None or dig != cacheentry[1]:
+            # if there was no previous entry for this file, or if the
+            # SHA has changed, then update the cache
+            cacheentry = (file_id, dig, path) + fp
+            cache[file_id] = cacheentry
+            dirty += 1
+
+    mutter('work cache: read %d files, %d changed' % (hardcheck, dirty))
+    if dirty:
+        write_cache(branch, cache.itervalues())
+    return cache

=== added file 'bzrlib/status.py'
--- a/bzrlib/status.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/status.py	2005-05-05 06:24:20 +0000
@@ -0,0 +1,52 @@
+# (C) 2005 Canonical Ltd
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+
+def find_modified(branch):
+    """Yield each file that has been modified in the working copy.
+
+    This is a generator of (PATH, ENTRY) pairs taken from the working
+    inventory.  Renames are not considered, and files that were added
+    or deleted are not included.
+
+    A file counts as unmodified when its cached size and SHA-1 both
+    equal the text_size and text_sha1 recorded in the basis inventory.
+    """
+    import cache
+
+    inv = branch.read_working_inventory()
+    stat_cache = cache.update_cache(branch, inv)
+    basis_inv = branch.basis_tree().inventory
+
+    for path, entry in inv.iter_entries():
+        if entry.kind != 'file':
+            continue
+
+        file_id = entry.file_id
+        cached = stat_cache.get(file_id)
+        if not cached:
+            continue                    # not in working dir
+        if file_id not in basis_inv:
+            continue                    # newly added
+
+        # cached is (file_id, sha1, path, size, ...)
+        basis = basis_inv[file_id]
+        same_size = (basis.text_size == cached[3])
+        if same_size and basis.text_sha1 == cached[1]:
+            continue
+        yield path, entry
+