[Pkg-bazaar-commits] ./bzr/unstable r205: Revfile:- store and retrieve deltas!mdiff:- work on bytes not lines

Fri Apr 10 07:44:02 UTC 2009

------------------------------------------------------------
revno: 205
committer: mbp at sourcefrog.net
timestamp: Sat 2005-04-09 14:38:18 +1000
message:
  Revfile:
  - store and retrieve deltas!
  mdiff:
  - work on bytes not lines
modified:
  bzrlib/mdiff.py
  bzrlib/revfile.py
-------------- next part --------------
=== modified file 'bzrlib/mdiff.py'

--- a/bzrlib/mdiff.py	2005-04-09 02:49:04 +0000
+++ b/bzrlib/mdiff.py	2005-04-09 04:38:18 +0000
@@ -1,4 +1,12 @@
 # (C) 2005 Matt Mackall
+# (C) 2005 Canonical Ltd
+
+# based on code by Matt Mackall, hacked by Martin Pool
+
+# mm's code works line-by-line; this just works on byte strings.
+# Possibly slower; possibly gives better results for code not
+# regularly separated by newlines and anyhow a bit simpler.
+
 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -14,63 +22,68 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
+
+# TODO: maybe work on files not strings?
+
+
 import difflib, sys, struct
-
-def linesplit(a):
-    al, ap = [], []
-    last = 0
-
-    n = a.find("\n") + 1
-    while n > 0:
-        ap.append(last)
-        al.append(a[last:n])
-        last = n
-        n = a.find("\n", n) + 1
-
-    return (al, ap)
+from cStringIO import StringIO
 
 def diff(a, b):
-    (al, ap) = linesplit(a)
-    (bl, bp) = linesplit(b)
-
-    d = difflib.SequenceMatcher(None, al, bl)
-    ops = []
+    d = difflib.SequenceMatcher(None, a, b)
     for o, m, n, s, t in d.get_opcodes():
         if o == 'equal': continue
-        ops.append((ap[m], ap[n], "".join(bl[s:t])))
+        # a[m:n] should be replaced by b[s:t]
+        if s == t:
+            yield m, n, ''
+        else:
+            yield m, n, b[s:t]
 
-    return ops
 
 def tobinary(ops):
-    b = ""
+    b = StringIO()
     for f in ops:
-        b += struct.pack(">lll", f[0], f[1], len(f[2])) + f[2]
-    return b
+        b.write(struct.pack(">III", f[0], f[1], len(f[2])))
+        b.write(f[2])
+    return b.getvalue()
+
 
 def bdiff(a, b):
     return tobinary(diff(a, b))
 
+
 def patch(t, ops):
     last = 0
-    r = []
-
-    for p1, p2, sub in ops:
-        r.append(t[last:p1])
-        r.append(sub)
-        last = p2
-
-    r.append(t[last:])
-    return "".join(r)
+    b = StringIO()
+
+    for m, n, r in ops:
+        b.write(t[last:m])
+        if r:
+            b.write(r)
+        last = n
+        
+    b.write(t[last:])
+    return b.getvalue()
+
 
 def frombinary(b):
-    ops = []
-    while b:
-        p = b[:12]
-        m, n, l = struct.unpack(">lll", p)
-        ops.append((m, n, b[12:12 + l]))
-        b = b[12 + l:]
-
-    return ops
+    bin = StringIO(b)
+    while True:
+        p = bin.read(12)
+        if not p:
+            break
+
+        m, n, l = struct.unpack(">III", p)
+        
+        if l == 0:
+            r = ''
+        else:
+            r = bin.read(l)
+            if len(r) != l:
+                raise Exception("truncated patch data")
+            
+        yield m, n, r
+
 
 def bpatch(t, b):
     return patch(t, frombinary(b))

=== modified file 'bzrlib/revfile.py'
--- a/bzrlib/revfile.py	2005-04-09 04:01:25 +0000
+++ b/bzrlib/revfile.py	2005-04-09 04:38:18 +0000
@@ -88,6 +88,7 @@
 I_OFFSET = 3
 I_LEN = 4
 
+
 class RevfileError(Exception):
     pass
 
@@ -150,53 +151,68 @@
         return text    
 
 
-    def _add_full_text(self, t):
-        """Add a full text to the file.
-
-        This is not compressed against any reference version.
-
-        Returns the index for that text."""
+    def _check_index(self, idx):
+        if idx < 0 or idx > len(self):
+            raise RevfileError("invalid index %r" % idx)
+
+
+    def find_sha(self, s):
+        assert isinstance(s, str)
+        assert len(s) == 20
+        
+        for idx, idxrec in enumerate(self):
+            if idxrec[I_SHA] == s:
+                return idx
+        else:
+            return _NO_RECORD        
+
+
+    def _add_common(self, text_sha, data, flags, base):
+        """Add pre-processed data, can be either full text or delta."""
         idx = len(self)
         self.datafile.seek(0, 2)        # to end
         self.idxfile.seek(0, 2)
         assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)
         data_offset = self.datafile.tell()
 
-        assert isinstance(t, str) # not unicode or anything wierd
+        assert isinstance(data, str) # not unicode or anything wierd
 
-        self.datafile.write(t)
+        self.datafile.write(data)
         self.datafile.flush()
 
-        entry = sha.new(t).digest()
-        entry += struct.pack(">IIII12x", 0xFFFFFFFFL, 0, data_offset, len(t))
+        assert isinstance(text_sha, str)
+        entry = text_sha
+        entry += struct.pack(">IIII12x", base, flags, data_offset, len(data))
         assert len(entry) == _RECORDSIZE
 
         self.idxfile.write(entry)
         self.idxfile.flush()
 
         return idx
-
-
-    def _check_index(self, idx):
-        if idx < 0 or idx > len(self):
-            raise RevfileError("invalid index %r" % idx)
-
-
-    def find_sha(self, s):
-        assert isinstance(s, str)
-        assert len(s) == 20
         
-        for idx, idxrec in enumerate(self):
-            if idxrec[I_SHA] == s:
-                return idx
-        else:
-            return _NO_RECORD        
-
-
-    def _add_diff(self, text, base):
+
+
+    def _add_full_text(self, text):
+        """Add a full text to the file.
+
+        This is not compressed against any reference version.
+
+        Returns the index for that text."""
+        return self._add_common(sha.new(text).digest(), text, 0, _NO_RECORD)
+
+
+    def _add_delta(self, text, base):
         """Add a text stored relative to a previous text."""
         self._check_index(base)
         text_sha = sha.new(text).digest()
+        base_text = self.get(base)
+        data = mdiff.bdiff(base_text, text)
+        return self._add_common(text_sha, data, 0, base)
+
+
+    def add(self, text, base=None):
+        # TODO: check it's not already present?
+        assert 0
 
         
     def addrevision(self, text, changeset):
@@ -221,28 +237,61 @@
         open(self.indexfile(), "a").write(entry)
         open(self.datafile(), "a").write(data)
 
-    def _get_full_text(self, idx):
+
+    def get(self, idx):
         idxrec = self[idx]
-        assert idxrec[I_FLAGS] == 0
-        assert idxrec[I_BASE] == _NO_RECORD
-
+        base = idxrec[I_BASE]
+        if base == _NO_RECORD:
+            text = self._get_full_text(idx, idxrec)
+        else:
+            text = self._get_patched(idx, idxrec)
+
+        if sha.new(text).digest() != idxrec[I_SHA]:
+            raise RevfileError("corrupt SHA-1 digest on record %d"
+                               % idx)
+
+        return text
+
+
+
+    def _get_raw(self, idx, idxrec):
         l = idxrec[I_LEN]
         if l == 0:
             return ''
 
         self.datafile.seek(idxrec[I_OFFSET])
 
-        text = self.datafile.read(l)
-        if len(text) != l:
+        data = self.datafile.read(l)
+        if len(data) != l:
             raise RevfileError("short read %d of %d "
                                "getting text for record %d in %r"
-                               % (len(text), l, idx, self.basename))
+                               % (len(data), l, idx, self.basename))
 
-        if sha.new(text).digest() != idxrec[I_SHA]:
-            raise RevfileError("corrupt SHA-1 digest on record %d"
-                               % idx)
+        return data
         
-        return text
+
+    def _get_full_text(self, idx, idxrec):
+        assert idxrec[I_FLAGS] == 0
+        assert idxrec[I_BASE] == _NO_RECORD
+
+        text = self._get_raw(idx, idxrec)
+
+        return text
+
+
+    def _get_patched(self, idx, idxrec):
+        assert idxrec[I_FLAGS] == 0
+        base = idxrec[I_BASE]
+        assert base >= 0
+        assert base < idx    # no loops!
+
+        base_text = self.get(base)
+        patch = self._get_raw(idx, idxrec)
+
+        text = mdiff.bpatch(base_text, patch)
+
+        return text
+
 
 
     def __len__(self):
@@ -263,6 +312,8 @@
 
 
     def _seek_index(self, idx):
+        if idx < 0:
+            raise RevfileError("invalid index %r" % idx)
         self.idxfile.seek((idx + 1) * _RECORDSIZE)
         
 
@@ -302,6 +353,7 @@
     except IndexError:
         sys.stderr.write("usage: revfile dump\n"
                          "       revfile add\n"
+                         "       revfile add-delta BASE\n"
                          "       revfile get IDX\n"
                          "       revfile find-sha HEX\n")
         return 1
@@ -310,6 +362,9 @@
     if cmd == 'add':
         new_idx = r._add_full_text(sys.stdin.read())
         print 'added idx %d' % new_idx
+    elif cmd == 'add-delta':
+        new_idx = r._add_delta(sys.stdin.read(), int(argv[2]))
+        print 'added idx %d' % new_idx
     elif cmd == 'dump':
         r.dump()
     elif cmd == 'get':
@@ -323,7 +378,7 @@
             sys.stderr.write("invalid index %r\n" % idx)
             return 1
 
-        sys.stdout.write(r._get_full_text(idx))
+        sys.stdout.write(r.get(idx))
     elif cmd == 'find-sha':
         try:
             s = unhexlify(argv[2])