[Pkg-bazaar-commits] ./bzr/unstable r205: Revfile: store and retrieve deltas! mdiff: work on bytes not lines
mbp at sourcefrog.net
mbp at sourcefrog.net
Fri Apr 10 07:44:02 UTC 2009
------------------------------------------------------------
revno: 205
committer: mbp at sourcefrog.net
timestamp: Sat 2005-04-09 14:38:18 +1000
message:
Revfile:
- store and retrieve deltas!
mdiff:
- work on bytes not lines
modified:
bzrlib/mdiff.py
bzrlib/revfile.py
-------------- next part --------------
=== modified file 'bzrlib/mdiff.py'
--- a/bzrlib/mdiff.py 2005-04-09 02:49:04 +0000
+++ b/bzrlib/mdiff.py 2005-04-09 04:38:18 +0000
@@ -1,4 +1,12 @@
# (C) 2005 Matt Mackall
+# (C) 2005 Canonical Ltd
+
+# based on code by Matt Mackall, hacked by Martin Pool
+
+# mm's code works line-by-line; this just works on byte strings.
+# Possibly slower; possibly gives better results for code not
+# regularly separated by newlines and anyhow a bit simpler.
+
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -14,63 +22,68 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+# TODO: maybe work on files not strings?
+
+
import difflib, sys, struct
-
-def linesplit(a):
- al, ap = [], []
- last = 0
-
- n = a.find("\n") + 1
- while n > 0:
- ap.append(last)
- al.append(a[last:n])
- last = n
- n = a.find("\n", n) + 1
-
- return (al, ap)
+from cStringIO import StringIO
def diff(a, b):
- (al, ap) = linesplit(a)
- (bl, bp) = linesplit(b)
-
- d = difflib.SequenceMatcher(None, al, bl)
- ops = []
+ d = difflib.SequenceMatcher(None, a, b)
for o, m, n, s, t in d.get_opcodes():
if o == 'equal': continue
- ops.append((ap[m], ap[n], "".join(bl[s:t])))
+ # a[m:n] should be replaced by b[s:t]
+ if s == t:
+ yield m, n, ''
+ else:
+ yield m, n, b[s:t]
- return ops
def tobinary(ops):
- b = ""
+ b = StringIO()
for f in ops:
- b += struct.pack(">lll", f[0], f[1], len(f[2])) + f[2]
- return b
+ b.write(struct.pack(">III", f[0], f[1], len(f[2])))
+ b.write(f[2])
+ return b.getvalue()
+
def bdiff(a, b):
return tobinary(diff(a, b))
+
def patch(t, ops):
last = 0
- r = []
-
- for p1, p2, sub in ops:
- r.append(t[last:p1])
- r.append(sub)
- last = p2
-
- r.append(t[last:])
- return "".join(r)
+ b = StringIO()
+
+ for m, n, r in ops:
+ b.write(t[last:m])
+ if r:
+ b.write(r)
+ last = n
+
+ b.write(t[last:])
+ return b.getvalue()
+
def frombinary(b):
- ops = []
- while b:
- p = b[:12]
- m, n, l = struct.unpack(">lll", p)
- ops.append((m, n, b[12:12 + l]))
- b = b[12 + l:]
-
- return ops
+ bin = StringIO(b)
+ while True:
+ p = bin.read(12)
+ if not p:
+ break
+
+ m, n, l = struct.unpack(">III", p)
+
+ if l == 0:
+ r = ''
+ else:
+ r = bin.read(l)
+ if len(r) != l:
+ raise Exception("truncated patch data")
+
+ yield m, n, r
+
def bpatch(t, b):
return patch(t, frombinary(b))
=== modified file 'bzrlib/revfile.py'
--- a/bzrlib/revfile.py 2005-04-09 04:01:25 +0000
+++ b/bzrlib/revfile.py 2005-04-09 04:38:18 +0000
@@ -88,6 +88,7 @@
I_OFFSET = 3
I_LEN = 4
+
class RevfileError(Exception):
pass
@@ -150,53 +151,68 @@
return text
- def _add_full_text(self, t):
- """Add a full text to the file.
-
- This is not compressed against any reference version.
-
- Returns the index for that text."""
+ def _check_index(self, idx):
+ if idx < 0 or idx > len(self):
+ raise RevfileError("invalid index %r" % idx)
+
+
+ def find_sha(self, s):
+ assert isinstance(s, str)
+ assert len(s) == 20
+
+ for idx, idxrec in enumerate(self):
+ if idxrec[I_SHA] == s:
+ return idx
+ else:
+ return _NO_RECORD
+
+
+ def _add_common(self, text_sha, data, flags, base):
+ """Add pre-processed data, can be either full text or delta."""
idx = len(self)
self.datafile.seek(0, 2) # to end
self.idxfile.seek(0, 2)
assert self.idxfile.tell() == _RECORDSIZE * (idx + 1)
data_offset = self.datafile.tell()
- assert isinstance(t, str) # not unicode or anything wierd
+ assert isinstance(data, str) # not unicode or anything wierd
- self.datafile.write(t)
+ self.datafile.write(data)
self.datafile.flush()
- entry = sha.new(t).digest()
- entry += struct.pack(">IIII12x", 0xFFFFFFFFL, 0, data_offset, len(t))
+ assert isinstance(text_sha, str)
+ entry = text_sha
+ entry += struct.pack(">IIII12x", base, flags, data_offset, len(data))
assert len(entry) == _RECORDSIZE
self.idxfile.write(entry)
self.idxfile.flush()
return idx
-
-
- def _check_index(self, idx):
- if idx < 0 or idx > len(self):
- raise RevfileError("invalid index %r" % idx)
-
-
- def find_sha(self, s):
- assert isinstance(s, str)
- assert len(s) == 20
- for idx, idxrec in enumerate(self):
- if idxrec[I_SHA] == s:
- return idx
- else:
- return _NO_RECORD
-
-
- def _add_diff(self, text, base):
+
+
+ def _add_full_text(self, text):
+ """Add a full text to the file.
+
+ This is not compressed against any reference version.
+
+ Returns the index for that text."""
+ return self._add_common(sha.new(text).digest(), text, 0, _NO_RECORD)
+
+
+ def _add_delta(self, text, base):
"""Add a text stored relative to a previous text."""
self._check_index(base)
text_sha = sha.new(text).digest()
+ base_text = self.get(base)
+ data = mdiff.bdiff(base_text, text)
+ return self._add_common(text_sha, data, 0, base)
+
+
+ def add(self, text, base=None):
+ # TODO: check it's not already present?
+ assert 0
def addrevision(self, text, changeset):
@@ -221,28 +237,61 @@
open(self.indexfile(), "a").write(entry)
open(self.datafile(), "a").write(data)
- def _get_full_text(self, idx):
+
+ def get(self, idx):
idxrec = self[idx]
- assert idxrec[I_FLAGS] == 0
- assert idxrec[I_BASE] == _NO_RECORD
-
+ base = idxrec[I_BASE]
+ if base == _NO_RECORD:
+ text = self._get_full_text(idx, idxrec)
+ else:
+ text = self._get_patched(idx, idxrec)
+
+ if sha.new(text).digest() != idxrec[I_SHA]:
+ raise RevfileError("corrupt SHA-1 digest on record %d"
+ % idx)
+
+ return text
+
+
+
+ def _get_raw(self, idx, idxrec):
l = idxrec[I_LEN]
if l == 0:
return ''
self.datafile.seek(idxrec[I_OFFSET])
- text = self.datafile.read(l)
- if len(text) != l:
+ data = self.datafile.read(l)
+ if len(data) != l:
raise RevfileError("short read %d of %d "
"getting text for record %d in %r"
- % (len(text), l, idx, self.basename))
+ % (len(data), l, idx, self.basename))
- if sha.new(text).digest() != idxrec[I_SHA]:
- raise RevfileError("corrupt SHA-1 digest on record %d"
- % idx)
+ return data
- return text
+
+ def _get_full_text(self, idx, idxrec):
+ assert idxrec[I_FLAGS] == 0
+ assert idxrec[I_BASE] == _NO_RECORD
+
+ text = self._get_raw(idx, idxrec)
+
+ return text
+
+
+ def _get_patched(self, idx, idxrec):
+ assert idxrec[I_FLAGS] == 0
+ base = idxrec[I_BASE]
+ assert base >= 0
+ assert base < idx # no loops!
+
+ base_text = self.get(base)
+ patch = self._get_raw(idx, idxrec)
+
+ text = mdiff.bpatch(base_text, patch)
+
+ return text
+
def __len__(self):
@@ -263,6 +312,8 @@
def _seek_index(self, idx):
+ if idx < 0:
+ raise RevfileError("invalid index %r" % idx)
self.idxfile.seek((idx + 1) * _RECORDSIZE)
@@ -302,6 +353,7 @@
except IndexError:
sys.stderr.write("usage: revfile dump\n"
" revfile add\n"
+ " revfile add-delta BASE\n"
" revfile get IDX\n"
" revfile find-sha HEX\n")
return 1
@@ -310,6 +362,9 @@
if cmd == 'add':
new_idx = r._add_full_text(sys.stdin.read())
print 'added idx %d' % new_idx
+ elif cmd == 'add-delta':
+ new_idx = r._add_delta(sys.stdin.read(), int(argv[2]))
+ print 'added idx %d' % new_idx
elif cmd == 'dump':
r.dump()
elif cmd == 'get':
@@ -323,7 +378,7 @@
sys.stderr.write("invalid index %r\n" % idx)
return 1
- sys.stdout.write(r._get_full_text(idx))
+ sys.stdout.write(r.get(idx))
elif cmd == 'find-sha':
try:
s = unhexlify(argv[2])
More information about the Pkg-bazaar-commits
mailing list