[Reproducible-commits] [diffoscope] 15/21: Differentiate text and bytes when computing diffs
Jérémy Bobbio
lunar at moszumanska.debian.org
Mon Sep 21 17:39:29 UTC 2015
This is an automated email from the git hooks/post-receive script.
lunar pushed a commit to branch master
in repository diffoscope.
commit 555298a4b3b009c68b4fa4485fee88679a376e62
Author: Jérémy Bobbio <lunar at debian.org>
Date: Sun Sep 20 00:24:18 2015 +0200
Differentiate text and bytes when computing diffs
In some situation, we want to compare streams of already encoded UTF-8 bytes,
in some other we compare streams of unicode strings. To make it clear, we split
Difference.from_file into Difference.from_raw_readers and
Difference.from_text_readers.
We want to avoid an overhead when we use the unfiltered output of a command to
feed diff directly. So we assume commands output UTF-8 encoded bytes as its
true in most cases. This leaves up to filters to similarly return UTF-8 bytes.
They might have to decode their input from UTF-8 or other encoding if required.
We also rename Difference.from_unicode into Difference.from_text to keep
names in line.
This changes are required for Python 3 which clearly separates unicode strings
and bytes, but this should clearly help avoid encoding issues in the future.
---
diffoscope/comparators/binary.py | 4 ++--
diffoscope/comparators/deb.py | 2 +-
diffoscope/comparators/debian.py | 8 ++++----
diffoscope/comparators/device.py | 2 +-
diffoscope/comparators/directory.py | 5 +++--
diffoscope/comparators/elf.py | 16 +++++++---------
diffoscope/comparators/gettext.py | 6 +++---
diffoscope/comparators/gzip.py | 2 +-
diffoscope/comparators/java.py | 4 ++--
diffoscope/comparators/rpm.py | 2 +-
diffoscope/comparators/squashfs.py | 4 ++--
diffoscope/comparators/symlink.py | 2 +-
diffoscope/comparators/text.py | 4 ++--
diffoscope/comparators/zip.py | 4 ++--
diffoscope/difference.py | 30 +++++++++++++++++++++---------
15 files changed, 53 insertions(+), 42 deletions(-)
diff --git a/diffoscope/comparators/binary.py b/diffoscope/comparators/binary.py
index ade85f5..fea9975 100644
--- a/diffoscope/comparators/binary.py
+++ b/diffoscope/comparators/binary.py
@@ -60,12 +60,12 @@ def compare_binary_files(path1, path2, source=None):
try:
with xxd(path1) as xxd1:
with xxd(path2) as xxd2:
- return Difference.from_file(xxd1, xxd2, path1, path2, source)
+ return Difference.from_raw_readers(xxd1, xxd2, path1, path2, source)
except RequiredToolNotFound:
hexdump1 = hexdump_fallback(path1)
hexdump2 = hexdump_fallback(path2)
comment = 'xxd not available in path. Falling back to Python hexlify.\n'
- return Difference.from_unicode(hexdump1, hexdump2, path1, path2, source, comment)
+ return Difference.from_text(hexdump1, hexdump2, path1, path2, source, comment)
SMALL_FILE_THRESHOLD = 65536 # 64 kiB
diff --git a/diffoscope/comparators/deb.py b/diffoscope/comparators/deb.py
index a26f654..3dd4dbd 100644
--- a/diffoscope/comparators/deb.py
+++ b/diffoscope/comparators/deb.py
@@ -79,7 +79,7 @@ class DebFile(File):
differences = []
my_content = get_ar_content(self.path)
other_content = get_ar_content(other.path)
- differences.append(Difference.from_unicode(
+ differences.append(Difference.from_text(
my_content, other_content, self.path, other.path, source="metadata"))
with DebContainer(self).open() as my_container, \
DebContainer(other).open() as other_container:
diff --git a/diffoscope/comparators/debian.py b/diffoscope/comparators/debian.py
index 734577b..e431ef2 100644
--- a/diffoscope/comparators/debian.py
+++ b/diffoscope/comparators/debian.py
@@ -131,13 +131,13 @@ class DotChangesFile(File):
other_value = ''
if field in other.changes:
other_value = other.changes.get_as_string(field).lstrip()
- differences.append(Difference.from_unicode(
+ differences.append(Difference.from_text(
my_value, other_value,
self.path, other.path, source=field))
# compare Files as string
- differences.append(Difference.from_unicode(self.changes.get_as_string('Files'),
- other.changes.get_as_string('Files'),
- self.path, other.path, source='Files'))
+ differences.append(Difference.from_text(self.changes.get_as_string('Files'),
+ other.changes.get_as_string('Files'),
+ self.path, other.path, source='Files'))
with DotChangesContainer(self).open() as my_container, \
DotChangesContainer(other).open() as other_container:
differences.extend(my_container.compare(other_container))
diff --git a/diffoscope/comparators/device.py b/diffoscope/comparators/device.py
index 52b3268..cd9aaf0 100644
--- a/diffoscope/comparators/device.py
+++ b/diffoscope/comparators/device.py
@@ -52,4 +52,4 @@ class Device(File):
def compare(self, other, source=None):
with open(self.path) as my_content, \
open(other.path) as other_content:
- return Difference.from_file(my_content, other_content, self.name, other.name, source=source, comment="device")
+ return Difference.from_text_readers(my_content, other_content, self.name, other.name, source=source, comment="device")
diff --git a/diffoscope/comparators/directory.py b/diffoscope/comparators/directory.py
index 1ce28ea..2141572 100644
--- a/diffoscope/comparators/directory.py
+++ b/diffoscope/comparators/directory.py
@@ -45,11 +45,12 @@ class Stat(Command):
ACCESS_TIME_RE = re.compile(r'^Access: [0-9]{4}-[0-9]{2}-[0-9]{2}.*$')
def filter(self, line):
+ line = line.decode('utf-8')
line = Stat.FILE_RE.sub('', line)
line = Stat.DEVICE_RE.sub('', line)
line = Stat.INODE_RE.sub('', line)
line = Stat.ACCESS_TIME_RE.sub('', line)
- return line
+ return line.encode('utf-8')
@tool_required('lsattr')
@@ -79,7 +80,7 @@ def compare_meta(path1, path2):
try:
lsattr1 = lsattr(path1)
lsattr2 = lsattr(path2)
- differences.append(Difference.from_unicode(
+ differences.append(Difference.from_text(
lsattr1, lsattr2, path1, path2, source="lattr"))
except RequiredToolNotFound:
logger.info("Unable to find 'lsattr'.")
diff --git a/diffoscope/comparators/elf.py b/diffoscope/comparators/elf.py
index 21d8fda..7e6a656 100644
--- a/diffoscope/comparators/elf.py
+++ b/diffoscope/comparators/elf.py
@@ -30,8 +30,7 @@ class Readelf(Command):
super(Readelf, self).__init__(*args, **kwargs)
# we don't care about the name of the archive
self._archive_re = re.compile(r'^File: %s\(' % re.escape(self.path))
- self._encoded_path = self.path.encode('utf-8')
- self._basename = os.path.basename(self._encoded_path)
+ self._basename = os.path.basename(self.path)
@tool_required('readelf')
def cmdline(self):
@@ -42,9 +41,9 @@ class Readelf(Command):
def filter(self, line):
# we don't care about the name of the archive
- line = self._archive_re.sub('File: lib.a(', line)
+ line = self._archive_re.sub('File: lib.a(', line.decode('utf-8'))
# the full path can appear in the output, we need to remove it
- return line.replace(self._encoded_path, self._basename)
+ return line.replace(self.path, self._basename).encode('utf-8')
class ReadelfAll(Readelf):
def readelf_options(self):
@@ -59,8 +58,7 @@ class ObjdumpDisassemble(Command):
super(ObjdumpDisassemble, self).__init__(*args, **kwargs)
# we don't care about the name of the archive
self._archive_re = re.compile(r'^In archive %s:' % re.escape(self.path))
- self._encoded_path = self.path.encode('utf-8')
- self._basename = os.path.basename(self._encoded_path)
+ self._basename = os.path.basename(self.path)
@tool_required('objdump')
def cmdline(self):
@@ -68,9 +66,9 @@ class ObjdumpDisassemble(Command):
def filter(self, line):
# we don't care about the name of the archive
- line = self._archive_re.sub('In archive:', line)
+ line = self._archive_re.sub('In archive:', line.decode('utf-8'))
# the full path can appear in the output, we need to remove it
- return line.replace(self._encoded_path, self._basename)
+ return line.replace(self.path, self._basename).encode('utf-8')
def _compare_elf_data(path1, path2):
differences = []
@@ -104,7 +102,7 @@ class StaticLibFile(File):
# look up differences in metadata
content1 = get_ar_content(self.path)
content2 = get_ar_content(other.path)
- differences.append(Difference.from_unicode(
+ differences.append(Difference.from_text(
content1, content2, self.path, other.path, source="metadata"))
differences.extend(_compare_elf_data(self.path, other.path))
return differences
diff --git a/diffoscope/comparators/gettext.py b/diffoscope/comparators/gettext.py
index 2b890e9..4e02dea 100644
--- a/diffoscope/comparators/gettext.py
+++ b/diffoscope/comparators/gettext.py
@@ -41,15 +41,15 @@ class Msgunfmt(Command):
def filter(self, line):
if not self._encoding:
self._header.write(line)
- if line == '\n':
+ if line == b'\n':
logger.debug("unable to determine PO encoding, let's hope it's utf-8")
self._encoding = 'utf-8'
return self._header.getvalue()
found = Msgunfmt.CHARSET_RE.match(line)
if found:
- self._encoding = found.group(1).lower()
+ self._encoding = found.group(1).decode('us-ascii').lower()
return self._header.getvalue().decode(self._encoding).encode('utf-8')
- return ''
+ return b''
if self._encoding != 'utf-8':
return line.decode(self._encoding).encode('utf-8')
else:
diff --git a/diffoscope/comparators/gzip.py b/diffoscope/comparators/gzip.py
index 1f29c7c..f151d48 100644
--- a/diffoscope/comparators/gzip.py
+++ b/diffoscope/comparators/gzip.py
@@ -66,7 +66,7 @@ class GzipFile(object):
@needs_content
def compare_details(self, other, source=None):
differences = []
- differences.append(Difference.from_unicode(
+ differences.append(Difference.from_text(
self.magic_file_type, other.magic_file_type, self, other, source='metadata'))
with GzipContainer(self).open() as my_container, \
GzipContainer(other).open() as other_container:
diff --git a/diffoscope/comparators/java.py b/diffoscope/comparators/java.py
index 24c8bb2..6027f71 100644
--- a/diffoscope/comparators/java.py
+++ b/diffoscope/comparators/java.py
@@ -36,8 +36,8 @@ class Javap(Command):
return ['javap', '-verbose', '-constants', '-s', '-l', '-private', self.path]
def filter(self, line):
- if re.match(r'^(Classfile %s$| Last modified | MD5 checksum )' % re.escape(self.real_path), line):
- return ''
+ if re.match(r'^(Classfile %s$| Last modified | MD5 checksum )' % re.escape(self.real_path), line.decode('utf-8')):
+ return b''
return line
diff --git a/diffoscope/comparators/rpm.py b/diffoscope/comparators/rpm.py
index 1330eae..ddeab56 100644
--- a/diffoscope/comparators/rpm.py
+++ b/diffoscope/comparators/rpm.py
@@ -57,7 +57,7 @@ def compare_rpm_headers(path1, path2):
ts.setVSFlags(-1)
header1 = get_rpm_header(path1, ts)
header2 = get_rpm_header(path2, ts)
- return Difference.from_unicode(header1, header2, path1, path2, source="header")
+ return Difference.from_text(header1, header2, path1, path2, source="header")
class RpmContainer(Archive):
diff --git a/diffoscope/comparators/squashfs.py b/diffoscope/comparators/squashfs.py
index fb13b2f..a489742 100644
--- a/diffoscope/comparators/squashfs.py
+++ b/diffoscope/comparators/squashfs.py
@@ -38,7 +38,7 @@ class SquashfsSuperblock(Command):
def filter(self, line):
# strip filename
- return re.sub(r'^(Found a valid .*) on .*', '\\1', line)
+ return re.sub(r'^(Found a valid .*) on .*', '\\1', line.decode('utf-8')).encode('utf-8')
class SquashfsListing(Command):
@@ -153,7 +153,7 @@ class SquashfsContainer(Archive):
# We pass `-d ''` in order to get a listing with the names we actually
# need to use when extracting files
cmd = ['unsquashfs', '-d', '', '-lls', path]
- output = subprocess.check_output(cmd, shell=False)
+ output = subprocess.check_output(cmd, shell=False).decode('utf-8')
header = True
for line in output.rstrip('\n').split('\n'):
if header:
diff --git a/diffoscope/comparators/symlink.py b/diffoscope/comparators/symlink.py
index 4451d28..93dad0f 100644
--- a/diffoscope/comparators/symlink.py
+++ b/diffoscope/comparators/symlink.py
@@ -49,4 +49,4 @@ class Symlink(File):
logger.debug('my_content %s', self.path)
with open(self.path) as my_content, \
open(other.path) as other_content:
- return Difference.from_file(my_content, other_content, self.name, other.name, source=source, comment="symlink")
+ return Difference.from_text_readers(my_content, other_content, self.name, other.name, source=source, comment="symlink")
diff --git a/diffoscope/comparators/text.py b/diffoscope/comparators/text.py
index d8cd699..417238a 100644
--- a/diffoscope/comparators/text.py
+++ b/diffoscope/comparators/text.py
@@ -44,11 +44,11 @@ class TextFile(File):
try:
with codecs.open(self.path, 'r', encoding=my_encoding) as my_content, \
codecs.open(other.path, 'r', encoding=other_encoding) as other_content:
- difference = Difference.from_file(my_content, other_content, self.name, other.name, source)
+ difference = Difference.from_text_readers(my_content, other_content, self.name, other.name, source)
if my_encoding != other_encoding:
if difference is None:
difference = Difference(None, self.path, other.path, source)
- difference.add_details([Difference.from_unicode(my_encoding, other_encoding, None, None, source='encoding')])
+ difference.add_details([Difference.from_text(my_encoding, other_encoding, None, None, source='encoding')])
return difference
except (LookupError, UnicodeDecodeError):
# unknown or misdetected encoding
diff --git a/diffoscope/comparators/zip.py b/diffoscope/comparators/zip.py
index 644304e..97444bb 100644
--- a/diffoscope/comparators/zip.py
+++ b/diffoscope/comparators/zip.py
@@ -33,8 +33,8 @@ class Zipinfo(Command):
def filter(self, line):
# we don't care about the archive file path
- if re.match('^Archive:.*', line):
- return ''
+ if line.startswith(b'Archive:'):
+ return b''
return line
diff --git a/diffoscope/difference.py b/diffoscope/difference.py
index 640b31d..51d79ac 100644
--- a/diffoscope/difference.py
+++ b/diffoscope/difference.py
@@ -211,7 +211,7 @@ def fd_from_feeder(feeder, end_nl_q):
outf.close()
-def make_feeder_from_unicode(content):
+def make_feeder_from_text(content):
def feeder(f):
for offset in range(0, len(content), DIFF_CHUNK):
f.write(content[offset:offset + DIFF_CHUNK].encode('utf-8'))
@@ -225,7 +225,7 @@ def empty_file_feeder():
return feeder
-def make_feeder_from_file(in_file, filter=lambda buf: buf.encode('utf-8')):
+def make_feeder_from_raw_reader(in_file, filter=lambda buf: buf):
def feeder(out_file):
line_count = 0
end_nl = False
@@ -242,9 +242,15 @@ def make_feeder_from_file(in_file, filter=lambda buf: buf.encode('utf-8')):
return feeder
+def make_feeder_from_text_reader(in_file, filter=lambda text_buf: text_buf):
+ def encoding_filter(text_buf):
+ return filter(text_buf).encode('utf-8')
+ return make_feeder_from_raw_reader(in_file, encoding_filter)
+
+
def make_feeder_from_command(command):
def feeder(out_file):
- end_nl = make_feeder_from_file(command.stdout, command.filter)(out_file)
+ end_nl = make_feeder_from_raw_reader(command.stdout, command.filter)(out_file)
if command.poll() is None:
command.terminate()
returncode = command.wait()
@@ -302,15 +308,21 @@ class Difference(object):
return difference
@staticmethod
- def from_unicode(content1, content2, *args, **kwargs):
- return Difference.from_feeder(make_feeder_from_unicode(content1),
- make_feeder_from_unicode(content2),
+ def from_text(content1, content2, *args, **kwargs):
+ return Difference.from_feeder(make_feeder_from_text(content1),
+ make_feeder_from_text(content2),
+ *args, **kwargs)
+
+ @staticmethod
+ def from_raw_readers(file1, file2, *args, **kwargs):
+ return Difference.from_feeder(make_feeder_from_raw_reader(file1),
+ make_feeder_from_raw_reader(file2),
*args, **kwargs)
@staticmethod
- def from_file(file1, file2, *args, **kwargs):
- return Difference.from_feeder(make_feeder_from_file(file1),
- make_feeder_from_file(file2),
+ def from_text_readers(file1, file2, *args, **kwargs):
+ return Difference.from_feeder(make_feeder_from_text_reader(file1),
+ make_feeder_from_text_reader(file2),
*args, **kwargs)
@staticmethod
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git
More information about the Reproducible-commits
mailing list