[Reproducible-commits] [diffoscope] 15/21: Differentiate text and bytes when computing diffs

Jérémy Bobbio lunar at moszumanska.debian.org
Mon Sep 21 17:39:29 UTC 2015


This is an automated email from the git hooks/post-receive script.

lunar pushed a commit to branch master
in repository diffoscope.

commit 555298a4b3b009c68b4fa4485fee88679a376e62
Author: Jérémy Bobbio <lunar at debian.org>
Date:   Sun Sep 20 00:24:18 2015 +0200

    Differentiate text and bytes when computing diffs
    
    In some situation, we want to compare streams of already encoded UTF-8 bytes,
    in some other we compare streams of unicode strings. To make it clear, we split
    Difference.from_file into Difference.from_raw_readers and
    Difference.from_text_readers.
    
    We want to avoid an overhead when we use the unfiltered output of a command to
    feed diff directly. So we assume commands output UTF-8 encoded bytes as its
    true in most cases. This leaves up to filters to similarly return UTF-8 bytes.
    They might have to decode their input from UTF-8 or other encoding if required.
    
    We also rename Difference.from_unicode into Difference.from_text to keep
    names in line.
    
    This changes are required for Python 3 which clearly separates unicode strings
    and bytes, but this should clearly help avoid encoding issues in the future.
---
 diffoscope/comparators/binary.py    |  4 ++--
 diffoscope/comparators/deb.py       |  2 +-
 diffoscope/comparators/debian.py    |  8 ++++----
 diffoscope/comparators/device.py    |  2 +-
 diffoscope/comparators/directory.py |  5 +++--
 diffoscope/comparators/elf.py       | 16 +++++++---------
 diffoscope/comparators/gettext.py   |  6 +++---
 diffoscope/comparators/gzip.py      |  2 +-
 diffoscope/comparators/java.py      |  4 ++--
 diffoscope/comparators/rpm.py       |  2 +-
 diffoscope/comparators/squashfs.py  |  4 ++--
 diffoscope/comparators/symlink.py   |  2 +-
 diffoscope/comparators/text.py      |  4 ++--
 diffoscope/comparators/zip.py       |  4 ++--
 diffoscope/difference.py            | 30 +++++++++++++++++++++---------
 15 files changed, 53 insertions(+), 42 deletions(-)

diff --git a/diffoscope/comparators/binary.py b/diffoscope/comparators/binary.py
index ade85f5..fea9975 100644
--- a/diffoscope/comparators/binary.py
+++ b/diffoscope/comparators/binary.py
@@ -60,12 +60,12 @@ def compare_binary_files(path1, path2, source=None):
     try:
         with xxd(path1) as xxd1:
             with xxd(path2) as xxd2:
-                return Difference.from_file(xxd1, xxd2, path1, path2, source)
+                return Difference.from_raw_readers(xxd1, xxd2, path1, path2, source)
     except RequiredToolNotFound:
         hexdump1 = hexdump_fallback(path1)
         hexdump2 = hexdump_fallback(path2)
         comment = 'xxd not available in path. Falling back to Python hexlify.\n'
-        return Difference.from_unicode(hexdump1, hexdump2, path1, path2, source, comment)
+        return Difference.from_text(hexdump1, hexdump2, path1, path2, source, comment)
 
 SMALL_FILE_THRESHOLD = 65536 # 64 kiB
 
diff --git a/diffoscope/comparators/deb.py b/diffoscope/comparators/deb.py
index a26f654..3dd4dbd 100644
--- a/diffoscope/comparators/deb.py
+++ b/diffoscope/comparators/deb.py
@@ -79,7 +79,7 @@ class DebFile(File):
         differences = []
         my_content = get_ar_content(self.path)
         other_content = get_ar_content(other.path)
-        differences.append(Difference.from_unicode(
+        differences.append(Difference.from_text(
                                my_content, other_content, self.path, other.path, source="metadata"))
         with DebContainer(self).open() as my_container, \
              DebContainer(other).open() as other_container:
diff --git a/diffoscope/comparators/debian.py b/diffoscope/comparators/debian.py
index 734577b..e431ef2 100644
--- a/diffoscope/comparators/debian.py
+++ b/diffoscope/comparators/debian.py
@@ -131,13 +131,13 @@ class DotChangesFile(File):
             other_value = ''
             if field in other.changes:
                 other_value = other.changes.get_as_string(field).lstrip()
-            differences.append(Difference.from_unicode(
+            differences.append(Difference.from_text(
                                    my_value, other_value,
                                    self.path, other.path, source=field))
         # compare Files as string
-        differences.append(Difference.from_unicode(self.changes.get_as_string('Files'),
-                                                   other.changes.get_as_string('Files'),
-                                                   self.path, other.path, source='Files'))
+        differences.append(Difference.from_text(self.changes.get_as_string('Files'),
+                                                other.changes.get_as_string('Files'),
+                                                self.path, other.path, source='Files'))
         with DotChangesContainer(self).open() as my_container, \
              DotChangesContainer(other).open() as other_container:
             differences.extend(my_container.compare(other_container))
diff --git a/diffoscope/comparators/device.py b/diffoscope/comparators/device.py
index 52b3268..cd9aaf0 100644
--- a/diffoscope/comparators/device.py
+++ b/diffoscope/comparators/device.py
@@ -52,4 +52,4 @@ class Device(File):
     def compare(self, other, source=None):
         with open(self.path) as my_content, \
              open(other.path) as other_content:
-            return Difference.from_file(my_content, other_content, self.name, other.name, source=source, comment="device")
+            return Difference.from_text_readers(my_content, other_content, self.name, other.name, source=source, comment="device")
diff --git a/diffoscope/comparators/directory.py b/diffoscope/comparators/directory.py
index 1ce28ea..2141572 100644
--- a/diffoscope/comparators/directory.py
+++ b/diffoscope/comparators/directory.py
@@ -45,11 +45,12 @@ class Stat(Command):
     ACCESS_TIME_RE = re.compile(r'^Access: [0-9]{4}-[0-9]{2}-[0-9]{2}.*$')
 
     def filter(self, line):
+        line = line.decode('utf-8')
         line = Stat.FILE_RE.sub('', line)
         line = Stat.DEVICE_RE.sub('', line)
         line = Stat.INODE_RE.sub('', line)
         line = Stat.ACCESS_TIME_RE.sub('', line)
-        return line
+        return line.encode('utf-8')
 
 
 @tool_required('lsattr')
@@ -79,7 +80,7 @@ def compare_meta(path1, path2):
     try:
         lsattr1 = lsattr(path1)
         lsattr2 = lsattr(path2)
-        differences.append(Difference.from_unicode(
+        differences.append(Difference.from_text(
                                lsattr1, lsattr2, path1, path2, source="lattr"))
     except RequiredToolNotFound:
         logger.info("Unable to find 'lsattr'.")
diff --git a/diffoscope/comparators/elf.py b/diffoscope/comparators/elf.py
index 21d8fda..7e6a656 100644
--- a/diffoscope/comparators/elf.py
+++ b/diffoscope/comparators/elf.py
@@ -30,8 +30,7 @@ class Readelf(Command):
         super(Readelf, self).__init__(*args, **kwargs)
         # we don't care about the name of the archive
         self._archive_re = re.compile(r'^File: %s\(' % re.escape(self.path))
-        self._encoded_path = self.path.encode('utf-8')
-        self._basename = os.path.basename(self._encoded_path)
+        self._basename = os.path.basename(self.path)
 
     @tool_required('readelf')
     def cmdline(self):
@@ -42,9 +41,9 @@ class Readelf(Command):
 
     def filter(self, line):
         # we don't care about the name of the archive
-        line = self._archive_re.sub('File: lib.a(', line)
+        line = self._archive_re.sub('File: lib.a(', line.decode('utf-8'))
         # the full path can appear in the output, we need to remove it
-        return line.replace(self._encoded_path, self._basename)
+        return line.replace(self.path, self._basename).encode('utf-8')
 
 class ReadelfAll(Readelf):
     def readelf_options(self):
@@ -59,8 +58,7 @@ class ObjdumpDisassemble(Command):
         super(ObjdumpDisassemble, self).__init__(*args, **kwargs)
         # we don't care about the name of the archive
         self._archive_re = re.compile(r'^In archive %s:' % re.escape(self.path))
-        self._encoded_path = self.path.encode('utf-8')
-        self._basename = os.path.basename(self._encoded_path)
+        self._basename = os.path.basename(self.path)
 
     @tool_required('objdump')
     def cmdline(self):
@@ -68,9 +66,9 @@ class ObjdumpDisassemble(Command):
 
     def filter(self, line):
         # we don't care about the name of the archive
-        line = self._archive_re.sub('In archive:', line)
+        line = self._archive_re.sub('In archive:', line.decode('utf-8'))
         # the full path can appear in the output, we need to remove it
-        return line.replace(self._encoded_path, self._basename)
+        return line.replace(self.path, self._basename).encode('utf-8')
 
 def _compare_elf_data(path1, path2):
     differences = []
@@ -104,7 +102,7 @@ class StaticLibFile(File):
         # look up differences in metadata
         content1 = get_ar_content(self.path)
         content2 = get_ar_content(other.path)
-        differences.append(Difference.from_unicode(
+        differences.append(Difference.from_text(
                                content1, content2, self.path, other.path, source="metadata"))
         differences.extend(_compare_elf_data(self.path, other.path))
         return differences
diff --git a/diffoscope/comparators/gettext.py b/diffoscope/comparators/gettext.py
index 2b890e9..4e02dea 100644
--- a/diffoscope/comparators/gettext.py
+++ b/diffoscope/comparators/gettext.py
@@ -41,15 +41,15 @@ class Msgunfmt(Command):
     def filter(self, line):
         if not self._encoding:
             self._header.write(line)
-            if line == '\n':
+            if line == b'\n':
                 logger.debug("unable to determine PO encoding, let's hope it's utf-8")
                 self._encoding = 'utf-8'
                 return self._header.getvalue()
             found = Msgunfmt.CHARSET_RE.match(line)
             if found:
-                self._encoding = found.group(1).lower()
+                self._encoding = found.group(1).decode('us-ascii').lower()
                 return self._header.getvalue().decode(self._encoding).encode('utf-8')
-            return ''
+            return b''
         if self._encoding != 'utf-8':
             return line.decode(self._encoding).encode('utf-8')
         else:
diff --git a/diffoscope/comparators/gzip.py b/diffoscope/comparators/gzip.py
index 1f29c7c..f151d48 100644
--- a/diffoscope/comparators/gzip.py
+++ b/diffoscope/comparators/gzip.py
@@ -66,7 +66,7 @@ class GzipFile(object):
     @needs_content
     def compare_details(self, other, source=None):
         differences = []
-        differences.append(Difference.from_unicode(
+        differences.append(Difference.from_text(
                                self.magic_file_type, other.magic_file_type, self, other, source='metadata'))
         with GzipContainer(self).open() as my_container, \
              GzipContainer(other).open() as other_container:
diff --git a/diffoscope/comparators/java.py b/diffoscope/comparators/java.py
index 24c8bb2..6027f71 100644
--- a/diffoscope/comparators/java.py
+++ b/diffoscope/comparators/java.py
@@ -36,8 +36,8 @@ class Javap(Command):
         return ['javap', '-verbose', '-constants', '-s', '-l', '-private', self.path]
 
     def filter(self, line):
-        if re.match(r'^(Classfile %s$|  Last modified |  MD5 checksum )' % re.escape(self.real_path), line):
-            return ''
+        if re.match(r'^(Classfile %s$|  Last modified |  MD5 checksum )' % re.escape(self.real_path), line.decode('utf-8')):
+            return b''
         return line
 
 
diff --git a/diffoscope/comparators/rpm.py b/diffoscope/comparators/rpm.py
index 1330eae..ddeab56 100644
--- a/diffoscope/comparators/rpm.py
+++ b/diffoscope/comparators/rpm.py
@@ -57,7 +57,7 @@ def compare_rpm_headers(path1, path2):
         ts.setVSFlags(-1)
         header1 = get_rpm_header(path1, ts)
         header2 = get_rpm_header(path2, ts)
-    return Difference.from_unicode(header1, header2, path1, path2, source="header")
+    return Difference.from_text(header1, header2, path1, path2, source="header")
 
 
 class RpmContainer(Archive):
diff --git a/diffoscope/comparators/squashfs.py b/diffoscope/comparators/squashfs.py
index fb13b2f..a489742 100644
--- a/diffoscope/comparators/squashfs.py
+++ b/diffoscope/comparators/squashfs.py
@@ -38,7 +38,7 @@ class SquashfsSuperblock(Command):
 
     def filter(self, line):
         # strip filename
-        return re.sub(r'^(Found a valid .*) on .*', '\\1', line)
+        return re.sub(r'^(Found a valid .*) on .*', '\\1', line.decode('utf-8')).encode('utf-8')
 
 
 class SquashfsListing(Command):
@@ -153,7 +153,7 @@ class SquashfsContainer(Archive):
         # We pass `-d ''` in order to get a listing with the names we actually
         # need to use when extracting files
         cmd = ['unsquashfs', '-d', '', '-lls', path]
-        output = subprocess.check_output(cmd, shell=False)
+        output = subprocess.check_output(cmd, shell=False).decode('utf-8')
         header = True
         for line in output.rstrip('\n').split('\n'):
             if header:
diff --git a/diffoscope/comparators/symlink.py b/diffoscope/comparators/symlink.py
index 4451d28..93dad0f 100644
--- a/diffoscope/comparators/symlink.py
+++ b/diffoscope/comparators/symlink.py
@@ -49,4 +49,4 @@ class Symlink(File):
         logger.debug('my_content %s', self.path)
         with open(self.path) as my_content, \
              open(other.path) as other_content:
-            return Difference.from_file(my_content, other_content, self.name, other.name, source=source, comment="symlink")
+            return Difference.from_text_readers(my_content, other_content, self.name, other.name, source=source, comment="symlink")
diff --git a/diffoscope/comparators/text.py b/diffoscope/comparators/text.py
index d8cd699..417238a 100644
--- a/diffoscope/comparators/text.py
+++ b/diffoscope/comparators/text.py
@@ -44,11 +44,11 @@ class TextFile(File):
         try:
             with codecs.open(self.path, 'r', encoding=my_encoding) as my_content, \
                  codecs.open(other.path, 'r', encoding=other_encoding) as other_content:
-                difference = Difference.from_file(my_content, other_content, self.name, other.name, source)
+                difference = Difference.from_text_readers(my_content, other_content, self.name, other.name, source)
                 if my_encoding != other_encoding:
                     if difference is None:
                         difference = Difference(None, self.path, other.path, source)
-                    difference.add_details([Difference.from_unicode(my_encoding, other_encoding, None, None, source='encoding')])
+                    difference.add_details([Difference.from_text(my_encoding, other_encoding, None, None, source='encoding')])
                 return difference
         except (LookupError, UnicodeDecodeError):
             # unknown or misdetected encoding
diff --git a/diffoscope/comparators/zip.py b/diffoscope/comparators/zip.py
index 644304e..97444bb 100644
--- a/diffoscope/comparators/zip.py
+++ b/diffoscope/comparators/zip.py
@@ -33,8 +33,8 @@ class Zipinfo(Command):
 
     def filter(self, line):
         # we don't care about the archive file path
-        if re.match('^Archive:.*', line):
-            return ''
+        if line.startswith(b'Archive:'):
+            return b''
         return line
 
 
diff --git a/diffoscope/difference.py b/diffoscope/difference.py
index 640b31d..51d79ac 100644
--- a/diffoscope/difference.py
+++ b/diffoscope/difference.py
@@ -211,7 +211,7 @@ def fd_from_feeder(feeder, end_nl_q):
         outf.close()
 
 
-def make_feeder_from_unicode(content):
+def make_feeder_from_text(content):
     def feeder(f):
         for offset in range(0, len(content), DIFF_CHUNK):
             f.write(content[offset:offset + DIFF_CHUNK].encode('utf-8'))
@@ -225,7 +225,7 @@ def empty_file_feeder():
     return feeder
 
 
-def make_feeder_from_file(in_file, filter=lambda buf: buf.encode('utf-8')):
+def make_feeder_from_raw_reader(in_file, filter=lambda buf: buf):
     def feeder(out_file):
         line_count = 0
         end_nl = False
@@ -242,9 +242,15 @@ def make_feeder_from_file(in_file, filter=lambda buf: buf.encode('utf-8')):
     return feeder
 
 
+def make_feeder_from_text_reader(in_file, filter=lambda text_buf: text_buf):
+    def encoding_filter(text_buf):
+        return filter(text_buf).encode('utf-8')
+    return make_feeder_from_raw_reader(in_file, encoding_filter)
+
+
 def make_feeder_from_command(command):
     def feeder(out_file):
-        end_nl = make_feeder_from_file(command.stdout, command.filter)(out_file)
+        end_nl = make_feeder_from_raw_reader(command.stdout, command.filter)(out_file)
         if command.poll() is None:
             command.terminate()
         returncode = command.wait()
@@ -302,15 +308,21 @@ class Difference(object):
             return difference
 
     @staticmethod
-    def from_unicode(content1, content2, *args, **kwargs):
-        return Difference.from_feeder(make_feeder_from_unicode(content1),
-                                      make_feeder_from_unicode(content2),
+    def from_text(content1, content2, *args, **kwargs):
+        return Difference.from_feeder(make_feeder_from_text(content1),
+                                      make_feeder_from_text(content2),
+                                      *args, **kwargs)
+
+    @staticmethod
+    def from_raw_readers(file1, file2, *args, **kwargs):
+        return Difference.from_feeder(make_feeder_from_raw_reader(file1),
+                                      make_feeder_from_raw_reader(file2),
                                       *args, **kwargs)
 
     @staticmethod
-    def from_file(file1, file2, *args, **kwargs):
-        return Difference.from_feeder(make_feeder_from_file(file1),
-                                      make_feeder_from_file(file2),
+    def from_text_readers(file1, file2, *args, **kwargs):
+        return Difference.from_feeder(make_feeder_from_text_reader(file1),
+                                      make_feeder_from_text_reader(file2),
                                       *args, **kwargs)
 
     @staticmethod

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git



More information about the Reproducible-commits mailing list