[diffoscope] 01/01: Add support for .docx and .odt files via docx2txt & odt2txt. (Closes: #859056)
Chris Lamb
chris at chris-lamb.co.uk
Wed Mar 29 19:55:54 UTC 2017
This is an automated email from the git hooks/post-receive script.
lamby pushed a commit to branch experimental
in repository diffoscope.
commit 5d96a92c783ddc595e840784b6630e654dc60c09
Author: Chris Lamb <lamby at debian.org>
Date: Wed Mar 29 20:47:27 2017 +0100
Add support for .docx and .odt files via docx2txt & odt2txt. (Closes: #859056)
Signed-off-by: Chris Lamb <lamby at debian.org>
---
debian/control | 2 ++
diffoscope/comparators/__init__.py | 2 ++
diffoscope/comparators/docx.py | 48 +++++++++++++++++++++++++++++++++
diffoscope/comparators/odt.py | 48 +++++++++++++++++++++++++++++++++
diffoscope/external_tools.py | 6 +++++
tests/comparators/test_docx.py | 54 +++++++++++++++++++++++++++++++++++++
tests/comparators/test_odt.py | 54 +++++++++++++++++++++++++++++++++++++
tests/data/docx_expected_diff | 3 +++
tests/data/odt_expected_diff | 5 ++++
tests/data/test1.docx | Bin 0 -> 4046 bytes
tests/data/test1.odt | Bin 0 -> 7922 bytes
tests/data/test1.txt | 1 +
tests/data/test2.docx | Bin 0 -> 4046 bytes
tests/data/test2.odt | Bin 0 -> 7931 bytes
14 files changed, 223 insertions(+)
diff --git a/debian/control b/debian/control
index 29259b8..a327610 100644
--- a/debian/control
+++ b/debian/control
@@ -17,6 +17,7 @@ Build-Depends:
debhelper (>= 10),
default-jdk-headless <!nocheck> | default-jdk <!nocheck>,
dh-python (>= 2.20160818~),
+ docx2txt <!nocheck>,
dpkg-dev (>= 1.17.14),
enjarify <!nocheck>,
fontforge-extras <!nocheck>,
@@ -33,6 +34,7 @@ Build-Depends:
libjs-jquery-throttle-debounce <!nocheck>,
llvm <!nocheck>,
mono-utils <!nocheck>,
+ odt2txt <!nocheck>,
openssh-client <!nocheck>,
pdftk <!nocheck>,
pgpdump <!nocheck>,
diff --git a/diffoscope/comparators/__init__.py b/diffoscope/comparators/__init__.py
index 56fa166..45f6ca4 100644
--- a/diffoscope/comparators/__init__.py
+++ b/diffoscope/comparators/__init__.py
@@ -70,6 +70,8 @@ class ComparatorManager(object):
('tar.TarFile',),
('xz.XzFile',),
('apk.ApkFile',),
+ ('odt.OdtFile',),
+ ('docx.DocxFile',),
('zip.ZipFile',),
('zip.MozillaZipFile',),
('image.JPEGImageFile',),
diff --git a/diffoscope/comparators/docx.py b/diffoscope/comparators/docx.py
new file mode 100644
index 0000000..6988af6
--- /dev/null
+++ b/diffoscope/comparators/docx.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+#
+# diffoscope: in-depth comparison of files, archives, and directories
+#
+# Copyright © 2017 Chris Lamb <lamby at debian.org>
+#
+# diffoscope is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# diffoscope is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
+
+import re
+
+from diffoscope.tools import tool_required
+from diffoscope.difference import Difference
+
+from .utils.file import File
+from .utils.command import Command
+
+
+class Docx2txt(Command):
+ @tool_required('docx2txt')
+ def cmdline(self):
+ return (
+ 'docx2txt',
+ self.path,
+ '-',
+ )
+
+
+class DocxFile(File):
+ RE_FILE_TYPE = re.compile(r'^Microsoft Word 2007+\b')
+
+ def compare_details(self, other, source=None):
+ return [Difference.from_command(
+ Docx2txt,
+ self.path,
+ other.path,
+ source='docx2txt',
+ )]
diff --git a/diffoscope/comparators/odt.py b/diffoscope/comparators/odt.py
new file mode 100644
index 0000000..78dead3
--- /dev/null
+++ b/diffoscope/comparators/odt.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+#
+# diffoscope: in-depth comparison of files, archives, and directories
+#
+# Copyright © 2017 Chris Lamb <lamby at debian.org>
+#
+# diffoscope is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# diffoscope is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
+
+import re
+
+from diffoscope.tools import tool_required
+from diffoscope.difference import Difference
+
+from .utils.file import File
+from .utils.command import Command
+
+
+class Odt2txt(Command):
+ @tool_required('odt2txt')
+ def cmdline(self):
+ return (
+ 'odt2txt',
+ '--encoding=UTF-8',
+ self.path,
+ )
+
+
+class OdtFile(File):
+ RE_FILE_TYPE = re.compile(r'^OpenDocument Text\b')
+
+ def compare_details(self, other, source=None):
+ return [Difference.from_command(
+ Odt2txt,
+ self.path,
+ other.path,
+ source='odt2txt',
+ )]
diff --git a/diffoscope/external_tools.py b/diffoscope/external_tools.py
index 5579994..d68db1f 100644
--- a/diffoscope/external_tools.py
+++ b/diffoscope/external_tools.py
@@ -44,6 +44,9 @@ EXTERNAL_TOOLS = {
'debian': 'diffutils',
'arch': 'diffutils',
},
+ 'docx2txt': {
+ 'debian': 'docx2txt',
+ },
'enjarify': {
'debian': 'enjarify',
'arch': 'enjarify',
@@ -136,6 +139,9 @@ EXTERNAL_TOOLS = {
'debian': 'binutils-multiarch',
'arch': 'binutils',
},
+ 'odt2txt': {
+ 'debian': 'odt2txt',
+ },
'pgpdump': {
'debian': 'pgpdump',
'arch': 'pgpdump',
diff --git a/tests/comparators/test_docx.py b/tests/comparators/test_docx.py
new file mode 100644
index 0000000..675374c
--- /dev/null
+++ b/tests/comparators/test_docx.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+#
+# diffoscope: in-depth comparison of files, archives, and directories
+#
+# Copyright © 2017 Chris Lamb <lamby at debian.org>
+#
+# diffoscope is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# diffoscope is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
+
+import pytest
+
+from diffoscope.comparators.docx import DocxFile
+
+from utils.data import load_fixture, get_data
+from utils.tools import skip_unless_tools_exist
+from utils.nonexisting import assert_non_existing
+
+docx1 = load_fixture('test1.docx')
+docx2 = load_fixture('test2.docx')
+
+
+def test_identification(docx1):
+ assert isinstance(docx1, DocxFile)
+
+
+def test_no_differences(docx1):
+ difference = docx1.compare(docx1)
+ assert difference is None
+
+
+@pytest.fixture
+def differences(docx1, docx2):
+ return docx1.compare(docx2).details
+
+
+@skip_unless_tools_exist('docx2txt')
+def test_diff(differences):
+ expected_diff = get_data('docx_expected_diff')
+ assert differences[0].unified_diff == expected_diff
+
+
+@skip_unless_tools_exist('docx2txt')
+def test_compare_non_existing(monkeypatch, docx1):
+ assert_non_existing(monkeypatch, docx1, has_null_source=False)
diff --git a/tests/comparators/test_odt.py b/tests/comparators/test_odt.py
new file mode 100644
index 0000000..5e2bcda
--- /dev/null
+++ b/tests/comparators/test_odt.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+#
+# diffoscope: in-depth comparison of files, archives, and directories
+#
+# Copyright © 2017 Chris Lamb <lamby at debian.org>
+#
+# diffoscope is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# diffoscope is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.
+
+import pytest
+
+from diffoscope.comparators.odt import OdtFile
+
+from utils.data import load_fixture, get_data
+from utils.tools import skip_unless_tools_exist
+from utils.nonexisting import assert_non_existing
+
+odt1 = load_fixture('test1.odt')
+odt2 = load_fixture('test2.odt')
+
+
+def test_identification(odt1):
+ assert isinstance(odt1, OdtFile)
+
+
+def test_no_differences(odt1):
+ difference = odt1.compare(odt1)
+ assert difference is None
+
+
+@pytest.fixture
+def differences(odt1, odt2):
+ return odt1.compare(odt2).details
+
+
+@skip_unless_tools_exist('odt2txt')
+def test_diff(differences):
+ expected_diff = get_data('odt_expected_diff')
+ assert differences[0].unified_diff == expected_diff
+
+
+@skip_unless_tools_exist('odt2txt')
+def test_compare_non_existing(monkeypatch, odt1):
+ assert_non_existing(monkeypatch, odt1, has_null_source=False)
diff --git a/tests/data/docx_expected_diff b/tests/data/docx_expected_diff
new file mode 100644
index 0000000..a2319c2
--- /dev/null
+++ b/tests/data/docx_expected_diff
@@ -0,0 +1,3 @@
+@@ -1 +1 @@
+-a
++b
diff --git a/tests/data/odt_expected_diff b/tests/data/odt_expected_diff
new file mode 100644
index 0000000..7a20d9a
--- /dev/null
+++ b/tests/data/odt_expected_diff
@@ -0,0 +1,5 @@
+@@ -1,3 +1,3 @@
+
+-a
++b
+
diff --git a/tests/data/test1.docx b/tests/data/test1.docx
new file mode 100644
index 0000000..f262842
Binary files /dev/null and b/tests/data/test1.docx differ
diff --git a/tests/data/test1.odt b/tests/data/test1.odt
new file mode 100644
index 0000000..d39ee4c
Binary files /dev/null and b/tests/data/test1.odt differ
diff --git a/tests/data/test1.txt b/tests/data/test1.txt
new file mode 100644
index 0000000..7898192
--- /dev/null
+++ b/tests/data/test1.txt
@@ -0,0 +1 @@
+a
diff --git a/tests/data/test2.docx b/tests/data/test2.docx
new file mode 100644
index 0000000..bc5b405
Binary files /dev/null and b/tests/data/test2.docx differ
diff --git a/tests/data/test2.odt b/tests/data/test2.odt
new file mode 100644
index 0000000..a02a7f8
Binary files /dev/null and b/tests/data/test2.odt differ
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/diffoscope.git
More information about the Reproducible-commits
mailing list