[Pkg-isocodes-devel] [iso-codes] 03/04: Rewrite UTF-8 checking script in python

Tobias Quathamer toddy at moszumanska.debian.org
Tue Feb 23 18:41:18 UTC 2016


This is an automated email from the git hooks/post-receive script.

toddy pushed a commit to branch master
in repository iso-codes.

commit f91f940e85d4d64256a06b8d7bc511e1c2e0200d
Author: Dr. Tobias Quathamer <toddy at debian.org>
Date:   Tue Feb 23 19:39:11 2016 +0100

    Rewrite UTF-8 checking script in python
---
 Makefile.am         |  2 +-
 check_valid_utf8.pl | 63 -----------------------------------------------------
 check_valid_utf8.py | 56 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+), 64 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index b2b9385..f8feee9 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -7,7 +7,7 @@ EXTRA_DIST = \
 	LICENSE			\
 	common.mk		\
 	iso2pot.py		\
-	check_valid_utf8.pl
+	check_valid_utf8.py
 
 DISTCLEANFILES = $(pkgconfig_DATA)
 
diff --git a/check_valid_utf8.pl b/check_valid_utf8.pl
deleted file mode 100644
index 093c7cf..0000000
--- a/check_valid_utf8.pl
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/usr/bin/env perl
-#
-# Takes a list of files on the command line and checks for valid
-# UTF-8 data. Used for checking .po files.
-#
-# Copyright © 2009 Tobias Quathamer <toddy at debian.org>
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
-
-use strict;
-use warnings;
-
-my $exit_status = 0;
-
-foreach my $filename (@ARGV) {
-  my $content_type_checked = 0;
-  open FILE, "< $filename";
-  while (<FILE>) {
-    # Check for valid UTF-8 encoding
-    unless (m/\A(
-      [\x09\x0A\x0D\x20-\x7E]            # ASCII
-    | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
-    |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
-    | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
-    |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
-    |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
-    | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
-    |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
-    )*\z/x) {
-      # Found invalid characters for UTF-8
-      printf("Error in file %s at line number %d:\n", $filename, $.);
-      # Show the line with the error
-      print;
-      $exit_status = 1;
-      # Skip the rest of the current file
-      last;
-    }
-    # Check that the Content-Type header field is set correctly.
-    if (!$content_type_checked && /Content-Type: text\/plain; charset=UTF-8/) {
-      $content_type_checked = 1;
-    }
-  }
-  unless ($content_type_checked) {
-    printf("Error in file %s:\n", $filename);
-    print("Could not detect correct Content-Type header field.\n");
-    $exit_status = 1;
-  }
-  close FILE;
-}
-
-exit($exit_status);
diff --git a/check_valid_utf8.py b/check_valid_utf8.py
new file mode 100755
index 0000000..3f37853
--- /dev/null
+++ b/check_valid_utf8.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+#
+# Takes a list of files on the command line and checks for valid
+# UTF-8 data. Used for checking .po files.
+#
+# Copyright © 2016 Dr. Tobias Quathamer <toddy at debian.org>
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+import re
+import sys
+
+# Remove the script name, leaving only the files to check in sys.argv
+sys.argv.pop(0)
+
+# Assume that every file is valid; set to 1 on the first detected problem
+exit_status = 0
+
+# Cycle through all files and check each one for valid UTF-8 encoding
+for filename in sys.argv:
+	# Open the file in binary mode, so that decoding is done explicitly below
+	with open(filename, "rb") as pofile:
+		# The "Content-Type" header has not been seen yet
+		charset_utf8_seen = False
+		# Decode line by line and look for the Content-Type header
+		for line in pofile:
+			# Try to decode binary data to UTF-8
+			try:
+				utf8 = line.decode(encoding="utf-8", errors="strict")
+			except UnicodeError as error:
+				print("UTF-8 encoding error in file %s: %s (position %d)" % (filename, error.reason, error.start))
+				print("Binary data: %s" % line)
+				exit_status = 1
+				break
+			if re.search(r'Content-Type: text/plain; charset=UTF-8', utf8):
+				charset_utf8_seen = True
+		# Reading has finished (or stopped at a decoding error), the content
+		# type should have been detected now. Otherwise, it's an error.
+		if not charset_utf8_seen:
+			print("Error in file %s: could not detect Content-Type header" % filename)
+			exit_status = 1
+			# No break here: the remaining files still have to be checked
+
+sys.exit(exit_status)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-isocodes/iso-codes.git



More information about the Pkg-isocodes-devel mailing list