[Pkg-isocodes-devel] [iso-codes] 03/04: Rewrite UTF-8 checking script in python
Tobias Quathamer
toddy at moszumanska.debian.org
Tue Feb 23 18:41:18 UTC 2016
This is an automated email from the git hooks/post-receive script.
toddy pushed a commit to branch master
in repository iso-codes.
commit f91f940e85d4d64256a06b8d7bc511e1c2e0200d
Author: Dr. Tobias Quathamer <toddy at debian.org>
Date: Tue Feb 23 19:39:11 2016 +0100
Rewrite UTF-8 checking script in python
---
Makefile.am | 2 +-
check_valid_utf8.pl | 63 -----------------------------------------------------
check_valid_utf8.py | 56 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 57 insertions(+), 64 deletions(-)
diff --git a/Makefile.am b/Makefile.am
index b2b9385..f8feee9 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -7,7 +7,7 @@ EXTRA_DIST = \
LICENSE \
common.mk \
iso2pot.py \
- check_valid_utf8.pl
+ check_valid_utf8.py
DISTCLEANFILES = $(pkgconfig_DATA)
diff --git a/check_valid_utf8.pl b/check_valid_utf8.pl
deleted file mode 100644
index 093c7cf..0000000
--- a/check_valid_utf8.pl
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/usr/bin/env perl
-#
-# Takes a list of files on the command line and checks for valid
-# UTF-8 data. Used for checking .po files.
-#
-# Copyright © 2009 Tobias Quathamer <toddy at debian.org>
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-
-use strict;
-use warnings;
-
-my $exit_status = 0;
-
-foreach my $filename (@ARGV) {
- my $content_type_checked = 0;
- open FILE, "< $filename";
- while (<FILE>) {
- # Check for valid UTF-8 encoding
- unless (m/\A(
- [\x09\x0A\x0D\x20-\x7E] # ASCII
- | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
- | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
- | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
- | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
- | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
- | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
- | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
- )*\z/x) {
- # Found invalid characters for UTF-8
- printf("Error in file %s at line number %d:\n", $filename, $.);
- # Show the line with the error
- print;
- $exit_status = 1;
- # Skip the rest of the current file
- last;
- }
- # Check that the Content-Type header field is set correctly.
- if (!$content_type_checked && /Content-Type: text\/plain; charset=UTF-8/) {
- $content_type_checked = 1;
- }
- }
- unless ($content_type_checked) {
- printf("Error in file %s:\n", $filename);
- print("Could not detect correct Content-Type header field.\n");
- $exit_status = 1;
- }
- close FILE;
-}
-
-exit($exit_status);
diff --git a/check_valid_utf8.py b/check_valid_utf8.py
new file mode 100755
index 0000000..3f37853
--- /dev/null
+++ b/check_valid_utf8.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+#
+# Takes a list of files on the command line and checks for valid
+# UTF-8 data. Used for checking .po files.
+#
+# Copyright © 2016 Dr. Tobias Quathamer <toddy at debian.org>
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+import re
+import sys
+
+# Remove the script name from the files to check
+sys.argv.pop(0)
+
+# Assume that every file is valid
+exit_status = 0
+
+# Cycle through all files and check for valid UTF-8 encoding
+for filename in sys.argv:
+ # Open the file for reading in binary mode
+ with open(filename, "rb") as pofile:
+ # The "Content-Type" header has not been seen yet
+ charset_utf8_seen = False
+ # Read all lines to check for Content-Type header
+ for line in pofile:
+ # Try to decode binary data to UTF-8
+ try:
+ utf8 = line.decode(encoding="utf-8", errors="strict")
+ except UnicodeError as error:
+ print("UTF-8 encoding error in file %s: %s (position %d)" % (filename, error.reason, error.start))
+ print("Binary data: %s" % line)
+ exit_status = 1
+ break
+ if re.search(r'Content-Type: text/plain; charset=UTF-8', utf8):
+ charset_utf8_seen = True
+ # The whole file has been read, so the Content-Type header should
+ # have been detected by now. Otherwise, it's an error.
+ if not charset_utf8_seen:
+ print("Error in file %s: could not detect Content-Type header" % filename)
+ exit_status = 1
+ break
+
+sys.exit(exit_status)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-isocodes/iso-codes.git
More information about the Pkg-isocodes-devel mailing list