[Pkg-isocodes-devel] [SCM] ISO language, territory, currency, script codes and their translations branch, master, updated. iso-codes/3.6-10-gbe10688
Tobias Quathamer
toddy at debian.org
Mon Feb 2 16:36:58 UTC 2009
The following commit has been merged in the master branch:
commit be106886e2c4fad35c0959ef097561523ff377d7
Author: Tobias Quathamer <toddy at debian.org>
Date: Mon Feb 2 17:36:45 2009 +0100
Use a Perl script for a complete check of valid UTF-8 text in po files
diff --git a/ChangeLog b/ChangeLog
index 4275a41..24f6a75 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -2,6 +2,9 @@ iso-codes trunk
---------------
UNRELEASED
+ [ General ]
+ * Use a Perl script for a complete check of valid UTF-8 text in po files
+
[ ISO 3166 translations ]
* Norwegian Bokmål by Hans Fredrik Nordhaug. Closes: #513840
* Traditional Chinese by Tetralet. Closes: #513926
diff --git a/check_valid_utf8.pl b/check_valid_utf8.pl
new file mode 100644
index 0000000..e855df0
--- /dev/null
+++ b/check_valid_utf8.pl
@@ -0,0 +1,76 @@
+#!/usr/bin/env perl
+#
+# Takes a list of files on the command line and checks for valid
+# UTF-8 data. Used for checking .po files.
+#
+# Each file has to pass two checks:
+#   1. every line consists only of well-formed UTF-8 byte sequences,
+#   2. some line contains "Content-Type: text/plain; charset=UTF-8".
+# Problems are reported on standard output. The exit status is 0 if
+# all files pass and 1 otherwise.
+#
+# Copyright (C) 2009 Tobias Quathamer <toddy at debian.org>
+# Released under the GPL version 2 or later.
+
+use strict;
+use warnings;
+
+# Matches a string made up entirely of well-formed UTF-8 sequences.
+# The pattern works on raw bytes, so the input must not be decoded.
+my $valid_utf8 = qr/\A(?:
+      [\x09\x0A\x0D\x20-\x7E]            # ASCII
+    | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
+    | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
+    | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
+    | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
+    | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
+    | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
+    | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
+)*\z/x;
+
+my $exit_status = 0;
+
+FILE:
+foreach my $filename (@ARGV) {
+    # Read raw bytes: a decoding I/O layer (for example one enabled
+    # through PERL_UNICODE) would defeat the byte-level check below.
+    my $fh;
+    unless (open $fh, '<:raw', $filename) {
+        # Report the real cause instead of a misleading header error.
+        printf("Error in file %s:\n", $filename);
+        print("Could not open file for reading: $!\n");
+        $exit_status = 1;
+        next FILE;
+    }
+
+    my $content_type_checked = 0;
+    my $encoding_ok = 1;
+    while (my $line = <$fh>) {
+        # Check for valid UTF-8 encoding
+        if ($line !~ $valid_utf8) {
+            # Found invalid characters for UTF-8
+            printf("Error in file %s at line number %d:\n", $filename, $.);
+            # Show the line with the error
+            print($line);
+            $exit_status = 1;
+            $encoding_ok = 0;
+            # Skip the rest of the current file
+            last;
+        }
+        # Check that the Content-Type header field is set correctly.
+        if ($line =~ /Content-Type: text\/plain; charset=UTF-8/) {
+            $content_type_checked = 1;
+        }
+    }
+    close $fh;
+
+    # Complain about the header only if the whole file was scanned; after
+    # an encoding error the header may simply not have been reached yet.
+    if ($encoding_ok && !$content_type_checked) {
+        printf("Error in file %s:\n", $filename);
+        print("Could not detect correct Content-Type header field.\n");
+        $exit_status = 1;
+    }
+}
+
+exit($exit_status);
diff --git a/rules.make b/rules.make
index faaa961..3268077 100644
--- a/rules.make
+++ b/rules.make
@@ -4,15 +4,7 @@
.PHONY: check-content
check-content:
- @grep "Content-Type" *po | grep -v "UTF-8" && touch found-non-utf.stamp || true
- @if [ -e found-non-utf.stamp ]; then \
- echo "*********"; \
- echo "* Error *"; \
- echo "*********"; \
- echo "At least one file is not encoded in UTF-8. Please check."; \
- rm -f found-non-utf.stamp; \
- false; \
- fi
+ perl $(top_builddir)/check_valid_utf8.pl $(pofiles)
# This target merges all po files with the current pot file,
# removes obsolete msgids and substitutes the Project-Id-Version
--
ISO language, territory, currency, script codes and their translations
More information about the Pkg-isocodes-devel
mailing list