[Pkg-isocodes-devel] [SCM] ISO language, territory, currency, script codes and their translations branch, master, updated. iso-codes/3.6-10-gbe10688
    Tobias Quathamer 
    toddy at debian.org
       
    Mon Feb  2 16:36:58 UTC 2009
    
    
  
The following commit has been merged in the master branch:
commit be106886e2c4fad35c0959ef097561523ff377d7
Author: Tobias Quathamer <toddy at debian.org>
Date:   Mon Feb 2 17:36:45 2009 +0100
    Use a Perl script for a complete check of valid UTF-8 text in po files
diff --git a/ChangeLog b/ChangeLog
index 4275a41..24f6a75 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -2,6 +2,9 @@ iso-codes trunk
 ---------------
 UNRELEASED
 
+  [ General ]
+  * Use a Perl script for a complete check of valid UTF-8 text in po files
+
   [ ISO 3166 translations ]
   * Norwegian Bokmål by Hans Fredrik Nordhaug. Closes: #513840
   * Traditional Chinese by Tetralet. Closes: #513926
diff --git a/check_valid_utf8.pl b/check_valid_utf8.pl
new file mode 100644
index 0000000..e855df0
--- /dev/null
+++ b/check_valid_utf8.pl
@@ -0,0 +1,73 @@
+#!/usr/bin/env perl
+#
+# Takes a list of files on the command line and checks for valid
+# UTF-8 data. Used for checking .po files.
+#
+# Two checks are performed on every file:
+#   1. Each line must consist solely of well-formed UTF-8 sequences.
+#      Scanning of a file stops at the first offending line.
+#   2. Some line must contain the header field
+#      "Content-Type: text/plain; charset=UTF-8".
+#
+# Problems are reported on standard output. The exit status is 1 if
+# any file failed a check or could not be opened, and 0 otherwise.
+#
+# Copyright (C) 2009 Tobias Quathamer <toddy at debian.org>
+# Released under the GPL version 2 or later.
+
+use strict;
+use warnings;
+
+my $exit_status = 0;
+
+foreach my $filename (@ARGV) {
+  my $content_type_checked = 0;
+  # Three-argument open with a lexical handle: the filename is never
+  # parsed for mode characters, and the handle is scoped to this loop.
+  my $fh;
+  unless (open $fh, '<', $filename) {
+    # Report the real cause instead of silently reading nothing and
+    # blaming a missing Content-Type header. Continue with other files.
+    printf("Error in file %s:\n", $filename);
+    printf("Could not open file for reading: %s\n", $!);
+    $exit_status = 1;
+    next;
+  }
+  while (<$fh>) {
+    # Check for valid UTF-8 encoding. No encoding layer is requested
+    # on open, so the pattern is matched against the file's bytes;
+    # the whole line must be a sequence of the alternatives below.
+    unless (m/\A(?:
+      [\x09\x0A\x0D\x20-\x7E]            # ASCII
+    | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
+    |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
+    | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
+    |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
+    |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
+    | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
+    |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
+    )*\z/x) {
+      # Found invalid characters for UTF-8
+      printf("Error in file %s at line number %d:\n", $filename, $.);
+      # Show the line with the error
+      print;
+      $exit_status = 1;
+      # Skip the rest of the current file
+      last;
+    }
+    # Check that the Content-Type header field is set correctly.
+    if (!$content_type_checked && /Content-Type: text\/plain; charset=UTF-8/) {
+      $content_type_checked = 1;
+    }
+  }
+  unless ($content_type_checked) {
+    # Note: this is also reached when scanning stopped at an invalid
+    # line before the header field had been seen.
+    printf("Error in file %s:\n", $filename);
+    print("Could not detect correct Content-Type header field.\n");
+    $exit_status = 1;
+  }
+  close $fh;
+}
+
+exit($exit_status);
diff --git a/rules.make b/rules.make
index faaa961..3268077 100644
--- a/rules.make
+++ b/rules.make
@@ -4,15 +4,7 @@
 
 .PHONY: check-content
 check-content:
-	@grep "Content-Type" *po | grep -v "UTF-8" && touch found-non-utf.stamp || true
-	@if [ -e found-non-utf.stamp ]; then \
-		echo "*********"; \
-		echo "* Error *"; \
-		echo "*********"; \
-		echo "At least one file is not encoded in UTF-8. Please check."; \
-		rm -f found-non-utf.stamp; \
-		false; \
-	fi
+	perl $(top_builddir)/check_valid_utf8.pl $(pofiles)
 
 # This target merges all po files with the current pot file,
 # removes obsolete msgids and substitutes the Project-Id-Version
-- 
ISO language, territory, currency, script codes and their translations
    
    
More information about the Pkg-isocodes-devel
mailing list