[Po4a-devel][patch] Take 2: Making Html.pm (a little) better

Yves Rutschle debian.anti-spam@rutschle.net
Sun, 28 Nov 2004 14:58:15 +0000


--C7zPtVaVf+AK4Oqc
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

Ok, here is a much bigger patch:

Thanks to Nekral for pointing out 2 problems:

- Those two spaces around the pushline() call were wrong,
  but so was what I did. After further thought, it's
  actually obvious that you _cannot_ touch leading and
  trailing spaces. They are now preserved, and all we do is
  collapse multiple spaces into one.

- The title/alt attribute translation was indeed wrong. Now
  the <img> tag is rewritten entirely.

Additional things:

- tokens with no content are no longer translated (things
  like <b>hello</b>, <i>world</i> would generate a msgid ", ")

- Test suite added. Thanks to Denis for making me do this
  (it made me find potential problems) and to Jordi for
  pointing out po4a-normalize.

Enjoy!
Y. - damn, soon I'll run out of excuses for not translating
the site.


--C7zPtVaVf+AK4Oqc
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="html.patch"

diff -urN po4a_orig/lib/Locale/Po4a/Html.pm po4a/lib/Locale/Po4a/Html.pm
--- po4a_orig/lib/Locale/Po4a/Html.pm	2004-08-27 11:31:53.000000000 +0100
+++ po4a/lib/Locale/Po4a/Html.pm	2004-11-28 15:21:01.000000000 +0000
@@ -80,11 +80,16 @@
     my ($self,$filename)=@_;
     my $stream = HTML::TokeParser->new($filename)
         || die "Couldn't read HTML file $filename : $!";
+
+    $stream->unbroken_text( [1] );
     
     my @type=();
     NEXT : while (my $token = $stream->get_token) {
         if($token->[0] eq 'T') {
-            my $text = trim($token->[1]);
+            my $text = $token->[1];
+            my ($pre_spaces) = ($text =~ /^(\s*)/);
+            my ($post_spaces) = ($text =~ /(\s*)$/);
+            $text = trim($text);
             if (notranslation($text) == 1) {
                 $self->pushline( get_tag( $token ) );
                 next NEXT;
@@ -97,14 +102,38 @@
 #  $encoded = HTML::Entities::encode($a);
 #  $decoded = HTML::Entities::decode($a);
 	    #print STDERR $token->[0];
-            $self->pushline( " ".$self->translate($text,
+            $self->pushline( $pre_spaces . $self->translate($text,
 		                                  "FIXME:0",
 		                                  (scalar @type ? $type[scalar @type-1]: "NOTYPE")
-	                                         )." " );
+	                                         ) . $post_spaces,
+                             'wrap' => 1
+                             );
             next NEXT;
 	} elsif ($token->[0] eq 'S') {
 	    push @type,$token->[1];
-            $self->pushline( get_tag( $token ) );
+            my $text =  get_tag( $token );
+            if ( $token->[1] eq 'img' ) {
+                my %attr = %{$token->[2]};
+                for my $a (qw/title alt/) {
+                    my $content = $attr{$a};
+                    if (defined $content) {
+                        $content = trim($content);
+                        my $translated = $self->translate( 
+                                              $content,
+                                              "FIXME:0",
+                                              "img_$a"
+                                              );
+                        $attr{$a} = $translated;
+                    }
+                }
+                my ($closing) = ( $text =~ /(\s*\/?>)/ );
+                # reconstruct the tag from scratch
+                delete $attr{'/'}; # Parser thinks closing / in XHTML is an attribute
+                $text = "<img";
+                $text .= " $_=\"$attr{$_}\"" foreach keys %attr;
+                $text .= $closing;
+            }
+            $self->pushline( $text );
         } elsif ($token->[0] eq 'E') {
 	    pop @type;
             $self->pushline( get_tag( $token ) );
@@ -136,11 +165,12 @@
 
 sub trim { 
     my $s=shift;
-    $s =~ s/\n//g;  # remove \n in text
-    $s =~ s/\r//g;  # remove \r in text
-    $s =~ s/\t//g;  # remove tabulations
-    $s =~ s/^\s+//; # remove leading spaces
-    $s =~ s/\s+$//; # remove trailing spaces
+    $s =~ s/\n/ /g;  # remove \n in text
+    $s =~ s/\r/ /g;  # remove \r in text
+    $s =~ s/\t/ /g;  # remove tabulations
+    $s =~ s/\s+/ /g; # remove multiple spaces
+    $s =~ s/^\s*//g; # remove leading spaces
+    $s =~ s/\s*$//g; # remove trailing spaces
     return $s;
 } 
 
@@ -163,6 +193,11 @@
     # don't translate entries composed of one entity
     return 1 if ($s =~ /^&[^;]*;$/);
     
+# don't translate entries with no letters
+# (happens with e.g.  <b>Hello</b>, <i>world</i> )
+#                                 ^^
+#                    ", " doesn't need translation
+    return 1 unless $s =~ /\w/;
     return 0;          
 }
 
diff -urN po4a_orig/t/22-html.t po4a/t/22-html.t
--- po4a_orig/t/22-html.t	1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/22-html.t	2004-11-28 01:43:34.000000000 +0000
@@ -0,0 +1,65 @@
+#! /usr/bin/perl
+# HTML module tester.
+
+#########################
+
+use strict;
+use warnings;
+
+my @tests;
+
+mkdir "t/tmp" unless -e "t/tmp";
+
+my $diff_po_flags = " -I '^# SOME' -I '^# Test' ".
+  "-I '^\"POT-Creation-Date: ' -I '^\"Content-Transfer-Encoding:'";
+
+push @tests, {
+  'run' => 'perl ../../po4a-gettextize -f html -m ../data-22/html.html -p html.po',
+  'test'=> "diff -u $diff_po_flags ../data-22/html.po html.po",
+  'doc' => 'General',
+}, {
+  'run' => 'perl ../../po4a-normalize -f html ../data-22/spaces.html',
+  'test'=> "diff -u $diff_po_flags ../data-22/spaces.po po4a-normalize.po".
+            "&& diff -u $diff_po_flags ../data-22/spaces_out.html po4a-normalize.output",
+  'doc' => 'Spaces',
+}, {
+  'run' => 'perl ../../po4a-gettextize -f html -m ../data-22/attribute.html -p attribute.po;'.
+           'sed "s/msgstr \"\"/msgstr \"baz\"/" attribute.po > attribute2.po;'.
+           'perl ../../po4a-translate -f html -m ../data-22/attribute.html -p attribute2.po -l attribute.html'
+  ,
+  'test'=> "diff -u $diff_po_flags ../data-22/attribute_out.html attribute.html",
+  'doc' => 'Attribute replacement'
+};
+
+use Test::More tests => 6;
+
+chdir "t/tmp" || die "Can't chdir to my test directory";
+
+foreach my $test ( @tests ) {
+    my ($val,$name);
+
+    my $cmd=$test->{'run'};
+    $val=system($cmd);
+
+    $name=$test->{'doc'}.' runs';
+    ok($val == 0,$name);
+    diag($test->{'run'}) unless ($val == 0);
+
+    SKIP: {
+    	skip ("Command didn't run, can't test the validity of its return",1)
+	     if $val;
+        $val=system($test->{'test'});	
+    	$name=$test->{'doc'}.' returns what is expected';
+        ok($val == 0,$name);
+	unless ($val == 0) {
+	    diag ("Failed (retval=$val) on:");
+	    diag ($test->{'test'});
+	    diag ("Was created with:");
+	    diag ($test->{'run'});
+	}
+    }
+}
+
+chdir "../.." || die "Can't chdir back to my root";
+
+0;
diff -urN po4a_orig/t/data-22/attribute.html po4a/t/data-22/attribute.html
--- po4a_orig/t/data-22/attribute.html	1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/data-22/attribute.html	2004-11-28 01:38:48.000000000 +0000
@@ -0,0 +1,4 @@
+<img src="foo.bar.html" title="bar">
+<img src="foo.bar.html" title="bar" >
+<img src="foo.bar.html" title="bar"/>
+<img src="foo.bar.html" title="bar" />
diff -urN po4a_orig/t/data-22/attribute_out.html po4a/t/data-22/attribute_out.html
--- po4a_orig/t/data-22/attribute_out.html	1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/data-22/attribute_out.html	2004-11-28 01:38:38.000000000 +0000
@@ -0,0 +1,4 @@
+<img src="foo.bar.html" title="baz">
+<img src="foo.bar.html" title="baz" >
+<img src="foo.bar.html" title="baz"/>
+<img src="foo.bar.html" title="baz" />
diff -urN po4a_orig/t/data-22/html.html po4a/t/data-22/html.html
--- po4a_orig/t/data-22/html.html	1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/data-22/html.html	2004-11-27 02:21:11.000000000 +0000
@@ -0,0 +1,26 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+
+<html>
+  <head>
+    <!--#set var="BLAH" value="1" -->
+    <!--#include virtual="header.shtml"-->
+    <title>Title string</title>
+  </head>
+
+  <body>
+    <div class="content">
+	    <h1>Header</h1>
+		<img src="somepicture.jpg" class="left" 
+                alt="Some text" title="My picture" />
+
+              <p><strong>Strong</strong>not strong</p>
+              <p>
+                <a href="somelink.html">My link</a>,
+                <a href="nextline.html">link on next line<a>, <a href="sameline.html">line on same line</a>.
+      </p>
+
+      <img src="picture.jpg" alt="picture">
+  </body>
+</html>
+
diff -urN po4a_orig/t/data-22/html.po po4a/t/data-22/html.po
--- po4a_orig/t/data-22/html.po	1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/data-22/html.po	2004-11-28 15:24:26.000000000 +0000
@@ -0,0 +1,75 @@
+# SOME DESCRIPTIVE TITLE
+# Copyright (C) YEAR Free Software Foundation, Inc.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+# 
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2004-11-28 15:24+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=CHARSET\n"
+"Content-Transfer-Encoding: ENCODING"
+
+# type: title
+#: FIXME:0
+#, no-wrap
+msgid "Title string"
+msgstr ""
+
+# type: h1
+#: FIXME:0
+#, no-wrap
+msgid "Header"
+msgstr ""
+
+# type: img_title
+#: FIXME:0
+#, no-wrap
+msgid "My picture"
+msgstr ""
+
+# type: img_alt
+#: FIXME:0
+#, no-wrap
+msgid "Some text"
+msgstr ""
+
+# type: strong
+#: FIXME:0
+#, no-wrap
+msgid "Strong"
+msgstr ""
+
+# type: p
+#: FIXME:0
+#, no-wrap
+msgid "not strong"
+msgstr ""
+
+# type: a
+#: FIXME:0
+#, no-wrap
+msgid "My link"
+msgstr ""
+
+# type: a
+#: FIXME:0
+#, no-wrap
+msgid "link on next line"
+msgstr ""
+
+# type: a
+#: FIXME:0
+#, no-wrap
+msgid "line on same line"
+msgstr ""
+
+# type: img_alt
+#: FIXME:0
+#, no-wrap
+msgid "picture"
+msgstr ""
diff -urN po4a_orig/t/data-22/spaces.html po4a/t/data-22/spaces.html
--- po4a_orig/t/data-22/spaces.html	1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/data-22/spaces.html	2004-11-28 00:38:55.000000000 +0000
@@ -0,0 +1,13 @@
+<html><head><title>Title string</title></head> 
+  <body>
+	    <h1>Header1</h1>
+	    <h1>Header2 </h1>
+              <p><strong>Strong1</strong>not   strong 1</p>
+              <p><strong>Strong2</strong> not strong 2</p>
+              <p><strong>Strong3</strong>not strong 2 </p>
+<b>first line</b>
+<b>second line</b><b>glued</b> <b>spaced</b>
+<a href="nextline.html">link on next line<a>, <a href="sameline.html">line on same line</a>.
+  </body>
+</html>
+
diff -urN po4a_orig/t/data-22/spaces.po po4a/t/data-22/spaces.po
--- po4a_orig/t/data-22/spaces.po	1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/data-22/spaces.po	2004-11-28 15:25:37.000000000 +0000
@@ -0,0 +1,99 @@
+# SOME DESCRIPTIVE TITLE
+# Copyright (C) YEAR Free Software Foundation, Inc.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+# 
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2004-11-28 15:25+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=CHARSET\n"
+"Content-Transfer-Encoding: ENCODING"
+
+# type: title
+#: FIXME:0
+#, no-wrap
+msgid "Title string"
+msgstr ""
+
+# type: h1
+#: FIXME:0
+#, no-wrap
+msgid "Header1"
+msgstr ""
+
+# type: h1
+#: FIXME:0
+#, no-wrap
+msgid "Header2"
+msgstr ""
+
+# type: strong
+#: FIXME:0
+#, no-wrap
+msgid "Strong1"
+msgstr ""
+
+# type: p
+#: FIXME:0
+#, no-wrap
+msgid "not strong 1"
+msgstr ""
+
+# type: strong
+#: FIXME:0
+#, no-wrap
+msgid "Strong2"
+msgstr ""
+
+# type: p
+#: FIXME:0 FIXME:0
+#, no-wrap
+msgid "not strong 2"
+msgstr ""
+
+# type: strong
+#: FIXME:0
+#, no-wrap
+msgid "Strong3"
+msgstr ""
+
+# type: b
+#: FIXME:0
+#, no-wrap
+msgid "first line"
+msgstr ""
+
+# type: b
+#: FIXME:0
+#, no-wrap
+msgid "second line"
+msgstr ""
+
+# type: b
+#: FIXME:0
+#, no-wrap
+msgid "glued"
+msgstr ""
+
+# type: b
+#: FIXME:0
+#, no-wrap
+msgid "spaced"
+msgstr ""
+
+# type: a
+#: FIXME:0
+#, no-wrap
+msgid "link on next line"
+msgstr ""
+
+# type: a
+#: FIXME:0
+#, no-wrap
+msgid "line on same line"
+msgstr ""
diff -urN po4a_orig/t/data-22/spaces_out.html po4a/t/data-22/spaces_out.html
--- po4a_orig/t/data-22/spaces_out.html	1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/data-22/spaces_out.html	2004-11-28 15:25:37.000000000 +0000
@@ -0,0 +1,13 @@
+<html><head><title>Title string</title></head> 
+  <body>
+	    <h1>Header1</h1>
+	    <h1>Header2 </h1>
+              <p><strong>Strong1</strong>not strong 1</p>
+              <p><strong>Strong2</strong> not strong 2</p>
+              <p><strong>Strong3</strong>not strong 2 </p>
+<b>first line</b>
+<b>second line</b><b>glued</b> <b>spaced</b>
+<a href="nextline.html">link on next line<a>, <a href="sameline.html">line on same line</a>.
+  </body>
+</html>
+

--C7zPtVaVf+AK4Oqc--