[Po4a-devel][patch] Take 2: Making Html.pm (a little) better
Yves Rutschle
debian.anti-spam@rutschle.net
Sun, 28 Nov 2004 14:58:15 +0000
--C7zPtVaVf+AK4Oqc
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Ok, here is a much bigger patch:
Thanks to Nekral for pointing out 2 problems:
- Those two spaces around the pushline() call were wrong,
but so was what I did. After further thinking, it's
actually obvious that you _cannot_ touch leading and
trailing spaces. They are now preserved, and all we do is
collapse runs of multiple spaces into a single space.
- The title/alt attribute translation was indeed wrong. Now
the <img> tag is rewritten entirely.
Additional things:
- tokens with no content are no longer translated (things
like <b>hello</b>, <i>world</i> would generate a msgid ", ")
- Test suite added. Thanks to Denis for making me do this
(it made me find potential problems) and to Jordi for pointing
out po4a-normalize.
Enjoy!
Y. - damn, soon I'll run out of excuses for not translating
the site.
--C7zPtVaVf+AK4Oqc
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="html.patch"
diff -urN po4a_orig/lib/Locale/Po4a/Html.pm po4a/lib/Locale/Po4a/Html.pm
--- po4a_orig/lib/Locale/Po4a/Html.pm 2004-08-27 11:31:53.000000000 +0100
+++ po4a/lib/Locale/Po4a/Html.pm 2004-11-28 15:21:01.000000000 +0000
@@ -80,11 +80,16 @@
my ($self,$filename)=@_;
my $stream = HTML::TokeParser->new($filename)
|| die "Couldn't read HTML file $filename : $!";
+
+ $stream->unbroken_text( [1] );
my @type=();
NEXT : while (my $token = $stream->get_token) {
if($token->[0] eq 'T') {
- my $text = trim($token->[1]);
+ my $text = $token->[1];
+ my ($pre_spaces) = ($text =~ /^(\s*)/);
+ my ($post_spaces) = ($text =~ /(\s*)$/);
+ $text = trim($text);
if (notranslation($text) == 1) {
$self->pushline( get_tag( $token ) );
next NEXT;
@@ -97,14 +102,38 @@
# $encoded = HTML::Entities::encode($a);
# $decoded = HTML::Entities::decode($a);
#print STDERR $token->[0];
- $self->pushline( " ".$self->translate($text,
+ $self->pushline( $pre_spaces . $self->translate($text,
"FIXME:0",
(scalar @type ? $type[scalar @type-1]: "NOTYPE")
- )." " );
+ ) . $post_spaces,
+ 'wrap' => 1
+ );
next NEXT;
} elsif ($token->[0] eq 'S') {
push @type,$token->[1];
- $self->pushline( get_tag( $token ) );
+ my $text = get_tag( $token );
+ if ( $token->[1] eq 'img' ) {
+ my %attr = %{$token->[2]};
+ for my $a (qw/title alt/) {
+ my $content = $attr{$a};
+ if (defined $content) {
+ $content = trim($content);
+ my $translated = $self->translate(
+ $content,
+ "FIXME:0",
+ "img_$a"
+ );
+ $attr{$a} = $translated;
+ }
+ }
+ my ($closing) = ( $text =~ /(\s*\/?>)/ );
+ # reconstruct the tag from scratch
+ delete $attr{'/'}; # Parser thinks closing / in XHTML is an attribute
+ $text = "<img";
+ $text .= " $_=\"$attr{$_}\"" foreach keys %attr;
+ $text .= $closing;
+ }
+ $self->pushline( $text );
} elsif ($token->[0] eq 'E') {
pop @type;
$self->pushline( get_tag( $token ) );
@@ -136,11 +165,12 @@
sub trim {
my $s=shift;
- $s =~ s/\n//g; # remove \n in text
- $s =~ s/\r//g; # remove \r in text
- $s =~ s/\t//g; # remove tabulations
- $s =~ s/^\s+//; # remove leading spaces
- $s =~ s/\s+$//; # remove trailing spaces
+ $s =~ s/\n/ /g; # remove \n in text
+ $s =~ s/\r/ /g; # remove \r in text
+ $s =~ s/\t/ /g; # remove tabulations
+ $s =~ s/\s+/ /g; # remove multiple spaces
+ $s =~ s/^\s*//g; # remove leading spaces
+ $s =~ s/\s*$//g; # remove trailing spaces
return $s;
}
@@ -163,6 +193,11 @@
# don't translate entries composed of one entity
return 1 if ($s =~ /^&[^;]*;$/);
+# don't translate entries with no letters
+# (happens with e.g. <b>Hello</b>, <i>world</i> )
+# ^^
+# ", " doesn't need translation
+ return 1 unless $s =~ /\w/;
return 0;
}
diff -urN po4a_orig/t/22-html.t po4a/t/22-html.t
--- po4a_orig/t/22-html.t 1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/22-html.t 2004-11-28 01:43:34.000000000 +0000
@@ -0,0 +1,65 @@
+#! /usr/bin/perl
+# HTML module tester.
+
+#########################
+
+use strict;
+use warnings;
+
+my @tests;
+
+mkdir "t/tmp" unless -e "t/tmp";
+
+my $diff_po_flags = " -I '^# SOME' -I '^# Test' ".
+ "-I '^\"POT-Creation-Date: ' -I '^\"Content-Transfer-Encoding:'";
+
+push @tests, {
+ 'run' => 'perl ../../po4a-gettextize -f html -m ../data-22/html.html -p html.po',
+ 'test'=> "diff -u $diff_po_flags ../data-22/html.po html.po",
+ 'doc' => 'General',
+}, {
+ 'run' => 'perl ../../po4a-normalize -f html ../data-22/spaces.html',
+ 'test'=> "diff -u $diff_po_flags ../data-22/spaces.po po4a-normalize.po".
+ "&& diff -u $diff_po_flags ../data-22/spaces_out.html po4a-normalize.output",
+ 'doc' => 'Spaces',
+}, {
+ 'run' => 'perl ../../po4a-gettextize -f html -m ../data-22/attribute.html -p attribute.po;'.
+ 'sed "s/msgstr \"\"/msgstr \"baz\"/" attribute.po > attribute2.po;'.
+ 'perl ../../po4a-translate -f html -m ../data-22/attribute.html -p attribute2.po -l attribute.html'
+ ,
+ 'test'=> "diff -u $diff_po_flags ../data-22/attribute_out.html attribute.html",
+ 'doc' => 'Attribute replacement'
+};
+
+use Test::More tests => 6;
+
+chdir "t/tmp" || die "Can't chdir to my test directory";
+
+foreach my $test ( @tests ) {
+ my ($val,$name);
+
+ my $cmd=$test->{'run'};
+ $val=system($cmd);
+
+ $name=$test->{'doc'}.' runs';
+ ok($val == 0,$name);
+ diag($test->{'run'}) unless ($val == 0);
+
+ SKIP: {
+ skip ("Command didn't run, can't test the validity of its return",1)
+ if $val;
+ $val=system($test->{'test'});
+ $name=$test->{'doc'}.' returns what is expected';
+ ok($val == 0,$name);
+ unless ($val == 0) {
+ diag ("Failed (retval=$val) on:");
+ diag ($test->{'test'});
+ diag ("Was created with:");
+ diag ($test->{'run'});
+ }
+ }
+}
+
+chdir "../.." || die "Can't chdir back to my root";
+
+0;
diff -urN po4a_orig/t/data-22/attribute.html po4a/t/data-22/attribute.html
--- po4a_orig/t/data-22/attribute.html 1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/data-22/attribute.html 2004-11-28 01:38:48.000000000 +0000
@@ -0,0 +1,4 @@
+<img src="foo.bar.html" title="bar">
+<img src="foo.bar.html" title="bar" >
+<img src="foo.bar.html" title="bar"/>
+<img src="foo.bar.html" title="bar" />
diff -urN po4a_orig/t/data-22/attribute_out.html po4a/t/data-22/attribute_out.html
--- po4a_orig/t/data-22/attribute_out.html 1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/data-22/attribute_out.html 2004-11-28 01:38:38.000000000 +0000
@@ -0,0 +1,4 @@
+<img src="foo.bar.html" title="baz">
+<img src="foo.bar.html" title="baz" >
+<img src="foo.bar.html" title="baz"/>
+<img src="foo.bar.html" title="baz" />
diff -urN po4a_orig/t/data-22/html.html po4a/t/data-22/html.html
--- po4a_orig/t/data-22/html.html 1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/data-22/html.html 2004-11-27 02:21:11.000000000 +0000
@@ -0,0 +1,26 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+
+<html>
+ <head>
+ <!--#set var="BLAH" value="1" -->
+ <!--#include virtual="header.shtml"-->
+ <title>Title string</title>
+ </head>
+
+ <body>
+ <div class="content">
+ <h1>Header</h1>
+ <img src="somepicture.jpg" class="left"
+ alt="Some text" title="My picture" />
+
+ <p><strong>Strong</strong>not strong</p>
+ <p>
+ <a href="somelink.html">My link</a>,
+ <a href="nextline.html">link on next line<a>, <a href="sameline.html">line on same line</a>.
+ </p>
+
+ <img src="picture.jpg" alt="picture">
+ </body>
+</html>
+
diff -urN po4a_orig/t/data-22/html.po po4a/t/data-22/html.po
--- po4a_orig/t/data-22/html.po 1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/data-22/html.po 2004-11-28 15:24:26.000000000 +0000
@@ -0,0 +1,75 @@
+# SOME DESCRIPTIVE TITLE
+# Copyright (C) YEAR Free Software Foundation, Inc.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2004-11-28 15:24+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=CHARSET\n"
+"Content-Transfer-Encoding: ENCODING"
+
+# type: title
+#: FIXME:0
+#, no-wrap
+msgid "Title string"
+msgstr ""
+
+# type: h1
+#: FIXME:0
+#, no-wrap
+msgid "Header"
+msgstr ""
+
+# type: img_title
+#: FIXME:0
+#, no-wrap
+msgid "My picture"
+msgstr ""
+
+# type: img_alt
+#: FIXME:0
+#, no-wrap
+msgid "Some text"
+msgstr ""
+
+# type: strong
+#: FIXME:0
+#, no-wrap
+msgid "Strong"
+msgstr ""
+
+# type: p
+#: FIXME:0
+#, no-wrap
+msgid "not strong"
+msgstr ""
+
+# type: a
+#: FIXME:0
+#, no-wrap
+msgid "My link"
+msgstr ""
+
+# type: a
+#: FIXME:0
+#, no-wrap
+msgid "link on next line"
+msgstr ""
+
+# type: a
+#: FIXME:0
+#, no-wrap
+msgid "line on same line"
+msgstr ""
+
+# type: img_alt
+#: FIXME:0
+#, no-wrap
+msgid "picture"
+msgstr ""
diff -urN po4a_orig/t/data-22/spaces.html po4a/t/data-22/spaces.html
--- po4a_orig/t/data-22/spaces.html 1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/data-22/spaces.html 2004-11-28 00:38:55.000000000 +0000
@@ -0,0 +1,13 @@
+<html><head><title>Title string</title></head>
+ <body>
+ <h1>Header1</h1>
+ <h1>Header2 </h1>
+ <p><strong>Strong1</strong>not strong 1</p>
+ <p><strong>Strong2</strong> not strong 2</p>
+ <p><strong>Strong3</strong>not strong 2 </p>
+<b>first line</b>
+<b>second line</b><b>glued</b> <b>spaced</b>
+<a href="nextline.html">link on next line<a>, <a href="sameline.html">line on same line</a>.
+ </body>
+</html>
+
diff -urN po4a_orig/t/data-22/spaces.po po4a/t/data-22/spaces.po
--- po4a_orig/t/data-22/spaces.po 1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/data-22/spaces.po 2004-11-28 15:25:37.000000000 +0000
@@ -0,0 +1,99 @@
+# SOME DESCRIPTIVE TITLE
+# Copyright (C) YEAR Free Software Foundation, Inc.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2004-11-28 15:25+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=CHARSET\n"
+"Content-Transfer-Encoding: ENCODING"
+
+# type: title
+#: FIXME:0
+#, no-wrap
+msgid "Title string"
+msgstr ""
+
+# type: h1
+#: FIXME:0
+#, no-wrap
+msgid "Header1"
+msgstr ""
+
+# type: h1
+#: FIXME:0
+#, no-wrap
+msgid "Header2"
+msgstr ""
+
+# type: strong
+#: FIXME:0
+#, no-wrap
+msgid "Strong1"
+msgstr ""
+
+# type: p
+#: FIXME:0
+#, no-wrap
+msgid "not strong 1"
+msgstr ""
+
+# type: strong
+#: FIXME:0
+#, no-wrap
+msgid "Strong2"
+msgstr ""
+
+# type: p
+#: FIXME:0 FIXME:0
+#, no-wrap
+msgid "not strong 2"
+msgstr ""
+
+# type: strong
+#: FIXME:0
+#, no-wrap
+msgid "Strong3"
+msgstr ""
+
+# type: b
+#: FIXME:0
+#, no-wrap
+msgid "first line"
+msgstr ""
+
+# type: b
+#: FIXME:0
+#, no-wrap
+msgid "second line"
+msgstr ""
+
+# type: b
+#: FIXME:0
+#, no-wrap
+msgid "glued"
+msgstr ""
+
+# type: b
+#: FIXME:0
+#, no-wrap
+msgid "spaced"
+msgstr ""
+
+# type: a
+#: FIXME:0
+#, no-wrap
+msgid "link on next line"
+msgstr ""
+
+# type: a
+#: FIXME:0
+#, no-wrap
+msgid "line on same line"
+msgstr ""
diff -urN po4a_orig/t/data-22/spaces_out.html po4a/t/data-22/spaces_out.html
--- po4a_orig/t/data-22/spaces_out.html 1970-01-01 01:00:00.000000000 +0100
+++ po4a/t/data-22/spaces_out.html 2004-11-28 15:25:37.000000000 +0000
@@ -0,0 +1,13 @@
+<html><head><title>Title string</title></head>
+ <body>
+ <h1>Header1</h1>
+ <h1>Header2 </h1>
+ <p><strong>Strong1</strong>not strong 1</p>
+ <p><strong>Strong2</strong> not strong 2</p>
+ <p><strong>Strong3</strong>not strong 2 </p>
+<b>first line</b>
+<b>second line</b><b>glued</b> <b>spaced</b>
+<a href="nextline.html">link on next line<a>, <a href="sameline.html">line on same line</a>.
+ </body>
+</html>
+
--C7zPtVaVf+AK4Oqc--