[Po4a-commits] "po4a/lib/Locale/Po4a Po.pm, 1.71, 1.72 TransTractor.pm, 1.89, 1.90"

Fri Feb 16 20:13:02 CET 2007

Update of /cvsroot/po4a/po4a/lib/Locale/Po4a
In directory alioth:/tmp/cvs-serv11129/lib/Locale/Po4a

Modified Files:
	Po.pm TransTractor.pm 
Log Message:
Various speedups based on avoiding Encode::from_to, which spends 40% of its
time in find_encoding. As the encodings of the input file, output file and
po files do not change often, it's faster to keep the encoders in the
TransTractor or Po objects.


Index: Po.pm
===================================================================
RCS file: /cvsroot/po4a/po4a/lib/Locale/Po4a/Po.pm,v
retrieving revision 1.71
retrieving revision 1.72
diff -u -d -r1.71 -r1.72

--- Po.pm	31 Dec 2006 14:36:27 -0000	1.71
+++ Po.pm	16 Feb 2007 19:13:00 -0000	1.72
@@ -171,6 +171,7 @@
 				"MIME-Version: 1.0\n".
 				"Content-Type: text/plain; charset=CHARSET\n".
 				"Content-Transfer-Encoding: ENCODING");
+    $self->{encoder}=find_encoding("ascii");
 
     # To make stats about gettext hits
     $self->stats_clear();
@@ -991,6 +992,12 @@
 #	} FIXME: do that iff the header isn't the default one.
 	$self->{header}=$msgstr;
 	$self->{header_comment}=$comment;
+	my $charset = $self->get_charset;
+	if ($charset ne "CHARSET") {
+	    $self->{encoder}=find_encoding($charset);
+	} else {
+	    $self->{encoder}=find_encoding("ascii");
+	}
 	return;
     }
 
@@ -1158,6 +1165,7 @@
     $oldchar = $self->get_charset();
 
     $self->{header} =~ s/$oldchar/$newchar/;
+    $self->{encoder}=find_encoding($newchar);
 }
 
 #----[ helper functions ]---------------------------------------------------

Index: TransTractor.pm
===================================================================
RCS file: /cvsroot/po4a/po4a/lib/Locale/Po4a/TransTractor.pm,v
retrieving revision 1.89
retrieving revision 1.90
diff -u -d -r1.89 -r1.90
--- TransTractor.pm	14 Feb 2007 23:02:12 -0000	1.89
+++ TransTractor.pm	16 Feb 2007 19:13:00 -0000	1.90
@@ -280,14 +280,13 @@
 	$newparams{$_}=$params{$_};
     }
 
-    $self->{TT}{'file_in_charset'}=$params{'file_in_charset'};
+    $self->detected_charset($params{'file_in_charset'});
     $self->{TT}{'file_out_charset'}=$params{'file_out_charset'};
-    $self->{TT}{'addendum_charset'}=$params{'addendum_charset'};
-    if (defined $self->{TT}{'file_in_charset'} and
-        length $self->{TT}{'file_in_charset'} and
-        $self->{TT}{'file_in_charset'} !~ m/ascii/i) {
-	$self->{TT}{ascii_input}=0;
+    if (defined($self->{TT}{'file_out_charset'}) and
+	length($self->{TT}{'file_out_charset'})) {
+	$self->{TT}{'file_out_encoder'} = find_encoding($self->{TT}{'file_out_charset'});
     }
+    $self->{TT}{'addendum_charset'}=$params{'addendum_charset'};
 
     foreach my $file (@{$params{'po_in_name'}}) {
 	print STDERR "readpo($file)... " if $self->debug();
@@ -835,7 +834,9 @@
     }
 
     if ($self->{TT}{po_in}->get_charset ne "CHARSET") {
-	Encode::from_to($string, $in_charset, $self->{TT}{po_in}->get_charset);
+	$string = encode_from_to($string,
+	                         $self->{TT}{'file_in_encoder'},
+	                         $self->{TT}{po_in}{encoder});
     }
 
     if (defined $options{'wrapcol'} && $options{'wrapcol'} < 0) {
@@ -847,8 +848,13 @@
 					'wrapcol'   => $options{'wrapcol'});
 
     if ($self->{TT}{po_in}->get_charset ne "CHARSET") {
-	Encode::from_to($transstring,$self->{TT}{po_in}->get_charset,
-	    $self->get_out_charset);
+	my $out_encoder = $self->{TT}{'file_out_encoder'};
+	unless (defined $out_encoder) {
+	    $out_encoder = find_encoding($self->get_out_charset)
+	}
+	$transstring = encode_from_to($transstring,
+	                              $self->{TT}{po_in}{encoder},
+	                              $out_encoder);
     }
 
     # If the input document isn't completely in ascii, we should see what to
@@ -929,10 +935,13 @@
 sub detected_charset {
     my ($self,$charset)=(shift,shift);
     unless (defined($self->{TT}{'file_in_charset'}) and
-	length($self->{TT}{'file_in_charset'}) ) {
-
-	$self->{TT}{'file_in_charset'}=$charset;
+            length($self->{TT}{'file_in_charset'}) ) {
+        $self->{TT}{'file_in_charset'}=$charset;
+        if (defined $charset) {
+            $self->{TT}{'file_in_encoder'}=find_encoding($charset);
+        }
     }
+
     if (defined $self->{TT}{'file_in_charset'} and
         length $self->{TT}{'file_in_charset'} and
         $self->{TT}{'file_in_charset'} !~ m/ascii/i) {
@@ -992,8 +1001,9 @@
     unless ($self->{TT}{'ascii_input'}) {
 	if(defined($self->{TT}{'file_in_charset'}) and
 	    length($self->{TT}{'file_in_charset'}) ) {
-	    Encode::from_to($text,$self->{TT}{'file_in_charset'},
-		$self->get_out_charset);
+	    $text = encode_from_to($text,
+	                           $self->{TT}{'file_in_encoder'},
+	                           find_encoding($self->get_out_charset));
 	} else {
 	    die wrap_mod("po4a", dgettext("po4a", "Couldn't determine the input document's charset. Please specify it on the command line. (non-ascii char at %s)"), $self->{TT}{non_ascii_ref})
 	}
@@ -1001,6 +1011,38 @@
     return $text;
 }
 
+
+# encode_from_to($text, $from, $to)
+#
+# Convert the given text from one encoding to another and return the result.
+# Unlike Encode::from_to, it does not take encoding names as arguments but
+# encoder objects (as returned by Encode::find_encoding(<name>)), and the
+# converted text is returned to the caller. Reusing encoder objects saves
+# many calls to find_encoding.
+#
+# If the "from" encoder is undefined, the text is assumed to be UTF-8 (or
+# ascii) already and the decoding step is skipped.
+# If the "to" encoder is undefined, the encoding step is skipped (the text
+# is treated as UTF-8).
+sub encode_from_to {
+    my ($text,$from,$to) = (shift,shift,shift);
+
+    if (not defined $from) {
+        # No "from" encoder: ascii and UTF-8 input need no conversion to
+        # get an UTF-8 string.
+    } else {
+        $text = $from->decode($text, 0); # 0 is presumably Encode's CHECK argument (default handling) - confirm
+    }
+
+    if (not defined $to) {
+        # No "to" encoder: the text is kept as is, no conversion needed.
+    } else {
+        $text = $to->encode($text, 0); # same trailing 0 as for decode above
+    }
+
+    return $text;
+}
+
 =back
 
 =head1 FUTURE DIRECTIONS