[Po4a-commits] "po4a/lib/Locale/Po4a Po.pm, 1.71,
1.72 TransTractor.pm, 1.89, 1.90"
Nicolas FRANCOIS
nekral-guest at alioth.debian.org
Fri Feb 16 20:13:02 CET 2007
Update of /cvsroot/po4a/po4a/lib/Locale/Po4a
In directory alioth:/tmp/cvs-serv11129/lib/Locale/Po4a
Modified Files:
Po.pm TransTractor.pm
Log Message:
Various speedups based on avoiding Encode::from_to, which spends 40% of its
time in find_encoding. As the encodings of the input file, output file and
po files rarely change, it is faster to keep the encoders in the
TransTractor or Po objects.
Index: Po.pm
===================================================================
RCS file: /cvsroot/po4a/po4a/lib/Locale/Po4a/Po.pm,v
retrieving revision 1.71
retrieving revision 1.72
diff -u -d -r1.71 -r1.72
--- Po.pm 31 Dec 2006 14:36:27 -0000 1.71
+++ Po.pm 16 Feb 2007 19:13:00 -0000 1.72
@@ -171,6 +171,7 @@
"MIME-Version: 1.0\n".
"Content-Type: text/plain; charset=CHARSET\n".
"Content-Transfer-Encoding: ENCODING");
+ $self->{encoder}=find_encoding("ascii");
# To make stats about gettext hits
$self->stats_clear();
@@ -991,6 +992,12 @@
# } FIXME: do that iff the header isn't the default one.
$self->{header}=$msgstr;
$self->{header_comment}=$comment;
+ my $charset = $self->get_charset;
+ if ($charset ne "CHARSET") {
+ $self->{encoder}=find_encoding($charset);
+ } else {
+ $self->{encoder}=find_encoding("ascii");
+ }
return;
}
@@ -1158,6 +1165,7 @@
$oldchar = $self->get_charset();
$self->{header} =~ s/$oldchar/$newchar/;
+ $self->{encoder}=find_encoding($newchar);
}
#----[ helper functions ]---------------------------------------------------
Index: TransTractor.pm
===================================================================
RCS file: /cvsroot/po4a/po4a/lib/Locale/Po4a/TransTractor.pm,v
retrieving revision 1.89
retrieving revision 1.90
diff -u -d -r1.89 -r1.90
--- TransTractor.pm 14 Feb 2007 23:02:12 -0000 1.89
+++ TransTractor.pm 16 Feb 2007 19:13:00 -0000 1.90
@@ -280,14 +280,13 @@
$newparams{$_}=$params{$_};
}
- $self->{TT}{'file_in_charset'}=$params{'file_in_charset'};
+ $self->detected_charset($params{'file_in_charset'});
$self->{TT}{'file_out_charset'}=$params{'file_out_charset'};
- $self->{TT}{'addendum_charset'}=$params{'addendum_charset'};
- if (defined $self->{TT}{'file_in_charset'} and
- length $self->{TT}{'file_in_charset'} and
- $self->{TT}{'file_in_charset'} !~ m/ascii/i) {
- $self->{TT}{ascii_input}=0;
+ if (defined($self->{TT}{'file_out_charset'}) and
+ length($self->{TT}{'file_out_charset'})) {
+ $self->{TT}{'file_out_encoder'} = find_encoding($self->{TT}{'file_out_charset'});
}
+ $self->{TT}{'addendum_charset'}=$params{'addendum_charset'};
foreach my $file (@{$params{'po_in_name'}}) {
print STDERR "readpo($file)... " if $self->debug();
@@ -835,7 +834,9 @@
}
if ($self->{TT}{po_in}->get_charset ne "CHARSET") {
- Encode::from_to($string, $in_charset, $self->{TT}{po_in}->get_charset);
+ $string = encode_from_to($string,
+ $self->{TT}{'file_in_encoder'},
+ $self->{TT}{po_in}{encoder});
}
if (defined $options{'wrapcol'} && $options{'wrapcol'} < 0) {
@@ -847,8 +848,13 @@
'wrapcol' => $options{'wrapcol'});
if ($self->{TT}{po_in}->get_charset ne "CHARSET") {
- Encode::from_to($transstring,$self->{TT}{po_in}->get_charset,
- $self->get_out_charset);
+ my $out_encoder = $self->{TT}{'file_out_encoder'};
+ unless (defined $out_encoder) {
+ $out_encoder = find_encoding($self->get_out_charset)
+ }
+ $transstring = encode_from_to($transstring,
+ $self->{TT}{po_in}{encoder},
+ $out_encoder);
}
# If the input document isn't completely in ascii, we should see what to
@@ -929,10 +935,13 @@
sub detected_charset {
my ($self,$charset)=(shift,shift);
unless (defined($self->{TT}{'file_in_charset'}) and
- length($self->{TT}{'file_in_charset'}) ) {
-
- $self->{TT}{'file_in_charset'}=$charset;
+ length($self->{TT}{'file_in_charset'}) ) {
+ $self->{TT}{'file_in_charset'}=$charset;
+ if (defined $charset) {
+ $self->{TT}{'file_in_encoder'}=find_encoding($charset);
+ }
}
+
if (defined $self->{TT}{'file_in_charset'} and
length $self->{TT}{'file_in_charset'} and
$self->{TT}{'file_in_charset'} !~ m/ascii/i) {
@@ -992,8 +1001,9 @@
unless ($self->{TT}{'ascii_input'}) {
if(defined($self->{TT}{'file_in_charset'}) and
length($self->{TT}{'file_in_charset'}) ) {
- Encode::from_to($text,$self->{TT}{'file_in_charset'},
- $self->get_out_charset);
+ $text = encode_from_to($text,
+ $self->{TT}{'file_in_encoder'},
+ find_encoding($self->get_out_charset));
} else {
die wrap_mod("po4a", dgettext("po4a", "Couldn't determine the input document's charset. Please specify it on the command line. (non-ascii char at %s)"), $self->{TT}{non_ascii_ref})
}
@@ -1001,6 +1011,38 @@
return $text;
}
+
+# encode_from_to($,$,$)
+#
+# Encode the given text from one encoding to another one.
+# It differs from Encode::from_to because it does not take the names of the
+# encodings as arguments, but the encoders (as returned by the
+# Encode::find_encoding(<name>) method). Thus it saves a bunch
+# of calls to find_encoding.
+#
+# If the "from" encoding is undefined, it is considered as UTF-8 (or
+# ascii).
+# If the "to" encoding is undefined, it is considered as UTF-8.
+#
+sub encode_from_to {
+ my ($text,$from,$to) = (shift,shift,shift);
+
+ if (not defined $from) {
+ # for ascii and UTF-8, no conversion needed to get an utf-8
+ # string.
+ } else {
+ $text = $from->decode($text, 0);
+ }
+
+ if (not defined $to) {
+ # Already in UTF-8, no conversion needed
+ } else {
+ $text = $to->encode($text, 0);
+ }
+
+ return $text;
+}
+
=back
=head1 FUTURE DIRECTIONS
More information about the Po4a-commits
mailing list