r6424 - in /branches/upstream/libmarc-charset-perl/current: Changes MANIFEST META.yml etc/codetables.xml lib/MARC/Charset.pm t/farsi.marc t/farsi.t
dmn at users.alioth.debian.org
dmn at users.alioth.debian.org
Fri Aug 10 11:09:12 UTC 2007
Author: dmn
Date: Fri Aug 10 11:09:12 2007
New Revision: 6424
URL: http://svn.debian.org/wsvn/?sc=1&rev=6424
Log:
[svn-upgrade] Integrating new upstream version, libmarc-charset-perl (0.98)
Added:
branches/upstream/libmarc-charset-perl/current/t/farsi.marc
branches/upstream/libmarc-charset-perl/current/t/farsi.t
Modified:
branches/upstream/libmarc-charset-perl/current/Changes
branches/upstream/libmarc-charset-perl/current/MANIFEST
branches/upstream/libmarc-charset-perl/current/META.yml
branches/upstream/libmarc-charset-perl/current/etc/codetables.xml
branches/upstream/libmarc-charset-perl/current/lib/MARC/Charset.pm
Modified: branches/upstream/libmarc-charset-perl/current/Changes
URL: http://svn.debian.org/wsvn/branches/upstream/libmarc-charset-perl/current/Changes?rev=6424&op=diff
==============================================================================
--- branches/upstream/libmarc-charset-perl/current/Changes (original)
+++ branches/upstream/libmarc-charset-perl/current/Changes Fri Aug 10 11:09:12 2007
@@ -1,4 +1,18 @@
Revision history for MARC::Charset
+
+0.98 Tue Aug 7 08:28:24 EDT 2007
+ - addition of two code elements to etc/codetables.xml that enable
+ the conversion of some Arabic records that contain 0x8D and 0x8E
+ which ought to map to 0x200D and 0x200C in Unicode. These mappings
+ are present for Basic and Extended Latin, but are not present
+ in Arabic codetables. There are actually some records that seem
+ to prove the need for these rules (LCCN 2006552991). Thanks to
+ François Charette <fcharette at ankabut.net> for finding and proposing
+ the fix. Rules were forwarded on to LC for inclusion in canonical
+ character set mapping.
+ - added t/farsi.t and t/farsi.marc to enable testing of new
+ code rules. Hopefully this will fail if the codetables.xml is
+ inadvertently replaced without LC having added the new rules.
0.97 Sun May 20 13:48:31 EDT 2007
- added t/null.t
Modified: branches/upstream/libmarc-charset-perl/current/MANIFEST
URL: http://svn.debian.org/wsvn/branches/upstream/libmarc-charset-perl/current/MANIFEST?rev=6424&op=diff
==============================================================================
--- branches/upstream/libmarc-charset-perl/current/MANIFEST (original)
+++ branches/upstream/libmarc-charset-perl/current/MANIFEST Fri Aug 10 11:09:12 2007
@@ -21,6 +21,8 @@
t/decompose.t
t/escape1.t
t/escape2.t
+t/farsi.marc
+t/farsi.t
t/hebrew1.marc
t/hebrew2.marc
t/hebrew3.marc
Modified: branches/upstream/libmarc-charset-perl/current/META.yml
URL: http://svn.debian.org/wsvn/branches/upstream/libmarc-charset-perl/current/META.yml?rev=6424&op=diff
==============================================================================
--- branches/upstream/libmarc-charset-perl/current/META.yml (original)
+++ branches/upstream/libmarc-charset-perl/current/META.yml Fri Aug 10 11:09:12 2007
@@ -1,7 +1,7 @@
# http://module-build.sourceforge.net/META-spec.html
#XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX#
name: MARC-Charset
-version: 0.97
+version: 0.98
version_from: lib/MARC/Charset.pm
installdirs: site
requires:
@@ -11,4 +11,4 @@
XML::SAX: 0
distribution_type: module
-generated_by: ExtUtils::MakeMaker version 6.30
+generated_by: ExtUtils::MakeMaker version 6.30_01
Modified: branches/upstream/libmarc-charset-perl/current/etc/codetables.xml
URL: http://svn.debian.org/wsvn/branches/upstream/libmarc-charset-perl/current/etc/codetables.xml?rev=6424&op=diff
==============================================================================
--- branches/upstream/libmarc-charset-perl/current/etc/codetables.xml (original)
+++ branches/upstream/libmarc-charset-perl/current/etc/codetables.xml Fri Aug 10 11:09:12 2007
@@ -3801,7 +3801,32 @@
<ucs>030C</ucs>
<utf-8>CC8C</utf-8>
<name>SHORT U / COMBINING CARON</name>
- </code>
+ </code>
+
+ <!--
+ These last two code points were suggested by
+ François Charette (fcharette at ankabut.net) to
+ process LCCN 2006552991. They were forwarded
+ on to Clay Redding and Nate Trail at LC
+ for addition to the canonical tables. Before
+ replacing this file with the one from LC
+ at some future date the presence of these
+ two code points should be confirmed.
+ ehs Aug-07-2007.
+ -->
+
+ <code>
+ <marc>8D</marc>
+ <ucs>200D</ucs>
+ <utf-8>E2808D</utf-8>
+ <name>JOINER / ZERO WIDTH JOINER</name>
+ </code>
+ <code>
+ <marc>8E</marc>
+ <ucs>200C</ucs>
+ <utf-8>E2808C</utf-8>
+ <name>NON-JOINER / ZERO WIDTH NON-JOINER</name>
+ </code>
</characterSet>
</codeTable>
<codeTable name="Greek" date="January 2000" number="8">
@@ -98940,4 +98965,4 @@
</grouping>
</characterSet>
</codeTable>
-</codeTables>
+</codeTables>
Modified: branches/upstream/libmarc-charset-perl/current/lib/MARC/Charset.pm
URL: http://svn.debian.org/wsvn/branches/upstream/libmarc-charset-perl/current/lib/MARC/Charset.pm?rev=6424&op=diff
==============================================================================
--- branches/upstream/libmarc-charset-perl/current/lib/MARC/Charset.pm (original)
+++ branches/upstream/libmarc-charset-perl/current/lib/MARC/Charset.pm Fri Aug 10 11:09:12 2007
@@ -1,6 +1,6 @@
package MARC::Charset;
-our $VERSION = '0.97';
+our $VERSION = '0.98';
use strict;
use warnings;
Added: branches/upstream/libmarc-charset-perl/current/t/farsi.marc
URL: http://svn.debian.org/wsvn/branches/upstream/libmarc-charset-perl/current/t/farsi.marc?rev=6424&op=file
==============================================================================
--- branches/upstream/libmarc-charset-perl/current/t/farsi.marc (added)
+++ branches/upstream/libmarc-charset-perl/current/t/farsi.marc Fri Aug 10 11:09:12 2007
@@ -1,0 +1,5 @@
+(3)4cJGHNGfg Yehei GUagGf.(B
+(3)4agQSJ fSNggGi NWi cJGHNGfî Yehei GUagGf /(B
+(3)4JgQGf :(B
+(3)4cJGHNGfî Yehei GUagGf.(B
+(3)4ebUhO, LhGO.(B
Added: branches/upstream/libmarc-charset-perl/current/t/farsi.t
URL: http://svn.debian.org/wsvn/branches/upstream/libmarc-charset-perl/current/t/farsi.t?rev=6424&op=file
==============================================================================
--- branches/upstream/libmarc-charset-perl/current/t/farsi.t (added)
+++ branches/upstream/libmarc-charset-perl/current/t/farsi.t Fri Aug 10 11:09:12 2007
@@ -1,0 +1,45 @@
+use Test::More no_plan;
+use strict;
+use warnings;
+
+# Date: Thu, 26 Jul 2007 17:16:01 +0200
+# From: fcharette at ankabut.net
+# To: ehs at pobox.com
+# Subject: [MARC::Charset] error with ZWNJ in strings encoded for Arabic
+#
+# Dear Ed Summers,
+#
+# While converting records from the LoC from MARC-8 to UTF-8 using your
+# MARC::Charset module, I encounter the following error:
+#
+# no mapping found for [0x8E] at position 16 in agQSJ fSNg�gGi NWi cJGHNGfî
+# Yehei GUagGf / g0=BASIC_ARABIC g1=EXTENDED_ARABIC at
+# /usr/lib/perl5/site_perl/5.8.7/MARC/Charset.pm line 209.
+# no mapping found for [0x8E] at position 42 in hRGQJ aQgfà h gfQ, GOGQg cd
+# cJGHNGfg�gG, g0=BASIC_ARABIC g1=EXTENDED_ARABIC at
+# /usr/lib/perl5/site_perl/5.8.7/MARC/Charset.pm line 209.
+#
+# As you see, the problem is with byte 0x8E which corresponds to Unicode U+200C
+# ZERO WIDTH NON-JOINER.
+#
+# I found out by looking at the database codetables.xml that this "character" is
+# only included in (in XPath notation): //codeTable[@name="Basic and Extended
+# Latin"]/characterSet[@name="Extended Latin"].
+# But both U+200C and U+200D are occasionally needed for the Arabic script,
+# especially in Farsi (see for example LCCN 2006552991, which occasioned the
+# above two errors).
+#
+# --
+#
+# So two new rules were added to the code tables from LC and these errors
+# went away. Hopefully the LC tables will be updated appropriately.
+
+use MARC::Charset qw(marc8_to_utf8);
+
+open FARSI, 't/farsi.marc';
+my @lines = <FARSI>;
+
+foreach my $line (@lines) {
+ ok marc8_to_utf8($line);
+}
+
More information about the Pkg-perl-cvs-commits
mailing list