r6424 - in /branches/upstream/libmarc-charset-perl/current: Changes MANIFEST META.yml etc/codetables.xml lib/MARC/Charset.pm t/farsi.marc t/farsi.t

dmn at users.alioth.debian.org dmn at users.alioth.debian.org
Fri Aug 10 11:09:12 UTC 2007


Author: dmn
Date: Fri Aug 10 11:09:12 2007
New Revision: 6424

URL: http://svn.debian.org/wsvn/?sc=1&rev=6424
Log:
[svn-upgrade] Integrating new upstream version, libmarc-charset-perl (0.98)

Added:
    branches/upstream/libmarc-charset-perl/current/t/farsi.marc
    branches/upstream/libmarc-charset-perl/current/t/farsi.t
Modified:
    branches/upstream/libmarc-charset-perl/current/Changes
    branches/upstream/libmarc-charset-perl/current/MANIFEST
    branches/upstream/libmarc-charset-perl/current/META.yml
    branches/upstream/libmarc-charset-perl/current/etc/codetables.xml
    branches/upstream/libmarc-charset-perl/current/lib/MARC/Charset.pm

Modified: branches/upstream/libmarc-charset-perl/current/Changes
URL: http://svn.debian.org/wsvn/branches/upstream/libmarc-charset-perl/current/Changes?rev=6424&op=diff
==============================================================================
--- branches/upstream/libmarc-charset-perl/current/Changes (original)
+++ branches/upstream/libmarc-charset-perl/current/Changes Fri Aug 10 11:09:12 2007
@@ -1,4 +1,18 @@
 Revision history for MARC::Charset
+
+0.98 Tue Aug  7 08:28:24 EDT 2007
+     - addition of two code elements to etc/codetables.xml that enable 
+       the conversion of some Arabic records that contain 0x8D and 0x8E
+       which ought to map to 0x200D and 0x200C in Unicode. These mappings
+       are present for Basic and Extended Latin, but are not present
+       in Arabic codetables. There are actually some records that seem
+       to prove the need for these rules (LCCN 2006552991). Thanks to 
+       François Charette <fcharette at ankabut.net> for finding and proposing
+       the fix. Rules were forwarded on to LC for inclusion in canonical 
+       character set mapping.
+     - added t/farsi.t and t/farsi.marc to enable testing of new 
+       code rules. Hopefully this will fail if the codetables.xml is
+       inadvertently replaced without LC having added the new rules.
 
 0.97 Sun May 20 13:48:31 EDT 2007
      - added t/null.t

Modified: branches/upstream/libmarc-charset-perl/current/MANIFEST
URL: http://svn.debian.org/wsvn/branches/upstream/libmarc-charset-perl/current/MANIFEST?rev=6424&op=diff
==============================================================================
--- branches/upstream/libmarc-charset-perl/current/MANIFEST (original)
+++ branches/upstream/libmarc-charset-perl/current/MANIFEST Fri Aug 10 11:09:12 2007
@@ -21,6 +21,8 @@
 t/decompose.t
 t/escape1.t
 t/escape2.t
+t/farsi.marc
+t/farsi.t
 t/hebrew1.marc
 t/hebrew2.marc
 t/hebrew3.marc

Modified: branches/upstream/libmarc-charset-perl/current/META.yml
URL: http://svn.debian.org/wsvn/branches/upstream/libmarc-charset-perl/current/META.yml?rev=6424&op=diff
==============================================================================
--- branches/upstream/libmarc-charset-perl/current/META.yml (original)
+++ branches/upstream/libmarc-charset-perl/current/META.yml Fri Aug 10 11:09:12 2007
@@ -1,7 +1,7 @@
 # http://module-build.sourceforge.net/META-spec.html
 #XXXXXXX This is a prototype!!!  It will change in the future!!! XXXXX#
 name:         MARC-Charset
-version:      0.97
+version:      0.98
 version_from: lib/MARC/Charset.pm
 installdirs:  site
 requires:
@@ -11,4 +11,4 @@
     XML::SAX:                      0
 
 distribution_type: module
-generated_by: ExtUtils::MakeMaker version 6.30
+generated_by: ExtUtils::MakeMaker version 6.30_01

Modified: branches/upstream/libmarc-charset-perl/current/etc/codetables.xml
URL: http://svn.debian.org/wsvn/branches/upstream/libmarc-charset-perl/current/etc/codetables.xml?rev=6424&op=diff
==============================================================================
--- branches/upstream/libmarc-charset-perl/current/etc/codetables.xml (original)
+++ branches/upstream/libmarc-charset-perl/current/etc/codetables.xml Fri Aug 10 11:09:12 2007
@@ -3801,7 +3801,32 @@
 				<ucs>030C</ucs>
 				<utf-8>CC8C</utf-8>
 				<name>SHORT U / COMBINING CARON</name>
-			</code>
+                        </code>
+
+                        <!-- 
+                        These last two code points were suggested by 
+                        François Charette (fcharette at ankabut.net) to 
+                        process LCCN 2006552991. They were forwarded
+                        on to Clay Redding and Nate Trail at LC 
+                        for addition to the canonical tables. Before
+                        replacing this file with the one from LC 
+                        at some future date the presence of these
+                        two code points should be confirmed. 
+                        ehs Aug-07-2007.
+                        -->
+
+                        <code>
+                                <marc>8D</marc>
+                                <ucs>200D</ucs>
+                                <utf-8>E2808D</utf-8>
+                                <name>JOINER / ZERO WIDTH JOINER</name>
+                        </code>
+                        <code>  
+                                <marc>8E</marc>
+                                <ucs>200C</ucs>
+                                <utf-8>E2808C</utf-8>
+                                <name>NON-JOINER / ZERO WIDTH NON-JOINER</name>
+                        </code>
 		</characterSet>
 	</codeTable>
 	<codeTable name="Greek" date="January 2000" number="8">
@@ -98940,4 +98965,4 @@
 			</grouping>
 		</characterSet>
 	</codeTable>
-</codeTables>
+</codeTables>

Modified: branches/upstream/libmarc-charset-perl/current/lib/MARC/Charset.pm
URL: http://svn.debian.org/wsvn/branches/upstream/libmarc-charset-perl/current/lib/MARC/Charset.pm?rev=6424&op=diff
==============================================================================
--- branches/upstream/libmarc-charset-perl/current/lib/MARC/Charset.pm (original)
+++ branches/upstream/libmarc-charset-perl/current/lib/MARC/Charset.pm Fri Aug 10 11:09:12 2007
@@ -1,6 +1,6 @@
 package MARC::Charset;
 
-our $VERSION = '0.97';
+our $VERSION = '0.98';
 use strict;
 use warnings;
 

Added: branches/upstream/libmarc-charset-perl/current/t/farsi.marc
URL: http://svn.debian.org/wsvn/branches/upstream/libmarc-charset-perl/current/t/farsi.marc?rev=6424&op=file
==============================================================================
--- branches/upstream/libmarc-charset-perl/current/t/farsi.marc (added)
+++ branches/upstream/libmarc-charset-perl/current/t/farsi.marc Fri Aug 10 11:09:12 2007
@@ -1,0 +1,5 @@
+(3)4cJGHNGfg Yehei GUagGf.(B
+(3)4agQSJ fSNgŽgGi NWi cJGHNGfî Yehei GUagGf /(B
+(3)4JgQGf :(B
+(3)4cJGHNGfî Yehei GUagGf.(B
+(3)4ebUhO, LhGO.(B

Added: branches/upstream/libmarc-charset-perl/current/t/farsi.t
URL: http://svn.debian.org/wsvn/branches/upstream/libmarc-charset-perl/current/t/farsi.t?rev=6424&op=file
==============================================================================
--- branches/upstream/libmarc-charset-perl/current/t/farsi.t (added)
+++ branches/upstream/libmarc-charset-perl/current/t/farsi.t Fri Aug 10 11:09:12 2007
@@ -1,0 +1,45 @@
+use Test::More no_plan;
+use strict;
+use warnings;
+
+# Date: Thu, 26 Jul 2007 17:16:01 +0200
+# From: fcharette at ankabut.net
+# To: ehs at pobox.com
+# Subject: [MARC::Charset] error with ZWNJ in strings encoded for Arabic
+# 
+# Dear Ed Summers,
+#
+# While converting records from the LoC from MARC-8 to UTF-8 using your
+# MARC::Charset module, I encounter the following error:
+#
+# no mapping found for [0x8E] at position 16 in agQSJ fSNgŽgGi NWi cJGHNGfî
+# Yehei GUagGf / g0=BASIC_ARABIC g1=EXTENDED_ARABIC at
+# /usr/lib/perl5/site_perl/5.8.7/MARC/Charset.pm line 209.
+# no mapping found for [0x8E] at position 42 in hRGQJ aQgfÞ h gfQ, GOGQg cd
+# cJGHNGfgŽgG, g0=BASIC_ARABIC g1=EXTENDED_ARABIC at
+# /usr/lib/perl5/site_perl/5.8.7/MARC/Charset.pm line 209.
+#
+# As you see, the problem is with byte 0x8E which corresponds to Unicode U+200C
+# ZERO WIDTH NON-JOINER.
+#
+# I found out by looking at the database codetable.xml that this "character" is
+# only included in (in XPath notation): //codeTable[@name="Basic and Extended
+# Latin"]/characterSet[@name="Extended Latin"].
+# But both U+200C and U+200D are occasionally needed for the Arabic script,
+# especially in Farsi (see for example LCCN 2006552991, which occasioned the
+# above two errors).
+#
+# --
+#
+# So two new rules were added to the code tables from LC and these errors 
+# went away. Hopefully the LC tables will be updated appropriately.
+
+use MARC::Charset qw(marc8_to_utf8);
+
+open FARSI, 't/farsi.marc';
+my @lines = <FARSI>;
+
+foreach my $line (@lines) {
+  ok marc8_to_utf8($line);
+}
+




More information about the Pkg-perl-cvs-commits mailing list