r67639 - in /branches/upstream/liblingua-en-nameparse-perl/current: Changes MANIFEST META.yml lib/Lingua/EN/NameGrammar.pm lib/Lingua/EN/NameParse.pm lib/Lingua/EN/NameParse/ lib/Lingua/EN/NameParse/Grammar.pm t/rules.t
periapt-guest at users.alioth.debian.org
periapt-guest at users.alioth.debian.org
Fri Jan 21 22:16:49 UTC 2011
Author: periapt-guest
Date: Fri Jan 21 22:16:37 2011
New Revision: 67639
URL: http://svn.debian.org/wsvn/pkg-perl/?sc=1&rev=67639
Log:
[svn-upgrade] new version liblingua-en-nameparse-perl (1.29)
Added:
branches/upstream/liblingua-en-nameparse-perl/current/lib/Lingua/EN/NameParse/
branches/upstream/liblingua-en-nameparse-perl/current/lib/Lingua/EN/NameParse/Grammar.pm
Removed:
branches/upstream/liblingua-en-nameparse-perl/current/lib/Lingua/EN/NameGrammar.pm
Modified:
branches/upstream/liblingua-en-nameparse-perl/current/Changes
branches/upstream/liblingua-en-nameparse-perl/current/MANIFEST
branches/upstream/liblingua-en-nameparse-perl/current/META.yml
branches/upstream/liblingua-en-nameparse-perl/current/lib/Lingua/EN/NameParse.pm
branches/upstream/liblingua-en-nameparse-perl/current/t/rules.t
Modified: branches/upstream/liblingua-en-nameparse-perl/current/Changes
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/liblingua-en-nameparse-perl/current/Changes?rev=67639&op=diff
==============================================================================
--- branches/upstream/liblingua-en-nameparse-perl/current/Changes (original)
+++ branches/upstream/liblingua-en-nameparse-perl/current/Changes Fri Jan 21 22:16:37 2011
@@ -1,4 +1,11 @@
Revision history for Perl CPAN module Lingua::En::NameParse
+
+1.29 23 Jan 2011
+ Corrected documentation of case_components module, thanks to John Hansen
+ Removed invalid space after /Pilot Officer/ in extended titles grammar, thanks to John Hansen
+ Added the 'Mr_J_Adam_Smith' name type, thanks to John Hansen
+ Added the 'John' name type, thanks to Graham Seamen
+ Moved NameGrammar.pm to Lingua::EN::NameParse::Grammar name space
1.28 3 Jan 2011
Added more extended titles including Pilot Officer, Count, Duke, Dutchess, Marquess (thanks to Hugh Myers)
Modified: branches/upstream/liblingua-en-nameparse-perl/current/MANIFEST
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/liblingua-en-nameparse-perl/current/MANIFEST?rev=67639&op=diff
==============================================================================
--- branches/upstream/liblingua-en-nameparse-perl/current/MANIFEST (original)
+++ branches/upstream/liblingua-en-nameparse-perl/current/MANIFEST Fri Jan 21 22:16:37 2011
@@ -4,7 +4,7 @@
Makefile.PL
examples/demo.pl
lib/Lingua/EN/NameParse.pm
-lib/Lingua/EN/NameGrammar.pm
+lib/Lingua/EN/NameParse/Grammar.pm
surname_prefs.txt
t/main.t
t/rules.t
Modified: branches/upstream/liblingua-en-nameparse-perl/current/META.yml
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/liblingua-en-nameparse-perl/current/META.yml?rev=67639&op=diff
==============================================================================
--- branches/upstream/liblingua-en-nameparse-perl/current/META.yml (original)
+++ branches/upstream/liblingua-en-nameparse-perl/current/META.yml Fri Jan 21 22:16:37 2011
@@ -1,14 +1,22 @@
--- #YAML:1.0
-name: Lingua-EN-NameParse
-version: 1.28
-abstract: Manipulate peoples names, titles and initials
-license: perl
-author:
+name: Lingua-EN-NameParse
+version: 1.29
+abstract: Manipulate peoples names, titles and initials
+author:
- Kim Ryan
-generated_by: ExtUtils::MakeMaker version 6.42_01
-distribution_type: module
-requires:
- Parse::RecDescent: 0
+license: perl
+distribution_type: module
+configure_requires:
+ ExtUtils::MakeMaker: 0
+build_requires:
+ ExtUtils::MakeMaker: 0
+requires:
+ Parse::RecDescent: 0
+no_index:
+ directory:
+ - t
+ - inc
+generated_by: ExtUtils::MakeMaker version 6.56
meta-spec:
- url: http://module-build.sourceforge.net/META-spec-v1.3.html
- version: 1.3
+ url: http://module-build.sourceforge.net/META-spec-v1.4.html
+ version: 1.4
Modified: branches/upstream/liblingua-en-nameparse-perl/current/lib/Lingua/EN/NameParse.pm
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/liblingua-en-nameparse-perl/current/lib/Lingua/EN/NameParse.pm?rev=67639&op=diff
==============================================================================
--- branches/upstream/liblingua-en-nameparse-perl/current/lib/Lingua/EN/NameParse.pm (original)
+++ branches/upstream/liblingua-en-nameparse-perl/current/lib/Lingua/EN/NameParse.pm Fri Jan 21 22:16:37 2011
@@ -107,12 +107,13 @@
J_Adam_Smith
John_Smith
A_Smith
-
+ John
Precursors and suffixes are only applied to the following formats:
Mr_John_Adam_Smith
Mr_John_A_Smith
+ Mr_J_Adam_Smith
Mr_John_Smith
Mr_John_Smith
Mr_A_Smith
@@ -274,7 +275,7 @@
=head2 case_components
- %my_name = $name->components;
+ %my_name = $name->case_components;
$cased_surname = $my_name{surname_1};
@@ -391,22 +392,24 @@
=item type
The type of format a name is in, as one of the following strings:
-
- Mr_A_Smith_&_Ms_B_Jones
- Mr_&_Ms_A_&_B_Smith
- Mr_A_&_Ms_B_Smith
- Mr_&_Ms_A_Smith
- Mr_A_&_B_Smith
- Mr_John_Adam_Smith
- Mr_John_A_Smith
- Mr_John_Smith
- Mr_A_Smith
- John_Adam_Smith
- John_A_Smith
- J_Adam_Smith
- John_Smith
- A_Smith
- unknown
+
+ Mr_A_Smith_&_Ms_B_Jones
+ Mr_&_Ms_A_&_B_Smith
+ Mr_A_&_Ms_B_Smith
+ Mr_&_Ms_A_Smith
+ Mr_A_&_B_Smith
+ Mr_John_Adam_Smith
+ Mr_John_A_Smith
+ Mr_J_Adam_Smith
+ Mr_John_Smith
+ Mr_A_Smith
+ John_Adam_Smith
+ John_A_Smith
+ J_Adam_Smith
+ John_Smith
+ A_Smith
+ John
+ unknown
=item non_matching
@@ -523,7 +526,7 @@
=head1 COPYRIGHT AND LICENSE
-Copyright (c) 2008 Kim Ryan. All rights reserved.
+Copyright (c) 2011 Kim Ryan. All rights reserved.
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
@@ -539,13 +542,13 @@
use strict;
use warnings;
-use Lingua::EN::NameGrammar;
+use Lingua::EN::NameParse::Grammar;
use Parse::RecDescent;
use Exporter;
use vars qw (@ISA @EXPORT_OK);
-our $VERSION = '1.28';
+our $VERSION = '1.29';
@ISA = qw(Exporter);
@EXPORT_OK = qw(&clean &case_surname);
@@ -568,17 +571,10 @@
my $current_key;
foreach my $current_key (keys %args)
{
- if ( $current_key eq 'salutation' or $current_key eq 'sal_default' )
- {
- $name->{$current_key} = &_case_word($args{$current_key});
- }
- else
- {
- $name->{$current_key} = $args{$current_key};
- }
- }
-
- my $grammar = &Lingua::EN::NameGrammar::_create($name);
+ $name->{$current_key} = $args{$current_key};
+ }
+
+ my $grammar = &Lingua::EN::NameParse::Grammar::_create($name);
$name->{parse} = new Parse::RecDescent($grammar);
return ($name);
@@ -728,7 +724,8 @@
'John_A_Smith' => ['precursor','given_name_1','initials_1','surname_1','suffix'],
'J_Adam_Smith' => ['precursor','initials_1','middle_name','surname_1','suffix'],
'John_Smith' => ['precursor','given_name_1','surname_1','suffix'],
- 'A_Smith' => ['precursor','initials_1','surname_1','suffix']
+ 'A_Smith' => ['precursor','initials_1','surname_1','suffix'],
+ 'John' => ['given_name_1']
);
# only include names with a single surname
@@ -1065,7 +1062,12 @@
push(@greeting,$sal_default);
}
}
- }
+ }
+ else
+ {
+ warn "Invalid sal_type : ", $sal_type;
+ push(@greeting,$sal_default);
+ }
}
return(join(' ', at greeting));
}
@@ -1119,16 +1121,24 @@
# PRIVATE METHODS
#-------------------------------------------------------------------------------
-# Check that common reserved word (as found in company names) do not appear
+
sub _pre_parse
{
my $name = shift;
-
+ # Check that common reserved word (as found in company names) do not appear
if ( $name->{input_string} =~
/\bPty\.? Ltd\.?$|\bLtd\.?$|\bPLC$|Association|Department|National|Society/i )
{
$name->{error} = 1;
$name->{properties}{non_matching} = $name->{input_string};
+ }
+
+ # For the case of a single name such as 'Voltaire' we need to add a trailing space
+ # to the input string. This is because the grammar tree expects a terminator (the space)
+ # optionally followed by other productions or non matching text
+ if ( $name->{input_string} =~ /^[A-Z]{2,}(\-)?[A-Z]{0,}$/i )
+ {
+ $name->{input_string} .= ' ';
}
return($name);
@@ -1141,6 +1151,9 @@
{
my $name = shift;
+ # $::RD_TRACE = 1; # for debugging RecDescent output
+ # Use Parse::RecDescent to do the parsing. 'full_name' is a label for the complete grammar tree
+ # defined in Lingua::EN::NameParse::Grammar
my $parsed_name = $name->{parse}->full_name($name->{input_string});
# Place components into a separate hash, so they can be easily returned
Added: branches/upstream/liblingua-en-nameparse-perl/current/lib/Lingua/EN/NameParse/Grammar.pm
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/liblingua-en-nameparse-perl/current/lib/Lingua/EN/NameParse/Grammar.pm?rev=67639&op=file
==============================================================================
--- branches/upstream/liblingua-en-nameparse-perl/current/lib/Lingua/EN/NameParse/Grammar.pm (added)
+++ branches/upstream/liblingua-en-nameparse-perl/current/lib/Lingua/EN/NameParse/Grammar.pm Fri Jan 21 22:16:37 2011
@@ -1,0 +1,763 @@
+=head1 NAME
+
+Lingua::EN::NameGrammar - grammar tree for Lingua::EN::NameParse
+
+=head1 SYNOPSIS
+
+Internal functions called from NameParse.pm module
+
+=head1 DESCRIPTION
+
+Grammar tree of personal name syntax for Lingua::EN::NameParse module.
+
+The grammar defined here is for use with the Parse::RecDescent module.
+Note that parsing is done depth first, meaning match the shortest string first.
+To avoid premature matches, when one rule is a sub set of another longer rule,
+it must appear after the longer rule. See the Parse::RecDescent documentation
+for more details.
+
+
+=head1 AUTHOR
+
+NameParse::Grammar was written by Kim Ryan <kimryan at cpan dot org>.
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (c) 2011 Kim Ryan. All rights reserved.
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself, either Perl version 5.8.4 or,
+at your option, any later version of Perl 5 you may have available.
+
+
+
+
+=cut
+#------------------------------------------------------------------------------
+
+package Lingua::EN::NameParse::Grammar;
+use strict;
+use warnings;
+
+our $VERSION = '1.29';
+
+
+# Rules that define valid orderings of a names components
+
+my $rules_start = q{ full_name : };
+
+my $rules_joint_names =
+q{
+
+ # A (?) refers to an optional component, occurring 0 or more times.
+ # Optional items are returned as an array, which for our case will
+ # always consist of one element, when they exist.
+
+ title given_name surname conjunction title given_name surname non_matching(?)
+ {
+ # block of code to define actions upon successful completion of a
+ # 'production' or rule
+
+ # Two separate people
+ $return =
+ {
+ # Parse::RecDescent lets you return a single scalar, which we use as
+ # an anonymous hash reference
+ title_1 => $item[1],
+ given_name_1 => $item[2],
+ surname_1 => $item[3],
+ conjunction_1 => $item[4],
+ title_2 => $item[5],
+ given_name_2 => $item[6],
+ surname_2 => $item[7],
+ non_matching => $item[8][0],
+ number => 2,
+ type => 'Mr_John_Smith_&_Ms_Mary_Jones'
+ }
+ }
+ |
+
+
+ title initials surname conjunction title initials surname non_matching(?)
+ {
+ $return =
+ {
+ title_1 => $item[1],
+ initials_1 => $item[2],
+ surname_1 => $item[3],
+ conjunction_1 => $item[4],
+ title_2 => $item[5],
+ initials_2 => $item[6],
+ surname_2 => $item[7],
+ non_matching => $item[8][0],
+ number => 2,
+ type => 'Mr_A_Smith_&_Ms_B_Jones'
+ }
+ }
+ |
+
+ title initials conjunction initials surname non_matching(?)
+ {
+ # Two related people, shared title, separate initials,
+ # shared surname. Example, father and son, sisters
+ $return =
+ {
+ title_1 => $item[1],
+ initials_1 => $item[2],
+ conjunction_1 => $item[3],
+ initials_2 => $item[4],
+ surname_1 => $item[5],
+ non_matching => $item[6][0],
+ number => 2,
+ type => 'Mr_A_&_B_Smith'
+ }
+ }
+ |
+
+ title conjunction title initials conjunction initials surname non_matching(?)
+ {
+ # Two related people, own initials, shared surname
+
+ $return =
+ {
+ title_1 => $item[1],
+ conjunction_1 => $item[2],
+ title_2 => $item[3],
+ initials_1 => $item[4],
+ conjunction_2 => $item[5],
+ initials_2 => $item[6],
+ surname_1 => $item[7],
+ non_matching => $item[8][0],
+ number => 2,
+ type => 'Mr_&_Ms_A_&_B_Smith'
+ }
+ }
+ |
+
+ title initials conjunction title initials surname non_matching(?)
+ {
+ # Two related people, own initials, shared surname
+ $return =
+ {
+ title_1 => $item[1],
+ initials_1 => $item[2],
+ conjunction_1 => $item[3],
+ title_2 => $item[4],
+ initials_2 => $item[5],
+ surname_1 => $item[6],
+ non_matching => $item[7][0],
+ number => 2,
+ type => 'Mr_A_&_Ms_B_Smith'
+ }
+ }
+ |
+
+ title conjunction title initials surname non_matching(?)
+ {
+ # Two related people, shared initials, shared surname
+ $return =
+ {
+ title_1 => $item[1],
+ conjunction_1 => $item[2],
+ title_2 => $item[3],
+ initials_1 => $item[4],
+ surname_1 => $item[5],
+ non_matching => $item[6][0],
+ number => 2,
+ type => 'Mr_&_Ms_A_Smith'
+ }
+ }
+ |
+
+ given_name surname conjunction given_name surname non_matching(?)
+ {
+ $return =
+ {
+ given_name_1 => $item[1],
+ surname_1 => $item[2],
+ conjunction_1 => $item[3],
+ given_name_2 => $item[4],
+ surname_2 => $item[5],
+ non_matching => $item[6][0],
+ number => 2,
+ type => 'John_Smith_&_Mary_Jones'
+ }
+ }
+ |
+
+ initials surname conjunction initials surname non_matching(?)
+ {
+ $return =
+ {
+ initials_1 => $item[1],
+ surname_1 => $item[2],
+ conjunction_1 => $item[3],
+ initials_2 => $item[4],
+ surname_2 => $item[5],
+ non_matching => $item[6][0],
+ number => 2,
+ type => 'A_Smith_&_B_Jones'
+ }
+ }
+ |
+
+ given_name conjunction given_name surname non_matching(?)
+ {
+ $return =
+ {
+ given_name_1 => $item[1],
+ conjunction_1 => $item[2],
+ given_name_2 => $item[3],
+ surname_2 => $item[4],
+ non_matching => $item[5][0],
+ number => 2,
+ type => 'John_&_Mary_Smith'
+ }
+ }
+ |
+
+};
+
+my $rules_single_names =
+q{
+
+ precursor(?) title given_name middle_name surname suffix(?) non_matching(?)
+ {
+ $return =
+ {
+ precursor => $item[1][0],
+ title_1 => $item[2],
+ given_name_1 => $item[3],
+ middle_name => $item[4],
+ surname_1 => $item[5],
+ suffix => $item[6][0],
+ non_matching => $item[7][0],
+ number => 1,
+ type => 'Mr_John_Adam_Smith'
+ }
+ }
+ |
+
+ precursor(?) title given_name single_initial surname suffix(?) non_matching(?)
+ {
+ $return =
+ {
+ precursor => $item[1][0],
+ title_1 => $item[2],
+ given_name_1 => $item[3],
+ initials_1 => $item[4],
+ surname_1 => $item[5],
+ suffix => $item[6][0],
+ non_matching => $item[7][0],
+ number => 1,
+ type => 'Mr_John_A_Smith'
+ }
+ }
+ |
+
+ precursor(?) title single_initial middle_name surname suffix(?) non_matching(?)
+ {
+ $return =
+ {
+ precursor => $item[1][0],
+ title_1 => $item[2],
+ initials_1 => $item[3],
+ middle_name => $item[4],
+ surname_1 => $item[5],
+ suffix => $item[6][0],
+ non_matching => $item[7][0],
+ number => 1,
+ type => 'Mr_J_Adam_Smith'
+ }
+ }
+ |
+
+
+
+ precursor(?) title given_name surname suffix(?) non_matching(?)
+ {
+ $return =
+ {
+ precursor => $item[1][0],
+ title_1 => $item[2],
+ given_name_1 => $item[3],
+ surname_1 => $item[4],
+ suffix => $item[5][0],
+ non_matching => $item[6][0],
+ number => 1,
+ type => 'Mr_John_Smith'
+ }
+ }
+ |
+
+ precursor(?) title initials surname suffix(?) non_matching(?)
+ {
+ $return =
+ {
+ precursor => $item[1][0],
+ title_1 => $item[2],
+ initials_1 => $item[3],
+ surname_1 => $item[4],
+ suffix => $item[5][0],
+ non_matching => $item[6][0],
+ number => 1,
+ type => 'Mr_A_Smith'
+ }
+ }
+ |
+
+ precursor(?) given_name_min_2 middle_name surname suffix(?) non_matching(?)
+ {
+ $return =
+ {
+ precursor => $item[1][0],
+ given_name_1 => $item[2],
+ middle_name => $item[3],
+ surname_1 => $item[4],
+ suffix => $item[5][0],
+ non_matching => $item[6][0],
+ number => 1,
+ type => 'John_Adam_Smith'
+ }
+ }
+ |
+
+ precursor(?) given_name_min_2 single_initial surname suffix(?) non_matching(?)
+ {
+ $return =
+ {
+ precursor => $item[1][0],
+ given_name_1 => $item[2],
+ initials_1 => $item[3],
+ surname_1 => $item[4],
+ suffix => $item[5][0],
+ non_matching => $item[6][0],
+ number => 1,
+ type => 'John_A_Smith'
+ }
+ }
+ |
+
+ precursor(?) single_initial middle_name surname suffix(?) non_matching(?)
+ {
+ $return =
+ {
+ precursor => $item[1][0],
+ initials_1 => $item[2],
+ middle_name => $item[3],
+ surname_1 => $item[4],
+ suffix => $item[5][0],
+ non_matching => $item[6][0],
+ number => 1,
+ type => 'J_Adam_Smith'
+ }
+ }
+ |
+
+ precursor(?) given_name surname suffix(?) non_matching(?)
+ {
+ $return =
+ {
+ precursor => $item[1][0],
+ given_name_1 => $item[2],
+ surname_1 => $item[3],
+ suffix => $item[4][0],
+ non_matching => $item[5][0],
+ number => 1,
+ type => 'John_Smith'
+ }
+ }
+ |
+
+ precursor(?) initials surname suffix(?) non_matching(?)
+ {
+ $return =
+ {
+ precursor => $item[1][0],
+ initials_1 => $item[2],
+ surname_1 => $item[3],
+ suffix => $item[4][0],
+ non_matching => $item[5][0],
+ number => 1,
+ type => 'A_Smith'
+ }
+ }
+ |
+
+ given_name non_matching(?)
+ {
+ $return =
+ {
+ given_name_1 => $item[1],
+ non_matching => $item[2][0],
+ number => 1,
+ type => 'John'
+ }
+ }
+ |
+
+ non_matching(?)
+ {
+ $return =
+ {
+ non_matching => $item[1][0],
+ number => 0,
+ type => 'unknown'
+ }
+ }
+};
+
+#------------------------------------------------------------------------------
+# Individual components that a name can be composed from. Components are
+# expressed as literals or Perl regular expressions.
+
+my $precursors =
+q
+{
+ precursor :
+
+ /Estate Of (The Late )?/i |
+ /His (Excellency|Honou?r) /i |
+ /Her (Excellency|Honou?r) /i |
+ /The Right Honou?rable /i |
+ /The Honou?rable /i |
+ /Right Honou?rable /i |
+ /The Rt\.? Hon\.? /i |
+ /The Hon\.? /i |
+ /Rt\.? Hon\.? /i
+
+};
+
+my $titles =
+q{
+
+ title :
+
+ /Mr\.? /i |
+ /Ms\.? /i |
+ /M\/s\.? /i |
+ /Mrs\.? /i |
+ /Miss\.? /i |
+
+ /Dr\.? /i |
+ /Sir /i |
+ /Dame /i
+
+};
+
+my $extended_titles =
+q{
+ |
+ /Messrs /i | # plural or Mr
+ /Mme\.? /i | # Madame
+ /Mister /i |
+ /Mast(\.|er)? /i |
+ /Ms?gr\.? /i | # Monsignor
+ /Count /i |
+ /Countess /i |
+ /Duke /i |
+ /Duchess /i |
+ /Lord /i |
+ /Lady /i |
+ /Marquess i/ |
+
+ /Madam(e)? /i |
+
+ # Medical
+ /Doctor /i |
+ /Sister /i |
+ /Matron /i |
+
+ # Legal
+ /Judge /i |
+ /Justice /i |
+
+ # Police
+ /Det\.? /i |
+ /Insp\.? /i |
+
+ # Military
+ /Brig(adier)? /i |
+ /Captain /i |
+ /Capt\.? /i |
+ /Colonel /i |
+ /Col\.? /i |
+ /Commander /i |
+ /Commodore /i |
+ /Cdr\.? /i | # Commander, Commodore
+ /Field Marshall /i |
+ /Fl\.? Off\.? /i |
+ /Flight Officer /i |
+ /Flt Lt /i |
+ /Flight Lieutenant /i |
+ /Gen(\.|eral)? /i |
+ /Gen\. /i |
+ /Pte\. /i |
+ /Private /i |
+ /Sgt\.? /i |
+ /Sargent /i |
+ /Air Commander /i |
+ /Air Commodore /i |
+ /Air Marshall /i |
+ /Lieutenant Colonel /i |
+ /Lt\.? Col\.? /i |
+ /Lt\.? Gen\.? /i |
+ /Lt\.? Cdr\.? /i |
+ /Lieutenant /i |
+ /(Lt|Leut|Lieut)\.? /i |
+ /Major General /i |
+ /Maj\.? Gen\.?/i |
+ /Major /i |
+ /Maj\.? /i |
+ /Pilot Officer /i |
+
+
+ # Religious
+ /Rabbi /i |
+ /Bishop /i |
+ /Brother /i |
+ /Chaplain /i |
+ /Father /i |
+ /Pastor /i |
+ /Mother Superior /i |
+ /Mother /i |
+ /Most Rever[e|a]nd /i |
+ /Very Rever[e|a]nd /i |
+ /Rever[e|a]nd /i |
+ /Mt\.? Revd\.? /i |
+ /V\.? Revd?\.? /i |
+ /Revd?\.? /i |
+
+
+ # Other
+ /Prof(\.|essor)? /i |
+ /Ald(\.|erman)? /i
+};
+
+my $conjunction = q{ conjunction : /And |& /i };
+
+# Used in the John_A_Smith and J_Adam_Smith name types. Although this
+# duplicates $initials_1, it is needed because this type of initial must
+# always be one character long, regardless of the length of initials set
+# by the user in the 'new' method.
+my $single_initial = q{ single_initial: /[A-Z]\.? /i };
+
+# Define given name combinations, specifying the minimum number of letters.
+# The correct pair of rules is determined by the 'initials' key in the hash
+# passed to the 'new' method.
+
+# Examples are Jo, Jo-Anne, D'Artagnan, O'Shaugnessy La'Keishia, T-Bone
+my $given_name_min_2 =
+q{
+ given_name: /[A-Z]{2,} /i | /[A-Z]{2,}\-[A-Z]{2,} /i | /[A-Z]{1,}\'[A-Z]{2,} /i | /T\-Bone /i
+};
+
+# Joe, Jo-Anne ...
+my $given_name_min_3 =
+q{
+ given_name: /[A-Z]{3,} /i | /[A-Z]{2,}\-[A-Z]{2,} /i | /[A-Z]{1,}\'[A-Z]{2,} /i | /T\-Bone /i
+};
+
+
+# John ...
+my $given_name_min_4 =
+q{
+ given_name: /[A-Z]{4,} /i | /[A-Z]{2,}\-[A-Z]{2,} /i | /[A-Z]{1,}\'[A-Z]{3,} /i | /T\-Bone /i
+};
+
+# For use with John_Adam_Smith and John_A_Smith name types
+my $fixed_length_given_name =
+q{
+ given_name_min_2 : /[A-Z]{2,} /i | /[A-Z]{2,}\-[A-Z]{2,} /i | /[A-Z]{1,}\'[A-Z]{2,} /i
+};
+
+
+# Define initials combinations specifying the minimum and maximum letters.
+# Order from most complex to simplest, to avoid premature matching.
+
+# 'A' 'A.'
+my $initials_1 = q{ initials: /[A-Z]\.? /i };
+
+# 'A. B.' 'A.B.' 'AB' 'A B'
+
+my $initials_2 =
+q{
+ initials: /([A-Z]\. ){1,2}/i | /([A-Z]\.){1,2} /i | /([A-Z] ){1,2}/i | /([A-Z]){1,2} /i
+};
+
+# 'A. B. C. ' 'A.B.C' 'ABC' 'A B C'
+my $initials_3 =
+q{
+ initials: /([A-Z]\. ){1,3}/i | /([A-Z]\.){1,3} /i | /([A-Z] ){1,3}/i | /([A-Z]){1,3} /i
+};
+
+
+# Jo, Jo-Anne, La'Keishia, D'Artagnan, O'Shaugnessy
+my $middle_name =
+q{
+ middle_name:
+
+ # Dont grab surname prefix too early. For example, John Van Dam could be
+ # interpreted as middle name of Van and Surname of Dam. So exclude prefixs
+ # from middle names
+ ...!prefix /[A-Z]{2,} /i | /[A-Z]{2,}\-[A-Z]{2,} /i | /[A-Z]{1,}\'[A-Z]{2,} /i
+ {
+ $return = $item[2];
+ }
+};
+
+
+my $full_surname =
+q{
+ # Use look-ahead to avoid ambiguity between surname and suffix. For example,
+ # John Smith Snr, would detect Snr as the surname and Smith as the middle name
+ surname : ...!suffix sub_surname second_name(?)
+ {
+ if ( $item[2] and $item[3][0] )
+ {
+ $return = "$item[2]$item[3][0]";
+ }
+ else
+ {
+ $return = $item[2];
+ }
+ }
+
+ sub_surname : prefix(?) name
+ {
+ # To prevent warnings when compiling with the -w switch,
+ # do not return uninitialized variables.
+ if ( $item[1][0] )
+ {
+ $return = "$item[1][0]$item[2]";
+ }
+ else
+ {
+ $return = $item[2];
+ }
+ }
+
+ second_name : '-' sub_surname
+ {
+ if ( $item[1] and $item[2] )
+ {
+ $return = "$item[1]$item[2]";
+ }
+ }
+
+ # Patronymic, place name and other surname prefixes
+ prefix:
+
+ /[A|E]l /i | # Arabic, Greek,
+ /Ap /i | # Welsh
+ /Ben /i | # Hebrew
+
+ /Dell([a|e])? /i | # ITALIAN
+ /Dalle /i |
+ /D[a|e]ll'/i |
+ /Dela /i |
+ /Del /i |
+ /De (La |Los )?/i |
+ /D[a|i|u] /i |
+ /L[a|e|o] /i |
+
+ /[D|L|O]'/i | # Italian, Irish or French
+ /St\.? /i | # abbreviation for Saint
+ /San /i | # Spanish
+
+ /Den /i | # DUTCH
+ /Von (Der )?/i |
+ /Van (De(n|r)? )?/i
+
+ # space needed for any following text
+ name: /[A-Z]{2,} ?/i
+
+};
+
+my $suffix =
+q{
+ suffix:
+
+ # word boundaries are used to stop partial matches from surnames such as
+ # the "VI" in "VINCE"
+
+ /Esq(\.|uire)?\b ?/i |
+ /Sn?r\.?\b ?/i | # Senior
+ /Jn?r\.?\b ?/i | # Junior
+ /PhD\.?\b ?/i |
+ /MD\.?\b ?/i |
+ /LLB\.?\b ?/i |
+
+
+ /XI{1,3}\b ?/i | # 11th, 12th, 13th
+ /X\b ?/i | # 10th
+ /IV\b ?/i | # 4th
+ /VI{1,3}\b ?/i | # 6th, 7th, 8th
+ /V\b ?/i | # 5th
+ /IX\b ?/i | # 9th
+ /I{1,3}\b ?/i # 1st, 2nd, 3rd
+};
+
+# Two or more characters. This is set to 2 as a work around for the problem
+# with detecting suffixes like Snr. and Jnr. The dot here gets picked up
+# as non matching.
+
+my $non_matching = q{ non_matching: /.{2,}/ };
+
+
+#-------------------------------------------------------------------------------
+# Assemble correct combination for grammar tree.
+
+sub _create
+{
+ my $name = shift;
+
+ my $grammar = $rules_start;
+
+ if ( $name->{joint_names} )
+ {
+ $grammar .= $rules_joint_names;
+ }
+ $grammar .= $rules_single_names . $precursors . $titles;
+
+ if ( $name->{extended_titles} )
+ {
+ $grammar .= $extended_titles;
+ }
+
+ $grammar .= $conjunction;
+
+ $grammar .= $single_initial;
+
+ $name->{initials} > 3 and $name->{initials} = 3;
+ $name->{initials} < 1 and $name->{initials} = 1;
+
+ # Define limit of when a string is treated as an initial, or
+ # a given name. For example, if initials are set to 2, MR TO SMITH
+ # will have initials of T & O and no given name, but MR TOM SMITH will
+ # have no initials, and a given name of Tom.
+
+ if ( $name->{initials} == 1 )
+ {
+ $grammar .= $given_name_min_2 . $initials_1;
+ }
+ elsif ( $name->{initials} == 2 )
+ {
+ $grammar .= $given_name_min_3 . $initials_2;
+ }
+ elsif ( $name->{initials} == 3 )
+ {
+ $grammar .= $given_name_min_4 . $initials_3;
+ }
+
+ $grammar .= $fixed_length_given_name
+ . $middle_name
+ . $full_surname
+ . $suffix
+ . $non_matching
+ ;
+
+ return($grammar);
+}
+#-------------------------------------------------------------------------------
+1;
Modified: branches/upstream/liblingua-en-nameparse-perl/current/t/rules.t
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/liblingua-en-nameparse-perl/current/t/rules.t?rev=67639&op=diff
==============================================================================
--- branches/upstream/liblingua-en-nameparse-perl/current/t/rules.t (original)
+++ branches/upstream/liblingua-en-nameparse-perl/current/t/rules.t Fri Jan 21 22:16:37 2011
@@ -4,7 +4,7 @@
#------------------------------------------------------------------------------
use strict;
-use Test::Simple tests => 18;
+use Test::Simple tests => 20;
use Lingua::EN::NameParse;
my %args =
@@ -67,6 +67,11 @@
%props = $name->properties;
ok( $props{type} eq 'Mr_John_Adam_Smith', 'Mr_John_Adam_Smith format');
+$input = "MR J FITZGERALD KENNEDY";
+$name->parse($input);
+%props = $name->properties;
+ok( $props{type} eq 'Mr_J_Adam_Smith', 'Mr_J_Adam_Smith format');
+
$input = "MR JOHN F KENNEDY";
$name->parse($input);
%props = $name->properties;
@@ -107,4 +112,9 @@
%props = $name->properties;
ok( $props{type} eq 'A_Smith', 'A_Smith format');
+$input = "Voltaire";
+$name->parse($input);
+%props = $name->properties;
+ok( $props{type} eq 'John', 'John format');
+
More information about the Pkg-perl-cvs-commits
mailing list