r33050 - in /branches/upstream/libwordnet-querydata-perl/current: ChangeLog META.yml Makefile.PL QueryData.pm test.pl

thialme-guest at users.alioth.debian.org thialme-guest at users.alioth.debian.org
Sat Apr 11 17:48:27 UTC 2009


Author: thialme-guest
Date: Sat Apr 11 17:48:23 2009
New Revision: 33050

URL: http://svn.debian.org/wsvn/pkg-perl/?sc=1&rev=33050
Log:
[svn-upgrade] Integrating new upstream version, libwordnet-querydata-perl (1.48)

Modified:
    branches/upstream/libwordnet-querydata-perl/current/ChangeLog
    branches/upstream/libwordnet-querydata-perl/current/META.yml
    branches/upstream/libwordnet-querydata-perl/current/Makefile.PL
    branches/upstream/libwordnet-querydata-perl/current/QueryData.pm
    branches/upstream/libwordnet-querydata-perl/current/test.pl

Modified: branches/upstream/libwordnet-querydata-perl/current/ChangeLog
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libwordnet-querydata-perl/current/ChangeLog?rev=33050&op=diff
==============================================================================
--- branches/upstream/libwordnet-querydata-perl/current/ChangeLog (original)
+++ branches/upstream/libwordnet-querydata-perl/current/ChangeLog Sat Apr 11 17:48:23 2009
@@ -1,3 +1,29 @@
+2009-03-20  Jason Rennie
+
+	* release 1.48
+	* fix handling of WNSEARCHDIR
+
+2009-03-14  Danny Brian
+
+	* added the ability for new() to take a named param list
+	* added a new() param "noload" to not preload index files, but to
+	instead use Search::Dict lookups thereafter
+	* added _getIndexFH() and _getDataFH() to consolidate opening and
+	caching of filehandles
+	* added _dataLookup() to consolidate reads from data files
+	* added _indexLookup() to consolidate reads from index files
+	* added _indexOffsetLookup() to consolidate offset reads from index
+	files
+	* added _parseIndexLine() to consolidate the parsing of index file lines
+	* moved path data to new(), so that everything reads off of $self->{dir}
+	* removed the cntlinst path special-casing
+	* all file opens are deferred until necessary; for noload this means as
+	long as possible, for caching it means during the constructor (see
+	_get*FH() functions)
+	* documented "noload" option
+	* loop tests again for "noload"
+	* cleaned up formatting
+
 2008-01-08  Jason Rennie
 
 	* release 1.47

Modified: branches/upstream/libwordnet-querydata-perl/current/META.yml
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libwordnet-querydata-perl/current/META.yml?rev=33050&op=diff
==============================================================================
--- branches/upstream/libwordnet-querydata-perl/current/META.yml (original)
+++ branches/upstream/libwordnet-querydata-perl/current/META.yml Sat Apr 11 17:48:23 2009
@@ -1,8 +1,8 @@
 # http://module-build.sourceforge.net/META-spec.html
 #XXXXXXX This is a prototype!!!  It will change in the future!!! XXXXX#
 name:         WordNet-QueryData
-version:      1.47
-version_from: QueryData.pm
+version:      1.48
+version_from: 
 installdirs:  site
 requires:
 

Modified: branches/upstream/libwordnet-querydata-perl/current/Makefile.PL
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libwordnet-querydata-perl/current/Makefile.PL?rev=33050&op=diff
==============================================================================
--- branches/upstream/libwordnet-querydata-perl/current/Makefile.PL (original)
+++ branches/upstream/libwordnet-querydata-perl/current/Makefile.PL Sat Apr 11 17:48:23 2009
@@ -5,10 +5,11 @@
 # $wnHomePC).  These need to be synchronized.  I need to import those
 # variables from QueryData.pm.
 
-die "*** Please set the WNHOME environment variable to the location of your\n***   WordNet installation.  QueryData.pm will not work otherwise.\n*** Alternatively, you can make the installation in the default\n***   location, C:\\Program Files\\WordNet\\3.0 on Windows, or /usr/local/WordNet-3.0 on unix.\n" unless exists $ENV{WNHOME} or -d "C:\\Program Files\\WordNet\\3.0" or -d "/usr/local/WordNet-3.0";
+die "*** Please set the WNHOME environment variable to the location of your\n***   WordNet installation.  QueryData.pm will not work otherwise.\n*** Alternatively, you can make the installation in the default\n***   location, C:\\Program Files\\WordNet\\3.0 on Windows, or /usr/local/WordNet-3.0 on unix.\n" unless exists $ENV{WNHOME} or exists $ENV{WNSEARCHDIR} or -d "C:\\Program Files\\WordNet\\3.0" or -d "/usr/local/WordNet-3.0";
 
 WriteMakefile(
     'dist'      => { 'COMPRESS'  => 'gzip', 'SUFFIX'    => '.gz', },
     'NAME'	=> 'WordNet::QueryData',
-    'VERSION_FROM'   => 'QueryData.pm',
+#    'VERSION_FROM'   => 'QueryData.pm',
+    'VERSION'   => '1.48',
 );

Modified: branches/upstream/libwordnet-querydata-perl/current/QueryData.pm
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libwordnet-querydata-perl/current/QueryData.pm?rev=33050&op=diff
==============================================================================
--- branches/upstream/libwordnet-querydata-perl/current/QueryData.pm (original)
+++ branches/upstream/libwordnet-querydata-perl/current/QueryData.pm Sat Apr 11 17:48:23 2009
@@ -8,8 +8,6 @@
 
 # This module is free software; you can redistribute it and/or modify
 # it under the same terms as Perl itself.
-
-# $Id: QueryData.pm,v 1.47 2008/01/08 17:34:24 jrennie Exp $
 
 ####### manual page & loadIndex ##########
 
@@ -218,8 +216,6 @@
 # Invalid way of identifying version as of WordNet 3.0
 #sub version { my $self = shift; return $self->{version}; }
 
-# report WordNet data dir -- Sid (05/01/2003)
-sub dataPath { my $self = shift; return $self->{wnpath}; }
 
 sub getResetError#
 {
@@ -264,10 +260,13 @@
     my $old_separator = $/;
     $/ = "\n";
     
-    # Load morphology exclusion mapping
-    $self->loadExclusions ();
+    # Load morphology exclusion mapping, indexes, open data file handles
+    unless ($self->{noload}) {
+        $self->loadExclusions ();
+    }
     $self->loadIndex ();
     $self->openData ();
+
     $self->{errorString} = "";
     $self->{errorVal} = "";
     warn "Done.\n" if ($self->{verbose});
@@ -280,14 +279,37 @@
 {
     # First argument is class
     my $class = shift;
-    # Second is location of WordNet dictionary; Third is verbosity
     
     my $self = {};
     bless $self, $class;
-    $self->{dir} = shift if (defined(@_ > 0));
-    $self->{verbose} = @_ ? shift : 0;
+
+    # try to preserve old calling syntax, at least for dir
+    if (scalar @_ == 1) {
+        $self->{dir} = shift;
+    }
+    # but allow an extensible params syntax
+    else
+    {
+        my %params = @_;
+        $self->{dir}     = $params{dir} if $params{dir};
+        $self->{verbose} = $params{verbose} if $params{verbose};
+        $self->{noload}  = $params{noload} if $params{noload};
+    }
+        
     warn "Dir = ", $self->{dir}, "\n" if ($self->{verbose});
     warn "Verbose = ", $self->{verbose}, "\n" if ($self->{verbose});
+    warn "Noload = ", $self->{noload}, "\n" if ($self->{verbose});
+    
+    ## set $self->{dir} here and avoid the confusion later on, and the {wnpath} stuff.
+    ## also fix up path endings to have trailing slashes if they didn't come that way.
+    if (-e $wnPrefixUnix) {
+        $self->{dir} ||= $wnPrefixUnix;
+        $self->{dir} .= "/" if $self->{dir} !~ m|/$|;
+    } elsif (-e $wnPrefixPC) {
+        $self->{dir} ||= $wnPrefixPC;
+        $self->{dir} .= "\\" if $self->{dir} !~ m|\\$|;
+    }
+    
     $self->_initialize ();
     return $self;
 }
@@ -296,9 +318,9 @@
 sub DESTROY#
 {
     my $self = shift;
-    
+
     for (my $i=1; $i <= 4; $i++) {
-	undef $self->{data_fh}->[$i];
+        undef $self->{data_fh}->[$i];
     }
 }
 
@@ -311,23 +333,17 @@
 
     for (my $i=1; $i <= 4; $i++)
     {
-	my $fileUnix = defined($self->{dir}) ? $self->{dir}."/".$excFile[$i] : "$wnPrefixUnix/$excFile[$i]";
-	my $filePC = defined($self->{dir}) ? $self->{dir}."\\".$excFile[$i] : "$wnPrefixPC\\$excFile[$i]";
-	
-	my $fh = new FileHandle($fileUnix);
-	$fh = new FileHandle($filePC) if (!defined($fh));
-	die "Not able to open $fileUnix or $filePC: $!" if (!defined($fh));
-	
-	while (my $line = <$fh>)
-	{
-	    my ($exc, @word) = split(/\s+/, $line);
-	    next if (!@word);
-	    if (!defined($self->{morph_exc}->[$i]->{$exc})) {
-		@{$self->{morph_exc}->[$i]->{$exc}} = @word;
-	    } else {
-		push @{$self->{morph_exc}->[$i]->{$exc}}, @word;
-	    }
-	}
+        my $file = $self->{dir} . "$excFile[$i]";
+        my $fh = new FileHandle($file);
+        die "Not able to open $file: $!" if (!defined($fh));
+        
+        while (my $line = <$fh>)
+        {
+            my ($exc, @word) = split(/\s+/, $line);
+            next if (!@word);
+            $self->{morph_exc}->[$i]->{$exc} ||= [];
+            push @{$self->{morph_exc}->[$i]->{$exc}}, @word;
+        }
     }
 }
 
@@ -338,38 +354,28 @@
 
     for (my $i=1; $i <= 4; $i++)
     {
-	my $fileUnix = defined($self->{dir}) ? $self->{dir}."/".$indexFile[$i] : "$wnPrefixUnix/$indexFile[$i]";
-	my $filePC = defined($self->{dir}) ? $self->{dir}."\\".$indexFile[$i] : "$wnPrefixPC\\$indexFile[$i]";
-	
-	my $fh = new FileHandle($fileUnix);
-	
-	# Added Code -- WordNet data path being used -- Sid (05/01/2003)
-	if (defined $fh) { $self->{wnpath} = defined($self->{dir}) ? $self->{dir} : $wnPrefixUnix; }
-	else { $self->{wnpath} = defined($self->{dir}) ? $self->{dir} : $wnPrefixPC; }
-	
-	$fh = new FileHandle($filePC) if (!defined($fh));
-	die "Not able to open $fileUnix or $filePC: $!" if (!defined($fh));
-	
-	my $line;
-	while ($line = <$fh>) {
-	    $self->{version} = $1 if ($line =~ m/WordNet (\S+)/);
-	    last if ($line =~ m/^\S/);
-	}
-	while (1) {
-	    my ($lemma, $pos, $sense_cnt, $p_cnt);
-	    ($lemma, $pos, $sense_cnt, $p_cnt, $line) = split(/\s+/, $line, 5);
-	    for (my $i=0; $i < $p_cnt; ++$i) {
-		(undef, $line) = split(/\s+/, $line, 2);
-	    }
-	    my (undef, $tagsense_cnt, @offset) = split(/\s+/, $line);
-	    $self->{"index"}->[$pos_num{$pos}]->{$lemma} = pack "i*", @offset;
-	    $self->{"tagsense_cnt"}->[$pos_num{$pos}]->{$lemma} = $tagsense_cnt;
-	    $line = <$fh>;
-	    last if (!$line);
-	}
-    }
-    warn "\n*** Version 1.6 of the WordNet database is no longer being supported as\n*** of QueryData 1.27.  It may still work, but consider yourself warned.\n" if ($self->{version} eq "1.6");
-    warn "\n*** Version 1.7 of the WordNet database is no longer being supported as\n*** of QueryData 1.27.  It may still work, but consider yourself warned.\n" if ($self->{version} eq "1.7");
+        my $file = $self->{dir} . "$indexFile[$i]";
+        ${$self->{indexFilePaths}}[$i] = $file;
+        
+        if (!$self->{noload})
+        {
+            my $fh = $self->_getIndexFH($pos_num{$i});            
+            my $line;
+            while ($line = <$fh>) {
+                $self->{version} = $1 if ($line =~ m/WordNet (\S+)/);
+                last if ($line =~ m/^\S/);
+            }
+            while (1) {
+                my ($lemma, $pos, $offsets, $sense_cnt, $p_cnt) = $self->_parseIndexLine($line);
+                $self->{"index"}->[$pos_num{$pos}]->{$lemma} = $offsets;
+                $self->{"tagsense_cnt"}->[$pos_num{$pos}]->{$lemma} = $sense_cnt;
+                $line = <$fh>;
+                last if (!$line);
+            }
+            warn "\n*** Version 1.6 of the WordNet database is no longer being supported as\n*** of QueryData 1.27.  It may still work, but consider yourself warned.\n" if ($self->{version} eq "1.6");
+            warn "\n*** Version 1.7 of the WordNet database is no longer being supported as\n*** of QueryData 1.27.  It may still work, but consider yourself warned.\n" if ($self->{version} eq "1.7");
+        }
+    }
 }
 
 # Open data files and return file handles
@@ -380,13 +386,9 @@
 
     for (my $i=1; $i <= 4; $i++)
     {
-	my $fileUnix = defined($self->{dir}) ? $self->{dir}."/".$dataFile[$i] : "$wnPrefixUnix/$dataFile[$i]";
-	my $filePC = defined($self->{dir}) ? $self->{dir}."\\".$dataFile[$i] : "$wnPrefixPC\\$dataFile[$i]";
-	
-	my $fh = new FileHandle($fileUnix);
-	$fh = new FileHandle($filePC) if (!defined($fh));
-	die "Not able to open $fileUnix or $filePC: $!" if (!defined($fh));
-	$self->{data_fh}->[$i] = $fh;
+        my $file = $self->{dir} . "$dataFile[$i]";
+        ${$self->{dataFilePaths}}[$i] = $file;
+        $self->_getDataFH($i);
     }
 }
 
@@ -464,7 +466,17 @@
     my $lword = lower($word);
     warn "(_forms) WORD=$word POS=$pos\n" if ($self->{verbose});
     # if word is in morph exclusion table, return that entry
-    return ($word, @{$self->{morph_exc}->[$pos]->{$lword}}) if (defined ($self->{morph_exc}->[$pos]->{$lword}));	
+    if ($self->{noload}) {
+        # for noload, only load exclusions when needed; we do cache these
+        # though because the list is short (40k) and used on repeated recursive
+        # calls.
+        if (! exists $self->{morph_exc}) {
+            $self->loadExclusions();
+        }
+    }
+    if (defined ($self->{morph_exc}->[$pos]->{$lword})) {
+        return ($word, @{$self->{morph_exc}->[$pos]->{$lword}});
+    }
 
     my @token = split (/[ _]/, $word);
     # If there is only one token, process via rules of detachment
@@ -472,8 +484,9 @@
     # Otherwise, process each token individually, then string together colloc's
     my @forms;
     for (my $i=0; $i < @token; $i++) {
-	push @{$forms[$i]}, _forms ($self, $token[$i], $pos);
-    }
+	   push @{$forms[$i]}, _forms ($self, $token[$i], $pos);
+    }
+    
     # Generate all possible token sequences (collocations)
     my @rtn;
     my @index;
@@ -508,7 +521,7 @@
     die "(forms) Bad part-of-speech: pos=$pos" if (!defined($pos) or !defined($pos_num{$pos}));
     my @rtn = _forms ($self, $word, $pos_num{$pos});
     for (my $i=0; $i < @rtn; ++$i) {
-	$rtn[$i] .= "\#$pos";
+	   $rtn[$i] .= "\#$pos";
     }
     return @rtn;
 }
@@ -527,7 +540,7 @@
     (undef, undef, undef, $w_cnt, $line) = split (/\s+/, $line, 5);
     $w_cnt = hex ($w_cnt);
     for (my $i=0; $i < $w_cnt; ++$i) {
-	(undef, undef, $line) = split(/\s+/, $line, 3);
+	   (undef, undef, $line) = split(/\s+/, $line, 3);
     }
     my $p_cnt;
     ($p_cnt, $line) = split(/\s+/, $line, 2);
@@ -555,18 +568,18 @@
     $w_cnt = hex ($w_cnt);
     my @word;
     for (my $i=0; $i < $w_cnt; ++$i) {
-	($word[$i], undef, $line) = split(/\s+/, $line, 3);
+	   ($word[$i], undef, $line) = split(/\s+/, $line, 3);
     }
     my $p_cnt;
     ($p_cnt, $line) = split(/\s+/, $line, 2);
     for (my $i=0; $i < $p_cnt; ++$i) {
-	my ($sym, $offset, $pos, $st);
-	# $st "source/target" is 2-part hexadecimal
-	($sym, $offset, $pos, $st, $line) = split(/\s+/, $line, 5);
-	next if (!$st);
-	my ($src, $tgt) = ($st =~ m/([0-9a-f]{2})([0-9a-f]{2})/);
-	push @rtn, $self->getWord($offset, $pos, hex($tgt))
-	    if (defined($ptr->{$sym}) and ($word[hex($src)-1] =~ m/$lword/i));
+        my ($sym, $offset, $pos, $st);
+        # $st "source/target" is 2-part hexadecimal
+        ($sym, $offset, $pos, $st, $line) = split(/\s+/, $line, 5);
+        next if (!$st);
+        my ($src, $tgt) = ($st =~ m/([0-9a-f]{2})([0-9a-f]{2})/);
+        push @rtn, $self->getWord($offset, $pos, hex($tgt))
+            if (defined($ptr->{$sym}) and ($word[hex($src)-1] =~ m/$lword/i));
     }
     return @rtn;
 }
@@ -578,26 +591,24 @@
     warn "(getAllSenses) offset=$offset pos=$pos\n" if ($self->{verbose});
 
     my @rtn;
-    my $fh = $self->{data_fh}->[$pos_num{$pos}];
-    seek $fh, $offset, 0;
-    my $line = <$fh>;
+    my $line = $self->_dataLookup($pos, $offset);
     my $w_cnt;
     (undef, undef, undef, $w_cnt, $line) = split(/\s+/, $line, 5);
     $w_cnt = hex ($w_cnt);
     my @words;
     for (my $i=0; $i < $w_cnt; ++$i) {
-	($words[$i], undef, $line) = split(/\s+/, $line, 3);
+	   ($words[$i], undef, $line) = split(/\s+/, $line, 3);
     }
     foreach my $word (@words) {
-	$word = delMarker($word);
-	my $lword = lower ($word);
-	my @offArr = (unpack "i*", $self->{"index"}->[$pos_num{$pos}]->{$lword});
-	for (my $i=0; $i < @offArr; $i++) {
-	    if ($offArr[$i] == $offset) {
-		push @rtn, "$word\#$pos\#".($i+1);
-		last;
-	    }
-	}
+        $word = delMarker($word);
+        my $lword = lower ($word);
+        my @offArr = $self->_indexOffsetLookup($lword, $pos);
+        for (my $i=0; $i < @offArr; $i++) {
+            if ($offArr[$i] == $offset) {
+                push @rtn, "$word\#$pos\#".($i+1);
+                last;
+            }
+        }
     }
     return @rtn;
 }
@@ -608,18 +619,98 @@
     my ($self, $offset, $pos) = @_;
     warn "(getSense) offset=$offset pos=$pos\n" if ($self->{verbose});
     
-    my $fh = $self->{data_fh}->[$pos_num{$pos}];
-    seek $fh, $offset, 0;
-    my $line = <$fh>;
+    my $line = $self->_dataLookup($pos, $offset);
+    
     my ($lexfn,$word);
     (undef, $lexfn, undef, undef, $word, $line) = split (/\s+/, $line, 6);
     $word = delMarker($word);
     my $lword = lower($word);
-    my @offArr = (unpack "i*", $self->{"index"}->[$pos_num{$pos}]->{$lword});
+    
+    my @offArr = $self->_indexOffsetLookup($word, $pos);
     for (my $i=0; $i < @offArr; $i++) {
-	return "$word\#$pos\#".($i+1) if ($offArr[$i] == $offset);
+	   return "$word\#$pos\#".($i+1) if ($offArr[$i] == $offset);
     }
     die "(getSense) Internal error: offset=$offset pos=$pos";
+}
+
+sub _getIndexFH {
+    my $self = shift;
+    my $pos = shift;
+    my $fh = $self->{index_fh}->[$pos_num{$pos}] ||= 
+                   FileHandle->new ( ${$self->{indexFilePaths}}[$pos_num{$pos}] );
+    unless ($fh) {
+        die "Couldn't open index file: " . ${$self->{indexFilePaths}}[$pos_num{$pos}];
+    }
+    return $fh;
+}
+
+sub _getDataFH {
+    my $self = shift;
+    my $pos = shift;
+    my $fh = $self->{data_fh}->[$pos_num{$pos}] ||= 
+                   FileHandle->new ( ${$self->{dataFilePaths}}[$pos_num{$pos}] );
+    unless ($fh) {
+        die "Couldn't open data file: " . ${$self->{indexFilePaths}}[$pos_num{$pos}];
+    }
+    return $fh;
+}
+
+## returns the offset(s) given word, pos, and sense
+sub _indexOffsetLookup {
+    my $self = shift;
+    my ($word, $pos, $sense) = @_;
+    my $lword = lower ($word);
+    # print STDERR "(_indexOffsetLookup) $word $pos $sense\n";
+    if ($sense) {
+        my $offset;
+        if ($self->{noload}) {
+            my $line = $self->_indexLookup($pos, $lword);
+            my ($lemma, $pos, $offsets, $sense_cnt, $p_cnt) = $self->_parseIndexLine($line);
+            $offset = $$offsets[$sense - 1] if ($lemma eq $lword); ## remember that look always succeeds
+        }
+        else
+        {
+            $offset = (unpack "i*", $self->{"index"}->[$pos_num{$pos}]->{$lword})[$sense-1]
+                if (exists $self->{"index"}->[$pos_num{$pos}]->{$lword});
+        }
+        return $offset;
+    }
+    else
+    {
+        my @offsets = ();
+        if ($self->{noload}) {
+            my $line = $self->_indexLookup($pos, $lword);
+            my ($lemma, $pos, $offsets, $sense_cnt, $p_cnt) = $self->_parseIndexLine($line);
+            @offsets = @$offsets if ($lemma eq $lword);
+        }
+        else
+        {
+            if (defined($self->{"index"}->[$pos_num{$pos}]->{$lword})) {
+                @offsets = (unpack "i*", $self->{"index"}->[$pos_num{$pos}]->{$lword});
+            }
+        }
+        return @offsets;
+    }
+}
+
+## returns line from index file
+sub _indexLookup {
+    my $self = shift;
+    my ($pos, $word) = @_;
+    my $fh = $self->_getIndexFH($pos);
+    look($fh, $word, 0);
+    my $line  = <$fh>;
+    return $line;
+}
+
+## returns line from data file
+sub _dataLookup {
+    my $self = shift;
+    my ($pos, $offset) = @_;
+    my $fh = $self->_getDataFH($pos);
+    seek($fh, $offset, 0);
+    my $line  = <$fh>;
+    return $line;
 }
 
 # returns word#pos#sense for given offset, pos and number
@@ -628,7 +719,7 @@
     my ($self, $offset, $pos, $num) = @_;
     warn "(getWord) offset=$offset pos=$pos num=$num" if ($self->{verbose});
     
-    my $fh = $self->{data_fh}->[$pos_num{$pos}];
+    my $fh = $self->_getDataFH($pos);
     seek $fh, $offset, 0;
     my $line = <$fh>;
     my $w_cnt;
@@ -636,15 +727,15 @@
     $w_cnt = hex ($w_cnt);
     my $word;
     for (my $i=0; $i < $w_cnt; ++$i) {
-	($word, undef, $line) = split(/\s+/, $line, 3);
-	$word = delMarker($word);
-	# (mich0212) return "$word\#$pos" if ($i+1 == $num);
-	last if ($i+1 == $num);
+	   ($word, undef, $line) = split(/\s+/, $line, 3);
+	   $word = delMarker($word);
+	   # (mich0212) return "$word\#$pos" if ($i+1 == $num);
+	   last if ($i+1 == $num);
     }
     my $lword = lower($word);
-    my @offArr = (unpack "i*", $self->{"index"}->[$pos_num{$pos}]->{$lword});
+    my @offArr = $self->_indexOffsetLookup($lword, $pos);;
     for (my $i=0; $i < @offArr; $i++) {
-	return "$word\#$pos\#".($i+1) if ($offArr[$i] == $offset);
+	   return "$word\#$pos\#".($i+1) if ($offArr[$i] == $offset);
     }
     die "(getWord) Bad number: offset=$offset pos=$pos num=$num";
 }
@@ -687,15 +778,13 @@
    }
 
    my $lword = lower($word);
-   if (exists($self->{'index'})
-       && exists($self->{"index"}->[$pos_num{$pos}]) 
-       && exists($self->{"index"}->[$pos_num{$pos}]->{$lword})) {
-       return (unpack "i*", $self->{"index"}->[$pos_num{$pos}]->{$lword})[$sense-1];
-   } else {
-       $self->{errorVal} = 2;
-       $self->{errorString} = "Index not initialized properly or `$word' not found in index";
-       return;
-   }
+   my $res = $self->_indexOffsetLookup($lword, $pos, $sense);
+
+   return $res if $res;
+
+   $self->{errorVal} = 2;
+   $self->{errorString} = "Index not initialized properly or `$word' not found in index";
+   return;
 }
 
 # Return the lexname for the type (3) query string
@@ -706,9 +795,7 @@
     my $offset = $self->offset($string);
     my ($word, $pos, $sense) = $string =~ /^([^\#]+)(?:\#([^\#]+)(?:\#(\d+))?)?$/; 
     warn "(lexname) word=$word pos=$pos sense=$sense offset=$offset\n" if ($self->{verbose});
-    my $fh = $self->{data_fh}->[$pos_num{$pos}];
-    seek $fh, $offset, 0;
-    my $line = <$fh>;
+    my $line = $self->_dataLookup($pos, $offset);
     my (undef, $lexfn, undef) = split (/\s+/, $line, 3);
     return $lexnames{$lexfn};
 }
@@ -721,27 +808,26 @@
     my ($word, $pos, $sense) = $string =~ /^([^\#]+)\#([^\#]+)\#([^\#]+)$/;
 
     unless (defined $word and defined $pos and defined $sense) {
-	croak "(frequency) Query string is not a valid type (3) string";
+	   croak "(frequency) Query string is not a valid type (3) string";
     }
 
     warn "(frequency) word=$word pos=$pos sense=$sense\n" if $self->{verbose};
 
-    my $dp = $self->dataPath;
-    my $cntfile = File::Spec->catfile ($dp, 'cntlist.rev');
+    my $cntfile = File::Spec->catfile ( $self->{dir} . 'cntlist.rev');
     open CFH, "<$cntfile" or die "Cannot open $cntfile: $!";
     
     # look() seek()s to the right position in the file
     my $position = Search::Dict::look (*CFH, "$word\%", 0, 0);
     while (<CFH>) {
-	if (/^$word\%(\d+):[^ ]+ (\d+) (\d+)/) {
-	    next unless $pos_map{$1} eq $pos;
-	    next unless $2 eq $sense;
-	    close CFH;
-	    return $3;
-	}
-	else {
-	    last;
-	}
+        if (/^$word\%(\d+):[^ ]+ (\d+) (\d+)/) {
+            next unless $pos_map{$1} eq $pos;
+            next unless $2 eq $sense;
+            close CFH;
+            return $3;
+        }
+        else {
+            last;
+        }
     }
     close CFH;
     return 0;
@@ -751,57 +837,64 @@
 {
     my $self = shift;
     my $string = shift;
+    
+    warn "(querySense) STRING=$string" if $self->{verbose};
     
     # Ensure that input record separator is "\n"
     my $old_separator = $/;
     $/ = "\n";
     my @rtn;
-    
+        
     # get word, pos, and sense from second argument:
     my ($word, $pos, $sense) = $string =~ /^([^\#]+)(?:\#([^\#]+)(?:\#(\d+))?)?$/; 
     die "(querySense) Bad query string: $string" if (!defined($word));
     my $lword = lower ($word);
-    die "(querySense) Bad part-of-speech: $string"
-	if (defined($pos) && !$pos_num{$pos});
+    die "(querySense) Bad part-of-speech: $string" if (defined($pos) && !$pos_num{$pos});
     
     if (defined($sense)) {
-	my $rel = shift;
-	warn "(querySense) WORD=$word POS=$pos SENSE=$sense RELATION=$rel\n" if ($self->{verbose});
-	die "(querySense) Relation required: $string" if (!defined($rel));
-	die "(querySense) Bad relation: $rel" 
-	    if (!defined($relNameSym{$rel}) and !defined($relSymName{$rel})
-		 and ($rel ne "glos") and ($rel ne "syns"));
-	$rel = $relSymName{$rel} if (defined($relSymName{$rel}));
-	
-	my $fh = $self->{data_fh}->[$pos_num{$pos}];
-	my $offset = (unpack "i*", $self->{"index"}->[$pos_num{$pos}]->{$lword})[$sense-1];
-	seek $fh, $offset, 0;
-	my $line = <$fh>;
-	
-	if ($rel eq "glos") {
-	    $line =~ m/.*\|\s*(.*)$/;
-	    $rtn[0] = $1;
-	} elsif ($rel eq "syns") {
-	    @rtn = $self->getAllSenses ($offset, $pos);
-	} else {
-	    @rtn = $self->getSensePointers($line, $relNameSym{$rel});
-	}
-    } elsif (defined($pos)) {
-	warn "(querySense) WORD=$word POS=$pos\n" if ($self->{verbose});
-	if (defined($self->{"index"}->[$pos_num{$pos}]->{$lword})) {
-	    my @offset = unpack "i*", $self->{"index"}->[$pos_num{$pos}]->{$lword};
-	    $word = underscore(delMarker($word));
-	    for (my $i=0; $i < @offset; $i++) {
-		push @rtn, "$word\#$pos\#".($i+1);
-	    }
-	}
-    } elsif (defined($word)) {
-	print STDERR "(querySense) WORD=$word\n" if ($self->{verbose});
-	$word = underscore(delMarker($word));
-	for (my $i=1; $i <= 4; $i++) {
-	    push @rtn, "$word\#".$pos_map{$i}
-	    if ($self->{"index"}->[$i]->{$lword});
-	}
+        my $rel = shift;
+        warn "(querySense) WORD=$word POS=$pos SENSE=$sense RELATION=$rel\n" if ($self->{verbose});
+        die "(querySense) Relation required: $string" if (!defined($rel));
+        die "(querySense) Bad relation: $rel" 
+            if (!defined($relNameSym{$rel}) and !defined($relSymName{$rel})
+             and ($rel ne "glos") and ($rel ne "syns"));
+        $rel = $relSymName{$rel} if (defined($relSymName{$rel}));
+        
+        my $offset = $self->_indexOffsetLookup($lword, $pos, $sense);
+        my $line = $self->_dataLookup($pos, $offset);
+        
+        if (!$line) {
+            die "Line not found for offset $offset!";
+        }
+        
+        if ($rel eq "glos") {
+            $line =~ m/.*\|\s*(.*)$/;
+            $rtn[0] = $1;
+        } elsif ($rel eq "syns") {
+            @rtn = $self->getAllSenses ($offset, $pos);
+        } else {
+            @rtn = $self->getSensePointers($line, $relNameSym{$rel});
+        }
+    }
+    elsif (defined($pos)) {
+        warn "(querySense) WORD=$word POS=$pos\n" if ($self->{verbose});
+        my @offsets = $self->_indexOffsetLookup($lword, $pos);
+        $word = underscore(delMarker($word));
+        for (my $i=0; $i < @offsets; $i++) {
+            push @rtn, "$word\#$pos\#".($i+1);
+        }
+    }
+    elsif (defined($word)) {
+        warn "(querySense) WORD=$word\n" if ($self->{verbose});
+        $word = underscore(delMarker($word));
+        for (my $i=1; $i <= 4; $i++) {
+            my ($offset) = $self->_indexOffsetLookup($lword, $i);
+            push @rtn, "$word\#".$pos_map{$i} if $offset;
+        }
+    }
+    else
+    {
+        warn "(querySense) no results being returned" if $self->{verbose};
     }
     # Return setting of input record separator
     $/ = $old_separator;
@@ -824,42 +917,39 @@
     my ($word, $pos, $sense) = $string =~ /^([^\#]+)(?:\#([^\#]+)(?:\#(\d+))?)?$/; 
     die "(queryWord) Bad query string: $string" if (!defined($word));
     my $lword = lower ($word);
-    die "(queryWord) Bad part-of-speech: $string"
-	if (defined($pos) && !$pos_num{$pos});
+    die "(queryWord) Bad part-of-speech: $string" if (defined($pos) && !$pos_num{$pos});
     
     if (defined($sense)) {
-	my $rel = shift;
-	warn "(queryWord) WORD=$word POS=$pos SENSE=$sense RELATION=$rel\n" 
-	    if ($self->{verbose});
-	die "(queryWord) Relation required: $string" if (!defined($rel));
-	die "(queryWord) Bad relation: $rel"
-	    if ((!defined($relNameSym{$rel}) and !defined($relSymName{$rel})));
-	$rel = $relSymName{$rel} if (defined($relSymName{$rel}));
-	
-	my $fh = $self->{data_fh}->[$pos_num{$pos}];
-	my $offset = (unpack "i*", 
-		      $self->{"index"}->[$pos_num{$pos}]->{$lword})[$sense-1];
-	seek $fh, $offset, 0;
-	my $line = <$fh>;
-	push @rtn, $self->getWordPointers($line, $relNameSym{$rel}, $word);
-    } elsif (defined($pos)) {
-	warn "(queryWord) WORD=$word POS=$pos\n" if ($self->{verbose});
-	if (defined($self->{"index"}->[$pos_num{$pos}]->{$lword})) {
-	    my @offset = unpack "i*", 
-	    $self->{"index"}->[$pos_num{$pos}]->{$lword};
-	    $word = underscore(delMarker($word));
-	    for (my $i=0; $i < @offset; $i++) {
-		push @rtn, "$word\#$pos\#".($i+1);
-	    }
-	}
-    } else {
-	print STDERR "(queryWord) WORD=$word\n" if ($self->{verbose});
-	
-	$word = underscore(delMarker($word));
-	for (my $i=1; $i <= 4; $i++) {
-	    push @rtn, "$word\#".$pos_map{$i}
-	    if ($self->{"index"}->[$i]->{$lword});
-	}
+        my $rel = shift;
+        warn "(queryWord) WORD=$word POS=$pos SENSE=$sense RELATION=$rel\n" 
+            if ($self->{verbose});
+        die "(queryWord) Relation required: $string" if (!defined($rel));
+        die "(queryWord) Bad relation: $rel"
+            if ((!defined($relNameSym{$rel}) and !defined($relSymName{$rel})));
+        $rel = $relSymName{$rel} if (defined($relSymName{$rel}));
+        
+        my $offset = $self->_indexOffsetLookup($lword, $pos, $sense);
+        my $line = $self->_dataLookup($pos, $offset);
+	    push @rtn, $self->getWordPointers($line, $relNameSym{$rel}, $word);
+    }
+    elsif (defined($pos))
+    {
+        warn "(queryWord) WORD=$word POS=$pos\n" if ($self->{verbose});
+        my @offsets = $self->_indexOffsetLookup($lword, $pos);
+        $word = underscore(delMarker($word));
+        for (my $i=0; $i < @offsets; $i++) {
+            push @rtn, "$word\#$pos\#".($i+1);
+        }
+    }
+    else
+    {
+        print STDERR "(queryWord) WORD=$word\n" if ($self->{verbose});
+        
+        $word = underscore(delMarker($word));
+        for (my $i=1; $i <= 4; $i++) {
+            my $offset = $self->_indexOffsetLookup($lword, $i);
+            push @rtn, "$word\#".$pos_map{$i} if $offset;
+        }
     }
     # Return setting of input record separator
     $/ = $old_separator;
@@ -876,12 +966,12 @@
     my ($word, $pos, $sense) = $string =~ /^([^\#]+)(?:\#([^\#]+)(?:\#(\d+))?)?$/; 
     warn "(valid_forms) Sense number ignored: $string\n" if (defined $sense);
     if (!defined($pos)) {
-	my @rtn;
-	push @rtn, $self->validForms($string."#n");
-	push @rtn, $self->validForms($string."#v");
-	push @rtn, $self->validForms($string."#a");
-	push @rtn, $self->validForms($string."#r");
-	return @rtn;
+        my @rtn;
+        push @rtn, $self->validForms($string."#n");
+        push @rtn, $self->validForms($string."#v");
+        push @rtn, $self->validForms($string."#a");
+        push @rtn, $self->validForms($string."#r");
+        return @rtn;
     }
     
     die "(valid_forms) Invalid part-of-speech: $pos" if (!defined($pos_map{$pos}));
@@ -891,11 +981,43 @@
     return @valid_forms;
 }
 
+sub _parseIndexLine {
+    my $self = shift;
+    my $line = shift;
+    my ($lemma, $pos, $sense_cnt, $p_cnt, $rline) = split(/\s+/, $line, 5);
+    for (my $i=0; $i < $p_cnt; ++$i) {
+        (undef, $rline) = split(/\s+/, $rline, 2);
+    }
+    my (undef, $tagsense_cnt, @offsets) = split(/\s+/, $rline);
+    ## return offset list packed if caching, otherwise just use an array ref
+    if ($self->{noload}) {
+        return ($lemma, $pos, \@offsets, $tagsense_cnt);
+    }
+    else
+    {
+        return ($lemma, $pos, (pack "i*", @offsets), $tagsense_cnt);
+    }
+}
+
 # List all words in WordNet database of a particular part of speech
 sub listAllWords#
 {
     my ($self, $pos) = @_;
-    return keys(%{$self->{"index"}->[$pos_num{$pos}]});
+    if ($self->{noload}) {
+        my @words;
+        my $fh = $self->_getIndexFH($pos);
+        seek($fh, 0, 0);
+        for my $line (<$fh>) {
+            next if ($line =~ m/^\s/);
+            my ($lemma, @rest) = $self->_parseIndexLine($line);
+            push @words, $lemma;
+        }
+        return @words;
+    }
+    else
+    {
+        return keys(%{$self->{"index"}->[$pos_num{$pos}]});
+    }
 }
 
 # Return length of (some) path to root, plus one (root is considered
@@ -907,7 +1029,7 @@
     
     for ($level=0; $word; ++$level)
     {
-	($word) = $self->querySense ($word, "hype");
+	   ($word) = $self->querySense ($word, "hype");
     }
     return $level;
 }
@@ -918,10 +1040,28 @@
     # get word, pos, and sense from second argument:
     my ($word, $pos, $sense) = $string =~ /^([^\#]+)(?:\#([^\#]+)(?:\#(\d+))?)?$/; 
     warn "(tagSenseCnt) Ignorning sense: $string" if (defined($sense));
-    die "Word and part-of-speech required word=$word pos=$pos"
-	if (!defined($word) or !defined($pos) or !defined($pos_num{$pos}));
+    die "Word and part-of-speech required word=$word pos=$pos" if (!defined($word) or !defined($pos) or !defined($pos_num{$pos}));
     my $lword = lower($word);
-    return $self->{"tagsense_cnt"}->[$pos_num{$pos}]->{$lword};
+    return $self->_getTagSenseCnt($lword, $pos);
+}
+
+sub dataPath {
+    my $self = shift;
+    return $self->{dir};
+}
+
+sub _getTagSenseCnt {
+    my $self = shift;
+    my ($lword, $pos) = @_;
+    if ($self->{noload}) {
+        my $line = $self->_indexLookup($pos, $lword);
+        my ($lemma, $pos, $offsets, $tagsense_cnt) = $self->_parseIndexLine($line);
+        return $tagsense_cnt if ($lemma eq $lword);
+    }
+    else
+    {
+        return $self->{"tagsense_cnt"}->[$pos_num{$pos}]->{$lword};
+    }
 }
 
 # module must return true
@@ -940,7 +1080,7 @@
 
   use WordNet::QueryData;
 
-  my $wn = WordNet::QueryData->new;
+  my $wn = WordNet::QueryData->new( noload => 1);
 
   print "Synset: ", join(", ", $wn->querySense("cat#n#7", "syns")), "\n";
   print "Hyponyms: ", join(", ", $wn->querySense("cat#n#1", "hypo")), "\n";
@@ -957,10 +1097,8 @@
 (http://www.cogsci.princeton.edu/~wn/).  It allows the user direct
 access to the full WordNet semantic lexicon.  All parts of speech are
 supported and access is generally very efficient because the index and
-morphical exclusion tables are loaded at initialization.  This
-initialization step is slow (appx. 10-15 seconds), but queries are
-very fast thereafter---thousands of queries can be completed every
-second.
+morphological exclusion tables are loaded at initialization. The module can
+optionally be used to load the indexes into memory for extra-fast lookups.
 
 =head1 USAGE
 
@@ -985,9 +1123,47 @@
 
   my $wn = WordNet::QueryData->new("/usr/local/wordnet/dict");
 
-When calling "new" in this fashion, you can give it a second verbosity
-argument; a true value will have QueryData print debugging
-information.
+You can instead call the constructor with a hash of params, as in:
+
+  my $wn = WordNet::QueryData->new(
+      dir => "/usr/local/wordnet/dict",
+      verbose => 0,
+      noload => 1
+  );
+  
+When calling "new" in this fashion, two additional arguments are 
+supported; "verbose" will output debugging information, and "noload"
+will cause the object to *not* load the indexes at startup.
+
+=head2 CACHING VERSUS NOLOAD
+
+The "noload" option results in data being retrieved using a 
+dictionary lookup rather than caching the indexes in RAM.
+This method yields an immediate startup time but *slightly* (though
+less than you might think) longer lookup time. For the curious, here
+are some profile data for each method on a dual-core Intel Mac, in seconds,
+averaged over 10000 iterations:
+
+=head3 Caching versus noload times in seconds
+
+                                          noload => 1  noload => 0
+------------------------------------------------------------------
+new()                                     0.00001      2.55
+queryWord("descending")                   0.0009       0.0001
+querySense("sunset#n#1", "hype")          0.0007       0.0001
+validForms ("lay down#2")                 0.0004       0.0001
+
+Obviously the new() comparison is not very useful, because nothing is 
+happening with the constructor in the case of noload => 1. Similarly,
+lookups with caching are basically just hash lookups, and therefore very
+fast. The lookup times for noload => 1 illustrate the tradeoff between 
+caching at new() time and using dictionary lookups.
+
+Because of the lookup speed increase when noload => 0, many users will
+find it useful to set noload to 1 during development cycles, and to 0
+when RAM is less of a concern than speed. The bottom line is that 
+noload => 1 saves you over 2 seconds of startup time, and costs you about 
+0.0005 seconds per lookup.
 
 =head2 QUERYING THE DATABASE
 

Modified: branches/upstream/libwordnet-querydata-perl/current/test.pl
URL: http://svn.debian.org/wsvn/pkg-perl/branches/upstream/libwordnet-querydata-perl/current/test.pl?rev=33050&op=diff
==============================================================================
--- branches/upstream/libwordnet-querydata-perl/current/test.pl (original)
+++ branches/upstream/libwordnet-querydata-perl/current/test.pl Sat Apr 11 17:48:23 2009
@@ -17,10 +17,20 @@
 # (correspondingly "not ok 13") depending on the success of chunk 13
 # of the test code):
 
-print "Loading index files.  This may take a while...\n";
-# Uses $WNHOME environment variable
-my $wn = WordNet::QueryData->new;
-#my $wn = WordNet::QueryData->new("/scratch/jrennie/WordNet-2.1/dict");
+# run tests once for index/excl/data loading, and again without
+for my $noload (1,0) {
+
+my $wn;
+if ($noload == 0) {
+    print "Loading index files.  This may take a while...\n";
+    # Uses $WNHOME environment variable
+    $wn = WordNet::QueryData->new( verbose => 0 );
+    #my $wn = WordNet::QueryData->new("/scratch/jrennie/WordNet-2.1/dict");
+}
+else
+{
+    $wn = WordNet::QueryData->new( noload => 1 );
+}
 
 #my $ver = $wn->version();
 #print "Found WordNet database version $ver\n";
@@ -39,6 +49,7 @@
 
 scalar $wn->querySense ("rabbit") == 2
     ? print "ok ", $i++, "\n" : print "not ok ", $i++, "\n";
+
 scalar $wn->querySense ("rabbit#n") == 3
     ? print "ok ", $i++, "\n" : print "not ok ", $i++, "\n";
 scalar $wn->querySense ("rabbit#n#1", "hypo") == 7
@@ -80,7 +91,8 @@
 ($wn->querySense("World_War_II#n#1", "mero"))[1] eq "Battle_of_Britain#n#1"
     ? print "ok ", $i++, "\n" : print "not ok ", $i++, "\n";
 # test tagSenseCnt function
-($wn->tagSenseCnt("academy#n") == 2)
+
+$wn->tagSenseCnt("academy#n") == 2
     ? print "ok ", $i++, "\n" : print "not ok ", $i++, "\n";
 
 # test "ies" -> "y" rule of detachment
@@ -152,9 +164,12 @@
     ? print "ok ", $i++, "\n" : print "not ok ", $i++, "\n";
 scalar $wn->offset("0#n#1") == 13742358
     ? print "ok ", $i++, "\n" : print "not ok ", $i++, "\n";
+
 scalar $wn->listAllWords("noun") == 117798
     ? print "ok ", $i++, "\n" : print "not ok ", $i++, "\n";
 $wn->offset("child#n#1") == 9917593
     ? print "ok ", $i++, "\n" : print "not ok ", $i++, "\n";
 my ($foo) = $wn->querySense ("cat#n#1", "glos");
 ($foo eq "feline mammal usually having thick soft fur and no ability to roar: domestic cats; wildcats  ") ? print "ok ", $i++, "\n" : print "not ok ", $i++, "\n";
+
+}




More information about the Pkg-perl-cvs-commits mailing list