[nltk] 01/03: Imported Upstream version 3.0.3
Daniel Stender
danstender-guest at moszumanska.debian.org
Mon Jun 15 04:54:50 UTC 2015
This is an automated email from the git hooks/post-receive script.
danstender-guest pushed a commit to branch master
in repository nltk.
commit c87559e2e2967a3188fc396fae6ffb11634a50a4
Author: Daniel Stender <debian at danielstender.com>
Date: Mon Jun 15 06:35:32 2015 +0200
Imported Upstream version 3.0.3
---
PKG-INFO | 2 +-
nltk.egg-info/PKG-INFO | 2 +-
nltk.egg-info/SOURCES.txt | 6 +
nltk/VERSION | 2 +-
nltk/__init__.py | 11 +
nltk/align/api.py | 52 ++
nltk/align/bleu_score.py | 50 +-
nltk/align/ibm1.py | 2 +-
nltk/align/ibm2.py | 2 +-
nltk/app/__init__.py | 4 +-
nltk/app/wordfreq_app.py | 6 +-
nltk/classify/__init__.py | 1 +
nltk/classify/naivebayes.py | 6 +-
nltk/classify/tadm.py | 2 +-
nltk/classify/textcat.py | 193 ++++++
nltk/corpus/__init__.py | 7 +
nltk/corpus/reader/__init__.py | 3 +-
nltk/corpus/reader/crubadan.py | 116 ++++
nltk/corpus/reader/lin.py | 2 +-
nltk/data.py | 2 +-
nltk/downloader.py | 5 +-
nltk/draw/dispersion.py | 6 +-
nltk/draw/util.py | 2 +-
nltk/internals.py | 21 +-
nltk/metrics/agreement.py | 4 +-
nltk/parse/dependencygraph.py | 63 +-
nltk/parse/malt.py | 2 +-
nltk/parse/nonprojectivedependencyparser.py | 3 +-
nltk/parse/transitionparser.py | 16 +-
nltk/probability.py | 21 +-
nltk/sem/boxer.py | 116 ++--
nltk/sem/drt.py | 12 +-
nltk/stem/snowball.py | 2 +-
nltk/tag/__init__.py | 6 +-
nltk/tag/crf.py | 2 +-
nltk/tag/hunpos.py | 2 +-
nltk/tag/stanford.py | 65 +-
nltk/test/crubadan.doctest | 65 ++
nltk/test/gensim.doctest | 144 +++++
nltk/test/unit/test_tgrep.py | 626 ++++++++++++++++++
nltk/tgrep.py | 941 ++++++++++++++++++++++++++++
nltk/tokenize/texttiling.py | 13 +-
nltk/tokenize/treebank.py | 4 +
setup.cfg | 2 +-
44 files changed, 2463 insertions(+), 151 deletions(-)
diff --git a/PKG-INFO b/PKG-INFO
index b5bf363..bdb38c0 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: nltk
-Version: 3.0.2
+Version: 3.0.3
Summary: Natural Language Toolkit
Home-page: http://nltk.org/
Author: Steven Bird
diff --git a/nltk.egg-info/PKG-INFO b/nltk.egg-info/PKG-INFO
index b5bf363..bdb38c0 100644
--- a/nltk.egg-info/PKG-INFO
+++ b/nltk.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: nltk
-Version: 3.0.2
+Version: 3.0.3
Summary: Natural Language Toolkit
Home-page: http://nltk.org/
Author: Steven Bird
diff --git a/nltk.egg-info/SOURCES.txt b/nltk.egg-info/SOURCES.txt
index d0587af..79c94ce 100644
--- a/nltk.egg-info/SOURCES.txt
+++ b/nltk.egg-info/SOURCES.txt
@@ -20,6 +20,7 @@ nltk/jsontags.py
nltk/lazyimport.py
nltk/probability.py
nltk/text.py
+nltk/tgrep.py
nltk/toolbox.py
nltk/tree.py
nltk/treeprettyprinter.py
@@ -80,6 +81,7 @@ nltk/classify/scikitlearn.py
nltk/classify/senna.py
nltk/classify/svm.py
nltk/classify/tadm.py
+nltk/classify/textcat.py
nltk/classify/util.py
nltk/classify/weka.py
nltk/cluster/__init__.py
@@ -101,6 +103,7 @@ nltk/corpus/reader/childes.py
nltk/corpus/reader/chunked.py
nltk/corpus/reader/cmudict.py
nltk/corpus/reader/conll.py
+nltk/corpus/reader/crubadan.py
nltk/corpus/reader/dependency.py
nltk/corpus/reader/framenet.py
nltk/corpus/reader/ieer.py
@@ -244,6 +247,7 @@ nltk/test/compat.doctest
nltk/test/compat_fixt.py
nltk/test/corpus.doctest
nltk/test/corpus_fixt.py
+nltk/test/crubadan.doctest
nltk/test/data.doctest
nltk/test/dependency.doctest
nltk/test/discourse.doctest
@@ -254,6 +258,7 @@ nltk/test/featgram.doctest
nltk/test/featstruct.doctest
nltk/test/framenet.doctest
nltk/test/generate.doctest
+nltk/test/gensim.doctest
nltk/test/gluesemantics.doctest
nltk/test/gluesemantics_malt.doctest
nltk/test/gluesemantics_malt_fixt.py
@@ -307,6 +312,7 @@ nltk/test/unit/test_naivebayes.py
nltk/test/unit/test_seekable_unicode_stream_reader.py
nltk/test/unit/test_stem.py
nltk/test/unit/test_tag.py
+nltk/test/unit/test_tgrep.py
nltk/test/unit/utils.py
nltk/tokenize/__init__.py
nltk/tokenize/api.py
diff --git a/nltk/VERSION b/nltk/VERSION
index b502146..75a22a2 100644
--- a/nltk/VERSION
+++ b/nltk/VERSION
@@ -1 +1 @@
-3.0.2
+3.0.3
diff --git a/nltk/__init__.py b/nltk/__init__.py
index 0882a57..963b882 100644
--- a/nltk/__init__.py
+++ b/nltk/__init__.py
@@ -94,6 +94,17 @@ try:
except ImportError:
pass
+# Override missing methods on environments where subprocess cannot be used, such as GAE.
+import subprocess
+if not hasattr(subprocess, 'PIPE'):
+ def _fake_PIPE(*args, **kwargs):
+ raise NotImplementedError('subprocess.PIPE is not supported.')
+ subprocess.PIPE = _fake_PIPE
+if not hasattr(subprocess, 'Popen'):
+ def _fake_Popen(*args, **kwargs):
+ raise NotImplementedError('subprocess.Popen is not supported.')
+ subprocess.Popen = _fake_Popen
+
###########################################################
# TOP-LEVEL MODULES
###########################################################
diff --git a/nltk/align/api.py b/nltk/align/api.py
index ec8d890..ddb1470 100644
--- a/nltk/align/api.py
+++ b/nltk/align/api.py
@@ -11,6 +11,8 @@ from __future__ import print_function, unicode_literals
from nltk.compat import python_2_unicode_compatible, string_types
from nltk.metrics import precision, recall
+import subprocess
+
@python_2_unicode_compatible
class AlignedSent(object):
@@ -92,6 +94,56 @@ class AlignedSent(object):
return "AlignedSent(%s, %s, %r)" % (words, mots, self._alignment)
+ def _to_dot(self):
+ """
+ Dot representation of the aligned sentence
+ """
+ s = 'graph align {\n'
+ s += 'node[shape=plaintext]\n'
+
+ # Declare node
+ for w in self._words:
+ s += '"%s_source" [label="%s"] \n' % (w, w)
+
+ for w in self._mots:
+ s += '"%s_target" [label="%s"] \n' % (w, w)
+
+ # Alignment
+ for u,v in self._alignment:
+ s += '"%s_source" -- "%s_target" \n' % (self._words[u] , self._mots[v] )
+
+ # Connect the source words
+ for i in range(len(self._words)-1) :
+ s += '"%s_source" -- "%s_source" [style=invis]\n' % (self._words[i] , self._words[i+1])
+
+ # Connect the target words
+ for i in range(len(self._mots)-1) :
+ s += '"%s_target" -- "%s_target" [style=invis]\n' % (self._mots[i] , self._mots[i+1])
+
+ # Put it in the same rank
+ s += '{rank = same; %s}\n' % (' '.join('"%s_source"' % w for w in self._words))
+ s += '{rank = same; %s}\n' % (' '.join('"%s_target"' % w for w in self._mots))
+
+ s += '}'
+
+ return s
+
+ def _repr_svg_(self):
+ """
+ IPython magic: show the SVG representation of this ``AlignedSent``.
+ """
+ dot_string = self._to_dot().encode('utf8')
+ output_format = 'svg'
+ try:
+ process = subprocess.Popen(['dot', '-T%s' % output_format], stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ except OSError:
+ raise Exception('Cannot find the dot binary from Graphviz package')
+ out, err = process.communicate(dot_string)
+
+ return out
+
+
def __str__(self):
"""
Return a human-readable string representation for this ``AlignedSent``.
diff --git a/nltk/align/bleu_score.py b/nltk/align/bleu_score.py
index 5472509..8878250 100644
--- a/nltk/align/bleu_score.py
+++ b/nltk/align/bleu_score.py
@@ -203,9 +203,57 @@ def _brevity_penalty(candidate, references):
length sentence, brevity penalty is used to modify the overall BLEU
score according to length.
+ An example from the paper: there are three references with lengths 12, 15
+ and 17, and a candidate of length 12. The brevity penalty is 1.
+
+ >>> references = [['a'] * 12, ['a'] * 15, ['a'] * 17]
+ >>> candidate = ['a'] * 12
+ >>> _brevity_penalty(candidate, references)
+ 1.0
+
+ If a candidate translation is shorter than the references, a penalty is
+ applied.
+
+ >>> references = [['a'] * 28, ['a'] * 28]
+ >>> candidate = ['a'] * 12
+ >>> _brevity_penalty(candidate, references)
+ 0.2635...
+
+ The length of the closest reference is used to compute the penalty. If the
+ length of a candidate is 12, and the reference lengths are 13 and 2, the
+ penalty is applied because the candidate length (12) is less than the
+ closest reference length (13).
+
+ >>> references = [['a'] * 13, ['a'] * 2]
+ >>> candidate = ['a'] * 12
+ >>> _brevity_penalty(candidate, references)
+ 0.92...
+
+ The brevity penalty doesn't depend on reference order. More importantly,
+ when two reference sentences are at the same distance, the shortest
+ reference sentence length is used.
+
+ >>> references = [['a'] * 13, ['a'] * 11]
+ >>> candidate = ['a'] * 12
+ >>> _brevity_penalty(candidate, references) == _brevity_penalty(candidate, reversed(references)) == 1
+ True
+
+ A test example from mteval-v13a.pl (starting at line 705):
+
+ >>> references = [['a'] * 11, ['a'] * 8]
+ >>> candidate = ['a'] * 7
+ >>> _brevity_penalty(candidate, references)
+ 0.86...
+
+ >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
+ >>> candidate = ['a'] * 7
+ >>> _brevity_penalty(candidate, references)
+ 1.0
+
"""
c = len(candidate)
- r = min(abs(len(r) - c) for r in references)
+ ref_lens = (len(reference) for reference in references)
+ r = min(ref_lens, key=lambda ref_len: (abs(ref_len - c), ref_len))
if c > r:
return 1
diff --git a/nltk/align/ibm1.py b/nltk/align/ibm1.py
index e88fe10..43e618f 100644
--- a/nltk/align/ibm1.py
+++ b/nltk/align/ibm1.py
@@ -15,7 +15,6 @@
from __future__ import division
from collections import defaultdict
from nltk.align import AlignedSent
-from nltk.corpus import comtrans
class IBMModel1(object):
"""
@@ -28,6 +27,7 @@ class IBMModel1(object):
Step 2 - Estimate the probability of translation according to the
evidence from Step 1.
+ >>> from nltk.corpus import comtrans
>>> bitexts = comtrans.aligned_sents()[:100]
>>> ibm = IBMModel1(bitexts, 20)
diff --git a/nltk/align/ibm2.py b/nltk/align/ibm2.py
index f2f4b35..cfa70d4 100644
--- a/nltk/align/ibm2.py
+++ b/nltk/align/ibm2.py
@@ -9,7 +9,6 @@
from __future__ import division
from collections import defaultdict
from nltk.align import AlignedSent
-from nltk.corpus import comtrans
from nltk.align.ibm1 import IBMModel1
class IBMModel2(object):
@@ -26,6 +25,7 @@ class IBMModel2(object):
Step 3 - Estimate the probability of translation and alignment according
to the evidence from Step 2.
+ >>> from nltk.corpus import comtrans
>>> bitexts = comtrans.aligned_sents()[:100]
>>> ibm = IBMModel2(bitexts, 5)
>>> aligned_sent = ibm.align(bitexts[0])
diff --git a/nltk/app/__init__.py b/nltk/app/__init__.py
index 4297a51..7e02d78 100644
--- a/nltk/app/__init__.py
+++ b/nltk/app/__init__.py
@@ -39,11 +39,11 @@ else:
from nltk.app.wordnet_app import app as wordnet
try:
- import pylab
+ from matplotlib import pylab
except ImportError:
import warnings
warnings.warn("nltk.app.wordfreq not loaded "
- "(requires the pylab library).")
+ "(requires the matplotlib library).")
else:
from nltk.app.wordfreq_app import app as wordfreq
diff --git a/nltk/app/wordfreq_app.py b/nltk/app/wordfreq_app.py
index 3ced28a..2d9bb9b 100644
--- a/nltk/app/wordfreq_app.py
+++ b/nltk/app/wordfreq_app.py
@@ -5,8 +5,8 @@
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-import pylab
-import nltk.text
+from matplotlib import pylab
+from nltk.text import Text
from nltk.corpus import gutenberg
def plot_word_freq_dist(text):
@@ -23,7 +23,7 @@ def plot_word_freq_dist(text):
pylab.show()
def app():
- t1 = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
+ t1 = Text(gutenberg.words('melville-moby_dick.txt'))
plot_word_freq_dist(t1)
if __name__ == '__main__':
diff --git a/nltk/classify/__init__.py b/nltk/classify/__init__.py
index 972995b..1f57ee8 100644
--- a/nltk/classify/__init__.py
+++ b/nltk/classify/__init__.py
@@ -95,3 +95,4 @@ from nltk.classify.maxent import (MaxentClassifier, BinaryMaxentFeatureEncoding,
TypedMaxentFeatureEncoding,
ConditionalExponentialClassifier)
from nltk.classify.senna import Senna
+from nltk.classify.textcat import TextCat
diff --git a/nltk/classify/naivebayes.py b/nltk/classify/naivebayes.py
index 5f1cffb..6f473e2 100644
--- a/nltk/classify/naivebayes.py
+++ b/nltk/classify/naivebayes.py
@@ -178,8 +178,8 @@ class NaiveBayesClassifier(ClassifierI):
minprob[feature_]/maxprob[feature_])
return features[:n]
- @staticmethod
- def train(labeled_featuresets, estimator=ELEProbDist):
+ @classmethod
+ def train(cls, labeled_featuresets, estimator=ELEProbDist):
"""
:param labeled_featuresets: A list of classified featuresets,
i.e., a list of tuples ``(featureset, label)``.
@@ -225,7 +225,7 @@ class NaiveBayesClassifier(ClassifierI):
probdist = estimator(freqdist, bins=len(feature_values[fname]))
feature_probdist[label, fname] = probdist
- return NaiveBayesClassifier(label_probdist, feature_probdist)
+ return cls(label_probdist, feature_probdist)
##//////////////////////////////////////////////////////
## Demo
diff --git a/nltk/classify/tadm.py b/nltk/classify/tadm.py
index 6437d98..c019f00 100644
--- a/nltk/classify/tadm.py
+++ b/nltk/classify/tadm.py
@@ -14,7 +14,7 @@ from nltk.internals import find_binary
try:
import numpy
except ImportError:
- numpy = None
+ pass
_tadm_bin = None
def config_tadm(bin=None):
diff --git a/nltk/classify/textcat.py b/nltk/classify/textcat.py
new file mode 100644
index 0000000..cb29805
--- /dev/null
+++ b/nltk/classify/textcat.py
@@ -0,0 +1,193 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Language ID module using TextCat algorithm
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Avital Pekker <avital.pekker at utoronto.ca>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A module for language identification using the TextCat algorithm.
+An implementation of the text categorization algorithm
+presented in Cavnar, W. B. and J. M. Trenkle,
+"N-Gram-Based Text Categorization".
+
+The algorithm takes advantage of Zipf's law and uses
+n-gram frequencies to profile languages and the text to
+be identified, then compares them using a distance measure.
+
+Language n-grams are provided by the "An Crubadan"
+project. A corpus reader was created separately to read
+those files.
+
+For details regarding the algorithm, see:
+http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
+
+For details about An Crubadan, see:
+http://borel.slu.edu/crubadan/index.html
+"""
+
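+# A minimal usage sketch (illustrative only, not part of the module): TextCat
+# needs the 'crubadan' corpus (install via nltk.download()) and the third-party
+# 'regex' module.
+#
+#     from nltk.classify.textcat import TextCat
+#     tc = TextCat()
+#     tc.guess_language('Dies ist ein kurzer Satz.')  # returns an ISO 639-3 code, e.g. 'deu'
+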
+# Ensure that literal strings default to unicode rather than str.
+from __future__ import print_function, unicode_literals
+
+from nltk.compat import PY3
+from nltk.util import trigrams
+
+if PY3:
+ from sys import maxsize
+else:
+ from sys import maxint
+
+# Note: this is NOT "re" you're likely used to. The regex module
+# is an alternative to the standard re module that supports
+# Unicode codepoint properties with the \p{} syntax.
+# You may have to "pip install regex"
+try:
+ import regex as re
+except ImportError:
+ re = None
+######################################################################
+## Language identification using TextCat
+######################################################################
+
+class TextCat(object):
+
+ _corpus = None
+ fingerprints = {}
+ _START_CHAR = "<"
+ _END_CHAR = ">"
+
+ last_distances = {}
+
+ def __init__(self):
+ if not re:
+ raise EnvironmentError("classify.textcat requires the regex module that "
+ "supports unicode. Try '$ pip install regex' and "
+ "see https://pypi.python.org/pypi/regex for "
+ "further details.")
+
+ from nltk.corpus import crubadan
+ self._corpus = crubadan
+ # Load all language ngrams into cache
+ for lang in self._corpus.langs():
+ self._corpus.lang_freq(lang)
+
+ def remove_punctuation(self, text):
+ ''' Get rid of punctuation except apostrophes '''
+ return re.sub(r"[^\P{P}\']+", "", text)
+
+ def profile(self, text):
+ ''' Create FreqDist of trigrams within text '''
+ from nltk import word_tokenize, FreqDist
+
+ clean_text = self.remove_punctuation(text)
+ tokens = word_tokenize(clean_text)
+
+ fingerprint = FreqDist()
+ for t in tokens:
+ token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
+ token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
+
+ for cur_trigram in token_trigrams:
+ if cur_trigram in fingerprint:
+ fingerprint[cur_trigram] += 1
+ else:
+ fingerprint[cur_trigram] = 1
+
+ return fingerprint
+
+ def calc_dist(self, lang, trigram, text_profile):
+ ''' Calculate the "out-of-place" measure between the
+ text and language profile for a single trigram '''
+
+ lang_fd = self._corpus.lang_freq(lang)
+ dist = 0
+
+ if trigram in lang_fd:
+ idx_lang_profile = list(lang_fd.keys()).index(trigram)
+ idx_text = list(text_profile.keys()).index(trigram)
+
+ #print(idx_lang_profile, ", ", idx_text)
+ dist = abs(idx_lang_profile - idx_text)
+ else:
+ # Arbitrary but should be larger than
+ # any possible trigram file length
+ # in terms of total lines
+ if PY3:
+ dist = maxsize
+ else:
+ dist = maxint
+
+ return dist
+
+ def lang_dists(self, text):
+ ''' Calculate the "out-of-place" measure between
+ the text and all languages '''
+
+ distances = {}
+ profile = self.profile(text)
+ # For all the languages
+ for lang in self._corpus._all_lang_freq.keys():
+ # Calculate distance metric for every trigram in
+ # input text to be identified
+ lang_dist = 0
+ for trigram in profile:
+ lang_dist += self.calc_dist(lang, trigram, profile)
+
+ distances[lang] = lang_dist
+
+ return distances
+
+ def guess_language(self, text):
+ ''' Find the language with the min distance
+ to the text and return its ISO 639-3 code '''
+ self.last_distances = self.lang_dists(text)
+
+ return min(self.last_distances, key=self.last_distances.get)
+ #################################################')
+
+def demo():
+ from nltk.corpus import udhr
+
+ langs = ['Kurdish-UTF8', 'Abkhaz-UTF8', 'Farsi_Persian-UTF8',
+ 'Hindi-UTF8', 'Hawaiian-UTF8', 'Russian-UTF8', 'Vietnamese-UTF8',
+ 'Serbian_Srpski-UTF8','Esperanto-UTF8']
+
+ friendly = {'kmr':'Northern Kurdish',
+ 'abk':'Abkhazian',
+ 'pes':'Iranian Persian',
+ 'hin':'Hindi',
+ 'haw':'Hawaiian',
+ 'rus':'Russian',
+ 'vie':'Vietnamese',
+ 'srp':'Serbian',
+ 'epo':'Esperanto'}
+
+ tc = TextCat()
+
+ for cur_lang in langs:
+ # Get raw data from UDHR corpus
+ raw_sentences = udhr.sents(cur_lang)
+ rows = len(raw_sentences) - 1
+ cols = list(map(len, raw_sentences))
+
+ sample = ''
+
+ # Generate a sample text of the language
+ for i in range(0, rows):
+ cur_sent = ''
+ for j in range(0, cols[i]):
+ cur_sent += ' ' + raw_sentences[i][j]
+
+ sample += cur_sent
+
+ # Try to detect what it is
+ print('Language snippet: ' + sample[0:140] + '...')
+ guess = tc.guess_language(sample)
+ print('Language detection: %s (%s)' % (guess, friendly[guess]))
+ print('#' * 140)
+
+
+if __name__ == '__main__':
+ demo()
diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py
index 101dd26..08d7011 100644
--- a/nltk/corpus/__init__.py
+++ b/nltk/corpus/__init__.py
@@ -95,6 +95,8 @@ conll2007 = LazyCorpusLoader(
'conll2007', DependencyCorpusReader, '.*\.(test|train).*', encoding=[
('eus', 'ISO-8859-2'),
('esp', 'utf8')])
+crubadan = LazyCorpusLoader(
+ 'crubadan', CrubadanCorpusReader, '.*\.txt')
dependency_treebank = LazyCorpusLoader(
'dependency_treebank', DependencyCorpusReader, '.*\.dp',
encoding='ascii')
@@ -182,6 +184,10 @@ stopwords = LazyCorpusLoader(
'stopwords', WordListCorpusReader, r'(?!README|\.).*', encoding='utf8')
swadesh = LazyCorpusLoader(
'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8')
+swadesh110 = LazyCorpusLoader(
+ 'panlex_swadesh', SwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8')
+swadesh207 = LazyCorpusLoader(
+ 'panlex_swadesh', SwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8')
switchboard = LazyCorpusLoader(
'switchboard', SwitchboardCorpusReader, tagset='wsj')
timit = LazyCorpusLoader(
@@ -246,6 +252,7 @@ semcor = LazyCorpusLoader(
'semcor', SemcorCorpusReader, r'brown./tagfiles/br-.*\.xml',
wordnet) # Must be defined *after* wordnet corpus.
+
def demo():
# This is out-of-date:
abc.demo()
diff --git a/nltk/corpus/reader/__init__.py b/nltk/corpus/reader/__init__.py
index e48be76..047f358 100644
--- a/nltk/corpus/reader/__init__.py
+++ b/nltk/corpus/reader/__init__.py
@@ -94,6 +94,7 @@ from nltk.corpus.reader.udhr import *
from nltk.corpus.reader.bnc import *
from nltk.corpus.reader.sentiwordnet import *
from nltk.corpus.reader.nkjp import *
+from nltk.corpus.reader.crubadan import *
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
# the function bracket_parse() defined in nltk.tree:
@@ -129,5 +130,5 @@ __all__ = [
'TimitTaggedCorpusReader', 'LinThesaurusCorpusReader',
'SemcorCorpusReader', 'FramenetCorpusReader', 'UdhrCorpusReader',
'BNCCorpusReader', 'SentiWordNetCorpusReader', 'SentiSynset',
- 'NKJPCorpusReader'
+ 'NKJPCorpusReader', 'CrubadanCorpusReader'
]
diff --git a/nltk/corpus/reader/crubadan.py b/nltk/corpus/reader/crubadan.py
new file mode 100644
index 0000000..73e3fbf
--- /dev/null
+++ b/nltk/corpus/reader/crubadan.py
@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: An Crubadan N-grams Reader
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Avital Pekker <avital.pekker at utoronto.ca>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+An NLTK interface for the n-gram statistics gathered from
+the corpora for each language using An Crubadan.
+
+There are multiple potential applications for the data but
+this reader was created with the goal of using it in the
+context of language identification.
+
+For details about An Crubadan, this data, and its potential uses, see:
+http://borel.slu.edu/crubadan/index.html
+"""
+
+from __future__ import print_function, unicode_literals
+
+import re
+from nltk.compat import PY3
+from os import path
+from nltk.corpus.reader import CorpusReader
+from nltk.probability import FreqDist
+from nltk.data import ZipFilePathPointer
+
+class CrubadanCorpusReader(CorpusReader):
+ """
+ A corpus reader used to access the An Crubadan n-gram files for each language.
+ """
+
+ _LANG_MAPPER_FILE = 'table.txt'
+ _all_lang_freq = {}
+
+ def __init__(self, root, fileids, encoding='utf8', tagset=None):
+ super(CrubadanCorpusReader, self).__init__(root, fileids, encoding='utf8')
+ self._lang_mapping_data = []
+ self._load_lang_mapping_data()
+
+ def lang_freq(self, lang):
+ ''' Return n-gram FreqDist for a specific language
+ given ISO 639-3 language code '''
+
+ if lang not in self._all_lang_freq:
+ self._all_lang_freq[lang] = self._load_lang_ngrams(lang)
+
+ return self._all_lang_freq[lang]
+
+ def langs(self):
+ ''' Return a list of supported languages as ISO 639-3 codes '''
+ return [row[1] for row in self._lang_mapping_data]
+
+ def iso_to_crubadan(self, lang):
+ ''' Return internal Crubadan code based on ISO 639-3 code '''
+ for i in self._lang_mapping_data:
+ if i[1].lower() == lang.lower():
+ return i[0]
+
+ def crubadan_to_iso(self, lang):
+ ''' Return ISO 639-3 code given internal Crubadan code '''
+ for i in self._lang_mapping_data:
+ if i[0].lower() == lang.lower():
+ return i[1]
+
+ def _load_lang_mapping_data(self):
+ ''' Load language mappings between codes and description from table.txt '''
+ if isinstance(self.root, ZipFilePathPointer):
+ raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")
+
+ mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
+ if self._LANG_MAPPER_FILE not in self.fileids():
+ raise RuntimeError("Could not find language mapper file: " + mapper_file)
+
+ if PY3:
+ raw = open(mapper_file, 'r', encoding='utf-8').read().strip()
+ else:
+ raw = open(mapper_file, 'rU').read().decode('utf-8').strip()
+
+ self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
+
+ def _load_lang_ngrams(self, lang):
+ ''' Load single n-gram language file given the ISO 639-3 language code
+ and return its FreqDist '''
+
+ if lang not in self.langs():
+ raise RuntimeError("Unsupported language.")
+
+ crubadan_code = self.iso_to_crubadan(lang)
+ ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')
+
+ if not path.isfile(ngram_file):
+ raise RuntimeError("No N-gram file found for requested language.")
+
+ counts = FreqDist()
+ if PY3:
+ f = open(ngram_file, 'r', encoding='utf-8')
+ else:
+ f = open(ngram_file, 'rU')
+
+ for line in f:
+ if PY3:
+ data = line.split(' ')
+ else:
+ data = line.decode('utf8').split(' ')
+
+ ngram = data[1].strip('\n')
+ freq = int(data[0])
+
+ counts[ngram] = freq
+
+ return counts
+
diff --git a/nltk/corpus/reader/lin.py b/nltk/corpus/reader/lin.py
index d6b3d65..05aeb97 100644
--- a/nltk/corpus/reader/lin.py
+++ b/nltk/corpus/reader/lin.py
@@ -140,7 +140,7 @@ def demo():
print(thes.synonyms(word1))
print("Getting scored synonyms for " + word1)
- print(thes.synonyms(word1))
+ print(thes.scored_synonyms(word1))
print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
print(thes.synonyms(word1, fileid="simN.lsp"))
diff --git a/nltk/data.py b/nltk/data.py
index e9e9fa4..93055a1 100644
--- a/nltk/data.py
+++ b/nltk/data.py
@@ -73,7 +73,7 @@ path = []
# User-specified locations:
path += [d for d in os.environ.get('NLTK_DATA', str('')).split(os.pathsep) if d]
-if os.path.expanduser('~/') != '~/':
+if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
path.append(os.path.expanduser(str('~/nltk_data')))
if sys.platform.startswith('win'):
diff --git a/nltk/downloader.py b/nltk/downloader.py
index 08390d4..9d9984a 100644
--- a/nltk/downloader.py
+++ b/nltk/downloader.py
@@ -924,6 +924,10 @@ class Downloader(object):
permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
"""
+ # Check if we are on GAE where we cannot write into filesystem.
+ if 'APPENGINE_RUNTIME' in os.environ:
+ return
+
# Check if we have sufficient permissions to install in a
# variety of system-wide locations.
for nltkdir in nltk.data.path:
@@ -2267,4 +2271,3 @@ if __name__ == '__main__':
downloader.download(download_dir=options.dir,
quiet=options.quiet, force=options.force,
halt_on_error=options.halt_on_error)
-
diff --git a/nltk/draw/dispersion.py b/nltk/draw/dispersion.py
index 2ba89e7..eddc36f 100644
--- a/nltk/draw/dispersion.py
+++ b/nltk/draw/dispersion.py
@@ -22,10 +22,10 @@ def dispersion_plot(text, words, ignore_case=False):
"""
try:
- import pylab
+ from matplotlib import pylab
except ImportError:
- raise ValueError('The plot function requires the matplotlib package (aka pylab).'
- 'See http://matplotlib.sourceforge.net/')
+ raise ValueError('The plot function requires matplotlib to be installed.'
+ 'See http://matplotlib.org/')
text = list(text)
words.reverse()
diff --git a/nltk/draw/util.py b/nltk/draw/util.py
index 0d5c83c..468775b 100644
--- a/nltk/draw/util.py
+++ b/nltk/draw/util.py
@@ -1930,7 +1930,7 @@ class EntryDialog(object):
self._original_text = original_text
self._set_callback = set_callback
- width = max(30, len(original_text)*3/2)
+ width = int(max(30, len(original_text)*3/2))
self._top = Toplevel(parent)
if title: self._top.title(title)
diff --git a/nltk/internals.py b/nltk/internals.py
index 4674d9e..cfe938e 100644
--- a/nltk/internals.py
+++ b/nltk/internals.py
@@ -500,7 +500,7 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
print('[Found %s: %s]' % (filename, path))
yielded = True
yield path
- except (KeyboardInterrupt, SystemExit):
+ except (KeyboardInterrupt, SystemExit, OSError):
raise
except:
pass
@@ -513,7 +513,7 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
if searchpath:
msg += '\n\n Searched in:'
msg += ''.join('\n - %s' % d for d in searchpath)
- if url: msg += ('\n\n For more information, on %s, see:\n <%s>' %
+ if url: msg += ('\n\n For more information on %s, see:\n <%s>' %
(filename, url))
div = '='*75
raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
@@ -592,6 +592,23 @@ def find_jar_iter(name_pattern, path_to_jar=None, env_vars=(),
print('[Found %s: %s]' % (name_pattern, cp))
yielded = True
yield cp
+ # The case where the user put a directory containing the jar file in the classpath
+ if os.path.isdir(cp):
+ if not is_regex:
+ if os.path.isfile(os.path.join(cp,name_pattern)):
+ if verbose:
+ print('[Found %s: %s]' % (name_pattern, cp))
+ yielded = True
+ yield os.path.join(cp,name_pattern)
+ else:
+ # Look for file using regular expression
+ for file_name in os.listdir(cp):
+ if re.match(name_pattern,file_name):
+ if verbose:
+ print('[Found %s: %s]' % (name_pattern, os.path.join(cp,file_name)))
+ yielded = True
+ yield os.path.join(cp,file_name)
+
else:
jar_env = os.environ[env_var]
jar_iter = ((os.path.join(jar_env, path_to_jar) for path_to_jar in os.listdir(jar_env))
diff --git a/nltk/metrics/agreement.py b/nltk/metrics/agreement.py
index 3a7f0ea..b379a07 100644
--- a/nltk/metrics/agreement.py
+++ b/nltk/metrics/agreement.py
@@ -232,8 +232,8 @@ class AnnotationTask(object):
data = (x for x in self.data if x['coder'] in (cA, cB))
for i, itemdata in self._grouped_data('item', data):
# we should have two items; distance doesn't care which comes first
- total += self.distance(itemdata.next()['labels'],
- itemdata.next()['labels'])
+ total += self.distance(next(itemdata)['labels'],
+ next(itemdata)['labels'])
ret = total / (len(self.I) * max_distance)
log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret)
diff --git a/nltk/parse/dependencygraph.py b/nltk/parse/dependencygraph.py
index 5646914..f5f6ef8 100755
--- a/nltk/parse/dependencygraph.py
+++ b/nltk/parse/dependencygraph.py
@@ -19,6 +19,7 @@ from collections import defaultdict
from itertools import chain
from pprint import pformat
import subprocess
+import warnings
from nltk.tree import Tree
from nltk.compat import python_2_unicode_compatible, string_types
@@ -50,14 +51,21 @@ class DependencyGraph(object):
are split by whitespace.
"""
- self.nodes = defaultdict(lambda: {'deps': defaultdict(list)})
+ self.nodes = defaultdict(lambda: {'address': None,
+ 'word': None,
+ 'lemma': None,
+ 'ctag': None,
+ 'tag': None,
+ 'feats': None,
+ 'head': None,
+ 'deps': defaultdict(list),
+ 'rel': None,
+ })
+
self.nodes[0].update(
{
- 'word': None,
- 'lemma': None,
'ctag': 'TOP',
'tag': 'TOP',
- 'feats': None,
'rel': 'TOP',
'address': 0,
}
@@ -291,13 +299,14 @@ class DependencyGraph(object):
rel = 'ROOT'
self.nodes[head]['deps'][rel].append(index)
- if not self.nodes[0]['deps']['ROOT']:
- raise DependencyGraphError(
- "The graph does'n contain a node "
+ if self.nodes[0]['deps']['ROOT']:
+ root_address = self.nodes[0]['deps']['ROOT'][0]
+ self.root = self.nodes[root_address]
+ else:
+ warnings.warn(
+ "The graph doesn't contain a node "
"that depends on the root element."
)
- root_address = self.nodes[0]['deps']['ROOT'][0]
- self.root = self.nodes[root_address]
def _word(self, node, filter=True):
w = node['word']
@@ -447,7 +456,7 @@ class DependencyGraph(object):
def nx_graph(self):
"""Convert the data in a ``nodelist`` into a networkx labeled directed graph."""
- import networkx as NX
+ import networkx
nx_nodelist = list(range(1, len(self.nodes)))
nx_edgelist = [
@@ -458,7 +467,7 @@ class DependencyGraph(object):
for n in nx_nodelist:
self.nx_labels[n] = self.nodes[n]['word']
- g = NX.XDiGraph()
+ g = networkx.MultiDiGraph()
g.add_nodes_from(nx_nodelist)
g.add_edges_from(nx_edgelist)
@@ -504,19 +513,19 @@ Nov. NNP 9 VMOD
tree.pprint()
if nx:
# currently doesn't work
- import networkx as NX
- import pylab as P
+ import networkx
+ from matplotlib import pylab
g = dg.nx_graph()
g.info()
- pos = NX.spring_layout(g, dim=1)
- NX.draw_networkx_nodes(g, pos, node_size=50)
- # NX.draw_networkx_edges(g, pos, edge_color='k', width=8)
- NX.draw_networkx_labels(g, pos, dg.nx_labels)
- P.xticks([])
- P.yticks([])
- P.savefig('tree.png')
- P.show()
+ pos = networkx.spring_layout(g, dim=1)
+ networkx.draw_networkx_nodes(g, pos, node_size=50)
+ # networkx.draw_networkx_edges(g, pos, edge_color='k', width=8)
+ networkx.draw_networkx_labels(g, pos, dg.nx_labels)
+ pylab.xticks([])
+ pylab.yticks([])
+ pylab.savefig('tree.png')
+ pylab.show()
def conll_demo():
@@ -545,13 +554,11 @@ def cycle_finding_demo():
dg = DependencyGraph(treebank_data)
print(dg.contains_cycle())
cyclic_dg = DependencyGraph()
- top = {'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0}
- child1 = {'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1}
- child2 = {'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2}
- child3 = {'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3}
- child4 = {'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4}
- cyclic_dg.nodelist = [top, child1, child2, child3, child4]
- cyclic_dg.root = top
+ cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0})
+ cyclic_dg.add_node({'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1})
+ cyclic_dg.add_node({'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2})
+ cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3})
+ cyclic_dg.add_node({'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4})
print(cyclic_dg.contains_cycle())
treebank_data = """Pierre NNP 2 NMOD
diff --git a/nltk/parse/malt.py b/nltk/parse/malt.py
index e86dde4..b9455ed 100644
--- a/nltk/parse/malt.py
+++ b/nltk/parse/malt.py
@@ -15,7 +15,6 @@ from functools import reduce
import subprocess
from nltk.data import ZipFilePathPointer
-from nltk.tag import RegexpTagger
from nltk.tokenize import word_tokenize
from nltk.internals import find_binary
@@ -43,6 +42,7 @@ class MaltParser(ParserI):
if tagger is not None:
self.tagger = tagger
else:
+ from nltk.tag import RegexpTagger
self.tagger = RegexpTagger(
[(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
(r'(The|the|A|a|An|an)$', 'AT'), # articles
diff --git a/nltk/parse/nonprojectivedependencyparser.py b/nltk/parse/nonprojectivedependencyparser.py
index 6de271e..d880e68 100644
--- a/nltk/parse/nonprojectivedependencyparser.py
+++ b/nltk/parse/nonprojectivedependencyparser.py
@@ -14,7 +14,6 @@ import logging
from nltk.compat import xrange
from nltk.parse.dependencygraph import DependencyGraph
-from nltk.classify import NaiveBayesClassifier
logger = logging.getLogger(__name__)
@@ -111,6 +110,8 @@ class NaiveBayesDependencyScorer(DependencyScorerI):
:param graphs: A list of dependency graphs to train the scorer.
"""
+ from nltk.classify import NaiveBayesClassifier
+
# Create training labeled training examples
labeled_examples = []
for graph in graphs:
diff --git a/nltk/parse/transitionparser.py b/nltk/parse/transitionparser.py
index 8a678d6..ae39701 100644
--- a/nltk/parse/transitionparser.py
+++ b/nltk/parse/transitionparser.py
@@ -16,8 +16,8 @@ from os import remove
from copy import deepcopy
from operator import itemgetter
try:
- from scipy import sparse
from numpy import array
+ from scipy import sparse
from sklearn.datasets import load_svmlight_file
from sklearn import svm
except ImportError:
@@ -328,11 +328,13 @@ class TransitionParser(ParserI):
def _is_projective(self, depgraph):
arc_list = []
for key in depgraph.nodes:
- node = depgraph.nodes[key]
+ node = depgraph.nodes[key]
+
if 'head' in node:
childIdx = node['address']
parentIdx = node['head']
- arc_list.append((parentIdx, childIdx))
+ if parentIdx is not None:
+ arc_list.append((parentIdx, childIdx))
for (parentIdx, childIdx) in arc_list:
# Ensure that childIdx < parentIdx
@@ -756,14 +758,14 @@ def demo():
>>> result = parser_std.parse([gold_sent], 'temp.arcstd.model')
>>> de = DependencyEvaluator(result, [gold_sent])
- >>> print(de.eval())
- (0.125, 0.0)
+ >>> de.eval() >= (0, 0)
+ True
B. Check the ARC-EAGER parser
>>> result = parser_eager.parse([gold_sent], 'temp.arceager.model')
>>> de = DependencyEvaluator(result, [gold_sent])
- >>> print(de.eval())
- (0.0, 0.0)
+ >>> de.eval() >= (0, 0)
+ True
Note that result is very poor because of only one training example.
"""
diff --git a/nltk/probability.py b/nltk/probability.py
index 1ae001f..b63835f 100644
--- a/nltk/probability.py
+++ b/nltk/probability.py
@@ -226,10 +226,10 @@ class FreqDist(Counter):
:type title: bool
"""
try:
- import pylab
+ from matplotlib import pylab
except ImportError:
- raise ValueError('The plot function requires the matplotlib package (aka pylab). '
- 'See http://matplotlib.sourceforge.net/')
+ raise ValueError('The plot function requires matplotlib to be installed.'
+ 'See http://matplotlib.org/')
if len(args) == 0:
args = [len(self)]
@@ -1272,6 +1272,11 @@ class SimpleGoodTuringProbDist(ProbDistI):
xy_cov += (x - x_mean) * (y - y_mean)
x_var += (x - x_mean)**2
self._slope = (xy_cov / x_var if x_var != 0 else 0.0)
+ if self._slope >= -1:
+ warnings.warn('SimpleGoodTuring did not find a proper best fit '
+ 'line for smoothing probabilities of occurrences. '
+ 'The probability estimates are likely to be '
+ 'unreliable.')
self._intercept = y_mean - self._slope * x_mean
def _switch(self, r, nr):
@@ -1516,9 +1521,9 @@ class KneserNeyProbDist(ProbDistI):
"""
def __init__(self, freqdist, bins=None, discount=0.75):
"""
- :param trigrams: The trigram frequency distribution upon which to base
+ :param freqdist: The trigram frequency distribution upon which to base
the estimation
- :type trigrams: FreqDist
+ :type freqdist: FreqDist
:param bins: Included for compatibility with nltk.tag.hmm
:type bins: int or float
:param discount: The discount applied when retrieving counts of
@@ -1739,10 +1744,10 @@ class ConditionalFreqDist(defaultdict):
:type conditions: list
"""
try:
- import pylab
+ from matplotlib import pylab
except ImportError:
- raise ValueError('The plot function requires the matplotlib package (aka pylab).'
- 'See http://matplotlib.sourceforge.net/')
+ raise ValueError('The plot function requires matplotlib to be installed.'
+ 'See http://matplotlib.org/')
cumulative = _get_kwarg(kwargs, 'cumulative', False)
conditions = _get_kwarg(kwargs, 'conditions', sorted(self.conditions()))
diff --git a/nltk/sem/boxer.py b/nltk/sem/boxer.py
index 19a66e1..41d74d7 100644
--- a/nltk/sem/boxer.py
+++ b/nltk/sem/boxer.py
@@ -171,7 +171,7 @@ class Boxer(object):
args = ['--box', 'false',
'--semantics', 'drs',
- '--flat', 'false',
+ #'--flat', 'false', # removed from boxer
'--resolve', 'true',
'--elimeq', ['false','true'][self._elimeq],
'--format', 'prolog',
@@ -237,7 +237,9 @@ class Boxer(object):
i += 1
line = lines[i]
assert line.startswith('sem(%s,' % drs_id)
- assert line.endswith(').')
+ if line[-4:] == "').'":
+ line = line[:-4] + ")."
+ assert line.endswith(').'), "can't parse line: %s" % line
search_start = len('sem(%s,[' % drs_id)
brace_count = 1
@@ -248,7 +250,11 @@ class Boxer(object):
if(c == ']'):
brace_count -= 1
if(brace_count == 0):
- drs_start = search_start + j + 2
+ drs_start = search_start + j + 1
+ if line[drs_start:drs_start+3] == "','":
+ drs_start = drs_start + 3
+ else:
+ drs_start = drs_start + 1
break
assert drs_start > -1
@@ -272,10 +278,8 @@ class BoxerOutputDrsParser(DrtParser):
self.discourse_id = discourse_id
self.sentence_id_offset = None
self.quote_chars = [("'", "'", "\\", False)]
- self._label_counter = None
def parse(self, data, signature=None):
- self._label_counter = Counter(-1)
return DrtParser.parse(self, data, signature)
def get_all_symbols(self):
@@ -339,6 +343,8 @@ class BoxerOutputDrsParser(DrtParser):
elif tok == 'whq':
conds = [self._handle_whq()]
+ elif tok == 'duplex':
+ conds = [self._handle_duplex()]
else:
conds = []
@@ -367,6 +373,38 @@ class BoxerOutputDrsParser(DrtParser):
return BoxerPred(self.discourse_id, sent_index, word_indices, variable, name, pos, sense)
return _handle_pred_f
+ def _handle_duplex(self):
+ #duplex(whq, drs(...), var, drs(...))
+ self.assertToken(self.token(), '(')
+ # self.assertToken(self.token(), '[')
+ ans_types = []
+ # while self.token(0) != ']':
+ # cat = self.token()
+ # self.assertToken(self.token(), ':')
+ # if cat == 'des':
+ # ans_types.append(self.token())
+ # elif cat == 'num':
+ # ans_types.append('number')
+ # typ = self.token()
+ # if typ == 'cou':
+ # ans_types.append('count')
+ # else:
+ # ans_types.append(typ)
+ # else:
+ # ans_types.append(self.token())
+ # self.token() #swallow the ']'
+
+ self.assertToken(self.token(), 'whq')
+ self.assertToken(self.token(), ',')
+ d1 = self.process_next_expression(None)
+ self.assertToken(self.token(), ',')
+ ref = self.parse_variable()
+ self.assertToken(self.token(), ',')
+ d2 = self.process_next_expression(None)
+ self.assertToken(self.token(), ')')
+ return lambda sent_index, word_indices: BoxerWhq(self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2)
+
+
def _handle_named(self):
#named(x0, john, per, 0)
self.assertToken(self.token(), '(')
@@ -376,7 +414,7 @@ class BoxerOutputDrsParser(DrtParser):
self.assertToken(self.token(), ',')
type = self.token()
self.assertToken(self.token(), ',')
- sense = int(self.token())
+ sense = self.token() # as per boxer rev 2554
self.assertToken(self.token(), ')')
return lambda sent_index, word_indices: BoxerNamed(self.discourse_id, sent_index, word_indices, variable, name, type, sense)
@@ -504,7 +542,6 @@ class BoxerOutputDrsParser(DrtParser):
#drs([[1001]:_G3943],
# [[1002]:pred(_G3943, dog, n, 0)]
# )
- label = self._label_counter.get()
self.assertToken(self.token(), '(')
self.assertToken(self.token(), '[')
refs = set()
@@ -524,7 +561,7 @@ class BoxerOutputDrsParser(DrtParser):
self.token() #swallow ','
self.token() #swallow ']'
self.assertToken(self.token(), ')')
- return BoxerDrs(label, list(refs), conds)
+ return BoxerDrs(list(refs), conds)
def _handle_binary_expression(self, make_callback):
self.assertToken(self.token(), '(')
@@ -583,18 +620,18 @@ class BoxerOutputDrsParser(DrtParser):
return lambda sent_index, word_indices: BoxerWhq(self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2)
def _make_merge_expression(self, sent_index, word_indices, drs1, drs2):
- return BoxerDrs(drs1.label, drs1.refs + drs2.refs, drs1.conds + drs2.conds)
+ return BoxerDrs(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
def _make_or_expression(self, sent_index, word_indices, drs1, drs2):
return BoxerOr(self.discourse_id, sent_index, word_indices, drs1, drs2)
def _make_imp_expression(self, sent_index, word_indices, drs1, drs2):
- return BoxerDrs(drs1.label, drs1.refs, drs1.conds, drs2)
+ return BoxerDrs(drs1.refs, drs1.conds, drs2)
def parse_variable(self):
var = self.token()
- assert re.match('^[ex]\d+$', var), var
- return int(var[1:])
+ assert re.match('^[exps]\d+$', var), var
+ return var
def parse_index(self):
return int(self.token())
@@ -631,16 +668,16 @@ class BoxerDrsParser(DrtParser):
def handle(self, tok, context):
try:
- if tok == 'drs':
- self.assertNextToken(DrtTokens.OPEN)
- label = int(self.token())
- self.assertNextToken(DrtTokens.COMMA)
- refs = list(map(int, self.handle_refs()))
- self.assertNextToken(DrtTokens.COMMA)
- conds = self.handle_conds(None)
- self.assertNextToken(DrtTokens.CLOSE)
- return BoxerDrs(label, refs, conds)
- elif tok == 'pred':
+# if tok == 'drs':
+# self.assertNextToken(DrtTokens.OPEN)
+# label = int(self.token())
+# self.assertNextToken(DrtTokens.COMMA)
+# refs = list(map(int, self.handle_refs()))
+# self.assertNextToken(DrtTokens.COMMA)
+# conds = self.handle_conds(None)
+# self.assertNextToken(DrtTokens.CLOSE)
+# return BoxerDrs(label, refs, conds)
+ if tok == 'pred':
self.assertNextToken(DrtTokens.OPEN)
disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
self.assertNextToken(DrtTokens.COMMA)
@@ -715,7 +752,7 @@ class BoxerDrsParser(DrtParser):
self.assertNextToken(DrtTokens.COMMA)
drs2 = self.process_next_expression(None)
self.assertNextToken(DrtTokens.CLOSE)
- return BoxerDrs(drs1.label, drs1.refs, drs1.conds, drs2)
+ return BoxerDrs(drs1.refs, drs1.conds, drs2)
elif tok == 'or':
self.assertNextToken(DrtTokens.OPEN)
disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
@@ -829,9 +866,8 @@ class AbstractBoxerDrs(object):
@python_2_unicode_compatible
class BoxerDrs(AbstractBoxerDrs):
- def __init__(self, label, refs, conds, consequent=None):
+ def __init__(self, refs, conds, consequent=None):
AbstractBoxerDrs.__init__(self)
- self.label = label
self.refs = refs
self.conds = conds
self.consequent = consequent
@@ -854,23 +890,21 @@ class BoxerDrs(AbstractBoxerDrs):
def clean(self):
consequent = (self.consequent.clean() if self.consequent else None)
- return BoxerDrs(self.label, self.refs, [c.clean() for c in self.conds], consequent)
+ return BoxerDrs(self.refs, [c.clean() for c in self.conds], consequent)
def renumber_sentences(self, f):
consequent = (self.consequent.renumber_sentences(f) if self.consequent else None)
- return BoxerDrs(self.label, self.refs, [c.renumber_sentences(f) for c in self.conds], consequent)
+ return BoxerDrs(self.refs, [c.renumber_sentences(f) for c in self.conds], consequent)
def __repr__(self):
- s = 'drs(%s, [%s], [%s])' % (self.label,
- ', '.join("%s" % r for r in self.refs),
- ', '.join("%s" % c for c in self.conds))
+ s = 'drs([%s], [%s])' % (', '.join("%s" % r for r in self.refs),
+ ', '.join("%s" % c for c in self.conds))
if self.consequent is not None:
s = 'imp(%s, %s)' % (s, self.consequent)
return s
def __eq__(self, other):
return self.__class__ == other.__class__ and \
- self.label == other.label and \
self.refs == other.refs and \
len(self.conds) == len(other.conds) and \
reduce(operator.and_, (c1==c2 for c1,c2 in zip(self.conds, other.conds))) and \
@@ -1151,9 +1185,7 @@ class NltkDrtBoxerDrsInterpreter(object):
:return: ``DrtExpression``
"""
if isinstance(ex, BoxerDrs):
- drs = DRS([Variable('x%d' % r) for r in ex.refs], list(map(self.interpret, ex.conds)))
- if ex.label is not None:
- drs.label = Variable('x%d' % ex.label)
+ drs = DRS([Variable(r) for r in ex.refs], list(map(self.interpret, ex.conds)))
if ex.consequent is not None:
drs.consequent = self.interpret(ex.consequent)
return drs
@@ -1161,21 +1193,21 @@ class NltkDrtBoxerDrsInterpreter(object):
return DrtNegatedExpression(self.interpret(ex.drs))
elif isinstance(ex, BoxerPred):
pred = self._add_occur_indexing('%s_%s' % (ex.pos, ex.name), ex)
- return self._make_atom(pred, 'x%d' % ex.var)
+ return self._make_atom(pred, ex.var)
elif isinstance(ex, BoxerNamed):
pred = self._add_occur_indexing('ne_%s_%s' % (ex.type, ex.name), ex)
- return self._make_atom(pred, 'x%d' % ex.var)
+ return self._make_atom(pred, ex.var)
elif isinstance(ex, BoxerRel):
pred = self._add_occur_indexing('%s' % (ex.rel), ex)
- return self._make_atom(pred, 'x%d' % ex.var1, 'x%d' % ex.var2)
+ return self._make_atom(pred, ex.var1, ex.var2)
elif isinstance(ex, BoxerProp):
- return DrtProposition(Variable('x%d' % ex.var), self.interpret(ex.drs))
+ return DrtProposition(Variable(ex.var), self.interpret(ex.drs))
elif isinstance(ex, BoxerEq):
- return DrtEqualityExpression(DrtVariableExpression(Variable('x%d' % ex.var1)),
- DrtVariableExpression(Variable('x%d' % ex.var2)))
+ return DrtEqualityExpression(DrtVariableExpression(Variable(ex.var1)),
+ DrtVariableExpression(Variable(ex.var2)))
elif isinstance(ex, BoxerCard):
pred = self._add_occur_indexing('card_%s_%s' % (ex.type, ex.value), ex)
- return self._make_atom(pred, 'x%d' % ex.var)
+ return self._make_atom(pred, ex.var)
elif isinstance(ex, BoxerOr):
return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2))
elif isinstance(ex, BoxerWhq):
@@ -1223,4 +1255,4 @@ if __name__ == '__main__':
if options.fol:
print(drs.fol().normalize())
else:
- drs.normalize().pprint()
+ drs.pretty_print()
diff --git a/nltk/sem/drt.py b/nltk/sem/drt.py
index 3e157de..f433c37 100644
--- a/nltk/sem/drt.py
+++ b/nltk/sem/drt.py
@@ -1217,12 +1217,12 @@ def demo():
print(resolve_anaphora(dexpr(r'([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])')))
print(resolve_anaphora(dexpr(r'(([x,y],[]) + ([],[PRO(x)]))')))
- print('='*20 + 'Test pprint()' + '='*20)
- dexpr(r"([],[])").pprint()
- dexpr(r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])").pprint()
- dexpr(r"([x,y],[x=y]) + ([z],[dog(z), walk(z)])").pprint()
- dexpr(r"([],[([x],[]) | ([y],[]) | ([z],[dog(z), walk(z)])])").pprint()
- dexpr(r"\P.\Q.(([x],[]) + P(x) + Q(x))(\x.([],[dog(x)]))").pprint()
+ print('='*20 + 'Test pretty_print()' + '='*20)
+ dexpr(r"([],[])").pretty_print()
+ dexpr(r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])").pretty_print()
+ dexpr(r"([x,y],[x=y]) + ([z],[dog(z), walk(z)])").pretty_print()
+ dexpr(r"([],[([x],[]) | ([y],[]) | ([z],[dog(z), walk(z)])])").pretty_print()
+ dexpr(r"\P.\Q.(([x],[]) + P(x) + Q(x))(\x.([],[dog(x)]))").pretty_print()
def test_draw():
diff --git a/nltk/stem/snowball.py b/nltk/stem/snowball.py
index 3fea2b3..1f4b751 100644
--- a/nltk/stem/snowball.py
+++ b/nltk/stem/snowball.py
@@ -2533,7 +2533,7 @@ class PortugueseStemmer(_StandardStemmer):
word = suffix_replace(word, suffix, "log")
rv = suffix_replace(rv, suffix, "log")
- elif suffix in ("ução", "uções"):
+ elif suffix in ("uça~o", "uço~es"):
word = suffix_replace(word, suffix, "u")
rv = suffix_replace(rv, suffix, "u")
diff --git a/nltk/tag/__init__.py b/nltk/tag/__init__.py
index c9ce8d5..1dce867 100644
--- a/nltk/tag/__init__.py
+++ b/nltk/tag/__init__.py
@@ -71,7 +71,7 @@ from nltk.tag.brill import BrillTagger
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.tag.tnt import TnT
from nltk.tag.hunpos import HunposTagger
-from nltk.tag.stanford import StanfordTagger
+from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger
from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger
from nltk.tag.mapping import tagset_mapping, map_tag
@@ -83,7 +83,7 @@ from nltk.data import load
# Standard treebank POS tagger
_POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
-def pos_tag(tokens):
+def pos_tag(tokens, tagset=None):
"""
Use NLTK's currently recommended part of speech tagger to
tag the given list of tokens.
@@ -101,6 +101,8 @@ def pos_tag(tokens):
:rtype: list(tuple(str, str))
"""
tagger = load(_POS_TAGGER)
+ if tagset:
+ return [(token, map_tag('en-ptb', tagset, tag)) for (token, tag) in tagger.tag(tokens)]
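+ # Illustrative sketch (hypothetical input), assuming the 'universal_tagset'
+ # mapping data is installed:
+ #   pos_tag(['The', 'dog', 'barked'], tagset='universal')
+ #   -> e.g. [('The', 'DET'), ('dog', 'NOUN'), ('barked', 'VERB')]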
return tagger.tag(tokens)
def pos_tag_sents(sentences):
diff --git a/nltk/tag/crf.py b/nltk/tag/crf.py
index 3842d87..0825944 100644
--- a/nltk/tag/crf.py
+++ b/nltk/tag/crf.py
@@ -24,7 +24,7 @@ class CRFTagger(TaggerI):
"""
A module for POS tagging using CRFSuite https://pypi.python.org/pypi/python-crfsuite
- >>> from nltk.tag.crf import CRFTagger
+ >>> from nltk.tag import CRFTagger
>>> ct = CRFTagger()
>>> train_data = [[('University','Noun'), ('is','Verb'), ('a','Det'), ('good','Adj'), ('place','Noun')],
diff --git a/nltk/tag/hunpos.py b/nltk/tag/hunpos.py
index 82e0560..f3ba445 100644
--- a/nltk/tag/hunpos.py
+++ b/nltk/tag/hunpos.py
@@ -33,7 +33,7 @@ class HunposTagger(TaggerI):
Example:
- >>> from nltk.tag.hunpos import HunposTagger
+ >>> from nltk.tag import HunposTagger
>>> ht = HunposTagger('en_wsj.model')
>>> ht.tag('What is the airspeed of an unladen swallow ?'.split())
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')]
diff --git a/nltk/tag/stanford.py b/nltk/tag/stanford.py
index 3ce7575..c88aff9 100644
--- a/nltk/tag/stanford.py
+++ b/nltk/tag/stanford.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-# Natural Language Toolkit: Interface to the Stanford NER-tagger
+# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Nitin Madnani <nmadnani at ets.org>
@@ -9,6 +9,12 @@
"""
A module for interfacing with the Stanford taggers.
+
+Tagger models need to be downloaded from http://nlp.stanford.edu/software
+and the STANFORD_MODELS environment variable set (a colon-separated
+list of paths).
+
+For more details see the documentation for StanfordPOSTagger and StanfordNERTagger.
"""
import os
@@ -36,17 +42,17 @@ class StanfordTagger(TaggerI):
_SEPARATOR = ''
_JAR = ''
- def __init__(self, path_to_model, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
+ def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
if not self._JAR:
warnings.warn('The StanfordTagger class is not meant to be '
- 'instantiated directly. Did you mean POS- or NERTagger?')
+ 'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')
self._stanford_jar = find_jar(
self._JAR, path_to_jar,
searchpath=(), url=_stanford_url,
verbose=verbose)
- self._stanford_model = find_file(path_to_model,
+ self._stanford_model = find_file(model_filename,
env_vars=('STANFORD_MODELS',), verbose=verbose)
self._encoding = encoding
self.java_options = java_options
@@ -56,7 +62,8 @@ class StanfordTagger(TaggerI):
raise NotImplementedError
def tag(self, tokens):
- return list(self.tag_sents([tokens]))
+ # This function should return a list of tuples rather than a list of lists
+ return sum(self.tag_sents([tokens]), [])
def tag_sents(self, sentences):
encoding = self._encoding
@@ -80,16 +87,16 @@ class StanfordTagger(TaggerI):
stanpos_output, _stderr = java(self._cmd,classpath=self._stanford_jar,
stdout=PIPE, stderr=PIPE)
stanpos_output = stanpos_output.decode(encoding)
-
+
# Delete the temporary file
os.unlink(self._input_file_path)
# Return java configurations to their default values
config_java(options=default_options, verbose=False)
- return self.parse_output(stanpos_output)
+ return self.parse_output(stanpos_output, sentences)
- def parse_output(self, text):
+ def parse_output(self, text, sentences = None):
# Output the tagged sentences
tagged_sentences = []
for tagged_sentence in text.strip().split("\n"):
@@ -100,7 +107,7 @@ class StanfordTagger(TaggerI):
tagged_sentences.append(sentence)
return tagged_sentences
-class POSTagger(StanfordTagger):
+class StanfordPOSTagger(StanfordTagger):
"""
A class for pos tagging with Stanford Tagger. The input is the paths to:
- a model trained on training data
@@ -110,9 +117,8 @@ class POSTagger(StanfordTagger):
Example:
- >>> from nltk.tag.stanford import POSTagger
- >>> st = POSTagger('/usr/share/stanford-postagger/models/english-bidirectional-distsim.tagger',
- ... '/usr/share/stanford-postagger/stanford-postagger.jar') # doctest: +SKIP
+ >>> from nltk.tag import StanfordPOSTagger
+ >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') # doctest: +SKIP
>>> st.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
"""
@@ -121,7 +127,7 @@ class POSTagger(StanfordTagger):
_JAR = 'stanford-postagger.jar'
def __init__(self, *args, **kwargs):
- super(POSTagger, self).__init__(*args, **kwargs)
+ super(StanfordPOSTagger, self).__init__(*args, **kwargs)
@property
def _cmd(self):
@@ -129,9 +135,9 @@ class POSTagger(StanfordTagger):
'-model', self._stanford_model, '-textFile',
self._input_file_path, '-tokenize', 'false','-outputFormatOptions', 'keepEmptySentences']
-class NERTagger(StanfordTagger):
+class StanfordNERTagger(StanfordTagger):
"""
- A class for ner tagging with Stanford Tagger. The input is the paths to:
+ A class for Named-Entity Tagging with Stanford Tagger. The input is the paths to:
- a model trained on training data
- (optionally) the path to the stanford tagger jar file. If not specified here,
@@ -140,9 +146,8 @@ class NERTagger(StanfordTagger):
Example:
- >>> from nltk.tag.stanford import NERTagger
- >>> st = NERTagger('/usr/share/stanford-ner/classifiers/all.3class.distsim.crf.ser.gz',
- ... '/usr/share/stanford-ner/stanford-ner.jar') # doctest: +SKIP
+ >>> from nltk.tag import StanfordNERTagger
+ >>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP
>>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
[('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'),
('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'),
@@ -154,7 +159,7 @@ class NERTagger(StanfordTagger):
_FORMAT = 'slashTags'
def __init__(self, *args, **kwargs):
- super(NERTagger, self).__init__(*args, **kwargs)
+ super(StanfordNERTagger, self).__init__(*args, **kwargs)
@property
def _cmd(self):
@@ -163,10 +168,24 @@ class NERTagger(StanfordTagger):
'-loadClassifier', self._stanford_model, '-textFile',
self._input_file_path, '-outputFormat', self._FORMAT, '-tokenizerFactory', 'edu.stanford.nlp.process.WhitespaceTokenizer', '-tokenizerOptions','\"tokenizeNLs=false\"']
- def parse_output(self, text):
- if self._FORMAT == 'slashTags':
- return super(NERTagger, self).parse_output(text)
- raise NotImplementedError
+ def parse_output(self, text, sentences):
+ if self._FORMAT == 'slashTags':
+ # Join everything together into one flat list
+ tagged_sentences = []
+ for tagged_sentence in text.strip().split("\n"):
+ for tagged_word in tagged_sentence.strip().split():
+ word_tags = tagged_word.strip().split(self._SEPARATOR)
+ tagged_sentences.append((''.join(word_tags[:-1]), word_tags[-1]))
+
+            # Split it back up according to the input sentences
+ result = []
+ start = 0
+ for sent in sentences:
+ result.append(tagged_sentences[start:start + len(sent)])
+                start += len(sent)
+ return result
+
+ raise NotImplementedError
if __name__ == "__main__":
diff --git a/nltk/test/crubadan.doctest b/nltk/test/crubadan.doctest
new file mode 100644
index 0000000..c45fe91
--- /dev/null
+++ b/nltk/test/crubadan.doctest
@@ -0,0 +1,65 @@
+.. Copyright (C) 2001-2015 NLTK Project
+.. For license information, see LICENSE.TXT
+
+Crubadan Corpus Reader
+======================
+
+Crubadan is an NLTK corpus reader for ngram files provided
+by the Crubadan project. It supports several languages.
+
+ >>> from nltk.corpus import crubadan
+ >>> crubadan.langs() # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+ ['abk', 'abn',..., 'zpa', 'zul']
+
+----------------------------------------
+Language code mapping and helper methods
+----------------------------------------
+
+The web crawler that generates the 3-gram frequencies works at the
+level of "writing systems" rather than languages. Writing systems
+are assigned internal 2-3 letter codes that require mapping to the
+standard ISO 639-3 codes. For more information, please refer to
+the README in the nltk_data/crubadan folder after installing it.
+
+To translate ISO 639-3 codes to "Crubadan Code":
+
+ >>> crubadan.iso_to_crubadan('eng')
+ 'en'
+ >>> crubadan.iso_to_crubadan('fra')
+ 'fr'
+ >>> crubadan.iso_to_crubadan('aaa')
+
+Conversely, to get the ISO 639-3 code from a Crubadan code:
+
+ >>> crubadan.crubadan_to_iso('en')
+ 'eng'
+ >>> crubadan.crubadan_to_iso('fr')
+ 'fra'
+ >>> crubadan.crubadan_to_iso('aa')
+
+---------------------------
+Accessing ngram frequencies
+---------------------------
+
+On initialization the reader will create a dictionary of every
+language supported by the Crubadan project, mapping the ISO 639-3
+language code to its corresponding ngram frequency.
+
+You can access an individual language's FreqDist and the ngrams within it as follows:
+
+ >>> english_fd = crubadan.lang_freq('eng')
+ >>> english_fd['the']
+ 728135
+
+The above accesses the FreqDist for English and returns the frequency of the ngram 'the'.
+An ngram that isn't found within the language will return 0:
+
+ >>> english_fd['sometest']
+ 0
+
+A language that isn't supported will raise an exception:
+
+ >>> crubadan.lang_freq('elvish')
+ Traceback (most recent call last):
+ ...
+ RuntimeError: Unsupported language.
diff --git a/nltk/test/gensim.doctest b/nltk/test/gensim.doctest
new file mode 100644
index 0000000..4e7e176
--- /dev/null
+++ b/nltk/test/gensim.doctest
@@ -0,0 +1,144 @@
+.. Copyright (C) 2001-2015 NLTK Project
+.. For license information, see LICENSE.TXT
+
+=============================================================
+ Test the word embedding function through the Gensim package
+=============================================================
+
+ >>> import gensim
+
+Overview
+~~~~~~~~
+Using the Gensim package, we demonstrate three functions:
+- Training word embeddings on the Brown corpus.
+- Loading the pre-trained model and performing simple tasks with it.
+- Pruning the pre-trained binary model.
+
+Train the model
+~~~~~~~~~~~~~~~~~~
+The word embeddings are trained on the Brown corpus.
+
+ >>> from nltk.corpus import brown
+ >>> model = gensim.models.Word2Vec(brown.sents())
+
+It might take some time to train the model; once it is trained, you will probably want to save it and load it again later:
+ >>> model.save('brown.embedding')
+ >>> new_model = gensim.models.Word2Vec.load('brown.embedding')
+
+The model maps each word in its vocabulary to an embedding vector. We can easily get the vector representation of a word:
+ >>> len(new_model['university'])
+ 100
+
+Gensim already provides supporting functions for working with word embeddings.
+For example, to compute the cosine similarity between two words:
+ >>> new_model.similarity('university','school') > 0.3
+ True
+
+Using the pre-trained model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+NLTK also includes a pre-trained model which is part of a model trained on 100 billion words from the Google News dataset.
+The full model is available from https://code.google.com/p/word2vec/ and is about 3 GB.
+ >>> from nltk.data import find
+ >>> word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.bin'))
+ >>> model = gensim.models.Word2Vec.load(word2vec_sample)
+
+We pruned the model to only include the most common words (~44k words).
+ >>> len(model.vocab)
+ 43981
+
+Each word is represented by a vector of 300 dimensions.
+ >>> len(model['university'])
+ 300
+
+Finding the top n words most similar to a target word is simple. The result is a list of n words with their similarity scores.
+ >>> model.most_similar(positive=['university'], topn = 3)
+ [(u'universities', 0.7003918886184692), (u'faculty', 0.6780908703804016), (u'undergraduate', 0.6587098240852356)]
+
+Finding the word that does not belong in a list is also supported, although implementing this yourself would be simple.
+ >>> model.doesnt_match('breakfast cereal dinner lunch'.split())
+ 'cereal'
+
+Mikolov et al. (2013) showed that word embeddings capture many syntactic and semantic regularities. For example,
+the vector 'King - Man + Woman' is close to 'Queen', and 'Germany - Berlin + Paris' is close to the vector for 'France'.
+ >>> model.most_similar(positive=['woman','king'], negative=['man'], topn = 1)
+ [(u'queen', 0.7118192911148071)]
+
+ >>> model.most_similar(positive=['Paris','Germany'], negative=['Berlin'], topn = 1)
+ [(u'France', 0.7884092926979065)]
+
+We can visualize the word embeddings using t-SNE (http://lvdmaaten.github.io/tsne/). For this demo, we visualize only the first 1000 words;
+you can change this to a larger value.
+
+ import numpy as np
+ labels = []
+ count = 0
+ max_count = 1000
+ X = np.zeros(shape=(max_count,len(model['university'])))
+
+ for term in model.vocab:
+ X[count] = model[term]
+ labels.append(term)
+ count+= 1
+ if count >= max_count: break
+
+ # It is recommended to use PCA first to reduce to ~50 dimensions
+ from sklearn.decomposition import PCA
+ pca = PCA(n_components=50)
+ X_50 = pca.fit_transform(X)
+
+ # Using TSNE to further reduce to 2 dimensions
+ from sklearn.manifold import TSNE
+ model_tsne = TSNE(n_components=2, random_state=0)
+ Y = model_tsne.fit_transform(X_50)
+
+ # Show the scatter plot
+ import matplotlib.pyplot as plt
+ plt.scatter(Y[:,0], Y[:,1], 20)
+
+ # Add labels
+ for label, x, y in zip(labels, Y[:, 0], Y[:, 1]):
+ plt.annotate(label, xy = (x,y), xytext = (0, 0), textcoords = 'offset points', size = 10)
+
+ plt.show()
+
+
+Prune the trained binary model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Here is the supporting code to extract part of the binary model (GoogleNews-vectors-negative300.bin.gz) available from https://code.google.com/p/word2vec/.
+We used this code to build the `word2vec_sample` model.
+
+ import gensim
+ from gensim.models.word2vec import Word2Vec
+ # Load the binary model
+    model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
+
+    # Only output words that appear in the Brown corpus
+ from nltk.corpus import brown
+ words = set(brown.words())
+ print (len(words))
+
+    # Write the retained words to a temporary text file
+    out_file = 'pruned.word2vec.txt'
+    f = open(out_file, 'w')
+
+ word_presented = words.intersection(model.vocab.keys())
+ f.write('{} {}\n'.format(len(word_presented),len(model['word'])))
+
+ for word in word_presented:
+ f.write('{} {}\n'.format(word, ' '.join(str(value) for value in model[word])))
+
+ f.close()
+
+ # Reload the model from text file
+    new_model = Word2Vec.load_word2vec_format(out_file, binary=False)
+
+ # Save it as the Gensim model
+ gensim_model = "pruned.word2vec.bin"
+ new_model.save(gensim_model)
+
+ # Load the model
+ very_new_model = gensim.models.Word2Vec.load(gensim_model)
+
+ # Test it
+ very_new_model.most_similar(positive=['king','woman'], negative=['man'], topn=1)
+
\ No newline at end of file
diff --git a/nltk/test/unit/test_tgrep.py b/nltk/test/unit/test_tgrep.py
new file mode 100644
index 0000000..224fb47
--- /dev/null
+++ b/nltk/test/unit/test_tgrep.py
@@ -0,0 +1,626 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Natural Language Toolkit: TGrep search
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Will Roberts <wildwilhelm at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+'''
+Unit tests for nltk.tgrep.
+'''
+
+from __future__ import absolute_import, print_function, unicode_literals
+from nltk.compat import b
+from nltk.tree import ParentedTree
+from nltk import tgrep
+import unittest
+
+class TestSequenceFunctions(unittest.TestCase):
+
+ '''
+ Class containing unit tests for nltk.tgrep.
+ '''
+
+ def test_tokenize_simple(self):
+ '''
+ Simple test of tokenization.
+ '''
+ tokens = tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]')
+ self.assertEqual(tokens,
+ ['A', '..', '(', 'B', '!', '<', 'C', '.', 'D', ')',
+ '|', '!', '[', '<<', '(', 'E', ',', 'F', ')', '$',
+ 'G', ']'])
+
+ def test_tokenize_encoding(self):
+ '''
+ Test that tokenization handles bytes and strs the same way.
+ '''
+ self.assertEqual(
+ tgrep.tgrep_tokenize(b('A .. (B !< C . D) | ![<< (E , F) $ G]')),
+ tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]'))
+
+ def test_tokenize_link_types(self):
+ '''
+ Test tokenization of basic link types.
+ '''
+ self.assertEqual(tgrep.tgrep_tokenize('A<B'), ['A', '<', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A>B'), ['A', '>', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A<3B'), ['A', '<3', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A>3B'), ['A', '>3', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A<,B'), ['A', '<,', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A>,B'), ['A', '>,', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A<-3B'), ['A', '<-3', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A>-3B'), ['A', '>-3', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A<-B'), ['A', '<-', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A>-B'), ['A', '>-', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A<\'B'), ['A', '<\'', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A>\'B'), ['A', '>\'', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A<:B'), ['A', '<:', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A>:B'), ['A', '>:', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A<<B'), ['A', '<<', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A>>B'), ['A', '>>', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A<<,B'), ['A', '<<,', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A>>,B'), ['A', '>>,', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A<<\'B'), ['A', '<<\'', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A>>\'B'), ['A', '>>\'', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A<<:B'), ['A', '<<:', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A>>:B'), ['A', '>>:', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A.B'), ['A', '.', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A,B'), ['A', ',', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A..B'), ['A', '..', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A,,B'), ['A', ',,', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A$B'), ['A', '$', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A$.B'), ['A', '$.', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A$,B'), ['A', '$,', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A$..B'), ['A', '$..', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A$,,B'), ['A', '$,,', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!<B'), ['A', '!', '<', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!>B'), ['A', '!', '>', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!<3B'), ['A', '!', '<3', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!>3B'), ['A', '!', '>3', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!<,B'), ['A', '!', '<,', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!>,B'), ['A', '!', '>,', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!<-3B'),
+ ['A', '!', '<-3', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!>-3B'),
+ ['A', '!', '>-3', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!<-B'), ['A', '!', '<-', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!>-B'), ['A', '!', '>-', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!<\'B'),
+ ['A', '!', '<\'', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!>\'B'),
+ ['A', '!', '>\'', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!<:B'), ['A', '!', '<:', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!>:B'), ['A', '!', '>:', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!<<B'), ['A', '!', '<<', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!>>B'), ['A', '!', '>>', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!<<,B'),
+ ['A', '!', '<<,', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!>>,B'),
+ ['A', '!', '>>,', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!<<\'B'),
+ ['A', '!', '<<\'', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!>>\'B'),
+ ['A', '!', '>>\'', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!<<:B'),
+ ['A', '!', '<<:', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!>>:B'),
+ ['A', '!', '>>:', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!.B'), ['A', '!', '.', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!,B'), ['A', '!', ',', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!..B'), ['A', '!', '..', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!,,B'), ['A', '!', ',,', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!$B'), ['A', '!', '$', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!$.B'), ['A', '!', '$.', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!$,B'), ['A', '!', '$,', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!$..B'),
+ ['A', '!', '$..', 'B'])
+ self.assertEqual(tgrep.tgrep_tokenize('A!$,,B'),
+ ['A', '!', '$,,', 'B'])
+
+ def test_tokenize_examples(self):
+ '''
+ Test tokenization of the TGrep2 manual example patterns.
+ '''
+ self.assertEqual(tgrep.tgrep_tokenize('NP < PP'),
+ ['NP', '<', 'PP'])
+ self.assertEqual(tgrep.tgrep_tokenize('/^NP/'),
+ ['/^NP/'])
+ self.assertEqual(tgrep.tgrep_tokenize('NP << PP . VP'),
+ ['NP', '<<', 'PP', '.', 'VP'])
+ self.assertEqual(tgrep.tgrep_tokenize('NP << PP | . VP'),
+ ['NP', '<<', 'PP', '|', '.', 'VP'])
+ self.assertEqual(tgrep.tgrep_tokenize('NP !<< PP [> NP | >> VP]'),
+ ['NP', '!', '<<', 'PP', '[', '>', 'NP', '|',
+ '>>', 'VP', ']'])
+ self.assertEqual(tgrep.tgrep_tokenize('NP << (PP . VP)'),
+ ['NP', '<<', '(', 'PP', '.', 'VP', ')'])
+ self.assertEqual(tgrep.tgrep_tokenize('NP <\' (PP <, (IN < on))'),
+ ['NP', '<\'', '(', 'PP', '<,', '(', 'IN', '<',
+ 'on', ')', ')'])
+ self.assertEqual(tgrep.tgrep_tokenize('S < (A < B) < C'),
+ ['S', '<', '(', 'A', '<', 'B', ')', '<', 'C'])
+ self.assertEqual(tgrep.tgrep_tokenize('S < ((A < B) < C)'),
+ ['S', '<', '(', '(', 'A', '<', 'B', ')',
+ '<', 'C', ')'])
+ self.assertEqual(tgrep.tgrep_tokenize('S < (A < B < C)'),
+ ['S', '<', '(', 'A', '<', 'B', '<', 'C', ')'])
+ self.assertEqual(tgrep.tgrep_tokenize('A<B&.C'),
+ ['A', '<', 'B', '&', '.', 'C'])
+
+ def test_tokenize_quoting(self):
+ '''
+ Test tokenization of quoting.
+ '''
+ self.assertEqual(tgrep.tgrep_tokenize('"A<<:B"<<:"A $.. B"<"A>3B"<C'),
+ ['"A<<:B"', '<<:', '"A $.. B"', '<', '"A>3B"',
+ '<', 'C'])
+
+ def test_tokenize_nodenames(self):
+ '''
+ Test tokenization of node names.
+ '''
+ self.assertEqual(tgrep.tgrep_tokenize('Robert'), ['Robert'])
+ self.assertEqual(tgrep.tgrep_tokenize('/^[Bb]ob/'), ['/^[Bb]ob/'])
+ self.assertEqual(tgrep.tgrep_tokenize('*'), ['*'])
+ self.assertEqual(tgrep.tgrep_tokenize('__'), ['__'])
+ # test tokenization of NLTK tree position syntax
+ self.assertEqual(tgrep.tgrep_tokenize('N()'),
+ ['N(', ')'])
+ self.assertEqual(tgrep.tgrep_tokenize('N(0,)'),
+ ['N(', '0', ',', ')'])
+ self.assertEqual(tgrep.tgrep_tokenize('N(0,0)'),
+ ['N(', '0', ',', '0', ')'])
+ self.assertEqual(tgrep.tgrep_tokenize('N(0,0,)'),
+ ['N(', '0', ',', '0', ',', ')'])
+
+ def test_tokenize_macros(self):
+ '''
+ Test tokenization of macro definitions.
+ '''
+ self.assertEqual(tgrep.tgrep_tokenize(
+ '@ NP /^NP/;\n@ NN /^NN/;\n at NP [!< NP | < @NN] !$.. @NN'),
+ ['@', 'NP', '/^NP/', ';', '@', 'NN', '/^NN/', ';',
+ '@NP', '[', '!', '<', 'NP', '|', '<', '@NN', ']',
+ '!', '$..', '@NN'])
+
+ def test_node_simple(self):
+ '''
+ Test a simple use of tgrep for finding nodes matching a given
+ pattern.
+ '''
+ tree = ParentedTree.fromstring(
+ '(S (NP (DT the) (JJ big) (NN dog)) '
+ '(VP bit) (NP (DT a) (NN cat)))')
+ self.assertEqual(list(tgrep.tgrep_positions('NN', [tree])),
+ [[(0,2), (2,1)]])
+ self.assertEqual(list(tgrep.tgrep_nodes('NN', [tree])),
+ [[tree[0,2], tree[2,1]]])
+ self.assertEqual(list(tgrep.tgrep_positions('NN|JJ', [tree])),
+ [[(0, 1), (0, 2), (2, 1)]])
+
+ def test_node_printing(self):
+ '''Test that the tgrep print operator ' is properly ignored.'''
+ tree = ParentedTree.fromstring('(S (n x) (N x))')
+ self.assertEqual(list(tgrep.tgrep_positions('N', [tree])),
+ list(tgrep.tgrep_positions('\'N', [tree])))
+ self.assertEqual(list(tgrep.tgrep_positions('/[Nn]/', [tree])),
+ list(tgrep.tgrep_positions('\'/[Nn]/', [tree])))
+
+ def test_node_encoding(self):
+ '''
+        Test that tgrep search strings handle bytes and strs the same
+ way.
+ '''
+ tree = ParentedTree.fromstring(
+ '(S (NP (DT the) (JJ big) (NN dog)) '
+ '(VP bit) (NP (DT a) (NN cat)))')
+ self.assertEqual(list(tgrep.tgrep_positions(b('NN'), [tree])),
+ list(tgrep.tgrep_positions('NN', [tree])))
+ self.assertEqual(list(tgrep.tgrep_nodes(b('NN'), [tree])),
+ list(tgrep.tgrep_nodes('NN', [tree])))
+ self.assertEqual(list(tgrep.tgrep_positions(b('NN|JJ'), [tree])),
+ list(tgrep.tgrep_positions('NN|JJ', [tree])))
+
+ def test_node_nocase(self):
+ '''
+ Test selecting nodes using case insensitive node names.
+ '''
+ tree = ParentedTree.fromstring('(S (n x) (N x))')
+ self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[(1,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('i@"N"', [tree])), [[(0,), (1,)]])
+
+ def test_node_quoted(self):
+ '''
+ Test selecting nodes using quoted node names.
+ '''
+ tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
+ self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[()]])
+ self.assertEqual(list(tgrep.tgrep_positions('"\\"N\\""', [tree])), [[(0,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('"N\\""', [tree])), [[(1,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('"\\"\\\\\\""', [tree])), [[(2,)]])
+
+ def test_node_regex(self):
+ '''
+ Test regex matching on nodes.
+ '''
+ tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
+ # This is a regular expression that matches any node whose
+ # name starts with NP, including NP-SBJ:
+ self.assertEqual(list(tgrep.tgrep_positions('/^NP/', [tree])),
+ [[(0,), (1,)]])
+
+ def test_node_regex_2(self):
+ '''
+ Test regex matching on nodes.
+ '''
+ tree = ParentedTree.fromstring('(S (SBJ x) (SBJ1 x) (NP-SBJ x))')
+ self.assertEqual(list(tgrep.tgrep_positions('/^SBJ/', [tree])),
+ [[(0,), (1,)]])
+ # This is a regular expression that matches any node whose
+ # name includes SBJ, including NP-SBJ:
+ self.assertEqual(list(tgrep.tgrep_positions('/SBJ/', [tree])),
+ [[(0,), (1,), (2,)]])
+
+ def test_node_tree_position(self):
+ '''
+ Test matching on nodes based on NLTK tree position.
+ '''
+ tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
+ # test all tree positions that are not leaves
+ leaf_positions = set([tree.leaf_treeposition(x)
+ for x in range(len(tree.leaves()))])
+ tree_positions = [x for x in tree.treepositions()
+ if x not in leaf_positions]
+ for position in tree_positions:
+ node_id = 'N{0}'.format(position)
+ tgrep_positions = list(tgrep.tgrep_positions(node_id, [tree]))
+ self.assertEqual(len(tgrep_positions[0]), 1)
+ self.assertEqual(tgrep_positions[0][0], position)
+
+ def test_node_noleaves(self):
+ '''
+ Test node name matching with the search_leaves flag set to False.
+ '''
+ tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
+ self.assertEqual(list(tgrep.tgrep_positions('x', [tree])),
+ [[(0, 0, 0), (1, 0, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('x', [tree], False)),
+ [[]])
+
+ def tests_rel_dominance(self):
+ '''
+ Test matching nodes based on dominance relations.
+ '''
+ tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
+ self.assertEqual(list(tgrep.tgrep_positions('* < T', [tree])),
+ [[(0,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* < T > S', [tree])),
+ [[(0,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* !< T', [tree])),
+ [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* !< T > S', [tree])),
+ [[(1,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* > A', [tree])),
+ [[(0, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* > B', [tree])),
+ [[(1, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* !> B', [tree])),
+ [[(), (0,), (0, 0), (0, 0, 0), (1,), (1, 0, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* !> B >> S', [tree])),
+ [[(0,), (0, 0), (1,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* >> S', [tree])),
+ [[(0,), (0, 0), (1,), (1, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* >>, S', [tree])),
+ [[(0,), (0, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* >>\' S', [tree])),
+ [[(1,), (1, 0)]])
+ # Known issue:
+ #self.assertEqual(list(tgrep.tgrep_positions('* !>> S', [tree])),
+ # [[()]])
+ self.assertEqual(list(tgrep.tgrep_positions('* << T', [tree])),
+ [[(), (0,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* <<\' T', [tree])),
+ [[(0,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* <<1 N', [tree])),
+ [[(1,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* !<< T', [tree])),
+ [[(0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]])
+ tree = ParentedTree.fromstring('(S (A (T x)) (B (T x) (N x )))')
+ self.assertEqual(list(tgrep.tgrep_positions('* <: T', [tree])),
+ [[(0,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* < T', [tree])),
+ [[(0,), (1,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* !<: T', [tree])),
+ [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0),
+ (1, 1), (1, 1, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* !<: T > S', [tree])),
+ [[(1,)]])
+ tree = ParentedTree.fromstring('(S (T (A x) (B x)) (T (C x)))')
+ self.assertEqual(list(tgrep.tgrep_positions('* >: T', [tree])),
+ [[(1, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* !>: T', [tree])),
+ [[(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0),
+ (1,), (1, 0, 0)]])
+ tree = ParentedTree.fromstring('(S (A (B (C (D (E (T x))))))'
+ ' (A (B (C (D (E (T x))) (N x)))))')
+ self.assertEqual(list(tgrep.tgrep_positions('* <<: T', [tree])),
+ [[(0,), (0, 0), (0, 0, 0), (0, 0, 0, 0),
+ (0, 0, 0, 0, 0), (1, 0, 0, 0), (1, 0, 0, 0, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* >>: A', [tree])),
+ [[(0, 0), (0, 0, 0), (0, 0, 0, 0), (0, 0, 0, 0, 0),
+ (0, 0, 0, 0, 0, 0), (1, 0), (1, 0, 0)]])
+
+ def test_bad_operator(self):
+ '''
+ Test error handling of undefined tgrep operators.
+ '''
+ tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
+ self.assertRaises(
+ tgrep.TgrepException,
+ list,
+ tgrep.tgrep_positions('* >>> S', [tree]))
+
+ def test_comments(self):
+ '''
+ Test that comments are correctly filtered out of tgrep search
+ strings.
+ '''
+ tree = ParentedTree.fromstring('(S (NN x) (NP x) (NN x))')
+ search1 = '''
+ @ NP /^NP/;
+ @ NN /^NN/;
+ @NN
+ '''
+ self.assertEqual(list(tgrep.tgrep_positions(search1, [tree])),
+ [[(0,), (2,)]])
+ search2 = '''
+ # macros
+ @ NP /^NP/;
+ @ NN /^NN/;
+
+ # search string
+ @NN
+ '''
+ self.assertEqual(list(tgrep.tgrep_positions(search2, [tree])),
+ [[(0,), (2,)]])
+
+ def test_rel_sister_nodes(self):
+ '''
+ Test matching sister nodes in a tree.
+ '''
+ tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
+ self.assertEqual(list(tgrep.tgrep_positions('* $. B', [tree])), [[(0,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* $.. B', [tree])), [[(0,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* $, B', [tree])), [[(2,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* $,, B', [tree])), [[(2,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* $ B', [tree])), [[(0,), (2,)]])
+
+ def tests_rel_indexed_children(self):
+ '''
+ Test matching nodes based on their index in their parent node.
+ '''
+ tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
+ self.assertEqual(list(tgrep.tgrep_positions('* >, S', [tree])), [[(0,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* >1 S', [tree])), [[(0,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* >2 S', [tree])), [[(1,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* >3 S', [tree])), [[(2,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* >\' S', [tree])), [[(2,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* >-1 S', [tree])), [[(2,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* >-2 S', [tree])), [[(1,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* >-3 S', [tree])), [[(0,)]])
+ tree = ParentedTree.fromstring(
+ '(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) '
+ '(F (C x) (A x) (B x)))')
+ self.assertEqual(list(tgrep.tgrep_positions('* <, A', [tree])), [[(0,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* <1 A', [tree])), [[(0,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* <2 A', [tree])), [[(2,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* <3 A', [tree])), [[(1,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* <\' A', [tree])), [[(1,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* <-1 A', [tree])), [[(1,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* <-2 A', [tree])), [[(2,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* <-3 A', [tree])), [[(0,)]])
+
+ def test_rel_precedence(self):
+ '''
+ Test matching nodes based on precedence relations.
+ '''
+ tree = ParentedTree.fromstring('(S (NP (NP (PP x)) (NP (AP x)))'
+ ' (VP (AP (X (PP x)) (Y (AP x))))'
+ ' (NP (RC (NP (AP x)))))')
+ self.assertEqual(list(tgrep.tgrep_positions('* . X', [tree])),
+ [[(0,), (0, 1), (0, 1, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* . Y', [tree])),
+ [[(1, 0, 0), (1, 0, 0, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* .. X', [tree])),
+ [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* .. Y', [tree])),
+ [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0),
+ (1, 0, 0), (1, 0, 0, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* , X', [tree])),
+ [[(1, 0, 1), (1, 0, 1, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* , Y', [tree])),
+ [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* ,, X', [tree])),
+ [[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0),
+ (2, 0, 0, 0)]])
+ self.assertEqual(list(tgrep.tgrep_positions('* ,, Y', [tree])),
+ [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]])
+
+ def test_examples(self):
+ '''
+ Test the Basic Examples from the TGrep2 manual.
+ '''
+ tree = ParentedTree.fromstring('(S (NP (AP x)) (NP (PP x)))')
+ # This matches any NP node that immediately dominates a PP:
+ self.assertEqual(list(tgrep.tgrep_positions('NP < PP', [tree])),
+ [[(1,)]])
+
+ tree = ParentedTree.fromstring('(S (NP x) (VP x) (NP (PP x)) (VP x))')
+ # This matches an NP that dominates a PP and is immediately
+ # followed by a VP:
+ self.assertEqual(list(tgrep.tgrep_positions('NP << PP . VP', [tree])),
+ [[(2,)]])
+
+ tree = ParentedTree.fromstring('(S (NP (AP x)) (NP (PP x)) '
+ '(NP (DET x) (NN x)) (VP x))')
+ # This matches an NP that dominates a PP or is immediately
+ # followed by a VP:
+ self.assertEqual(list(tgrep.tgrep_positions('NP << PP | . VP', [tree])),
+ [[(1,), (2,)]])
+
+ tree = ParentedTree.fromstring('(S (NP (NP (PP x)) (NP (AP x)))'
+ ' (VP (AP (NP (PP x)) (NP (AP x))))'
+ ' (NP (RC (NP (AP x)))))')
+ # This matches an NP that does not dominate a PP. Also, the NP
+ # must either have a parent that is an NP or be dominated by a
+ # VP:
+ self.assertEqual(list(tgrep.tgrep_positions(
+ 'NP !<< PP [> NP | >> VP]', [tree])),
+ [[(0, 1), (1, 0, 1)]])
+
+ tree = ParentedTree.fromstring('(S (NP (AP (PP x) (VP x))) '
+ '(NP (AP (PP x) (NP x))) (NP x))')
+ # This matches an NP that dominates a PP which itself is
+ # immediately followed by a VP. Note the use of parentheses to
+ # group ". VP" with the PP rather than with the NP:
+ self.assertEqual(list(tgrep.tgrep_positions('NP << (PP . VP)', [tree])),
+ [[(0,)]])
+
+ tree = ParentedTree.fromstring(
+ '(S (NP (DET a) (NN cat) (PP (IN on) (NP x)))'
+ ' (NP (DET a) (NN cat) (PP (IN on) (NP x)) (PP x))'
+ ' (NP x))')
+ # This matches an NP whose last child is a PP that begins with
+ # the preposition "on":
+ self.assertEqual(list(tgrep.tgrep_positions(
+ 'NP <\' (PP <, (IN < on))', [tree])),
+ [[(0,)]])
+
+ tree = ParentedTree.fromstring(
+ '(S (S (C x) (A (B x))) (S (C x) (A x)) '
+ '(S (D x) (A (B x))))')
+ # The following pattern matches an S which has a child A and
+ # another child that is a C and that the A has a child B:
+ self.assertEqual(list(tgrep.tgrep_positions('S < (A < B) < C', [tree])),
+ [[(0,)]])
+
+ tree = ParentedTree.fromstring(
+ '(S (S (A (B x) (C x))) (S (S (C x) (A (B x)))))')
+ # However, this pattern means that S has child A and that A
+ # has children B and C:
+ self.assertEqual(list(tgrep.tgrep_positions('S < ((A < B) < C)', [tree])),
+ [[(0,)]])
+
+ # It is equivalent to this:
+ self.assertEqual(list(tgrep.tgrep_positions('S < (A < B < C)', [tree])),
+ [[(0,)]])
+
+ def test_use_macros(self):
+ '''
+ Test defining and using tgrep2 macros.
+ '''
+ tree = ParentedTree.fromstring(
+ '(VP (VB sold) (NP (DET the) '
+ '(NN heiress)) (NP (NN deed) (PREP to) '
+ '(NP (DET the) (NN school) (NN house))))')
+ self.assertEqual(list(tgrep.tgrep_positions(
+ '@ NP /^NP/;\n@ NN /^NN/;\n at NP !< @NP !$.. @NN',
+ [tree])),
+ [[(1,), (2, 2)]])
+ # use undefined macro @CNP
+ self.assertRaises(
+ tgrep.TgrepException,
+ list,
+ tgrep.tgrep_positions(
+ '@ NP /^NP/;\n@ NN /^NN/;\n at CNP !< @NP !$.. @NN', [tree]))
+
+ def test_tokenize_node_labels(self):
+ '''Test tokenization of labeled nodes.'''
+ self.assertEqual(tgrep.tgrep_tokenize(
+ 'S < @SBJ < (@VP < (@VB $.. @OBJ))'),
+ ['S', '<', '@SBJ', '<', '(', '@VP', '<', '(',
+ '@VB', '$..', '@OBJ', ')', ')'])
+ self.assertEqual(tgrep.tgrep_tokenize(
+ 'S < @SBJ=s < (@VP=v < (@VB $.. @OBJ))'),
+ ['S', '<', '@SBJ', '=', 's', '<', '(', '@VP',
+ '=', 'v', '<', '(', '@VB', '$..', '@OBJ', ')',
+ ')'])
+
+ def test_tokenize_segmented_patterns(self):
+ '''Test tokenization of segmented patterns.'''
+ self.assertEqual(tgrep.tgrep_tokenize(
+ 'S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'),
+ ['S', '<', '@SBJ', '=', 's', '<', '(', '@VP',
+ '=', 'v', '<', '(', '@VB', '$..', '@OBJ', ')',
+ ')', ':', '=s', '..', '=v'])
+
+ def test_labeled_nodes(self):
+ '''
+ Test labeled nodes.
+
+ Test case from Emily M. Bender.
+ '''
+ search = '''
+ # macros
+ @ SBJ /SBJ/;
+ @ VP /VP/;
+ @ VB /VB/;
+ @ VPoB /V[PB]/;
+ @ OBJ /OBJ/;
+
+ # 1 svo
+ S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'''
+ sent1 = ParentedTree.fromstring(
+ '(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))')
+ sent2 = ParentedTree.fromstring(
+ '(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))')
+ search_firsthalf = (search.split('\n\n')[0] +
+ 'S < @SBJ < (@VP < (@VB $.. @OBJ))')
+ search_rewrite = 'S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))'
+
+ self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0])
+ self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0])
+ self.assertTrue(list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0])
+ self.assertEqual(list(tgrep.tgrep_positions(search, [sent1])),
+ list(tgrep.tgrep_positions(search_rewrite, [sent1])))
+ self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0])
+ self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0])
+ self.assertFalse(list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0])
+ self.assertEqual(list(tgrep.tgrep_positions(search, [sent2])),
+ list(tgrep.tgrep_positions(search_rewrite, [sent2])))
+
+ def test_multiple_conjs(self):
+ '''
+ Test that multiple (3 or more) conjunctions of node relations are
+ handled properly.
+ '''
+ sent = ParentedTree.fromstring(
+ '((A (B b) (C c)) (A (B b) (C c) (D d)))')
+ # search = '(A < B < C < D)'
+ # search_tworels = '(A < B < C)'
+ self.assertEqual(list(tgrep.tgrep_positions('(A < B < C < D)', [sent])),
+ [[(1,)]])
+ self.assertEqual(list(tgrep.tgrep_positions('(A < B < C)', [sent])),
+ [[(0,), (1,)]])
+
+ def test_trailing_semicolon(self):
+ '''
+ Test that semicolons at the end of a tgrep2 search string won't
+ cause a parse failure.
+ '''
+ tree = ParentedTree.fromstring(
+ '(S (NP (DT the) (JJ big) (NN dog)) '
+ '(VP bit) (NP (DT a) (NN cat)))')
+ self.assertEqual(list(tgrep.tgrep_positions('NN', [tree])),
+ [[(0,2), (2,1)]])
+ self.assertEqual(list(tgrep.tgrep_positions('NN;', [tree])),
+ [[(0,2), (2,1)]])
+ self.assertEqual(list(tgrep.tgrep_positions('NN;;', [tree])),
+ [[(0,2), (2,1)]])
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/nltk/tgrep.py b/nltk/tgrep.py
new file mode 100644
index 0000000..fd4dfa2
--- /dev/null
+++ b/nltk/tgrep.py
@@ -0,0 +1,941 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Natural Language Toolkit: TGrep search
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Will Roberts <wildwilhelm at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+'''
+============================================
+ TGrep search implementation for NLTK trees
+============================================
+
+This module supports TGrep2 syntax for matching parts of NLTK Trees.
+Note that many tgrep operators require the tree passed to be a
+``ParentedTree``.
+
+External links:
+
+- `Tgrep tutorial <http://www.stanford.edu/dept/linguistics/corpora/cas-tut-tgrep.html>`_
+- `Tgrep2 manual <http://tedlab.mit.edu/~dr/Tgrep2/tgrep2.pdf>`_
+- `Tgrep2 source <http://tedlab.mit.edu/~dr/Tgrep2/>`_
+
+Usage
+=====
+
+>>> from nltk.tree import ParentedTree
+>>> from nltk.tgrep import tgrep_nodes, tgrep_positions
+>>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))')
+>>> list(tgrep_nodes('NN', [tree]))
+[[ParentedTree('NN', ['dog']), ParentedTree('NN', ['cat'])]]
+>>> list(tgrep_positions('NN', [tree]))
+[[(0, 2), (2, 1)]]
+>>> list(tgrep_nodes('DT', [tree]))
+[[ParentedTree('DT', ['the']), ParentedTree('DT', ['a'])]]
+>>> list(tgrep_nodes('DT $ JJ', [tree]))
+[[ParentedTree('DT', ['the'])]]
+
+This implementation adds syntax to select nodes based on their NLTK
+tree position. This syntax is ``N`` plus a Python tuple representing
+the tree position. For instance, ``N()``, ``N(0,)``, ``N(0,0)`` are
+valid node selectors. Example:
+
+>>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))')
+>>> tree[0,0]
+ParentedTree('DT', ['the'])
+>>> tree[0,0].treeposition()
+(0, 0)
+>>> list(tgrep_nodes('N(0,0)', [tree]))
+[[ParentedTree('DT', ['the'])]]
+
+Caveats:
+========
+
+- Link modifiers: "?" and "=" are not implemented.
+- Tgrep compatibility: Using "@" for "!", "{" for "<", "}" for ">" are
+ not implemented.
+- The "=" and "~" links are not implemented.
+
+Known Issues:
+=============
+
+- There are some issues with link relations involving leaf nodes
+ (which are represented as bare strings in NLTK trees). For
+ instance, consider the tree::
+
+ (S (A x))
+
+ The search string ``* !>> S`` should select all nodes which are not
+ dominated in some way by an ``S`` node (i.e., all nodes which are
+ not descendants of an ``S``). Clearly, in this tree, the only node
+ which fulfills this criterion is the top node (since it is not
+ dominated by anything). However, the code here will find both the
+ top node and the leaf node ``x``. This is because we cannot recover
+ the parent of the leaf, since it is stored as a bare string.
+
+ A possible workaround, when performing this kind of search, would be
+ to filter out all leaf nodes.
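+
+  For instance (a minimal sketch, assuming the imports from the Usage
+  section above and ``tree = ParentedTree.fromstring('(S (A x))')``),
+  the leaf positions could be dropped from the matches like this::
+
+    positions = next(tgrep_positions('* !>> S', [tree]))
+    non_leaves = [p for p in positions if isinstance(tree[p], ParentedTree)]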
+
+Implementation notes
+====================
+
+This implementation is (somewhat awkwardly) based on lambda functions
+which are predicates on a node. A predicate is a function which is
+either True or False; using a predicate function, we can identify sets
+of nodes with particular properties. A predicate function, could, for
+instance, return True only if a particular node has a label matching a
+particular regular expression, and has a daughter node which has no
+sisters. Because tgrep2 search strings can do things statefully (such
+as substituting in macros, and binding nodes with node labels), the
+actual predicate function is declared with three arguments::
+
+    pred = lambda n, m, l: True  # some logic here
+
+``n``
+ is a node in a tree; this argument must always be given
+
+``m``
+ contains a dictionary, mapping macro names onto predicate functions
+
+``l``
+ is a dictionary to map node labels onto nodes in the tree
+
+``m`` and ``l`` are declared to default to ``None``, and so need not be
+specified in a call to a predicate. Predicates which call other
+predicates must always pass the value of these arguments on. The
+top-level predicate (constructed by ``_tgrep_exprs_action``) binds the
+macro definitions to ``m`` and initialises ``l`` to an empty dictionary.
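+
+As an illustration only (a minimal hand-written sketch, not code produced by
+the parser), a predicate of this shape that matches any ``NP`` node with at
+least one ``NN`` child could look like::
+
+    import nltk.tree
+    is_nn = lambda n, m=None, l=None: (isinstance(n, nltk.tree.Tree) and
+                                       n.label() == 'NN')
+    np_with_nn = lambda n, m=None, l=None: (isinstance(n, nltk.tree.Tree) and
+                                            n.label() == 'NP' and
+                                            any(is_nn(x, m, l) for x in n))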
+'''
+
+from __future__ import absolute_import, print_function, unicode_literals
+from nltk.compat import binary_type, text_type
+import functools
+import nltk.tree
+try:
+ import pyparsing
+except ImportError:
+ print('Warning: nltk.tgrep will not work without the `pyparsing` package')
+ print('installed.')
+import re
+
+class TgrepException(Exception):
+ '''Tgrep exception type.'''
+ pass
+
+def ancestors(node):
+ '''
+ Returns the list of all nodes dominating the given tree node.
+ This method will not work with leaf nodes, since there is no way
+ to recover the parent.
+ '''
+ results = []
+ try:
+ current = node.parent()
+ except AttributeError:
+ # if node is a leaf, we cannot retrieve its parent
+ return results
+ while current:
+ results.append(current)
+ current = current.parent()
+ return results
+
+def unique_ancestors(node):
+ '''
+ Returns the list of all nodes dominating the given node, where
+ there is only a single path of descent.
+ '''
+ results = []
+ try:
+ current = node.parent()
+ except AttributeError:
+ # if node is a leaf, we cannot retrieve its parent
+ return results
+ while current and len(current) == 1:
+ results.append(current)
+ current = current.parent()
+ return results
+
+def _descendants(node):
+ '''
+ Returns the list of all nodes which are descended from the given
+ tree node in some way.
+ '''
+ try:
+ treepos = node.treepositions()
+ except AttributeError:
+ return []
+ return [node[x] for x in treepos[1:]]
+
+def _leftmost_descendants(node):
+ '''
+ Returns the set of all nodes descended in some way through
+ left branches from this node.
+ '''
+ try:
+ treepos = node.treepositions()
+ except AttributeError:
+ return []
+ return [node[x] for x in treepos[1:] if all(y == 0 for y in x)]
+
+def _rightmost_descendants(node):
+ '''
+ Returns the set of all nodes descended in some way through
+ right branches from this node.
+ '''
+ try:
+ rightmost_leaf = max(node.treepositions())
+ except AttributeError:
+ return []
+ return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)]
+
+def _istree(obj):
+ '''Predicate to check whether `obj` is a nltk.tree.Tree.'''
+ return isinstance(obj, nltk.tree.Tree)
+
+def _unique_descendants(node):
+ '''
+ Returns the list of all nodes descended from the given node, where
+ there is only a single path of descent.
+ '''
+ results = []
+ current = node
+ while current and _istree(current) and len(current) == 1:
+ current = current[0]
+ results.append(current)
+ return results
+
+def _before(node):
+ '''
+ Returns the set of all nodes that are before the given node.
+ '''
+ try:
+ pos = node.treeposition()
+ tree = node.root()
+ except AttributeError:
+ return []
+ return [tree[x] for x in tree.treepositions()
+ if x[:len(pos)] < pos[:len(x)]]
+
+def _immediately_before(node):
+ '''
+ Returns the set of all nodes that are immediately before the given
+ node.
+
+ Tree node A immediately precedes node B if the last terminal
+ symbol (word) produced by A immediately precedes the first
+ terminal symbol produced by B.
+ '''
+ try:
+ pos = node.treeposition()
+ tree = node.root()
+ except AttributeError:
+ return []
+ # go "upwards" from pos until there is a place we can go to the left
+ idx = len(pos) - 1
+ while 0 <= idx and pos[idx] == 0:
+ idx -= 1
+ if idx < 0:
+ return []
+ pos = list(pos[:idx + 1])
+ pos[-1] -= 1
+ before = tree[pos]
+ return [before] + _rightmost_descendants(before)
+
+def _after(node):
+ '''
+ Returns the set of all nodes that are after the given node.
+ '''
+ try:
+ pos = node.treeposition()
+ tree = node.root()
+ except AttributeError:
+ return []
+ return [tree[x] for x in tree.treepositions()
+ if x[:len(pos)] > pos[:len(x)]]
+
+def _immediately_after(node):
+ '''
+ Returns the set of all nodes that are immediately after the given
+ node.
+
+ Tree node A immediately follows node B if the first terminal
+ symbol (word) produced by A immediately follows the last
+ terminal symbol produced by B.
+ '''
+ try:
+ pos = node.treeposition()
+ tree = node.root()
+ current = node.parent()
+ except AttributeError:
+ return []
+ # go "upwards" from pos until there is a place we can go to the
+ # right
+ idx = len(pos) - 1
+ while 0 <= idx and pos[idx] == len(current) - 1:
+ idx -= 1
+ current = current.parent()
+ if idx < 0:
+ return []
+ pos = list(pos[:idx + 1])
+ pos[-1] += 1
+ after = tree[pos]
+ return [after] + _leftmost_descendants(after)
+
+def _tgrep_node_literal_value(node):
+ '''
+ Gets the string value of a given parse tree node, for comparison
+ using the tgrep node literal predicates.
+ '''
+ return (node.label() if _istree(node) else text_type(node))
+
+def _tgrep_macro_use_action(_s, _l, tokens):
+ '''
+ Builds a lambda function which looks up the macro name used.
+ '''
+ assert len(tokens) == 1
+ assert tokens[0][0] == '@'
+ macro_name = tokens[0][1:]
+ def macro_use(n, m=None, l=None):
+ if m is None or macro_name not in m:
+ raise TgrepException('macro {0} not defined'.format(macro_name))
+ return m[macro_name](n, m, l)
+ return macro_use
+
+def _tgrep_node_action(_s, _l, tokens):
+ '''
+ Builds a lambda function representing a predicate on a tree node
+ depending on the name of its node.
+ '''
+ # print 'node tokens: ', tokens
+ if tokens[0] == "'":
+ # strip initial apostrophe (tgrep2 print command)
+ tokens = tokens[1:]
+ if len(tokens) > 1:
+ # disjunctive definition of a node name
+ assert list(set(tokens[1::2])) == ['|']
+ # recursively call self to interpret each node name definition
+ tokens = [_tgrep_node_action(None, None, [node])
+ for node in tokens[::2]]
+ # capture tokens and return the disjunction
+ return (lambda t: lambda n, m=None, l=None: any(f(n, m, l) for f in t))(tokens)
+ else:
+ if hasattr(tokens[0], '__call__'):
+ # this is a previously interpreted parenthetical node
+ # definition (lambda function)
+ return tokens[0]
+ elif tokens[0] == '*' or tokens[0] == '__':
+ return lambda n, m=None, l=None: True
+ elif tokens[0].startswith('"'):
+ assert tokens[0].endswith('"')
+ node_lit = tokens[0][1:-1].replace('\\"', '"').replace('\\\\', '\\')
+ return (lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s)(node_lit)
+ elif tokens[0].startswith('/'):
+ assert tokens[0].endswith('/')
+ node_lit = tokens[0][1:-1]
+ return (lambda r: lambda n, m=None, l=None:
+ r.search(_tgrep_node_literal_value(n)))(re.compile(node_lit))
+ elif tokens[0].startswith('i@'):
+ node_func = _tgrep_node_action(_s, _l, [tokens[0][2:].lower()])
+ return (lambda f: lambda n, m=None, l=None:
+ f(_tgrep_node_literal_value(n).lower()))(node_func)
+ else:
+ return (lambda s: lambda n, m=None, l=None:
+ _tgrep_node_literal_value(n) == s)(tokens[0])
+
+def _tgrep_parens_action(_s, _l, tokens):
+ '''
+ Builds a lambda function representing a predicate on a tree node
+ from a parenthetical notation.
+ '''
+ # print 'parenthetical tokens: ', tokens
+ assert len(tokens) == 3
+ assert tokens[0] == '('
+ assert tokens[2] == ')'
+ return tokens[1]
+
+def _tgrep_nltk_tree_pos_action(_s, _l, tokens):
+ '''
+ Builds a lambda function representing a predicate on a tree node
+ which returns true if the node is located at a specific tree
+ position.
+ '''
+    # recover the tuple from the parsed string
+ node_tree_position = tuple(int(x) for x in tokens if x.isdigit())
+ # capture the node's tree position
+ return (lambda i: lambda n, m=None, l=None: (hasattr(n, 'treeposition') and
+ n.treeposition() == i))(node_tree_position)
+
+def _tgrep_relation_action(_s, _l, tokens):
+ '''
+ Builds a lambda function representing a predicate on a tree node
+ depending on its relation to other nodes in the tree.
+ '''
+ # print 'relation tokens: ', tokens
+ # process negation first if needed
+ negated = False
+ if tokens[0] == '!':
+ negated = True
+ tokens = tokens[1:]
+ if tokens[0] == '[':
+ # process square-bracketed relation expressions
+ assert len(tokens) == 3
+ assert tokens[2] == ']'
+ retval = tokens[1]
+ else:
+ # process operator-node relation expressions
+ assert len(tokens) == 2
+ operator, predicate = tokens
+ # A < B A is the parent of (immediately dominates) B.
+ if operator == '<':
+ retval = lambda n, m=None, l=None: (_istree(n) and
+ any(predicate(x, m, l) for x in n))
+ # A > B A is the child of B.
+ elif operator == '>':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ bool(n.parent()) and
+ predicate(n.parent(), m, l))
+ # A <, B Synonymous with A <1 B.
+ elif operator == '<,' or operator == '<1':
+ retval = lambda n, m=None, l=None: (_istree(n) and
+ bool(list(n)) and
+ predicate(n[0], m, l))
+ # A >, B Synonymous with A >1 B.
+ elif operator == '>,' or operator == '>1':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ bool(n.parent()) and
+ (n is n.parent()[0]) and
+ predicate(n.parent(), m, l))
+ # A <N B B is the Nth child of A (the first child is <1).
+ elif operator[0] == '<' and operator[1:].isdigit():
+ idx = int(operator[1:])
+ # capture the index parameter
+ retval = (lambda i: lambda n, m=None, l=None: (_istree(n) and
+ bool(list(n)) and
+ 0 <= i < len(n) and
+ predicate(n[i], m, l)))(idx - 1)
+ # A >N B A is the Nth child of B (the first child is >1).
+ elif operator[0] == '>' and operator[1:].isdigit():
+ idx = int(operator[1:])
+ # capture the index parameter
+ retval = (lambda i: lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ bool(n.parent()) and
+ 0 <= i < len(n.parent()) and
+ (n is n.parent()[i]) and
+ predicate(n.parent(), m, l)))(idx - 1)
+ # A <' B B is the last child of A (also synonymous with A <-1 B).
+ # A <- B B is the last child of A (synonymous with A <-1 B).
+ elif operator == '<\'' or operator == '<-' or operator == '<-1':
+ retval = lambda n, m=None, l=None: (_istree(n) and bool(list(n))
+ and predicate(n[-1], m, l))
+ # A >' B A is the last child of B (also synonymous with A >-1 B).
+ # A >- B A is the last child of B (synonymous with A >-1 B).
+ elif operator == '>\'' or operator == '>-' or operator == '>-1':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ bool(n.parent()) and
+ (n is n.parent()[-1]) and
+ predicate(n.parent(), m, l))
+ # A <-N B B is the N th-to-last child of A (the last child is <-1).
+ elif operator[:2] == '<-' and operator[2:].isdigit():
+ idx = -int(operator[2:])
+ # capture the index parameter
+ retval = (lambda i: lambda n, m=None, l=None: (_istree(n) and
+ bool(list(n)) and
+ 0 <= (i + len(n)) < len(n) and
+ predicate(n[i + len(n)], m, l)))(idx)
+ # A >-N B A is the N th-to-last child of B (the last child is >-1).
+ elif operator[:2] == '>-' and operator[2:].isdigit():
+ idx = -int(operator[2:])
+ # capture the index parameter
+ retval = (lambda i: lambda n, m=None, l=None:
+ (hasattr(n, 'parent') and
+ bool(n.parent()) and
+ 0 <= (i + len(n.parent())) < len(n.parent()) and
+ (n is n.parent()[i + len(n.parent())]) and
+ predicate(n.parent(), m, l)))(idx)
+ # A <: B B is the only child of A
+ elif operator == '<:':
+ retval = lambda n, m=None, l=None: (_istree(n) and
+ len(n) == 1 and
+ predicate(n[0], m, l))
+ # A >: B A is the only child of B.
+ elif operator == '>:':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ bool(n.parent()) and
+ len(n.parent()) == 1 and
+ predicate(n.parent(), m, l))
+ # A << B A dominates B (A is an ancestor of B).
+ elif operator == '<<':
+ retval = lambda n, m=None, l=None: (_istree(n) and
+ any(predicate(x, m, l) for x in _descendants(n)))
+ # A >> B A is dominated by B (A is a descendant of B).
+ elif operator == '>>':
+ retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in ancestors(n))
+ # A <<, B B is a left-most descendant of A.
+ elif operator == '<<,' or operator == '<<1':
+ retval = lambda n, m=None, l=None: (_istree(n) and
+ any(predicate(x, m, l)
+ for x in _leftmost_descendants(n)))
+ # A >>, B A is a left-most descendant of B.
+ elif operator == '>>,':
+ retval = lambda n, m=None, l=None: any((predicate(x, m, l) and
+ n in _leftmost_descendants(x))
+ for x in ancestors(n))
+ # A <<' B B is a right-most descendant of A.
+ elif operator == '<<\'':
+ retval = lambda n, m=None, l=None: (_istree(n) and
+ any(predicate(x, m, l)
+ for x in _rightmost_descendants(n)))
+ # A >>' B A is a right-most descendant of B.
+ elif operator == '>>\'':
+ retval = lambda n, m=None, l=None: any((predicate(x, m, l) and
+ n in _rightmost_descendants(x))
+ for x in ancestors(n))
+ # A <<: B There is a single path of descent from A and B is on it.
+ elif operator == '<<:':
+ retval = lambda n, m=None, l=None: (_istree(n) and
+ any(predicate(x, m, l)
+ for x in _unique_descendants(n)))
+ # A >>: B There is a single path of descent from B and A is on it.
+ elif operator == '>>:':
+ retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in unique_ancestors(n))
+ # A . B A immediately precedes B.
+ elif operator == '.':
+ retval = lambda n, m=None, l=None: any(predicate(x, m, l)
+ for x in _immediately_after(n))
+ # A , B A immediately follows B.
+ elif operator == ',':
+ retval = lambda n, m=None, l=None: any(predicate(x, m, l)
+ for x in _immediately_before(n))
+ # A .. B A precedes B.
+ elif operator == '..':
+ retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in _after(n))
+ # A ,, B A follows B.
+ elif operator == ',,':
+ retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in _before(n))
+ # A $ B A is a sister of B (and A != B).
+ elif operator == '$' or operator == '%':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ bool(n.parent()) and
+ any(predicate(x, m, l)
+ for x in n.parent() if x is not n))
+ # A $. B A is a sister of and immediately precedes B.
+ elif operator == '$.' or operator == '%.':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'right_sibling') and
+ bool(n.right_sibling()) and
+ predicate(n.right_sibling(), m, l))
+ # A $, B A is a sister of and immediately follows B.
+ elif operator == '$,' or operator == '%,':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'left_sibling') and
+ bool(n.left_sibling()) and
+ predicate(n.left_sibling(), m, l))
+ # A $.. B A is a sister of and precedes B.
+ elif operator == '$..' or operator == '%..':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ hasattr(n, 'parent_index') and
+ bool(n.parent()) and
+ any(predicate(x, m, l) for x in
+ n.parent()[n.parent_index() + 1:]))
+ # A $,, B A is a sister of and follows B.
+ elif operator == '$,,' or operator == '%,,':
+ retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+ hasattr(n, 'parent_index') and
+ bool(n.parent()) and
+ any(predicate(x, m, l) for x in
+ n.parent()[:n.parent_index()]))
+ else:
+ raise TgrepException(
+ 'cannot interpret tgrep operator "{0}"'.format(operator))
+ # now return the built function
+ if negated:
+ return (lambda r: (lambda n, m=None, l=None: not r(n, m, l)))(retval)
+ else:
+ return retval
+
+def _tgrep_conjunction_action(_s, _l, tokens, join_char = '&'):
+ '''
+ Builds a lambda function representing a predicate on a tree node
+ from the conjunction of several other such lambda functions.
+
+ This is prototypically called for expressions like
+ (`tgrep_rel_conjunction`)::
+
+ < NP & < AP < VP
+
+ where tokens is a list of predicates representing the relations
+ (`< NP`, `< AP`, and `< VP`), possibly with the character `&`
+ included (as in the example here).
+
+ This is also called for expressions like (`tgrep_node_expr2`)::
+
+ NP < NN
+ S=s < /NP/=n : s < /VP/=v : n .. v
+
+ tokens[0] is a tgrep_expr predicate; tokens[1:] are an (optional)
+ list of segmented patterns (`tgrep_expr_labeled`, processed by
+ `_tgrep_segmented_pattern_action`).
+ '''
+ # filter out the ampersand
+ tokens = [x for x in tokens if x != join_char]
+ # print 'relation conjunction tokens: ', tokens
+ if len(tokens) == 1:
+ return tokens[0]
+ else:
+ return (lambda ts: lambda n, m=None, l=None: all(predicate(n, m, l)
+ for predicate in ts))(tokens)
+
+def _tgrep_segmented_pattern_action(_s, _l, tokens):
+ '''
+ Builds a lambda function representing a segmented pattern.
+
+ Called for expressions like (`tgrep_expr_labeled`)::
+
+ =s .. =v < =n
+
+ This is a segmented pattern, a tgrep2 expression which begins with
+ a node label.
+
+    The problem is that for segmented_pattern_action (': =v < =s'),
+ the first element (in this case, =v) is specifically selected by
+ virtue of matching a particular node in the tree; to retrieve
+ the node, we need the label, not a lambda function. For node
+ labels inside a tgrep_node_expr, we need a lambda function which
+ returns true if the node visited is the same as =v.
+
+ We solve this by creating two copies of a node_label_use in the
+ grammar; the label use inside a tgrep_expr_labeled has a separate
+    parse action from the pred use inside a node_expr. See
+ `_tgrep_node_label_use_action` and
+ `_tgrep_node_label_pred_use_action`.
+ '''
+ # tokens[0] is a string containing the node label
+ node_label = tokens[0]
+ # tokens[1:] is an (optional) list of predicates which must all
+ # hold of the bound node
+ reln_preds = tokens[1:]
+ def pattern_segment_pred(n, m=None, l=None):
+ '''This predicate function ignores its node argument.'''
+ # look up the bound node using its label
+ if l is None or node_label not in l:
+ raise TgrepException('node_label ={0} not bound in pattern'.format(
+ node_label))
+ node = l[node_label]
+ # match the relation predicates against the node
+ return all(pred(node, m, l) for pred in reln_preds)
+ return pattern_segment_pred
+
+def _tgrep_node_label_use_action(_s, _l, tokens):
+ '''
+ Returns the node label used to begin a tgrep_expr_labeled. See
+ `_tgrep_segmented_pattern_action`.
+
+ Called for expressions like (`tgrep_node_label_use`)::
+
+ =s
+
+ when they appear as the first element of a `tgrep_expr_labeled`
+ expression (see `_tgrep_segmented_pattern_action`).
+
+ It returns the node label.
+ '''
+ assert len(tokens) == 1
+ assert tokens[0].startswith('=')
+ return tokens[0][1:]
+
+def _tgrep_node_label_pred_use_action(_s, _l, tokens):
+ '''
+ Builds a lambda function representing a predicate on a tree node
+ which describes the use of a previously bound node label.
+
+ Called for expressions like (`tgrep_node_label_use_pred`)::
+
+ =s
+
+ when they appear inside a tgrep_node_expr (for example, inside a
+ relation). The predicate returns true if and only if its node
+    argument is identical to the node looked up in the node label
+ dictionary using the node's label.
+ '''
+ assert len(tokens) == 1
+ assert tokens[0].startswith('=')
+ node_label = tokens[0][1:]
+ def node_label_use_pred(n, m=None, l=None):
+ # look up the bound node using its label
+ if l is None or node_label not in l:
+ raise TgrepException('node_label ={0} not bound in pattern'.format(
+ node_label))
+ node = l[node_label]
+ # truth means the given node is this node
+ return n is node
+ return node_label_use_pred
+
+def _tgrep_bind_node_label_action(_s, _l, tokens):
+ '''
+ Builds a lambda function representing a predicate on a tree node
+ which can optionally bind a matching node into the tgrep2 string's
+ label_dict.
+
+ Called for expressions like (`tgrep_node_expr2`)::
+
+ /NP/
+ @NP=n
+ '''
+ # tokens[0] is a tgrep_node_expr
+ if len(tokens) == 1:
+ return tokens[0]
+ else:
+ # if present, tokens[1] is the character '=', and tokens[2] is
+ # a tgrep_node_label, a string value containing the node label
+ assert len(tokens) == 3
+ assert tokens[1] == '='
+ node_pred = tokens[0]
+ node_label = tokens[2]
+ def node_label_bind_pred(n, m=None, l=None):
+ if node_pred(n, m, l):
+ # bind `n` into the dictionary `l`
+ if l is None:
+ raise TgrepException(
+ 'cannot bind node_label {0}: label_dict is None'.format(
+ node_label))
+ l[node_label] = n
+ return True
+ else:
+ return False
+ return node_label_bind_pred
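+
+# Minimal sketch of node-label binding (illustrative): in a pattern such as
+# 'NP=n < NN', any NP satisfying the relation is also recorded in the label
+# dictionary under "n", so later segments (joined with ':') can refer back
+# to the same bound node as =n.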
+
+def _tgrep_rel_disjunction_action(_s, _l, tokens):
+ '''
+ Builds a lambda function representing a predicate on a tree node
+ from the disjunction of several other such lambda functions.
+ '''
+ # filter out the pipe
+ tokens = [x for x in tokens if x != '|']
+ # print 'relation disjunction tokens: ', tokens
+ if len(tokens) == 1:
+ return tokens[0]
+ elif len(tokens) == 2:
+ return (lambda a, b: lambda n, m=None, l=None:
+ a(n, m, l) or b(n, m, l))(tokens[0], tokens[1])
+
+def _macro_defn_action(_s, _l, tokens):
+ '''
+ Builds a dictionary structure which defines the given macro.
+ '''
+ assert len(tokens) == 3
+ assert tokens[0] == '@'
+ return {tokens[1]: tokens[2]}
+
+def _tgrep_exprs_action(_s, _l, tokens):
+ '''
+ This is the top-level node in a tgrep2 search string; the
+ predicate function it returns binds together all the state of a
+ tgrep2 search string.
+
+ Builds a lambda function representing a predicate on a tree node
+ from the disjunction of several tgrep expressions. Also handles
+ macro definitions and macro name binding, and node label
+ definitions and node label binding.
+ '''
+ if len(tokens) == 1:
+ return lambda n, m=None, l=None: tokens[0](n, None, {})
+ # filter out all the semicolons
+ tokens = [x for x in tokens if x != ';']
+ # collect all macro definitions
+ macro_dict = {}
+ macro_defs = [tok for tok in tokens if isinstance(tok, dict)]
+ for macro_def in macro_defs:
+ macro_dict.update(macro_def)
+ # collect all tgrep expressions
+ tgrep_exprs = [tok for tok in tokens if not isinstance(tok, dict)]
+ # create a new scope for the node label dictionary
+ def top_level_pred(n, m=macro_dict, l=None):
+ label_dict = {}
+ # bind macro definitions and OR together all tgrep_exprs
+ return any(predicate(n, m, label_dict) for predicate in tgrep_exprs)
+ return top_level_pred
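+
+# Hedged sketch of a search string containing macro definitions (an assumed
+# example, not an upstream doctest):
+#
+#     @ NOUN /^NN/;
+#     @NOUN < /dog/
+#
+# The first statement defines the macro NOUN as the node expression /^NN/;
+# the second is an ordinary tgrep expression that uses it via @NOUN.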
+
+def _build_tgrep_parser(set_parse_actions = True):
+ '''
+ Builds a pyparsing-based parser object for tokenizing and
+ interpreting tgrep search strings.
+ '''
+ tgrep_op = (pyparsing.Optional('!') +
+ pyparsing.Regex('[$%,.<>][%,.<>0-9-\':]*'))
+ tgrep_qstring = pyparsing.QuotedString(quoteChar='"', escChar='\\',
+ unquoteResults=False)
+ tgrep_node_regex = pyparsing.QuotedString(quoteChar='/', escChar='\\',
+ unquoteResults=False)
+ tgrep_qstring_icase = pyparsing.Regex(
+ 'i@\\"(?:[^"\\n\\r\\\\]|(?:\\\\.))*\\"')
+ tgrep_node_regex_icase = pyparsing.Regex(
+ 'i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/')
+ tgrep_node_literal = pyparsing.Regex('[^][ \r\t\n;:.,&|<>()$!@%\'^=]+')
+ tgrep_expr = pyparsing.Forward()
+ tgrep_relations = pyparsing.Forward()
+ tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')'
+ tgrep_nltk_tree_pos = (
+ pyparsing.Literal('N(') +
+ pyparsing.Optional(pyparsing.Word(pyparsing.nums) + ',' +
+ pyparsing.Optional(pyparsing.delimitedList(
+ pyparsing.Word(pyparsing.nums), delim=',') +
+ pyparsing.Optional(','))) + ')')
+ tgrep_node_label = pyparsing.Regex('[A-Za-z0-9]+')
+ tgrep_node_label_use = pyparsing.Combine('=' + tgrep_node_label)
+ # see _tgrep_segmented_pattern_action
+ tgrep_node_label_use_pred = tgrep_node_label_use.copy()
+ macro_name = pyparsing.Regex('[^];:.,&|<>()[$!@%\'^=\r\t\n ]+')
+ macro_name.setWhitespaceChars('')
+ macro_use = pyparsing.Combine('@' + macro_name)
+ tgrep_node_expr = (tgrep_node_label_use_pred |
+ macro_use |
+ tgrep_nltk_tree_pos |
+ tgrep_qstring_icase |
+ tgrep_node_regex_icase |
+ tgrep_qstring |
+ tgrep_node_regex |
+ '*' |
+ tgrep_node_literal)
+ tgrep_node_expr2 = ((tgrep_node_expr +
+ pyparsing.Literal('=').setWhitespaceChars('') +
+ tgrep_node_label.copy().setWhitespaceChars('')) |
+ tgrep_node_expr)
+ tgrep_node = (tgrep_parens |
+ (pyparsing.Optional("'") +
+ tgrep_node_expr2 +
+ pyparsing.ZeroOrMore("|" + tgrep_node_expr)))
+ tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']'
+ tgrep_relation = tgrep_brackets | (tgrep_op + tgrep_node)
+ tgrep_rel_conjunction = pyparsing.Forward()
+ tgrep_rel_conjunction << (tgrep_relation +
+ pyparsing.ZeroOrMore(pyparsing.Optional('&') +
+ tgrep_rel_conjunction))
+ tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore(
+ "|" + tgrep_relations)
+ tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations)
+ tgrep_expr_labeled = tgrep_node_label_use + pyparsing.Optional(tgrep_relations)
+ tgrep_expr2 = tgrep_expr + pyparsing.ZeroOrMore(':' + tgrep_expr_labeled)
+ macro_defn = (pyparsing.Literal('@') +
+ pyparsing.White().suppress() +
+ macro_name +
+ tgrep_expr2)
+ tgrep_exprs = (pyparsing.Optional(macro_defn + pyparsing.ZeroOrMore(';' + macro_defn) + ';') +
+ tgrep_expr2 +
+ pyparsing.ZeroOrMore(';' + (macro_defn | tgrep_expr2)) +
+ pyparsing.ZeroOrMore(';').suppress())
+ if set_parse_actions:
+ tgrep_node_label_use.setParseAction(_tgrep_node_label_use_action)
+ tgrep_node_label_use_pred.setParseAction(_tgrep_node_label_pred_use_action)
+ macro_use.setParseAction(_tgrep_macro_use_action)
+ tgrep_node.setParseAction(_tgrep_node_action)
+ tgrep_node_expr2.setParseAction(_tgrep_bind_node_label_action)
+ tgrep_parens.setParseAction(_tgrep_parens_action)
+ tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action)
+ tgrep_relation.setParseAction(_tgrep_relation_action)
+ tgrep_rel_conjunction.setParseAction(_tgrep_conjunction_action)
+ tgrep_relations.setParseAction(_tgrep_rel_disjunction_action)
+ macro_defn.setParseAction(_macro_defn_action)
+ # the whole expression is also the conjunction of two
+ # predicates: the first node predicate, and the remaining
+ # relation predicates
+ tgrep_expr.setParseAction(_tgrep_conjunction_action)
+ tgrep_expr_labeled.setParseAction(_tgrep_segmented_pattern_action)
+ tgrep_expr2.setParseAction(functools.partial(_tgrep_conjunction_action,
+ join_char = ':'))
+ tgrep_exprs.setParseAction(_tgrep_exprs_action)
+ return tgrep_exprs.ignore('#' + pyparsing.restOfLine)
+
+def tgrep_tokenize(tgrep_string):
+ '''
+ Tokenizes a TGrep search string into separate tokens.
+ '''
+ parser = _build_tgrep_parser(False)
+ if isinstance(tgrep_string, binary_type):
+ tgrep_string = tgrep_string.decode()
+ return list(parser.parseString(tgrep_string))
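+
+# For example (a minimal sketch):
+#
+#     >>> tgrep_tokenize('NP < NN')
+#     ['NP', '<', 'NN']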
+
+def tgrep_compile(tgrep_string):
+ '''
+ Parses (and tokenizes, if necessary) a TGrep search string into a
+ lambda function.
+ '''
+ parser = _build_tgrep_parser(True)
+ if isinstance(tgrep_string, binary_type):
+ tgrep_string = tgrep_string.decode()
+ return list(parser.parseString(tgrep_string, parseAll=True))[0]
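+
+# Hedged usage sketch (assumes an example ParentedTree):
+#
+#     >>> from nltk.tree import ParentedTree
+#     >>> t = ParentedTree.fromstring('(S (NP (NN dog)) (VP (VBZ barks)))')
+#     >>> pred = tgrep_compile('NP < NN')
+#     >>> pred(t[0])        # the NP node dominates an NN
+#     True
+#     >>> pred(t[1])        # the VP node does not
+#     False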
+
+def treepositions_no_leaves(tree):
+ '''
+ Returns all the tree positions in the given tree which are not
+ leaf nodes.
+ '''
+ treepositions = tree.treepositions()
+ # leaves are treeposition tuples that are not prefixes of any
+ # other treeposition
+ prefixes = set()
+ for pos in treepositions:
+ for length in range(len(pos)):
+ prefixes.add(pos[:length])
+ return [pos for pos in treepositions if pos in prefixes]
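+
+# For instance (sketch): for Tree.fromstring('(S (NP dog))'), treepositions()
+# is [(), (0,), (0, 0)]; the leaf position (0, 0) is not a prefix of any
+# other position, so the result should be [(), (0,)].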
+
+def tgrep_positions(pattern, trees, search_leaves=True):
+ """
+ Return the tree positions in the trees which match the given pattern.
+
+ :param pattern: a tgrep search pattern
+ :type pattern: str or output of tgrep_compile()
+ :param trees: a sequence of NLTK trees (usually ParentedTrees)
+ :type trees: iter(ParentedTree) or iter(Tree)
+ :param search_leaves: whether to return matching leaf nodes
+ :type search_leaves: bool
+ :rtype: iter(tree positions)
+ """
+
+ if isinstance(pattern, (binary_type, text_type)):
+ pattern = tgrep_compile(pattern)
+
+ for tree in trees:
+ try:
+ if search_leaves:
+ positions = tree.treepositions()
+ else:
+ positions = treepositions_no_leaves(tree)
+ yield [position for position in positions
+ if pattern(tree[position])]
+ except AttributeError:
+ yield []
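+
+# Hedged example (reusing the assumed tree from the tgrep_compile sketch):
+#
+#     >>> list(tgrep_positions('NN', [t]))
+#     [[(0, 0)]]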
+
+def tgrep_nodes(pattern, trees, search_leaves=True):
+ """
+ Return the tree nodes in the trees which match the given pattern.
+
+ :param pattern: a tgrep search pattern
+ :type pattern: str or output of tgrep_compile()
+ :param trees: a sequence of NLTK trees (usually ParentedTrees)
+ :type trees: iter(ParentedTree) or iter(Tree)
+ :param search_leaves: whether to return matching leaf nodes
+ :type search_leaves: bool
+ :rtype: iter(tree nodes)
+ """
+
+ if isinstance(pattern, (binary_type, text_type)):
+ pattern = tgrep_compile(pattern)
+
+ for tree in trees:
+ try:
+ if search_leaves:
+ positions = tree.treepositions()
+ else:
+ positions = treepositions_no_leaves(tree)
+ yield [tree[position] for position in positions
+ if pattern(tree[position])]
+ except AttributeError:
+ yield []
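+
+# Hedged example (same assumed tree as above):
+#
+#     >>> list(tgrep_nodes('NN', [t]))
+#     [[ParentedTree('NN', ['dog'])]]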
+
+
+# run module doctests
+if __name__ == "__main__":
+ import doctest
+ doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
+
diff --git a/nltk/tokenize/texttiling.py b/nltk/tokenize/texttiling.py
index d8c3e62..9316fdf 100644
--- a/nltk/tokenize/texttiling.py
+++ b/nltk/tokenize/texttiling.py
@@ -52,6 +52,15 @@ class TextTilingTokenizer(TokenizerI):
:param cutoff_policy: The policy used to determine the number of boundaries:
`HC` (default) or `LC`
:type cutoff_policy: constant
+
+ >>> from nltk.corpus import brown
+ >>> tt = TextTilingTokenizer(demo_mode=True)
+ >>> text = brown.raw()[:10000]
+ >>> s, ss, d, b = tt.tokenize(text)
+ >>> b
+ [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
+ 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
+ 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]
"""
def __init__(self,
@@ -284,7 +293,7 @@ class TextTilingTokenizer(TokenizerI):
depth_tuples = sorted(zip(depth_scores, range(len(depth_scores))))
depth_tuples.reverse()
- hp = filter(lambda x:x[0]>cutoff, depth_tuples)
+ hp = list(filter(lambda x:x[0]>cutoff, depth_tuples))
for dt in hp:
boundaries[dt[1]] = 1
@@ -435,7 +444,7 @@ def smooth(x,window_len=11,window='flat'):
def demo(text=None):
from nltk.corpus import brown
- import pylab
+ from matplotlib import pylab
tt = TextTilingTokenizer(demo_mode=True)
if text is None: text = brown.raw()[:10000]
s, ss, d, b = tt.tokenize(text)
diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py
index 465f5d9..7453ef5 100644
--- a/nltk/tokenize/treebank.py
+++ b/nltk/tokenize/treebank.py
@@ -40,6 +40,9 @@ class TreebankWordTokenizer(TokenizerI):
>>> s = "They'll save and invest more."
>>> TreebankWordTokenizer().tokenize(s)
['They', "'ll", 'save', 'and', 'invest', 'more', '.']
+ >>> s = "hi, my name can't hello,"
+ >>> TreebankWordTokenizer().tokenize(s)
+ ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
"""
# List of contractions adapted from Robert MacIntyre's tokenizer.
@@ -64,6 +67,7 @@ class TreebankWordTokenizer(TokenizerI):
#punctuation
text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
+ text = re.sub(r'([:,])$', r' \1 ', text)
text = re.sub(r'\.\.\.', r' ... ', text)
text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
diff --git a/setup.cfg b/setup.cfg
index 861a9f5..6bc2ff3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[egg_info]
-tag_build =
tag_date = 0
+tag_build =
tag_svn_revision = 0
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/nltk.git