[nltk] 01/03: Imported Upstream version 3.0.3

Daniel Stender danstender-guest at moszumanska.debian.org
Mon Jun 15 04:54:50 UTC 2015


This is an automated email from the git hooks/post-receive script.

danstender-guest pushed a commit to branch master
in repository nltk.

commit c87559e2e2967a3188fc396fae6ffb11634a50a4
Author: Daniel Stender <debian at danielstender.com>
Date:   Mon Jun 15 06:35:32 2015 +0200

    Imported Upstream version 3.0.3
---
 PKG-INFO                                    |   2 +-
 nltk.egg-info/PKG-INFO                      |   2 +-
 nltk.egg-info/SOURCES.txt                   |   6 +
 nltk/VERSION                                |   2 +-
 nltk/__init__.py                            |  11 +
 nltk/align/api.py                           |  52 ++
 nltk/align/bleu_score.py                    |  50 +-
 nltk/align/ibm1.py                          |   2 +-
 nltk/align/ibm2.py                          |   2 +-
 nltk/app/__init__.py                        |   4 +-
 nltk/app/wordfreq_app.py                    |   6 +-
 nltk/classify/__init__.py                   |   1 +
 nltk/classify/naivebayes.py                 |   6 +-
 nltk/classify/tadm.py                       |   2 +-
 nltk/classify/textcat.py                    | 193 ++++++
 nltk/corpus/__init__.py                     |   7 +
 nltk/corpus/reader/__init__.py              |   3 +-
 nltk/corpus/reader/crubadan.py              | 116 ++++
 nltk/corpus/reader/lin.py                   |   2 +-
 nltk/data.py                                |   2 +-
 nltk/downloader.py                          |   5 +-
 nltk/draw/dispersion.py                     |   6 +-
 nltk/draw/util.py                           |   2 +-
 nltk/internals.py                           |  21 +-
 nltk/metrics/agreement.py                   |   4 +-
 nltk/parse/dependencygraph.py               |  63 +-
 nltk/parse/malt.py                          |   2 +-
 nltk/parse/nonprojectivedependencyparser.py |   3 +-
 nltk/parse/transitionparser.py              |  16 +-
 nltk/probability.py                         |  21 +-
 nltk/sem/boxer.py                           | 116 ++--
 nltk/sem/drt.py                             |  12 +-
 nltk/stem/snowball.py                       |   2 +-
 nltk/tag/__init__.py                        |   6 +-
 nltk/tag/crf.py                             |   2 +-
 nltk/tag/hunpos.py                          |   2 +-
 nltk/tag/stanford.py                        |  65 +-
 nltk/test/crubadan.doctest                  |  65 ++
 nltk/test/gensim.doctest                    | 144 +++++
 nltk/test/unit/test_tgrep.py                | 626 ++++++++++++++++++
 nltk/tgrep.py                               | 941 ++++++++++++++++++++++++++++
 nltk/tokenize/texttiling.py                 |  13 +-
 nltk/tokenize/treebank.py                   |   4 +
 setup.cfg                                   |   2 +-
 44 files changed, 2463 insertions(+), 151 deletions(-)

diff --git a/PKG-INFO b/PKG-INFO
index b5bf363..bdb38c0 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: nltk
-Version: 3.0.2
+Version: 3.0.3
 Summary: Natural Language Toolkit
 Home-page: http://nltk.org/
 Author: Steven Bird
diff --git a/nltk.egg-info/PKG-INFO b/nltk.egg-info/PKG-INFO
index b5bf363..bdb38c0 100644
--- a/nltk.egg-info/PKG-INFO
+++ b/nltk.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: nltk
-Version: 3.0.2
+Version: 3.0.3
 Summary: Natural Language Toolkit
 Home-page: http://nltk.org/
 Author: Steven Bird
diff --git a/nltk.egg-info/SOURCES.txt b/nltk.egg-info/SOURCES.txt
index d0587af..79c94ce 100644
--- a/nltk.egg-info/SOURCES.txt
+++ b/nltk.egg-info/SOURCES.txt
@@ -20,6 +20,7 @@ nltk/jsontags.py
 nltk/lazyimport.py
 nltk/probability.py
 nltk/text.py
+nltk/tgrep.py
 nltk/toolbox.py
 nltk/tree.py
 nltk/treeprettyprinter.py
@@ -80,6 +81,7 @@ nltk/classify/scikitlearn.py
 nltk/classify/senna.py
 nltk/classify/svm.py
 nltk/classify/tadm.py
+nltk/classify/textcat.py
 nltk/classify/util.py
 nltk/classify/weka.py
 nltk/cluster/__init__.py
@@ -101,6 +103,7 @@ nltk/corpus/reader/childes.py
 nltk/corpus/reader/chunked.py
 nltk/corpus/reader/cmudict.py
 nltk/corpus/reader/conll.py
+nltk/corpus/reader/crubadan.py
 nltk/corpus/reader/dependency.py
 nltk/corpus/reader/framenet.py
 nltk/corpus/reader/ieer.py
@@ -244,6 +247,7 @@ nltk/test/compat.doctest
 nltk/test/compat_fixt.py
 nltk/test/corpus.doctest
 nltk/test/corpus_fixt.py
+nltk/test/crubadan.doctest
 nltk/test/data.doctest
 nltk/test/dependency.doctest
 nltk/test/discourse.doctest
@@ -254,6 +258,7 @@ nltk/test/featgram.doctest
 nltk/test/featstruct.doctest
 nltk/test/framenet.doctest
 nltk/test/generate.doctest
+nltk/test/gensim.doctest
 nltk/test/gluesemantics.doctest
 nltk/test/gluesemantics_malt.doctest
 nltk/test/gluesemantics_malt_fixt.py
@@ -307,6 +312,7 @@ nltk/test/unit/test_naivebayes.py
 nltk/test/unit/test_seekable_unicode_stream_reader.py
 nltk/test/unit/test_stem.py
 nltk/test/unit/test_tag.py
+nltk/test/unit/test_tgrep.py
 nltk/test/unit/utils.py
 nltk/tokenize/__init__.py
 nltk/tokenize/api.py
diff --git a/nltk/VERSION b/nltk/VERSION
index b502146..75a22a2 100644
--- a/nltk/VERSION
+++ b/nltk/VERSION
@@ -1 +1 @@
-3.0.2
+3.0.3
diff --git a/nltk/__init__.py b/nltk/__init__.py
index 0882a57..963b882 100644
--- a/nltk/__init__.py
+++ b/nltk/__init__.py
@@ -94,6 +94,17 @@ try:
 except ImportError:
     pass
 
+# Provide stand-ins for subprocess attributes on environments, such as GAE, where subprocess cannot be used.
+import subprocess
+if not hasattr(subprocess, 'PIPE'):
+    def _fake_PIPE(*args, **kwargs):
+        raise NotImplementedError('subprocess.PIPE is not supported.')
+    subprocess.PIPE = _fake_PIPE
+if not hasattr(subprocess, 'Popen'):
+    def _fake_Popen(*args, **kwargs):
+        raise NotImplementedError('subprocess.Popen is not supported.')
+    subprocess.Popen = _fake_Popen
+
 ###########################################################
 # TOP-LEVEL MODULES
 ###########################################################
diff --git a/nltk/align/api.py b/nltk/align/api.py
index ec8d890..ddb1470 100644
--- a/nltk/align/api.py
+++ b/nltk/align/api.py
@@ -11,6 +11,8 @@ from __future__ import print_function, unicode_literals
 
 from nltk.compat import python_2_unicode_compatible, string_types
 from nltk.metrics import precision, recall
+import subprocess
+
 
 @python_2_unicode_compatible
 class AlignedSent(object):
@@ -92,6 +94,56 @@ class AlignedSent(object):
 
         return "AlignedSent(%s, %s, %r)" % (words, mots, self._alignment)
 
+    def _to_dot(self):
+        """
+        Dot representation of the aligned sentence 
+        """ 
+        s = 'graph align {\n'
+        s += 'node[shape=plaintext]\n'
+        
+        # Declare node 
+        for w in self._words:
+            s += '"%s_source" [label="%s"] \n' % (w, w)
+            
+        for w in self._mots:
+            s += '"%s_target" [label="%s"] \n' % (w, w)
+            
+        # Alignment 
+        for u,v in self._alignment:           
+            s += '"%s_source" -- "%s_target" \n' % (self._words[u] , self._mots[v] )
+             
+        # Connect the source words 
+        for i in range(len(self._words)-1) :
+            s += '"%s_source" -- "%s_source" [style=invis]\n' % (self._words[i] , self._words[i+1])
+            
+        # Connect the target words 
+        for i in range(len(self._mots)-1) :
+            s += '"%s_target" -- "%s_target" [style=invis]\n' % (self._mots[i] , self._mots[i+1])
+            
+        # Put it in the same rank 
+        s  += '{rank = same; %s}\n' % (' '.join('"%s_source"' % w for w in self._words))
+        s  += '{rank = same; %s}\n' % (' '.join('"%s_target"' % w for w in self._mots))
+
+        s += '}'
+        
+        return s 
+        
+    def _repr_svg_(self):
+        """
+        IPython magic: show the SVG representation of this ``AlignedSent``.
+        """
+        dot_string = self._to_dot().encode('utf8')
+        output_format = 'svg'
+        try:
+            process = subprocess.Popen(['dot', '-T%s' % output_format], stdin=subprocess.PIPE,
+                                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        except OSError:
+            raise Exception('Cannot find the dot binary from Graphviz package')
+        out, err = process.communicate(dot_string)
+         
+        return out
+    
+    
     def __str__(self):
         """
         Return a human-readable string representation for this ``AlignedSent``.
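The new _repr_svg_ hook above renders the DOT source by piping it through Graphviz. A minimal standalone sketch of that pipeline, assuming only that the Graphviz "dot" binary is on PATH (the graph source here is made up for illustration, not taken from the commit):

    import subprocess

    dot_source = b'graph align { "the_source" -- "le_target" }'
    proc = subprocess.Popen(['dot', '-Tsvg'],
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    svg, _ = proc.communicate(dot_source)
    # If dot ran successfully, svg holds the XML of the rendered graph.
    print(svg[:38])
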
diff --git a/nltk/align/bleu_score.py b/nltk/align/bleu_score.py
index 5472509..8878250 100644
--- a/nltk/align/bleu_score.py
+++ b/nltk/align/bleu_score.py
@@ -203,9 +203,57 @@ def _brevity_penalty(candidate, references):
     length sentence, brevity penalty is used to modify the overall BLEU
     score according to length.
 
+    An example from the paper. There are three references with lengths 12, 15
+    and 17, and a terse candidate of length 12. The brevity penalty is 1.
+
+    >>> references = [['a'] * 12, ['a'] * 15, ['a'] * 17]
+    >>> candidate = ['a'] * 12
+    >>> _brevity_penalty(candidate, references)
+    1.0
+
+    If a candidate translation is shorter than the references, a penalty is
+    applied.
+
+    >>> references = [['a'] * 28, ['a'] * 28]
+    >>> candidate = ['a'] * 12
+    >>> _brevity_penalty(candidate, references)
+    0.2635...
+
+    The length of the closest reference is used to compute the penalty. If the
+    length of a candidate is 12, and the reference lengths are 13 and 2, the
+    penalty is applied because the candidate length (12) is less than the
+    closest reference length (13).
+
+    >>> references = [['a'] * 13, ['a'] * 2]
+    >>> candidate = ['a'] * 12
+    >>> _brevity_penalty(candidate, references)
+    0.92...
+
+    The brevity penalty doesn't depend on reference order. More importantly,
+    when two reference sentences are at the same distance, the shortest
+    reference sentence length is used.
+
+    >>> references = [['a'] * 13, ['a'] * 11]
+    >>> candidate = ['a'] * 12
+    >>> _brevity_penalty(candidate, references) == _brevity_penalty(candidate, reversed(references)) == 1
+    True
+
+    A test example from mteval-v13a.pl (starting at line 705):
+
+    >>> references = [['a'] * 11, ['a'] * 8]
+    >>> candidate = ['a'] * 7
+    >>> _brevity_penalty(candidate, references)
+    0.86...
+
+    >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
+    >>> candidate = ['a'] * 7
+    >>> _brevity_penalty(candidate, references)
+    1.0
+
     """
     c = len(candidate)
-    r = min(abs(len(r) - c) for r in references)
+    ref_lens = (len(reference) for reference in references)
+    r = min(ref_lens, key=lambda ref_len: (abs(ref_len - c), ref_len))
 
     if c > r:
         return 1
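The rewritten selection of r above picks the reference whose length is closest to the candidate, breaking ties toward the shorter reference, which is what the new doctests exercise. A minimal sketch of just that selection, not taken verbatim from the commit:

    def closest_ref_length(candidate, references):
        c = len(candidate)
        ref_lens = (len(reference) for reference in references)
        # Key: first the distance to the candidate length, then the length itself,
        # so equally distant references resolve to the shorter one.
        return min(ref_lens, key=lambda ref_len: (abs(ref_len - c), ref_len))

    print(closest_ref_length(['a'] * 12, [['a'] * 13, ['a'] * 11]))  # -> 11
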
diff --git a/nltk/align/ibm1.py b/nltk/align/ibm1.py
index e88fe10..43e618f 100644
--- a/nltk/align/ibm1.py
+++ b/nltk/align/ibm1.py
@@ -15,7 +15,6 @@
 from __future__  import division
 from collections import defaultdict
 from nltk.align  import AlignedSent
-from nltk.corpus import comtrans
 
 class IBMModel1(object):
     """
@@ -28,6 +27,7 @@ class IBMModel1(object):
     Step 2 - Estimate the probability of translation according to the 
              evidence from Step 1. 
 
+    >>> from nltk.corpus import comtrans
     >>> bitexts = comtrans.aligned_sents()[:100]
     >>> ibm = IBMModel1(bitexts, 20)
 
diff --git a/nltk/align/ibm2.py b/nltk/align/ibm2.py
index f2f4b35..cfa70d4 100644
--- a/nltk/align/ibm2.py
+++ b/nltk/align/ibm2.py
@@ -9,7 +9,6 @@
 from __future__  import division
 from collections import defaultdict
 from nltk.align  import AlignedSent
-from nltk.corpus import comtrans
 from nltk.align.ibm1 import IBMModel1
 
 class IBMModel2(object):
@@ -26,6 +25,7 @@ class IBMModel2(object):
     Step 3 - Estimate the probability of translation and alignment according 
              to the evidence from Step 2. 
 
+    >>> from nltk.corpus import comtrans
     >>> bitexts = comtrans.aligned_sents()[:100]
     >>> ibm = IBMModel2(bitexts, 5)
     >>> aligned_sent = ibm.align(bitexts[0])
diff --git a/nltk/app/__init__.py b/nltk/app/__init__.py
index 4297a51..7e02d78 100644
--- a/nltk/app/__init__.py
+++ b/nltk/app/__init__.py
@@ -39,11 +39,11 @@ else:
     from nltk.app.wordnet_app import app as wordnet
 
     try:
-        import pylab
+        from matplotlib import pylab
     except ImportError:
         import warnings
         warnings.warn("nltk.app.wordfreq not loaded "
-                      "(requires the pylab library).")
+                      "(requires the matplotlib library).")
     else:
         from nltk.app.wordfreq_app import app as wordfreq
 
diff --git a/nltk/app/wordfreq_app.py b/nltk/app/wordfreq_app.py
index 3ced28a..2d9bb9b 100644
--- a/nltk/app/wordfreq_app.py
+++ b/nltk/app/wordfreq_app.py
@@ -5,8 +5,8 @@
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
-import pylab
-import nltk.text
+from matplotlib import pylab
+from nltk.text import Text
 from nltk.corpus import gutenberg
 
 def plot_word_freq_dist(text):
@@ -23,7 +23,7 @@ def plot_word_freq_dist(text):
     pylab.show()
 
 def app():
-    t1 = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
+    t1 = Text(gutenberg.words('melville-moby_dick.txt'))
     plot_word_freq_dist(t1)
 
 if __name__ == '__main__':
diff --git a/nltk/classify/__init__.py b/nltk/classify/__init__.py
index 972995b..1f57ee8 100644
--- a/nltk/classify/__init__.py
+++ b/nltk/classify/__init__.py
@@ -95,3 +95,4 @@ from nltk.classify.maxent import (MaxentClassifier, BinaryMaxentFeatureEncoding,
                                   TypedMaxentFeatureEncoding,
                                   ConditionalExponentialClassifier)
 from nltk.classify.senna import Senna
+from nltk.classify.textcat import TextCat
diff --git a/nltk/classify/naivebayes.py b/nltk/classify/naivebayes.py
index 5f1cffb..6f473e2 100644
--- a/nltk/classify/naivebayes.py
+++ b/nltk/classify/naivebayes.py
@@ -178,8 +178,8 @@ class NaiveBayesClassifier(ClassifierI):
                           minprob[feature_]/maxprob[feature_])
         return features[:n]
 
-    @staticmethod
-    def train(labeled_featuresets, estimator=ELEProbDist):
+    @classmethod
+    def train(cls, labeled_featuresets, estimator=ELEProbDist):
         """
         :param labeled_featuresets: A list of classified featuresets,
             i.e., a list of tuples ``(featureset, label)``.
@@ -225,7 +225,7 @@ class NaiveBayesClassifier(ClassifierI):
             probdist = estimator(freqdist, bins=len(feature_values[fname]))
             feature_probdist[label, fname] = probdist
 
-        return NaiveBayesClassifier(label_probdist, feature_probdist)
+        return cls(label_probdist, feature_probdist)
 
 ##//////////////////////////////////////////////////////
 ##  Demo
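Changing train() from a staticmethod into a classmethod lets subclasses of NaiveBayesClassifier get instances of themselves back from train(). A small illustration with a hypothetical subclass (not part of the commit):

    from nltk.classify import NaiveBayesClassifier

    class MyClassifier(NaiveBayesClassifier):  # hypothetical subclass
        pass

    train_set = [({'last_letter': 'a'}, 'female'),
                 ({'last_letter': 'k'}, 'male')]
    clf = MyClassifier.train(train_set)
    print(type(clf).__name__)  # 'MyClassifier', not 'NaiveBayesClassifier'
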
diff --git a/nltk/classify/tadm.py b/nltk/classify/tadm.py
index 6437d98..c019f00 100644
--- a/nltk/classify/tadm.py
+++ b/nltk/classify/tadm.py
@@ -14,7 +14,7 @@ from nltk.internals import find_binary
 try:
     import numpy
 except ImportError:
-    numpy = None
+    pass
 
 _tadm_bin = None
 def config_tadm(bin=None):
diff --git a/nltk/classify/textcat.py b/nltk/classify/textcat.py
new file mode 100644
index 0000000..cb29805
--- /dev/null
+++ b/nltk/classify/textcat.py
@@ -0,0 +1,193 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Language ID module using TextCat algorithm
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Avital Pekker <avital.pekker at utoronto.ca>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A module for language identification using the TextCat algorithm.
+An implementation of the text categorization algorithm
+presented in Cavnar, W. B. and J. M. Trenkle, 
+"N-Gram-Based Text Categorization".
+
+The algorithm takes advantage of Zipf's law and uses
+n-gram frequencies to profile languages and the text to be
+identified, then compares them using a distance measure.
+
+Language n-grams are provided by the "An Crubadan"
+project. A corpus reader was created separately to read
+those files.
+
+For details regarding the algorithm, see:
+http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
+
+For details about An Crubadan, see:
+http://borel.slu.edu/crubadan/index.html
+"""
+
+# Ensure that literal strings default to unicode rather than str.
+from __future__ import print_function, unicode_literals
+
+from nltk.compat import PY3
+from nltk.util import trigrams
+
+if PY3:
+    from sys import maxsize
+else:
+    from sys import maxint
+
+# Note: this is NOT the "re" module you're likely used to. The regex module
+# is an alternative to the standard re module that supports
+# Unicode codepoint properties with the \p{} syntax.
+# You may have to "pip install regex".
+try:
+    import regex as re
+except ImportError:
+    re = None
+######################################################################
+##  Language identification using TextCat
+######################################################################
+
+class TextCat(object):
+
+    _corpus = None
+    fingerprints = {}
+    _START_CHAR = "<"
+    _END_CHAR = ">"
+    
+    last_distances = {}
+    
+    def __init__(self):
+        if not re:
+            raise EnvironmentError("classify.textcat requires the regex module that "
+                                   "supports unicode. Try '$ pip install regex' and "
+                                   "see https://pypi.python.org/pypi/regex for "
+                                   "further details.")
+
+        from nltk.corpus import crubadan
+        self._corpus = crubadan
+        # Load all language ngrams into cache
+        for lang in self._corpus.langs():
+            self._corpus.lang_freq(lang)
+        
+    def remove_punctuation(self, text):
+        ''' Get rid of punctuation except apostrophes '''
+        return re.sub(r"[^\P{P}\']+", "", text)
+    
+    def profile(self, text):
+        ''' Create FreqDist of trigrams within text '''
+        from nltk import word_tokenize, FreqDist
+
+        clean_text = self.remove_punctuation(text)
+        tokens = word_tokenize(clean_text)
+        
+        fingerprint = FreqDist()
+        for t in tokens:
+            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
+            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]
+
+            for cur_trigram in token_trigrams:
+                if cur_trigram in fingerprint:
+                    fingerprint[cur_trigram] += 1
+                else:
+                    fingerprint[cur_trigram] = 1
+
+        return fingerprint
+        
+    def calc_dist(self, lang, trigram, text_profile):
+        ''' Calculate the "out-of-place" measure between the
+            text and language profile for a single trigram '''
+
+        lang_fd = self._corpus.lang_freq(lang)
+        dist = 0
+
+        if trigram in lang_fd:
+            idx_lang_profile = list(lang_fd.keys()).index(trigram)
+            idx_text = list(text_profile.keys()).index(trigram)
+
+            #print(idx_lang_profile, ", ", idx_text)
+            dist = abs(idx_lang_profile - idx_text) 
+        else:
+            # Arbitrary but should be larger than
+            # any possible trigram file length
+            # in terms of total lines
+            if PY3:
+                dist = maxsize
+            else:
+                dist = maxint
+
+        return dist
+        
+    def lang_dists(self, text):
+        ''' Calculate the "out-of-place" measure between
+            the text and all languages '''
+        
+        distances = {}
+        profile = self.profile(text)
+        # For all the languages
+        for lang in self._corpus._all_lang_freq.keys():
+            # Calculate distance metric for every trigram in
+            # input text to be identified
+            lang_dist = 0
+            for trigram in profile:
+                lang_dist += self.calc_dist(lang, trigram, profile)
+        
+            distances[lang] = lang_dist
+            
+        return distances
+    
+    def guess_language(self, text):
+        ''' Find the language with the min distance
+            to the text and return its ISO 639-3 code '''
+        self.last_distances = self.lang_dists(text)
+        
+        return min(self.last_distances, key=self.last_distances.get)
+
+def demo():
+    from nltk.corpus import udhr
+
+    langs = ['Kurdish-UTF8', 'Abkhaz-UTF8', 'Farsi_Persian-UTF8',
+             'Hindi-UTF8', 'Hawaiian-UTF8', 'Russian-UTF8', 'Vietnamese-UTF8',
+             'Serbian_Srpski-UTF8','Esperanto-UTF8']
+
+    friendly = {'kmr':'Northern Kurdish',
+                'abk':'Abkhazian',
+                'pes':'Iranian Persian',
+                'hin':'Hindi',
+                'haw':'Hawaiian',
+                'rus':'Russian',
+                'vie':'Vietnamese',
+                'srp':'Serbian',
+                'epo':'Esperanto'}
+        
+    tc = TextCat()
+
+    for cur_lang in langs:
+        # Get raw data from UDHR corpus
+        raw_sentences = udhr.sents(cur_lang)
+        rows = len(raw_sentences) - 1
+        cols = list(map(len, raw_sentences))
+
+        sample = ''
+          
+        # Generate a sample text of the language
+        for i in range(0, rows):
+            cur_sent = ''
+            for j in range(0, cols[i]):
+                cur_sent += ' ' + raw_sentences[i][j]
+            
+            sample += cur_sent
+          
+        # Try to detect what it is
+        print('Language snippet: ' + sample[0:140] + '...')
+        guess = tc.guess_language(sample)
+        print('Language detection: %s (%s)' % (guess, friendly[guess]))
+        print('#' * 140)
+
+
+if __name__ == '__main__':
+    demo()
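A minimal sketch of using the new TextCat classifier, assuming the "crubadan" corpus and the punkt tokenizer models have been downloaded with nltk.download() and the regex module is installed (the sample sentence is illustrative only, not part of the commit):

    from nltk.classify.textcat import TextCat

    tc = TextCat()  # loads all An Crubadan language profiles into memory
    # guess_language() returns the ISO 639-3 code of the closest profile.
    print(tc.guess_language('Der schnelle braune Fuchs springt.'))
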
diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py
index 101dd26..08d7011 100644
--- a/nltk/corpus/__init__.py
+++ b/nltk/corpus/__init__.py
@@ -95,6 +95,8 @@ conll2007 = LazyCorpusLoader(
     'conll2007', DependencyCorpusReader, '.*\.(test|train).*', encoding=[
         ('eus', 'ISO-8859-2'),
         ('esp', 'utf8')])
+crubadan = LazyCorpusLoader(
+    'crubadan', CrubadanCorpusReader, '.*\.txt')
 dependency_treebank = LazyCorpusLoader(
     'dependency_treebank', DependencyCorpusReader, '.*\.dp',
     encoding='ascii')
@@ -182,6 +184,10 @@ stopwords = LazyCorpusLoader(
     'stopwords', WordListCorpusReader, r'(?!README|\.).*', encoding='utf8')
 swadesh = LazyCorpusLoader(
     'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8')
+swadesh110 = LazyCorpusLoader(
+    'panlex_swadesh', SwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8')
+swadesh207 = LazyCorpusLoader(
+    'panlex_swadesh', SwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8')
 switchboard = LazyCorpusLoader(
     'switchboard', SwitchboardCorpusReader, tagset='wsj')
 timit = LazyCorpusLoader(
@@ -246,6 +252,7 @@ semcor = LazyCorpusLoader(
     'semcor', SemcorCorpusReader, r'brown./tagfiles/br-.*\.xml',
     wordnet) # Must be defined *after* wordnet corpus.
 
+  
 def demo():
     # This is out-of-date:
     abc.demo()
diff --git a/nltk/corpus/reader/__init__.py b/nltk/corpus/reader/__init__.py
index e48be76..047f358 100644
--- a/nltk/corpus/reader/__init__.py
+++ b/nltk/corpus/reader/__init__.py
@@ -94,6 +94,7 @@ from nltk.corpus.reader.udhr import *
 from nltk.corpus.reader.bnc import *
 from nltk.corpus.reader.sentiwordnet import *
 from nltk.corpus.reader.nkjp import *
+from nltk.corpus.reader.crubadan import *
 
 # Make sure that nltk.corpus.reader.bracket_parse gives the module, not
 # the function bracket_parse() defined in nltk.tree:
@@ -129,5 +130,5 @@ __all__ = [
     'TimitTaggedCorpusReader', 'LinThesaurusCorpusReader',
     'SemcorCorpusReader', 'FramenetCorpusReader', 'UdhrCorpusReader',
     'BNCCorpusReader', 'SentiWordNetCorpusReader', 'SentiSynset',
-    'NKJPCorpusReader'
+    'NKJPCorpusReader', 'CrubadanCorpusReader'
 ]
diff --git a/nltk/corpus/reader/crubadan.py b/nltk/corpus/reader/crubadan.py
new file mode 100644
index 0000000..73e3fbf
--- /dev/null
+++ b/nltk/corpus/reader/crubadan.py
@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: An Crubadan N-grams Reader
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Avital Pekker <avital.pekker at utoronto.ca>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+An NLTK interface for the n-gram statistics gathered from
+the corpora for each language using An Crubadan.
+
+There are multiple potential applications for the data but
+this reader was created with the goal of using it in the
+context of language identification.
+
+For details about An Crubadan, this data, and its potential uses, see:
+http://borel.slu.edu/crubadan/index.html
+"""
+
+from __future__ import print_function, unicode_literals
+
+import re
+from nltk.compat import PY3
+from os import path
+from nltk.corpus.reader import CorpusReader
+from nltk.probability import FreqDist
+from nltk.data import ZipFilePathPointer
+
+class CrubadanCorpusReader(CorpusReader):
+    """
+    A corpus reader used to access the An Crubadan n-gram files for each language.
+    """
+    
+    _LANG_MAPPER_FILE = 'table.txt'
+    _all_lang_freq = {}
+    
+    def __init__(self, root, fileids, encoding='utf8', tagset=None):
+        super(CrubadanCorpusReader, self).__init__(root, fileids, encoding='utf8')
+        self._lang_mapping_data = []
+        self._load_lang_mapping_data()
+        
+    def lang_freq(self, lang):
+        ''' Return n-gram FreqDist for a specific language
+            given ISO 639-3 language code '''
+        
+        if lang not in self._all_lang_freq:
+            self._all_lang_freq[lang] = self._load_lang_ngrams(lang)
+
+        return self._all_lang_freq[lang]
+    
+    def langs(self):
+        ''' Return a list of supported languages as ISO 639-3 codes '''
+        return [row[1] for row in self._lang_mapping_data]
+            
+    def iso_to_crubadan(self, lang):
+        ''' Return internal Crubadan code based on ISO 639-3 code '''
+        for i in self._lang_mapping_data:
+            if i[1].lower() == lang.lower():
+                return i[0]
+    
+    def crubadan_to_iso(self, lang):
+        ''' Return ISO 639-3 code given internal Crubadan code '''
+        for i in self._lang_mapping_data:
+            if i[0].lower() == lang.lower():
+                return i[1]
+    
+    def _load_lang_mapping_data(self):
+        ''' Load the language mappings between codes and descriptions from table.txt '''
+        if isinstance(self.root, ZipFilePathPointer):
+            raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()")
+        
+        mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
+        if self._LANG_MAPPER_FILE not in self.fileids():
+            raise RuntimeError("Could not find language mapper file: " + mapper_file)
+
+        if PY3:
+            raw = open(mapper_file, 'r', encoding='utf-8').read().strip()
+        else:
+            raw = open(mapper_file, 'rU').read().decode('utf-8').strip()
+
+        self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
+        
+    def _load_lang_ngrams(self, lang):
+        ''' Load single n-gram language file given the ISO 639-3 language code
+            and return its FreqDist '''
+
+        if lang not in self.langs():
+            raise RuntimeError("Unsupported language.")
+
+        crubadan_code = self.iso_to_crubadan(lang)
+        ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')
+
+        if not path.isfile(ngram_file):
+            raise RuntimeError("No N-gram file found for requested language.")
+
+        counts = FreqDist()
+        if PY3:
+            f = open(ngram_file, 'r', encoding='utf-8')
+        else:
+            f = open(ngram_file, 'rU')
+
+        for line in f:
+            if PY3:
+                data = line.split(' ')
+            else:
+                data = line.decode('utf8').split(' ')
+
+            ngram = data[1].strip('\n')
+            freq = int(data[0])
+            
+            counts[ngram] = freq
+            
+        return counts
+        
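A minimal sketch of querying the new reader through the lazy loader registered in nltk/corpus/__init__.py below, assuming the "crubadan" corpus has been downloaded ('eng' is assumed, not shown, to be among the available codes):

    from nltk.corpus import crubadan

    print(crubadan.langs()[:5])       # ISO 639-3 codes from table.txt
    fd = crubadan.lang_freq('eng')    # FreqDist of trigram counts for English
    print(fd.most_common(3))
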
diff --git a/nltk/corpus/reader/lin.py b/nltk/corpus/reader/lin.py
index d6b3d65..05aeb97 100644
--- a/nltk/corpus/reader/lin.py
+++ b/nltk/corpus/reader/lin.py
@@ -140,7 +140,7 @@ def demo():
     print(thes.synonyms(word1))
 
     print("Getting scored synonyms for " + word1)
-    print(thes.synonyms(word1))
+    print(thes.scored_synonyms(word1))
 
     print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
     print(thes.synonyms(word1, fileid="simN.lsp"))
diff --git a/nltk/data.py b/nltk/data.py
index e9e9fa4..93055a1 100644
--- a/nltk/data.py
+++ b/nltk/data.py
@@ -73,7 +73,7 @@ path = []
 
 # User-specified locations:
 path += [d for d in os.environ.get('NLTK_DATA', str('')).split(os.pathsep) if d]
-if os.path.expanduser('~/') != '~/':
+if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
     path.append(os.path.expanduser(str('~/nltk_data')))
 
 if sys.platform.startswith('win'):
diff --git a/nltk/downloader.py b/nltk/downloader.py
index 08390d4..9d9984a 100644
--- a/nltk/downloader.py
+++ b/nltk/downloader.py
@@ -924,6 +924,10 @@ class Downloader(object):
         permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
         ``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
         """
+        # Check if we are on GAE where we cannot write into filesystem.
+        if 'APPENGINE_RUNTIME' in os.environ:
+            return
+
         # Check if we have sufficient permissions to install in a
         # variety of system-wide locations.
         for nltkdir in nltk.data.path:
@@ -2267,4 +2271,3 @@ if __name__ == '__main__':
         downloader.download(download_dir=options.dir,
             quiet=options.quiet, force=options.force,
             halt_on_error=options.halt_on_error)
-
diff --git a/nltk/draw/dispersion.py b/nltk/draw/dispersion.py
index 2ba89e7..eddc36f 100644
--- a/nltk/draw/dispersion.py
+++ b/nltk/draw/dispersion.py
@@ -22,10 +22,10 @@ def dispersion_plot(text, words, ignore_case=False):
     """
 
     try:
-        import pylab
+        from matplotlib import pylab
     except ImportError:
-        raise ValueError('The plot function requires the matplotlib package (aka pylab).'
-                     'See http://matplotlib.sourceforge.net/')
+        raise ValueError('The plot function requires matplotlib to be installed.'
+                     'See http://matplotlib.org/')
 
     text = list(text)
     words.reverse()
diff --git a/nltk/draw/util.py b/nltk/draw/util.py
index 0d5c83c..468775b 100644
--- a/nltk/draw/util.py
+++ b/nltk/draw/util.py
@@ -1930,7 +1930,7 @@ class EntryDialog(object):
         self._original_text = original_text
         self._set_callback = set_callback
 
-        width = max(30, len(original_text)*3/2)
+        width = int(max(30, len(original_text)*3/2))
         self._top = Toplevel(parent)
 
         if title: self._top.title(title)
diff --git a/nltk/internals.py b/nltk/internals.py
index 4674d9e..cfe938e 100644
--- a/nltk/internals.py
+++ b/nltk/internals.py
@@ -500,7 +500,7 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
                         print('[Found %s: %s]' % (filename, path))
                     yielded = True
                     yield path
-            except (KeyboardInterrupt, SystemExit):
+            except (KeyboardInterrupt, SystemExit, OSError):
                 raise
             except:
                 pass
@@ -513,7 +513,7 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
         if searchpath:
             msg += '\n\n  Searched in:'
             msg += ''.join('\n    - %s' % d for d in searchpath)
-        if url: msg += ('\n\n  For more information, on %s, see:\n    <%s>' %
+        if url: msg += ('\n\n  For more information on %s, see:\n    <%s>' %
                         (filename, url))
         div = '='*75
         raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
@@ -592,6 +592,23 @@ def find_jar_iter(name_pattern, path_to_jar=None, env_vars=(),
                                 print('[Found %s: %s]' % (name_pattern, cp))
                             yielded = True
                             yield cp
+                    # The case where the user put the directory containing the jar file in the classpath
+                    if os.path.isdir(cp):
+                        if not is_regex:
+                            if os.path.isfile(os.path.join(cp,name_pattern)):
+                                if verbose:
+                                    print('[Found %s: %s]' % (name_pattern, cp))
+                                yielded = True
+                                yield os.path.join(cp,name_pattern)
+                        else:
+                            # Look for file using regular expression 
+                            for file_name in os.listdir(cp):
+                                if re.match(name_pattern,file_name):
+                                    if verbose:
+                                        print('[Found %s: %s]' % (name_pattern, os.path.join(cp,file_name)))
+                                    yielded = True
+                                    yield os.path.join(cp,file_name)
+                                
             else:
                 jar_env = os.environ[env_var]
                 jar_iter = ((os.path.join(jar_env, path_to_jar) for path_to_jar in os.listdir(jar_env))
diff --git a/nltk/metrics/agreement.py b/nltk/metrics/agreement.py
index 3a7f0ea..b379a07 100644
--- a/nltk/metrics/agreement.py
+++ b/nltk/metrics/agreement.py
@@ -232,8 +232,8 @@ class AnnotationTask(object):
         data = (x for x in self.data if x['coder'] in (cA, cB))
         for i, itemdata in self._grouped_data('item', data):
             # we should have two items; distance doesn't care which comes first
-            total += self.distance(itemdata.next()['labels'],
-                    itemdata.next()['labels'])
+            total += self.distance(next(itemdata)['labels'],
+                                   next(itemdata)['labels'])
 
         ret = total / (len(self.I) * max_distance)
         log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret)
diff --git a/nltk/parse/dependencygraph.py b/nltk/parse/dependencygraph.py
index 5646914..f5f6ef8 100755
--- a/nltk/parse/dependencygraph.py
+++ b/nltk/parse/dependencygraph.py
@@ -19,6 +19,7 @@ from collections import defaultdict
 from itertools import chain
 from pprint import pformat
 import subprocess
+import warnings
 
 from nltk.tree import Tree
 from nltk.compat import python_2_unicode_compatible, string_types
@@ -50,14 +51,21 @@ class DependencyGraph(object):
         are split by whitespace.
 
         """
-        self.nodes = defaultdict(lambda: {'deps': defaultdict(list)})
+        self.nodes = defaultdict(lambda:  {'address': None,
+                                           'word': None,
+                                           'lemma': None,
+                                           'ctag': None,
+                                           'tag': None,
+                                           'feats': None,
+                                           'head': None,
+                                           'deps': defaultdict(list),
+                                           'rel': None,
+                                           })
+
         self.nodes[0].update(
             {
-                'word': None,
-                'lemma': None,
                 'ctag': 'TOP',
                 'tag': 'TOP',
-                'feats': None,
                 'rel': 'TOP',
                 'address': 0,
             }
@@ -291,13 +299,14 @@ class DependencyGraph(object):
                 rel = 'ROOT'
             self.nodes[head]['deps'][rel].append(index)
 
-        if not self.nodes[0]['deps']['ROOT']:
-            raise DependencyGraphError(
-                "The graph does'n contain a node "
+        if self.nodes[0]['deps']['ROOT']:
+            root_address = self.nodes[0]['deps']['ROOT'][0]
+            self.root = self.nodes[root_address]
+        else:
+            warnings.warn(
+                "The graph doesn't contain a node "
                 "that depends on the root element."
             )
-        root_address = self.nodes[0]['deps']['ROOT'][0]
-        self.root = self.nodes[root_address]
 
     def _word(self, node, filter=True):
         w = node['word']
@@ -447,7 +456,7 @@ class DependencyGraph(object):
 
     def nx_graph(self):
         """Convert the data in a ``nodelist`` into a networkx labeled directed graph."""
-        import networkx as NX
+        import networkx
 
         nx_nodelist = list(range(1, len(self.nodes)))
         nx_edgelist = [
@@ -458,7 +467,7 @@ class DependencyGraph(object):
         for n in nx_nodelist:
             self.nx_labels[n] = self.nodes[n]['word']
 
-        g = NX.XDiGraph()
+        g = networkx.MultiDiGraph()
         g.add_nodes_from(nx_nodelist)
         g.add_edges_from(nx_edgelist)
 
@@ -504,19 +513,19 @@ Nov.    NNP     9       VMOD
     tree.pprint()
     if nx:
         # currently doesn't work
-        import networkx as NX
-        import pylab as P
+        import networkx
+        from matplotlib import pylab
 
         g = dg.nx_graph()
         g.info()
-        pos = NX.spring_layout(g, dim=1)
-        NX.draw_networkx_nodes(g, pos, node_size=50)
-        # NX.draw_networkx_edges(g, pos, edge_color='k', width=8)
-        NX.draw_networkx_labels(g, pos, dg.nx_labels)
-        P.xticks([])
-        P.yticks([])
-        P.savefig('tree.png')
-        P.show()
+        pos = networkx.spring_layout(g, dim=1)
+        networkx.draw_networkx_nodes(g, pos, node_size=50)
+        # networkx.draw_networkx_edges(g, pos, edge_color='k', width=8)
+        networkx.draw_networkx_labels(g, pos, dg.nx_labels)
+        pylab.xticks([])
+        pylab.yticks([])
+        pylab.savefig('tree.png')
+        pylab.show()
 
 
 def conll_demo():
@@ -545,13 +554,11 @@ def cycle_finding_demo():
     dg = DependencyGraph(treebank_data)
     print(dg.contains_cycle())
     cyclic_dg = DependencyGraph()
-    top = {'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0}
-    child1 = {'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1}
-    child2 = {'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2}
-    child3 = {'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3}
-    child4 = {'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4}
-    cyclic_dg.nodelist = [top, child1, child2, child3, child4]
-    cyclic_dg.root = top
+    cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0})
+    cyclic_dg.add_node({'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1})
+    cyclic_dg.add_node({'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2})
+    cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3})
+    cyclic_dg.add_node({'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4})
     print(cyclic_dg.contains_cycle())
 
 treebank_data = """Pierre  NNP     2       NMOD
diff --git a/nltk/parse/malt.py b/nltk/parse/malt.py
index e86dde4..b9455ed 100644
--- a/nltk/parse/malt.py
+++ b/nltk/parse/malt.py
@@ -15,7 +15,6 @@ from functools import reduce
 import subprocess
 
 from nltk.data import ZipFilePathPointer
-from nltk.tag import RegexpTagger
 from nltk.tokenize import word_tokenize
 from nltk.internals import find_binary
 
@@ -43,6 +42,7 @@ class MaltParser(ParserI):
         if tagger is not None:
             self.tagger = tagger
         else:
+            from nltk.tag import RegexpTagger
             self.tagger = RegexpTagger(
             [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
              (r'(The|the|A|a|An|an)$', 'AT'),   # articles
diff --git a/nltk/parse/nonprojectivedependencyparser.py b/nltk/parse/nonprojectivedependencyparser.py
index 6de271e..d880e68 100644
--- a/nltk/parse/nonprojectivedependencyparser.py
+++ b/nltk/parse/nonprojectivedependencyparser.py
@@ -14,7 +14,6 @@ import logging
 from nltk.compat import xrange
 
 from nltk.parse.dependencygraph import DependencyGraph
-from nltk.classify import NaiveBayesClassifier
 
 logger = logging.getLogger(__name__)
 
@@ -111,6 +110,8 @@ class NaiveBayesDependencyScorer(DependencyScorerI):
         :param graphs: A list of dependency graphs to train the scorer.
         """
 
+        from nltk.classify import NaiveBayesClassifier
+
         # Create training labeled training examples
         labeled_examples = []
         for graph in graphs:
diff --git a/nltk/parse/transitionparser.py b/nltk/parse/transitionparser.py
index 8a678d6..ae39701 100644
--- a/nltk/parse/transitionparser.py
+++ b/nltk/parse/transitionparser.py
@@ -16,8 +16,8 @@ from os import remove
 from copy import deepcopy
 from operator import itemgetter
 try:
-    from scipy import sparse
     from numpy import array
+    from scipy import sparse
     from sklearn.datasets import load_svmlight_file
     from sklearn import svm
 except ImportError:
@@ -328,11 +328,13 @@ class TransitionParser(ParserI):
     def _is_projective(self, depgraph):
         arc_list = []
         for key in depgraph.nodes:
-            node = depgraph.nodes[key]
+            node = depgraph.nodes[key]           
+            
             if 'head' in node:
                 childIdx = node['address']
                 parentIdx = node['head']
-                arc_list.append((parentIdx, childIdx))
+                if parentIdx is not None:
+                    arc_list.append((parentIdx, childIdx))
 
         for (parentIdx, childIdx) in arc_list:
             # Ensure that childIdx < parentIdx
@@ -756,14 +758,14 @@ def demo():
 
     >>> result = parser_std.parse([gold_sent], 'temp.arcstd.model')
     >>> de = DependencyEvaluator(result, [gold_sent])
-    >>> print(de.eval())
-    (0.125, 0.0)
+    >>> de.eval() >= (0, 0)
+    True
 
     B. Check the ARC-EAGER parser
     >>> result = parser_eager.parse([gold_sent], 'temp.arceager.model')
     >>> de = DependencyEvaluator(result, [gold_sent])
-    >>> print(de.eval())
-    (0.0, 0.0)
+    >>> de.eval() >= (0, 0)
+    True
 
     Note that result is very poor because of only one training example.
     """
diff --git a/nltk/probability.py b/nltk/probability.py
index 1ae001f..b63835f 100644
--- a/nltk/probability.py
+++ b/nltk/probability.py
@@ -226,10 +226,10 @@ class FreqDist(Counter):
         :type title: bool
         """
         try:
-            import pylab
+            from matplotlib import pylab
         except ImportError:
-            raise ValueError('The plot function requires the matplotlib package (aka pylab). '
-                         'See http://matplotlib.sourceforge.net/')
+            raise ValueError('The plot function requires matplotlib to be installed.'
+                         'See http://matplotlib.org/')
 
         if len(args) == 0:
             args = [len(self)]
@@ -1272,6 +1272,11 @@ class SimpleGoodTuringProbDist(ProbDistI):
             xy_cov += (x - x_mean) * (y - y_mean)
             x_var += (x - x_mean)**2
         self._slope = (xy_cov / x_var if x_var != 0 else 0.0)
+        if self._slope >= -1:
+            warnings.warn('SimpleGoodTuring did not find a proper best fit '
+                          'line for smoothing probabilities of occurrences. '
+                          'The probability estimates are likely to be '
+                          'unreliable.')
         self._intercept = y_mean - self._slope * x_mean
 
     def _switch(self, r, nr):
@@ -1516,9 +1521,9 @@ class KneserNeyProbDist(ProbDistI):
     """
     def __init__(self, freqdist, bins=None, discount=0.75):
         """
-        :param trigrams: The trigram frequency distribution upon which to base
+        :param freqdist: The trigram frequency distribution upon which to base
             the estimation
-        :type trigrams: FreqDist
+        :type freqdist: FreqDist
         :param bins: Included for compatibility with nltk.tag.hmm
         :type bins: int or float
         :param discount: The discount applied when retrieving counts of
@@ -1739,10 +1744,10 @@ class ConditionalFreqDist(defaultdict):
         :type conditions: list
         """
         try:
-            import pylab
+            from matplotlib import pylab
         except ImportError:
-            raise ValueError('The plot function requires the matplotlib package (aka pylab).'
-                             'See http://matplotlib.sourceforge.net/')
+            raise ValueError('The plot function requires matplotlib to be installed.'
+                         'See http://matplotlib.org/')
 
         cumulative = _get_kwarg(kwargs, 'cumulative', False)
         conditions = _get_kwarg(kwargs, 'conditions', sorted(self.conditions()))
diff --git a/nltk/sem/boxer.py b/nltk/sem/boxer.py
index 19a66e1..41d74d7 100644
--- a/nltk/sem/boxer.py
+++ b/nltk/sem/boxer.py
@@ -171,7 +171,7 @@ class Boxer(object):
 
         args = ['--box', 'false',
                 '--semantics', 'drs',
-                '--flat', 'false',
+                #'--flat', 'false', # removed from boxer
                 '--resolve', 'true',
                 '--elimeq', ['false','true'][self._elimeq],
                 '--format', 'prolog',
@@ -237,7 +237,9 @@ class Boxer(object):
                 i += 1
                 line = lines[i]
                 assert line.startswith('sem(%s,' % drs_id)
-                assert line.endswith(').')
+                if line[-4:] == "').'":
+                    line = line[:-4] + ")."
+                assert line.endswith(').'), "can't parse line: %s" % line
 
                 search_start = len('sem(%s,[' % drs_id)
                 brace_count = 1
@@ -248,7 +250,11 @@ class Boxer(object):
                     if(c == ']'):
                         brace_count -= 1
                         if(brace_count == 0):
-                            drs_start = search_start + j + 2
+                            drs_start = search_start + j + 1
+                            if line[drs_start:drs_start+3] == "','":
+                                drs_start = drs_start + 3
+                            else:
+                                drs_start = drs_start + 1
                             break
                 assert drs_start > -1
 
@@ -272,10 +278,8 @@ class BoxerOutputDrsParser(DrtParser):
         self.discourse_id = discourse_id
         self.sentence_id_offset = None
         self.quote_chars = [("'", "'", "\\", False)]
-        self._label_counter = None
 
     def parse(self, data, signature=None):
-        self._label_counter = Counter(-1)
         return DrtParser.parse(self, data, signature)
 
     def get_all_symbols(self):
@@ -339,6 +343,8 @@ class BoxerOutputDrsParser(DrtParser):
 
         elif tok == 'whq':
             conds = [self._handle_whq()]
+        elif tok == 'duplex':
+            conds = [self._handle_duplex()]
 
         else:
             conds = []
@@ -367,6 +373,38 @@ class BoxerOutputDrsParser(DrtParser):
             return BoxerPred(self.discourse_id, sent_index, word_indices, variable, name, pos, sense)
         return _handle_pred_f
 
+    def _handle_duplex(self):
+        #duplex(whq, drs(...), var, drs(...))
+        self.assertToken(self.token(), '(')
+        # self.assertToken(self.token(), '[')
+        ans_types = []
+        # while self.token(0) != ']':
+        #     cat = self.token()
+        #     self.assertToken(self.token(), ':')
+        #     if cat == 'des':
+        #         ans_types.append(self.token())
+        #     elif cat == 'num':
+        #         ans_types.append('number')
+        #         typ = self.token()
+        #         if typ == 'cou':
+        #             ans_types.append('count')
+        #         else:
+        #             ans_types.append(typ)
+        #     else:
+        #         ans_types.append(self.token())
+        # self.token() #swallow the ']'
+      
+        self.assertToken(self.token(), 'whq')
+        self.assertToken(self.token(), ',')
+        d1 = self.process_next_expression(None)
+        self.assertToken(self.token(), ',')
+        ref = self.parse_variable()
+        self.assertToken(self.token(), ',')
+        d2 = self.process_next_expression(None)
+        self.assertToken(self.token(), ')')
+        return lambda sent_index, word_indices: BoxerWhq(self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2)
+
+
     def _handle_named(self):
         #named(x0, john, per, 0)
         self.assertToken(self.token(), '(')
@@ -376,7 +414,7 @@ class BoxerOutputDrsParser(DrtParser):
         self.assertToken(self.token(), ',')
         type = self.token()
         self.assertToken(self.token(), ',')
-        sense = int(self.token())
+        sense = self.token() # as per boxer rev 2554
         self.assertToken(self.token(), ')')
         return lambda sent_index, word_indices: BoxerNamed(self.discourse_id, sent_index, word_indices, variable, name, type, sense)
 
@@ -504,7 +542,6 @@ class BoxerOutputDrsParser(DrtParser):
         #drs([[1001]:_G3943],
         #    [[1002]:pred(_G3943, dog, n, 0)]
         #   )
-        label = self._label_counter.get()
         self.assertToken(self.token(), '(')
         self.assertToken(self.token(), '[')
         refs = set()
@@ -524,7 +561,7 @@ class BoxerOutputDrsParser(DrtParser):
                 self.token() #swallow ','
         self.token() #swallow ']'
         self.assertToken(self.token(), ')')
-        return BoxerDrs(label, list(refs), conds)
+        return BoxerDrs(list(refs), conds)
 
     def _handle_binary_expression(self, make_callback):
         self.assertToken(self.token(), '(')
@@ -583,18 +620,18 @@ class BoxerOutputDrsParser(DrtParser):
         return lambda sent_index, word_indices: BoxerWhq(self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2)
 
     def _make_merge_expression(self, sent_index, word_indices, drs1, drs2):
-        return BoxerDrs(drs1.label, drs1.refs + drs2.refs, drs1.conds + drs2.conds)
+        return BoxerDrs(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
 
     def _make_or_expression(self, sent_index, word_indices, drs1, drs2):
         return BoxerOr(self.discourse_id, sent_index, word_indices, drs1, drs2)
 
     def _make_imp_expression(self, sent_index, word_indices, drs1, drs2):
-        return BoxerDrs(drs1.label, drs1.refs, drs1.conds, drs2)
+        return BoxerDrs(drs1.refs, drs1.conds, drs2)
 
     def parse_variable(self):
         var = self.token()
-        assert re.match('^[ex]\d+$', var), var
-        return int(var[1:])
+        assert re.match('^[exps]\d+$', var), var
+        return var
 
     def parse_index(self):
         return int(self.token())
@@ -631,16 +668,16 @@ class BoxerDrsParser(DrtParser):
 
     def handle(self, tok, context):
         try:
-            if tok == 'drs':
-                self.assertNextToken(DrtTokens.OPEN)
-                label = int(self.token())
-                self.assertNextToken(DrtTokens.COMMA)
-                refs = list(map(int, self.handle_refs()))
-                self.assertNextToken(DrtTokens.COMMA)
-                conds = self.handle_conds(None)
-                self.assertNextToken(DrtTokens.CLOSE)
-                return BoxerDrs(label, refs, conds)
-            elif tok == 'pred':
+#             if tok == 'drs':
+#                 self.assertNextToken(DrtTokens.OPEN)
+#                 label = int(self.token())
+#                 self.assertNextToken(DrtTokens.COMMA)
+#                 refs = list(map(int, self.handle_refs()))
+#                 self.assertNextToken(DrtTokens.COMMA)
+#                 conds = self.handle_conds(None)
+#                 self.assertNextToken(DrtTokens.CLOSE)
+#                 return BoxerDrs(label, refs, conds)
+            if tok == 'pred':
                 self.assertNextToken(DrtTokens.OPEN)
                 disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
                 self.assertNextToken(DrtTokens.COMMA)
@@ -715,7 +752,7 @@ class BoxerDrsParser(DrtParser):
                 self.assertNextToken(DrtTokens.COMMA)
                 drs2 = self.process_next_expression(None)
                 self.assertNextToken(DrtTokens.CLOSE)
-                return BoxerDrs(drs1.label, drs1.refs, drs1.conds, drs2)
+                return BoxerDrs(drs1.refs, drs1.conds, drs2)
             elif tok == 'or':
                 self.assertNextToken(DrtTokens.OPEN)
                 disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None]
@@ -829,9 +866,8 @@ class AbstractBoxerDrs(object):
 
 @python_2_unicode_compatible
 class BoxerDrs(AbstractBoxerDrs):
-    def __init__(self, label, refs, conds, consequent=None):
+    def __init__(self, refs, conds, consequent=None):
         AbstractBoxerDrs.__init__(self)
-        self.label = label
         self.refs = refs
         self.conds = conds
         self.consequent = consequent
@@ -854,23 +890,21 @@ class BoxerDrs(AbstractBoxerDrs):
 
     def clean(self):
         consequent = (self.consequent.clean() if self.consequent else None)
-        return BoxerDrs(self.label, self.refs, [c.clean() for c in self.conds], consequent)
+        return BoxerDrs(self.refs, [c.clean() for c in self.conds], consequent)
 
     def renumber_sentences(self, f):
         consequent = (self.consequent.renumber_sentences(f) if self.consequent else None)
-        return BoxerDrs(self.label, self.refs, [c.renumber_sentences(f) for c in self.conds], consequent)
+        return BoxerDrs(self.refs, [c.renumber_sentences(f) for c in self.conds], consequent)
 
     def __repr__(self):
-        s = 'drs(%s, [%s], [%s])' % (self.label,
-                                    ', '.join("%s" % r for r in self.refs),
-                                    ', '.join("%s" % c for c in self.conds))
+        s = 'drs([%s], [%s])' % (', '.join("%s" % r for r in self.refs),
+                                 ', '.join("%s" % c for c in self.conds))
         if self.consequent is not None:
             s = 'imp(%s, %s)' % (s, self.consequent)
         return s
 
     def __eq__(self, other):
         return self.__class__ == other.__class__ and \
-               self.label == other.label and \
                self.refs == other.refs and \
                len(self.conds) == len(other.conds) and \
                reduce(operator.and_, (c1==c2 for c1,c2 in zip(self.conds, other.conds))) and \
@@ -1151,9 +1185,7 @@ class NltkDrtBoxerDrsInterpreter(object):
         :return: ``DrtExpression``
         """
         if isinstance(ex, BoxerDrs):
-            drs = DRS([Variable('x%d' % r) for r in ex.refs], list(map(self.interpret, ex.conds)))
-            if ex.label is not None:
-                drs.label = Variable('x%d' % ex.label)
+            drs = DRS([Variable(r) for r in ex.refs], list(map(self.interpret, ex.conds)))
             if ex.consequent is not None:
                 drs.consequent = self.interpret(ex.consequent)
             return drs
@@ -1161,21 +1193,21 @@ class NltkDrtBoxerDrsInterpreter(object):
             return DrtNegatedExpression(self.interpret(ex.drs))
         elif isinstance(ex, BoxerPred):
             pred = self._add_occur_indexing('%s_%s' % (ex.pos, ex.name), ex)
-            return self._make_atom(pred, 'x%d' % ex.var)
+            return self._make_atom(pred, ex.var)
         elif isinstance(ex, BoxerNamed):
             pred = self._add_occur_indexing('ne_%s_%s' % (ex.type, ex.name), ex)
-            return self._make_atom(pred, 'x%d' % ex.var)
+            return self._make_atom(pred, ex.var)
         elif isinstance(ex, BoxerRel):
             pred = self._add_occur_indexing('%s' % (ex.rel), ex)
-            return self._make_atom(pred, 'x%d' % ex.var1, 'x%d' % ex.var2)
+            return self._make_atom(pred, ex.var1, ex.var2)
         elif isinstance(ex, BoxerProp):
-            return DrtProposition(Variable('x%d' % ex.var), self.interpret(ex.drs))
+            return DrtProposition(Variable(ex.var), self.interpret(ex.drs))
         elif isinstance(ex, BoxerEq):
-            return DrtEqualityExpression(DrtVariableExpression(Variable('x%d' % ex.var1)),
-                                         DrtVariableExpression(Variable('x%d' % ex.var2)))
+            return DrtEqualityExpression(DrtVariableExpression(Variable(ex.var1)),
+                                         DrtVariableExpression(Variable(ex.var2)))
         elif isinstance(ex, BoxerCard):
             pred = self._add_occur_indexing('card_%s_%s' % (ex.type, ex.value), ex)
-            return self._make_atom(pred, 'x%d' % ex.var)
+            return self._make_atom(pred, ex.var)
         elif isinstance(ex, BoxerOr):
             return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2))
         elif isinstance(ex, BoxerWhq):
@@ -1223,4 +1255,4 @@ if __name__ == '__main__':
         if options.fol:
             print(drs.fol().normalize())
         else:
-            drs.normalize().pprint()
+            drs.pretty_print()
diff --git a/nltk/sem/drt.py b/nltk/sem/drt.py
index 3e157de..f433c37 100644
--- a/nltk/sem/drt.py
+++ b/nltk/sem/drt.py
@@ -1217,12 +1217,12 @@ def demo():
     print(resolve_anaphora(dexpr(r'([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])')))
     print(resolve_anaphora(dexpr(r'(([x,y],[]) + ([],[PRO(x)]))')))
 
-    print('='*20 + 'Test pprint()' + '='*20)
-    dexpr(r"([],[])").pprint()
-    dexpr(r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])").pprint()
-    dexpr(r"([x,y],[x=y]) + ([z],[dog(z), walk(z)])").pprint()
-    dexpr(r"([],[([x],[]) | ([y],[]) | ([z],[dog(z), walk(z)])])").pprint()
-    dexpr(r"\P.\Q.(([x],[]) + P(x) + Q(x))(\x.([],[dog(x)]))").pprint()
+    print('='*20 + 'Test pretty_print()' + '='*20)
+    dexpr(r"([],[])").pretty_print()
+    dexpr(r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])").pretty_print()
+    dexpr(r"([x,y],[x=y]) + ([z],[dog(z), walk(z)])").pretty_print()
+    dexpr(r"([],[([x],[]) | ([y],[]) | ([z],[dog(z), walk(z)])])").pretty_print()
+    dexpr(r"\P.\Q.(([x],[]) + P(x) + Q(x))(\x.([],[dog(x)]))").pretty_print()
 
 
 def test_draw():
diff --git a/nltk/stem/snowball.py b/nltk/stem/snowball.py
index 3fea2b3..1f4b751 100644
--- a/nltk/stem/snowball.py
+++ b/nltk/stem/snowball.py
@@ -2533,7 +2533,7 @@ class PortugueseStemmer(_StandardStemmer):
                         word = suffix_replace(word, suffix, "log")
                         rv = suffix_replace(rv, suffix, "log")
 
-                    elif suffix in ("ução", "uções"):
+                    elif suffix in ("uça~o", "uço~es"):
                         word = suffix_replace(word, suffix, "u")
                         rv = suffix_replace(rv, suffix, "u")
 
diff --git a/nltk/tag/__init__.py b/nltk/tag/__init__.py
index c9ce8d5..1dce867 100644
--- a/nltk/tag/__init__.py
+++ b/nltk/tag/__init__.py
@@ -71,7 +71,7 @@ from nltk.tag.brill         import BrillTagger
 from nltk.tag.brill_trainer import BrillTaggerTrainer
 from nltk.tag.tnt           import TnT
 from nltk.tag.hunpos        import HunposTagger
-from nltk.tag.stanford      import StanfordTagger
+from nltk.tag.stanford      import StanfordTagger, StanfordPOSTagger, StanfordNERTagger
 from nltk.tag.hmm           import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
 from nltk.tag.senna         import SennaTagger, SennaChunkTagger, SennaNERTagger
 from nltk.tag.mapping       import tagset_mapping, map_tag
@@ -83,7 +83,7 @@ from nltk.data import load
 # Standard treebank POS tagger
 _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
 
-def pos_tag(tokens):
+def pos_tag(tokens, tagset=None):
     """
     Use NLTK's currently recommended part of speech tagger to
     tag the given list of tokens.
@@ -101,6 +101,8 @@ def pos_tag(tokens):
     :rtype: list(tuple(str, str))
     """
     tagger = load(_POS_TAGGER)
+    if tagset:
+        return [(token, map_tag('en-ptb', tagset, tag)) for (token, tag) in tagger.tag(tokens)]
     return tagger.tag(tokens)
 
 def pos_tag_sents(sentences):
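
The new `tagset` argument maps the tagger's Penn Treebank tags onto another
tagset via `map_tag('en-ptb', tagset, tag)`.  A minimal usage sketch (the
output shown is illustrative and assumes the default maxent treebank tagger
model has been downloaded):

    >>> from nltk.tag import pos_tag
    >>> pos_tag(['This', 'is', 'easy'], tagset='universal') # doctest: +SKIP
    [('This', 'DET'), ('is', 'VERB'), ('easy', 'ADJ')]
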
diff --git a/nltk/tag/crf.py b/nltk/tag/crf.py
index 3842d87..0825944 100644
--- a/nltk/tag/crf.py
+++ b/nltk/tag/crf.py
@@ -24,7 +24,7 @@ class CRFTagger(TaggerI):
     """
     A module for POS tagging using CRFSuite https://pypi.python.org/pypi/python-crfsuite
     
-    >>> from nltk.tag.crf import CRFTagger
+    >>> from nltk.tag import CRFTagger
     >>> ct = CRFTagger()
  
     >>> train_data = [[('University','Noun'), ('is','Verb'), ('a','Det'), ('good','Adj'), ('place','Noun')],
diff --git a/nltk/tag/hunpos.py b/nltk/tag/hunpos.py
index 82e0560..f3ba445 100644
--- a/nltk/tag/hunpos.py
+++ b/nltk/tag/hunpos.py
@@ -33,7 +33,7 @@ class HunposTagger(TaggerI):
 
     Example:
 
-        >>> from nltk.tag.hunpos import HunposTagger
+        >>> from nltk.tag import HunposTagger
         >>> ht = HunposTagger('en_wsj.model')
         >>> ht.tag('What is the airspeed of an unladen swallow ?'.split())
         [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')]
diff --git a/nltk/tag/stanford.py b/nltk/tag/stanford.py
index 3ce7575..c88aff9 100644
--- a/nltk/tag/stanford.py
+++ b/nltk/tag/stanford.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# Natural Language Toolkit: Interface to the Stanford NER-tagger
+# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers
 #
 # Copyright (C) 2001-2015 NLTK Project
 # Author: Nitin Madnani <nmadnani at ets.org>
@@ -9,6 +9,12 @@
 
 """
 A module for interfacing with the Stanford taggers.
+
+Tagger models need to be downloaded from http://nlp.stanford.edu/software,
+and the STANFORD_MODELS environment variable must be set to point to them
+(a colon-separated list of paths).
+
+For more details see the documentation for StanfordPOSTagger and StanfordNERTagger.
 """
 
 import os
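
Because the model is now located via the STANFORD_MODELS environment variable,
a typical setup might look like the following sketch (the paths are
illustrative only and depend on where the downloaded models were unpacked):

    >>> import os
    >>> os.environ['STANFORD_MODELS'] = '/usr/share/stanford-postagger/models:/usr/share/stanford-ner/classifiers' # doctest: +SKIP
    >>> from nltk.tag import StanfordPOSTagger
    >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') # doctest: +SKIP
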
@@ -36,17 +42,17 @@ class StanfordTagger(TaggerI):
     _SEPARATOR = ''
     _JAR = ''
 
-    def __init__(self, path_to_model, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
+    def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
 
         if not self._JAR:
             warnings.warn('The StanfordTagger class is not meant to be '
-                    'instantiated directly. Did you mean POS- or NERTagger?')
+                    'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')
         self._stanford_jar = find_jar(
                 self._JAR, path_to_jar,
                 searchpath=(), url=_stanford_url,
                 verbose=verbose)
 
-        self._stanford_model = find_file(path_to_model,
+        self._stanford_model = find_file(model_filename,
                 env_vars=('STANFORD_MODELS',), verbose=verbose)
         self._encoding = encoding
         self.java_options = java_options
@@ -56,7 +62,8 @@ class StanfordTagger(TaggerI):
       raise NotImplementedError
 
     def tag(self, tokens):
-        return list(self.tag_sents([tokens]))
+        # tag_sents() returns a list of tagged sentences; flatten it so that
+        # tag() returns a single list of (token, tag) tuples.
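+        # e.g. sum([[('the', 'DT'), ('dog', 'NN')]], []) == [('the', 'DT'), ('dog', 'NN')]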
+        return sum(self.tag_sents([tokens]), []) 
 
     def tag_sents(self, sentences):
         encoding = self._encoding
@@ -80,16 +87,16 @@ class StanfordTagger(TaggerI):
         stanpos_output, _stderr = java(self._cmd,classpath=self._stanford_jar,
                                                        stdout=PIPE, stderr=PIPE)
         stanpos_output = stanpos_output.decode(encoding)
-
+        
         # Delete the temporary file
         os.unlink(self._input_file_path) 
 
         # Return java configurations to their default values
         config_java(options=default_options, verbose=False)
                 
-        return self.parse_output(stanpos_output)
+        return self.parse_output(stanpos_output, sentences)
 
-    def parse_output(self, text):
+    def parse_output(self, text, sentences=None):
         # Output the tagged sentences
         tagged_sentences = []
         for tagged_sentence in text.strip().split("\n"):
@@ -100,7 +107,7 @@ class StanfordTagger(TaggerI):
             tagged_sentences.append(sentence)
         return tagged_sentences
 
-class POSTagger(StanfordTagger):
+class StanfordPOSTagger(StanfordTagger):
     """
     A class for pos tagging with Stanford Tagger. The input is the paths to:
      - a model trained on training data
@@ -110,9 +117,8 @@ class POSTagger(StanfordTagger):
 
     Example:
 
-        >>> from nltk.tag.stanford import POSTagger
-        >>> st = POSTagger('/usr/share/stanford-postagger/models/english-bidirectional-distsim.tagger',
-        ...                '/usr/share/stanford-postagger/stanford-postagger.jar') # doctest: +SKIP
+        >>> from nltk.tag import StanfordPOSTagger
+        >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') # doctest: +SKIP
         >>> st.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
         [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
     """
@@ -121,7 +127,7 @@ class POSTagger(StanfordTagger):
     _JAR = 'stanford-postagger.jar'
 
     def __init__(self, *args, **kwargs):
-        super(POSTagger, self).__init__(*args, **kwargs)
+        super(StanfordPOSTagger, self).__init__(*args, **kwargs)
 
     @property
     def _cmd(self):
@@ -129,9 +135,9 @@ class POSTagger(StanfordTagger):
                 '-model', self._stanford_model, '-textFile',
                 self._input_file_path, '-tokenize', 'false','-outputFormatOptions', 'keepEmptySentences']
 
-class NERTagger(StanfordTagger):
+class StanfordNERTagger(StanfordTagger):
     """
-    A class for ner tagging with Stanford Tagger. The input is the paths to:
+    A class for named-entity tagging with the Stanford Tagger. The input is the paths to:
 
     - a model trained on training data
     - (optionally) the path to the stanford tagger jar file. If not specified here,
@@ -140,9 +146,8 @@ class NERTagger(StanfordTagger):
 
     Example:
 
-        >>> from nltk.tag.stanford import NERTagger
-        >>> st = NERTagger('/usr/share/stanford-ner/classifiers/all.3class.distsim.crf.ser.gz',
-        ...                '/usr/share/stanford-ner/stanford-ner.jar') # doctest: +SKIP
+        >>> from nltk.tag import StanfordNERTagger
+        >>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP
         >>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
         [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'),
          ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'),
@@ -154,7 +159,7 @@ class NERTagger(StanfordTagger):
     _FORMAT = 'slashTags'
 
     def __init__(self, *args, **kwargs):
-        super(NERTagger, self).__init__(*args, **kwargs)
+        super(StanfordNERTagger, self).__init__(*args, **kwargs)
 
     @property
     def _cmd(self):
@@ -163,10 +168,24 @@ class NERTagger(StanfordTagger):
                 '-loadClassifier', self._stanford_model, '-textFile',
                 self._input_file_path, '-outputFormat', self._FORMAT, '-tokenizerFactory', 'edu.stanford.nlp.process.WhitespaceTokenizer', '-tokenizerOptions','\"tokenizeNLs=false\"']
 
-    def parse_output(self, text):
-      if self._FORMAT == 'slashTags':
-        return super(NERTagger, self).parse_output(text)
-      raise NotImplementedError
+    def parse_output(self, text, sentences):
+        if self._FORMAT == 'slashTags':
+            # Join all tagged words into one flat list
+            tagged_sentences = []
+            for tagged_sentence in text.strip().split("\n"):
+                for tagged_word in tagged_sentence.strip().split():
+                    word_tags = tagged_word.strip().split(self._SEPARATOR)
+                    tagged_sentences.append((''.join(word_tags[:-1]), word_tags[-1]))
+                
+            # Split the flat list back into per-sentence lists, using the
+            # lengths of the input sentences
+            result = []
+            start = 0
+            for sent in sentences:
+                result.append(tagged_sentences[start:start + len(sent)])
+                start += len(sent)
+            return result
+
+        raise NotImplementedError
 
 
 if __name__ == "__main__":
diff --git a/nltk/test/crubadan.doctest b/nltk/test/crubadan.doctest
new file mode 100644
index 0000000..c45fe91
--- /dev/null
+++ b/nltk/test/crubadan.doctest
@@ -0,0 +1,65 @@
+.. Copyright (C) 2001-2015 NLTK Project
+.. For license information, see LICENSE.TXT
+
+Crubadan Corpus Reader
+======================
+
+Crubadan is an NLTK corpus reader for ngram files provided
+by the Crubadan project. It supports several languages.
+
+    >>> from nltk.corpus import crubadan
+    >>> crubadan.langs() # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+    ['abk', 'abn',..., 'zpa', 'zul']
+
+----------------------------------------
+Language code mapping and helper methods
+----------------------------------------
+
+The web crawler that generates the 3-gram frequencies works at the
+level of "writing systems" rather than languages. Writing systems
+are assigned internal 2-3 letter codes that require mapping to the
+standard ISO 639-3 codes. For more information, please refer to
+the README in the nltk_data/crubadan folder after installing the corpus.
+
+To translate an ISO 639-3 code to its Crubadan code:
+
+    >>> crubadan.iso_to_crubadan('eng')
+    'en'
+    >>> crubadan.iso_to_crubadan('fra')
+    'fr'
+    >>> crubadan.iso_to_crubadan('aaa')
+
+Conversely, to translate a Crubadan code to its ISO 639-3 code:
+
+    >>> crubadan.crubadan_to_iso('en')
+    'eng'
+    >>> crubadan.crubadan_to_iso('fr')
+    'fra'
+    >>> crubadan.crubadan_to_iso('aa')
+
+---------------------------
+Accessing ngram frequencies
+---------------------------
+
+On initialization the reader will create a dictionary of every
+language supported by the Crubadan project, mapping the ISO 639-3
+language code to its corresponding ngram frequency.
+
+You can access an individual language's FreqDist, and the ngrams within it, as follows:
+
+    >>> english_fd = crubadan.lang_freq('eng')
+    >>> english_fd['the']
+    728135
+
+The example above accesses the FreqDist for English and returns the frequency of the ngram 'the'.
+An ngram that isn't found within the language will return 0:
+
+    >>> english_fd['sometest']
+    0
+
+A language that isn't supported will raise an exception:
+
+    >>> crubadan.lang_freq('elvish')
+    Traceback (most recent call last):
+    ...
+    RuntimeError: Unsupported language.
diff --git a/nltk/test/gensim.doctest b/nltk/test/gensim.doctest
new file mode 100644
index 0000000..4e7e176
--- /dev/null
+++ b/nltk/test/gensim.doctest
@@ -0,0 +1,144 @@
+.. Copyright (C) 2001-2015 NLTK Project
+.. For license information, see LICENSE.TXT
+
+=========================================================
+ Demonstrate word embeddings using the Gensim package
+=========================================================
+
+    >>> import gensim  
+
+Overview
+~~~~~~~~
+Using the Gensim package, we demonstrate three functions:
+
+- Train word embeddings on the Brown corpus.
+- Load a pre-trained model and perform simple tasks with it.
+- Prune the pre-trained binary model.
+ 
+Train the model
+~~~~~~~~~~~~~~~
+The word embeddings are trained on the Brown corpus.
+
+    >>> from nltk.corpus import brown
+    >>> model = gensim.models.Word2Vec(brown.sents())
+
+Training the model might take some time; once it is trained, you will probably want to save it so that you can reload it later.
+
+    >>> model.save('brown.embedding')
+    >>> new_model = gensim.models.Word2Vec.load('brown.embedding')
+
+The model maps each word in its vocabulary to an embedding vector. We can easily get the vector representation of a word.
+
+    >>> len(new_model['university'])
+    100
+
+Gensim already provides supporting functions for manipulating word embeddings.
+For example, to compute the cosine similarity between two words:
+
+    >>> new_model.similarity('university','school') > 0.3
+    True
+
+Using the pre-trained model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+NLTK also includes a pre-trained model which is part of a model trained on 100 billion words from the Google News dataset.
+The full model, available from https://code.google.com/p/word2vec/, is about 3 GB.
+
+    >>> from nltk.data import find
+    >>> word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.bin'))
+    >>> model = gensim.models.Word2Vec.load(word2vec_sample)
+
+We pruned the model to include only the most common words (~44k words).
+
+    >>> len(model.vocab)
+    43981
+
+Each word is represented as a vector with 300 dimensions.
+
+    >>> len(model['university'])
+    300
+
+Finding the top n words most similar to a target word is simple. The result is a list of n words with their similarity scores.
+
+    >>> model.most_similar(positive=['university'], topn = 3)
+    [(u'universities', 0.7003918886184692), (u'faculty', 0.6780908703804016), (u'undergraduate', 0.6587098240852356)]
+
+Finding the word that does not belong in a list is also supported, although implementing this yourself would be simple.
+
+    >>> model.doesnt_match('breakfast cereal dinner lunch'.split())
+    'cereal'
+
+Mikolov et al. (2013) showed that word embeddings capture many syntactic and semantic regularities. For example,
+the vector 'King - Man + Woman' is close to 'Queen', and 'Germany - Berlin + Paris' is close to 'France'.
+
+    >>> model.most_similar(positive=['woman','king'], negative=['man'], topn = 1)
+    [(u'queen', 0.7118192911148071)]
+    
+    >>> model.most_similar(positive=['Paris','Germany'], negative=['Berlin'], topn = 1)
+    [(u'France', 0.7884092926979065)]
+    
+We can visualize the word embeddings using t-SNE (http://lvdmaaten.github.io/tsne/). For this demo, we visualize only the first 1000 words;
+you can increase this limit to plot more.
+
+    import numpy as np
+    labels = []
+    count = 0
+    max_count = 1000
+    X = np.zeros(shape=(max_count, len(model['university'])))
+
+    for term in model.vocab:
+        X[count] = model[term]
+        labels.append(term)
+        count += 1
+        if count >= max_count:
+            break
+
+    # It is recommended to use PCA first to reduce to ~50 dimensions
+    from sklearn.decomposition import PCA
+    pca = PCA(n_components=50)
+    X_50 = pca.fit_transform(X)
+
+    # Using TSNE to further reduce to 2 dimensions
+    from sklearn.manifold import TSNE
+    model_tsne = TSNE(n_components=2, random_state=0)
+    Y = model_tsne.fit_transform(X_50)
+
+    # Show the scatter plot
+    import matplotlib.pyplot as plt
+    plt.scatter(Y[:, 0], Y[:, 1], 20)
+
+    # Add labels
+    for label, x, y in zip(labels, Y[:, 0], Y[:, 1]):
+        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points', size=10)
+
+    plt.show()
+
+
+Prune the trained binary model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Here is the supporting code to extract part of the binary model (GoogleNews-vectors-negative300.bin.gz) from https://code.google.com/p/word2vec/.
+We used this code to produce the `word2vec_sample` model.
+
+    import gensim
+    from gensim.models.word2vec import Word2Vec
+
+    # Load the full binary model
+    model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
+
+    # Only keep words that appear in the Brown corpus
+    from nltk.corpus import brown
+    words = set(brown.words())
+    print(len(words))
+
+    # Write the retained words and their vectors to a temporary text file
+    out_file = 'pruned.word2vec.txt'
+    f = open(out_file, 'w')
+
+    word_presented = words.intersection(model.vocab.keys())
+    f.write('{} {}\n'.format(len(word_presented), len(model['word'])))
+
+    for word in word_presented:
+        f.write('{} {}\n'.format(word, ' '.join(str(value) for value in model[word])))
+
+    f.close()
+
+    # Reload the model from the text file
+    new_model = Word2Vec.load_word2vec_format(out_file, binary=False)
+
+    # Save it in Gensim's native format
+    gensim_model = "pruned.word2vec.bin"
+    new_model.save(gensim_model)
+
+    # Load the model again
+    very_new_model = gensim.models.Word2Vec.load(gensim_model)
+
+    # Test it
+    very_new_model.most_similar(positive=['king','woman'], negative=['man'], topn=1)
\ No newline at end of file
diff --git a/nltk/test/unit/test_tgrep.py b/nltk/test/unit/test_tgrep.py
new file mode 100644
index 0000000..224fb47
--- /dev/null
+++ b/nltk/test/unit/test_tgrep.py
@@ -0,0 +1,626 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Natural Language Toolkit: TGrep search
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Will Roberts <wildwilhelm at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+'''
+Unit tests for nltk.tgrep.
+'''
+
+from __future__ import absolute_import, print_function, unicode_literals
+from nltk.compat import b
+from nltk.tree import ParentedTree
+from nltk import tgrep
+import unittest
+
+class TestSequenceFunctions(unittest.TestCase):
+
+    '''
+    Class containing unit tests for nltk.tgrep.
+    '''
+
+    def test_tokenize_simple(self):
+        '''
+        Simple test of tokenization.
+        '''
+        tokens = tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]')
+        self.assertEqual(tokens,
+                         ['A', '..', '(', 'B', '!', '<', 'C', '.', 'D', ')',
+                          '|', '!', '[', '<<', '(', 'E', ',', 'F', ')', '$',
+                          'G', ']'])
+
+    def test_tokenize_encoding(self):
+        '''
+        Test that tokenization handles bytes and strs the same way.
+        '''
+        self.assertEqual(
+            tgrep.tgrep_tokenize(b('A .. (B !< C . D) | ![<< (E , F) $ G]')),
+            tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]'))
+
+    def test_tokenize_link_types(self):
+        '''
+        Test tokenization of basic link types.
+        '''
+        self.assertEqual(tgrep.tgrep_tokenize('A<B'),     ['A', '<', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A>B'),     ['A', '>', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A<3B'),    ['A', '<3', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A>3B'),    ['A', '>3', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A<,B'),    ['A', '<,', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A>,B'),    ['A', '>,', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A<-3B'),   ['A', '<-3', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A>-3B'),   ['A', '>-3', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A<-B'),    ['A', '<-', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A>-B'),    ['A', '>-', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A<\'B'),   ['A', '<\'', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A>\'B'),   ['A', '>\'', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A<:B'),    ['A', '<:', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A>:B'),    ['A', '>:', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A<<B'),    ['A', '<<', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A>>B'),    ['A', '>>', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A<<,B'),   ['A', '<<,', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A>>,B'),   ['A', '>>,', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A<<\'B'),  ['A', '<<\'', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A>>\'B'),  ['A', '>>\'', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A<<:B'),   ['A', '<<:', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A>>:B'),   ['A', '>>:', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A.B'),     ['A', '.', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A,B'),     ['A', ',', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A..B'),    ['A', '..', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A,,B'),    ['A', ',,', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A$B'),     ['A', '$', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A$.B'),    ['A', '$.', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A$,B'),    ['A', '$,', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A$..B'),   ['A', '$..', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A$,,B'),   ['A', '$,,', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!<B'),    ['A', '!', '<', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!>B'),    ['A', '!', '>', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!<3B'),   ['A', '!', '<3', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!>3B'),   ['A', '!', '>3', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!<,B'),   ['A', '!', '<,', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!>,B'),   ['A', '!', '>,', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!<-3B'),
+                         ['A', '!', '<-3', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!>-3B'),
+                         ['A', '!', '>-3', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!<-B'),   ['A', '!', '<-', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!>-B'),   ['A', '!', '>-', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!<\'B'),
+                         ['A', '!', '<\'', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!>\'B'),
+                         ['A', '!', '>\'', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!<:B'),   ['A', '!', '<:', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!>:B'),   ['A', '!', '>:', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!<<B'),   ['A', '!', '<<', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!>>B'),   ['A', '!', '>>', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!<<,B'),
+                         ['A', '!', '<<,', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!>>,B'),
+                         ['A', '!', '>>,', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!<<\'B'),
+                         ['A', '!', '<<\'', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!>>\'B'),
+                         ['A', '!', '>>\'', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!<<:B'),
+                         ['A', '!', '<<:', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!>>:B'),
+                         ['A', '!', '>>:', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!.B'),    ['A', '!', '.', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!,B'),    ['A', '!', ',', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!..B'),   ['A', '!', '..', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!,,B'),   ['A', '!', ',,', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!$B'),    ['A', '!', '$', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!$.B'),   ['A', '!', '$.', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!$,B'),   ['A', '!', '$,', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!$..B'),
+                         ['A', '!', '$..', 'B'])
+        self.assertEqual(tgrep.tgrep_tokenize('A!$,,B'),
+                         ['A', '!', '$,,', 'B'])
+
+    def test_tokenize_examples(self):
+        '''
+        Test tokenization of the TGrep2 manual example patterns.
+        '''
+        self.assertEqual(tgrep.tgrep_tokenize('NP < PP'),
+                         ['NP', '<', 'PP'])
+        self.assertEqual(tgrep.tgrep_tokenize('/^NP/'),
+                         ['/^NP/'])
+        self.assertEqual(tgrep.tgrep_tokenize('NP << PP . VP'),
+                         ['NP', '<<', 'PP', '.', 'VP'])
+        self.assertEqual(tgrep.tgrep_tokenize('NP << PP | . VP'),
+                         ['NP', '<<', 'PP', '|', '.', 'VP'])
+        self.assertEqual(tgrep.tgrep_tokenize('NP !<< PP [> NP | >> VP]'),
+                         ['NP', '!', '<<', 'PP', '[', '>', 'NP', '|',
+                          '>>', 'VP', ']'])
+        self.assertEqual(tgrep.tgrep_tokenize('NP << (PP . VP)'),
+                         ['NP', '<<', '(', 'PP', '.', 'VP', ')'])
+        self.assertEqual(tgrep.tgrep_tokenize('NP <\' (PP <, (IN < on))'),
+                         ['NP', '<\'', '(', 'PP', '<,', '(', 'IN', '<',
+                          'on', ')', ')'])
+        self.assertEqual(tgrep.tgrep_tokenize('S < (A < B) < C'),
+                         ['S', '<', '(', 'A', '<', 'B', ')', '<', 'C'])
+        self.assertEqual(tgrep.tgrep_tokenize('S < ((A < B) < C)'),
+                         ['S', '<', '(', '(', 'A', '<', 'B', ')',
+                          '<', 'C', ')'])
+        self.assertEqual(tgrep.tgrep_tokenize('S < (A < B < C)'),
+                         ['S', '<', '(', 'A', '<', 'B', '<', 'C', ')'])
+        self.assertEqual(tgrep.tgrep_tokenize('A<B&.C'),
+                         ['A', '<', 'B', '&', '.', 'C'])
+
+    def test_tokenize_quoting(self):
+        '''
+        Test tokenization of quoting.
+        '''
+        self.assertEqual(tgrep.tgrep_tokenize('"A<<:B"<<:"A $.. B"<"A>3B"<C'),
+                         ['"A<<:B"', '<<:', '"A $.. B"', '<', '"A>3B"',
+                          '<', 'C'])
+
+    def test_tokenize_nodenames(self):
+        '''
+        Test tokenization of node names.
+        '''
+        self.assertEqual(tgrep.tgrep_tokenize('Robert'), ['Robert'])
+        self.assertEqual(tgrep.tgrep_tokenize('/^[Bb]ob/'), ['/^[Bb]ob/'])
+        self.assertEqual(tgrep.tgrep_tokenize('*'), ['*'])
+        self.assertEqual(tgrep.tgrep_tokenize('__'), ['__'])
+        # test tokenization of NLTK tree position syntax
+        self.assertEqual(tgrep.tgrep_tokenize('N()'),
+                         ['N(', ')'])
+        self.assertEqual(tgrep.tgrep_tokenize('N(0,)'),
+                         ['N(', '0', ',', ')'])
+        self.assertEqual(tgrep.tgrep_tokenize('N(0,0)'),
+                         ['N(', '0', ',', '0', ')'])
+        self.assertEqual(tgrep.tgrep_tokenize('N(0,0,)'),
+                         ['N(', '0', ',', '0', ',', ')'])
+
+    def test_tokenize_macros(self):
+        '''
+        Test tokenization of macro definitions.
+        '''
+        self.assertEqual(tgrep.tgrep_tokenize(
+            '@ NP /^NP/;\n@ NN /^NN/;\n@NP [!< NP | < @NN] !$.. @NN'),
+                         ['@', 'NP', '/^NP/', ';', '@', 'NN', '/^NN/', ';',
+                          '@NP', '[', '!', '<', 'NP', '|', '<', '@NN', ']',
+                          '!', '$..', '@NN'])
+
+    def test_node_simple(self):
+        '''
+        Test a simple use of tgrep for finding nodes matching a given
+        pattern.
+        '''
+        tree = ParentedTree.fromstring(
+            '(S (NP (DT the) (JJ big) (NN dog)) '
+            '(VP bit) (NP (DT a) (NN cat)))')
+        self.assertEqual(list(tgrep.tgrep_positions('NN', [tree])),
+                         [[(0,2), (2,1)]])
+        self.assertEqual(list(tgrep.tgrep_nodes('NN', [tree])),
+                         [[tree[0,2], tree[2,1]]])
+        self.assertEqual(list(tgrep.tgrep_positions('NN|JJ', [tree])),
+                         [[(0, 1), (0, 2), (2, 1)]])
+
+    def test_node_printing(self):
+        '''Test that the tgrep print operator ' is properly ignored.'''
+        tree = ParentedTree.fromstring('(S (n x) (N x))')
+        self.assertEqual(list(tgrep.tgrep_positions('N', [tree])),
+                         list(tgrep.tgrep_positions('\'N', [tree])))
+        self.assertEqual(list(tgrep.tgrep_positions('/[Nn]/', [tree])),
+                         list(tgrep.tgrep_positions('\'/[Nn]/', [tree])))
+
+    def test_node_encoding(self):
+        '''
+        Test that tgrep search strings handles bytes and strs the same
+        way.
+        '''
+        tree = ParentedTree.fromstring(
+            '(S (NP (DT the) (JJ big) (NN dog)) '
+            '(VP bit) (NP (DT a) (NN cat)))')
+        self.assertEqual(list(tgrep.tgrep_positions(b('NN'), [tree])),
+                         list(tgrep.tgrep_positions('NN', [tree])))
+        self.assertEqual(list(tgrep.tgrep_nodes(b('NN'), [tree])),
+                         list(tgrep.tgrep_nodes('NN', [tree])))
+        self.assertEqual(list(tgrep.tgrep_positions(b('NN|JJ'), [tree])),
+                         list(tgrep.tgrep_positions('NN|JJ', [tree])))
+
+    def test_node_nocase(self):
+        '''
+        Test selecting nodes using case insensitive node names.
+        '''
+        tree = ParentedTree.fromstring('(S (n x) (N x))')
+        self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[(1,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('i@"N"', [tree])), [[(0,), (1,)]])
+
+    def test_node_quoted(self):
+        '''
+        Test selecting nodes using quoted node names.
+        '''
+        tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
+        self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[()]])
+        self.assertEqual(list(tgrep.tgrep_positions('"\\"N\\""', [tree])), [[(0,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('"N\\""', [tree])), [[(1,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('"\\"\\\\\\""', [tree])), [[(2,)]])
+
+    def test_node_regex(self):
+        '''
+        Test regex matching on nodes.
+        '''
+        tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
+        # This is a regular expression that matches any node whose
+        # name starts with NP, including NP-SBJ:
+        self.assertEqual(list(tgrep.tgrep_positions('/^NP/', [tree])),
+                         [[(0,), (1,)]])
+
+    def test_node_regex_2(self):
+        '''
+        Test regex matching on nodes.
+        '''
+        tree = ParentedTree.fromstring('(S (SBJ x) (SBJ1 x) (NP-SBJ x))')
+        self.assertEqual(list(tgrep.tgrep_positions('/^SBJ/', [tree])),
+                         [[(0,), (1,)]])
+        # This is a regular expression that matches any node whose
+        # name includes SBJ, including NP-SBJ:
+        self.assertEqual(list(tgrep.tgrep_positions('/SBJ/', [tree])),
+                         [[(0,), (1,), (2,)]])
+
+    def test_node_tree_position(self):
+        '''
+        Test matching on nodes based on NLTK tree position.
+        '''
+        tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
+        # test all tree positions that are not leaves
+        leaf_positions = set([tree.leaf_treeposition(x)
+                              for x in range(len(tree.leaves()))])
+        tree_positions = [x for x in tree.treepositions()
+                          if x not in leaf_positions]
+        for position in tree_positions:
+            node_id = 'N{0}'.format(position)
+            tgrep_positions = list(tgrep.tgrep_positions(node_id, [tree]))
+            self.assertEqual(len(tgrep_positions[0]), 1)
+            self.assertEqual(tgrep_positions[0][0], position)
+
+    def test_node_noleaves(self):
+        '''
+        Test node name matching with the search_leaves flag set to False.
+        '''
+        tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
+        self.assertEqual(list(tgrep.tgrep_positions('x', [tree])),
+                         [[(0, 0, 0), (1, 0, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('x', [tree], False)),
+                         [[]])
+
+    def tests_rel_dominance(self):
+        '''
+        Test matching nodes based on dominance relations.
+        '''
+        tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
+        self.assertEqual(list(tgrep.tgrep_positions('* < T', [tree])),
+                         [[(0,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* < T > S', [tree])),
+                         [[(0,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* !< T', [tree])),
+                         [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* !< T > S', [tree])),
+                         [[(1,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* > A', [tree])),
+                         [[(0, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* > B', [tree])),
+                         [[(1, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* !> B', [tree])),
+                         [[(), (0,), (0, 0), (0, 0, 0), (1,), (1, 0, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* !> B >> S', [tree])),
+                         [[(0,), (0, 0), (1,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* >> S', [tree])),
+                         [[(0,), (0, 0), (1,), (1, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* >>, S', [tree])),
+                         [[(0,), (0, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* >>\' S', [tree])),
+                         [[(1,), (1, 0)]])
+        # Known issue:
+        #self.assertEqual(list(tgrep.tgrep_positions('* !>> S', [tree])),
+        #                 [[()]])
+        self.assertEqual(list(tgrep.tgrep_positions('* << T', [tree])),
+                         [[(), (0,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* <<\' T', [tree])),
+                         [[(0,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* <<1 N', [tree])),
+                         [[(1,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* !<< T', [tree])),
+                         [[(0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]])
+        tree = ParentedTree.fromstring('(S (A (T x)) (B (T x) (N x )))')
+        self.assertEqual(list(tgrep.tgrep_positions('* <: T', [tree])),
+                         [[(0,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* < T', [tree])),
+                         [[(0,), (1,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* !<: T', [tree])),
+                         [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0),
+                          (1, 1), (1, 1, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* !<: T > S', [tree])),
+                         [[(1,)]])
+        tree = ParentedTree.fromstring('(S (T (A x) (B x)) (T (C x)))')
+        self.assertEqual(list(tgrep.tgrep_positions('* >: T', [tree])),
+                         [[(1, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* !>: T', [tree])),
+                         [[(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0),
+                          (1,), (1, 0, 0)]])
+        tree = ParentedTree.fromstring('(S (A (B (C (D (E (T x))))))'
+                                       ' (A (B (C (D (E (T x))) (N x)))))')
+        self.assertEqual(list(tgrep.tgrep_positions('* <<: T', [tree])),
+                         [[(0,), (0, 0), (0, 0, 0), (0, 0, 0, 0),
+                          (0, 0, 0, 0, 0), (1, 0, 0, 0), (1, 0, 0, 0, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* >>: A', [tree])),
+                         [[(0, 0), (0, 0, 0), (0, 0, 0, 0), (0, 0, 0, 0, 0),
+                          (0, 0, 0, 0, 0, 0), (1, 0), (1, 0, 0)]])
+
+    def test_bad_operator(self):
+        '''
+        Test error handling of undefined tgrep operators.
+        '''
+        tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
+        self.assertRaises(
+            tgrep.TgrepException,
+            list,
+            tgrep.tgrep_positions('* >>> S', [tree]))
+
+    def test_comments(self):
+        '''
+        Test that comments are correctly filtered out of tgrep search
+        strings.
+        '''
+        tree = ParentedTree.fromstring('(S (NN x) (NP x) (NN x))')
+        search1 = '''
+        @ NP /^NP/;
+        @ NN /^NN/;
+        @NN
+        '''
+        self.assertEqual(list(tgrep.tgrep_positions(search1, [tree])),
+                         [[(0,), (2,)]])
+        search2 = '''
+        # macros
+        @ NP /^NP/;
+        @ NN /^NN/;
+
+        # search string
+        @NN
+        '''
+        self.assertEqual(list(tgrep.tgrep_positions(search2, [tree])),
+                         [[(0,), (2,)]])
+
+    def test_rel_sister_nodes(self):
+        '''
+        Test matching sister nodes in a tree.
+        '''
+        tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
+        self.assertEqual(list(tgrep.tgrep_positions('* $. B', [tree])),  [[(0,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* $.. B', [tree])), [[(0,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* $, B', [tree])),  [[(2,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* $,, B', [tree])), [[(2,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* $ B', [tree])),   [[(0,), (2,)]])
+
+    def tests_rel_indexed_children(self):
+        '''
+        Test matching nodes based on their index in their parent node.
+        '''
+        tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
+        self.assertEqual(list(tgrep.tgrep_positions('* >, S', [tree])),   [[(0,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* >1 S', [tree])),   [[(0,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* >2 S', [tree])),   [[(1,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* >3 S', [tree])),   [[(2,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* >\' S', [tree])),  [[(2,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* >-1 S', [tree])),  [[(2,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* >-2 S', [tree])),  [[(1,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* >-3 S', [tree])),  [[(0,)]])
+        tree = ParentedTree.fromstring(
+            '(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) '
+            '(F (C x) (A x) (B x)))')
+        self.assertEqual(list(tgrep.tgrep_positions('* <, A', [tree])),   [[(0,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* <1 A', [tree])),   [[(0,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* <2 A', [tree])),   [[(2,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* <3 A', [tree])),   [[(1,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* <\' A', [tree])),  [[(1,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* <-1 A', [tree])),  [[(1,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* <-2 A', [tree])),  [[(2,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* <-3 A', [tree])),  [[(0,)]])
+
+    def test_rel_precedence(self):
+        '''
+        Test matching nodes based on precedence relations.
+        '''
+        tree = ParentedTree.fromstring('(S (NP (NP (PP x)) (NP (AP x)))'
+                                       ' (VP (AP (X (PP x)) (Y (AP x))))'
+                                       ' (NP (RC (NP (AP x)))))')
+        self.assertEqual(list(tgrep.tgrep_positions('* . X', [tree])),
+                         [[(0,), (0, 1), (0, 1, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* . Y', [tree])),
+                         [[(1, 0, 0), (1, 0, 0, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* .. X', [tree])),
+                         [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* .. Y', [tree])),
+                         [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0),
+                          (1, 0, 0), (1, 0, 0, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* , X', [tree])),
+                         [[(1, 0, 1), (1, 0, 1, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* , Y', [tree])),
+                         [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* ,, X', [tree])),
+                         [[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0),
+                          (2, 0, 0, 0)]])
+        self.assertEqual(list(tgrep.tgrep_positions('* ,, Y', [tree])),
+                         [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]])
+
+    def test_examples(self):
+        '''
+        Test the Basic Examples from the TGrep2 manual.
+        '''
+        tree = ParentedTree.fromstring('(S (NP (AP x)) (NP (PP x)))')
+        # This matches any NP node that immediately dominates a PP:
+        self.assertEqual(list(tgrep.tgrep_positions('NP < PP', [tree])),
+                         [[(1,)]])
+
+        tree = ParentedTree.fromstring('(S (NP x) (VP x) (NP (PP x)) (VP x))')
+        # This matches an NP that dominates a PP and is immediately
+        # followed by a VP:
+        self.assertEqual(list(tgrep.tgrep_positions('NP << PP . VP', [tree])),
+                         [[(2,)]])
+
+        tree = ParentedTree.fromstring('(S (NP (AP x)) (NP (PP x)) '
+                                       '(NP (DET x) (NN x)) (VP x))')
+        # This matches an NP that dominates a PP or is immediately
+        # followed by a VP:
+        self.assertEqual(list(tgrep.tgrep_positions('NP << PP | . VP', [tree])),
+                         [[(1,), (2,)]])
+
+        tree = ParentedTree.fromstring('(S (NP (NP (PP x)) (NP (AP x)))'
+                                       ' (VP (AP (NP (PP x)) (NP (AP x))))'
+                                       ' (NP (RC (NP (AP x)))))')
+        # This matches an NP that does not dominate a PP. Also, the NP
+        # must either have a parent that is an NP or be dominated by a
+        # VP:
+        self.assertEqual(list(tgrep.tgrep_positions(
+                                               'NP !<< PP [> NP | >> VP]', [tree])),
+                         [[(0, 1), (1, 0, 1)]])
+
+        tree = ParentedTree.fromstring('(S (NP (AP (PP x) (VP x))) '
+                                       '(NP (AP (PP x) (NP x))) (NP x))')
+        # This matches an NP that dominates a PP which itself is
+        # immediately followed by a VP. Note the use of parentheses to
+        # group ". VP" with the PP rather than with the NP:
+        self.assertEqual(list(tgrep.tgrep_positions('NP << (PP . VP)', [tree])),
+                         [[(0,)]])
+
+        tree = ParentedTree.fromstring(
+            '(S (NP (DET a) (NN cat) (PP (IN on) (NP x)))'
+            ' (NP (DET a) (NN cat) (PP (IN on) (NP x)) (PP x))'
+            ' (NP x))')
+        # This matches an NP whose last child is a PP that begins with
+        # the preposition "on":
+        self.assertEqual(list(tgrep.tgrep_positions(
+                                               'NP <\' (PP <, (IN < on))', [tree])),
+                         [[(0,)]])
+
+        tree = ParentedTree.fromstring(
+            '(S (S (C x) (A (B x))) (S (C x) (A x)) '
+            '(S (D x) (A (B x))))')
+        # The following pattern matches an S which has a child A and
+        # another child that is a C and that the A has a child B:
+        self.assertEqual(list(tgrep.tgrep_positions('S < (A < B) < C', [tree])),
+                         [[(0,)]])
+
+        tree = ParentedTree.fromstring(
+            '(S (S (A (B x) (C x))) (S (S (C x) (A (B x)))))')
+        # However, this pattern means that S has child A and that A
+        # has children B and C:
+        self.assertEqual(list(tgrep.tgrep_positions('S < ((A < B) < C)', [tree])),
+                         [[(0,)]])
+
+        # It is equivalent to this:
+        self.assertEqual(list(tgrep.tgrep_positions('S < (A < B < C)', [tree])),
+                         [[(0,)]])
+
+    def test_use_macros(self):
+        '''
+        Test defining and using tgrep2 macros.
+        '''
+        tree = ParentedTree.fromstring(
+            '(VP (VB sold) (NP (DET the) '
+            '(NN heiress)) (NP (NN deed) (PREP to) '
+            '(NP (DET the) (NN school) (NN house))))')
+        self.assertEqual(list(tgrep.tgrep_positions(
+            '@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN',
+            [tree])),
+                         [[(1,), (2, 2)]])
+        # use undefined macro @CNP
+        self.assertRaises(
+            tgrep.TgrepException,
+            list,
+            tgrep.tgrep_positions(
+                '@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN', [tree]))
+
+    def test_tokenize_node_labels(self):
+        '''Test tokenization of labeled nodes.'''
+        self.assertEqual(tgrep.tgrep_tokenize(
+            'S < @SBJ < (@VP < (@VB $.. @OBJ))'),
+                         ['S', '<', '@SBJ', '<', '(', '@VP', '<', '(',
+                          '@VB', '$..', '@OBJ', ')', ')'])
+        self.assertEqual(tgrep.tgrep_tokenize(
+            'S < @SBJ=s < (@VP=v < (@VB $.. @OBJ))'),
+                         ['S', '<', '@SBJ', '=', 's', '<', '(', '@VP',
+                          '=', 'v', '<', '(', '@VB', '$..', '@OBJ', ')',
+                          ')'])
+
+    def test_tokenize_segmented_patterns(self):
+        '''Test tokenization of segmented patterns.'''
+        self.assertEqual(tgrep.tgrep_tokenize(
+            'S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'),
+                         ['S', '<', '@SBJ', '=', 's', '<', '(', '@VP',
+                          '=', 'v', '<', '(', '@VB', '$..', '@OBJ', ')',
+                          ')', ':', '=s', '..', '=v'])
+
+    def test_labeled_nodes(self):
+        '''
+        Test labeled nodes.
+
+        Test case from Emily M. Bender.
+        '''
+        search = '''
+            # macros
+            @ SBJ /SBJ/;
+            @ VP /VP/;
+            @ VB /VB/;
+            @ VPoB /V[PB]/;
+            @ OBJ /OBJ/;
+
+            # 1 svo
+            S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'''
+        sent1 = ParentedTree.fromstring(
+            '(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))')
+        sent2 = ParentedTree.fromstring(
+            '(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))')
+        search_firsthalf = (search.split('\n\n')[0] +
+                            'S < @SBJ < (@VP < (@VB $.. @OBJ))')
+        search_rewrite = 'S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))'
+
+        self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0])
+        self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0])
+        self.assertTrue(list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0])
+        self.assertEqual(list(tgrep.tgrep_positions(search, [sent1])),
+                         list(tgrep.tgrep_positions(search_rewrite, [sent1])))
+        self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0])
+        self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0])
+        self.assertFalse(list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0])
+        self.assertEqual(list(tgrep.tgrep_positions(search, [sent2])),
+                         list(tgrep.tgrep_positions(search_rewrite, [sent2])))
+
+    def test_multiple_conjs(self):
+        '''
+        Test that multiple (3 or more) conjunctions of node relations are
+        handled properly.
+        '''
+        sent = ParentedTree.fromstring(
+            '((A (B b) (C c)) (A (B b) (C c) (D d)))')
+        # search = '(A < B < C < D)'
+        # search_tworels = '(A < B < C)'
+        self.assertEqual(list(tgrep.tgrep_positions('(A < B < C < D)', [sent])),
+                         [[(1,)]])
+        self.assertEqual(list(tgrep.tgrep_positions('(A < B < C)', [sent])),
+                         [[(0,), (1,)]])
+
+    def test_trailing_semicolon(self):
+        '''
+        Test that semicolons at the end of a tgrep2 search string won't
+        cause a parse failure.
+        '''
+        tree = ParentedTree.fromstring(
+            '(S (NP (DT the) (JJ big) (NN dog)) '
+            '(VP bit) (NP (DT a) (NN cat)))')
+        self.assertEqual(list(tgrep.tgrep_positions('NN', [tree])),
+                         [[(0,2), (2,1)]])
+        self.assertEqual(list(tgrep.tgrep_positions('NN;', [tree])),
+                         [[(0,2), (2,1)]])
+        self.assertEqual(list(tgrep.tgrep_positions('NN;;', [tree])),
+                         [[(0,2), (2,1)]])
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/nltk/tgrep.py b/nltk/tgrep.py
new file mode 100644
index 0000000..fd4dfa2
--- /dev/null
+++ b/nltk/tgrep.py
@@ -0,0 +1,941 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Natural Language Toolkit: TGrep search
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Will Roberts <wildwilhelm at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+'''
+============================================
+ TGrep search implementation for NLTK trees
+============================================
+
+This module supports TGrep2 syntax for matching parts of NLTK Trees.
+Note that many tgrep operators require the tree passed to be a
+``ParentedTree``.
+
+External links:
+
+- `Tgrep tutorial <http://www.stanford.edu/dept/linguistics/corpora/cas-tut-tgrep.html>`_
+- `Tgrep2 manual <http://tedlab.mit.edu/~dr/Tgrep2/tgrep2.pdf>`_
+- `Tgrep2 source <http://tedlab.mit.edu/~dr/Tgrep2/>`_
+
+Usage
+=====
+
+>>> from nltk.tree import ParentedTree
+>>> from nltk.tgrep import tgrep_nodes, tgrep_positions
+>>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))')
+>>> list(tgrep_nodes('NN', [tree]))
+[[ParentedTree('NN', ['dog']), ParentedTree('NN', ['cat'])]]
+>>> list(tgrep_positions('NN', [tree]))
+[[(0, 2), (2, 1)]]
+>>> list(tgrep_nodes('DT', [tree]))
+[[ParentedTree('DT', ['the']), ParentedTree('DT', ['a'])]]
+>>> list(tgrep_nodes('DT $ JJ', [tree]))
+[[ParentedTree('DT', ['the'])]]
+
+This implementation adds syntax to select nodes based on their NLTK
+tree position.  This syntax is ``N`` plus a Python tuple representing
+the tree position.  For instance, ``N()``, ``N(0,)``, ``N(0,0)`` are
+valid node selectors.  Example:
+
+>>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))')
+>>> tree[0,0]
+ParentedTree('DT', ['the'])
+>>> tree[0,0].treeposition()
+(0, 0)
+>>> list(tgrep_nodes('N(0,0)', [tree]))
+[[ParentedTree('DT', ['the'])]]
+
+Caveats:
+========
+
+- Link modifiers: "?" and "=" are not implemented.
+- Tgrep compatibility: using "@" for "!", "{" for "<", and "}" for ">" is
+  not implemented.
+- The "=" and "~" links are not implemented.
+
+Known Issues:
+=============
+
+- There are some issues with link relations involving leaf nodes
+  (which are represented as bare strings in NLTK trees).  For
+  instance, consider the tree::
+
+      (S (A x))
+
+  The search string ``* !>> S`` should select all nodes which are not
+  dominated in some way by an ``S`` node (i.e., all nodes which are
+  not descendants of an ``S``).  Clearly, in this tree, the only node
+  which fulfills this criterion is the top node (since it is not
+  dominated by anything).  However, the code here will find both the
+  top node and the leaf node ``x``.  This is because we cannot recover
+  the parent of the leaf, since it is stored as a bare string.
+
+  A possible workaround, when performing this kind of search, would be
+  to filter out all leaf nodes, as sketched below.
+
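+  A minimal sketch of such a filter (not part of the module API; leaf
+  nodes are bare strings, so anything without a ``label`` method is
+  dropped)::
+
+      >>> tree = ParentedTree.fromstring('(S (A x))')
+      >>> [[n for n in nodes if hasattr(n, 'label')] for nodes in tgrep_nodes('* !>> S', [tree])] # doctest: +SKIP
+      [[ParentedTree('S', [ParentedTree('A', ['x'])])]]
+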
+Implementation notes
+====================
+
+This implementation is (somewhat awkwardly) based on lambda functions
+which are predicates on a node.  A predicate is a function which
+returns either True or False; using a predicate function, we can
+identify sets of nodes with particular properties.  A predicate
+function could, for instance, return True only if a particular node
+has a label matching a particular regular expression and has a
+daughter node which has no sisters.  Because tgrep2 search strings can
+do things statefully (such as substituting in macros, and binding
+nodes with node labels), the actual predicate function is declared
+with three arguments::
+
+    pred = lambda n, m, l: True  # some logic here
+
+``n``
+    is a node in a tree; this argument must always be given
+
+``m``
+    contains a dictionary, mapping macro names onto predicate functions
+
+``l``
+    is a dictionary to map node labels onto nodes in the tree
+
+``m`` and ``l`` are declared to default to ``None``, and so need not be
+specified in a call to a predicate.  Predicates which call other
+predicates must always pass the value of these arguments on.  The
+top-level predicate (constructed by ``_tgrep_exprs_action``) binds the
+macro definitions to ``m`` and initialises ``l`` to an empty dictionary.
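+
+As an illustration only (this sketch is not part of the module API), a
+hand-written predicate matching nodes whose label begins with "NP"
+could be written as::
+
+    import re
+    pred = lambda n, m=None, l=None: (hasattr(n, 'label') and
+                                      bool(re.match(r'^NP', n.label())))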
+'''
+
+from __future__ import absolute_import, print_function, unicode_literals
+from nltk.compat import binary_type, text_type
+import functools
+import nltk.tree
+try:
+    import pyparsing
+except ImportError:
+    print('Warning: nltk.tgrep will not work without the `pyparsing` package')
+    print('installed.')
+import re
+
+class TgrepException(Exception):
+    '''Tgrep exception type.'''
+    pass
+
+def ancestors(node):
+    '''
+    Returns the list of all nodes dominating the given tree node.
+    This method will not work with leaf nodes, since there is no way
+    to recover the parent.
+    '''
+    results = []
+    try:
+        current = node.parent()
+    except AttributeError:
+        # if node is a leaf, we cannot retrieve its parent
+        return results
+    while current:
+        results.append(current)
+        current = current.parent()
+    return results
+
+def unique_ancestors(node):
+    '''
+    Returns the list of all nodes dominating the given node, where
+    there is only a single path of descent.
+    '''
+    results = []
+    try:
+        current = node.parent()
+    except AttributeError:
+        # if node is a leaf, we cannot retrieve its parent
+        return results
+    while current and len(current) == 1:
+        results.append(current)
+        current = current.parent()
+    return results
+
+def _descendants(node):
+    '''
+    Returns the list of all nodes which are descended from the given
+    tree node in some way.
+    '''
+    try:
+        treepos = node.treepositions()
+    except AttributeError:
+        return []
+    return [node[x] for x in treepos[1:]]
+
+def _leftmost_descendants(node):
+    '''
+    Returns the set of all nodes descended in some way through
+    left branches from this node.
+    '''
+    try:
+        treepos = node.treepositions()
+    except AttributeError:
+        return []
+    return [node[x] for x in treepos[1:] if all(y == 0 for y in x)]
+
+def _rightmost_descendants(node):
+    '''
+    Returns the set of all nodes descended in some way through
+    right branches from this node.
+    '''
+    try:
+        rightmost_leaf = max(node.treepositions())
+    except AttributeError:
+        return []
+    return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)]
+
+def _istree(obj):
+    '''Predicate to check whether `obj` is a nltk.tree.Tree.'''
+    return isinstance(obj, nltk.tree.Tree)
+
+def _unique_descendants(node):
+    '''
+    Returns the list of all nodes descended from the given node, where
+    there is only a single path of descent.
+    '''
+    results = []
+    current = node
+    while current and _istree(current) and len(current) == 1:
+        current = current[0]
+        results.append(current)
+    return results
+
+def _before(node):
+    '''
+    Returns the set of all nodes that are before the given node.
+    '''
+    try:
+        pos = node.treeposition()
+        tree = node.root()
+    except AttributeError:
+        return []
+    return [tree[x] for x in tree.treepositions()
+            if x[:len(pos)] < pos[:len(x)]]
+
+def _immediately_before(node):
+    '''
+    Returns the set of all nodes that are immediately before the given
+    node.
+
+    Tree node A immediately precedes node B if the last terminal
+    symbol (word) produced by A immediately precedes the first
+    terminal symbol produced by B.
+    '''
+    try:
+        pos = node.treeposition()
+        tree = node.root()
+    except AttributeError:
+        return []
+    # go "upwards" from pos until there is a place we can go to the left
+    idx = len(pos) - 1
+    while 0 <= idx and pos[idx] == 0:
+        idx -= 1
+    if idx < 0:
+        return []
+    pos = list(pos[:idx + 1])
+    pos[-1] -= 1
+    before = tree[pos]
+    return [before] + _rightmost_descendants(before)
+
+def _after(node):
+    '''
+    Returns the set of all nodes that are after the given node.
+    '''
+    try:
+        pos = node.treeposition()
+        tree = node.root()
+    except AttributeError:
+        return []
+    return [tree[x] for x in tree.treepositions()
+            if x[:len(pos)] > pos[:len(x)]]
+
+def _immediately_after(node):
+    '''
+    Returns the set of all nodes that are immediately after the given
+    node.
+
+    Tree node A immediately follows node B if the first terminal
+    symbol (word) produced by A immediately follows the last
+    terminal symbol produced by B.
+    '''
+    try:
+        pos = node.treeposition()
+        tree = node.root()
+        current = node.parent()
+    except AttributeError:
+        return []
+    # go "upwards" from pos until there is a place we can go to the
+    # right
+    idx = len(pos) - 1
+    while 0 <= idx and pos[idx] == len(current) - 1:
+        idx -= 1
+        current = current.parent()
+    if idx < 0:
+        return []
+    pos = list(pos[:idx + 1])
+    pos[-1] += 1
+    after = tree[pos]
+    return [after] + _leftmost_descendants(after)
+
+def _tgrep_node_literal_value(node):
+    '''
+    Gets the string value of a given parse tree node, for comparison
+    using the tgrep node literal predicates.
+    '''
+    return (node.label() if _istree(node) else text_type(node))
+
+def _tgrep_macro_use_action(_s, _l, tokens):
+    '''
+    Builds a lambda function which looks up the macro name used.
+    '''
+    assert len(tokens) == 1
+    assert tokens[0][0] == '@'
+    macro_name = tokens[0][1:]
+    def macro_use(n, m=None, l=None):
+        if m is None or macro_name not in m:
+            raise TgrepException('macro {0} not defined'.format(macro_name))
+        return m[macro_name](n, m, l)
+    return macro_use
+
+def _tgrep_node_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    depending on the node's name.
+    '''
+    # print 'node tokens: ', tokens
+    if tokens[0] == "'":
+        # strip initial apostrophe (tgrep2 print command)
+        tokens = tokens[1:]
+    if len(tokens) > 1:
+        # disjunctive definition of a node name
+        assert list(set(tokens[1::2])) == ['|']
+        # recursively call self to interpret each node name definition
+        tokens = [_tgrep_node_action(None, None, [node])
+                  for node in tokens[::2]]
+        # capture tokens and return the disjunction
+        return (lambda t: lambda n, m=None, l=None: any(f(n, m, l) for f in t))(tokens)
+    else:
+        if hasattr(tokens[0], '__call__'):
+            # this is a previously interpreted parenthetical node
+            # definition (lambda function)
+            return tokens[0]
+        elif tokens[0] == '*' or tokens[0] == '__':
+            return lambda n, m=None, l=None: True
+        elif tokens[0].startswith('"'):
+            assert tokens[0].endswith('"')
+            node_lit = tokens[0][1:-1].replace('\\"', '"').replace('\\\\', '\\')
+            return (lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s)(node_lit)
+        elif tokens[0].startswith('/'):
+            assert tokens[0].endswith('/')
+            node_lit = tokens[0][1:-1]
+            return (lambda r: lambda n, m=None, l=None:
+                    r.search(_tgrep_node_literal_value(n)))(re.compile(node_lit))
+        elif tokens[0].startswith('i@'):
+            node_func = _tgrep_node_action(_s, _l, [tokens[0][2:].lower()])
+            return (lambda f: lambda n, m=None, l=None:
+                    f(_tgrep_node_literal_value(n).lower()))(node_func)
+        else:
+            return (lambda s: lambda n, m=None, l=None:
+                    _tgrep_node_literal_value(n) == s)(tokens[0])
+
+def _tgrep_parens_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    from a parenthetical notation.
+    '''
+    # print 'parenthetical tokens: ', tokens
+    assert len(tokens) == 3
+    assert tokens[0] == '('
+    assert tokens[2] == ')'
+    return tokens[1]
+
+def _tgrep_nltk_tree_pos_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    which returns true if the node is located at a specific tree
+    position.
+    '''
+    # recover the tuple from the parsed string
+    node_tree_position = tuple(int(x) for x in tokens if x.isdigit())
+    # capture the node's tree position
+    return (lambda i: lambda n, m=None, l=None: (hasattr(n, 'treeposition') and
+                                                 n.treeposition() == i))(node_tree_position)
+
+def _tgrep_relation_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    depending on its relation to other nodes in the tree.
+    '''
+    # print 'relation tokens: ', tokens
+    # process negation first if needed
+    negated = False
+    if tokens[0] == '!':
+        negated = True
+        tokens = tokens[1:]
+    if tokens[0] == '[':
+        # process square-bracketed relation expressions
+        assert len(tokens) == 3
+        assert tokens[2] == ']'
+        retval = tokens[1]
+    else:
+        # process operator-node relation expressions
+        assert len(tokens) == 2
+        operator, predicate = tokens
+        # A < B       A is the parent of (immediately dominates) B.
+        if operator == '<':
+            retval = lambda n, m=None, l=None: (_istree(n) and
+                                                any(predicate(x, m, l) for x in n))
+        # A > B       A is the child of B.
+        elif operator == '>':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                bool(n.parent()) and
+                                                predicate(n.parent(), m, l))
+        # A <, B      Synonymous with A <1 B.
+        elif operator == '<,' or operator == '<1':
+            retval = lambda n, m=None, l=None: (_istree(n) and
+                                                bool(list(n)) and
+                                                predicate(n[0], m, l))
+        # A >, B      Synonymous with A >1 B.
+        elif operator == '>,' or operator == '>1':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                bool(n.parent()) and
+                                                (n is n.parent()[0]) and
+                                                predicate(n.parent(), m, l))
+        # A <N B      B is the Nth child of A (the first child is <1).
+        elif operator[0] == '<' and operator[1:].isdigit():
+            idx = int(operator[1:])
+            # capture the index parameter
+            retval = (lambda i: lambda n, m=None, l=None: (_istree(n) and
+                                                           bool(list(n)) and
+                                                           0 <= i < len(n) and
+                                                           predicate(n[i], m, l)))(idx - 1)
+        # A >N B      A is the Nth child of B (the first child is >1).
+        elif operator[0] == '>' and operator[1:].isdigit():
+            idx = int(operator[1:])
+            # capture the index parameter
+            retval = (lambda i: lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                           bool(n.parent()) and
+                                                           0 <= i < len(n.parent()) and
+                                                           (n is n.parent()[i]) and
+                                                           predicate(n.parent(), m, l)))(idx - 1)
+        # A <' B      B is the last child of A (also synonymous with A <-1 B).
+        # A <- B      B is the last child of A (synonymous with A <-1 B).
+        elif operator == '<\'' or operator == '<-' or operator == '<-1':
+            retval = lambda n, m=None, l=None: (_istree(n) and bool(list(n))
+                                                and predicate(n[-1], m, l))
+        # A >' B      A is the last child of B (also synonymous with A >-1 B).
+        # A >- B      A is the last child of B (synonymous with A >-1 B).
+        elif operator == '>\'' or operator == '>-' or operator == '>-1':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                bool(n.parent()) and
+                                                (n is n.parent()[-1]) and
+                                                predicate(n.parent(), m, l))
+        # A <-N B     B is the Nth-to-last child of A (the last child is <-1).
+        elif operator[:2] == '<-' and operator[2:].isdigit():
+            idx = -int(operator[2:])
+            # capture the index parameter
+            retval = (lambda i: lambda n, m=None, l=None: (_istree(n) and
+                                                           bool(list(n)) and
+                                                           0 <= (i + len(n)) < len(n) and
+                                                           predicate(n[i + len(n)], m, l)))(idx)
+        # A >-N B     A is the Nth-to-last child of B (the last child is >-1).
+        elif operator[:2] == '>-' and operator[2:].isdigit():
+            idx = -int(operator[2:])
+            # capture the index parameter
+            retval = (lambda i: lambda n, m=None, l=None:
+                          (hasattr(n, 'parent') and
+                           bool(n.parent()) and
+                           0 <= (i + len(n.parent())) < len(n.parent()) and
+                           (n is n.parent()[i + len(n.parent())]) and
+                           predicate(n.parent(), m, l)))(idx)
+        # A <: B      B is the only child of A
+        elif operator == '<:':
+            retval = lambda n, m=None, l=None: (_istree(n) and
+                                                len(n) == 1 and
+                                                predicate(n[0], m, l))
+        # A >: B      A is the only child of B.
+        elif operator == '>:':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                bool(n.parent()) and
+                                                len(n.parent()) == 1 and
+                                                predicate(n.parent(), m, l))
+        # A << B      A dominates B (A is an ancestor of B).
+        elif operator == '<<':
+            retval = lambda n, m=None, l=None: (_istree(n) and
+                                                any(predicate(x, m, l) for x in _descendants(n)))
+        # A >> B      A is dominated by B (A is a descendant of B).
+        elif operator == '>>':
+            retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in ancestors(n))
+        # A <<, B     B is a left-most descendant of A.
+        elif operator == '<<,' or operator == '<<1':
+            retval = lambda n, m=None, l=None: (_istree(n) and
+                                                any(predicate(x, m, l)
+                                                    for x in _leftmost_descendants(n)))
+        # A >>, B     A is a left-most descendant of B.
+        elif operator == '>>,':
+            retval = lambda n, m=None, l=None: any((predicate(x, m, l) and
+                                                    n in _leftmost_descendants(x))
+                                                   for x in ancestors(n))
+        # A <<' B     B is a right-most descendant of A.
+        elif operator == '<<\'':
+            retval = lambda n, m=None, l=None: (_istree(n) and
+                                                any(predicate(x, m, l)
+                                                    for x in _rightmost_descendants(n)))
+        # A >>' B     A is a right-most descendant of B.
+        elif operator == '>>\'':
+            retval = lambda n, m=None, l=None: any((predicate(x, m, l) and
+                                                    n in _rightmost_descendants(x))
+                                                   for x in ancestors(n))
+        # A <<: B     There is a single path of descent from A and B is on it.
+        elif operator == '<<:':
+            retval = lambda n, m=None, l=None: (_istree(n) and
+                                                any(predicate(x, m, l)
+                                                    for x in _unique_descendants(n)))
+        # A >>: B     There is a single path of descent from B and A is on it.
+        elif operator == '>>:':
+            retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in unique_ancestors(n))
+        # A . B       A immediately precedes B.
+        elif operator == '.':
+            retval = lambda n, m=None, l=None: any(predicate(x, m, l)
+                                                   for x in _immediately_after(n))
+        # A , B       A immediately follows B.
+        elif operator == ',':
+            retval = lambda n, m=None, l=None: any(predicate(x, m, l)
+                                                   for x in _immediately_before(n))
+        # A .. B      A precedes B.
+        elif operator == '..':
+            retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in _after(n))
+        # A ,, B      A follows B.
+        elif operator == ',,':
+            retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in _before(n))
+        # A $ B       A is a sister of B (and A != B).
+        elif operator == '$' or operator == '%':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                bool(n.parent()) and
+                                                any(predicate(x, m, l)
+                                                    for x in n.parent() if x is not n))
+        # A $. B      A is a sister of and immediately precedes B.
+        elif operator == '$.' or operator == '%.':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'right_sibling') and
+                                                bool(n.right_sibling()) and
+                                                predicate(n.right_sibling(), m, l))
+        # A $, B      A is a sister of and immediately follows B.
+        elif operator == '$,' or operator == '%,':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'left_sibling') and
+                                                bool(n.left_sibling()) and
+                                                predicate(n.left_sibling(), m, l))
+        # A $.. B     A is a sister of and precedes B.
+        elif operator == '$..' or operator == '%..':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                hasattr(n, 'parent_index') and
+                                                bool(n.parent()) and
+                                                any(predicate(x, m, l) for x in
+                                                    n.parent()[n.parent_index() + 1:]))
+        # A $,, B     A is a sister of and follows B.
+        elif operator == '$,,' or operator == '%,,':
+            retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and
+                                                hasattr(n, 'parent_index') and
+                                                bool(n.parent()) and
+                                                any(predicate(x, m, l) for x in
+                                                    n.parent()[:n.parent_index()]))
+        else:
+            raise TgrepException(
+                'cannot interpret tgrep operator "{0}"'.format(operator))
+    # now return the built function
+    if negated:
+        return (lambda r: (lambda n, m=None, l=None: not r(n, m, l)))(retval)
+    else:
+        return retval
+
+def _tgrep_conjunction_action(_s, _l, tokens, join_char = '&'):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    from the conjunction of several other such lambda functions.
+
+    This is prototypically called for expressions like
+    (`tgrep_rel_conjunction`)::
+
+        < NP & < AP < VP
+
+    where tokens is a list of predicates representing the relations
+    (`< NP`, `< AP`, and `< VP`), possibly with the character `&`
+    included (as in the example here).
+
+    This is also called for expressions like (`tgrep_node_expr2`)::
+
+        NP < NN
+        S=s < /NP/=n : s < /VP/=v : n .. v
+
+    tokens[0] is a tgrep_expr predicate; tokens[1:] are an (optional)
+    list of segmented patterns (`tgrep_expr_labeled`, processed by
+    `_tgrep_segmented_pattern_action`).
+    '''
+    # filter out the ampersand
+    tokens = [x for x in tokens if x != join_char]
+    # print 'relation conjunction tokens: ', tokens
+    if len(tokens) == 1:
+        return tokens[0]
+    else:
+        return (lambda ts: lambda n, m=None, l=None: all(predicate(n, m, l)
+                                                         for predicate in ts))(tokens)
+
+def _tgrep_segmented_pattern_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a segmented pattern.
+
+    Called for expressions like (`tgrep_expr_labeled`)::
+
+        =s .. =v < =n
+
+    This is a segmented pattern, a tgrep2 expression which begins with
+    a node label.
+
+    The problem is that for segmented_pattern_action (': =v < =s'),
+    the first element (in this case, =v) is specifically selected by
+    virtue of matching a particular node in the tree; to retrieve
+    the node, we need the label, not a lambda function.  For node
+    labels inside a tgrep_node_expr, we need a lambda function which
+    returns true if the node visited is the same as =v.
+
+    We solve this by creating two copies of a node_label_use in the
+    grammar; the label use inside a tgrep_expr_labeled has a separate
+    parse action from the pred use inside a node_expr.  See
+    `_tgrep_node_label_use_action` and
+    `_tgrep_node_label_pred_use_action`.
+    '''
+    # tokens[0] is a string containing the node label
+    node_label = tokens[0]
+    # tokens[1:] is an (optional) list of predicates which must all
+    # hold of the bound node
+    reln_preds = tokens[1:]
+    def pattern_segment_pred(n, m=None, l=None):
+        '''This predicate function ignores its node argument.'''
+        # look up the bound node using its label
+        if l is None or node_label not in l:
+            raise TgrepException('node_label ={0} not bound in pattern'.format(
+                node_label))
+        node = l[node_label]
+        # match the relation predicates against the node
+        return all(pred(node, m, l) for pred in reln_preds)
+    return pattern_segment_pred
+
+def _tgrep_node_label_use_action(_s, _l, tokens):
+    '''
+    Returns the node label used to begin a tgrep_expr_labeled.  See
+    `_tgrep_segmented_pattern_action`.
+
+    Called for expressions like (`tgrep_node_label_use`)::
+
+        =s
+
+    when they appear as the first element of a `tgrep_expr_labeled`
+    expression (see `_tgrep_segmented_pattern_action`).
+
+    It returns the node label.
+    '''
+    assert len(tokens) == 1
+    assert tokens[0].startswith('=')
+    return tokens[0][1:]
+
+def _tgrep_node_label_pred_use_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    which describes the use of a previously bound node label.
+
+    Called for expressions like (`tgrep_node_label_use_pred`)::
+
+        =s
+
+    when they appear inside a tgrep_node_expr (for example, inside a
+    relation).  The predicate returns true if and only if its node
+    argument is identical to the node looked up in the node label
+    dictionary using the node's label.
+    '''
+    assert len(tokens) == 1
+    assert tokens[0].startswith('=')
+    node_label = tokens[0][1:]
+    def node_label_use_pred(n, m=None, l=None):
+        # look up the bound node using its label
+        if l is None or node_label not in l:
+            raise TgrepException('node_label ={0} not bound in pattern'.format(
+                node_label))
+        node = l[node_label]
+        # truth means the given node is this node
+        return n is node
+    return node_label_use_pred
+
+def _tgrep_bind_node_label_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    which can optionally bind a matching node into the tgrep2 string's
+    label_dict.
+
+    Called for expressions like (`tgrep_node_expr2`)::
+
+        /NP/
+        @NP=n
+    '''
+    # tokens[0] is a tgrep_node_expr
+    if len(tokens) == 1:
+        return tokens[0]
+    else:
+        # if present, tokens[1] is the character '=', and tokens[2] is
+        # a tgrep_node_label, a string value containing the node label
+        assert len(tokens) == 3
+        assert tokens[1] == '='
+        node_pred = tokens[0]
+        node_label = tokens[2]
+        def node_label_bind_pred(n, m=None, l=None):
+            if node_pred(n, m, l):
+                # bind `n` into the dictionary `l`
+                if l is None:
+                    raise TgrepException(
+                        'cannot bind node_label {0}: label_dict is None'.format(
+                            node_label))
+                l[node_label] = n
+                return True
+            else:
+                return False
+        return node_label_bind_pred
+
+def _tgrep_rel_disjunction_action(_s, _l, tokens):
+    '''
+    Builds a lambda function representing a predicate on a tree node
+    from the disjunction of several other such lambda functions.
+    '''
+    # filter out the pipe
+    tokens = [x for x in tokens if x != '|']
+    # print 'relation disjunction tokens: ', tokens
+    if len(tokens) == 1:
+        return tokens[0]
+    elif len(tokens) == 2:
+        return (lambda a, b: lambda n, m=None, l=None:
+                a(n, m, l) or b(n, m, l))(tokens[0], tokens[1])
+
+def _macro_defn_action(_s, _l, tokens):
+    '''
+    Builds a dictionary structure which defines the given macro.
+    '''
+    assert len(tokens) == 3
+    assert tokens[0] == '@'
+    return {tokens[1]: tokens[2]}
+
+def _tgrep_exprs_action(_s, _l, tokens):
+    '''
+    This is the top-level node in a tgrep2 search string; the
+    predicate function it returns binds together all the state of a
+    tgrep2 search string.
+
+    Builds a lambda function representing a predicate on a tree node
+    from the disjunction of several tgrep expressions.  Also handles
+    macro definitions and macro name binding, and node label
+    definitions and node label binding.
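+
+    As an illustrative example (a sketch based on the grammar built by
+    ``_build_tgrep_parser``, not a doctest), a search string defining
+    and then using a macro could look like::
+
+        @ NP_DT (NP < DT); S << @NP_DT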
+    '''
+    if len(tokens) == 1:
+        return lambda n, m=None, l=None: tokens[0](n, None, {})
+    # filter out all the semicolons
+    tokens = [x for x in tokens if x != ';']
+    # collect all macro definitions
+    macro_dict = {}
+    macro_defs = [tok for tok in tokens if isinstance(tok, dict)]
+    for macro_def in macro_defs:
+        macro_dict.update(macro_def)
+    # collect all tgrep expressions
+    tgrep_exprs = [tok for tok in tokens if not isinstance(tok, dict)]
+    # create a new scope for the node label dictionary
+    def top_level_pred(n, m=macro_dict, l=None):
+        label_dict = {}
+        # bind macro definitions and OR together all tgrep_exprs
+        return any(predicate(n, m, label_dict) for predicate in tgrep_exprs)
+    return top_level_pred
+
+def _build_tgrep_parser(set_parse_actions = True):
+    '''
+    Builds a pyparsing-based parser object for tokenizing and
+    interpreting tgrep search strings.
+    '''
+    tgrep_op = (pyparsing.Optional('!') +
+                pyparsing.Regex('[$%,.<>][%,.<>0-9-\':]*'))
+    tgrep_qstring = pyparsing.QuotedString(quoteChar='"', escChar='\\',
+                                           unquoteResults=False)
+    tgrep_node_regex = pyparsing.QuotedString(quoteChar='/', escChar='\\',
+                                              unquoteResults=False)
+    tgrep_qstring_icase = pyparsing.Regex(
+        'i@\\"(?:[^"\\n\\r\\\\]|(?:\\\\.))*\\"')
+    tgrep_node_regex_icase = pyparsing.Regex(
+        'i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/')
+    tgrep_node_literal = pyparsing.Regex('[^][ \r\t\n;:.,&|<>()$!@%\'^=]+')
+    tgrep_expr = pyparsing.Forward()
+    tgrep_relations = pyparsing.Forward()
+    tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')'
+    tgrep_nltk_tree_pos = (
+        pyparsing.Literal('N(') +
+        pyparsing.Optional(pyparsing.Word(pyparsing.nums) + ',' +
+                           pyparsing.Optional(pyparsing.delimitedList(
+                    pyparsing.Word(pyparsing.nums), delim=',') +
+                                              pyparsing.Optional(','))) + ')')
+    tgrep_node_label = pyparsing.Regex('[A-Za-z0-9]+')
+    tgrep_node_label_use = pyparsing.Combine('=' + tgrep_node_label)
+    # see _tgrep_segmented_pattern_action
+    tgrep_node_label_use_pred = tgrep_node_label_use.copy()
+    macro_name = pyparsing.Regex('[^];:.,&|<>()[$!@%\'^=\r\t\n ]+')
+    macro_name.setWhitespaceChars('')
+    macro_use = pyparsing.Combine('@' + macro_name)
+    tgrep_node_expr = (tgrep_node_label_use_pred |
+                       macro_use |
+                       tgrep_nltk_tree_pos |
+                       tgrep_qstring_icase |
+                       tgrep_node_regex_icase |
+                       tgrep_qstring |
+                       tgrep_node_regex |
+                       '*' |
+                       tgrep_node_literal)
+    tgrep_node_expr2 = ((tgrep_node_expr +
+                         pyparsing.Literal('=').setWhitespaceChars('') +
+                         tgrep_node_label.copy().setWhitespaceChars('')) |
+                        tgrep_node_expr)
+    tgrep_node = (tgrep_parens |
+                  (pyparsing.Optional("'") +
+                   tgrep_node_expr2 +
+                   pyparsing.ZeroOrMore("|" + tgrep_node_expr)))
+    tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']'
+    tgrep_relation = tgrep_brackets | (tgrep_op + tgrep_node)
+    tgrep_rel_conjunction = pyparsing.Forward()
+    tgrep_rel_conjunction << (tgrep_relation +
+                              pyparsing.ZeroOrMore(pyparsing.Optional('&') +
+                                                   tgrep_rel_conjunction))
+    tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore(
+        "|" + tgrep_relations)
+    tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations)
+    tgrep_expr_labeled = tgrep_node_label_use + pyparsing.Optional(tgrep_relations)
+    tgrep_expr2 = tgrep_expr + pyparsing.ZeroOrMore(':' + tgrep_expr_labeled)
+    macro_defn = (pyparsing.Literal('@') +
+                  pyparsing.White().suppress() +
+                  macro_name +
+                  tgrep_expr2)
+    tgrep_exprs = (pyparsing.Optional(macro_defn + pyparsing.ZeroOrMore(';' + macro_defn) + ';') +
+                   tgrep_expr2 +
+                   pyparsing.ZeroOrMore(';' + (macro_defn | tgrep_expr2)) +
+                   pyparsing.ZeroOrMore(';').suppress())
+    if set_parse_actions:
+        tgrep_node_label_use.setParseAction(_tgrep_node_label_use_action)
+        tgrep_node_label_use_pred.setParseAction(_tgrep_node_label_pred_use_action)
+        macro_use.setParseAction(_tgrep_macro_use_action)
+        tgrep_node.setParseAction(_tgrep_node_action)
+        tgrep_node_expr2.setParseAction(_tgrep_bind_node_label_action)
+        tgrep_parens.setParseAction(_tgrep_parens_action)
+        tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action)
+        tgrep_relation.setParseAction(_tgrep_relation_action)
+        tgrep_rel_conjunction.setParseAction(_tgrep_conjunction_action)
+        tgrep_relations.setParseAction(_tgrep_rel_disjunction_action)
+        macro_defn.setParseAction(_macro_defn_action)
+        # the whole expression is also the conjunction of two
+        # predicates: the first node predicate, and the remaining
+        # relation predicates
+        tgrep_expr.setParseAction(_tgrep_conjunction_action)
+        tgrep_expr_labeled.setParseAction(_tgrep_segmented_pattern_action)
+        tgrep_expr2.setParseAction(functools.partial(_tgrep_conjunction_action,
+                                                     join_char = ':'))
+        tgrep_exprs.setParseAction(_tgrep_exprs_action)
+    return tgrep_exprs.ignore('#' + pyparsing.restOfLine)
+
+def tgrep_tokenize(tgrep_string):
+    '''
+    Tokenizes a TGrep search string into separate tokens.
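+
+    A usage sketch (output shown approximately, not as a doctest)::
+
+        tgrep_tokenize('NP < DT')   # e.g. ['NP', '<', 'DT']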
+    '''
+    parser = _build_tgrep_parser(False)
+    if isinstance(tgrep_string, binary_type):
+        tgrep_string = tgrep_string.decode()
+    return list(parser.parseString(tgrep_string))
+
+def tgrep_compile(tgrep_string):
+    '''
+    Parses (and tokenizes, if necessary) a TGrep search string into a
+    lambda function.
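+
+    A sketch of typical use (assuming ``tree`` is a ``ParentedTree``)::
+
+        pred = tgrep_compile('NP < DT')
+        matches = [n for n in tree.subtrees() if pred(n)]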
+    '''
+    parser = _build_tgrep_parser(True)
+    if isinstance(tgrep_string, binary_type):
+        tgrep_string = tgrep_string.decode()
+    return list(parser.parseString(tgrep_string, parseAll=True))[0]
+
+def treepositions_no_leaves(tree):
+    '''
+    Returns all the tree positions in the given tree which do not
+    point to leaf nodes.
+    '''
+    treepositions = tree.treepositions()
+    # leaves are treeposition tuples that are not prefixes of any
+    # other treeposition
+    prefixes = set()
+    for pos in treepositions:
+        for length in range(len(pos)):
+            prefixes.add(pos[:length])
+    return [pos for pos in treepositions if pos in prefixes]
+
+def tgrep_positions(pattern, trees, search_leaves=True):
+    """
+    Return the tree positions in the trees which match the given pattern.
+
+    :param pattern: a tgrep search pattern
+    :type pattern: str or output of tgrep_compile()
+    :param trees: a sequence of NLTK trees (usually ParentedTrees)
+    :type trees: iter(ParentedTree) or iter(Tree)
+    :param search_leaves: whether to return matching leaf nodes
+    :type search_leaves: bool
+    :rtype: iter(tree positions)
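+
+    A minimal usage sketch (assuming ``trees`` is a sequence of
+    ``ParentedTree`` objects)::
+
+        for positions in tgrep_positions('NP < DT', trees):
+            print(positions)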
+    """
+
+    if isinstance(pattern, (binary_type, text_type)):
+        pattern = tgrep_compile(pattern)
+
+    for tree in trees:
+        try:
+            if search_leaves:
+                positions = tree.treepositions()
+            else:
+                positions = treepositions_no_leaves(tree)
+            yield [position for position in positions
+                      if pattern(tree[position])]
+        except AttributeError:
+            yield []
+
+def tgrep_nodes(pattern, trees, search_leaves=True):
+    """
+    Return the tree nodes in the trees which match the given pattern.
+
+    :param pattern: a tgrep search pattern
+    :type pattern: str or output of tgrep_compile()
+    :param trees: a sequence of NLTK trees (usually ParentedTrees)
+    :type trees: iter(ParentedTree) or iter(Tree)
+    :param search_leaves: whether to return matching leaf nodes
+    :type search_leaves: bool
+    :rtype: iter(tree nodes)
+    """
+
+    if isinstance(pattern, (binary_type, text_type)):
+        pattern = tgrep_compile(pattern)
+
+    for tree in trees:
+        try:
+            if search_leaves:
+                positions = tree.treepositions()
+            else:
+                positions = treepositions_no_leaves(tree)
+            yield [tree[position] for position in positions
+                      if pattern(tree[position])]
+        except AttributeError:
+            yield []
+
+
+# run module doctests
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
+
diff --git a/nltk/tokenize/texttiling.py b/nltk/tokenize/texttiling.py
index d8c3e62..9316fdf 100644
--- a/nltk/tokenize/texttiling.py
+++ b/nltk/tokenize/texttiling.py
@@ -52,6 +52,15 @@ class TextTilingTokenizer(TokenizerI):
     :param cutoff_policy: The policy used to determine the number of boundaries:
       `HC` (default) or `LC`
     :type cutoff_policy: constant
+
+    >>> from nltk.corpus import brown
+    >>> tt = TextTilingTokenizer(demo_mode=True)
+    >>> text = brown.raw()[:10000]
+    >>> s, ss, d, b = tt.tokenize(text)
+    >>> b
+    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
+     0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
+     0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]
     """
 
     def __init__(self,
@@ -284,7 +293,7 @@ class TextTilingTokenizer(TokenizerI):
 
         depth_tuples = sorted(zip(depth_scores, range(len(depth_scores))))
         depth_tuples.reverse()
-        hp = filter(lambda x:x[0]>cutoff, depth_tuples)
+        hp = list(filter(lambda x:x[0]>cutoff, depth_tuples))
 
         for dt in hp:
             boundaries[dt[1]] = 1
@@ -435,7 +444,7 @@ def smooth(x,window_len=11,window='flat'):
 
 def demo(text=None):
     from nltk.corpus import brown
-    import pylab
+    from matplotlib import pylab
     tt = TextTilingTokenizer(demo_mode=True)
     if text is None: text = brown.raw()[:10000]
     s, ss, d, b = tt.tokenize(text)
diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py
index 465f5d9..7453ef5 100644
--- a/nltk/tokenize/treebank.py
+++ b/nltk/tokenize/treebank.py
@@ -40,6 +40,9 @@ class TreebankWordTokenizer(TokenizerI):
         >>> s = "They'll save and invest more."
         >>> TreebankWordTokenizer().tokenize(s)
         ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
+        >>> s = "hi, my name can't hello,"
+        >>> TreebankWordTokenizer().tokenize(s)
+        ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
     """
 
     # List of contractions adapted from Robert MacIntyre's tokenizer.
@@ -64,6 +67,7 @@ class TreebankWordTokenizer(TokenizerI):
 
         #punctuation
         text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
+        text = re.sub(r'([:,])$', r' \1 ', text)
         text = re.sub(r'\.\.\.', r' ... ', text)
         text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
         text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
diff --git a/setup.cfg b/setup.cfg
index 861a9f5..6bc2ff3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [egg_info]
-tag_build = 
 tag_date = 0
+tag_build = 
 tag_svn_revision = 0
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/nltk.git



More information about the debian-science-commits mailing list