[nltk] 01/05: New upstream version 3.2.5

Gianfranco Costamagna locutusofborg at moszumanska.debian.org
Thu Oct 26 06:57:11 UTC 2017


This is an automated email from the git hooks/post-receive script.

locutusofborg pushed a commit to branch master
in repository nltk.

commit 2a6229cc9ad9fbf8a705d9b815e8d0505b073984
Author: Gianfranco Costamagna <costamagnagianfranco at yahoo.it>
Date:   Thu Oct 26 08:53:50 2017 +0200

    New upstream version 3.2.5
---
 PKG-INFO                            |   2 +-
 nltk.egg-info/PKG-INFO              |   2 +-
 nltk.egg-info/SOURCES.txt           |   5 +
 nltk.egg-info/requires.txt          |  12 +-
 nltk/VERSION                        |   2 +-
 nltk/book.py                        |   1 +
 nltk/classify/decisiontree.py       |  10 +-
 nltk/classify/naivebayes.py         |   2 +-
 nltk/classify/rte_classify.py       |   4 +-
 nltk/cluster/api.py                 |   1 +
 nltk/collections.py                 |   3 +-
 nltk/corpus/reader/framenet.py      |  15 +-
 nltk/corpus/reader/nombank.py       |   4 +-
 nltk/corpus/reader/propbank.py      |   4 +-
 nltk/corpus/reader/verbnet.py       | 402 +++++++++++++++++++--------
 nltk/corpus/reader/wordlist.py      |   6 +-
 nltk/corpus/reader/wordnet.py       | 111 +-------
 nltk/data.py                        |  38 ++-
 nltk/downloader.py                  |   3 +-
 nltk/parse/corenlp.py               |  49 +++-
 nltk/parse/recursivedescent.py      |   3 +-
 nltk/parse/shiftreduce.py           |   3 +-
 nltk/sem/logic.py                   |  19 +-
 nltk/sentiment/util.py              |  11 +-
 nltk/sentiment/vader.py             |  24 +-
 nltk/stem/arlstem.py                | 355 ++++++++++++++++++++++++
 nltk/stem/snowball.py               | 531 +++++++++++++++++++++++++++++++++++-
 nltk/stem/util.py                   |  10 +
 nltk/tag/__init__.py                |   9 +-
 nltk/tag/perceptron.py              |   4 +-
 nltk/tag/stanford.py                |  78 +++++-
 nltk/test/corpus.doctest            |  30 ++
 nltk/test/stem.doctest              |   2 +-
 nltk/test/tokenize.doctest          |   9 +-
 nltk/test/unit/test_corenlp.py      | 412 ++++++++++++++++++++++++++++
 nltk/test/unit/test_stem.py         |  14 +
 nltk/test/unit/test_tokenize.py     |  42 ++-
 nltk/test/unit/test_wordnet.py      | 134 +++++++++
 nltk/tokenize/__init__.py           |   2 -
 nltk/tokenize/moses.py              |  39 ++-
 nltk/tokenize/nist.py               | 167 ++++++++++++
 nltk/tokenize/punkt.py              |   4 +-
 nltk/tokenize/stanford.py           |  40 ++-
 nltk/tokenize/stanford_segmenter.py |  34 ++-
 nltk/tokenize/treebank.py           |  43 ++-
 nltk/tokenize/util.py               |  26 +-
 nltk/translate/ibm1.py              |  10 +-
 nltk/translate/ibm_model.py         |   3 +
 nltk/translate/nist_score.py        | 155 +++++++++++
 49 files changed, 2566 insertions(+), 323 deletions(-)

diff --git a/PKG-INFO b/PKG-INFO
index 8f2b835..69e88e6 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: nltk
-Version: 3.2.4
+Version: 3.2.5
 Summary: Natural Language Toolkit
 Home-page: http://nltk.org/
 Author: Steven Bird
diff --git a/nltk.egg-info/PKG-INFO b/nltk.egg-info/PKG-INFO
index 8f2b835..69e88e6 100644
--- a/nltk.egg-info/PKG-INFO
+++ b/nltk.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: nltk
-Version: 3.2.4
+Version: 3.2.5
 Summary: Natural Language Toolkit
 Home-page: http://nltk.org/
 Author: Steven Bird
diff --git a/nltk.egg-info/SOURCES.txt b/nltk.egg-info/SOURCES.txt
index 0ed3d0b..10e352a 100644
--- a/nltk.egg-info/SOURCES.txt
+++ b/nltk.egg-info/SOURCES.txt
@@ -206,6 +206,7 @@ nltk/sentiment/util.py
 nltk/sentiment/vader.py
 nltk/stem/__init__.py
 nltk/stem/api.py
+nltk/stem/arlstem.py
 nltk/stem/isri.py
 nltk/stem/lancaster.py
 nltk/stem/porter.py
@@ -316,6 +317,7 @@ nltk/test/unit/test_aline.py
 nltk/test/unit/test_chunk.py
 nltk/test/unit/test_classify.py
 nltk/test/unit/test_collocations.py
+nltk/test/unit/test_corenlp.py
 nltk/test/unit/test_corpora.py
 nltk/test/unit/test_corpus_views.py
 nltk/test/unit/test_hmm.py
@@ -328,6 +330,7 @@ nltk/test/unit/test_tag.py
 nltk/test/unit/test_tgrep.py
 nltk/test/unit/test_tokenize.py
 nltk/test/unit/test_twitter_auth.py
+nltk/test/unit/test_wordnet.py
 nltk/test/unit/utils.py
 nltk/test/unit/translate/__init__.py
 nltk/test/unit/translate/test_bleu.py
@@ -343,6 +346,7 @@ nltk/tokenize/api.py
 nltk/tokenize/casual.py
 nltk/tokenize/moses.py
 nltk/tokenize/mwe.py
+nltk/tokenize/nist.py
 nltk/tokenize/punkt.py
 nltk/tokenize/regexp.py
 nltk/tokenize/repp.py
@@ -368,6 +372,7 @@ nltk/translate/ibm4.py
 nltk/translate/ibm5.py
 nltk/translate/ibm_model.py
 nltk/translate/metrics.py
+nltk/translate/nist_score.py
 nltk/translate/phrase_based.py
 nltk/translate/ribes_score.py
 nltk/translate/stack_decoder.py
diff --git a/nltk.egg-info/requires.txt b/nltk.egg-info/requires.txt
index 12c2271..72836ce 100644
--- a/nltk.egg-info/requires.txt
+++ b/nltk.egg-info/requires.txt
@@ -1,15 +1,15 @@
 six
 
 [all]
-requests
-twython
-gensim
-scipy
 pyparsing
 matplotlib
-python-crfsuite
-numpy
+gensim
+twython
 scikit-learn
+scipy
+numpy
+requests
+python-crfsuite
 
 [corenlp]
 requests
diff --git a/nltk/VERSION b/nltk/VERSION
index 351227f..5ae69bd 100644
--- a/nltk/VERSION
+++ b/nltk/VERSION
@@ -1 +1 @@
-3.2.4
+3.2.5
diff --git a/nltk/book.py b/nltk/book.py
index 7e006d2..5394736 100644
--- a/nltk/book.py
+++ b/nltk/book.py
@@ -11,6 +11,7 @@ from nltk.corpus import (gutenberg, genesis, inaugural,
                          nps_chat, webtext, treebank, wordnet)
 from nltk.text import Text
 from nltk.probability import FreqDist
+from nltk.util import bigrams
 
 print("*** Introductory Examples for the NLTK Book ***")
 print("Loading text1, ..., text9 and sent1, ..., sent9")
diff --git a/nltk/classify/decisiontree.py b/nltk/classify/decisiontree.py
index 27897ab..2bf5742 100644
--- a/nltk/classify/decisiontree.py
+++ b/nltk/classify/decisiontree.py
@@ -266,12 +266,12 @@ class DecisionTreeClassifier(ClassifierI):
                 if stump_error < best_error:
                     best_error = stump_error
                     best_stump = stump
-        if best_stump._decisions:
-            descr = '{0}={1}'.format(best_stump._fname,
-                               list(best_stump._decisions.keys())[0])
-        else:
-            descr = '(default)'
         if verbose:
+            if best_stump._decisions:
+                descr = '{0}={1}'.format(best_stump._fname,
+                                         list(best_stump._decisions.keys())[0])
+            else:
+                descr = '(default)'
             print(('best stump for {:6d} toks uses {:20} err={:6.4f}'.format \
                    (len(labeled_featuresets), descr, best_error)))
         return best_stump
diff --git a/nltk/classify/naivebayes.py b/nltk/classify/naivebayes.py
index 22f0861..b547a7a 100644
--- a/nltk/classify/naivebayes.py
+++ b/nltk/classify/naivebayes.py
@@ -21,7 +21,7 @@ independent, given the label:
 |  P(label|features) = --------------------------------------------
 |                                         P(features)
 
-Rather than computing P(featues) explicitly, the algorithm just
+Rather than computing P(features) explicitly, the algorithm just
 calculates the numerator for each label, and normalizes them so they
 sum to one:
 
diff --git a/nltk/classify/rte_classify.py b/nltk/classify/rte_classify.py
index 1693560..968a223 100644
--- a/nltk/classify/rte_classify.py
+++ b/nltk/classify/rte_classify.py
@@ -46,7 +46,7 @@ class RTEFeatureExtractor(object):
     This builds a bag of words for both the text and the hypothesis after
     throwing away some stopwords, then calculates overlap and difference.
     """
-    def __init__(self, rtepair, stop=True, lemmatize=False):
+    def __init__(self, rtepair, stop=True, use_lemmatize=False):
         """
         :param rtepair: a ``RTEPair`` from which features should be extracted
         :param stop: if ``True``, stopwords are thrown away.
@@ -69,7 +69,7 @@ class RTEFeatureExtractor(object):
         self.text_words = set(self.text_tokens)
         self.hyp_words = set(self.hyp_tokens)
 
-        if lemmatize:
+        if use_lemmatize:
             self.text_words = set(lemmatize(token) for token in self.text_tokens)
             self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)
 
diff --git a/nltk/cluster/api.py b/nltk/cluster/api.py
index bf2f4ad..8679324 100644
--- a/nltk/cluster/api.py
+++ b/nltk/cluster/api.py
@@ -63,6 +63,7 @@ class ClusterI(object):
     def cluster_names(self):
         """
         Returns the names of the clusters.
+        :rtype: list
         """
         return list(range(self.num_clusters()))
 
diff --git a/nltk/collections.py b/nltk/collections.py
index d915c1f..1107f7d 100644
--- a/nltk/collections.py
+++ b/nltk/collections.py
@@ -228,8 +228,7 @@ class AbstractLazySequence(object):
             length += len(pieces[-1]) + 2
             if length > self._MAX_REPR_SIZE and len(pieces) > 2:
                 return '[%s, ...]' % text_type(', ').join(pieces[:-1])
-        else:
-            return '[%s]' % text_type(', ').join(pieces)
+        return '[%s]' % text_type(', ').join(pieces)
 
     def __eq__(self, other):
         return (type(self) == type(other) and list(self) == list(other))
diff --git a/nltk/corpus/reader/framenet.py b/nltk/corpus/reader/framenet.py
index 26fa96e..344efb4 100644
--- a/nltk/corpus/reader/framenet.py
+++ b/nltk/corpus/reader/framenet.py
@@ -877,8 +877,7 @@ class PrettyLazyMap(LazyMap):
             length += len(pieces[-1]) + 2
             if length > self._MAX_REPR_SIZE and len(pieces) > 2:
                 return "[%s, ...]" % text_type(', ').join(pieces[:-1])
-        else:
-            return "[%s]" % text_type(', ').join(pieces)
+        return "[%s]" % text_type(', ').join(pieces)
 
 @python_2_unicode_compatible
 class PrettyLazyIteratorList(LazyIteratorList):
@@ -900,8 +899,7 @@ class PrettyLazyIteratorList(LazyIteratorList):
             length += len(pieces[-1]) + 2
             if length > self._MAX_REPR_SIZE and len(pieces) > 2:
                 return "[%s, ...]" % text_type(', ').join(pieces[:-1])
-        else:
-            return "[%s]" % text_type(', ').join(pieces)
+        return "[%s]" % text_type(', ').join(pieces)
 
 @python_2_unicode_compatible
 class PrettyLazyConcatenation(LazyConcatenation):
@@ -923,8 +921,7 @@ class PrettyLazyConcatenation(LazyConcatenation):
             length += len(pieces[-1]) + 2
             if length > self._MAX_REPR_SIZE and len(pieces) > 2:
                 return "[%s, ...]" % text_type(', ').join(pieces[:-1])
-        else:
-            return "[%s]" % text_type(', ').join(pieces)
+        return "[%s]" % text_type(', ').join(pieces)
 
     def __add__(self, other):
         """Return a list concatenating self with other."""
@@ -1003,6 +1000,10 @@ class FramenetCorpusReader(XMLCorpusReader):
 
 
         msg = """
+Citation: Nathan Schneider and Chuck Wooters (2017), 
+"The NLTK FrameNet API: Designing for Discoverability with a Rich Linguistic Resource". 
+Proceedings of EMNLP: System Demonstrations. https://arxiv.org/abs/1703.07438
+
 Use the following methods to access data in FrameNet.
 Provide a method name to `help()` for more information.
 
@@ -1023,7 +1024,7 @@ fes() to get frame elements (a.k.a. roles) matching a name pattern, optionally c
 LEXICAL UNITS
 =============
 
-lu() to look up a frame by its ID
+lu() to look up an LU by its ID
 lus() to get lexical units matching a name pattern, optionally constrained by frame
 lu_ids_and_names() to get a mapping from LU IDs to names
 
diff --git a/nltk/corpus/reader/nombank.py b/nltk/corpus/reader/nombank.py
index e1427ac..c6d7d16 100644
--- a/nltk/corpus/reader/nombank.py
+++ b/nltk/corpus/reader/nombank.py
@@ -111,9 +111,7 @@ class NombankCorpusReader(CorpusReader):
         for roleset in etree.findall('predicate/roleset'):
             if roleset.attrib['id'] == roleset_id:
                 return roleset
-        else:
-            raise ValueError('Roleset %s not found in %s' %
-                             (roleset_id, framefile))
+        raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))
 
     def rolesets(self, baseform=None):
         """
diff --git a/nltk/corpus/reader/propbank.py b/nltk/corpus/reader/propbank.py
index 320c75a..343858a 100644
--- a/nltk/corpus/reader/propbank.py
+++ b/nltk/corpus/reader/propbank.py
@@ -108,9 +108,7 @@ class PropbankCorpusReader(CorpusReader):
         for roleset in etree.findall('predicate/roleset'):
             if roleset.attrib['id'] == roleset_id:
                 return roleset
-        else:
-            raise ValueError('Roleset %s not found in %s' %
-                             (roleset_id, framefile))
+        raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))
 
     def rolesets(self, baseform=None):
         """
diff --git a/nltk/corpus/reader/verbnet.py b/nltk/corpus/reader/verbnet.py
index 6a34113..641cff9 100644
--- a/nltk/corpus/reader/verbnet.py
+++ b/nltk/corpus/reader/verbnet.py
@@ -9,7 +9,7 @@
 An NLTK interface to the VerbNet verb lexicon
 
 For details about VerbNet see:
-http://verbs.colorado.edu/~mpalmer/projects/verbnet.html
+https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
 """
 from __future__ import unicode_literals
 
@@ -21,6 +21,7 @@ from six import string_types
 
 from nltk.corpus.reader.xmldocs import XMLCorpusReader
 
+
 class VerbnetCorpusReader(XMLCorpusReader):
     """
     An NLTK interface to the VerbNet verb lexicon.
@@ -28,11 +29,11 @@ class VerbnetCorpusReader(XMLCorpusReader):
     From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest
     on-line verb lexicon currently available for English. It is a hierarchical
     domain-independent, broad-coverage verb lexicon with mappings to other
-    lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), Xtag
+    lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), XTAG
     (XTAG Research Group, 2001), and FrameNet (Baker et al., 1998)."
 
     For details about VerbNet see:
-    http://verbs.colorado.edu/~mpalmer/projects/verbnet.html
+    https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
     """
 
     # No unicode encoding param, since the data files are all XML.
@@ -41,11 +42,11 @@ class VerbnetCorpusReader(XMLCorpusReader):
 
         self._lemma_to_class = defaultdict(list)
         """A dictionary mapping from verb lemma strings to lists of
-        verbnet class identifiers."""
+        VerbNet class identifiers."""
 
         self._wordnet_to_class = defaultdict(list)
         """A dictionary mapping from wordnet identifier strings to
-        lists of verbnet class identifiers."""
+        lists of VerbNet class identifiers."""
 
         self._class_to_fileid = {}
         """A dictionary mapping from class identifiers to
@@ -70,50 +71,49 @@ class VerbnetCorpusReader(XMLCorpusReader):
     """Regular expression used by ``_index()`` to quickly scan the corpus
        for basic information."""
 
-    def lemmas(self, classid=None):
+    def lemmas(self, vnclass=None):
         """
         Return a list of all verb lemmas that appear in any class, or
         in the ``classid`` if specified.
         """
-        if classid is None:
+        if vnclass is None:
             return sorted(self._lemma_to_class.keys())
         else:
             # [xx] should this include subclass members?
-            vnclass = self.vnclass(classid)
+            if isinstance(vnclass, string_types):
+                vnclass = self.vnclass(vnclass)
             return [member.get('name') for member in
                     vnclass.findall('MEMBERS/MEMBER')]
 
-    def wordnetids(self, classid=None):
+    def wordnetids(self, vnclass=None):
         """
         Return a list of all wordnet identifiers that appear in any
         class, or in ``classid`` if specified.
         """
-        if classid is None:
+        if vnclass is None:
             return sorted(self._wordnet_to_class.keys())
         else:
             # [xx] should this include subclass members?
-            vnclass = self.vnclass(classid)
-            return sum([member.get('wn','').split() for member in
+            if isinstance(vnclass, string_types):
+                vnclass = self.vnclass(vnclass)
+            return sum([member.get('wn', '').split() for member in
                         vnclass.findall('MEMBERS/MEMBER')], [])
 
     def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
         """
-        Return a list of the verbnet class identifiers.  If a file
-        identifier is specified, then return only the verbnet class
+        Return a list of the VerbNet class identifiers.  If a file
+        identifier is specified, then return only the VerbNet class
         identifiers for classes (and subclasses) defined by that file.
-        If a lemma is specified, then return only verbnet class
+        If a lemma is specified, then return only VerbNet class
         identifiers for classes that contain that lemma as a member.
         If a wordnetid is specified, then return only identifiers for
         classes that contain that wordnetid as a member.  If a classid
         is specified, then return only identifiers for subclasses of
-        the specified verbnet class.
+        the specified VerbNet class.
+        If nothing is specified, return all classids within VerbNet
         """
-        if len([x for x in [lemma, wordnetid, fileid, classid]
-                if x is not None]) > 1:
-            raise ValueError('Specify at most one of: fileid, wordnetid, '
-                             'fileid, classid')
         if fileid is not None:
-            return [c for (c,f) in self._class_to_fileid.items()
+            return [c for (c, f) in self._class_to_fileid.items()
                     if f == fileid]
         elif lemma is not None:
             return self._lemma_to_class[lemma]
@@ -127,14 +127,15 @@ class VerbnetCorpusReader(XMLCorpusReader):
             return sorted(self._class_to_fileid.keys())
 
     def vnclass(self, fileid_or_classid):
-        """
+        """Returns VerbNet class ElementTree
+        
         Return an ElementTree containing the xml for the specified
-        verbnet class.
+        VerbNet class.
 
         :param fileid_or_classid: An identifier specifying which class
             should be returned.  Can be a file identifier (such as
-            ``'put-9.1.xml'``), or a verbnet class identifier (such as
-            ``'put-9.1'``) or a short verbnet class identifier (such as
+            ``'put-9.1.xml'``), or a VerbNet class identifier (such as
+            ``'put-9.1'``) or a short VerbNet class identifier (such as
             ``'9.1'``).
         """
         # File identifier: just return the xml.
@@ -153,16 +154,16 @@ class VerbnetCorpusReader(XMLCorpusReader):
                     if classid == subclass.get('ID'):
                         return subclass
                 else:
-                    assert False # we saw it during _index()!
+                    assert False  # we saw it during _index()!
 
         else:
-            raise ValueError('Unknown identifier %s' % fileid_or_classid)
+            raise ValueError('Unknown identifier {}'.format(fileid_or_classid))
 
     def fileids(self, vnclass_ids=None):
         """
         Return a list of fileids that make up this corpus.  If
         ``vnclass_ids`` is specified, then return the fileids that make
-        up the specified verbnet class(es).
+        up the specified VerbNet class(es).
         """
         if vnclass_ids is None:
             return self._fileids
@@ -172,9 +173,74 @@ class VerbnetCorpusReader(XMLCorpusReader):
             return [self._class_to_fileid[self.longid(vnclass_id)]
                     for vnclass_id in vnclass_ids]
 
+    def frames(self, vnclass):
+        """Given a VerbNet class, this method returns VerbNet frames
+        
+        The members returned are:
+        1) Example
+        2) Description
+        3) Syntax
+        4) Semantics
+        
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+            containing the xml contents of a VerbNet class.
+        :return: frames - a list of frame dictionaries
+        """
+        if isinstance(vnclass, string_types):
+            vnclass = self.vnclass(vnclass)
+        frames = []
+        vnframes = vnclass.findall('FRAMES/FRAME')
+        for vnframe in vnframes:
+            frames.append({
+                'example': self._get_example_within_frame(vnframe),
+                'description': self._get_description_within_frame(vnframe),
+                'syntax': self._get_syntactic_list_within_frame(vnframe),
+                'semantics': self._get_semantics_within_frame(vnframe)
+            })
+        return frames
+
+    def subclasses(self, vnclass):
+        """Returns subclass ids, if any exist 
+        
+        Given a VerbNet class, this method returns subclass ids (if they exist)
+        in a list of strings.
+        
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+            containing the xml contents of a VerbNet class.
+        :return: list of subclasses
+        """
+        if isinstance(vnclass, string_types):
+            vnclass = self.vnclass(vnclass)
+
+        subclasses = [subclass.get('ID') for subclass in
+                      vnclass.findall('SUBCLASSES/VNSUBCLASS')]
+        return subclasses
+
+    def themroles(self, vnclass):
+        """Returns thematic roles participating in a VerbNet class
+        
+        Members returned as part of roles are-
+        1) Type
+        2) Modifiers
+        
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+            containing the xml contents of a VerbNet class.
+        :return: themroles: A list of thematic roles in the VerbNet class
+        """
+        if isinstance(vnclass, string_types):
+            vnclass = self.vnclass(vnclass)
+
+        themroles = []
+        for trole in vnclass.findall('THEMROLES/THEMROLE'):
+            themroles.append({
+                'type': trole.get('type'),
+                'modifiers': [{'value': restr.get('Value'), 'type': restr.get('type')}
+                              for restr in trole.findall('SELRESTRS/SELRESTR')]
+            })
+        return themroles
 
     ######################################################################
-    #{ Index Initialization
+    # { Index Initialization
     ######################################################################
 
     def _index(self):
@@ -205,7 +271,7 @@ class VerbnetCorpusReader(XMLCorpusReader):
         Initialize the indexes ``_lemma_to_class``,
         ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
         through the corpus fileids.  This doesn't do proper xml parsing,
-        but is good enough to find everything in the standard verbnet
+        but is good enough to find everything in the standard VerbNet
         corpus -- and it runs about 30 times faster than xml parsing
         (with the python ElementTree; only 2-3 times faster with
         cElementTree).
@@ -213,7 +279,7 @@ class VerbnetCorpusReader(XMLCorpusReader):
         # nb: if we got rid of wordnet_to_class, this would run 2-3
         # times faster.
         for fileid in self._fileids:
-            vnclass = fileid[:-4] # strip the '.xml'
+            vnclass = fileid[:-4]  # strip the '.xml'
             self._class_to_fileid[vnclass] = fileid
             self._shortid_to_longid[self.shortid(vnclass)] = vnclass
             for m in self._INDEX_RE.finditer(self.open(fileid).read()):
@@ -224,21 +290,23 @@ class VerbnetCorpusReader(XMLCorpusReader):
                         self._wordnet_to_class[wn].append(vnclass)
                 elif groups[2] is not None:
                     self._class_to_fileid[groups[2]] = fileid
-                    vnclass = groups[2] # for <MEMBER> elts.
+                    vnclass = groups[2]  # for <MEMBER> elts.
                     self._shortid_to_longid[self.shortid(vnclass)] = vnclass
                 else:
                     assert False, 'unexpected match condition'
 
     ######################################################################
-    #{ Identifier conversion
+    # { Identifier conversion
     ######################################################################
 
     def longid(self, shortid):
-        """Given a short verbnet class identifier (eg '37.10'), map it
+        """Returns longid of a VerbNet class
+        
+        Given a short VerbNet class identifier (eg '37.10'), map it
         to a long id (eg 'confess-37.10').  If ``shortid`` is already a
         long id, then return it as-is"""
         if self._LONGID_RE.match(shortid):
-            return shortid # it's already a longid.
+            return shortid  # it's already a longid.
         elif not self._SHORTID_RE.match(shortid):
             raise ValueError('vnclass identifier %r not found' % shortid)
         try:
@@ -247,11 +315,13 @@ class VerbnetCorpusReader(XMLCorpusReader):
             raise ValueError('vnclass identifier %r not found' % shortid)
 
     def shortid(self, longid):
-        """Given a long verbnet class identifier (eg 'confess-37.10'),
+        """Returns shortid of a VerbNet class
+        
+        Given a long VerbNet class identifier (eg 'confess-37.10'),
         map it to a short id (eg '37.10').  If ``longid`` is already a
         short id, then return it as-is."""
         if self._SHORTID_RE.match(longid):
-            return longid # it's already a shortid.
+            return longid  # it's already a shortid.
         m = self._LONGID_RE.match(longid)
         if m:
             return m.group(2)
@@ -259,16 +329,102 @@ class VerbnetCorpusReader(XMLCorpusReader):
             raise ValueError('vnclass identifier %r not found' % longid)
 
     ######################################################################
-    #{ Pretty Printing
+    # { Frame access utility functions
     ######################################################################
 
-    def pprint(self, vnclass):
+    def _get_semantics_within_frame(self, vnframe):
+        """Returns semantics within a single frame
+        
+        A utility function to retrieve semantics within a frame in VerbNet
+        Members of the semantics dictionary:
+        1) Predicate value 
+        2) Arguments
+        
+        :param vnframe: An ElementTree containing the xml contents of
+            a VerbNet frame.
+        :return: semantics: semantics dictionary
         """
+        semantics_within_single_frame = []
+        for pred in vnframe.findall('SEMANTICS/PRED'):
+            arguments = [{'type': arg.get('type'), 'value': arg.get('value')}
+                         for arg in pred.findall('ARGS/ARG')]
+            semantics_within_single_frame.append({
+                'predicate_value': pred.get('value'),
+                'arguments': arguments
+            })
+        return semantics_within_single_frame
+
+    def _get_example_within_frame(self, vnframe):
+        """Returns example within a frame
+        
+        A utility function to retrieve an example within a frame in VerbNet.
+        
+        :param vnframe: An ElementTree containing the xml contents of
+            a VerbNet frame.
+        :return: example_text: The example sentence for this particular frame
+        """
+        example_element = vnframe.find('EXAMPLES/EXAMPLE')
+        if example_element is not None:
+            example_text = example_element.text
+        else:
+            example_text = ""
+        return example_text
+
+    def _get_description_within_frame(self, vnframe):
+        """Returns member description within frame
+         
+        A utility function to retrieve a description of participating members
+        within a frame in VerbNet.
+        
+        :param vnframe: An ElementTree containing the xml contents of
+            a VerbNet frame.
+        :return: description: a description dictionary with members - primary and secondary 
+        """
+        description_element = vnframe.find('DESCRIPTION')
+        return {
+            'primary': description_element.attrib['primary'],
+            'secondary': description_element.get('secondary', '')
+        }
+
+    def _get_syntactic_list_within_frame(self, vnframe):
+        """Returns semantics within a frame
+        
+        A utility function to retrieve semantics within a frame in VerbNet.
+        Members of the syntactic dictionary:
+        1) POS Tag
+        2) Modifiers
+        
+        :param vnframe: An ElementTree containing the xml contents of
+            a VerbNet frame.
+        :return: syntax_within_single_frame
+        """
+        syntax_within_single_frame = []
+        for elt in vnframe.find('SYNTAX'):
+            pos_tag = elt.tag
+            modifiers = dict()
+            modifiers['value'] = elt.get('value') if 'value' in elt.attrib else ""
+            modifiers['selrestrs'] = [{'value': restr.get('Value'), 'type': restr.get('type')}
+                                      for restr in elt.findall('SELRESTRS/SELRESTR')]
+            modifiers['synrestrs'] = [{'value': restr.get('Value'), 'type': restr.get('type')}
+                                      for restr in elt.findall('SYNRESTRS/SYNRESTR')]
+            syntax_within_single_frame.append({
+                'pos_tag': pos_tag,
+                'modifiers': modifiers
+            })
+        return syntax_within_single_frame
+
+    ######################################################################
+    # { Pretty Printing
+    ######################################################################
+
+    def pprint(self, vnclass):
+        """Returns pretty printed version of a VerbNet class
+        
         Return a string containing a pretty-printed representation of
-        the given verbnet class.
+        the given VerbNet class.
 
-        :param vnclass: A verbnet class identifier; or an ElementTree
-        containing the xml contents of a verbnet class.
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+        containing the xml contents of a VerbNet class.
         """
         if isinstance(vnclass, string_types):
             vnclass = self.vnclass(vnclass)
@@ -279,129 +435,161 @@ class VerbnetCorpusReader(XMLCorpusReader):
         s += '  Thematic roles:\n'
         s += self.pprint_themroles(vnclass, indent='    ') + '\n'
         s += '  Frames:\n'
-        s += '\n'.join(self.pprint_frame(vnframe, indent='    ')
-                       for vnframe in vnclass.findall('FRAMES/FRAME'))
+        s += self.pprint_frames(vnclass, indent='    ')
         return s
 
     def pprint_subclasses(self, vnclass, indent=''):
-        """
+        """Returns pretty printed version of subclasses of VerbNet class
+        
         Return a string containing a pretty-printed representation of
-        the given verbnet class's subclasses.
+        the given VerbNet class's subclasses.
 
-        :param vnclass: A verbnet class identifier; or an ElementTree
-            containing the xml contents of a verbnet class.
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+            containing the xml contents of a VerbNet class.
         """
         if isinstance(vnclass, string_types):
             vnclass = self.vnclass(vnclass)
 
-        subclasses = [subclass.get('ID') for subclass in
-                      vnclass.findall('SUBCLASSES/VNSUBCLASS')]
+        subclasses = self.subclasses(vnclass)
         if not subclasses: subclasses = ['(none)']
         s = 'Subclasses: ' + ' '.join(subclasses)
         return textwrap.fill(s, 70, initial_indent=indent,
-                             subsequent_indent=indent+'  ')
+                             subsequent_indent=indent + '  ')
 
     def pprint_members(self, vnclass, indent=''):
-        """
+        """Returns pretty printed version of members in a VerbNet class
+        
         Return a string containing a pretty-printed representation of
-        the given verbnet class's member verbs.
+        the given VerbNet class's member verbs.
 
-        :param vnclass: A verbnet class identifier; or an ElementTree
-            containing the xml contents of a verbnet class.
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+            containing the xml contents of a VerbNet class.
         """
         if isinstance(vnclass, string_types):
             vnclass = self.vnclass(vnclass)
 
-        members = [member.get('name') for member in
-                   vnclass.findall('MEMBERS/MEMBER')]
-        if not members: members = ['(none)']
+        members = self.lemmas(vnclass)
+        if not members:
+            members = ['(none)']
         s = 'Members: ' + ' '.join(members)
         return textwrap.fill(s, 70, initial_indent=indent,
-                             subsequent_indent=indent+'  ')
+                             subsequent_indent=indent + '  ')
 
     def pprint_themroles(self, vnclass, indent=''):
-        """
+        """Returns pretty printed version of thematic roles in a VerbNet class
+        
         Return a string containing a pretty-printed representation of
-        the given verbnet class's thematic roles.
+        the given VerbNet class's thematic roles.
 
-        :param vnclass: A verbnet class identifier; or an ElementTree
-            containing the xml contents of a verbnet class.
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+            containing the xml contents of a VerbNet class.
         """
         if isinstance(vnclass, string_types):
             vnclass = self.vnclass(vnclass)
 
         pieces = []
-        for themrole in vnclass.findall('THEMROLES/THEMROLE'):
+        for themrole in self.themroles(vnclass):
             piece = indent + '* ' + themrole.get('type')
-            modifiers = ['%(Value)s%(type)s' % restr.attrib
-                         for restr in themrole.findall('SELRESTRS/SELRESTR')]
+            modifiers = [modifier['value'] + modifier['type']
+                         for modifier in themrole['modifiers']]
             if modifiers:
-                piece += '[%s]' % ' '.join(modifiers)
+                piece += '[{}]'.format(' '.join(modifiers))
             pieces.append(piece)
-
         return '\n'.join(pieces)
 
-    def pprint_frame(self, vnframe, indent=''):
-        """
+    def pprint_frames(self, vnclass, indent=''):
+        """Returns pretty version of all frames in a VerbNet class
+        
         Return a string containing a pretty-printed representation of
-        the given verbnet frame.
+        the list of frames within the VerbNet class.
 
-        :param vnframe: An ElementTree containing the xml contents of
-            a verbnet frame.
+        :param vnclass: A VerbNet class identifier; or an ElementTree
+            containing the xml contents of a VerbNet class.
         """
-        s = self.pprint_description(vnframe, indent) + '\n'
-        s += self.pprint_syntax(vnframe, indent+'  Syntax: ') + '\n'
-        s += indent + '  Semantics:\n'
-        s += self.pprint_semantics(vnframe, indent+'    ')
-        return s
+        if isinstance(vnclass, string_types):
+            vnclass = self.vnclass(vnclass)
+        pieces = []
+        for vnframe in self.frames(vnclass):
+            pieces.append(self._pprint_single_frame(vnframe, indent))
+        return '\n'.join(pieces)
 
-    def pprint_description(self, vnframe, indent=''):
+    def _pprint_single_frame(self, vnframe, indent=''):
+        """Returns pretty printed version of a single frame in a VerbNet class
+        
+        Returns a string containing a pretty-printed representation of
+        the given frame.
+        
+        :param vnframe: An ElementTree containing the xml contents of
+            a VerbNet frame.
         """
+        frame_string = self._pprint_description_within_frame(vnframe, indent) + '\n'
+        frame_string += self._pprint_example_within_frame(vnframe, indent + ' ') + '\n'
+        frame_string += self._pprint_syntax_within_frame(vnframe, indent + '  Syntax: ') + '\n'
+        frame_string += indent + '  Semantics:\n'
+        frame_string += self._pprint_semantics_within_frame(vnframe, indent + '    ')
+        return frame_string
+
+    def _pprint_example_within_frame(self, vnframe, indent=''):
+        """Returns pretty printed version of example within frame in a VerbNet class
+        
         Return a string containing a pretty-printed representation of
-        the given verbnet frame description.
+        the given VerbNet frame example.
 
         :param vnframe: An ElementTree containing the xml contents of
-            a verbnet frame.
+            a Verbnet frame.
         """
-        descr = vnframe.find('DESCRIPTION')
-        s = indent + descr.attrib['primary']
-        if descr.get('secondary', ''):
-            s += ' (%s)' % descr.get('secondary')
-        return s
+        if vnframe['example']:
+            return indent + ' Example: ' + vnframe['example']
 
-    def pprint_syntax(self, vnframe, indent=''):
+    def _pprint_description_within_frame(self, vnframe, indent=''):
+        """Returns pretty printed version of a VerbNet frame description
+        
+        Return a string containing a pretty-printed representation of
+        the given VerbNet frame description.
+
+        :param vnframe: An ElementTree containing the xml contents of
+            a VerbNet frame.
         """
+        description = indent + vnframe['description']['primary']
+        if vnframe['description']['secondary']:
+            description += ' ({})'.format(vnframe['description']['secondary'])
+        return description
+
+    def _pprint_syntax_within_frame(self, vnframe, indent=''):
+        """Returns pretty printed version of syntax within a frame in a VerbNet class 
+        
         Return a string containing a pretty-printed representation of
-        the given verbnet frame syntax.
+        the given VerbNet frame syntax.
 
         :param vnframe: An ElementTree containing the xml contents of
-            a verbnet frame.
+            a VerbNet frame.
         """
         pieces = []
-        for elt in vnframe.find('SYNTAX'):
-            piece = elt.tag
-            modifiers = []
-            if 'value' in elt.attrib:
-                modifiers.append(elt.get('value'))
-            modifiers += ['%(Value)s%(type)s' % restr.attrib
-                          for restr in (elt.findall('SELRESTRS/SELRESTR') +
-                                        elt.findall('SYNRESTRS/SYNRESTR'))]
-            if modifiers:
-                piece += '[%s]' % ' '.join(modifiers)
+        for element in vnframe['syntax']:
+            piece = element['pos_tag']
+            modifier_list = []
+            if 'value' in element['modifiers'] and element['modifiers']['value']:
+                modifier_list.append(element['modifiers']['value'])
+            modifier_list += ['{}{}'.format(restr['value'], restr['type'])
+                              for restr in (element['modifiers']['selrestrs'] +
+                                            element['modifiers']['synrestrs'])]
+            if modifier_list:
+                piece += '[{}]'.format(' '.join(modifier_list))
             pieces.append(piece)
 
         return indent + ' '.join(pieces)
 
-    def pprint_semantics(self, vnframe, indent=''):
-        """
+    def _pprint_semantics_within_frame(self, vnframe, indent=''):
+        """Returns a pretty printed version of semantics within frame in a VerbNet class
+        
         Return a string containing a pretty-printed representation of
-        the given verbnet frame semantics.
+        the given VerbNet frame semantics.
 
         :param vnframe: An ElementTree containing the xml contents of
-            a verbnet frame.
+            a VerbNet frame.
         """
         pieces = []
-        for pred in vnframe.findall('SEMANTICS/PRED'):
-            args = [arg.get('value') for arg in pred.findall('ARGS/ARG')]
-            pieces.append('%s(%s)' % (pred.get('value'), ', '.join(args)))
-        return '\n'.join('%s* %s' % (indent, piece) for piece in pieces)
+        for predicate in vnframe['semantics']:
+            arguments = [argument['value'] for argument in predicate['arguments']]
+            pieces.append('{}({})'.format(predicate['predicate_value'], ', '.join(arguments)))
+        return '\n'.join('{}* {}'.format(indent, piece) for piece in pieces)
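
Note: the verbnet.py changes above add public frames(), subclasses() and themroles() accessors alongside the existing pprint helpers, each accepting either a class id string or an ElementTree. A minimal usage sketch, assuming the verbnet corpus data is already downloaded ('put-9.1' is the class id used in the reader's own docstrings):

    from nltk.corpus import verbnet

    cls = verbnet.vnclass('put-9.1')            # id string or ElementTree both work
    print(verbnet.subclasses(cls))              # list of subclass id strings
    print([role['type'] for role in verbnet.themroles(cls)])
    frame = verbnet.frames(cls)[0]              # one dict per frame
    print(sorted(frame))                        # ['description', 'example', 'semantics', 'syntax']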
diff --git a/nltk/corpus/reader/wordlist.py b/nltk/corpus/reader/wordlist.py
index 85f529e..24e06ae 100644
--- a/nltk/corpus/reader/wordlist.py
+++ b/nltk/corpus/reader/wordlist.py
@@ -88,7 +88,9 @@ class UnicharsCorpusReader(WordListCorpusReader):
     # These are categories similar to the Perl Unicode Properties
     available_categories = ['Close_Punctuation', 'Currency_Symbol',
                             'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc',
-                            'IsSo', 'Open_Punctuation']
+                            'IsSo', 'IsUpper', 'Line_Separator', 'Number',
+                            'Open_Punctuation', 'Punctuation', 'Separator',
+                            'Symbol']
 
     def chars(self, category=None, fileids=None):
         """
@@ -101,7 +103,7 @@ class UnicharsCorpusReader(WordListCorpusReader):
         >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5']
         True
         >>> pup.available_categories
-        ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'Open_Punctuation']
+        ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol']
 
         :return: a list of characters given the specific unicode character category
         """
diff --git a/nltk/corpus/reader/wordnet.py b/nltk/corpus/reader/wordnet.py
index 3069a83..7063aed 100644
--- a/nltk/corpus/reader/wordnet.py
+++ b/nltk/corpus/reader/wordnet.py
@@ -112,7 +112,7 @@ VERB_FRAME_STRINGS = (
     "It %s that CLAUSE",
     "Something %s INFINITIVE")
 
-SENSENUM_RE = re.compile(r'\.\d\d\.')
+SENSENUM_RE = re.compile(r'\.[\d]+\.')
 
 
 ######################################################################
@@ -132,13 +132,13 @@ class _WordNetObject(object):
         return self._related('@')
 
     def _hypernyms(self):
-        return self._related('@', sort=False)
+        return self._related('@')
 
     def instance_hypernyms(self):
         return self._related('@i')
 
     def _instance_hypernyms(self):
-        return self._related('@i', sort=False)
+        return self._related('@i')
 
     def hyponyms(self):
         return self._related('~')
@@ -905,7 +905,7 @@ class Synset(_WordNetObject):
         if len(subsumers) == 0:
             return None
 
-        subsumer = subsumers[0]
+        subsumer = self if self in subsumers else subsumers[0]
 
         # Get the longest path from the LCS to the root,
         # including a correction:
@@ -1244,7 +1244,13 @@ class WordNetCorpusReader(CorpusReader):
         # cannot simply split on first '.',
         # e.g.: '.45_caliber.a.01..45_caliber'
         separator = SENSENUM_RE.search(name).start()
-        synset_name, lemma_name = name[:separator+3], name[separator+4:]
+
+        leadingZero = int(name[separator+1]) == 0
+        if (leadingZero):
+            synset_name, lemma_name = name[:separator+3], name[separator+4:]
+        else:
+            synset_name, lemma_name = name[:separator+2], name[separator+3:]
+        
         synset = self.synset(synset_name)
         for lemma in synset.lemmas(lang):
             if lemma._name == lemma_name:
@@ -2056,98 +2062,3 @@ def teardown_module(module=None):
     from nltk.corpus import wordnet
     wordnet._unload()
 
-
-######################################################################
-# Demo
-######################################################################
-
-def demo():
-    import nltk
-    print('loading wordnet')
-    wn = WordNetCorpusReader(nltk.data.find('corpora/wordnet'), None)
-    print('done loading')
-    S = wn.synset
-    L = wn.lemma
-
-    print('getting a synset for go')
-    move_synset = S('go.v.21')
-    print(move_synset.name(), move_synset.pos(), move_synset.lexname())
-    print(move_synset.lemma_names())
-    print(move_synset.definition())
-    print(move_synset.examples())
-
-    zap_n = ['zap.n.01']
-    zap_v = ['zap.v.01', 'zap.v.02', 'nuke.v.01', 'microwave.v.01']
-
-    def _get_synsets(synset_strings):
-        return [S(synset) for synset in synset_strings]
-
-    zap_n_synsets = _get_synsets(zap_n)
-    zap_v_synsets = _get_synsets(zap_v)
-
-    print(zap_n_synsets)
-    print(zap_v_synsets)
-
-    print("Navigations:")
-    print(S('travel.v.01').hypernyms())
-    print(S('travel.v.02').hypernyms())
-    print(S('travel.v.03').hypernyms())
-
-    print(L('zap.v.03.nuke').derivationally_related_forms())
-    print(L('zap.v.03.atomize').derivationally_related_forms())
-    print(L('zap.v.03.atomise').derivationally_related_forms())
-    print(L('zap.v.03.zap').derivationally_related_forms())
-
-    print(S('dog.n.01').member_holonyms())
-    print(S('dog.n.01').part_meronyms())
-
-    print(S('breakfast.n.1').hypernyms())
-    print(S('meal.n.1').hyponyms())
-    print(S('Austen.n.1').instance_hypernyms())
-    print(S('composer.n.1').instance_hyponyms())
-
-    print(S('faculty.n.2').member_meronyms())
-    print(S('copilot.n.1').member_holonyms())
-
-    print(S('table.n.2').part_meronyms())
-    print(S('course.n.7').part_holonyms())
-
-    print(S('water.n.1').substance_meronyms())
-    print(S('gin.n.1').substance_holonyms())
-
-    print(L('leader.n.1.leader').antonyms())
-    print(L('increase.v.1.increase').antonyms())
-
-    print(S('snore.v.1').entailments())
-    print(S('heavy.a.1').similar_tos())
-    print(S('light.a.1').attributes())
-    print(S('heavy.a.1').attributes())
-
-    print(L('English.a.1.English').pertainyms())
-
-    print(S('person.n.01').root_hypernyms())
-    print(S('sail.v.01').root_hypernyms())
-    print(S('fall.v.12').root_hypernyms())
-
-    print(S('person.n.01').lowest_common_hypernyms(S('dog.n.01')))
-    print(S('woman.n.01').lowest_common_hypernyms(S('girlfriend.n.02')))
-
-    print(S('dog.n.01').path_similarity(S('cat.n.01')))
-    print(S('dog.n.01').lch_similarity(S('cat.n.01')))
-    print(S('dog.n.01').wup_similarity(S('cat.n.01')))
-
-    wnic = WordNetICCorpusReader(nltk.data.find('corpora/wordnet_ic'),
-                                 '.*\.dat')
-    ic = wnic.ic('ic-brown.dat')
-    print(S('dog.n.01').jcn_similarity(S('cat.n.01'), ic))
-
-    ic = wnic.ic('ic-semcor.dat')
-    print(S('dog.n.01').lin_similarity(S('cat.n.01'), ic))
-
-    print(S('code.n.03').topic_domains())
-    print(S('pukka.a.01').region_domains())
-    print(S('freaky.a.01').usage_domains())
-
-
-if __name__ == '__main__':
-    demo()
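
Note: the wordnet.py hunk above loosens SENSENUM_RE and adds a branch for non-zero-padded sense numbers when WordNetCorpusReader splits a name of the form '<synset>.<lemma>'. A quick sanity check, assuming the wordnet corpus is installed:

    from nltk.corpus import wordnet as wn

    # The name is split into its synset part and its lemma part by the reader:
    lem = wn.lemma('dog.n.01.dog')
    print(lem.synset().name(), lem.name())      # dog.n.01 dog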
diff --git a/nltk/data.py b/nltk/data.py
index 4f4e375..3295bb8 100644
--- a/nltk/data.py
+++ b/nltk/data.py
@@ -35,16 +35,27 @@ from __future__ import division
 from abc import ABCMeta, abstractmethod
 from six import add_metaclass
 
-import sys
+import functools
+import textwrap
 import io
 import os
-import textwrap
 import re
+import sys
 import zipfile
 import codecs
 
 from gzip import GzipFile, READ as GZ_READ, WRITE as GZ_WRITE
 
+try: # Python 3.
+    textwrap_indent = functools.partial(textwrap.indent, prefix='  ')
+except AttributeError: # Python 2; indent() not available for Python2.
+    textwrap_fill = functools.partial(textwrap.fill,
+                                        initial_indent='  ',
+                                        subsequent_indent='  ',
+                                        replace_whitespace=False)
+    def textwrap_indent(text):
+        return '\n'.join(textwrap_fill(line) for line in text.splitlines())
+
 try:
     from zlib import Z_SYNC_FLUSH as FLUSH
 except ImportError:
@@ -94,7 +105,9 @@ else:
         str('/usr/share/nltk_data'),
         str('/usr/local/share/nltk_data'),
         str('/usr/lib/nltk_data'),
-        str('/usr/local/lib/nltk_data')
+        str('/usr/local/lib/nltk_data'),
+        os.path.join(sys.prefix, str('nltk_data')),
+        os.path.join(sys.prefix, str('lib'), str('nltk_data'))
     ]
 
 
@@ -641,15 +654,22 @@ def find(resource_name, paths=None):
             except LookupError:
                 pass
 
+    # Identify the package (i.e. the .zip file) to download.
+    resource_zipname = resource_name.split('/')[1]
+    if resource_zipname.endswith('.zip'):
+        resource_zipname = resource_zipname.rpartition('.')[0]
     # Display a friendly error message if the resource wasn't found:
-    msg = textwrap.fill(
-        'Resource %r not found.  Please use the NLTK Downloader to '
-        'obtain the resource:  >>> nltk.download()' %
-        (resource_name,), initial_indent='  ', subsequent_indent='  ',
-        width=66)
+    msg = str("Resource \33[93m{resource}\033[0m not found.\n"
+              "Please use the NLTK Downloader to obtain the resource:\n\n"
+              "\33[31m" # To display red text in terminal.
+              ">>> import nltk\n"
+              ">>> nltk.download(\'{resource}\')\n"
+              "\033[0m").format(resource=resource_zipname)
+    msg = textwrap_indent(msg)
+
     msg += '\n  Searched in:' + ''.join('\n    - %r' % d for d in paths)
     sep = '*' * 70
-    resource_not_found = '\n%s\n%s\n%s' % (sep, msg, sep)
+    resource_not_found = '\n%s\n%s\n%s\n' % (sep, msg, sep)
     raise LookupError(resource_not_found)
 
 
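Note: nltk/data.py now also searches under sys.prefix and, on a failed lookup, names the exact package to download rather than giving the generic nltk.download() hint. A small sketch of what callers see, assuming the resource is not installed locally:

    import nltk

    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError as err:
        # The message now suggests the specific call, e.g.
        #   >>> import nltk
        #   >>> nltk.download('punkt')
        # (with ANSI colour codes around the resource name).
        print(err)
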
diff --git a/nltk/downloader.py b/nltk/downloader.py
index 7beb2c4..452fade 100644
--- a/nltk/downloader.py
+++ b/nltk/downloader.py
@@ -2258,7 +2258,8 @@ if __name__ == '__main__':
     parser.add_option("-e", "--exit-on-error", dest="halt_on_error", action="store_true",
         default=False, help="exit if an error occurs")
     parser.add_option("-u", "--url", dest="server_index_url",
-        default=None, help="download server index url")
+        default=os.environ.get('NLTK_DOWNLOAD_URL'),
+        help="download server index url")
 
     (options, args) = parser.parse_args()
 
diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py
index 49c428b..f6043ef 100644
--- a/nltk/parse/corenlp.py
+++ b/nltk/parse/corenlp.py
@@ -201,7 +201,7 @@ class GenericCoreNLPParser(ParserI, TokenizerI):
         :type sentences: list(list(str))
         :rtype: iter(iter(Tree))
         """
-
+        # Converting list(list(str)) -> list(str)
         sentences = (' '.join(words) for words in sentences)
         return self.raw_parse_sents(sentences, *args, **kwargs)
 
@@ -271,11 +271,13 @@ class GenericCoreNLPParser(ParserI, TokenizerI):
 
         """
         default_properties = {
-            'ssplit.isOneSentence': 'true',
+            # Only splits on '\n', never inside the sentence.
+            'ssplit.ssplit.eolonly': 'true',
         }
 
         default_properties.update(properties or {})
 
+        """
         for sentence in sentences:
             parsed_data = self.api_call(sentence, properties=default_properties)
 
@@ -284,6 +286,12 @@ class GenericCoreNLPParser(ParserI, TokenizerI):
             for parse in parsed_data['sentences']:
                 tree = self.make_tree(parse)
                 yield iter([tree])
+        """
+        parsed_data = self.api_call('\n'.join(sentences), properties=default_properties)
+        for parsed_sent in parsed_data['sentences']:
+            tree = self.make_tree(parsed_sent)
+            yield iter([tree])
+
 
     def parse_text(self, text, *args, **kwargs):
         """Parse a piece of text.
@@ -320,6 +328,7 @@ class GenericCoreNLPParser(ParserI, TokenizerI):
         """
         default_properties = {
             'annotators': 'tokenize,ssplit',
+
         }
 
         default_properties.update(properties or {})
@@ -328,7 +337,7 @@ class GenericCoreNLPParser(ParserI, TokenizerI):
 
         for sentence in result['sentences']:
             for token in sentence['tokens']:
-                yield token['originalText']
+                yield token['originalText'] or token['word']
 
 
 class CoreNLPParser(GenericCoreNLPParser):
@@ -611,6 +620,40 @@ class CoreNLPDependencyParser(GenericCoreNLPParser):
     ... )
     10
 
+    >>> print(
+    ...     next(
+    ...         dep_parser.raw_parse('The underscore _ should not simply disappear.')
+    ...     ).to_conll(4)
+    ... )  # doctest: +NORMALIZE_WHITESPACE
+    The         DT  3   det
+    underscore  VBP 3   amod
+    _           NN  7   nsubj
+    should      MD  7   aux
+    not         RB  7   neg
+    simply      RB  7   advmod
+    disappear   VB  0   ROOT
+    .           .   7   punct
+
+    >>> print(
+    ...     '\\n'.join(
+    ...         next(
+    ...             dep_parser.raw_parse(
+    ...                 'for all of its insights into the dream world of teen life , and its electronic expression through '
+    ...                 'cyber culture , the film gives no quarter to anyone seeking to pull a cohesive story out of its 2 '
+    ...                 '1/2-hour running time .'
+    ...             )
+    ...         ).to_conll(4).split('\\n')[-8:]
+    ...     )
+    ... )
+    its	PRP$	40	nmod:poss
+    2 1/2	CD	40	nummod
+    -	:	40	punct
+    hour	NN	31	nmod
+    running	VBG	42	amod
+    time	NN	40	dep
+    .	.	24	punct
+    <BLANKLINE>
+
     """
 
     _OUTPUT_FORMAT = 'conll2007'
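
Note: GenericCoreNLPParser.parse_sents() now sends all pre-tokenized sentences to the server in a single request, joined by '\n' and split only on newlines, and tokenize() falls back to token['word'] when 'originalText' is empty. A minimal sketch, assuming a CoreNLP server is listening on localhost:9000:

    from nltk.parse.corenlp import CoreNLPParser

    parser = CoreNLPParser(url='http://localhost:9000')
    sentences = [['The', 'quick', 'brown', 'fox', 'jumps', '.'],
                 ['It', 'was', 'quick', '.']]
    # One API call for the whole batch; each sentence yields an iterator of trees.
    for tree_iter in parser.parse_sents(sentences):
        for tree in tree_iter:
            tree.pprint()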
diff --git a/nltk/parse/recursivedescent.py b/nltk/parse/recursivedescent.py
index d59eb3e..a84a12f 100644
--- a/nltk/parse/recursivedescent.py
+++ b/nltk/parse/recursivedescent.py
@@ -351,8 +351,7 @@ class SteppingRecursiveDescentParser(RecursiveDescentParser):
     :see: ``nltk.grammar``
     """
     def __init__(self, grammar, trace=0):
-        self._grammar = grammar
-        self._trace = trace
+        super(SteppingRecursiveDescentParser, self).__init__(grammar, trace)
         self._rtext = None
         self._tree = None
         self._frontier = [()]
diff --git a/nltk/parse/shiftreduce.py b/nltk/parse/shiftreduce.py
index 4ade68a..7fc8289 100644
--- a/nltk/parse/shiftreduce.py
+++ b/nltk/parse/shiftreduce.py
@@ -290,8 +290,7 @@ class SteppingShiftReduceParser(ShiftReduceParser):
     :see: ``nltk.grammar``
     """
     def __init__(self, grammar, trace=0):
-        self._grammar = grammar
-        self._trace = trace
+        super(SteppingShiftReduceParser, self).__init__(grammar, trace)
         self._stack = None
         self._remaining_text = None
         self._history = []
diff --git a/nltk/sem/logic.py b/nltk/sem/logic.py
index dd144d9..1053802 100644
--- a/nltk/sem/logic.py
+++ b/nltk/sem/logic.py
@@ -803,7 +803,7 @@ def read_type(type_string):
 
 class TypeException(Exception):
     def __init__(self, msg):
-        Exception.__init__(self, msg)
+        super(TypeException, self).__init__(msg)
 
 class InconsistentTypeHierarchyException(TypeException):
     def __init__(self, variable, expression=None):
@@ -813,21 +813,20 @@ class InconsistentTypeHierarchyException(TypeException):
         else:
             msg = "The variable '%s' was found in multiple places with different"\
                 " types." % (variable)
-        Exception.__init__(self, msg)
+        super(InconsistentTypeHierarchyException, self).__init__(msg)
 
 class TypeResolutionException(TypeException):
     def __init__(self, expression, other_type):
-        Exception.__init__(self, "The type of '%s', '%s', cannot be "
-                           "resolved with type '%s'" % \
-                           (expression, expression.type, other_type))
+        super(TypeResolutionException, self).__init__(
+            "The type of '%s', '%s', cannot be resolved with type '%s'" %
+            (expression, expression.type, other_type))
 
 class IllegalTypeException(TypeException):
     def __init__(self, expression, other_type, allowed_type):
-        Exception.__init__(self, "Cannot set type of %s '%s' to '%s'; "
-                           "must match type '%s'." %
-                           (expression.__class__.__name__, expression,
-                            other_type, allowed_type))
-
+        super(IllegalTypeException, self).__init__(
+            "Cannot set type of %s '%s' to '%s'; must match type '%s'." %
+            (expression.__class__.__name__, expression, other_type,
+            allowed_type))
 
 def typecheck(expressions, signature=None):
     """
diff --git a/nltk/sentiment/util.py b/nltk/sentiment/util.py
index a26a2b5..b8e3fbe 100644
--- a/nltk/sentiment/util.py
+++ b/nltk/sentiment/util.py
@@ -12,7 +12,6 @@ Utility methods for Sentiment Analysis.
 """
 from __future__ import division
 
-from copy import deepcopy
 import codecs
 import csv
 import json
@@ -21,6 +20,8 @@ import random
 import re
 import sys
 import time
+from copy import deepcopy
+from itertools import tee
 
 import nltk
 from nltk.corpus import CategorizedPlaintextCorpusReader
@@ -64,6 +65,7 @@ SAD = set([
     ':c', ':{', '>:\\', ';('
     ])
 
+
 def timer(method):
     """
     A timer decorator to measure execution performance of methods.
@@ -84,6 +86,13 @@ def timer(method):
         return result
     return timed
 
+
+def pairwise(iterable):
+    """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
+    a, b = tee(iterable)
+    next(b, None)
+    return zip(a, b)
+
 #////////////////////////////////////////////////////////////
 #{ Feature extractor functions
 #////////////////////////////////////////////////////////////
diff --git a/nltk/sentiment/vader.py b/nltk/sentiment/vader.py
index 72e0ed9..2d232ba 100644
--- a/nltk/sentiment/vader.py
+++ b/nltk/sentiment/vader.py
@@ -27,6 +27,7 @@ import re
 import string
 from itertools import product
 import nltk.data
+from .util import pairwise
 
 ##Constants##
 
@@ -45,15 +46,14 @@ REGEX_REMOVE_PUNCTUATION = re.compile('[{0}]'.format(re.escape(string.punctuatio
 
 PUNC_LIST = [".", "!", "?", ",", ";", ":", "-", "'", "\"",
              "!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?"]
-NEGATE = \
-["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
+NEGATE = {"aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
  "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
  "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
  "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
  "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
  "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
  "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
- "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]
+ "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"}
 
 # booster/dampener 'intensifiers' or 'degree adverbs'
 # http://en.wiktionary.org/wiki/Category:English_degree_adverbs
@@ -88,18 +88,14 @@ def negated(input_words, include_nt=True):
     """
     Determine if input contains negation words
     """
-    neg_words = []
-    neg_words.extend(NEGATE)
-    for word in neg_words:
-        if word in input_words:
-            return True
+    neg_words = NEGATE
+    if any(word.lower() in neg_words for word in input_words):
+        return True
     if include_nt:
-        for word in input_words:
-            if "n't" in word:
-                return True
-    if "least" in input_words:
-        i = input_words.index("least")
-        if i > 0 and input_words[i-1] != "at":
+        if any("n't" in word.lower() for word in input_words):
+            return True
+    for first, second in pairwise(input_words):
+        if second.lower() == "least" and first.lower() != 'at':
             return True
     return False
 
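The rewritten `negated` helper above is now case-insensitive and uses `pairwise` to flag "least" only when it is not directly preceded by "at". A minimal sketch of the resulting behaviour, assuming this version of nltk.sentiment.vader is importable:

    from nltk.sentiment.vader import negated

    print(negated(["This", "is", "NOT", "good"]))       # True: case-insensitive match against NEGATE
    print(negated(["at", "least", "it", "works"]))      # False: "least" is preceded by "at"
    print(negated(["the", "least", "helpful", "one"]))  # True: "least" without a preceding "at"
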
diff --git a/nltk/stem/arlstem.py b/nltk/stem/arlstem.py
new file mode 100644
index 0000000..81de360
--- /dev/null
+++ b/nltk/stem/arlstem.py
@@ -0,0 +1,355 @@
+# -*- coding: utf-8 -*-
+#
+# Natural Language Toolkit: ARLSTem Stemmer
+#
+# Copyright (C) 2001-2017 NLTK Project
+#
+# Author: Kheireddine Abainia (x-programer) <k.abainia at gmail.com>
+# Algorithms: Kheireddine Abainia <k.abainia at gmail.com>
+#                         Siham Ouamour
+#                         Halim Sayoud
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+
+"""
+ARLSTem Arabic Stemmer
+The details about the implementation of this algorithm are described in:
+K. Abainia, S. Ouamour and H. Sayoud, A Novel Robust Arabic Light Stemmer,
+Journal of Experimental & Theoretical Artificial Intelligence (JETAI'17),
+Vol. 29, No. 3, 2017, pp. 557-573.
+The ARLSTem is a light Arabic stemmer that is based on removing the affixes
+from the word (i.e. prefixes, suffixes and infixes). It was evaluated and
+compared to several other stemmers using Paice's parameters (under-stemming
+index, over-stemming index and stemming weight), and the results showed that
+ARLSTem is promising and produces high performance. This stemmer is not
+based on any dictionary and can be used online effectively.
+"""
+from __future__ import unicode_literals
+import re
+
+from nltk.stem.api import StemmerI
+
+
+class ARLSTem(StemmerI):
+    '''
+    ARLSTem stemmer: a light Arabic stemming algorithm without any dictionary.
+    Department of Telecommunication & Information Processing. USTHB University,
+    Algiers, Algeria.
+    ARLSTem.stem(token) returns the Arabic stem for the input token.
+    The ARLSTem stemmer requires that all tokens be encoded in Unicode.
+    '''
+
+    def __init__(self):
+        # different Alif with hamza
+        self.re_hamzated_alif = re.compile(r'[\u0622\u0623\u0625]')
+        self.re_alifMaqsura = re.compile(r'[\u0649]')
+        self.re_diacritics = re.compile(r'[\u064B-\u065F]')
+
+        # Alif Laam, Laam Laam, Fa Laam, Fa Ba
+        self.pr2 = [
+            '\u0627\u0644', '\u0644\u0644',
+            '\u0641\u0644', '\u0641\u0628'
+            ]
+        # Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam
+        self.pr3 = [
+            '\u0628\u0627\u0644',
+            '\u0643\u0627\u0644',
+            '\u0648\u0627\u0644'
+            ]
+        # Fa Laam Laam, Waaw Laam Laam
+        self.pr32 = ['\u0641\u0644\u0644', '\u0648\u0644\u0644']
+        # Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam
+        self.pr4 = [
+            '\u0641\u0628\u0627\u0644',
+            '\u0648\u0628\u0627\u0644',
+            '\u0641\u0643\u0627\u0644'
+            ]
+
+        # Kaf Yaa, Kaf Miim
+        self.su2 = [
+            '\u0643\u064A',
+            '\u0643\u0645'
+            ]
+        # Ha Alif, Ha Miim
+        self.su22 = ['\u0647\u0627', '\u0647\u0645']
+        # Kaf Miim Alif, Kaf Noon Shadda
+        self.su3 = ['\u0643\u0645\u0627', '\u0643\u0646\u0651']
+        # Ha Miim Alif, Ha Noon Shadda
+        self.su32 = ['\u0647\u0645\u0627', '\u0647\u0646\u0651']
+
+        # Alif Noon, Ya Noon, Waaw Noon
+        self.pl_si2 = ['\u0627\u0646', '\u064A\u0646', '\u0648\u0646']
+        # Taa Alif Noon, Taa Ya Noon
+        self.pl_si3 = ['\u062A\u0627\u0646', '\u062A\u064A\u0646']
+
+        # Alif Noon, Waaw Noon
+        self.verb_su2 = ['\u0627\u0646', '\u0648\u0646']
+        # Siin Taa, Siin Yaa
+        self.verb_pr2 = ['\u0633\u062A', '\u0633\u064A']
+        # Siin Alif, Siin Noon
+        self.verb_pr22 = ['\u0633\u0627', '\u0633\u0646']
+
+        # Taa Miim Alif, Taa Noon Shadda
+        self.verb_suf3 = ['\u062A\u0645\u0627', '\u062A\u0646\u0651']
+        # Noon Alif, Taa Miim, Taa Alif, Waaw Alif
+        self.verb_suf2 = [
+            '\u0646\u0627', '\u062A\u0645',
+            '\u062A\u0627', '\u0648\u0627'
+            ]
+        # Taa, Alif, Noon
+        self.verb_suf1 = ['\u062A', '\u0627', '\u0646']
+
+    def stem(self, token):
+        """
+            Call this function to get the word's stem based on ARLSTem.
+        """
+        try:
+            if token is None:
+                raise ValueError("The word could not be stemmed, "
+                                 "because it is empty!")
+            # remove Arabic diacritics and replace some letters with others
+            token = self.norm(token)
+            # strip common prefixes of the nouns
+            pre = self.pref(token)
+            if pre is not None:
+                token = pre
+            # strip the suffixes which are common to nouns and verbs
+            token = self.suff(token)
+            # transform a plural noun to a singular noun
+            ps = self.plur2sing(token)
+            if ps is None:
+                # transform from the feminine form to the masculine form
+                fm = self.fem2masc(token)
+                if fm is not None:
+                    return fm
+                else:
+                    if pre is None:  # if the prefixes are not stripped
+                        # strip the verb prefixes and suffixes
+                        return self.verb(token)
+            else:
+                return ps
+            return token
+        except ValueError as e:
+            print(e)
+
+    def norm(self, token):
+        """
+            normalize the word by removing diacritics, replacing hamzated Alif
+            with Alif, replacing AlifMaqsura with Yaa and removing Waaw at the
+            beginning.
+        """
+        # strip Arabic diacritics
+        token = self.re_diacritics.sub('', token)
+        # replace Hamzated Alif with Alif bare
+        token = self.re_hamzated_alif.sub('\u0627', token)
+        # replace alifMaqsura with Yaa
+        token = self.re_alifMaqsura.sub('\u064A', token)
+        # strip the Waaw from the word beginning if the remaining is 3 letters
+        # at least
+        if token.startswith('\u0648') and len(token) > 3:
+            token = token[1:]
+        return token
+
+    def pref(self, token):
+        """
+            remove prefixes from the word's beginning.
+        """
+        if len(token) > 5:
+            for p3 in self.pr3:
+                if token.startswith(p3):
+                    return token[3:]
+        if len(token) > 6:
+            for p4 in self.pr4:
+                if token.startswith(p4):
+                    return token[4:]
+        if len(token) > 5:
+            for p3 in self.pr32:
+                if token.startswith(p3):
+                    return token[3:]
+        if len(token) > 4:
+            for p2 in self.pr2:
+                if token.startswith(p2):
+                    return token[2:]
+
+    def suff(self, token):
+        """
+            remove suffixes from the word's end.
+        """
+        if token.endswith('\u0643') and len(token) > 3:
+            return token[:-1]
+        if len(token) > 4:
+            for s2 in self.su2:
+                if token.endswith(s2):
+                    return token[:-2]
+        if len(token) > 5:
+            for s3 in self.su3:
+                if token.endswith(s3):
+                    return token[:-3]
+        if token.endswith('\u0647') and len(token) > 3:
+            token = token[:-1]
+            return token
+        if len(token) > 4:
+            for s2 in self.su22:
+                if token.endswith(s2):
+                    return token[:-2]
+        if len(token) > 5:
+            for s3 in self.su32:
+                if token.endswith(s3):
+                    return token[:-3]
+        if token.endswith('\u0646\u0627') and len(token) > 4:
+            return token[:-2]
+        return token
+
+    def fem2masc(self, token):
+        """
+            transform the word from the feminine form to the masculine form.
+        """
+        if token.endswith('\u0629') and len(token) > 3:
+            return token[:-1]
+
+    def plur2sing(self, token):
+        """
+            transform the word from the plural form to the singular form.
+        """
+        if len(token) > 4:
+            for ps2 in self.pl_si2:
+                if token.endswith(ps2):
+                    return token[:-2]
+        if len(token) > 5:
+            for ps3 in self.pl_si3:
+                if token.endswith(ps3):
+                    return token[:-3]
+        if len(token) > 3 and token.endswith('\u0627\u062A'):
+            return token[:-2]
+        if (len(token) > 3 and token.startswith('\u0627')
+           and token[2] == '\u0627'):
+            return token[:2] + token[3:]
+        if (len(token) > 4 and token.startswith('\u0627')
+           and token[-2] == '\u0627'):
+            return token[1:-2] + token[-1]
+
+    def verb(self, token):
+        """
+            strip the verb prefixes and suffixes, or both
+        """
+        vb = self.verb_t1(token)
+        if vb is not None:
+            return vb
+        vb = self.verb_t2(token)
+        if vb is not None:
+            return vb
+        vb = self.verb_t3(token)
+        if vb is not None:
+            return vb
+        vb = self.verb_t4(token)
+        if vb is not None:
+            return vb
+        return self.verb_t5(token)
+
+    def verb_t1(self, token):
+        """
+            stem the present prefixes and suffixes
+        """
+        if len(token) > 5 and token.startswith('\u062A'):  # Taa
+            for s2 in self.pl_si2:
+                if token.endswith(s2):
+                    return token[1:-2]
+        if len(token) > 5 and token.startswith('\u064A'):  # Yaa
+            for s2 in self.verb_su2:
+                if token.endswith(s2):
+                    return token[1:-2]
+        if len(token) > 4 and token.startswith('\u0627'):  # Alif
+            # Waaw Alif
+            if len(token) > 5 and token.endswith('\u0648\u0627'):
+                return token[1:-2]
+            # Yaa
+            if token.endswith('\u064A'):
+                return token[1:-1]
+            # Alif
+            if token.endswith('\u0627'):
+                return token[1:-1]
+            # Noon
+            if token.endswith('\u0646'):
+                return token[1:-1]
+        # ^Yaa, Noon$
+        if (len(token) > 4
+           and token.startswith('\u064A')
+           and token.endswith('\u0646')):
+            return token[1:-1]
+        # ^Taa, Noon$
+        if (len(token) > 4
+           and token.startswith('\u062A')
+           and token.endswith('\u0646')):
+            return token[1:-1]
+
+    def verb_t2(self, token):
+        """
+            stem the future prefixes and suffixes
+        """
+        if len(token) > 6:
+            for s2 in self.pl_si2:
+                # ^Siin Taa
+                if (token.startswith(self.verb_pr2[0])
+                   and token.endswith(s2)):
+                    return token[2:-2]
+            # ^Siin Yaa, Alif Noon$
+            if (token.startswith(self.verb_pr2[1])
+               and token.endswith(self.pl_si2[0])):
+                return token[2:-2]
+            # ^Siin Yaa, Waaw Noon$
+            if (token.startswith(self.verb_pr2[1])
+               and token.endswith(self.pl_si2[2])):
+                return token[2:-2]
+        # ^Siin Taa, Noon$
+        if (len(token) > 5
+           and token.startswith(self.verb_pr2[0])
+           and token.endswith('\u0646')):
+            return token[2:-1]
+        # ^Siin Yaa, Noon$
+        if (len(token) > 5
+           and token.startswith(self.verb_pr2[1])
+           and token.endswith('\u0646')):
+            return token[2:-1]
+
+    def verb_t3(self, token):
+        """
+            stem the present suffixes
+        """
+        if len(token) > 5:
+            for su3 in self.verb_suf3:
+                if token.endswith(su3):
+                    return token[:-3]
+        if len(token) > 4:
+            for su2 in self.verb_suf2:
+                if token.endswith(su2):
+                    return token[:-2]
+        if len(token) > 3:
+            for su1 in self.verb_suf1:
+                if token.endswith(su1):
+                    return token[:-1]
+
+    def verb_t4(self, token):
+        """
+            stem the present prefixes
+        """
+        if len(token) > 3:
+            for pr1 in self.verb_suf1:
+                if token.startswith(pr1):
+                    return token[1:]
+            if token.startswith('\u064A'):
+                return token[1:]
+
+    def verb_t5(self, token):
+        """
+            stem the future prefixes
+        """
+        if len(token) > 4:
+            for pr2 in self.verb_pr22:
+                if token.startswith(pr2):
+                    return token[2:]
+            for pr2 in self.verb_pr2:
+                if token.startswith(pr2):
+                    return token[2:]
+        return token
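A minimal usage sketch of the new ARLSTem stemmer; the token below is only an illustration (the Arabic word 'الطلاب', "the students"), chosen so that the stemmer strips the definite article:

    # -*- coding: utf-8 -*-
    from nltk.stem.arlstem import ARLSTem

    stemmer = ARLSTem()
    # 'الطلاب' -> 'طلاب': the 'ال' article is removed by pref()
    print(stemmer.stem(u'\u0627\u0644\u0637\u0644\u0627\u0628'))
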
diff --git a/nltk/stem/snowball.py b/nltk/stem/snowball.py
index 3ed2dbb..00b511c 100644
--- a/nltk/stem/snowball.py
+++ b/nltk/stem/snowball.py
@@ -5,7 +5,12 @@
 # Copyright (C) 2001-2017 NLTK Project
 # Author: Peter Michael Stahl <pemistahl at gmail.com>
 #         Peter Ljunglof <peter.ljunglof at heatherleaf.se> (revisions)
+#         Lakhdar Benzahia <lakhdar.benzahia at gmail.com>  (co-writer)
+#         Assem Chelli <assem.ch at gmail.com>  (reviewer arabicstemmer)
+#         Abdelkrim Aries <ab_aries at esi.dz> (reviewer arabicstemmer)
 # Algorithms: Dr Martin Porter <martin at tartarus.org>
+#             Assem Chelli <assem.ch at gmail.com>  arabic stemming algorithm
+#             Benzahia Lakhdar <lakhdar.benzahia at gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
@@ -21,11 +26,12 @@ There is also a demo function: `snowball.demo()`.
 from __future__ import unicode_literals, print_function
 
 from six.moves import input
+import re
 
 from nltk import compat
 from nltk.corpus import stopwords
 from nltk.stem import porter
-from nltk.stem.util import suffix_replace
+from nltk.stem.util import suffix_replace, prefix_replace
 
 from nltk.stem.api import StemmerI
 
@@ -36,7 +42,7 @@ class SnowballStemmer(StemmerI):
     Snowball Stemmer
 
     The following languages are supported:
-    Danish, Dutch, English, Finnish, French, German,
+    Arabic, Danish, Dutch, English, Finnish, French, German,
     Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian,
     Spanish and Swedish.
 
@@ -55,7 +61,7 @@ class SnowballStemmer(StemmerI):
 
     >>> from nltk.stem import SnowballStemmer
     >>> print(" ".join(SnowballStemmer.languages)) # See which languages are supported
-    danish dutch english finnish french german hungarian
+    arabic danish dutch english finnish french german hungarian
     italian norwegian porter portuguese romanian russian
     spanish swedish
     >>> stemmer = SnowballStemmer("german") # Choose a language
@@ -81,7 +87,7 @@ class SnowballStemmer(StemmerI):
                            language, a ValueError is raised.
     """
 
-    languages = ("danish", "dutch", "english", "finnish", "french", "german",
+    languages = ("arabic", "danish", "dutch", "english", "finnish", "french", "german",
                  "hungarian", "italian", "norwegian", "porter", "portuguese",
                  "romanian", "russian", "spanish", "swedish")
 
@@ -288,6 +294,520 @@ class _StandardStemmer(_LanguageSpecificStemmer):
 
         return rv
 
+class ArabicStemmer(_LanguageSpecificStemmer):
+    """
+        https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm)
+        The Snowball Arabic light Stemmer
+        Algorithm : Assem Chelli
+                   Abdelkrim Aries
+                   Lakhdar Benzahia
+        Nltk Version Author : Lakhdar Benzahia
+    """
+    # Normalize_pre steps
+    __vocalization = re.compile(r'[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]') # ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ
+
+    __kasheeda = re.compile(r'[\u0640]') # ـ tatweel/kasheeda
+
+    __arabic_punctuation_marks = re.compile(r'[\u060C-\u061B-\u061F]') #  ؛ ، ؟
+
+    # Normalize_post
+    __last_hamzat = ('\u0623', '\u0625', '\u0622', '\u0624', '\u0626') # أ، إ، آ، ؤ، ئ
+
+    # normalize other hamza's
+    __initial_hamzat = re.compile(r'^[\u0622\u0623\u0625]') #  أ، إ، آ
+
+    __waw_hamza = re.compile(r'[\u0624]') # ؤ
+
+    __yeh_hamza = re.compile(r'[\u0626]') # ئ
+
+    __alefat = re.compile(r'[\u0623\u0622\u0625]') #  أ، إ، آ
+
+    # Checks
+    __checks1 = ('\u0643\u0627\u0644', '\u0628\u0627\u0644',  # بال، كال
+                 '\u0627\u0644', '\u0644\u0644' # لل، ال
+                 )
+
+    __checks2 = ('\u0629', # ة
+                 '\u0627\u062a'  #  female plural ات
+                 )
+
+    # Suffixes
+    __suffix_noun_step1a = ('\u064a', '\u0643', '\u0647', # ي، ك، ه
+                            '\u0646\u0627', '\u0643\u0645', '\u0647\u0627', '\u0647\u0646', '\u0647\u0645', # نا، كم، ها، هن، هم
+                            '\u0643\u0645\u0627', '\u0647\u0645\u0627' # كما، هما
+                            )
+
+    __suffix_noun_step1b = ('\u0646',) # ن
+
+    __suffix_noun_step2a = ('\u0627', '\u064a', '\u0648') # ا، ي، و
+
+    __suffix_noun_step2b = ('\u0627\u062a',) # ات
+
+    __suffix_noun_step2c1 = ('\u062a',) # ت
+
+    __suffix_noun_step2c2 = ('\u0629',) # ة
+
+    __suffix_noun_step3 = ('\u064a',) # ي
+
+    __suffix_verb_step1 = ('\u0647', '\u0643', # ه، ك
+                           '\u0646\u064a', '\u0646\u0627', '\u0647\u0627', '\u0647\u0645', # ني، نا، ها، هم
+                           '\u0647\u0646', '\u0643\u0645', '\u0643\u0646', # هن، كم، كن
+                           '\u0647\u0645\u0627', '\u0643\u0645\u0627', '\u0643\u0645\u0648' # هما، كما، كمو
+                          )
+
+    __suffix_verb_step2a = ( '\u062a', '\u0627', '\u0646' , '\u064a', # ت، ا، ن، ي
+                             '\u0646\u0627', '\u062a\u0627', '\u062a\u0646', # نا، تا، تن Past
+                             '\u0627\u0646', '\u0648\u0646', '\u064a\u0646', # ان، هن، ين Present
+                             '\u062a\u0645\u0627' # تما
+                           )
+
+    __suffix_verb_step2b = ('\u0648\u0627','\u062a\u0645') # وا، تم
+
+    __suffix_verb_step2c = ('\u0648', # و
+                            '\u062a\u0645\u0648' # تمو
+                           )
+
+    __suffix_all_alef_maqsura = ('\u0649',) # ى
+
+    # Prefixes
+    __prefix_step1 = ('\u0623', # أ
+                      '\u0623\u0623', '\u0623\u0622', '\u0623\u0624', '\u0623\u0627', '\u0623\u0625', # أأ، أآ، أؤ، أا، أإ
+                      )
+
+    __prefix_step2a = ('\u0641\u0627\u0644', '\u0648\u0627\u0644') # فال، وال
+
+    __prefix_step2b = ('\u0641', '\u0648') # ف، و
+
+    __prefix_step3a_noun = ('\u0627\u0644', '\u0644\u0644', # لل، ال
+                            '\u0643\u0627\u0644', '\u0628\u0627\u0644', # بال، كال
+                            )
+
+    __prefix_step3b_noun = ('\u0628', '\u0643', '\u0644', # ب، ك، ل
+                            '\u0628\u0628', '\u0643\u0643' # بب، كك
+                           )
+
+    __prefix_step3_verb = ('\u0633\u064a', '\u0633\u062a', '\u0633\u0646', '\u0633\u0623') # سي، ست، سن، سأ
+
+    __prefix_step4_verb = ('\u064a\u0633\u062a', '\u0646\u0633\u062a', '\u062a\u0633\u062a') # يست، نست، تست
+
+    # Suffixes added due to Conjugation Verbs
+    __conjugation_suffix_verb_1 = ('\u0647', '\u0643') # ه، ك
+
+    __conjugation_suffix_verb_2 = ('\u0646\u064a', '\u0646\u0627','\u0647\u0627', # ني، نا، ها
+                                   '\u0647\u0645', '\u0647\u0646', '\u0643\u0645', # هم، هن، كم
+                                   '\u0643\u0646' # كن
+                                   )
+    __conjugation_suffix_verb_3 = ('\u0647\u0645\u0627', '\u0643\u0645\u0627', '\u0643\u0645\u0648') # هما، كما، كمو
+
+    __conjugation_suffix_verb_4 = ('\u0627', '\u0646', '\u064a') # ا، ن، ي
+
+    __conjugation_suffix_verb_past = ('\u0646\u0627', '\u062a\u0627', '\u062a\u0646') # نا، تا، تن
+
+    __conjugation_suffix_verb_present = ('\u0627\u0646', '\u0648\u0646', '\u064a\u0646') # ان، ون، ين
+
+    # Suffixes added due to derivation Names
+    __conjugation_suffix_noun_1 = ('\u064a', '\u0643', '\u0647') # ي، ك، ه
+
+    __conjugation_suffix_noun_2 = ('\u0646\u0627', '\u0643\u0645', # نا، كم
+                                   '\u0647\u0627', '\u0647\u0646', '\u0647\u0645' # ها، هن، هم
+                                   )
+
+    __conjugation_suffix_noun_3 = ('\u0643\u0645\u0627', '\u0647\u0645\u0627') # كما، هما
+
+    # Prefixes added due to derivation Names
+    __prefixes1 = ('\u0648\u0627', '\u0641\u0627') # فا، وا
+
+    __articles_3len = ('\u0643\u0627\u0644', '\u0628\u0627\u0644')  # بال كال
+
+    __articles_2len = ('\u0627\u0644', '\u0644\u0644')  # ال لل
+
+    # Prepositions letters
+    __prepositions1 = ('\u0643', '\u0644') # ك، ل
+    __prepositions2 = ('\u0628\u0628', '\u0643\u0643') # بب، كك
+
+    is_verb = True
+    is_noun = True
+    is_defined = False
+
+    suffixes_verb_step1_success = False
+    suffix_verb_step2a_success = False
+    suffix_verb_step2b_success = False
+    suffix_noun_step2c2_success = False
+    suffix_noun_step1a_success = False
+    suffix_noun_step2a_success = False
+    suffix_noun_step2b_success = False
+    suffixe_noun_step1b_success = False
+    prefix_step2a_success = False
+    prefix_step3a_noun_success = False
+    prefix_step3b_noun_success = False
+
+    def __normalize_pre(self, token):
+        """
+        :param token: string
+        :return: normalized token type string
+        """
+        # strip diacritics
+        token = self.__vocalization.sub('', token)
+        #strip kasheeda
+        token = self.__kasheeda.sub('', token)
+        # strip punctuation marks
+        token = self.__arabic_punctuation_marks.sub('', token)
+        return token
+
+    def __normalize_post(self, token):
+        # normalize last hamza
+        for hamza in self.__last_hamzat:
+            if token.endswith(hamza):
+                token = suffix_replace(token, hamza, '\u0621')
+                break
+        # normalize other hamzat
+        token = self.__initial_hamzat.sub('\u0627', token)
+        token = self.__waw_hamza.sub('\u0648', token)
+        token = self.__yeh_hamza.sub('\u064a', token)
+        token = self.__alefat.sub('\u0627', token)
+        return  token
+
+    def __checks_1(self, token):
+        for prefix in self.__checks1 :
+            if token.startswith(prefix):
+                if prefix in self.__articles_3len and len(token) > 4 :
+                    self.is_noun = True
+                    self.is_verb = False
+                    self.is_defined = True
+                    break
+
+                if prefix in self.__articles_2len and len(token) > 3 :
+                    self.is_noun = True
+                    self.is_verb = False
+                    self.is_defined = True
+                    break
+
+    def __checks_2(self, token):
+        for suffix in self.__checks2:
+            if token.endswith(suffix):
+                if suffix == '\u0629' and len(token) > 2:
+                    self.is_noun = True
+                    self.is_verb = False
+                    break
+
+                if suffix == '\u0627\u062a' and len(token) > 3:
+                    self.is_noun = True
+                    self.is_verb = False
+                    break
+
+    def __Suffix_Verb_Step1(self, token):
+        for suffix in self.__suffix_verb_step1:
+            if token.endswith(suffix):
+                if suffix in self.__conjugation_suffix_verb_1 and len(token) >= 4:
+                    token = token[:-1]
+                    self.suffixes_verb_step1_success = True
+                    break
+
+                if suffix in self.__conjugation_suffix_verb_2 and len(token) >= 5:
+                    token = token[:-2]
+                    self.suffixes_verb_step1_success = True
+                    break
+
+                if suffix in self.__conjugation_suffix_verb_3 and len(token) >= 6:
+                    token = token[:-3]
+                    self.suffixes_verb_step1_success = True
+                    break
+        return token
+
+    def __Suffix_Verb_Step2a(self, token):
+        for suffix in self.__suffix_verb_step2a:
+            if token.endswith(suffix):
+                if suffix == '\u062a' and len(token) >= 4:
+                    token = token[:-1]
+                    self.suffix_verb_step2a_success = True
+                    break
+
+                if suffix in self.__conjugation_suffix_verb_4 and len(token) >= 4:
+                    token = token[:-1]
+                    self.suffix_verb_step2a_success = True
+                    break
+
+                if suffix in self.__conjugation_suffix_verb_past and len(token) >= 5:
+                    token = token[:-2]  # past
+                    self.suffix_verb_step2a_success = True
+                    break
+
+                if suffix in self.__conjugation_suffix_verb_present and len(token) > 5:
+                    token = token[:-2]  # present
+                    self.suffix_verb_step2a_success = True
+                    break
+
+                if suffix == '\u062a\u0645\u0627' and len(token) >= 6:
+                    token = token[:-3]
+                    self.suffix_verb_step2a_success = True
+                    break
+        return  token
+
+    def __Suffix_Verb_Step2c(self, token):
+        for suffix in self.__suffix_verb_step2c:
+            if token.endswith(suffix):
+                if suffix == '\u062a\u0645\u0648' and len(token) >= 6:
+                    token = token[:-3]
+                    break
+
+                if suffix == '\u0648' and len(token) >= 4:
+                    token = token[:-1]
+                    break
+        return token
+
+    def __Suffix_Verb_Step2b(self, token):
+        for suffix in self.__suffix_verb_step2b:
+            if token.endswith(suffix) and len(token) >= 5:
+                token = token[:-2]
+                self.suffix_verb_step2b_success = True
+                break
+        return  token
+
+    def __Suffix_Noun_Step2c2(self, token):
+        for suffix in self.__suffix_noun_step2c2:
+            if token.endswith(suffix) and len(token) >= 3:
+                token = token[:-1]
+                self.suffix_noun_step2c2_success = True
+                break
+        return token
+
+    def __Suffix_Noun_Step1a(self, token):
+        for suffix in self.__suffix_noun_step1a:
+            if token.endswith(suffix):
+                if suffix in self.__conjugation_suffix_noun_1 and len(token) >= 4:
+                    token = token[:-1]
+                    self.suffix_noun_step1a_success = True
+                    break
+
+                if suffix in self.__conjugation_suffix_noun_2 and len(token) >= 5:
+                    token = token[:-2]
+                    self.suffix_noun_step1a_success = True
+                    break
+
+                if suffix in self.__conjugation_suffix_noun_3 and len(token) >= 6:
+                    token = token[:-3]
+                    self.suffix_noun_step1a_success = True
+                    break
+        return token
+
+    def __Suffix_Noun_Step2a(self, token):
+        for suffix in self.__suffix_noun_step2a:
+            if token.endswith(suffix) and len(token) > 4:
+                token = token[:-1]
+                self.suffix_noun_step2a_success = True
+                break
+        return token
+
+    def __Suffix_Noun_Step2b(self, token):
+        for suffix in self.__suffix_noun_step2b:
+            if token.endswith(suffix) and len(token) >= 5:
+                token = token[:-2]
+                self.suffix_noun_step2b_success = True
+                break
+        return  token
+
+    def __Suffix_Noun_Step2c1(self, token):
+        for suffix in self.__suffix_noun_step2c1:
+            if token.endswith(suffix) and len(token) >= 4:
+                token = token[:-1]
+                break
+        return token
+
+    def __Suffix_Noun_Step1b(self, token):
+        for suffix in self.__suffix_noun_step1b:
+            if token.endswith(suffix) and len(token) > 5:
+                token = token[:-1]
+                self.suffixe_noun_step1b_success = True
+                break
+        return token
+
+    def __Suffix_Noun_Step3(self, token):
+        for suffix in self.__suffix_noun_step3:
+            if token.endswith(suffix) and len(token) >= 3:
+                token = token[:-1]  # ya' nisbiya
+                break
+        return token
+
+    def __Suffix_All_alef_maqsura(self, token):
+        for suffix in self.__suffix_all_alef_maqsura:
+            if token.endswith(suffix):
+                token = suffix_replace(token, suffix, '\u064a')
+        return  token
+
+    def __Prefix_Step1(self, token):
+        for prefix in self.__prefix_step1:
+            if token.startswith(prefix) and len(token) > 3:
+                if prefix == '\u0623\u0623':
+                    token = prefix_replace(token, prefix, '\u0623')
+                    break
+
+                elif prefix == '\u0623\u0622':
+                    token = prefix_replace(token, prefix, '\u0622')
+                    break
+
+                elif prefix == '\u0623\u0624':
+                    token = prefix_replace(token, prefix, '\u0624')
+                    break
+
+                elif prefix == '\u0623\u0627' :
+                    token = prefix_replace(token, prefix, '\u0627')
+                    break
+
+                elif prefix == '\u0623\u0625' :
+                    token = prefix_replace(token, prefix, '\u0625')
+                    break
+        return token
+
+    def __Prefix_Step2a(self, token):
+        for prefix in self.__prefix_step2a:
+            if token.startswith(prefix) and len(token) > 5:
+                token = token[len(prefix):]
+                self.prefix_step2a_success = True
+                break
+        return  token
+
+    def __Prefix_Step2b(self, token):
+        for prefix in self.__prefix_step2b:
+            if token.startswith(prefix) and len(token) > 3 :
+                if token[:2] not in self.__prefixes1:
+                    token = token[len(prefix):]
+                    break
+        return token
+
+    def __Prefix_Step3a_Noun(self, token):
+        for prefix in self.__prefix_step3a_noun:
+            if token.startswith(prefix):
+                if prefix in self.__articles_2len and len(token) > 4:
+                    token =  token[len(prefix):]
+                    self.prefix_step3a_noun_success = True
+                    break
+                if prefix in self.__articles_3len  and len(token) > 5:
+                    token = token[len(prefix):]
+                    break
+        return token
+
+    def __Prefix_Step3b_Noun(self, token):
+        for prefix in self.__prefix_step3b_noun:
+            if token.startswith(prefix):
+                if len(token) > 3:
+                    if prefix == '\u0628':
+                        token = token[len(prefix):]
+                        self.prefix_step3b_noun_success = True
+                        break
+
+                    if prefix in self.__prepositions2:
+                        token = prefix_replace(token, prefix, prefix[1])
+                        self.prefix_step3b_noun_success = True
+                        break
+
+                if prefix in self.__prepositions1 and len(token) > 4:
+                    token = token[len(prefix):]  # BUG: cause confusion
+                    self.prefix_step3b_noun_success = True
+                    break
+        return token
+
+    def __Prefix_Step3_Verb(self, token):
+        for prefix in self.__prefix_step3_verb:
+            if token.startswith(prefix) and len(token) > 4:
+                token = prefix_replace(token, prefix, prefix[1])
+                break
+        return token
+
+    def __Prefix_Step4_Verb(self, token):
+        for prefix in self.__prefix_step4_verb:
+            if token.startswith(prefix) and len(token) > 4:
+                token = prefix_replace(token, prefix, '\u0627\u0633\u062a')
+                self.is_verb = True
+                self.is_noun = False
+                break
+        return token
+
+    def stem(self, word):
+        """
+         Stem an Arabic word and return the stemmed form.
+        :param word: string
+        :return: string
+        """
+        # set initial values
+        self.is_verb = True
+        self.is_noun = True
+        self.is_defined = False
+
+        self.suffixes_verb_step1_success = False
+        self.suffix_verb_step2a_success = False
+        self.suffix_verb_step2b_success = False
+        self.suffix_noun_step2c2_success = False
+        self.suffix_noun_step1a_success = False
+        self.suffix_noun_step2a_success = False
+        self.suffix_noun_step2b_success = False
+        self.suffixe_noun_step1b_success = False
+        self.prefix_step2a_success = False
+        self.prefix_step3a_noun_success = False
+        self.prefix_step3b_noun_success = False
+
+        modified_word = word
+        # guess type and properties
+        # checks1
+        self.__checks_1(modified_word)
+        # checks2
+        self.__checks_2(modified_word)
+        modified_word = self.__normalize_pre(modified_word)
+        if self.is_verb:
+            modified_word = self.__Suffix_Verb_Step1(modified_word)
+            if self.suffixes_verb_step1_success:
+                modified_word = self.__Suffix_Verb_Step2a(modified_word)
+                if not self.suffix_verb_step2a_success:
+                    modified_word = self.__Suffix_Verb_Step2c(modified_word)
+                #or next
+            else:
+                modified_word = self.__Suffix_Verb_Step2b(modified_word)
+                if not self.suffix_verb_step2b_success:
+                    modified_word = self.__Suffix_Verb_Step2a(modified_word)
+        if self.is_noun:
+            modified_word = self.__Suffix_Noun_Step2c2(modified_word)
+            if not self.suffix_noun_step2c2_success:
+                if not self.is_defined:
+                    modified_word = self.__Suffix_Noun_Step1a(modified_word)
+                    #if self.suffix_noun_step1a_success:
+                    modified_word = self.__Suffix_Noun_Step2a(modified_word)
+                    if not self.suffix_noun_step2a_success:
+                         modified_word = self.__Suffix_Noun_Step2b(modified_word)
+                    if not self.suffix_noun_step2b_success and not self.suffix_noun_step2a_success:
+                        modified_word = self.__Suffix_Noun_Step2c1(modified_word)
+                    # or next ? todo : how to deal with or next
+                else:
+                    modified_word =  self.__Suffix_Noun_Step1b(modified_word)
+                    if self.suffixe_noun_step1b_success:
+                        modified_word = self.__Suffix_Noun_Step2a(modified_word)
+                        if not self.suffix_noun_step2a_success:
+                            modified_word = self.__Suffix_Noun_Step2b(modified_word)
+                        if not self.suffix_noun_step2b_success and not self.suffix_noun_step2a_success:
+                            modified_word = self.__Suffix_Noun_Step2c1(modified_word)
+                    else:
+                        if not self.is_defined:
+                            modified_word = self.__Suffix_Noun_Step2a(modified_word)
+                        modified_word = self.__Suffix_Noun_Step2b(modified_word)
+            modified_word = self.__Suffix_Noun_Step3(modified_word)
+        if not self.is_noun and self.is_verb:
+            modified_word = self.__Suffix_All_alef_maqsura(modified_word)
+
+        # prefixes
+        modified_word = self.__Prefix_Step1(modified_word)
+        modified_word = self.__Prefix_Step2a(modified_word)
+        if not self.prefix_step2a_success:
+            modified_word = self.__Prefix_Step2b(modified_word)
+        modified_word = self.__Prefix_Step3a_Noun(modified_word)
+        if not self.prefix_step3a_noun_success and self.is_noun:
+            modified_word = self.__Prefix_Step3b_Noun(modified_word)
+        else:
+            if not self.prefix_step3b_noun_success and self.is_verb:
+                modified_word = self.__Prefix_Step3_Verb(modified_word)
+                modified_word = self.__Prefix_Step4_Verb(modified_word)
+
+        # post normalization stemming
+        modified_word = self.__normalize_post(modified_word)
+        stemmed_word = modified_word
+        return stemmed_word
+
 class DanishStemmer(_ScandinavianStemmer):
 
     """
@@ -3658,7 +4178,8 @@ def demo():
     import re
     from nltk.corpus import udhr
 
-    udhr_corpus = {"danish":     "Danish_Dansk-Latin1",
+    udhr_corpus = {"arabic":     "Arabic_Alarabia-Arabic",
+                   "danish":     "Danish_Dansk-Latin1",
                    "dutch":      "Dutch_Nederlands-Latin1",
                    "english":    "English-Latin1",
                    "finnish":    "Finnish_Suomi-Latin1",
diff --git a/nltk/stem/util.py b/nltk/stem/util.py
index c3d9b90..2ba8547 100644
--- a/nltk/stem/util.py
+++ b/nltk/stem/util.py
@@ -10,3 +10,13 @@ def suffix_replace(original, old, new):
     Replaces the old suffix of the original string by a new suffix
     """
     return original[:-len(old)] + new
+
+def prefix_replace(original, old, new):
+    """
+     Replaces the old prefix of the original string by a new suffix
+    :param original: string
+    :param old: string
+    :param new: string
+    :return: string
+    """
+    return new + original[len(old):]
\ No newline at end of file
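The new `prefix_replace` mirrors the existing `suffix_replace`; a short sketch on illustrative strings (the Arabic examples are assumptions, not taken from the test suite):

    from nltk.stem.util import prefix_replace, suffix_replace

    print(suffix_replace(u'\u0643\u0627\u062a\u0628\u0629', u'\u0629', u''))              # كاتبة -> كاتب
    print(prefix_replace(u'\u0627\u0644\u0643\u062a\u0627\u0628', u'\u0627\u0644', u''))  # الكتاب -> كتاب
    print(prefix_replace(u'\u0623\u0623\u0643\u062f', u'\u0623\u0623', u'\u0623'))        # أأكد -> أكد, as used by the Arabic stemmer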
diff --git a/nltk/tag/__init__.py b/nltk/tag/__init__.py
index 0de452a..34c8798 100644
--- a/nltk/tag/__init__.py
+++ b/nltk/tag/__init__.py
@@ -19,13 +19,20 @@ the word ``'fly'`` with a noun part of speech tag (``'NN'``):
 
     >>> tagged_tok = ('fly', 'NN')
 
-An off-the-shelf tagger is available.  It uses the Penn Treebank tagset:
+An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset:
 
     >>> from nltk import pos_tag, word_tokenize
     >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
     [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
     ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
 
+A Russian tagger is also available if you specify lang="rus". It uses 
+the Russian National Corpus tagset:
+
+    >>> pos_tag(word_tokenize("Илья оторопел и дважды перечитал бумажку."), lang='rus')    # doctest: +SKIP
+    [('Илья', 'S'), ('оторопел', 'V'), ('и', 'CONJ'), ('дважды', 'ADV'), ('перечитал', 'V'),
+    ('бумажку', 'S'), ('.', 'NONLEX')]
+
 This package defines several taggers, which take a list of tokens,
 assign a tag to each one, and return the resulting list of tagged tokens.
 Most of the taggers are built automatically based on a training corpus.
diff --git a/nltk/tag/perceptron.py b/nltk/tag/perceptron.py
index b194ad0..4cedd8d 100644
--- a/nltk/tag/perceptron.py
+++ b/nltk/tag/perceptron.py
@@ -28,7 +28,7 @@ class AveragedPerceptron(object):
     '''An averaged perceptron, as implemented by Matthew Honnibal.
 
     See more implementation details here:
-        http://spacy.io/blog/part-of-speech-POS-tagger-in-python/
+        https://explosion.ai/blog/part-of-speech-pos-tagger-in-python
     '''
 
     def __init__(self):
@@ -101,7 +101,7 @@ class PerceptronTagger(TaggerI):
     '''
     Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
     See more implementation details here:
-        http://spacy.io/blog/part-of-speech-POS-tagger-in-python/
+        https://explosion.ai/blog/part-of-speech-pos-tagger-in-python
     
     >>> from nltk.tag.perceptron import PerceptronTagger
 
diff --git a/nltk/tag/stanford.py b/nltk/tag/stanford.py
index d055e5d..26ac640 100644
--- a/nltk/tag/stanford.py
+++ b/nltk/tag/stanford.py
@@ -27,6 +27,7 @@ from six import text_type
 
 from nltk.internals import find_file, find_jar, config_java, java, _java_options
 from nltk.tag.api import TaggerI
+from nltk.parse.corenlp import CoreNLPParser
 
 _stanford_url = 'https://nlp.stanford.edu/software'
 
@@ -47,7 +48,14 @@ class StanfordTagger(TaggerI):
 
     def __init__(self, model_filename, path_to_jar=None, encoding='utf8',
                  verbose=False, java_options='-mx1000m'):
-
+        # Raise deprecation warning.
+        warnings.simplefilter('always', DeprecationWarning)
+        warnings.warn(str("\nThe StanfordTagger will "
+                          "be deprecated in version 3.2.5.\n"
+                          "Please use \033[91mnltk.tag.corenlp.CoreNLPPOSTagger\033[0m "
+                          "or \033[91mnltk.tag.corenlp.CoreNLPNERTagger\033[0m instead."),
+                      DeprecationWarning, stacklevel=2)
+        warnings.simplefilter('ignore', DeprecationWarning)
         if not self._JAR:
             warnings.warn('The StanfordTagger class is not meant to be '
                           'instantiated directly. Did you mean '
@@ -204,6 +212,67 @@ class StanfordNERTagger(StanfordTagger):
 
         raise NotImplementedError
 
+class CoreNLPTagger(CoreNLPParser, TaggerI):
+    def __init__(self, tagtype, url='http://localhost:9000', encoding='utf8'):
+        """
+        An abstract interface to POS/NER taggers of CoreNLP that returns the
+        POS/NER tags from the Stanford CoreNLP API at nltk.parse.corenlp.
+        """
+        self.tagtype = tagtype
+        super(CoreNLPTagger, self).__init__(url, encoding)
+
+    def tag_sents(self, sentences):
+        # Converting list(list(str)) -> list(str)
+        sentences = (' '.join(words) for words in sentences)
+        return list(self.raw_tag_sents(sentences))
+
+
+    def tag(self, sentence):
+        return self.tag_sents([sentence])[0]
+
+    def raw_tag_sents(self, sentences):
+        """
+        This function interfaces with `GenericCoreNLPParser.api_call` to
+        retrieve the JSON output and return the required annotations.
+        """
+        default_properties = {'ssplit.isOneSentence': 'true',
+                              'annotators': 'tokenize,ssplit,' }
+        # Supports only 'pos' or 'ner' tags.
+        assert self.tagtype in ['pos', 'ner']
+        default_properties['annotators'] += self.tagtype
+        for sentence in sentences:
+            tagged_data = self.api_call(sentence, properties=default_properties)
+            assert len(tagged_data['sentences']) == 1
+            # Taggers only need to return 1-best sentence.
+            yield [(token['word'], token[self.tagtype]) for token in tagged_data['sentences'][0]['tokens']]
+
+
+class CoreNLPPOSTagger(CoreNLPTagger):
+    """
+    This is a subclass of the CoreNLPTagger that wraps around the
+    nltk.parse.CoreNLPParser for Part-of-Speech tagging.
+
+        >>> from nltk.tag.stanford import CoreNLPPOSTagger
+        >>> CoreNLPPOSTagger(url='http://localhost:9000').tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
+        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
+    """
+    def __init__(self, url='http://localhost:9000', encoding='utf8'):
+        super(CoreNLPPOSTagger, self).__init__('pos', url, encoding)
+
+
+class CoreNLPNERTagger(CoreNLPTagger):
+    """
+    This is a subclass of the CoreNLPTagger that wraps around the
+    nltk.parse.CoreNLPParser for Named-Entity tagging.
+
+        >>> from nltk.tag.stanford import CoreNLPNERTagger
+        >>> CoreNLPNERTagger(url='http://localhost:9000').tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
+        [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'O')]
+    """
+    def __init__(self, url='http://localhost:9000', encoding='utf8'):
+        super(CoreNLPNERTagger, self).__init__('ner', url, encoding)
+
+
 def setup_module(module):
     from nose import SkipTest
 
@@ -212,3 +281,10 @@ def setup_module(module):
     except LookupError:
         raise SkipTest('Doctests from nltk.tag.stanford are skipped because one \
                        of the stanford jars cannot be found.')
+
+    try:
+        CoreNLPPOSTagger()
+        CoreNLPNERTagger()
+    except LookupError:
+        raise SkipTest('Doctests from nltk.tag.stanford.CoreNLPTagger '
+                       'are skipped because the CoreNLP server is not started.')
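Beyond the single-sentence doctests above, `tag_sents` joins each token list back into a raw sentence and streams results from `raw_tag_sents`. A minimal sketch, assuming a CoreNLP server is already running at http://localhost:9000:

    from nltk.tag.stanford import CoreNLPPOSTagger, CoreNLPNERTagger

    pos_tagger = CoreNLPPOSTagger(url='http://localhost:9000')
    ner_tagger = CoreNLPNERTagger(url='http://localhost:9000')

    sentences = [['The', 'quick', 'brown', 'fox', 'jumps', '.'],
                 ['Rami', 'Eid', 'studies', 'at', 'Stony', 'Brook', 'University', '.']]

    # one list of (token, tag) pairs per input sentence, in order
    for tagged in pos_tagger.tag_sents(sentences):
        print(tagged)
    for tagged in ner_tagger.tag_sents(sentences):
        print(tagged)
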
diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest
index b658e53..c21fedd 100644
--- a/nltk/test/corpus.doctest
+++ b/nltk/test/corpus.doctest
@@ -977,6 +977,12 @@ a given lemma belongs to:
     >>> verbnet.classids('accept')
     ['approve-77', 'characterize-29.2-1-1', 'obtain-13.5.2']
 
+The `classids()` method may also be used to retrieve all class identifiers
+in verbnet when no argument is passed:
+
+    >>> verbnet.classids()
+    ['accompany-51.7', 'admire-31.2', 'admire-31.2-1', 'admit-65', 'adopt-93', 'advise-37.9', 'advise-37.9-1', 'allow-64', 'amalgamate-22.2', 'amalgamate-22.2-1', 'amalgamate-22.2-1-1', 'amalgamate-22.2-2', 'amalgamate-22.2-2-1', 'amalgamate-22.2-3', 'amalgamate-22.2-3-1', 'amalgamate-22.2-3-1-1', 'amalgamate-22.2-3-2', 'amuse-31.1', 'animal_sounds-38', 'appeal-31.4', 'appeal-31.4-1', 'appeal-31.4-2', 'appeal-31.4-3', 'appear-48.1.1', 'appoint-29.1', 'approve-77', 'assessment-34', 'assum [...]
+
 The primary object in the lexicon is a class record, which is stored
 as an ElementTree xml object.  The class record for a given class
 identifier is returned by the `vnclass()` method:
@@ -1017,18 +1023,42 @@ concise form.  The simplest such method is `pprint()`:
         * Theme[+concrete +force]
       Frames:
         Intransitive (Expletive Subject)
+          Example: It's raining.
           Syntax: LEX[it] LEX[[+be]] VERB
           Semantics:
             * weather(during(E), Weather_type, ?Theme)
         NP (Expletive Subject, Theme Object)
+          Example: It's raining cats and dogs.
           Syntax: LEX[it] LEX[[+be]] VERB NP[Theme]
           Semantics:
             * weather(during(E), Weather_type, Theme)
         PP (Expletive Subject, Theme-PP)
+          Example: It was pelting with rain.
           Syntax: LEX[it[+be]] VERB PREP[with] NP[Theme]
           Semantics:
             * weather(during(E), Weather_type, Theme)
 
+Verbnet gives us frames that link the syntax and semantics with an example
+sentence. These frames are part of the corpus, and we can use `frames()` to
+get the frames for a given verbnet class.
+
+    >>> frame = verbnet.frames('57')
+    >>> frame == [{'semantics': [{'arguments': [{'value': 'during(E)', 'type': 'Event'}, {'value': 'Weather_type', 'type': 'VerbSpecific'}, {'value': '?Theme', 'type': 'ThemRole'}], 'predicate_value': 'weather'}], 'example': "It's raining.", 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'LEX', 'modifiers': {'value': '[+be]', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'synrestrs': [], 'sel [...]
+    True
+
+The Verbnet corpus lets us access thematic roles individually using `themroles()`.
+
+    >>> themroles = verbnet.themroles('57')
+    >>> themroles == [{'modifiers': [{'type': 'concrete', 'value': '+'}, {'type': 'force', 'value': '+'}], 'type': 'Theme'}]
+    True
+
+Verbnet classes may also have subclasses that share syntactic and semantic properties
+with the superclass while differing from it in other respects. The Verbnet corpus allows
+us to access these subclasses using `subclasses()`.
+
+    >>> print(verbnet.subclasses('9.1')) #Testing for 9.1 since '57' does not have subclasses
+    ['put-9.1-1', 'put-9.1-2']
+
 
 nps_chat
 --------
diff --git a/nltk/test/stem.doctest b/nltk/test/stem.doctest
index d8427af..eff4d2c 100644
--- a/nltk/test/stem.doctest
+++ b/nltk/test/stem.doctest
@@ -46,7 +46,7 @@ Unit tests for Snowball stemmer
 See which languages are supported.
 
     >>> print(" ".join(SnowballStemmer.languages))
-    danish dutch english finnish french german hungarian italian
+    arabic danish dutch english finnish french german hungarian italian
     norwegian porter portuguese romanian russian spanish swedish
 
 Create a new instance of a language specific subclass.
diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest
index 9d0d668..07ab178 100644
--- a/nltk/test/tokenize.doctest
+++ b/nltk/test/tokenize.doctest
@@ -193,7 +193,14 @@ The sentence splitter should remove whitespace following the sentence boundary.
     ['See Section 3.', ')  Or Section 2.', ')']
 
 
-Regression Tests: aling_tokens
+Two instances of PunktSentenceTokenizer should not share PunktParameters.
+
+    >>> pst = PunktSentenceTokenizer()
+    >>> pst2 = PunktSentenceTokenizer()
+    >>> pst._params is pst2._params
+    False
+
+Regression Tests: align_tokens
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Post-hoc alignment of tokens with a source string
 
diff --git a/nltk/test/unit/test_corenlp.py b/nltk/test/unit/test_corenlp.py
new file mode 100644
index 0000000..feb84cd
--- /dev/null
+++ b/nltk/test/unit/test_corenlp.py
@@ -0,0 +1,412 @@
+# -*- coding: utf-8 -*-
+
+"""
+Mock test for Stanford CoreNLP wrappers.
+"""
+
+import sys
+from itertools import chain
+from unittest import TestCase, SkipTest
+
+try:
+    from unittest.mock import patch # Tries to import mock in Python3.
+except ImportError:
+    raise SkipTest('unittest.mock is not supported in Python 2')
+
+from nltk.tag.stanford import CoreNLPPOSTagger, CoreNLPNERTagger
+from nltk.tokenize.stanford import CoreNLPTokenizer
+
+
+class TestTokenizerAPI(TestCase):
+    @patch('nltk.tokenize.stanford.CoreNLPTokenizer')
+    def test_tokenize(self, MockTokenizer):
+        corenlp_tokenizer = MockTokenizer()
+        input_string = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
+        corenlp_tokenizer.api_call.return_value = {
+        u'sentences': [   {   u'index': 0,
+                          u'tokens': [   {   u'after': u' ',
+                                             u'before': u'',
+                                             u'characterOffsetBegin': 0,
+                                             u'characterOffsetEnd': 4,
+                                             u'index': 1,
+                                             u'originalText': u'Good',
+                                             u'word': u'Good'},
+                                         {   u'after': u' ',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 5,
+                                             u'characterOffsetEnd': 12,
+                                             u'index': 2,
+                                             u'originalText': u'muffins',
+                                             u'word': u'muffins'},
+                                         {   u'after': u' ',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 13,
+                                             u'characterOffsetEnd': 17,
+                                             u'index': 3,
+                                             u'originalText': u'cost',
+                                             u'word': u'cost'},
+                                         {   u'after': u'',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 18,
+                                             u'characterOffsetEnd': 19,
+                                             u'index': 4,
+                                             u'originalText': u'$',
+                                             u'word': u'$'},
+                                         {   u'after': u'\n',
+                                             u'before': u'',
+                                             u'characterOffsetBegin': 19,
+                                             u'characterOffsetEnd': 23,
+                                             u'index': 5,
+                                             u'originalText': u'3.88',
+                                             u'word': u'3.88'},
+                                         {   u'after': u' ',
+                                             u'before': u'\n',
+                                             u'characterOffsetBegin': 24,
+                                             u'characterOffsetEnd': 26,
+                                             u'index': 6,
+                                             u'originalText': u'in',
+                                             u'word': u'in'},
+                                         {   u'after': u' ',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 27,
+                                             u'characterOffsetEnd': 30,
+                                             u'index': 7,
+                                             u'originalText': u'New',
+                                             u'word': u'New'},
+                                         {   u'after': u'',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 31,
+                                             u'characterOffsetEnd': 35,
+                                             u'index': 8,
+                                             u'originalText': u'York',
+                                             u'word': u'York'},
+                                         {   u'after': u'  ',
+                                             u'before': u'',
+                                             u'characterOffsetBegin': 35,
+                                             u'characterOffsetEnd': 36,
+                                             u'index': 9,
+                                             u'originalText': u'.',
+                                             u'word': u'.'}]},
+                      {   u'index': 1,
+                          u'tokens': [   {   u'after': u' ',
+                                             u'before': u'  ',
+                                             u'characterOffsetBegin': 38,
+                                             u'characterOffsetEnd': 44,
+                                             u'index': 1,
+                                             u'originalText': u'Please',
+                                             u'word': u'Please'},
+                                         {   u'after': u' ',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 45,
+                                             u'characterOffsetEnd': 48,
+                                             u'index': 2,
+                                             u'originalText': u'buy',
+                                             u'word': u'buy'},
+                                         {   u'after': u'\n',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 49,
+                                             u'characterOffsetEnd': 51,
+                                             u'index': 3,
+                                             u'originalText': u'me',
+                                             u'word': u'me'},
+                                         {   u'after': u' ',
+                                             u'before': u'\n',
+                                             u'characterOffsetBegin': 52,
+                                             u'characterOffsetEnd': 55,
+                                             u'index': 4,
+                                             u'originalText': u'two',
+                                             u'word': u'two'},
+                                         {   u'after': u' ',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 56,
+                                             u'characterOffsetEnd': 58,
+                                             u'index': 5,
+                                             u'originalText': u'of',
+                                             u'word': u'of'},
+                                         {   u'after': u'',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 59,
+                                             u'characterOffsetEnd': 63,
+                                             u'index': 6,
+                                             u'originalText': u'them',
+                                             u'word': u'them'},
+                                         {   u'after': u'\n',
+                                             u'before': u'',
+                                             u'characterOffsetBegin': 63,
+                                             u'characterOffsetEnd': 64,
+                                             u'index': 7,
+                                             u'originalText': u'.',
+                                             u'word': u'.'}]},
+                      {   u'index': 2,
+                          u'tokens': [   {   u'after': u'',
+                                             u'before': u'\n',
+                                             u'characterOffsetBegin': 65,
+                                             u'characterOffsetEnd': 71,
+                                             u'index': 1,
+                                             u'originalText': u'Thanks',
+                                             u'word': u'Thanks'},
+                                         {   u'after': u'',
+                                             u'before': u'',
+                                             u'characterOffsetBegin': 71,
+                                             u'characterOffsetEnd': 72,
+                                             u'index': 2,
+                                             u'originalText': u'.',
+                                             u'word': u'.'}]}]
+                                             }
+        # Should return the mocked json.
+        api_call_output = corenlp_tokenizer.api_call(input_string)
+        self.assertIsInstance(api_call_output, dict)
+        # Emulates the tokenization process.
+        # Note: We're not calling the corenlp_tokenizer.tokenize() directly because
+        #       it will not return the desired value but a MagicMock object.
+        # >>> corenlp_tokenizer.tokenize(input_string)
+        # >>> <MagicMock name='CoreNLPTokenizer().tokenize()' id='140308440963224'>
+        print (corenlp_tokenizer.tokenize(input_string))
+        tokenized_output = [token['originalText'] or token['word']
+                            for sentence in api_call_output['sentences']
+                            for token in sentence['tokens']]
+        expected_output = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in',
+                           u'New', u'York', u'.', u'Please', u'buy', u'me',
+                           u'two', u'of', u'them', u'.', u'Thanks', u'.']
+        self.assertEqual(expected_output, tokenized_output)
+
+
+class TestTaggerAPI(TestCase):
+    @patch('nltk.tag.stanford.CoreNLPTagger')
+    def test_pos_tagging(self, MockTagger):
+        corenlp_tagger = MockTagger()
+        input_tokens = 'What is the airspeed of an unladen swallow ?'.split()
+        corenlp_tagger.api_call.return_value = {
+        u'sentences': [   {   u'basicDependencies': [   {   u'dep': u'ROOT',
+                                                        u'dependent': 1,
+                                                        u'dependentGloss': u'What',
+                                                        u'governor': 0,
+                                                        u'governorGloss': u'ROOT'},
+                                                    {   u'dep': u'cop',
+                                                        u'dependent': 2,
+                                                        u'dependentGloss': u'is',
+                                                        u'governor': 1,
+                                                        u'governorGloss': u'What'},
+                                                    {   u'dep': u'det',
+                                                        u'dependent': 3,
+                                                        u'dependentGloss': u'the',
+                                                        u'governor': 4,
+                                                        u'governorGloss': u'airspeed'},
+                                                    {   u'dep': u'nsubj',
+                                                        u'dependent': 4,
+                                                        u'dependentGloss': u'airspeed',
+                                                        u'governor': 1,
+                                                        u'governorGloss': u'What'},
+                                                    {   u'dep': u'case',
+                                                        u'dependent': 5,
+                                                        u'dependentGloss': u'of',
+                                                        u'governor': 8,
+                                                        u'governorGloss': u'swallow'},
+                                                    {   u'dep': u'det',
+                                                        u'dependent': 6,
+                                                        u'dependentGloss': u'an',
+                                                        u'governor': 8,
+                                                        u'governorGloss': u'swallow'},
+                                                    {   u'dep': u'compound',
+                                                        u'dependent': 7,
+                                                        u'dependentGloss': u'unladen',
+                                                        u'governor': 8,
+                                                        u'governorGloss': u'swallow'},
+                                                    {   u'dep': u'nmod',
+                                                        u'dependent': 8,
+                                                        u'dependentGloss': u'swallow',
+                                                        u'governor': 4,
+                                                        u'governorGloss': u'airspeed'},
+                                                    {   u'dep': u'punct',
+                                                        u'dependent': 9,
+                                                        u'dependentGloss': u'?',
+                                                        u'governor': 1,
+                                                        u'governorGloss': u'What'}],
+                          u'enhancedDependencies': [   {   u'dep': u'ROOT',
+                                                           u'dependent': 1,
+                                                           u'dependentGloss': u'What',
+                                                           u'governor': 0,
+                                                           u'governorGloss': u'ROOT'},
+                                                       {   u'dep': u'cop',
+                                                           u'dependent': 2,
+                                                           u'dependentGloss': u'is',
+                                                           u'governor': 1,
+                                                           u'governorGloss': u'What'},
+                                                       {   u'dep': u'det',
+                                                           u'dependent': 3,
+                                                           u'dependentGloss': u'the',
+                                                           u'governor': 4,
+                                                           u'governorGloss': u'airspeed'},
+                                                       {   u'dep': u'nsubj',
+                                                           u'dependent': 4,
+                                                           u'dependentGloss': u'airspeed',
+                                                           u'governor': 1,
+                                                           u'governorGloss': u'What'},
+                                                       {   u'dep': u'case',
+                                                           u'dependent': 5,
+                                                           u'dependentGloss': u'of',
+                                                           u'governor': 8,
+                                                           u'governorGloss': u'swallow'},
+                                                       {   u'dep': u'det',
+                                                           u'dependent': 6,
+                                                           u'dependentGloss': u'an',
+                                                           u'governor': 8,
+                                                           u'governorGloss': u'swallow'},
+                                                       {   u'dep': u'compound',
+                                                           u'dependent': 7,
+                                                           u'dependentGloss': u'unladen',
+                                                           u'governor': 8,
+                                                           u'governorGloss': u'swallow'},
+                                                       {   u'dep': u'nmod:of',
+                                                           u'dependent': 8,
+                                                           u'dependentGloss': u'swallow',
+                                                           u'governor': 4,
+                                                           u'governorGloss': u'airspeed'},
+                                                       {   u'dep': u'punct',
+                                                           u'dependent': 9,
+                                                           u'dependentGloss': u'?',
+                                                           u'governor': 1,
+                                                           u'governorGloss': u'What'}],
+                          u'enhancedPlusPlusDependencies': [   {   u'dep': u'ROOT',
+                                                                   u'dependent': 1,
+                                                                   u'dependentGloss': u'What',
+                                                                   u'governor': 0,
+                                                                   u'governorGloss': u'ROOT'},
+                                                               {   u'dep': u'cop',
+                                                                   u'dependent': 2,
+                                                                   u'dependentGloss': u'is',
+                                                                   u'governor': 1,
+                                                                   u'governorGloss': u'What'},
+                                                               {   u'dep': u'det',
+                                                                   u'dependent': 3,
+                                                                   u'dependentGloss': u'the',
+                                                                   u'governor': 4,
+                                                                   u'governorGloss': u'airspeed'},
+                                                               {   u'dep': u'nsubj',
+                                                                   u'dependent': 4,
+                                                                   u'dependentGloss': u'airspeed',
+                                                                   u'governor': 1,
+                                                                   u'governorGloss': u'What'},
+                                                               {   u'dep': u'case',
+                                                                   u'dependent': 5,
+                                                                   u'dependentGloss': u'of',
+                                                                   u'governor': 8,
+                                                                   u'governorGloss': u'swallow'},
+                                                               {   u'dep': u'det',
+                                                                   u'dependent': 6,
+                                                                   u'dependentGloss': u'an',
+                                                                   u'governor': 8,
+                                                                   u'governorGloss': u'swallow'},
+                                                               {   u'dep': u'compound',
+                                                                   u'dependent': 7,
+                                                                   u'dependentGloss': u'unladen',
+                                                                   u'governor': 8,
+                                                                   u'governorGloss': u'swallow'},
+                                                               {   u'dep': u'nmod:of',
+                                                                   u'dependent': 8,
+                                                                   u'dependentGloss': u'swallow',
+                                                                   u'governor': 4,
+                                                                   u'governorGloss': u'airspeed'},
+                                                               {   u'dep': u'punct',
+                                                                   u'dependent': 9,
+                                                                   u'dependentGloss': u'?',
+                                                                   u'governor': 1,
+                                                                   u'governorGloss': u'What'}],
+                          u'index': 0,
+                          u'parse': u'(ROOT\n  (SBARQ\n    (WHNP (WP What))\n    (SQ (VBZ is)\n      (NP\n        (NP (DT the) (NN airspeed))\n        (PP (IN of)\n          (NP (DT an) (NN unladen) (NN swallow)))))\n    (. ?)))',
+                          u'tokens': [   {   u'after': u' ',
+                                             u'before': u'',
+                                             u'characterOffsetBegin': 0,
+                                             u'characterOffsetEnd': 4,
+                                             u'index': 1,
+                                             u'lemma': u'what',
+                                             u'originalText': u'What',
+                                             u'pos': u'WP',
+                                             u'word': u'What'},
+                                         {   u'after': u' ',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 5,
+                                             u'characterOffsetEnd': 7,
+                                             u'index': 2,
+                                             u'lemma': u'be',
+                                             u'originalText': u'is',
+                                             u'pos': u'VBZ',
+                                             u'word': u'is'},
+                                         {   u'after': u' ',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 8,
+                                             u'characterOffsetEnd': 11,
+                                             u'index': 3,
+                                             u'lemma': u'the',
+                                             u'originalText': u'the',
+                                             u'pos': u'DT',
+                                             u'word': u'the'},
+                                         {   u'after': u' ',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 12,
+                                             u'characterOffsetEnd': 20,
+                                             u'index': 4,
+                                             u'lemma': u'airspeed',
+                                             u'originalText': u'airspeed',
+                                             u'pos': u'NN',
+                                             u'word': u'airspeed'},
+                                         {   u'after': u' ',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 21,
+                                             u'characterOffsetEnd': 23,
+                                             u'index': 5,
+                                             u'lemma': u'of',
+                                             u'originalText': u'of',
+                                             u'pos': u'IN',
+                                             u'word': u'of'},
+                                         {   u'after': u' ',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 24,
+                                             u'characterOffsetEnd': 26,
+                                             u'index': 6,
+                                             u'lemma': u'a',
+                                             u'originalText': u'an',
+                                             u'pos': u'DT',
+                                             u'word': u'an'},
+                                         {   u'after': u' ',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 27,
+                                             u'characterOffsetEnd': 34,
+                                             u'index': 7,
+                                             u'lemma': u'unladen',
+                                             u'originalText': u'unladen',
+                                             u'pos': u'JJ',
+                                             u'word': u'unladen'},
+                                         {   u'after': u' ',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 35,
+                                             u'characterOffsetEnd': 42,
+                                             u'index': 8,
+                                             u'lemma': u'swallow',
+                                             u'originalText': u'swallow',
+                                             u'pos': u'VB',
+                                             u'word': u'swallow'},
+                                         {   u'after': u'',
+                                             u'before': u' ',
+                                             u'characterOffsetBegin': 43,
+                                             u'characterOffsetEnd': 44,
+                                             u'index': 9,
+                                             u'lemma': u'?',
+                                             u'originalText': u'?',
+                                             u'pos': u'.',
+                                             u'word': u'?'}]}]
+                                        }
+        expected_output = [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
+                           ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
+                           ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
+        tagged_data = corenlp_tagger.api_call(input_tokens,
+                                              properties={'ssplit.isOneSentence': 'true',
+                                                          'annotators': 'tokenize,ssplit,pos' })
+        # Emulates the tagging function.
+        # Note: We're not calling the corenlp_tagger.tag() directly because
+        #       it will not return the desired value but a MagicMock object.
+        # >>> corenlp_tagger.tag(input_tokens)
+        # >>> <MagicMock name='CoreNLPTagger().tag()' id='140395802719848'>
+        tagged_output = [(token['word'], token['pos'])
+                         for token in tagged_data['sentences'][0]['tokens']]
+        self.assertEqual(expected_output, tagged_output)
diff --git a/nltk/test/unit/test_stem.py b/nltk/test/unit/test_stem.py
index 6287f42..5f359d4 100644
--- a/nltk/test/unit/test_stem.py
+++ b/nltk/test/unit/test_stem.py
@@ -10,6 +10,20 @@ import os
 
 class SnowballTest(unittest.TestCase):
 
+    def test_arabic(self):
+        """
+        Test the Snowball Arabic light stemmer,
+        which handles both prefixes and suffixes.
+        """
+        ar_stemmer = SnowballStemmer("arabic")
+        assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
+        assert ar_stemmer.stem("العربية") == "عرب"
+        assert ar_stemmer.stem("فقالوا") == "قال"
+        assert ar_stemmer.stem("الطالبات") == "طالب"
+        assert ar_stemmer.stem("فالطالبات") == "طالب"
+        assert ar_stemmer.stem("والطالبات") == "طالب"
+        assert ar_stemmer.stem("الطالبون") == "طالب"
+
     def test_russian(self):
         # Russian words both consisting of Cyrillic
         # and Roman letters can be stemmed.
diff --git a/nltk/test/unit/test_tokenize.py b/nltk/test/unit/test_tokenize.py
index 45fba66..a46ec82 100644
--- a/nltk/test/unit/test_tokenize.py
+++ b/nltk/test/unit/test_tokenize.py
@@ -5,7 +5,7 @@ See also nltk/test/tokenize.doctest
 """
 
 from __future__ import unicode_literals
-from nltk.tokenize import TweetTokenizer, StanfordSegmenter
+from nltk.tokenize import TweetTokenizer, StanfordSegmenter, TreebankWordTokenizer
 from nose import SkipTest
 import unittest
 import os
@@ -107,3 +107,43 @@ class TestTokenize(unittest.TestCase):
         expected = ['u', '@abcde', '@abcdefghijklmnopqrst', '@abcde', '_', '@abcde', '5', '@abcde']
         result = tokenizer.tokenize(test7)
         self.assertEqual(result, expected)
+
+    def test_treebank_span_tokenizer(self):
+        """
+        Test TreebankWordTokenizer.span_tokenize function
+        """
+
+        tokenizer = TreebankWordTokenizer()
+
+        # Test case in the docstring
+        test1 = "Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks)."
+        expected = [
+            (0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
+            (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
+            (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
+            (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)
+        ]
+        result = tokenizer.span_tokenize(test1)
+        self.assertEqual(result, expected)
+
+        # Test case with double quotation
+        test2 = "The DUP is similar to the \"religious right\" in the United States and takes a hardline stance on social issues"
+        expected = [
+            (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
+            (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64),
+            (65, 68), (69, 74), (75, 76), (77, 85), (86, 92), (93, 95), (96, 102),
+            (103, 109)
+        ]
+        result = tokenizer.span_tokenize(test2)
+        self.assertEqual(result, expected)
+
+    # Test case with double quotation marks as well as converted quotations
+        test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
+        expected = [
+            (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
+            (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64),
+            (65, 68), (69, 74), (75, 76), (77, 79), (79, 87), (87, 89), (90, 96),
+            (97, 99), (100, 106), (107, 113)
+        ]
+        result = tokenizer.span_tokenize(test3)
+        self.assertEqual(result, expected)
diff --git a/nltk/test/unit/test_wordnet.py b/nltk/test/unit/test_wordnet.py
new file mode 100644
index 0000000..c45ee3f
--- /dev/null
+++ b/nltk/test/unit/test_wordnet.py
@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+"""
+Unit tests for nltk.corpus.wordnet
+See also nltk/test/wordnet.doctest
+"""
+
+from __future__ import unicode_literals
+from nose import SkipTest
+import unittest
+import os
+
+from nltk.corpus.reader.wordnet import WordNetCorpusReader
+from nltk.corpus import wordnet as wn
+from nltk.corpus import wordnet_ic as wnic
+from nltk.data import find as find_data
+
+
+wn.ensure_loaded()
+S = wn.synset
+L = wn.lemma
+
+class WordNetDemo(unittest.TestCase):
+
+    def test_retrieve_synset(self):
+        move_synset = S('go.v.21')
+        self.assertEqual(move_synset.name(), "move.v.15")
+        self.assertEqual(move_synset.lemma_names(), ['move', 'go'])
+        self.assertEqual(move_synset.definition(), "have a turn; make one's move in a game")
+        self.assertEqual(move_synset.examples(), ['Can I go now?'])
+
+
+    def test_retrieve_synsets(self):
+        self.assertEqual(sorted(wn.synsets('zap', pos='n')),
+                        [S('zap.n.01')])
+        self.assertEqual(sorted(wn.synsets('zap', pos='v')),
+                        [S('microwave.v.01'), S('nuke.v.01'), S('zap.v.01'), S('zap.v.02')])
+
+    def test_hyperhyponyms(self):
+        # Not every synset has hypernyms().
+        self.assertEqual(S('travel.v.01').hypernyms(), [])
+        self.assertEqual(S('travel.v.02').hypernyms(),
+                        [S('travel.v.03')])
+        self.assertEqual(S('travel.v.03').hypernyms(), [])
+
+        # Test hyper-/hyponyms.
+        self.assertEqual(S('breakfast.n.1').hypernyms(), [S('meal.n.01')])
+        first_five_meal_hypo = [S('banquet.n.02'), S('bite.n.04'), S('breakfast.n.01'), S('brunch.n.01'), S('buffet.n.02')]
+        self.assertEqual(sorted(S('meal.n.1').hyponyms()[:5]), first_five_meal_hypo)
+        self.assertEqual(S('Austen.n.1').instance_hypernyms(), [S('writer.n.01')])
+        first_five_composer_hypo = [S('ambrose.n.01'), S('bach.n.01'), S('barber.n.01'), S('bartok.n.01'), S('beethoven.n.01')]
+        self.assertEqual(S('composer.n.1').instance_hyponyms()[:5], first_five_composer_hypo)
+
+        # Test root hyper-/hyponyms
+        self.assertEqual(S('person.n.01').root_hypernyms(), [S('entity.n.01')])
+        self.assertEqual(S('sail.v.01').root_hypernyms(), [S('travel.v.01')])
+        self.assertEqual(S('fall.v.12').root_hypernyms(), [S('act.v.01'), S('fall.v.17')])
+
+    def test_derivationally_related_forms(self):
+        # Test `derivationally_related_forms()`
+        self.assertEqual(L('zap.v.03.nuke').derivationally_related_forms(),
+                        [L('atomic_warhead.n.01.nuke')])
+        self.assertEqual(L('zap.v.03.atomize').derivationally_related_forms(),
+                        [L('atomization.n.02.atomization')])
+        self.assertEqual(L('zap.v.03.atomise').derivationally_related_forms(),
+                        [L('atomization.n.02.atomisation')])
+        self.assertEqual(L('zap.v.03.zap').derivationally_related_forms(),
+                        [])
+
+    def test_meronyms_holonyms(self):
+        # Test meronyms, holonyms.
+        self.assertEqual(S('dog.n.01').member_holonyms(), [S('canis.n.01'), S('pack.n.06')])
+        self.assertEqual(S('dog.n.01').part_meronyms(), [S('flag.n.07')])
+
+        self.assertEqual(S('faculty.n.2').member_meronyms(),
+                        [S('professor.n.01')])
+        self.assertEqual(S('copilot.n.1').member_holonyms(),
+                        [S('crew.n.01')])
+
+        self.assertEqual(S('table.n.2').part_meronyms(),
+                        [S('leg.n.03'), S('tabletop.n.01'), S('tableware.n.01')])
+        self.assertEqual(S('course.n.7').part_holonyms(),
+                        [S('meal.n.01')])
+
+        self.assertEqual(S('water.n.1').substance_meronyms(),
+                        [S('hydrogen.n.01'), S('oxygen.n.01')])
+        self.assertEqual(S('gin.n.1').substance_holonyms(),
+                        [S('gin_and_it.n.01'), S('gin_and_tonic.n.01'),
+                         S('martini.n.01'), S('pink_lady.n.01')])
+
+    def test_antonyms(self):
+        # Test antonyms.
+        self.assertEqual(L('leader.n.1.leader').antonyms(), [L('follower.n.01.follower')])
+        self.assertEqual(L('increase.v.1.increase').antonyms(), [L('decrease.v.01.decrease')])
+
+
+    def test_misc_relations(self):
+        # Test misc relations.
+        self.assertEqual(S('snore.v.1').entailments(), [S('sleep.v.01')])
+        self.assertEqual(S('heavy.a.1').similar_tos(),
+                        [S('dense.s.03'), S('doughy.s.01'),
+                         S('heavier-than-air.s.01'), S('hefty.s.02'),
+                         S('massive.s.04'), S('non-buoyant.s.01'),
+                         S('ponderous.s.02')])
+        self.assertEqual(S('light.a.1').attributes(), [S('weight.n.01')])
+        self.assertEqual(S('heavy.a.1').attributes(), [S('weight.n.01')])
+
+        # Test pertainyms.
+        self.assertEqual(L('English.a.1.English').pertainyms(),
+                         [L('england.n.01.England')])
+
+    def test_lch(self):
+        # Test LCH.
+        self.assertEqual(S('person.n.01').lowest_common_hypernyms(S('dog.n.01')),
+                         [S('organism.n.01')])
+        self.assertEqual(S('woman.n.01').lowest_common_hypernyms(S('girlfriend.n.02')),
+                         [S('woman.n.01')])
+
+    def test_domains(self):
+        # Test domains.
+        self.assertEqual(S('code.n.03').topic_domains(), [S('computer_science.n.01')])
+        self.assertEqual(S('pukka.a.01').region_domains(), [S('india.n.01')])
+        self.assertEqual(S('freaky.a.01').usage_domains(), [S('slang.n.02')])
+
+    def test_wordnet_similarities(self):
+        # Path based similarities.
+        self.assertAlmostEqual(S('cat.n.01').path_similarity(S('cat.n.01')), 1.0)
+        self.assertAlmostEqual(S('dog.n.01').path_similarity(S('cat.n.01')), 0.2)
+        self.assertAlmostEqual(S('dog.n.01').lch_similarity(S('cat.n.01')), 2.028, places=3)
+        self.assertAlmostEqual(S('dog.n.01').wup_similarity(S('cat.n.01')), 0.8571, places=3)
+        # Information Content similarities.
+        brown_ic = wnic.ic('ic-brown.dat')
+        self.assertAlmostEqual(S('dog.n.01').jcn_similarity(S('cat.n.01'), brown_ic), 0.4497, places=3)
+        semcor_ic = wnic.ic('ic-semcor.dat')
+        self.assertAlmostEqual(S('dog.n.01').lin_similarity(S('cat.n.01'), semcor_ic), 0.8863, places=3)
diff --git a/nltk/tokenize/__init__.py b/nltk/tokenize/__init__.py
index b4b6dd7..6d03924 100644
--- a/nltk/tokenize/__init__.py
+++ b/nltk/tokenize/__init__.py
@@ -74,14 +74,12 @@ from nltk.tokenize.repp     import ReppTokenizer
 from nltk.tokenize.sexpr    import SExprTokenizer, sexpr_tokenize
 from nltk.tokenize.simple   import (SpaceTokenizer, TabTokenizer, LineTokenizer,
                                     line_tokenize)
-from nltk.tokenize.stanford import StanfordTokenizer
 from nltk.tokenize.texttiling import TextTilingTokenizer
 from nltk.tokenize.toktok   import ToktokTokenizer
 from nltk.tokenize.treebank import TreebankWordTokenizer
 from nltk.tokenize.util     import string_span_tokenize, regexp_span_tokenize
 from nltk.tokenize.stanford_segmenter import StanfordSegmenter
 
-
 # Standard sentence tokenizer.
 def sent_tokenize(text, language='english'):
     """
diff --git a/nltk/tokenize/moses.py b/nltk/tokenize/moses.py
index 44fcace..0f7d31d 100644
--- a/nltk/tokenize/moses.py
+++ b/nltk/tokenize/moses.py
@@ -36,6 +36,19 @@ class MosesTokenizer(TokenizerI):
     >>> m = MosesTokenizer()
     >>> m.tokenize('abc def.')
     [u'abc', u'def', u'.']
+
+    The nonbreaking prefixes should handle the case where a numeric-only prefix is the last token.
+    In the example below, "pp" is the last element and there is no digit after it.
+
+    >>> m = MosesTokenizer()
+    >>> m.tokenize('2016, pp.')
+    [u'2016', u',', u'pp', u'.']
+    
+    >>> sent = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
+    >>> m.tokenize(sent, escape=True)
+    ['This', 'ain', '&apos;t', 'funny', '.', 'It', '&apos;s', 'actually', 'hillarious', ',', 'yet', 'double', 'Ls', '.', '&#124;', '&#91;', '&#93;', '&lt;', '&gt;', '&#91;', '&#93;', '&amp;', 'You', '&apos;re', 'gonna', 'shake', 'it', 'off', '?', 'Don', '&apos;t', '?']
+    >>> m.tokenize(sent, escape=False)
+    ['This', 'ain', "'t", 'funny', '.', 'It', "'s", 'actually', 'hillarious', ',', 'yet', 'double', 'Ls', '.', '|', '[', ']', '<', '>', '[', ']', '&', 'You', "'re", 'gonna', 'shake', 'it', 'off', '?', 'Don', "'t", '?']
     """
 
     # Perl Unicode Properties character sets.
@@ -234,7 +247,7 @@ class MosesTokenizer(TokenizerI):
         super(MosesTokenizer, self).__init__()
         self.lang = lang
         # Initialize the language specific nonbreaking prefixes.
-        self.NONBREAKING_PREFIXES = nonbreaking_prefixes.words(lang)
+        self.NONBREAKING_PREFIXES = [_nbp.strip() for _nbp in nonbreaking_prefixes.words(lang)]
         self.NUMERIC_ONLY_PREFIXES = [w.rpartition(' ')[0] for w in
                                       self.NONBREAKING_PREFIXES if
                                       self.has_numeric_only(w)]
@@ -286,6 +299,7 @@ class MosesTokenizer(TokenizerI):
                 # Checks if the prefix is in NUMERIC_ONLY_PREFIXES
                 # and ensures that the next word is a digit.
                 elif (prefix in self.NUMERIC_ONLY_PREFIXES and
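+                      # Guard: the numeric-only prefix may be the last token, so ensure a following token exists.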
+                      (i + 1) < num_tokens and
                       re.search(r'^[0-9]+', tokens[i+1])):
                     pass # No change to the token.
                 else: # Otherwise, adds a space after the tokens before a dot.
@@ -315,7 +329,7 @@ class MosesTokenizer(TokenizerI):
             text = re.sub(regexp, substitution, text)
         return text if return_str else text.split()
 
-    def tokenize(self, text, agressive_dash_splits=False, return_str=False):
+    def tokenize(self, text, agressive_dash_splits=False, return_str=False, escape=True):
         """
         Python port of the Moses tokenizer.
 
@@ -374,8 +388,9 @@ class MosesTokenizer(TokenizerI):
         text = re.sub(regexp,substitution, text).strip()
         # Restore multidots.
         text = self.restore_multidots(text)
-        # Escape XML symbols.
-        text = self.escape_xml(text)
+        if escape:
+            # Escape XML symbols.
+            text = self.escape_xml(text)
 
         return text if return_str else text.split()
 
@@ -408,6 +423,11 @@ class MosesDetokenizer(TokenizerI):
     >>> detokens = d.detokenize(tokens)
     >>> " ".join(detokens) == expected_detokens
     True
+    
+    >>> d.detokenize(expected_tokens, unescape=True)
+    ['This', "ain't", 'funny.', "It's", 'actually', 'hillarious,', 'yet', 'double', 'Ls.', '|', '[]', '<', '>', '[]', '&', "You're", 'gonna', 'shake', 'it', 'off?', "Don't?"]
+    >>> d.detokenize(expected_tokens, unescape=False)
+    ['This', 'ain', '&apos;t', 'funny.', 'It', '&apos;s', 'actually', 'hillarious,', 'yet', 'double', 'Ls.', '&#124;', '&#91;', '&#93;', '&lt;', '&gt;', '&#91;', '&#93;', '&amp;', 'You', '&apos;re', 'gonna', 'shake', 'it', 'off?', 'Don', '&apos;t?']
     """
     # Currency Symbols.
     IsAlnum = text_type(''.join(perluniprops.chars('IsAlnum')))
@@ -474,7 +494,7 @@ class MosesDetokenizer(TokenizerI):
         return text
 
 
-    def tokenize(self, tokens, return_str=False):
+    def tokenize(self, tokens, return_str=False, unescape=True):
         """
         Python port of the Moses detokenizer.
 
@@ -489,8 +509,9 @@ class MosesDetokenizer(TokenizerI):
         # Detokenize the agressive hyphen split.
         regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
         text = re.sub(regexp, substitution, text)
-        # Unescape the XML symbols.
-        text = self.unescape_xml(text)
+        if unescape:
+            # Unescape the XML symbols.
+            text = self.unescape_xml(text)
         # Keep track of no. of quotation marks.
         quote_counts = {u"'":0 , u'"':0, u"``":0, u"`":0, u"''":0}
 
@@ -608,6 +629,6 @@ class MosesDetokenizer(TokenizerI):
 
         return detokenized_text if return_str else detokenized_text.split()
 
-    def detokenize(self, tokens, return_str=False):
+    def detokenize(self, tokens, return_str=False, unescape=True):
         """ Duck-typing the abstract *tokenize()*."""
-        return self.tokenize(tokens, return_str)
+        return self.tokenize(tokens, return_str, unescape)
diff --git a/nltk/tokenize/nist.py b/nltk/tokenize/nist.py
new file mode 100644
index 0000000..0b2e5fb
--- /dev/null
+++ b/nltk/tokenize/nist.py
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Python port of the mteval-v14.pl tokenizer.
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Liling Tan (ported from ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-v14.pl)
+# Contributors: Ozan Caglayan, Wiktor Stribizew
+#
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+"""
+This is an NLTK port of the tokenizer used in the NIST BLEU evaluation script,
+https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
+which was also ported into Python in
+https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
+"""
+
+from __future__ import unicode_literals
+
+import io
+import re
+from six import text_type
+
+from nltk.corpus import perluniprops
+from nltk.tokenize.api import TokenizerI
+from nltk.tokenize.util import xml_unescape
+
+
+class NISTTokenizer(TokenizerI):
+    """
+    This NIST tokenizer is sentence-based instead of the original
+    paragraph-based tokenization from mteval-v14.pl; the sentence-based
+    tokenization is consistent with the other tokenizers available in NLTK.
+
+    >>> from six import text_type
+    >>> from nltk.tokenize.nist import NISTTokenizer
+    >>> nist = NISTTokenizer()
+    >>> s = "Good muffins cost $3.88 in New York."
+    >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
+    >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
+    >>> nist.tokenize(s, lowercase=False) == expected_cased
+    True
+    >>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.
+    True
+
+    international_tokenize() is the preferred function when tokenizing
+    non-European text, e.g.
+
+    >>> from nltk.tokenize.nist import NISTTokenizer
+    >>> nist = NISTTokenizer()
+
+    # Input strings.
+    >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
+    >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
+    >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'
+
+    # Expected tokens.
+    >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'\u963f\u91cc\u5df4\u5df4\u96c6\u56e2\u63a7\u80a1', u'\u6709\u9650\u516c\u53f8', u')']
+    >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'\u02c8\xe6', u'm']
+    >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'\u697d\u5929\u682a\u5f0f\u4f1a\u793e', u'Rakuten', u'Kabushiki', u'-', u'gaisha']
+
+    >>> nist.international_tokenize(albb)[:10] == expected_albb
+    True
+    >>> nist.international_tokenize(amz)[:10] == expected_amz
+    True
+    >>> nist.international_tokenize(rkt)[:10] == expected_rkt
+    True
+    """
+    # Strip "skipped" tags
+    STRIP_SKIP = re.compile('<skipped>'), ''
+    #  Strip end-of-line hyphenation and join lines
+    STRIP_EOL_HYPHEN = re.compile(u'\u2028'), ' '
+    # Tokenize punctuation.
+    PUNCT = re.compile('([\{-\~\[-\` -\&\(-\+\:-\@\/])'), ' \\1 '
+    # Tokenize period and comma unless preceded by a digit.
+    PERIOD_COMMA_PRECEED = re.compile('([^0-9])([\.,])'), '\\1 \\2 '
+    # Tokenize period and comma unless followed by a digit.
+    PERIOD_COMMA_FOLLOW = re.compile('([\.,])([^0-9])'), ' \\1 \\2'
+    # Tokenize dash when preceded by a digit
+    DASH_PRECEED_DIGIT = re.compile('([0-9])(-)'), '\\1 \\2 '
+
+    LANG_DEPENDENT_REGEXES = [PUNCT, PERIOD_COMMA_PRECEED,
+                              PERIOD_COMMA_FOLLOW, DASH_PRECEED_DIGIT]
+
+    # Perluniprops characters used in NIST tokenizer.
+    pup_number = text_type(''.join(set(perluniprops.chars('Number')))) # i.e. \p{N}
+    pup_punct = text_type(''.join(set(perluniprops.chars('Punctuation')))) # i.e. \p{P}
+    pup_symbol = text_type(''.join(set(perluniprops.chars('Symbol')))) # i.e. \p{S}
+
+    # Python regexes need to escape some special symbols;
+    # see https://stackoverflow.com/q/45670950/610569
+    number_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_number)
+    punct_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_punct)
+    symbol_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_symbol)
+
+    # Note: In the original perl implementation, \p{Z} and \p{Zl} were used to
+    #       (i) strip leading and trailing spaces, and
+    #       (ii) de-duplicate spaces.
+    #       In Python, this would do: ' '.join(str.strip().split())
+    # Thus, the next two lines were commented out.
+    #Line_Separator = text_type(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl}
+    #Separator = text_type(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z}
+
+    # Pads runs of ASCII characters with spaces, separating them from adjacent non-ASCII text.
+    NONASCII = re.compile('([\x00-\x7f]+)'), r' \1 '
+    #  Tokenize any punctuation unless followed AND preceded by a digit.
+    PUNCT_1 = re.compile(u"([{n}])([{p}])".format(n=number_regex, p=punct_regex)), '\\1 \\2 '
+    PUNCT_2 = re.compile(u"([{p}])([{n}])".format(n=number_regex, p=punct_regex)), ' \\1 \\2'
+    # Tokenize symbols
+    SYMBOLS = re.compile(u"({s})".format(s=symbol_regex)), ' \\1 '
+
+    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]
+
+    def lang_independent_sub(self, text):
+        """Performs the language independent string substituitions. """
+        # The order of these regexes is strange:
+        # it would be better to unescape after STRIP_EOL_HYPHEN,
+        # but we keep it close to the original NIST implementation.
+        regexp, substitution = self.STRIP_SKIP
+        text = regexp.sub(substitution, text)
+        text = xml_unescape(text)
+        regexp, substitution = self.STRIP_EOL_HYPHEN
+        text = regexp.sub(substitution, text)
+        return text
+
+    def tokenize(self, text, lowercase=False,
+                 western_lang=True, return_str=False):
+        text = text_type(text)
+        # Language independent regex.
+        text = self.lang_independent_sub(text)
+        # Language dependent regex.
+        if western_lang:
+            # Pad string with whitespace.
+            text = ' ' + text + ' '
+            if lowercase:
+                text = text.lower()
+            for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
+                text = regexp.sub(substitution, text)
+        # Remove contiguous whitespaces.
+        text = ' '.join(text.split())
+        # Finally, strip leading and trailing spaces
+        # and convert the output string to unicode.
+        text = text_type(text.strip())
+        return text if return_str else text.split()
+
+    def international_tokenize(self, text, lowercase=False,
+                               split_non_ascii=True,
+                               return_str=False):
+        text = text_type(text)
+        # Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
+        # first before unescaping.
+        regexp, substitution = self.STRIP_SKIP
+        text = regexp.sub(substitution, text)
+        regexp, substitution = self.STRIP_EOL_HYPHEN
+        text = regexp.sub(substitution, text)
+        text = xml_unescape(text)
+
+        if lowercase:
+            text = text.lower()
+
+        for regexp, substitution in self.INTERNATIONAL_REGEXES:
+            text = regexp.sub(substitution, text)
+
+        # Make sure that there is only one space between words
+        # and strip leading and trailing spaces.
+        text = ' '.join(text.strip().split())
+        return text if return_str else text.split()
diff --git a/nltk/tokenize/punkt.py b/nltk/tokenize/punkt.py
index b5b724c..afd73a1 100644
--- a/nltk/tokenize/punkt.py
+++ b/nltk/tokenize/punkt.py
@@ -521,7 +521,9 @@ class PunktBaseClass(object):
     """
 
     def __init__(self, lang_vars=PunktLanguageVars(), token_cls=PunktToken,
-            params=PunktParameters()):
+            params=None):
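+        # Use None as the default and create a fresh PunktParameters per instance,
+        # avoiding a shared mutable default argument.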
+        if params is None:
+            params = PunktParameters() 
         self._params = params
         self._lang_vars = lang_vars
         self._Token = token_cls
diff --git a/nltk/tokenize/stanford.py b/nltk/tokenize/stanford.py
index ec6b312..9ac8352 100644
--- a/nltk/tokenize/stanford.py
+++ b/nltk/tokenize/stanford.py
@@ -13,12 +13,13 @@ import tempfile
 import os
 import json
 from subprocess import PIPE
+import warnings
 
 from six import text_type
 
 from nltk.internals import find_jar, config_java, java, _java_options
-
 from nltk.tokenize.api import TokenizerI
+from nltk.parse.corenlp import CoreNLPParser
 
 _stanford_url = 'https://nlp.stanford.edu/software/tokenizer.shtml'
 
@@ -26,7 +27,7 @@ class StanfordTokenizer(TokenizerI):
     r"""
     Interface to the Stanford Tokenizer
 
-    >>> from nltk.tokenize import StanfordTokenizer
+    >>> from nltk.tokenize.stanford import StanfordTokenizer
     >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
     >>> StanfordTokenizer().tokenize(s)
     ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
@@ -38,6 +39,13 @@ class StanfordTokenizer(TokenizerI):
     _JAR = 'stanford-postagger.jar'
 
     def __init__(self, path_to_jar=None, encoding='utf8', options=None, verbose=False, java_options='-mx1000m'):
+        # Raise deprecation warning.
+        warnings.simplefilter('always', DeprecationWarning)
+        warnings.warn(str("\nThe StanfordTokenizer will "
+                          "be deprecated in version 3.2.5.\n"
+                          "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.'"),
+                      DeprecationWarning, stacklevel=2)
+        warnings.simplefilter('ignore', DeprecationWarning)
         self._stanford_jar = find_jar(
             self._JAR, path_to_jar,
             env_vars=('STANFORD_POSTAGGER',),
@@ -99,6 +107,28 @@ class StanfordTokenizer(TokenizerI):
         return stdout
 
 
+class CoreNLPTokenizer(CoreNLPParser):
+    def __init__(self, url='http://localhost:9000', encoding='utf8'):
+        r"""
+        This is a duck-type of CoreNLPParser that provides tokenizing
+        functionality similar to the original StanfordTokenizer.
+
+            >>> from nltk.tokenize.stanford import CoreNLPTokenizer
+            >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
+            >>> CoreNLPTokenizer(url='http://localhost:9000').tokenize(s) # doctest: +SKIP
+            [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.', u'Please', u'buy', u'me', u'two', u'of', u'them', u'.', u'Thanks', u'.']
+        """
+        super(CoreNLPTokenizer, self).__init__(url, encoding)
+
+    def tokenize(self, text, properties=None):
+        """
+        Tokenize a string of text. Consistent with the StanfordTokenizer, this
+        function returns a list of strings. The original CoreNLPParser.tokenize()
+        returns a generator of strings.
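+
+        A minimal usage sketch (skipped here, since it assumes a CoreNLP server
+        running at the default URL):
+
+            >>> CoreNLPTokenizer().tokenize('Hello world.') # doctest: +SKIP
+            [u'Hello', u'world', u'.']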
+        """
+        return list(super(CoreNLPTokenizer, self).tokenize(text, properties))
+
+
 def setup_module(module):
     from nose import SkipTest
 
@@ -106,3 +136,9 @@ def setup_module(module):
         StanfordTokenizer()
     except LookupError:
         raise SkipTest('doctests from nltk.tokenize.stanford are skipped because the stanford postagger jar doesn\'t exist')
+
+    try:
+        CoreNLPTokenizer()
+    except LookupError:
+        raise SkipTest('doctests from nltk.tokenize.stanford.CoreNLPTokenizer are skipped because '
+                       'the Stanford CoreNLP server is not started')
diff --git a/nltk/tokenize/stanford_segmenter.py b/nltk/tokenize/stanford_segmenter.py
index 40613fc..077cbef 100644
--- a/nltk/tokenize/stanford_segmenter.py
+++ b/nltk/tokenize/stanford_segmenter.py
@@ -17,6 +17,7 @@ import tempfile
 import os
 import json
 from subprocess import PIPE
+import warnings
 
 from nltk import compat
 from nltk.internals import find_jar, find_file, find_dir, \
@@ -29,8 +30,13 @@ _stanford_url = 'https://nlp.stanford.edu/software'
 
 
 class StanfordSegmenter(TokenizerI):
-    """
-    Interface to the Stanford Segmenter
+    """Interface to the Stanford Segmenter
+
+    If the stanford-segmenter version is older than 2016-10-31, then path_to_slf4j
+    should be provided, for example::
+
+        seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')
+
     >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
     >>> seg = StanfordSegmenter()
     >>> seg.default_config('zh')
@@ -46,10 +52,10 @@ class StanfordSegmenter(TokenizerI):
     """
 
     _JAR = 'stanford-segmenter.jar'
-    _SLF4J = 'slf4j-api.jar'
 
     def __init__(self,
-                 path_to_jar=None, path_to_slf4j=None,
+                 path_to_jar=None,
+                 path_to_slf4j=None,
                  java_class=None,
                  path_to_model=None,
                  path_to_dict=None,
@@ -58,21 +64,33 @@ class StanfordSegmenter(TokenizerI):
                  keep_whitespaces='false',
                  encoding='UTF-8', options=None,
                  verbose=False, java_options='-mx2g'):
+        # Raise deprecation warning.
+        warnings.simplefilter('always', DeprecationWarning)
+        warnings.warn(str("\nThe StanfordSegmenter will "
+                          "be deprecated in version 3.2.5.\n"
+                          "Please use \033[91mnltk.tokenize.stanford.CoreNLPTokenizer\033[0m instead."),
+                      DeprecationWarning, stacklevel=2)
+        warnings.simplefilter('ignore', DeprecationWarning)
 
         stanford_segmenter = find_jar(
                 self._JAR, path_to_jar,
                 env_vars=('STANFORD_SEGMENTER',),
                 searchpath=(), url=_stanford_url,
                 verbose=verbose)
-        slf4j = find_jar(
-                self._SLF4J, path_to_slf4j,
+        if path_to_slf4j is not None:
+            slf4j = find_jar(
+                'slf4j-api.jar', path_to_slf4j,
                 env_vars=('SLF4J', 'STANFORD_SEGMENTER',),
                 searchpath=(), url=_stanford_url,
                 verbose=verbose)
+        else:
+            slf4j = None
 
-        # This is passed to java as the -cp option, the segmenter needs slf4j.
+        # This is passed to java as the -cp option; the old version of the segmenter needs slf4j,
+        # while stanford-segmenter-2016-10-31 and newer do not.
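+        # e.g. "/path/to/stanford-segmenter.jar:/path/to/slf4j-api.jar" on POSIX,
+        # where os.pathsep is ':' (paths here are illustrative).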
         self._stanford_jar = os.pathsep.join(
-            [_ for _ in [stanford_segmenter, slf4j] if not _ is None])
+            _ for _ in [stanford_segmenter, slf4j] if _ is not None
+        )
 
         self._java_class = java_class
         self._model = path_to_model
diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py
index 2d7b162..f3ae637 100644
--- a/nltk/tokenize/treebank.py
+++ b/nltk/tokenize/treebank.py
@@ -163,7 +163,19 @@ class TreebankWordTokenizer(TokenizerI):
             True
 
         """
-        tokens = self.tokenize(text)
+        raw_tokens = self.tokenize(text)
+
+        # Convert converted quotes back to original double quotes
+        # Do this only if original text contains double quote(s)
+        if '"' in text:
+            # Find double quotes and converted quotes
+            matched = [m.group() for m in re.finditer(r'[(``)(\'\')(")]+', text)]
+            
+            # Replace converted quotes back to double quotes
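+            # Each quote-like token takes the next matched original spelling in order,
+            # so that align_tokens() can recover the correct character offsets.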
+            tokens = [matched.pop(0) if tok in ['"', "``", "''"] else tok for tok in raw_tokens]
+        else:
+            tokens = raw_tokens
+
         return align_tokens(tokens, text)
 
 
@@ -202,6 +214,29 @@ class TreebankWordDetokenizer(TokenizerI):
     >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
     >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
     True
+
+    During tokenization it's safe to add extra spaces, but during detokenization
+    simply undoing the padding isn't enough:
+
+    - During tokenization, [!?] is padded on both sides; when detokenizing,
+      only the left pad of [!?] needs to be removed.
+      Thus (re.compile(r'\s([?!])'), r'\g<1>')
+
+    - During tokenization, [:,] is padded on both sides; when detokenizing,
+      only the left pad is removed, and the right pad after a comma/colon is
+      kept if the following string starts with a non-digit.
+      Thus (re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')
+
+    >>> from nltk.tokenize.treebank import TreebankWordDetokenizer
+    >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
+    >>> twd = TreebankWordDetokenizer()
+    >>> twd.detokenize(toks)
+    "hello, i can't feel my feet! Help!!"
+
+    >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
+    ... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
+    >>> twd.detokenize(toks)
+    "hello, i can't feel; my feet! Help!! He said: Help, help?!"
     """
     _contractions = MacIntyreContractions()
     CONTRACTIONS2 = [re.compile(pattern.replace('(?#X)', '\s'))
@@ -235,7 +270,8 @@ class TreebankWordDetokenizer(TokenizerI):
     #punctuation
     PUNCTUATION = [
         (re.compile(r"([^'])\s'\s"), r"\1' "),
-        (re.compile(r'\s([?!])\s'), r'\g<1>'),
+        (re.compile(r'\s([?!])'), r'\g<1>'), # Strip left pad for [?!]
+        #(re.compile(r'\s([?!])\s'), r'\g<1>'),
         (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r'\1\2\3'),
         # When tokenizing, [;@#$%&] are padded with whitespace regardless of
         # whether there are spaces before or after them.
@@ -246,7 +282,8 @@ class TreebankWordDetokenizer(TokenizerI):
         (re.compile(r'\s([&])\s'), r' \g<1> '), # Unknown pad.
         (re.compile(r'\s\.\.\.\s'), r'...'),
         (re.compile(r'\s([:,])\s$'), r'\1'),
-        (re.compile(r'\s([:,])\s([^\d])'), r'\1\2')
+        (re.compile(r'\s([:,])\s([^\d])'), r'\1 \2') # Keep right pad after comma/colon before non-digits.
+        #(re.compile(r'\s([:,])\s([^\d])'), r'\1\2')
         ]
 
     #starting quotes
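
As a standalone illustration of the two adjusted PUNCTUATION rules (the padded string below is made up for the example):

    import re

    padded = 'my feet ! Help , help'
    # [?!]: strip only the left pad.
    step1 = re.sub(r'\s([?!])', r'\g<1>', padded)            # 'my feet! Help , help'
    # [:,]: strip the left pad, keep one space before a following non-digit.
    step2 = re.sub(r'\s([:,])\s([^\d])', r'\1 \2', step1)    # 'my feet! Help, help'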
diff --git a/nltk/tokenize/util.py b/nltk/tokenize/util.py
index 7229e21..f19894b 100644
--- a/nltk/tokenize/util.py
+++ b/nltk/tokenize/util.py
@@ -7,7 +7,7 @@
 # For license information, see LICENSE.TXT
 
 from re import finditer
-from xml.sax.saxutils import escape
+from xml.sax.saxutils import escape, unescape
 
 def string_span_tokenize(s, sep):
     r"""
@@ -193,6 +193,30 @@ def xml_escape(text):
                                    r"[": r"&#91;",  r"]": r"&#93;", })
 
 
+def xml_unescape(text):
+    """
+    This function transforms the "escaped" version suitable
+    for well-formed XML formatting back into a human-readable string.
+
+    Note that the default xml.sax.saxutils.unescape() function doesn't unescape
+    some characters that Moses escapes, so we have to add them to the
+    entities dictionary manually.
+
+        >>> from xml.sax.saxutils import unescape
+        >>> s = ')| &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
+        >>> expected = ''')| & < > \' " ] ['''
+        >>> xml_unescape(s) == expected
+        True
+
+    :param text: The text that needs to be unescaped.
+    :type text: str
+    :rtype: str
+    """
+    return unescape(text, entities={ r"&apos;": r"'", r"&quot;": r'"',
+                                     r"&#124;": r"|",
+                                     r"&#91;": r"[",  r"&#93;": r"]", })
+
+
 def align_tokens(tokens, sentence):
     """
     This function attempts to find the offsets of the tokens in *s*, as a sequence
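
A short round-trip sketch for the new helper, assuming the Moses-specific entity tables shown above:

    from nltk.tokenize.util import xml_escape, xml_unescape

    s = "a |b| [c] 'd'"
    escaped = xml_escape(s)   # e.g. "a &#124;b&#124; &#91;c&#93; &apos;d&apos;"
    assert xml_unescape(escaped) == s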
diff --git a/nltk/translate/ibm1.py b/nltk/translate/ibm1.py
index c516cf1..35e0420 100644
--- a/nltk/translate/ibm1.py
+++ b/nltk/translate/ibm1.py
@@ -16,7 +16,9 @@
 Lexical translation model that ignores word order.
 
 In IBM Model 1, word order is ignored for simplicity. Thus, the
-following two alignments are equally likely.
+following three alignments are equally likely. As long as the word
+alignments are equivalent, it doesn't matter where the word
+occurs in the source or target sentence.
 
 Source: je mange du jambon
 Target: i eat some ham
@@ -24,7 +26,11 @@ Alignment: (1,1) (2,2) (3,3) (4,4)
 
 Source: je mange du jambon
 Target: some ham eat i
-Alignment: (1,4) (2,3) (3,2) (4,1)
+Alignment: (1,4) (2,3) (3,1) (4,2)
+
+Source: du jambon je mange
+Target: eat i some ham
+Alignment: (1,3) (2,4) (3,2) (4,1)
 
 The EM algorithm used in Model 1 is:
 E step - In the training data, count how many times a source language
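
A toy sketch of the point above using the existing AlignedSent/IBMModel1 API; the corpus is far too small for meaningful probabilities, but it shows that reordering the target side changes nothing Model 1 cares about:

    from nltk.translate import AlignedSent, IBMModel1

    bitext = [
        AlignedSent(['i', 'eat', 'some', 'ham'], ['je', 'mange', 'du', 'jambon']),
        AlignedSent(['some', 'ham', 'eat', 'i'], ['je', 'mange', 'du', 'jambon']),
    ]
    ibm1 = IBMModel1(bitext, 5)
    # Only co-occurrence counts matter to Model 1, not word positions.
    print(ibm1.translation_table['ham']['jambon'])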
diff --git a/nltk/translate/ibm_model.py b/nltk/translate/ibm_model.py
index fa5312f..4dfe4e6 100644
--- a/nltk/translate/ibm_model.py
+++ b/nltk/translate/ibm_model.py
@@ -496,6 +496,9 @@ class AlignmentInfo(object):
     def __eq__(self, other):
         return self.alignment == other.alignment
 
+    def __ne__(self, other):
+        return not self == other
+
     def __hash__(self):
         return hash(self.alignment)
 
diff --git a/nltk/translate/nist_score.py b/nltk/translate/nist_score.py
new file mode 100644
index 0000000..1bedf65
--- /dev/null
+++ b/nltk/translate/nist_score.py
@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: NIST Score
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors:
+# Contributors:
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""NIST score implementation."""
+from __future__ import division
+
+import math
+import fractions
+from collections import Counter
+
+from nltk.util import ngrams
+from nltk.translate.bleu_score import modified_precision, closest_ref_length
+
+try:
+    fractions.Fraction(0, 1000, _normalize=False)
+    from fractions import Fraction
+except TypeError:
+    from nltk.compat import Fraction
+
+
+def sentence_nist(references, hypothesis, n=5):
+    """
+    Calculate NIST score from
+    George Doddington. 2002. "Automatic evaluation of machine translation quality
+    using n-gram co-occurrence statistics." Proceedings of HLT.
+    Morgan Kaufmann Publishers Inc. http://dl.acm.org/citation.cfm?id=1289189.1289273
+
+    DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU
+    score. The official script used by NIST to compute BLEU and NIST score is
+    mteval-14.pl. The main differences are:
+
+     - BLEU uses geometric mean of the ngram overlaps, NIST uses arithmetic mean.
+     - NIST has a different brevity penalty
+     - NIST score from mteval-14.pl has a self-contained tokenizer
+
+    Note: The mteval-14.pl includes a smoothing function for BLEU score that is NOT
+          used in the NIST score computation.
+
+    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+    ...               'ensures', 'that', 'the', 'military', 'always',
+    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']
+
+    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
+    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
+    ...               'that', 'party', 'direct']
+
+    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
+    ...               'heed', 'Party', 'commands']
+
+    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+    ...               'guarantees', 'the', 'military', 'forces', 'always',
+    ...               'being', 'under', 'the', 'command', 'of', 'the',
+    ...               'Party']
+
+    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
+    ...               'of', 'the', 'party']
+
+    >>> sentence_nist([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
+    0.0854...
+
+    >>> sentence_nist([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS
+    0.1485...
+
+    :param references: reference sentences
+    :type references: list(list(str))
+    :param hypothesis: a hypothesis sentence
+    :type hypothesis: list(str)
+    :param n: highest n-gram order
+    :type n: int
+    """
+    return corpus_nist([references], [hypothesis], n)
+
+def corpus_nist(list_of_references, hypotheses, n=5):
+    """
+    Calculate a single corpus-level NIST score (aka. system-level NIST) for all
+    the hypotheses and their respective references.
+
+    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
+    :type list_of_references: list(list(list(str)))
+    :param hypotheses: a list of hypothesis sentences
+    :type hypotheses: list(list(str))
+    :param n: highest n-gram order
+    :type n: int
+    """
+    # Before proceeding to compute NIST, perform sanity checks.
+    assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their reference(s) should be the same"
+
+    p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
+    p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
+    sysoutput_lengths = Counter() # Key = ngram order, and value = no. of ngram in hyp.
+    hyp_lengths, ref_lengths = 0, 0
+
+    # Iterate through each hypothesis and their corresponding references.
+    for references, hypothesis in zip(list_of_references, hypotheses):
+        # For each order of ngram, calculate the numerator and
+        # denominator for the corpus-level modified precision.
+        for i, _ in enumerate(range(1,n+1)):
+            p_i = modified_precision(references, hypothesis, i)
+            p_numerators[i] += p_i.numerator
+            p_denominators[i] += p_i.denominator
+            # Adds the no. of ngrams in the hypothesis.
+            sysoutput_lengths[i] += len(hypothesis) - (i - 1)
+
+        # Calculate the hypothesis length and the closest reference length.
+        # Adds them to the corpus-level hypothesis and reference counts.
+        hyp_len =  len(hypothesis)
+        hyp_lengths += hyp_len
+        ref_lengths += closest_ref_length(references, hyp_len)
+
+    # Calculate corpus-level brevity penalty.
+    bp = nist_length_penalty(ref_lengths, hyp_lengths)
+
+    # Collects the various precision values for the different ngram orders.
+    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
+           for i, _ in enumerate(range(1,n+1))]
+
+    # Eqn 2 in Doddington (2002):
+    # Info(w_1 ... w_n) = log_2 [ (# of occurrences of w_1 ... w_n-1) / (# of occurrences of w_1 ... w_n) ]
+    info = [0 if p_n[i].numerator == 0 or p_n[i+1].numerator == 0 # Handles math domain and zero division errors.
+            else math.log(p_n[i].numerator / p_n[i+1].numerator)
+            for i in range(len(p_n)-1)]
+    return sum(info_i/sysoutput_lengths[i] for i, info_i in enumerate(info)) * bp
+
+
+def nist_length_penalty(closest_ref_len, hyp_len):
+    """
+    Calculates the NIST length penalty, from Eq. 3 in Doddington (2002)
+
+        penalty = exp( beta * log( min( len(hyp)/len(ref), 1.0 ))**2 )
+
+    where,
+
+        `beta` is chosen to make the brevity penalty factor = 0.5 when the
+        no. of words in the system output (hyp) is 2/3 of the average
+        no. of words in the reference translation (ref)
+
+    The NIST penalty differs from BLEU's in that it minimizes the impact of
+    small variations in translation length on the score.
+    See Fig. 4 in Doddington (2002)
+    """
+    ratio = closest_ref_len / hyp_len
+    if 0 < ratio < 1:
+        ratio_x, score_x = 1.5, 0.5
+        beta = math.log(score_x) / math.log(ratio_x)**2  # so that penalty(ratio_x) == score_x
+        return math.exp(beta * math.log(ratio)**2)
+    else: # ratio <= 0 or ratio >= 1
+        return max(min(ratio, 1.0), 0.0)
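
A brief sanity sketch of the API above (the sentences are made up): sentence_nist() is the single-segment case of corpus_nist(), and beta is chosen so the length penalty equals score_x when the length ratio equals ratio_x:

    import math
    from nltk.translate.nist_score import sentence_nist, corpus_nist

    hyp = ['the', 'cat', 'sat', 'on', 'the', 'mat']
    ref = ['there', 'is', 'a', 'cat', 'on', 'the', 'mat']
    assert sentence_nist([ref], hyp) == corpus_nist([[ref]], [hyp])

    ratio_x, score_x = 1.5, 0.5
    beta = math.log(score_x) / math.log(ratio_x) ** 2
    assert abs(math.exp(beta * math.log(ratio_x) ** 2) - score_x) < 1e-12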

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/nltk.git



More information about the debian-science-commits mailing list