[nltk] 01/11: Imported Upstream version 3.1

Daniel Stender danstender-guest at moszumanska.debian.org
Sat Oct 24 17:44:57 UTC 2015


This is an automated email from the git hooks/post-receive script.

danstender-guest pushed a commit to branch master
in repository nltk.

commit 69e8e0a505fb65d6f5f90b679bbded2bc7085940
Author: Daniel Stender <debian at danielstender.com>
Date:   Sat Oct 24 17:17:21 2015 +0200

    Imported Upstream version 3.1
---
 PKG-INFO                                           |   5 +-
 nltk.egg-info/PKG-INFO                             |   5 +-
 nltk.egg-info/SOURCES.txt                          |  63 +-
 nltk.egg-info/requires.txt                         |   1 -
 nltk/VERSION                                       |   2 +-
 nltk/__init__.py                                   |  17 +-
 nltk/align/__init__.py                             |  24 -
 nltk/align/bleu_score.py                           | 263 -------
 nltk/align/ibm2.py                                 | 257 ------
 nltk/align/util.py                                 |   7 -
 nltk/book.py                                       |   6 +-
 nltk/ccg/chart.py                                  |   6 +-
 nltk/ccg/combinator.py                             |  80 +-
 nltk/ccg/lexicon.py                                | 151 ++--
 nltk/chunk/util.py                                 |   9 +-
 nltk/classify/rte_classify.py                      |   2 +-
 nltk/collocations.py                               |  43 +-
 nltk/compat.py                                     |  66 +-
 nltk/corpus/__init__.py                            |  29 +-
 nltk/corpus/reader/__init__.py                     |  11 +-
 nltk/corpus/reader/aligned.py                      |   4 +-
 nltk/corpus/reader/api.py                          |  16 +-
 nltk/corpus/reader/bracket_parse.py                |   6 +-
 nltk/corpus/reader/categorized_sents.py            | 178 +++++
 nltk/corpus/reader/chunked.py                      |  36 +-
 nltk/corpus/reader/comparative_sents.py            | 278 +++++++
 nltk/corpus/reader/mte.py                          | 300 +++++++
 nltk/corpus/reader/opinion_lexicon.py              | 115 +++
 nltk/corpus/reader/pros_cons.py                    | 126 +++
 nltk/corpus/reader/reviews.py                      | 324 ++++++++
 nltk/corpus/reader/wordnet.py                      |  94 ++-
 nltk/data.py                                       |  91 ++-
 nltk/internals.py                                  |  30 +-
 nltk/parse/dependencygraph.py                      |  68 +-
 nltk/parse/malt.py                                 | 116 ++-
 nltk/parse/stanford.py                             | 304 ++++++--
 nltk/parse/util.py                                 |  36 +-
 nltk/sem/glue.py                                   |   2 +-
 nltk/sentiment/__init__.py                         |  14 +
 nltk/sentiment/sentiment_analyzer.py               | 228 ++++++
 nltk/sentiment/util.py                             | 752 ++++++++++++++++++
 nltk/sentiment/vader.py                            | 445 +++++++++++
 nltk/six.py                                        | 868 +++++++++++++++++++++
 nltk/tag/__init__.py                               |  71 +-
 nltk/tag/perceptron.py                             | 318 ++++++++
 nltk/tag/senna.py                                  |   2 +-
 nltk/test/bleu.doctest                             |   4 +-
 nltk/test/corpus.doctest                           | 221 +++++-
 nltk/test/dependency.doctest                       |  58 ++
 nltk/test/gensim_fixt.py                           |   9 +
 nltk/test/sentiment.doctest                        | 233 ++++++
 nltk/test/tokenize.doctest                         |  36 +-
 nltk/test/{align.doctest => translate.doctest}     |  51 +-
 nltk/test/{align_fixt.py => translate_fixt.py}     |   0
 nltk/test/unit/align/__init__.py                   |   1 -
 nltk/test/unit/test_corpora.py                     |   2 +
 nltk/test/unit/test_json2csv_corpus.py             |   6 +-
 nltk/test/unit/translate/__init__.py               |   0
 nltk/test/unit/translate/test_bleu.py              |  81 ++
 nltk/test/unit/{align => translate}/test_ibm1.py   |  39 +-
 nltk/test/unit/{align => translate}/test_ibm2.py   |  39 +-
 nltk/test/unit/translate/test_ibm3.py              | 105 +++
 nltk/test/unit/{align => translate}/test_ibm4.py   |  12 +-
 nltk/test/unit/{align => translate}/test_ibm5.py   |  18 +-
 .../unit/{align => translate}/test_ibm_model.py    |   6 +-
 nltk/test/unit/translate/test_stack_decoder.py     | 300 +++++++
 nltk/test/wordnet.doctest                          |  41 +-
 nltk/text.py                                       |   6 +-
 nltk/tokenize/__init__.py                          |   3 +-
 nltk/tokenize/mwe.py                               | 112 +++
 nltk/tokenize/regexp.py                            |  10 +-
 nltk/translate/__init__.py                         |  23 +
 nltk/{align => translate}/api.py                   | 219 +++---
 nltk/translate/bleu_score.py                       | 244 ++++++
 nltk/{align => translate}/gale_church.py           |   0
 nltk/{align => translate}/gdfa.py                  |   0
 nltk/{align => translate}/ibm1.py                  | 136 ++--
 nltk/translate/ibm2.py                             | 308 ++++++++
 nltk/{align => translate}/ibm3.py                  | 308 ++++----
 nltk/{align => translate}/ibm4.py                  |  70 +-
 nltk/{align => translate}/ibm5.py                  |  55 +-
 nltk/{align => translate}/ibm_model.py             |  19 +-
 nltk/translate/metrics.py                          |  39 +
 nltk/{align => translate}/phrase_based.py          |  70 +-
 nltk/translate/stack_decoder.py                    | 495 ++++++++++++
 nltk/twitter/__init__.py                           |  13 +-
 nltk/twitter/api.py                                |  71 +-
 nltk/twitter/{util.py => common.py}                | 143 +---
 nltk/twitter/twitter_demo.py                       |  52 +-
 nltk/twitter/twitterclient.py                      | 126 +--
 nltk/twitter/util.py                               | 249 +-----
 nltk/util.py                                       |  54 +-
 setup.cfg                                          |   4 +-
 setup.py                                           |   5 +-
 94 files changed, 7835 insertions(+), 2130 deletions(-)
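
Note on the rename visible above: the nltk.align subpackage becomes
nltk.translate in 3.1 (the IBM models, gale_church, gdfa, phrase_based and
the BLEU implementation move there, and metrics.py plus stack_decoder.py are
new), so imports from nltk.align no longer work against this release. A
hedged migration sketch follows; the package-level re-exports are
assumptions modelled on the old nltk/align/__init__.py, since the new
nltk/translate/__init__.py is not reproduced in this diff:

    # NLTK <= 3.0.5 (removed in 3.1):
    #     from nltk.align import AlignedSent, IBMModel2
    #
    # NLTK 3.1: submodule paths as listed in the diffstat; importing from
    # the submodules is the safe fallback if the package-level names differ.
    from nltk.translate.api import AlignedSent
    from nltk.translate.ibm2 import IBMModel2
    from nltk.translate import bleu_score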

diff --git a/PKG-INFO b/PKG-INFO
index 91d3468..fd16391 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,13 +1,13 @@
 Metadata-Version: 1.1
 Name: nltk
-Version: 3.0.5
+Version: 3.1
 Summary: Natural Language Toolkit
 Home-page: http://nltk.org/
 Author: Steven Bird
 Author-email: stevenbird1 at gmail.com
 License: Apache License, Version 2.0
 Description: The Natural Language Toolkit (NLTK) is a Python package for
-        natural language processing.  NLTK requires Python 2.6, 2.7, or 3.2+.
+        natural language processing.  NLTK requires Python 2.7, or 3.2+.
 Keywords: NLP,CL,natural language processing,computational linguistics,parsing,tagging,tokenizing,syntax,linguistics,language,natural language,text analytics
 Platform: UNKNOWN
 Classifier: Development Status :: 5 - Production/Stable
@@ -17,7 +17,6 @@ Classifier: Intended Audience :: Information Technology
 Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 2.6
 Classifier: Programming Language :: Python :: 2.7
 Classifier: Programming Language :: Python :: 3.2
 Classifier: Programming Language :: Python :: 3.3
diff --git a/nltk.egg-info/PKG-INFO b/nltk.egg-info/PKG-INFO
index 91d3468..fd16391 100644
--- a/nltk.egg-info/PKG-INFO
+++ b/nltk.egg-info/PKG-INFO
@@ -1,13 +1,13 @@
 Metadata-Version: 1.1
 Name: nltk
-Version: 3.0.5
+Version: 3.1
 Summary: Natural Language Toolkit
 Home-page: http://nltk.org/
 Author: Steven Bird
 Author-email: stevenbird1 at gmail.com
 License: Apache License, Version 2.0
 Description: The Natural Language Toolkit (NLTK) is a Python package for
-        natural language processing.  NLTK requires Python 2.6, 2.7, or 3.2+.
+        natural language processing.  NLTK requires Python 2.7, or 3.2+.
 Keywords: NLP,CL,natural language processing,computational linguistics,parsing,tagging,tokenizing,syntax,linguistics,language,natural language,text analytics
 Platform: UNKNOWN
 Classifier: Development Status :: 5 - Production/Stable
@@ -17,7 +17,6 @@ Classifier: Intended Audience :: Information Technology
 Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 2.6
 Classifier: Programming Language :: Python :: 2.7
 Classifier: Programming Language :: Python :: 3.2
 Classifier: Programming Language :: Python :: 3.3
diff --git a/nltk.egg-info/SOURCES.txt b/nltk.egg-info/SOURCES.txt
index 9aa27ea..f905614 100644
--- a/nltk.egg-info/SOURCES.txt
+++ b/nltk.egg-info/SOURCES.txt
@@ -18,6 +18,7 @@ nltk/internals.py
 nltk/jsontags.py
 nltk/lazyimport.py
 nltk/probability.py
+nltk/six.py
 nltk/text.py
 nltk/tgrep.py
 nltk/toolbox.py
@@ -30,21 +31,7 @@ nltk.egg-info/PKG-INFO
 nltk.egg-info/SOURCES.txt
 nltk.egg-info/dependency_links.txt
 nltk.egg-info/not-zip-safe
-nltk.egg-info/requires.txt
 nltk.egg-info/top_level.txt
-nltk/align/__init__.py
-nltk/align/api.py
-nltk/align/bleu_score.py
-nltk/align/gale_church.py
-nltk/align/gdfa.py
-nltk/align/ibm1.py
-nltk/align/ibm2.py
-nltk/align/ibm3.py
-nltk/align/ibm4.py
-nltk/align/ibm5.py
-nltk/align/ibm_model.py
-nltk/align/phrase_based.py
-nltk/align/util.py
 nltk/app/__init__.py
 nltk/app/chartparser_app.py
 nltk/app/chunkparser_app.py
@@ -101,10 +88,12 @@ nltk/corpus/reader/aligned.py
 nltk/corpus/reader/api.py
 nltk/corpus/reader/bnc.py
 nltk/corpus/reader/bracket_parse.py
+nltk/corpus/reader/categorized_sents.py
 nltk/corpus/reader/chasen.py
 nltk/corpus/reader/childes.py
 nltk/corpus/reader/chunked.py
 nltk/corpus/reader/cmudict.py
+nltk/corpus/reader/comparative_sents.py
 nltk/corpus/reader/conll.py
 nltk/corpus/reader/crubadan.py
 nltk/corpus/reader/dependency.py
@@ -114,13 +103,17 @@ nltk/corpus/reader/indian.py
 nltk/corpus/reader/ipipan.py
 nltk/corpus/reader/knbc.py
 nltk/corpus/reader/lin.py
+nltk/corpus/reader/mte.py
 nltk/corpus/reader/nkjp.py
 nltk/corpus/reader/nombank.py
 nltk/corpus/reader/nps_chat.py
+nltk/corpus/reader/opinion_lexicon.py
 nltk/corpus/reader/pl196x.py
 nltk/corpus/reader/plaintext.py
 nltk/corpus/reader/ppattach.py
 nltk/corpus/reader/propbank.py
+nltk/corpus/reader/pros_cons.py
+nltk/corpus/reader/reviews.py
 nltk/corpus/reader/rte.py
 nltk/corpus/reader/semcor.py
 nltk/corpus/reader/senseval.py
@@ -202,6 +195,10 @@ nltk/sem/logic.py
 nltk/sem/relextract.py
 nltk/sem/skolemize.py
 nltk/sem/util.py
+nltk/sentiment/__init__.py
+nltk/sentiment/sentiment_analyzer.py
+nltk/sentiment/util.py
+nltk/sentiment/vader.py
 nltk/stem/__init__.py
 nltk/stem/api.py
 nltk/stem/isri.py
@@ -220,6 +217,7 @@ nltk/tag/crf.py
 nltk/tag/hmm.py
 nltk/tag/hunpos.py
 nltk/tag/mapping.py
+nltk/tag/perceptron.py
 nltk/tag/senna.py
 nltk/tag/sequential.py
 nltk/tag/stanford.py
@@ -233,8 +231,6 @@ nltk/tbl/feature.py
 nltk/tbl/rule.py
 nltk/tbl/template.py
 nltk/test/__init__.py
-nltk/test/align.doctest
-nltk/test/align_fixt.py
 nltk/test/all.py
 nltk/test/bleu.doctest
 nltk/test/bnc.doctest
@@ -262,6 +258,7 @@ nltk/test/featstruct.doctest
 nltk/test/framenet.doctest
 nltk/test/generate.doctest
 nltk/test/gensim.doctest
+nltk/test/gensim_fixt.py
 nltk/test/gluesemantics.doctest
 nltk/test/gluesemantics_malt.doctest
 nltk/test/gluesemantics_malt_fixt.py
@@ -290,12 +287,15 @@ nltk/test/runtests.py
 nltk/test/segmentation_fixt.py
 nltk/test/semantics.doctest
 nltk/test/semantics_fixt.py
+nltk/test/sentiment.doctest
 nltk/test/sentiwordnet.doctest
 nltk/test/simple.doctest
 nltk/test/stem.doctest
 nltk/test/tag.doctest
 nltk/test/tokenize.doctest
 nltk/test/toolbox.doctest
+nltk/test/translate.doctest
+nltk/test/translate_fixt.py
 nltk/test/tree.doctest
 nltk/test/treeprettyprinter.doctest
 nltk/test/treetransforms.doctest
@@ -319,15 +319,19 @@ nltk/test/unit/test_tag.py
 nltk/test/unit/test_tgrep.py
 nltk/test/unit/test_twitter_auth.py
 nltk/test/unit/utils.py
-nltk/test/unit/align/__init__.py
-nltk/test/unit/align/test_ibm1.py
-nltk/test/unit/align/test_ibm2.py
-nltk/test/unit/align/test_ibm4.py
-nltk/test/unit/align/test_ibm5.py
-nltk/test/unit/align/test_ibm_model.py
+nltk/test/unit/translate/__init__.py
+nltk/test/unit/translate/test_bleu.py
+nltk/test/unit/translate/test_ibm1.py
+nltk/test/unit/translate/test_ibm2.py
+nltk/test/unit/translate/test_ibm3.py
+nltk/test/unit/translate/test_ibm4.py
+nltk/test/unit/translate/test_ibm5.py
+nltk/test/unit/translate/test_ibm_model.py
+nltk/test/unit/translate/test_stack_decoder.py
 nltk/tokenize/__init__.py
 nltk/tokenize/api.py
 nltk/tokenize/casual.py
+nltk/tokenize/mwe.py
 nltk/tokenize/punkt.py
 nltk/tokenize/regexp.py
 nltk/tokenize/sexpr.py
@@ -336,8 +340,23 @@ nltk/tokenize/stanford.py
 nltk/tokenize/texttiling.py
 nltk/tokenize/treebank.py
 nltk/tokenize/util.py
+nltk/translate/__init__.py
+nltk/translate/api.py
+nltk/translate/bleu_score.py
+nltk/translate/gale_church.py
+nltk/translate/gdfa.py
+nltk/translate/ibm1.py
+nltk/translate/ibm2.py
+nltk/translate/ibm3.py
+nltk/translate/ibm4.py
+nltk/translate/ibm5.py
+nltk/translate/ibm_model.py
+nltk/translate/metrics.py
+nltk/translate/phrase_based.py
+nltk/translate/stack_decoder.py
 nltk/twitter/__init__.py
 nltk/twitter/api.py
+nltk/twitter/common.py
 nltk/twitter/twitter_demo.py
 nltk/twitter/twitterclient.py
 nltk/twitter/util.py
\ No newline at end of file
diff --git a/nltk.egg-info/requires.txt b/nltk.egg-info/requires.txt
deleted file mode 100644
index dde8185..0000000
--- a/nltk.egg-info/requires.txt
+++ /dev/null
@@ -1 +0,0 @@
-six>=1.9.0
diff --git a/nltk/VERSION b/nltk/VERSION
index eca690e..8c50098 100644
--- a/nltk/VERSION
+++ b/nltk/VERSION
@@ -1 +1 @@
-3.0.5
+3.1
diff --git a/nltk/__init__.py b/nltk/__init__.py
index 963b882..5f3467d 100644
--- a/nltk/__init__.py
+++ b/nltk/__init__.py
@@ -19,9 +19,9 @@ from __future__ import print_function, absolute_import
 
 import os
 
-##//////////////////////////////////////////////////////
-##  Metadata
-##//////////////////////////////////////////////////////
+# //////////////////////////////////////////////////////
+# Metadata
+# //////////////////////////////////////////////////////
 
 # Version.  For each new release, the version number should be updated
 # in the file VERSION.
@@ -35,7 +35,7 @@ except NameError:
 except IOError as ex:
     __version__ = "unknown (%s)" % ex
 
-if __doc__ is not None: # fix for the ``python -OO``
+if __doc__ is not None:  # fix for the ``python -OO``
     __doc__ += '\n at version: ' + __version__
 
 
@@ -84,7 +84,7 @@ __classifiers__ = [
     'Topic :: Text Processing :: General',
     'Topic :: Text Processing :: Indexing',
     'Topic :: Text Processing :: Linguistic',
-    ]
+]
 
 from nltk.internals import config_java
 
@@ -125,7 +125,6 @@ from nltk.jsontags import *
 # PACKAGES
 ###########################################################
 
-from nltk.align import *
 from nltk.chunk import *
 from nltk.classify import *
 from nltk.inference import *
@@ -133,6 +132,7 @@ from nltk.metrics import *
 from nltk.parse import *
 from nltk.tag import *
 from nltk.tokenize import *
+from nltk.translate import *
 from nltk.sem import *
 from nltk.stem import *
 
@@ -174,10 +174,11 @@ else:
 # they override the same names inadvertently imported
 # from a subpackage)
 
-from nltk import align, ccg, chunk, classify, collocations
+from nltk import ccg, chunk, classify, collocations
 from nltk import data, featstruct, grammar, help, inference, metrics
 from nltk import misc, parse, probability, sem, stem, wsd
-from nltk import tag, tbl, text, tokenize, tree, treetransforms, util
+from nltk import tag, tbl, text, tokenize, translate, tree, treetransforms, util
+
 
 # override any accidentally imported demo
 def demo():
diff --git a/nltk/align/__init__.py b/nltk/align/__init__.py
deleted file mode 100644
index 82de78e..0000000
--- a/nltk/align/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: Aligners
-#
-# Copyright (C) 2001-2013 NLTK Project
-# Author: Steven Bird <stevenbird1 at gmail.com> (minor additions)
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-"""
-Experimental functionality for bitext alignment.
-These interfaces are prone to change.
-"""
-
-from nltk.align.api import AlignedSent, Alignment
-from nltk.align.ibm_model import IBMModel
-from nltk.align.ibm1 import IBMModel1
-from nltk.align.ibm2 import IBMModel2
-from nltk.align.ibm3 import IBMModel3
-from nltk.align.ibm4 import IBMModel4
-from nltk.align.ibm5 import IBMModel5
-from nltk.align.bleu_score import bleu
-
-
-
diff --git a/nltk/align/bleu_score.py b/nltk/align/bleu_score.py
deleted file mode 100644
index 404428f..0000000
--- a/nltk/align/bleu_score.py
+++ /dev/null
@@ -1,263 +0,0 @@
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: BLEU Score
-#
-# Copyright (C) 2001-2015 NLTK Project
-# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
-# Contributors: Dmitrijs Milajevs
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-"""BLEU score implementation."""
-
-from __future__ import division
-
-import math
-
-from nltk.tokenize import word_tokenize
-from nltk.compat import Counter
-from nltk.util import ngrams
-
-
-def bleu(candidate, references, weights):
-    """Calculate BLEU score (Bilingual Evaluation Understudy)
-
-    :param candidate: a candidate sentence
-    :type candidate: list(str)
-    :param references: reference sentences
-    :type references: list(list(str))
-    :param weights: weights for unigrams, bigrams, trigrams and so on
-    :type weights: list(float)
-
-    >>> weights = [0.25, 0.25, 0.25, 0.25]
-    >>> candidate1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
-    ...               'ensures', 'that', 'the', 'military', 'always',
-    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']
-
-    >>> candidate2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
-    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
-    ...               'that', 'party', 'direct']
-
-    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
-    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
-    ...               'heed', 'Party', 'commands']
-
-    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
-    ...               'guarantees', 'the', 'military', 'forces', 'always',
-    ...               'being', 'under', 'the', 'command', 'of', 'the',
-    ...               'Party']
-
-    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
-    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
-    ...               'of', 'the', 'party']
-
-    >>> bleu(candidate1, [reference1, reference2, reference3], weights)
-    0.504...
-
-    >>> bleu(candidate2, [reference1, reference2, reference3], weights)
-    0
-
-    Papineni, Kishore, et al. "BLEU: A method for automatic evaluation of
-    machine translation." Proceedings of the 40th annual meeting on association for
-    computational linguistics. Association for Computational Linguistics, 2002.
-    http://www.aclweb.org/anthology/P02-1040.pdf
-
-    """
-    p_ns = (
-        _modified_precision(candidate, references, i)
-        for i, _ in enumerate(weights, start=1)
-    )
-
-    try:
-        s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns))
-    except ValueError:
-        # some p_ns is 0
-        return 0
-
-    bp = _brevity_penalty(candidate, references)
-    return bp * math.exp(s)
-
-
-def _modified_precision(candidate, references, n):
-    """Calculate modified ngram precision.
-
-    The normal precision method may lead to some wrong translations with
-    high-precision, e.g., the translation, in which a word of reference
-    repeats several times, has very high precision. So in the modified
-    n-gram precision, a reference word will be considered exhausted after
-    a matching candidate word is identified.
-
-    Paper examples:
-
-    >>> _modified_precision(
-    ...    'the the the the the the the'.split(),
-    ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
-    ...    n=1,
-    ... )
-    0.28...
-
-    >>> _modified_precision(
-    ...    'the the the the the the the'.split(),
-    ...    ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
-    ...    n=2,
-    ... )
-    0.0
-
-    >>> _modified_precision(
-    ...    'of the'.split(),
-    ...    [
-    ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
-    ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
-    ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
-    ...    ],
-    ...    n=1,
-    ... )
-    1.0
-
-    >>> _modified_precision(
-    ...    'of the'.split(),
-    ...    [
-    ...        'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
-    ...        'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
-    ...        'It is the practical guide for the army always to heed the directions of the party'.split(),
-    ...    ],
-    ...    n=2,
-    ... )
-    1.0
-
-    More examples:
-
-    >>> weights = [0.25, 0.25, 0.25, 0.25]
-    >>> candidate1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
-    ...               'ensures', 'that', 'the', 'military', 'always',
-    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']
-
-    >>> candidate2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
-    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
-    ...               'that', 'party', 'direct']
-
-    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
-    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
-    ...               'heed', 'Party', 'commands']
-
-    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
-    ...               'guarantees', 'the', 'military', 'forces', 'always',
-    ...               'being', 'under', 'the', 'command', 'of', 'the',
-    ...               'Party']
-
-    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
-    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
-    ...               'of', 'the', 'party']
-
-    Unigrams:
-
-    >>> _modified_precision(
-    ...    candidate1,
-    ...    [reference1, reference2, reference3],
-    ...    n=1,
-    ... )
-    0.94...
-
-    >>> _modified_precision(
-    ...    candidate2,
-    ...    [reference1, reference2, reference3],
-    ...    n=1,
-    ... )
-    0.57...
-
-    Bigrams:
-
-    >>> _modified_precision(
-    ...    candidate1,
-    ...    [reference1, reference2, reference3],
-    ...    n=2,
-    ... )
-    0.58...
-
-    >>> _modified_precision(
-    ...    candidate2,
-    ...    [reference1, reference2, reference3],
-    ...    n=2,
-    ... )
-    0.07...
-
-    """
-    counts = Counter(ngrams(candidate, n))
-
-    if not counts:
-        return 0
-
-    max_counts = {}
-    for reference in references:
-        reference_counts = Counter(ngrams(reference, n))
-        for ngram in counts:
-            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])
-
-    clipped_counts = dict((ngram, min(count, max_counts[ngram])) for ngram, count in counts.items())
-
-    return sum(clipped_counts.values()) / sum(counts.values())
-
-
-def _brevity_penalty(candidate, references):
-    """Calculate brevity penalty.
-
-    As the modified n-gram precision still has the problem from the short
-    length sentence, brevity penalty is used to modify the overall BLEU
-    score according to length.
-
-    An example from the paper. There are three references with length 12, 15
-    and 17. And a terse candidate of the length 12. The brevity penalty is 1.
-
-    >>> references = [['a'] * 12, ['a'] * 15, ['a'] * 17]
-    >>> candidate = ['a'] * 12
-    >>> _brevity_penalty(candidate, references)
-    1.0
-
-    In case a candidate translation is shorter than the references, penalty is
-    applied.
-
-    >>> references = [['a'] * 28, ['a'] * 28]
-    >>> candidate = ['a'] * 12
-    >>> _brevity_penalty(candidate, references)
-    0.2635...
-
-    The length of the closest reference is used to compute the penalty. If the
-    length of a candidate is 12, and the reference lengths are 13 and 2, the
-    penalty is applied because the candidate length (12) is less then the
-    closest reference length (13).
-
-    >>> references = [['a'] * 13, ['a'] * 2]
-    >>> candidate = ['a'] * 12
-    >>> _brevity_penalty(candidate, references)
-    0.92...
-
-    The brevity penalty doesn't depend on reference order. More importantly,
-    when two reference sentences are at the same distance, the shortest
-    reference sentence length is used.
-
-    >>> references = [['a'] * 13, ['a'] * 11]
-    >>> candidate = ['a'] * 12
-    >>> _brevity_penalty(candidate, references) == _brevity_penalty(candidate, reversed(references)) == 1
-    True
-
-    A test example from mteval-v13a.pl (starting from the line 705):
-
-    >>> references = [['a'] * 11, ['a'] * 8]
-    >>> candidate = ['a'] * 7
-    >>> _brevity_penalty(candidate, references)
-    0.86...
-
-    >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
-    >>> candidate = ['a'] * 7
-    >>> _brevity_penalty(candidate, references)
-    1.0
-
-    """
-    c = len(candidate)
-    ref_lens = (len(reference) for reference in references)
-    r = min(ref_lens, key=lambda ref_len: (abs(ref_len - c), ref_len))
-
-    if c > r:
-        return 1
-    else:
-        return math.exp(1 - r / c)
-
-
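
The deleted nltk/align/bleu_score.py is superseded by
nltk/translate/bleu_score.py (244 lines in the diffstat). As the removed
docstrings spell out, the score is the brevity penalty times the geometric
mean of the modified n-gram precisions, BLEU = BP * exp(sum_n w_n * log p_n),
where BP is 1 for a candidate longer than the closest reference and
exp(1 - r/c) otherwise. A small arithmetic check of the penalty, reproducing
the 0.2635... value from the doctest above (plain Python, independent of the
NLTK API):

    from __future__ import division   # as in the deleted module (Python 2.7)
    import math

    c, r = 12, 28                     # candidate length, closest reference length
    bp = 1 if c > r else math.exp(1 - r / c)
    print(bp)                         # 0.2635..., as in the deleted doctest
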
diff --git a/nltk/align/ibm2.py b/nltk/align/ibm2.py
deleted file mode 100644
index 45e768a..0000000
--- a/nltk/align/ibm2.py
+++ /dev/null
@@ -1,257 +0,0 @@
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: IBM Model 2
-#
-# Copyright (C) 2001-2013 NLTK Project
-# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-"""
-Lexical translation model that considers word order.
-
-IBM Model 2 improves on Model 1 by accounting for word order.
-An alignment probability is introduced, a(i | j,l,m), which predicts
-a source word position, given its aligned target word's position.
-
-The EM algorithm used in Model 2 is:
-E step - In the training data, collect counts, weighted by prior
-         probabilities.
-         (a) count how many times a source language word is translated
-             into a target language word
-         (b) count how many times a particular position in the source
-             sentence is aligned to a particular position in the target
-             sentence
-
-M step - Estimate new probabilities based on the counts from the E step
-
-
-Notations:
-i: Position in the source sentence
-    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
-j: Position in the target sentence
-    Valid values are 1, 2, ..., length of target sentence
-l: Number of words in the source sentence, excluding NULL
-m: Number of words in the target sentence
-s: A word in the source language
-t: A word in the target language
-
-
-References:
-Philipp Koehn. 2010. Statistical Machine Translation.
-Cambridge University Press, New York.
-
-Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
-Robert L. Mercer. 1993. The Mathematics of Statistical Machine
-Translation: Parameter Estimation. Computational Linguistics, 19 (2),
-263-311.
-"""
-
-from __future__ import division
-from collections import defaultdict
-from nltk.align import AlignedSent
-from nltk.align import Alignment
-from nltk.align import IBMModel
-from nltk.align import IBMModel1
-import warnings
-
-
-class IBMModel2(IBMModel):
-    """
-    Lexical translation model that considers word order
-
-    >>> bitext = []
-    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
-    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
-    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
-    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
-    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
-    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
-
-    >>> ibm2 = IBMModel2(bitext, 5)
-
-    >>> print('{0:.3f}'.format(ibm2.translation_table['buch']['book']))
-    1.000
-    >>> print('{0:.3f}'.format(ibm2.translation_table['das']['book']))
-    0.000
-    >>> print('{0:.3f}'.format(ibm2.translation_table['buch'][None]))
-    0.000
-    >>> print('{0:.3f}'.format(ibm2.translation_table['ja'][None]))
-    0.000
-
-    >>> print('{0:.3f}'.format(ibm2.alignment_table[1][1][2][2]))
-    0.939
-    >>> print('{0:.3f}'.format(ibm2.alignment_table[1][2][2][2]))
-    0.000
-    >>> print('{0:.3f}'.format(ibm2.alignment_table[2][2][4][5]))
-    1.000
-
-    >>> test_sentence = bitext[2]
-    >>> test_sentence.words
-    ['das', 'buch', 'ist', 'ja', 'klein']
-    >>> test_sentence.mots
-    ['the', 'book', 'is', 'small']
-    >>> test_sentence.alignment
-    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
-
-    """
-
-    def __init__(self, sentence_aligned_corpus, iterations):
-        """
-        Train on ``sentence_aligned_corpus`` and create a lexical
-        translation model and an alignment model.
-
-        Translation direction is from ``AlignedSent.mots`` to
-        ``AlignedSent.words``.
-
-        Runs a few iterations of Model 1 training to initialize
-        model parameters.
-
-        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
-        :type sentence_aligned_corpus: list(AlignedSent)
-
-        :param iterations: Number of iterations to run training algorithm
-        :type iterations: int
-        """
-        super(IBMModel2, self).__init__(sentence_aligned_corpus)
-
-        # Get initial translation probability distribution
-        # from a few iterations of Model 1 training.
-        ibm1 = IBMModel1(sentence_aligned_corpus, 10)
-        self.translation_table = ibm1.translation_table
-
-        # Initialize the distribution of alignment probability,
-        # a(i | j,l,m) = 1 / (l+1) for all i, j, l, m
-        for aligned_sentence in sentence_aligned_corpus:
-            l = len(aligned_sentence.mots)
-            m = len(aligned_sentence.words)
-            initial_value = 1 / (l + 1)
-            if initial_value > IBMModel.MIN_PROB:
-                for i in range(0, l + 1):
-                    for j in range(1, m + 1):
-                        self.alignment_table[i][j][l][m] = initial_value
-            else:
-                warnings.warn("Source sentence is too long (" + str(l) +
-                              " words). Results may be less accurate.")
-
-        self.train(sentence_aligned_corpus, iterations)
-        self.__align_all(sentence_aligned_corpus)
-
-    def train(self, parallel_corpus, iterations):
-        for i in range(0, iterations):
-            count_t_given_s = defaultdict(lambda: defaultdict(float))
-            count_any_t_given_s = defaultdict(float)
-
-            # count of i given j, l, m
-            alignment_count = defaultdict(
-                lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
-                    lambda: 0.0))))
-            alignment_count_for_any_i = defaultdict(
-                lambda: defaultdict(lambda: defaultdict(
-                    lambda: 0.0)))
-
-            for aligned_sentence in parallel_corpus:
-                src_sentence = [None] + aligned_sentence.mots
-                trg_sentence = ['UNUSED'] + aligned_sentence.words  # 1-indexed
-                l = len(aligned_sentence.mots)
-                m = len(aligned_sentence.words)
-                total_count = defaultdict(float)
-
-                # E step (a): Compute normalization factors to weigh counts
-                for j in range(1, m + 1):
-                    t = trg_sentence[j]
-                    total_count[t] = 0
-                    for i in range(0, l + 1):
-                        s = src_sentence[i]
-                        count = (self.translation_table[t][s] *
-                                 self.alignment_table[i][j][l][m])
-                        total_count[t] += count
-
-                # E step (b): Collect counts
-                for j in range(1, m + 1):
-                    t = trg_sentence[j]
-                    for i in range(0, l + 1):
-                        s = src_sentence[i]
-                        count = (self.translation_table[t][s] *
-                                 self.alignment_table[i][j][l][m])
-                        normalized_count = count / total_count[t]
-
-                        count_t_given_s[t][s] += normalized_count
-                        count_any_t_given_s[s] += normalized_count
-                        alignment_count[i][j][l][m] += normalized_count
-                        alignment_count_for_any_i[j][l][m] += normalized_count
-
-            # M step: Update probabilities with maximum likelihood estimates
-            for s in self.src_vocab:
-                for t in self.trg_vocab:
-                    estimate = count_t_given_s[t][s] / count_any_t_given_s[s]
-                    self.translation_table[t][s] = max(estimate,
-                                                       IBMModel.MIN_PROB)
-
-            for aligned_sentence in parallel_corpus:
-                l = len(aligned_sentence.mots)
-                m = len(aligned_sentence.words)
-                for i in range(0, l + 1):
-                    for j in range(1, m + 1):
-                        estimate = (alignment_count[i][j][l][m] /
-                                    alignment_count_for_any_i[j][l][m])
-                        self.alignment_table[i][j][l][m] = max(estimate,
-                                                              IBMModel.MIN_PROB)
-
-    def prob_t_a_given_s(self, alignment_info):
-        """
-        Probability of target sentence and an alignment given the
-        source sentence
-        """
-        prob = 1.0
-        l = len(alignment_info.src_sentence) - 1
-        m = len(alignment_info.trg_sentence) - 1
-
-        for j, i in enumerate(alignment_info.alignment):
-            if j == 0:
-                continue  # skip the dummy zeroeth element
-            trg_word = alignment_info.trg_sentence[j]
-            src_word = alignment_info.src_sentence[i]
-            prob *= (self.translation_table[trg_word][src_word] *
-                     self.alignment_table[i][j][l][m])
-
-        return max(prob, IBMModel.MIN_PROB)
-
-    def __align_all(self, parallel_corpus):
-        for sentence_pair in parallel_corpus:
-            self.__align(sentence_pair)
-
-    def __align(self, sentence_pair):
-        """
-        Determines the best word alignment for one sentence pair from
-        the corpus that the model was trained on.
-
-        The best alignment will be set in ``sentence_pair`` when the
-        method returns. In contrast with the internal implementation of
-        IBM models, the word indices in the ``Alignment`` are zero-
-        indexed, not one-indexed.
-
-        :param sentence_pair: A sentence in the source language and its
-            counterpart sentence in the target language
-        :type sentence_pair: AlignedSent
-        """
-        best_alignment = []
-
-        l = len(sentence_pair.mots)
-        m = len(sentence_pair.words)
-
-        for j, trg_word in enumerate(sentence_pair.words):
-            # Initialize trg_word to align with the NULL token
-            best_prob = (self.translation_table[trg_word][None] *
-                         self.alignment_table[0][j + 1][l][m])
-            best_prob = max(best_prob, IBMModel.MIN_PROB)
-            best_alignment_point = None
-            for i, src_word in enumerate(sentence_pair.mots):
-                align_prob = (self.translation_table[trg_word][src_word] *
-                              self.alignment_table[i + 1][j + 1][l][m])
-                if align_prob >= best_prob:
-                    best_prob = align_prob
-                    best_alignment_point = i
-
-            best_alignment.append((j, best_alignment_point))
-
-        sentence_pair.alignment = Alignment(best_alignment)
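
nltk/align/ibm2.py moves to nltk/translate/ibm2.py (308 lines in the
diffstat). Reusing the doctest corpus from the deleted module, here is a
hedged sketch of the equivalent call against the new path; the constructor
signature IBMModel2(bitext, iterations) and the AlignedSent location are
assumed to carry over unchanged:

    # -*- coding: utf-8 -*-
    from nltk.translate.api import AlignedSent     # assumed new home of AlignedSent
    from nltk.translate.ibm2 import IBMModel2

    bitext = [
        AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']),
        AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']),
        AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']),
        AlignedSent(['das', 'haus'], ['the', 'house']),
        AlignedSent(['das', 'buch'], ['the', 'book']),
        AlignedSent(['ein', 'buch'], ['a', 'book']),
    ]

    ibm2 = IBMModel2(bitext, 5)                    # 5 EM iterations
    # The deleted doctest reports ~1.000 for this entry.
    print('{0:.3f}'.format(ibm2.translation_table['buch']['book']))
    # The best alignment is written back onto each sentence pair.
    print(bitext[2].alignment)
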
diff --git a/nltk/align/util.py b/nltk/align/util.py
deleted file mode 100644
index f70d060..0000000
--- a/nltk/align/util.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# Natural Language Toolkit: Aligner Utilities
-#
-# Copyright (C) 2001-2013 NLTK Project
-# Author: 
-# URL: <http://www.nltk.org/>
-# For license information, see LICENSE.TXT
-
diff --git a/nltk/book.py b/nltk/book.py
index 8cded01..7357446 100644
--- a/nltk/book.py
+++ b/nltk/book.py
@@ -10,6 +10,7 @@ from __future__ import print_function
 from nltk.corpus import (gutenberg, genesis, inaugural,
                          nps_chat, webtext, treebank, wordnet)
 from nltk.text import Text
+from nltk.probability import FreqDist
 
 print("*** Introductory Examples for the NLTK Book ***")
 print("Loading text1, ..., text9 and sent1, ..., sent9")
@@ -31,7 +32,8 @@ print("text4:", text4.name)
 text5 = Text(nps_chat.words(), name="Chat Corpus")
 print("text5:", text5.name)
 
-text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
+text6 = Text(webtext.words('grail.txt'),
+             name="Monty Python and the Holy Grail")
 print("text6:", text6.name)
 
 text7 = Text(treebank.words(), name="Wall Street Journal")
@@ -43,6 +45,7 @@ print("text8:", text8.name)
 text9 = Text(gutenberg.words('chesterton-thursday.txt'))
 print("text9:", text9.name)
 
+
 def texts():
     print("text1:", text1.name)
     print("text2:", text2.name)
@@ -74,6 +77,7 @@ sent9 = ["THE", "suburb", "of", "Saffron", "Park", "lay", "on", "the",
          "sunset", "side", "of", "London", ",", "as", "red", "and",
          "ragged", "as", "a", "cloud", "of", "sunset", "."]
 
+
 def sents():
     print("sent1:", " ".join(sent1))
     print("sent2:", " ".join(sent2))
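
The only functional change to nltk/book.py is the added FreqDist import,
which makes the frequency-distribution examples from the NLTK book work
after a star import of nltk.book. A minimal sketch (the book corpora must
already be installed via nltk.download()):

    from nltk.book import *            # loads text1 ... text9 from nltk_data

    fdist = FreqDist(text1)            # FreqDist is now re-exported by nltk.book
    print(fdist.most_common(10))       # ten most frequent tokens of text1
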
diff --git a/nltk/ccg/chart.py b/nltk/ccg/chart.py
index bdd5cf9..9b89e8d 100644
--- a/nltk/ccg/chart.py
+++ b/nltk/ccg/chart.py
@@ -7,7 +7,7 @@
 
 """
 The lexicon is constructed by calling
-``lexicon.parseLexicon(<lexicon string>)``.
+``lexicon.fromstring(<lexicon string>)``.
 
 In order to construct a parser, you also need a rule set.
 The standard English rules are provided in chart as
@@ -37,7 +37,7 @@ from nltk.parse import ParserI
 from nltk.parse.chart import AbstractChartRule, EdgeI, Chart
 from nltk.tree import Tree
 
-from nltk.ccg.lexicon import parseLexicon
+from nltk.ccg.lexicon import fromstring
 from nltk.ccg.combinator import (ForwardT, BackwardT, ForwardApplication,
                                  BackwardApplication, ForwardComposition,
                                  BackwardComposition, ForwardSubstitution,
@@ -321,7 +321,7 @@ def printCCGTree(lwidth,tree):
 ### Demonstration code
 
 # Construct the lexicon
-lex = parseLexicon('''
+lex = fromstring('''
     :- S, NP, N, VP    # Primitive categories, S is the target primitive
 
     Det :: NP/N         # Family of words
diff --git a/nltk/ccg/combinator.py b/nltk/ccg/combinator.py
index a41dccd..d79e8b9 100644
--- a/nltk/ccg/combinator.py
+++ b/nltk/ccg/combinator.py
@@ -4,6 +4,10 @@
 # Author: Graeme Gange <ggange at csse.unimelb.edu.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+"""
+CCG Combinators
+"""
+
 from __future__ import unicode_literals
 
 from nltk.compat import python_2_unicode_compatible
@@ -23,7 +27,7 @@ class UndirectedBinaryCombinator(object):
     def can_combine(self, function, argument):
         raise NotImplementedError()
 
-    def combine (self,function,argument):
+    def combine (self, function, argument):
         raise NotImplementedError()
 
 class DirectedBinaryCombinator(object):
@@ -41,23 +45,23 @@ class DirectedBinaryCombinator(object):
 
 @python_2_unicode_compatible
 class ForwardCombinator(DirectedBinaryCombinator):
-    '''
+    """
     Class representing combinators where the primary functor is on the left.
 
     Takes an undirected combinator, and a predicate which adds constraints
     restricting the cases in which it may apply.
-    '''
+    """
     def __init__(self, combinator, predicate, suffix=''):
         self._combinator = combinator
         self._predicate = predicate
         self._suffix = suffix
 
     def can_combine(self, left, right):
-        return (self._combinator.can_combine(left,right) and
-                  self._predicate(left,right))
+        return (self._combinator.can_combine(left, right) and
+                self._predicate(left, right))
 
     def combine(self, left, right):
-        for cat in self._combinator.combine(left,right):
+        for cat in self._combinator.combine(left, right):
             yield cat
 
     def __str__(self):
@@ -65,9 +69,9 @@ class ForwardCombinator(DirectedBinaryCombinator):
 
 @python_2_unicode_compatible
 class BackwardCombinator(DirectedBinaryCombinator):
-    '''
+    """
     The backward equivalent of the ForwardCombinator class.
-    '''
+    """
     def __init__(self, combinator, predicate, suffix=''):
         self._combinator = combinator
         self._predicate = predicate
@@ -75,7 +79,7 @@ class BackwardCombinator(DirectedBinaryCombinator):
 
     def can_combine(self, left, right):
         return (self._combinator.can_combine(right, left) and
-                  self._predicate(left,right))
+                self._predicate(left, right))
     def combine(self, left, right):
         for cat in self._combinator.combine(right, left):
             yield cat
@@ -98,7 +102,7 @@ class UndirectedFunctionApplication(UndirectedBinaryCombinator):
 
         return not function.arg().can_unify(argument) is None
 
-    def combine(self,function,argument):
+    def combine(self, function, argument):
         if not function.is_function():
             return
 
@@ -115,18 +119,18 @@ class UndirectedFunctionApplication(UndirectedBinaryCombinator):
 # Predicates for function application.
 
 # Ensures the left functor takes an argument on the right
-def forwardOnly(left,right):
+def forwardOnly(left, right):
     return left.dir().is_forward()
 
 # Ensures the right functor takes an argument on the left
-def backwardOnly(left,right):
+def backwardOnly(left, right):
     return right.dir().is_backward()
 
 # Application combinator instances
 ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(),
-                        forwardOnly)
+                                       forwardOnly)
 BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(),
-                        backwardOnly)
+                                         backwardOnly)
 
 
 @python_2_unicode_compatible
@@ -153,24 +157,24 @@ class UndirectedComposition(UndirectedBinaryCombinator):
             subs = function.arg().can_unify(argument.res())
             if not subs is None:
                 yield FunctionalCategory(function.res().substitute(subs),
-                            argument.arg().substitute(subs),argument.dir())
+                                         argument.arg().substitute(subs), argument.dir())
 
     def __str__(self):
         return 'B'
 
 # Predicates for restricting application of straight composition.
-def bothForward(left,right):
+def bothForward(left, right):
     return left.dir().is_forward() and right.dir().is_forward()
 
-def bothBackward(left,right):
+def bothBackward(left, right):
     return left.dir().is_backward() and right.dir().is_backward()
 
 # Predicates for crossed composition
 
-def crossedDirs(left,right):
+def crossedDirs(left, right):
     return left.dir().is_forward() and right.dir().is_backward()
 
-def backwardBxConstraint(left,right):
+def backwardBxConstraint(left, right):
     # The functors must be crossed inwards
     if not crossedDirs(left, right):
         return False
@@ -182,13 +186,13 @@ def backwardBxConstraint(left,right):
 
 # Straight composition combinators
 ForwardComposition = ForwardCombinator(UndirectedComposition(),
-                           forwardOnly)
+                                       forwardOnly)
 BackwardComposition = BackwardCombinator(UndirectedComposition(),
-                           backwardOnly)
+                                         backwardOnly)
 
 # Backward crossed composition
-BackwardBx = BackwardCombinator(UndirectedComposition(),backwardBxConstraint,
-                suffix='x')
+BackwardBx = BackwardCombinator(UndirectedComposition(), backwardBxConstraint,
+                                suffix='x')
 
 @python_2_unicode_compatible
 class UndirectedSubstitution(UndirectedBinaryCombinator):
@@ -213,9 +217,9 @@ class UndirectedSubstitution(UndirectedBinaryCombinator):
             return False
         return (function.res().arg() == argument.res()) and (function.arg() == argument.arg())
 
-    def combine(self,function,argument):
-        if self.can_combine(function,argument):
-            yield FunctionalCategory(function.res().res(),argument.arg(),argument.dir())
+    def combine(self, function, argument):
+        if self.can_combine(function, argument):
+            yield FunctionalCategory(function.res().res(), argument.arg(), argument.dir())
 
     def __str__(self):
         return 'S'
@@ -227,7 +231,7 @@ def forwardSConstraint(left, right):
     return left.res().dir().is_forward() and left.arg().is_primitive()
 
 # Predicate for backward crossed substitution
-def backwardSxConstraint(left,right):
+def backwardSxConstraint(left, right):
     if not left.dir().can_cross() and right.dir().can_cross():
         return False
     if not bothForward(left, right):
@@ -236,9 +240,9 @@ def backwardSxConstraint(left,right):
 
 # Instances of substitution combinators
 ForwardSubstitution = ForwardCombinator(UndirectedSubstitution(),
-                            forwardSConstraint)
+                                        forwardSConstraint)
 BackwardSx = BackwardCombinator(UndirectedSubstitution(),
-                    backwardSxConstraint,'x')
+                                backwardSxConstraint, 'x')
 
 
 # Retrieves the left-most functional category.
@@ -250,10 +254,10 @@ def innermostFunction(categ):
 
 @python_2_unicode_compatible
 class UndirectedTypeRaise(UndirectedBinaryCombinator):
-    '''
+    """
     Undirected combinator for type raising.
-    '''
-    def can_combine(self,function,arg):
+    """
+    def can_combine(self, function, arg):
         # The argument must be a function.
         # The restriction that arg.res() must be a function
         # merely reduces redundant type-raising; if arg.res() is
@@ -262,7 +266,7 @@ class UndirectedTypeRaise(UndirectedBinaryCombinator):
         # which is equivalent to
         # X Y\X =>(<) Y
         if not (arg.is_function() and arg.res().is_function()):
-                return False
+            return False
 
         arg = innermostFunction(arg)
 
@@ -272,7 +276,7 @@ class UndirectedTypeRaise(UndirectedBinaryCombinator):
             return True
         return False
 
-    def combine(self,function,arg):
+    def combine(self, function, arg):
         if not (function.is_primitive() and
                 arg.is_function() and arg.res().is_function()):
             return
@@ -284,8 +288,8 @@ class UndirectedTypeRaise(UndirectedBinaryCombinator):
         if subs is not None:
             xcat = arg.res().substitute(subs)
             yield FunctionalCategory(xcat,
-                    FunctionalCategory(xcat,function,arg.dir()),
-                    -(arg.dir()))
+                                     FunctionalCategory(xcat, function, arg.dir()),
+                                     -(arg.dir()))
 
     def __str__(self):
         return 'T'
@@ -295,11 +299,11 @@ class UndirectedTypeRaise(UndirectedBinaryCombinator):
 # the primary functor.
 # The restriction that the variable must be primitive is not
 # common to all versions of CCGs; some authors have other restrictions.
-def forwardTConstraint(left,right):
+def forwardTConstraint(left, right):
     arg = innermostFunction(right)
     return arg.dir().is_backward() and arg.res().is_primitive()
 
-def backwardTConstraint(left,right):
+def backwardTConstraint(left, right):
     arg = innermostFunction(left)
     return arg.dir().is_forward() and arg.res().is_primitive()
 
diff --git a/nltk/ccg/lexicon.py b/nltk/ccg/lexicon.py
index 5439a14..6c3e12b 100644
--- a/nltk/ccg/lexicon.py
+++ b/nltk/ccg/lexicon.py
@@ -4,6 +4,10 @@
 # Author: Graeme Gange <ggange at csse.unimelb.edu.au>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
+"""
+CCG Lexicons
+"""
+
 from __future__ import unicode_literals
 
 import re
@@ -11,79 +15,92 @@ from collections import defaultdict
 
 from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory
 from nltk.compat import python_2_unicode_compatible
+from nltk.internals import deprecated
 
 #------------
 # Regular expressions used for parsing components of the lexicon
 #------------
 
 # Parses a primitive category and subscripts
-rePrim = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''')
+PRIM_RE = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''')
 
 # Separates the next primitive category from the remainder of the
 # string
-reNextPrim = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''')
+NEXTPRIM_RE = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''')
 
 # Separates the next application operator from the remainder
-reApp = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''')
+APP_RE = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''')
 
 # Parses the definition of the category of either a word or a family
-reLex = re.compile(r'''([\w_]+)\s*(::|[-=]+>)\s*(.+)''', re.UNICODE)
+LEX_RE = re.compile(r'''([\w_]+)\s*(::|[-=]+>)\s*(.+)''', re.UNICODE)
 
 # Strips comments from a line
-reComm = re.compile('''([^#]*)(?:#.*)?''')
+COMMENTS_RE = re.compile('''([^#]*)(?:#.*)?''')
 
 #----------
 # Lexicons
 #----------
+
 @python_2_unicode_compatible
 class CCGLexicon(object):
-    '''
+    """
     Class representing a lexicon for CCG grammars.
-    primitives - The list of primitive categories for the lexicon
-    families - Families of categories
-    entries - A mapping of words to possible categories
-    '''
-    def __init__(self,start,primitives,families,entries):
+
+    * `primitives`: The list of primitive categories for the lexicon
+    * `families`: Families of categories
+    * `entries`: A mapping of words to possible categories
+    """
+    def __init__(self, start, primitives, families, entries):
         self._start = PrimitiveCategory(start)
         self._primitives = primitives
         self._families = families
         self._entries = entries
 
-    # Returns all the possible categories for a word
+
     def categories(self, word):
+        """
+        Returns all the possible categories for a word
+        """
         return self._entries[word]
 
-    # Returns the target category for the parser
+
     def start(self):
+        """
+        Return the target category for the parser
+        """
         return self._start
 
-    # String representation of the lexicon
-    # Used for debugging
     def __str__(self):
-        st = ""
+        """
+        String representation of the lexicon. Used for debugging.
+        """
+        string = ""
         first = True
         for ident in self._entries:
             if not first:
-                st = st + "\n"
-            st = st + ident + " => "
+                string = string + "\n"
+            string = string + ident + " => "
 
             first = True
             for cat in self._entries[ident]:
                 if not first:
-                    st = st + " | "
+                    string = string + " | "
                 else:
                     first = False
-                st = st + "%s" % cat
-        return st
+                string = string + "%s" % cat
+        return string
 
 
 #-----------
 # Parsing lexicons
 #-----------
 
-# Separates the contents matching the first set of brackets
-# from the rest of the input.
+
 def matchBrackets(string):
+    """
+    Separate the contents matching the first set of brackets from the rest of
+    the input.
+    """
     rest = string[1:]
     inside = "("
 
@@ -98,27 +115,39 @@ def matchBrackets(string):
         return (inside + ')', rest[1:])
     raise AssertionError('Unmatched bracket in string \'' + string + '\'')
 
-# Separates the string for the next portion of the category
-# from the rest of the string
+
 def nextCategory(string):
+    """
+    Separate the string for the next portion of the category from the rest
+    of the string
+    """
     if string.startswith('('):
         return matchBrackets(string)
-    return reNextPrim.match(string).groups()
+    return NEXTPRIM_RE.match(string).groups()
 
-# Parses an application operator
 def parseApplication(app):
+    """
+    Parse an application operator
+    """
     return Direction(app[0], app[1:])
 
-# Parses the subscripts for a primitive category
+
 def parseSubscripts(subscr):
+    """
+    Parse the subscripts for a primitive category
+    """
     if subscr:
         return subscr[1:-1].split(',')
     return []
 
-# Parse a primitive category
+
 def parsePrimitiveCategory(chunks, primitives, families, var):
-    # If the primitive is the special category 'var',
-    # replace it with the correct CCGVar
+    """
+    Parse a primitive category
+
+    If the primitive is the special category 'var', replace it with the
+    correct `CCGVar`.
+    """
     if chunks[0] == "var":
         if chunks[1] is None:
             if var is None:
@@ -139,45 +168,57 @@ def parsePrimitiveCategory(chunks, primitives, families, var):
         return (PrimitiveCategory(catstr, subscrs), var)
     raise AssertionError('String \'' + catstr + '\' is neither a family nor primitive category.')
 
-# parseCategory drops the 'var' from the tuple
+
 def parseCategory(line, primitives, families):
+    """
+    Drop the 'var' from the tuple
+    """
     return augParseCategory(line, primitives, families)[0]
 
-# Parses a string representing a category, and returns
-# a tuple with (possibly) the CCG variable for the category
+
 def augParseCategory(line, primitives, families, var=None):
-    (str, rest) = nextCategory(line)
+    """
+    Parse a string representing a category, and returns a tuple with
+    (possibly) the CCG variable for the category
+    """
+    (cat_string, rest) = nextCategory(line)
 
-    if str.startswith('('):
-        (res, var) = augParseCategory(str[1:-1], primitives, families, var)
+    if cat_string.startswith('('):
+        (res, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
 
     else:
 #        print rePrim.match(str).groups()
-        (res, var) = parsePrimitiveCategory(rePrim.match(str).groups(),
-	                                    primitives, families, var)
+        (res, var) =\
+            parsePrimitiveCategory(PRIM_RE.match(cat_string).groups(), primitives,
+                                   families, var)
 
     while rest != "":
-        app = reApp.match(rest).groups()
-        dir = parseApplication(app[0:3])
+        app = APP_RE.match(rest).groups()
+        direction = parseApplication(app[0:3])
         rest = app[3]
 
-        (str, rest) = nextCategory(rest)
-        if str.startswith('('):
-            (arg, var) = augParseCategory(str[1:-1], primitives, families, var)
+        (cat_string, rest) = nextCategory(rest)
+        if cat_string.startswith('('):
+            (arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
         else:
-            (arg, var) = parsePrimitiveCategory(rePrim.match(str).groups(), primitives, families, var)
-        res = FunctionalCategory(res, arg, dir)
+            (arg, var) =\
+                parsePrimitiveCategory(PRIM_RE.match(cat_string).groups(),
+                                       primitives, families, var)
+        res = FunctionalCategory(res, arg, direction)
 
     return (res, var)
 
-# Takes an input string, and converts it into a lexicon for CCGs.
-def parseLexicon(lex_str):
+
+def fromstring(lex_str):
+    """
+    Convert string representation into a lexicon for CCGs.
+    """
     primitives = []
     families = {}
     entries = defaultdict(list)
     for line in lex_str.splitlines():
         # Strip comments and leading/trailing whitespace.
-        line = reComm.match(line).groups()[0].strip()
+        line = COMMENTS_RE.match(line).groups()[0].strip()
         if line == "":
             continue
 
@@ -185,10 +226,10 @@ def parseLexicon(lex_str):
             # A line of primitive categories.
             # The first one is the target category
             # ie, :- S, N, NP, VP
-            primitives = primitives + [ prim.strip() for prim in line[2:].strip().split(',') ]
+            primitives = primitives + [prim.strip() for prim in line[2:].strip().split(',')]
         else:
             # Either a family definition, or a word definition
-            (ident, sep, catstr) = reLex.match(line).groups()
+            (ident, sep, catstr) = LEX_RE.match(line).groups()
             (cat, var) = augParseCategory(catstr, primitives, families)
             if sep == '::':
                 # Family definition
@@ -201,7 +242,11 @@ def parseLexicon(lex_str):
     return CCGLexicon(primitives[0], primitives, families, entries)
 
 
-openccg_tinytiny = parseLexicon('''
+ at deprecated('Use fromstring() instead.')
+def parseLexicon(lex_str):
+    return fromstring(lex_str)
+
+openccg_tinytiny = fromstring("""
     # Rather minimal lexicon based on the openccg `tinytiny' grammar.
     # Only incorporates a subset of the morphological subcategories, however.
     :- S,NP,N                    # Primitive categories
@@ -242,4 +287,4 @@ openccg_tinytiny = parseLexicon('''
 
     see => TransVpl
     sees => TransVsg
-    ''')
+    """)
diff --git a/nltk/chunk/util.py b/nltk/chunk/util.py
index 7fb13f1..e6b99ac 100644
--- a/nltk/chunk/util.py
+++ b/nltk/chunk/util.py
@@ -10,6 +10,7 @@ from __future__ import print_function, unicode_literals
 import re
 
 from nltk.tree import Tree
+from nltk.tag.mapping import map_tag
 from nltk.tag.util import str2tuple
 from nltk.compat import python_2_unicode_compatible
 
@@ -307,7 +308,8 @@ def _chunksets(t, count, chunk_label):
     return set(chunks)
 
 
-def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/'):
+def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/',
+                source_tagset=None, target_tagset=None):
     """
     Divide a string of bracketted tagged text into
     chunks and unchunked tokens, and produce a Tree.
@@ -344,7 +346,10 @@ def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/'):
             if sep is None:
                 stack[-1].append(text)
             else:
-                stack[-1].append(str2tuple(text, sep))
+                word, tag = str2tuple(text, sep)
+                if source_tagset and target_tagset:
+                    tag = map_tag(source_tagset, target_tagset, tag)
+                stack[-1].append((word, tag))
 
     if len(stack) != 1:
         raise ValueError('Expected ] at char %d' % len(s))
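
A short sketch of what the new source_tagset/target_tagset parameters enable; the sentence is invented, and mapping to the 'universal' tagset assumes the universal_tagset mapping data has been installed via nltk.download():

    from nltk.chunk.util import tagstr2tree

    s = "[ the/DT little/JJ dog/NN ] barked/VBD ./."
    # As before: keep the original WSJ-style tags.
    tree = tagstr2tree(s, chunk_label='NP')
    # New in this release: map each tag while reading, e.g. DT -> DET, VBD -> VERB.
    tree_univ = tagstr2tree(s, chunk_label='NP',
                            source_tagset='wsj', target_tagset='universal')
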
diff --git a/nltk/classify/rte_classify.py b/nltk/classify/rte_classify.py
index 5f34545..5bcca62 100644
--- a/nltk/classify/rte_classify.py
+++ b/nltk/classify/rte_classify.py
@@ -36,7 +36,7 @@ def lemmatize(word):
     """
     Use morphy from WordNet to find the base form of verbs.
     """
-    lemma = nltk.corpus.wordnet.morphy(word, pos='verb')
+    lemma = nltk.corpus.wordnet.morphy(word, pos=nltk.corpus.wordnet.VERB)
     if lemma is not None:
         return lemma
     return word
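
The change above matters because wordnet.VERB resolves to the single-letter POS tag 'v' that morphy() expects, whereas the plain string 'verb' is not recognised there. A small illustration, assuming the wordnet data package is installed:

    from nltk.corpus import wordnet

    wordnet.morphy('saw', pos=wordnet.VERB)   # -> 'see' (verb reading)
    wordnet.morphy('saw', pos=wordnet.NOUN)   # -> 'saw' (noun reading)
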
diff --git a/nltk/collocations.py b/nltk/collocations.py
index fdb4b22..ee48b4c 100644
--- a/nltk/collocations.py
+++ b/nltk/collocations.py
@@ -53,30 +53,31 @@ class AbstractCollocationFinder(object):
 
     def __init__(self, word_fd, ngram_fd):
         self.word_fd = word_fd
+        self.N = word_fd.N()
         self.ngram_fd = ngram_fd
-    
+
     @classmethod
-    def _build_new_documents(cls,documents, window_size, pad_left=False, pad_right=False, pad_symbol=None):
+    def _build_new_documents(cls, documents, window_size, pad_left=False, pad_right=False, pad_symbol=None):
         '''
-        Pad the document with the place holder according to the window_size 
+        Pad the document with the place holder according to the window_size
         '''
         padding = (pad_symbol,) * (window_size - 1)
         if pad_right:
             return _itertools.chain.from_iterable(_itertools.chain(doc, padding) for doc in documents)
         if pad_left:
             return _itertools.chain.from_iterable(_itertools.chain(padding, doc) for doc in documents)
-    
+
     @classmethod
     def from_documents(cls, documents):
         """Constructs a collocation finder given a collection of documents,
         each of which is a list (or iterable) of tokens.
         """
         #return cls.from_words(_itertools.chain(*documents))
-        return cls.from_words(cls._build_new_documents(documents, cls.default_ws, pad_right=True)) 
+        return cls.from_words(cls._build_new_documents(documents, cls.default_ws, pad_right=True))
 
     @staticmethod
     def _ngram_freqdist(words, n):
-        return FreqDist(tuple(words[i:i+n]) for i in range(len(words)-1))
+        return FreqDist(tuple(words[i:i + n]) for i in range(len(words) - 1))
 
     def _apply_filter(self, fn=lambda ngram, freq: False):
         """Generic filter removes ngrams from the frequency distribution
@@ -139,8 +140,8 @@ class BigramCollocationFinder(AbstractCollocationFinder):
     association measures. It is often useful to use from_words() rather than
     constructing an instance directly.
     """
-    default_ws = 2 
-    
+    default_ws = 2
+
     def __init__(self, word_fd, bigram_fd, window_size=2):
         """Construct a BigramCollocationFinder, given FreqDists for
         appearances of words and (possibly non-contiguous) bigrams.
@@ -162,7 +163,7 @@ class BigramCollocationFinder(AbstractCollocationFinder):
 
         for window in ngrams(words, window_size, pad_right=True):
             w1 = window[0]
-            if w1 is None: 
+            if w1 is None:
                 continue
             wfd[w1] += 1
             for w2 in window[1:]:
@@ -175,7 +176,7 @@ class BigramCollocationFinder(AbstractCollocationFinder):
         function.  Following Church and Hanks (1990), counts are scaled by
         a factor of 1/(window_size - 1).
         """
-        n_all = self.word_fd.N()
+        n_all = self.N
         n_ii = self.ngram_fd[(w1, w2)] / (self.window_size - 1.0)
         if not n_ii:
             return
@@ -190,7 +191,7 @@ class TrigramCollocationFinder(AbstractCollocationFinder):
     constructing an instance directly.
     """
     default_ws = 3
-    
+
     def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd):
         """Construct a TrigramCollocationFinder, given FreqDists for
         appearances of words, bigrams, two words with any word between them,
@@ -214,7 +215,7 @@ class TrigramCollocationFinder(AbstractCollocationFinder):
         tfd = FreqDist()
         for window in ngrams(words, window_size, pad_right=True):
             w1 = window[0]
-            if w1 is None: 
+            if w1 is None:
                 continue
             for w2, w3 in _itertools.combinations(window[1:], 2):
                 wfd[w1] += 1
@@ -238,7 +239,7 @@ class TrigramCollocationFinder(AbstractCollocationFinder):
         """Returns the score for a given trigram using the given scoring
         function.
         """
-        n_all = self.word_fd.N()
+        n_all = self.N
         n_iii = self.ngram_fd[(w1, w2, w3)]
         if not n_iii:
             return
@@ -259,7 +260,7 @@ class QuadgramCollocationFinder(AbstractCollocationFinder):
     It is often useful to use from_words() rather than constructing an instance directly.
     """
     default_ws = 4
-    
+
     def __init__(self, word_fd, quadgram_fd, ii, iii, ixi, ixxi, iixi, ixii):
         """Construct a QuadgramCollocationFinder, given FreqDists for appearances of words,
         bigrams, trigrams, two words with one word and two words between them, three words
@@ -288,7 +289,7 @@ class QuadgramCollocationFinder(AbstractCollocationFinder):
 
         for window in ngrams(words, window_size, pad_right=True):
             w1 = window[0]
-            if w1 is None: 
+            if w1 is None:
                 continue
             for w2, w3, w4 in _itertools.combinations(window[1:], 3):
                 ixxx[w1] += 1
@@ -309,7 +310,7 @@ class QuadgramCollocationFinder(AbstractCollocationFinder):
         return cls(ixxx, iiii, ii, iii, ixi, ixxi, iixi, ixii)
 
     def score_ngram(self, score_fn, w1, w2, w3, w4):
-        n_all = self.word_fd.N()
+        n_all = self.N
         n_iiii = self.ngram_fd[(w1, w2, w3, w4)]
         if not n_iiii:
             return
@@ -358,12 +359,11 @@ def demo(scorer=None, compare_scorer=None):
         cf.apply_freq_filter(3)
         cf.apply_word_filter(word_filter)
 
+        corr = spearman_correlation(ranks_from_scores(cf.score_ngrams(scorer)),
+                                    ranks_from_scores(cf.score_ngrams(compare_scorer)))
         print(file)
         print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
-        print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__,
-                                               spearman_correlation(
-                                                   ranks_from_scores(cf.score_ngrams(scorer)),
-                                                   ranks_from_scores(cf.score_ngrams(compare_scorer)))))
+        print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__, corr))
 
 # Slows down loading too much
 # bigram_measures = BigramAssocMeasures()
@@ -384,4 +384,5 @@ if __name__ == '__main__':
 
     demo(scorer, compare_scorer)
 
-__all__ = ['BigramCollocationFinder', 'TrigramCollocationFinder', 'QuadgramCollocationFinder']
+__all__ = ['BigramCollocationFinder',
+           'TrigramCollocationFinder', 'QuadgramCollocationFinder']
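
The caching of word_fd.N() above only changes where the total count is computed; the public API is unchanged. A typical usage sketch (the genesis corpus is just an example and must be downloaded separately):

    from nltk.collocations import BigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures
    from nltk.corpus import genesis

    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(genesis.words('english-web.txt'))
    finder.apply_freq_filter(3)
    print(finder.nbest(bigram_measures.pmi, 10))

    # from_documents() pads each document (see _build_new_documents above) so
    # that n-grams never cross document boundaries:
    docs = [['a', 'small', 'document'], ['another', 'small', 'document']]
    finder2 = BigramCollocationFinder.from_documents(docs)
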
diff --git a/nltk/compat.py b/nltk/compat.py
index 1534c55..04ac2be 100755
--- a/nltk/compat.py
+++ b/nltk/compat.py
@@ -19,6 +19,7 @@ PY26 = sys.version_info[:2] == (2, 6)
 if PY3:
     def b(s):
         return s.encode("latin-1")
+
     def u(s):
         return s
 
@@ -46,9 +47,9 @@ if PY3:
 
     import html.entities as htmlentitydefs
     from urllib.request import (urlopen, ProxyHandler, build_opener,
-        install_opener, getproxies, HTTPPasswordMgrWithDefaultRealm,
-        ProxyBasicAuthHandler, ProxyDigestAuthHandler, Request,
-        url2pathname)
+                                install_opener, getproxies, HTTPPasswordMgrWithDefaultRealm,
+                                ProxyBasicAuthHandler, ProxyDigestAuthHandler, Request,
+                                url2pathname)
     from urllib.error import HTTPError, URLError
     from urllib.parse import quote_plus, unquote_plus, urlencode
 
@@ -58,7 +59,7 @@ if PY3:
     UTC = timezone.utc
 
     from tempfile import TemporaryDirectory
-    
+
     unichr = chr
     if sys.version_info[1] <= 1:
         def int2byte(i):
@@ -71,6 +72,7 @@ if PY3:
 else:
     def b(s):
         return s
+
     def u(s):
         return unicode(s, "unicode_escape")
 
@@ -97,9 +99,9 @@ else:
 
     import htmlentitydefs
     from urllib2 import (urlopen, HTTPError, URLError,
-        ProxyHandler, build_opener, install_opener,
-        HTTPPasswordMgrWithDefaultRealm, ProxyBasicAuthHandler,
-        ProxyDigestAuthHandler, Request)
+                         ProxyHandler, build_opener, install_opener,
+                         HTTPPasswordMgrWithDefaultRealm, ProxyBasicAuthHandler,
+                         ProxyDigestAuthHandler, Request)
     from urllib import getproxies, quote_plus, unquote_plus, urlencode, url2pathname
 
     # Maps py2 tkinter package structure to py3 using import hook (PEP 302)
@@ -107,6 +109,7 @@ else:
         def __init__(self):
             self.mod = __import__("Tkinter")
             self.__path__ = ["nltk_py2_tkinter_package_path"]
+
         def __getattr__(self, name):
             return getattr(self.mod, name)
 
@@ -119,11 +122,13 @@ else:
                 "tkinter.font": "tkFont",
                 "tkinter.messagebox": "tkMessageBox",
             }
+
         def find_module(self, name, path=None):
             # we are only interested in tkinter modules listed
             # in self.module_map
             if name in self.module_map:
                 return self
+
         def load_module(self, name):
             if name not in sys.modules:
                 if name == 'tkinter':
@@ -143,22 +148,25 @@ else:
     # A UTC class for python 2.7
     class UTC(tzinfo):
         """UTC"""
-    
+
         def utcoffset(self, dt):
             return ZERO
-    
+
         def tzname(self, dt):
             return "UTC"
-    
+
         def dst(self, dt):
             return ZERO
-    
+
     UTC = UTC()
 
     unichr = unichr
     int2byte = chr
-    
-    import csv, codecs, cStringIO
+
+    import csv
+    import codecs
+    import cStringIO
+
     class UnicodeWriter:
         """
         A CSV writer which will write rows to CSV file "f",
@@ -171,7 +179,8 @@ else:
             self.queue = cStringIO.StringIO()
             self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
             self.stream = f
-            self.encoder = codecs.getincrementalencoder(encoding)(errors=errors)
+            encoder_cls = codecs.getincrementalencoder(encoding)
+            self.encoder = encoder_cls(errors=errors)
 
         def encode(self, data):
             if isinstance(data, basestring):
@@ -191,7 +200,6 @@ else:
             # empty queue
             self.queue.truncate(0)
 
-
     import warnings as _warnings
     import os as _os
     from tempfile import mkdtemp
@@ -212,7 +220,7 @@ else:
 
         def __init__(self, suffix="", prefix="tmp", dir=None):
             self._closed = False
-            self.name = None # Handle mkdtemp raising an exception
+            self.name = None  # Handle mkdtemp raising an exception
             self.name = mkdtemp(suffix, prefix, dir)
 
         def __repr__(self):
@@ -279,7 +287,6 @@ else:
             except OSError:
                 pass
 
-
     if PY26:
         from operator import itemgetter
         from heapq import nlargest
@@ -338,7 +345,8 @@ else:
                     for _ in repeat(None, count):
                         yield elem
 
-            # Override dict methods where the meaning changes for Counter objects.
+            # Override dict methods where the meaning changes for Counter
+            # objects.
 
             @classmethod
             def fromkeys(cls, iterable, v=None):
@@ -365,7 +373,8 @@ else:
                             for elem, count in iterable.iteritems():
                                 self[elem] = self_get(elem, 0) + count
                         else:
-                            dict.update(self, iterable) # fast path when counter is empty
+                            # fast path when counter is empty
+                            dict.update(self, iterable)
                     else:
                         self_get = self.get
                         for elem in iterable:
@@ -474,17 +483,19 @@ def iterkeys(d):
     """Return an iterator over the keys of a dictionary."""
     return getattr(d, _iterkeys)()
 
+
 def itervalues(d):
     """Return an iterator over the values of a dictionary."""
     return getattr(d, _itervalues)()
 
+
 def iteritems(d):
     """Return an iterator over the (key, value) pairs of a dictionary."""
     return getattr(d, _iteritems)()
 
 try:
     from functools import total_ordering
-except ImportError: # python 2.6
+except ImportError:  # python 2.6
     def total_ordering(cls):
         """Class decorator that fills in missing ordering methods"""
         convert = {
@@ -503,7 +514,8 @@ except ImportError: # python 2.6
         }
         roots = set(dir(cls)) & set(convert)
         if not roots:
-            raise ValueError('must define at least one ordering operation: < > <= >=')
+            raise ValueError(
+                'must define at least one ordering operation: < > <= >=')
         root = max(roots)       # prefer __lt__ to __le__ to __gt__ to __ge__
         for opname, opfunc in convert[root]:
             if opname not in roots:
@@ -527,21 +539,23 @@ if sys.platform.startswith('win'):
                          "tokenizers\punkt"]
 else:
     _PY3_DATA_UPDATES = ["chunkers/maxent_ne_chunker",
-                        "help/tagsets",
-                        "taggers/maxent_treebank_pos_tagger",
-                        "tokenizers/punkt"]
+                         "help/tagsets",
+                         "taggers/maxent_treebank_pos_tagger",
+                         "tokenizers/punkt"]
+
 
 def add_py3_data(path):
     if PY3:
         for item in _PY3_DATA_UPDATES:
             if item in str(path) and "/PY3" not in str(path):
                 pos = path.index(item) + len(item)
-                if path[pos:pos+4] == ".zip":
+                if path[pos:pos + 4] == ".zip":
                     pos += 4
                 path = path[:pos] + "/PY3" + path[pos:]
                 break
     return path
 
+
 # for use in adding /PY3 to the second (filename) argument
 # of the file pointers in data.py
 def py3_data(init_func):
@@ -555,6 +569,7 @@ def py3_data(init_func):
 import unicodedata
 import functools
 
+
 def remove_accents(text):
 
     if isinstance(text, bytes):
@@ -610,7 +625,6 @@ def python_2_unicode_compatible(klass):
         if not PY3:
             klass.__str__ = _7bit(_transliterated(klass.__unicode__))
 
-
     if not _was_fixed(klass.__repr__):
         klass.unicode_repr = klass.__repr__
         if not PY3:
diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py
index daaf3ad..c0d40db 100644
--- a/nltk/corpus/__init__.py
+++ b/nltk/corpus/__init__.py
@@ -76,14 +76,17 @@ brown = LazyCorpusLoader(
     cat_file='cats.txt', tagset='brown', encoding="ascii")
 cess_cat = LazyCorpusLoader(
     'cess_cat', BracketParseCorpusReader, r'(?!\.).*\.tbf',
-    tagset='unknown', encoding='ISO-8859-2')
+    tagset='unknown', encoding='ISO-8859-15')
 cess_esp = LazyCorpusLoader(
     'cess_esp', BracketParseCorpusReader, r'(?!\.).*\.tbf',
-    tagset='unknown', encoding='ISO-8859-2')
+    tagset='unknown', encoding='ISO-8859-15')
 cmudict = LazyCorpusLoader(
     'cmudict', CMUDictCorpusReader, ['cmudict'])
 comtrans = LazyCorpusLoader(
     'comtrans', AlignedCorpusReader, r'(?!\.).*\.txt')
+comparative_sentences = LazyCorpusLoader(
+    'comparative_sentences', ComparativeSentencesCorpusReader, r'labeledSentences\.txt',
+    encoding='latin-1')
 conll2000 = LazyCorpusLoader(
     'conll2000', ConllChunkCorpusReader,
     ['train.txt', 'test.txt'], ('NP','VP','PP'),
@@ -147,17 +150,29 @@ movie_reviews = LazyCorpusLoader(
     'movie_reviews', CategorizedPlaintextCorpusReader,
     r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
     encoding='ascii')
+multext_east = LazyCorpusLoader(
+    'mte_teip5', MTECorpusReader, r'(oana).*\.xml', encoding="utf-8")
 names = LazyCorpusLoader(
     'names', WordListCorpusReader, r'(?!\.).*\.txt', encoding='ascii')
 nkjp = LazyCorpusLoader(
     'nkjp', NKJPCorpusReader, r'', encoding='utf8')
 nps_chat = LazyCorpusLoader(
     'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml', tagset='wsj')
+opinion_lexicon = LazyCorpusLoader(
+    'opinion_lexicon', OpinionLexiconCorpusReader, r'(\w+)\-words\.txt',
+    encoding='ISO-8859-2')
 pl196x = LazyCorpusLoader(
     'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
     cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
 ppattach = LazyCorpusLoader(
     'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset'])
+product_reviews_1 = LazyCorpusLoader(
+    'product_reviews_1', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8')
+product_reviews_2 = LazyCorpusLoader(
+    'product_reviews_2', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8')
+pros_cons = LazyCorpusLoader(
+    'pros_cons', ProsConsCorpusReader, r'Integrated(Cons|Pros)\.txt',
+    cat_pattern=r'Integrated(Cons|Pros)\.txt', encoding='ISO-8859-2')
 ptb = LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions
     'ptb', CategorizedBracketParseCorpusReader, r'(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG',
     cat_file='allcats.txt', tagset='wsj')
@@ -170,6 +185,9 @@ rte = LazyCorpusLoader(
     'rte', RTECorpusReader, r'(?!\.).*\.xml')
 senseval = LazyCorpusLoader(
     'senseval', SensevalCorpusReader, r'(?!\.).*\.pos')
+sentence_polarity = LazyCorpusLoader(
+    'sentence_polarity', CategorizedSentencesCorpusReader, r'rt-polarity\.(neg|pos)',
+    cat_pattern=r'rt-polarity\.(neg|pos)', encoding='utf-8')
 sentiwordnet = LazyCorpusLoader(
     'sentiwordnet', SentiWordNetCorpusReader, 'SentiWordNet_3.0.0.txt', encoding='utf-8')
 shakespeare = LazyCorpusLoader(
@@ -182,6 +200,9 @@ state_union = LazyCorpusLoader(
     encoding='ISO-8859-2')
 stopwords = LazyCorpusLoader(
     'stopwords', WordListCorpusReader, r'(?!README|\.).*', encoding='utf8')
+subjectivity = LazyCorpusLoader(
+    'subjectivity', CategorizedSentencesCorpusReader, r'(quote.tok.gt9|plot.tok.gt9)\.5000',
+    cat_map={'quote.tok.gt9.5000':['subj'], 'plot.tok.gt9.5000':['obj']}, encoding='latin-1')
 swadesh = LazyCorpusLoader(
     'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8')
 swadesh110 = LazyCorpusLoader(
@@ -203,7 +224,7 @@ treebank = LazyCorpusLoader(
 treebank_chunk = LazyCorpusLoader(
     'treebank/tagged', ChunkedCorpusReader, r'wsj_.*\.pos',
     sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
-    para_block_reader=tagged_treebank_para_block_reader, encoding='ascii')
+    para_block_reader=tagged_treebank_para_block_reader, tagset='wsj', encoding='ascii')
 treebank_raw = LazyCorpusLoader(
     'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2')
 twitter_samples = LazyCorpusLoader(
@@ -254,7 +275,7 @@ semcor = LazyCorpusLoader(
     'semcor', SemcorCorpusReader, r'brown./tagfiles/br-.*\.xml',
     wordnet) # Must be defined *after* wordnet corpus.
 
-  
+
 def demo():
     # This is out-of-date:
     abc.demo()
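
Once the corresponding data packages are installed (the package names are assumed to match the loader names above, e.g. nltk.download('subjectivity')), the new sentiment-oriented corpora are reachable like any other corpus:

    from nltk.corpus import opinion_lexicon, subjectivity, product_reviews_1

    subjectivity.categories()                 # ['obj', 'subj']
    subjectivity.sents(categories='subj')[0]
    opinion_lexicon.words()[:10]
    product_reviews_1.fileids()[:3]
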
diff --git a/nltk/corpus/reader/__init__.py b/nltk/corpus/reader/__init__.py
index de78ba1..ca33bb3 100644
--- a/nltk/corpus/reader/__init__.py
+++ b/nltk/corpus/reader/__init__.py
@@ -96,6 +96,12 @@ from nltk.corpus.reader.sentiwordnet import *
 from nltk.corpus.reader.twitter import *
 from nltk.corpus.reader.nkjp import *
 from nltk.corpus.reader.crubadan import *
+from nltk.corpus.reader.mte import *
+from nltk.corpus.reader.reviews import *
+from nltk.corpus.reader.opinion_lexicon import *
+from nltk.corpus.reader.pros_cons import *
+from nltk.corpus.reader.categorized_sents import *
+from nltk.corpus.reader.comparative_sents import *
 
 # Make sure that nltk.corpus.reader.bracket_parse gives the module, not
 # the function bracket_parse() defined in nltk.tree:
@@ -131,5 +137,8 @@ __all__ = [
     'TimitTaggedCorpusReader', 'LinThesaurusCorpusReader',
     'SemcorCorpusReader', 'FramenetCorpusReader', 'UdhrCorpusReader',
     'BNCCorpusReader', 'SentiWordNetCorpusReader', 'SentiSynset',
-    'TwitterCorpusReader', 'NKJPCorpusReader', 'CrubadanCorpusReader'
+    'TwitterCorpusReader', 'NKJPCorpusReader', 'CrubadanCorpusReader',
+    'MTECorpusReader', 'ReviewsCorpusReader', 'OpinionLexiconCorpusReader',
+    'ProsConsCorpusReader', 'CategorizedSentencesCorpusReader',
+    'ComparativeSentencesCorpusReader'
 ]
diff --git a/nltk/corpus/reader/aligned.py b/nltk/corpus/reader/aligned.py
index 62a76eb..133f443 100644
--- a/nltk/corpus/reader/aligned.py
+++ b/nltk/corpus/reader/aligned.py
@@ -7,7 +7,7 @@
 
 from nltk import compat
 from nltk.tokenize import WhitespaceTokenizer, RegexpTokenizer
-from nltk.align import AlignedSent
+from nltk.translate import AlignedSent, Alignment
 
 from nltk.corpus.reader.api import CorpusReader
 from nltk.corpus.reader.util import StreamBackedCorpusView, concat,\
@@ -104,7 +104,7 @@ class AlignedSentCorpusView(StreamBackedCorpusView):
                  for alignedsent_str in self._alignedsent_block_reader(stream)
                  for sent_str in self._sent_tokenizer.tokenize(alignedsent_str)]
         if self._aligned:
-            block[2] = " ".join(block[2]) # kludge; we shouldn't have tokenized the alignment string
+            block[2] = Alignment.fromstring(" ".join(block[2])) # kludge; we shouldn't have tokenized the alignment string
             block = [AlignedSent(*block)]
         elif self._group_by_sent:
             block = [block[0]]
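
For context on the change above: nltk.translate.Alignment (which replaces the old nltk.align version) stores word-alignment pairs, and AlignedSent now receives a proper Alignment object rather than a raw string. A small sketch with invented tokens:

    from nltk.translate import Alignment, AlignedSent

    alignment = Alignment.fromstring('0-0 1-1 2-2')
    pair = AlignedSent(['the', 'little', 'dog'],
                       ['le', 'petit', 'chien'],
                       alignment)
    pair.alignment      # Alignment([(0, 0), (1, 1), (2, 2)])
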
diff --git a/nltk/corpus/reader/api.py b/nltk/corpus/reader/api.py
index fc8d97e..837e090 100644
--- a/nltk/corpus/reader/api.py
+++ b/nltk/corpus/reader/api.py
@@ -133,6 +133,18 @@ class CorpusReader(object):
         """
         return self.open("README").read()
 
+    def license(self):
+        """
+        Return the contents of the corpus LICENSE file, if it exists.
+        """
+        return self.open("LICENSE").read()
+
+    def citation(self):
+        """
+        Return the contents of the corpus citation.bib file, if it exists.
+        """
+        return self.open("citation.bib").read()
+
     def fileids(self):
         """
         Return a list of file identifiers for the fileids that make up
@@ -144,8 +156,8 @@ class CorpusReader(object):
         """
         Return the absolute path for the given file.
 
-        :type file: str
-        :param file: The file identifier for the file whose path
+        :type fileid: str
+        :param fileid: The file identifier for the file whose path
             should be returned.
         :rtype: PathPointer
         """
diff --git a/nltk/corpus/reader/bracket_parse.py b/nltk/corpus/reader/bracket_parse.py
index 7785d69..a2a1f88 100644
--- a/nltk/corpus/reader/bracket_parse.py
+++ b/nltk/corpus/reader/bracket_parse.py
@@ -25,8 +25,10 @@ EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')
 
 class BracketParseCorpusReader(SyntaxCorpusReader):
     """
-    Reader for corpora that consist of parenthesis-delineated parse
-    trees.
+    Reader for corpora that consist of parenthesis-delineated parse trees,
+    like those found in the "combined" section of the Penn Treebank,
+    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".
+
     """
     def __init__(self, root, fileids, comment_char=None,
                  detect_blocks='unindented_paren', encoding='utf8',
diff --git a/nltk/corpus/reader/categorized_sents.py b/nltk/corpus/reader/categorized_sents.py
new file mode 100644
index 0000000..e74b5fe
--- /dev/null
+++ b/nltk/corpus/reader/categorized_sents.py
@@ -0,0 +1,178 @@
+# Natural Language Toolkit: Categorized Sentences Corpus Reader
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+CorpusReader structured for corpora that contain one instance on each row.
+This CorpusReader is specifically used for the Subjectivity Dataset and the
+Sentence Polarity Dataset.
+
+- Subjectivity Dataset information -
+
+Authors: Bo Pang and Lillian Lee.
+Url: http://www.cs.cornell.edu/people/pabo/movie-review-data
+
+Distributed with permission.
+
+Related papers:
+
+- Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using
+    Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL,
+    2004.
+
+- Sentence Polarity Dataset information -
+
+Authors: Bo Pang and Lillian Lee.
+Url: http://www.cs.cornell.edu/people/pabo/movie-review-data
+
+Related papers:
+
+- Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for
+    sentiment categorization with respect to rating scales". Proceedings of the
+    ACL, 2005.
+"""
+
+from nltk.corpus.reader.api import *
+from nltk.tokenize import *
+
+class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
+    """
+    A reader for corpora in which each row represents a single instance, mainly
+    a sentence. Instances are divided into categories based on their file identifiers
+    (see CategorizedCorpusReader).
+    Since many corpora allow rows that contain more than one sentence, it is
+    possible to specify a sentence tokenizer to retrieve all sentences instead
+    of all rows.
+
+    Examples using the Subjectivity Dataset:
+
+    >>> from nltk.corpus import subjectivity
+    >>> subjectivity.sents()[23]
+    ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits',
+    'happened', 'off', 'screen', '.']
+    >>> subjectivity.categories()
+    ['obj', 'subj']
+    >>> subjectivity.words(categories='subj')
+    ['smart', 'and', 'alert', ',', 'thirteen', ...]
+
+    Examples using the Sentence Polarity Dataset:
+
+    >>> from nltk.corpus import sentence_polarity
+    >>> sentence_polarity.sents()
+    [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish',
+    'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find',
+    'it', 'funny', '.'], ...]
+    >>> sentence_polarity.categories()
+    ['neg', 'pos']
+    """
+
+    CorpusView = StreamBackedCorpusView
+
+    def __init__(self, root, fileids, word_tokenizer=WhitespaceTokenizer(),
+                 sent_tokenizer=None, encoding='utf8', **kwargs):
+        """
+        :param root: The root directory for the corpus.
+        :param fileids: a list or regexp specifying the fileids in the corpus.
+        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
+            into words. Default: `WhitespaceTokenizer`
+        :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences.
+        :param encoding: the encoding that should be used to read the corpus.
+        :param kwargs: additional parameters passed to CategorizedCorpusReader.
+        """
+
+        CorpusReader.__init__(self, root, fileids, encoding)
+        CategorizedCorpusReader.__init__(self, kwargs)
+        self._word_tokenizer = word_tokenizer
+        self._sent_tokenizer = sent_tokenizer
+
+    def _resolve(self, fileids, categories):
+        if fileids is not None and categories is not None:
+            raise ValueError('Specify fileids or categories, not both')
+        if categories is not None:
+            return self.fileids(categories)
+        else:
+            return fileids
+
+    def raw(self, fileids=None, categories=None):
+        """
+        :param fileids: a list or regexp specifying the fileids that have to be
+            returned as a raw string.
+        :param categories: a list specifying the categories whose files have to
+            be returned as a raw string.
+        :return: the given file(s) as a single string.
+        :rtype: str
+        """
+        fileids = self._resolve(fileids, categories)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def readme(self):
+        """
+        Return the contents of the corpus README file.
+        """
+        return self.open("README").read()
+
+    def sents(self, fileids=None, categories=None):
+        """
+        Return all sentences in the corpus or in the specified file(s).
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            sentences have to be returned.
+        :param categories: a list specifying the categories whose sentences have
+            to be returned.
+        :return: the given file(s) as a list of sentences.
+            Each sentence is tokenized using the specified word_tokenizer.
+        :rtype: list(list(str))
+        """
+        fileids = self._resolve(fileids, categories)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, compat.string_types):
+            fileids = [fileids]
+        return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
+            for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+    def words(self, fileids=None, categories=None):
+        """
+        Return all words and punctuation symbols in the corpus or in the specified
+        file(s).
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            words have to be returned.
+        :param categories: a list specifying the categories whose words have to
+            be returned.
+        :return: the given file(s) as a list of words and punctuation symbols.
+        :rtype: list(str)
+        """
+        fileids = self._resolve(fileids, categories)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, compat.string_types):
+            fileids = [fileids]
+        return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+            for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+    def _read_sent_block(self, stream):
+        sents = []
+        for i in range(20): # Read 20 lines at a time.
+            line = stream.readline()
+            if not line:
+                continue
+            if self._sent_tokenizer:
+                sents.extend([self._word_tokenizer.tokenize(sent)
+                              for sent in self._sent_tokenizer.tokenize(line)])
+            else:
+                sents.append(self._word_tokenizer.tokenize(line))
+        return sents
+
+    def _read_word_block(self, stream):
+        words = []
+        for sent in self._read_sent_block(stream):
+            words.extend(sent)
+        return words
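
The reader can also be instantiated directly over a local copy of a one-instance-per-line corpus; the root path below is hypothetical, and the file pattern mirrors the sentence_polarity loader defined in nltk/corpus/__init__.py:

    from nltk.corpus.reader.categorized_sents import CategorizedSentencesCorpusReader

    reader = CategorizedSentencesCorpusReader(
        '/path/to/rt-polaritydata',            # hypothetical root directory
        r'rt-polarity\.(neg|pos)',
        cat_pattern=r'rt-polarity\.(neg|pos)',
        encoding='utf-8')
    reader.categories()                        # ['neg', 'pos']
    reader.words(categories='pos')[:10]
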
diff --git a/nltk/corpus/reader/chunked.py b/nltk/corpus/reader/chunked.py
index 7a666d6..10f9ab3 100644
--- a/nltk/corpus/reader/chunked.py
+++ b/nltk/corpus/reader/chunked.py
@@ -37,14 +37,13 @@ class ChunkedCorpusReader(CorpusReader):
                  str2chunktree=tagstr2tree,
                  sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                  para_block_reader=read_blankline_block,
-                 encoding='utf8'):
+                 encoding='utf8', tagset=None):
         """
         :param root: The root directory for this corpus.
         :param fileids: A list or regexp specifying the fileids in this corpus.
         """
         CorpusReader.__init__(self, root, fileids, encoding)
-
-        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader)
+        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)
         """Arguments for corpus views generated by this corpus: a tuple
         (str2chunktree, sent_tokenizer, para_block_tokenizer)"""
 
@@ -86,37 +85,37 @@ class ChunkedCorpusReader(CorpusReader):
         return concat([ChunkedCorpusView(f, enc, 0, 1, 1, 0, *self._cv_args)
                        for (f, enc) in self.abspaths(fileids, True)])
 
-    def tagged_words(self, fileids=None):
+    def tagged_words(self, fileids=None, tagset=None):
         """
         :return: the given file(s) as a list of tagged
             words and punctuation symbols, encoded as tuples
             ``(word,tag)``.
         :rtype: list(tuple(str,str))
         """
-        return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 0, *self._cv_args)
+        return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 0, *self._cv_args, target_tagset=tagset)
                        for (f, enc) in self.abspaths(fileids, True)])
 
-    def tagged_sents(self, fileids=None):
+    def tagged_sents(self, fileids=None, tagset=None):
         """
         :return: the given file(s) as a list of
             sentences, each encoded as a list of ``(word,tag)`` tuples.
 
         :rtype: list(list(tuple(str,str)))
         """
-        return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 0, *self._cv_args)
+        return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 0, *self._cv_args, target_tagset=tagset)
                        for (f, enc) in self.abspaths(fileids, True)])
 
-    def tagged_paras(self, fileids=None):
+    def tagged_paras(self, fileids=None, tagset=None):
         """
         :return: the given file(s) as a list of
             paragraphs, each encoded as a list of sentences, which are
             in turn encoded as lists of ``(word,tag)`` tuples.
         :rtype: list(list(list(tuple(str,str))))
         """
-        return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 0, *self._cv_args)
+        return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 0, *self._cv_args, target_tagset=tagset)
                        for (f, enc) in self.abspaths(fileids, True)])
 
-    def chunked_words(self, fileids=None):
+    def chunked_words(self, fileids=None, tagset=None):
         """
         :return: the given file(s) as a list of tagged
             words and chunks.  Words are encoded as ``(word, tag)``
@@ -125,10 +124,10 @@ class ChunkedCorpusReader(CorpusReader):
             trees over ``(word,tag)`` tuples or word strings.
         :rtype: list(tuple(str,str) and Tree)
         """
-        return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 1, *self._cv_args)
+        return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 1, *self._cv_args, target_tagset=tagset)
                        for (f, enc) in self.abspaths(fileids, True)])
 
-    def chunked_sents(self, fileids=None):
+    def chunked_sents(self, fileids=None, tagset=None):
         """
         :return: the given file(s) as a list of
             sentences, each encoded as a shallow Tree.  The leaves
@@ -137,10 +136,10 @@ class ChunkedCorpusReader(CorpusReader):
             tags).
         :rtype: list(Tree)
         """
-        return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 1, *self._cv_args)
+        return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 1, *self._cv_args, target_tagset=tagset)
                        for (f, enc) in self.abspaths(fileids, True)])
 
-    def chunked_paras(self, fileids=None):
+    def chunked_paras(self, fileids=None, tagset=None):
         """
         :return: the given file(s) as a list of
             paragraphs, each encoded as a list of sentences, which are
@@ -149,7 +148,7 @@ class ChunkedCorpusReader(CorpusReader):
             has tags) or word strings (if the corpus has no tags).
         :rtype: list(list(Tree))
         """
-        return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 1, *self._cv_args)
+        return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 1, *self._cv_args, target_tagset=tagset)
                        for (f, enc) in self.abspaths(fileids, True)])
 
     def _read_block(self, stream):
@@ -158,7 +157,7 @@ class ChunkedCorpusReader(CorpusReader):
 class ChunkedCorpusView(StreamBackedCorpusView):
     def __init__(self, fileid, encoding, tagged, group_by_sent,
                  group_by_para, chunked, str2chunktree, sent_tokenizer,
-                 para_block_reader):
+                 para_block_reader, source_tagset=None, target_tagset=None):
         StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
         self._tagged = tagged
         self._group_by_sent = group_by_sent
@@ -167,13 +166,16 @@ class ChunkedCorpusView(StreamBackedCorpusView):
         self._str2chunktree = str2chunktree
         self._sent_tokenizer = sent_tokenizer
         self._para_block_reader = para_block_reader
+        self._source_tagset = source_tagset
+        self._target_tagset = target_tagset
 
     def read_block(self, stream):
         block = []
         for para_str in self._para_block_reader(stream):
             para = []
             for sent_str in self._sent_tokenizer.tokenize(para_str):
-                sent = self._str2chunktree(sent_str)
+                sent = self._str2chunktree(sent_str, source_tagset=self._source_tagset,
+                                           target_tagset=self._target_tagset)
 
                 # If requested, throw away the tags.
                 if not self._tagged:
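
Combined with the tagset='wsj' now declared for treebank_chunk in nltk/corpus/__init__.py, the new tagset keyword lets the chunked Treebank sample be read with mapped tags (this assumes the treebank and universal_tagset data packages are installed):

    from nltk.corpus import treebank_chunk

    treebank_chunk.tagged_sents()[0][:3]                     # original WSJ tags
    treebank_chunk.tagged_sents(tagset='universal')[0][:3]   # tags mapped via map_tag()
    treebank_chunk.chunked_sents(tagset='universal')[0]      # same mapping inside the trees
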
diff --git a/nltk/corpus/reader/comparative_sents.py b/nltk/corpus/reader/comparative_sents.py
new file mode 100644
index 0000000..f577cee
--- /dev/null
+++ b/nltk/corpus/reader/comparative_sents.py
@@ -0,0 +1,278 @@
+# Natural Language Toolkit: Comparative Sentence Corpus Reader
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+CorpusReader for the Comparative Sentence Dataset.
+
+- Comparative Sentence Dataset information -
+
+Annotated by: Nitin Jindal and Bing Liu, 2006.
+              Department of Computer Science
+              University of Illinois at Chicago
+
+Contact: Nitin Jindal, njindal at cs.uic.edu
+         Bing Liu, liub at cs.uic.edu (http://www.cs.uic.edu/~liub)
+
+Distributed with permission.
+
+Related papers:
+
+- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
+   Proceedings of the ACM SIGIR International Conference on Information Retrieval
+   (SIGIR-06), 2006.
+
+- Nitin Jindal and Bing Liu. "Mining Comprative Sentences and Relations".
+   Proceedings of Twenty First National Conference on Artificial Intelligence
+   (AAAI-2006), 2006.
+
+- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
+    Proceedings of the 22nd International Conference on Computational Linguistics
+    (Coling-2008), Manchester, 18-22 August, 2008.
+"""
+import re
+
+from nltk.corpus.reader.api import *
+from nltk.tokenize import *
+
+# Regular expressions for dataset components
+STARS = re.compile(r'^\*+$')
+COMPARISON = re.compile(r'<cs-[1234]>')
+CLOSE_COMPARISON = re.compile(r'</cs-[1234]>')
+GRAD_COMPARISON = re.compile(r'<cs-[123]>')
+NON_GRAD_COMPARISON = re.compile(r'<cs-4>')
+ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
+KEYWORD = re.compile(r'\((?!.*\()(.*)\)$')
+
+class Comparison(object):
+    """
+    A Comparison represents a comparative sentence and its constituents.
+    """
+    def __init__(self, text=None, comp_type=None, entity_1=None, entity_2=None,
+                 feature=None, keyword=None):
+        """
+        :param text: a string (optionally tokenized) containing a comparison.
+        :param comp_type: an integer defining the type of comparison expressed.
+            Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
+            4 (Non-gradable).
+        :param entity_1: the first entity considered in the comparison relation.
+        :param entity_2: the second entity considered in the comparison relation.
+        :param feature: the feature considered in the comparison relation.
+        :param keyword: the word or phrase which is used for that comparative relation.
+        """
+        self.text = text
+        self.comp_type = comp_type
+        self.entity_1 = entity_1
+        self.entity_2 = entity_2
+        self.feature = feature
+        self.keyword = keyword
+
+    def __repr__(self):
+        return ("Comparison(text=\"{}\", comp_type={}, entity_1=\"{}\", entity_2=\"{}\", "
+                "feature=\"{}\", keyword=\"{}\")").format(self.text, self.comp_type,
+                self.entity_1, self.entity_2, self.feature, self.keyword)
+
+class ComparativeSentencesCorpusReader(CorpusReader):
+    """
+    Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).
+
+        >>> from nltk.corpus import comparative_sentences
+        >>> comparison = comparative_sentences.comparisons()[0]
+        >>> comparison.text
+        ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
+        'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
+        'had', '.']
+        >>> comparison.entity_2
+        'models'
+        >>> (comparison.feature, comparison.keyword)
+        ('rewind', 'more')
+        >>> len(comparative_sentences.comparisons())
+        853
+    """
+    CorpusView = StreamBackedCorpusView
+
+    def __init__(self, root, fileids, word_tokenizer=WhitespaceTokenizer(),
+                 sent_tokenizer=None, encoding='utf8'):
+        """
+        :param root: The root directory for this corpus.
+        :param fileids: a list or regexp specifying the fileids in this corpus.
+        :param word_tokenizer: tokenizer for breaking sentences or paragraphs
+            into words. Default: `WhitespaceTokenizer`
+        :param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
+        :param encoding: the encoding that should be used to read the corpus.
+        """
+
+        CorpusReader.__init__(self, root, fileids, encoding)
+        self._word_tokenizer = word_tokenizer
+        self._sent_tokenizer = sent_tokenizer
+
+    def comparisons(self, fileids=None):
+        """
+        Return all comparisons in the corpus.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            comparisons have to be returned.
+        :return: the given file(s) as a list of Comparison objects.
+        :rtype: list(Comparison)
+        """
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, compat.string_types):
+            fileids = [fileids]
+        return concat([self.CorpusView(path, self._read_comparison_block, encoding=enc)
+            for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+    def keywords(self, fileids=None):
+        """
+        Return a set of all keywords used in the corpus.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            keywords have to be returned.
+        :return: the set of keywords and comparative phrases used in the corpus.
+        :rtype: set(str)
+        """
+        all_keywords = concat([self.CorpusView(path, self._read_keyword_block, encoding=enc)
+                       for (path, enc, fileid)
+                       in self.abspaths(fileids, True, True)])
+
+        keywords_set = set([keyword.lower() for keyword in all_keywords if keyword])
+        return keywords_set
+
+    def keywords_readme(self):
+        """
+        Return the list of words and constituents considered as clues of a
+        comparison (from listOfkeywords.txt).
+        """
+        keywords = []
+        raw_text = self.open("listOfkeywords.txt").read()
+        for line in raw_text.split("\n"):
+            if not line or line.startswith("//"):
+                continue
+            keywords.append(line.strip())
+        return keywords
+
+    def raw(self, fileids=None):
+        """
+        :param fileids: a list or regexp specifying the fileids that have to be
+            returned as a raw string.
+        :return: the given file(s) as a single string.
+        :rtype: str
+        """
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def readme(self):
+        """
+        Return the contents of the corpus readme file.
+        """
+        return self.open("README.txt").read()
+
+    def sents(self, fileids=None):
+        """
+        Return all sentences in the corpus.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            sentences have to be returned.
+        :return: all sentences of the corpus as lists of tokens (or as plain
+            strings, if no word tokenizer is specified).
+        :rtype: list(list(str)) or list(str)
+        """
+        return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
+            for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+    def words(self, fileids=None):
+        """
+        Return all words and punctuation symbols in the corpus.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            words have to be returned.
+        :return: the given file(s) as a list of words and punctuation symbols.
+        :rtype: list(str)
+        """
+        return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+                       for (path, enc, fileid)
+                       in self.abspaths(fileids, True, True)])
+
+    def _read_comparison_block(self, stream):
+        while True:
+            line = stream.readline()
+            if not line:
+                return [] # end of file.
+            comparison_tags = re.findall(COMPARISON, line)
+            if comparison_tags:
+                grad_comparisons = re.findall(GRAD_COMPARISON, line)
+                non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)
+                # Advance to the next line (it contains the comparative sentence)
+                comparison_text = stream.readline().strip()
+                if self._word_tokenizer:
+                    comparison_text = self._word_tokenizer.tokenize(comparison_text)
+                # Skip the next line (it contains closing comparison tags)
+                stream.readline()
+                # If gradable comparisons are found, create Comparison instances
+                # and populate their fields
+                comparison_bundle = []
+                if grad_comparisons:
+                    # Each comparison tag has its own relations on a separate line
+                    for comp in grad_comparisons:
+                        comp_type = int(re.match(r'<cs-(\d)>', comp).group(1))
+                        comparison = Comparison(text=comparison_text, comp_type=comp_type)
+                        line = stream.readline()
+                        entities_feats = ENTITIES_FEATS.findall(line)
+                        if entities_feats:
+                            for (code, entity_feat) in entities_feats:
+                                if code == '1':
+                                    comparison.entity_1 = entity_feat.strip()
+                                elif code == '2':
+                                    comparison.entity_2 = entity_feat.strip()
+                                elif code == '3':
+                                    comparison.feature = entity_feat.strip()
+                        keyword = KEYWORD.findall(line)
+                        if keyword:
+                            comparison.keyword = keyword[0]
+                        comparison_bundle.append(comparison)
+                # If non-gradable comparisons are found, create a simple Comparison
+                # instance for each one
+                if non_grad_comparisons:
+                    for comp in non_grad_comparisons:
+                        # comp_type in this case should always be 4.
+                        comp_type = int(re.match(r'<cs-(\d)>', comp).group(1))
+                        comparison = Comparison(text=comparison_text, comp_type=comp_type)
+                        comparison_bundle.append(comparison)
+                # Flatten the list of comparisons before returning them
+                # return concat([comparison_bundle])
+                return comparison_bundle
+
+    def _read_keyword_block(self, stream):
+        keywords = []
+        for comparison in self._read_comparison_block(stream):
+            keywords.append(comparison.keyword)
+        return keywords
+
+    def _read_sent_block(self, stream):
+        while True:
+            line = stream.readline()
+            if re.match(STARS, line):
+                while True:
+                    line = stream.readline()
+                    if re.match(STARS, line):
+                        break
+                continue
+            if not re.findall(COMPARISON, line) and not ENTITIES_FEATS.findall(line) \
+            and not re.findall(CLOSE_COMPARISON, line):
+                if self._sent_tokenizer:
+                    return [self._word_tokenizer.tokenize(sent)
+                        for sent in self._sent_tokenizer.tokenize(line)]
+                else:
+                    return [self._word_tokenizer.tokenize(line)]
+
+    def _read_word_block(self, stream):
+        words = []
+        for sent in self._read_sent_block(stream):
+            words.extend(sent)
+        return words
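
A short usage sketch for the new reader (it requires the comparative_sentences data package; the comp_type codes follow the docstring above: 1 non-equal gradable, 2 equative, 3 superlative, 4 non-gradable):

    from nltk.corpus import comparative_sentences

    comparisons = comparative_sentences.comparisons()
    superlatives = [c for c in comparisons if c.comp_type == 3]
    len(superlatives)
    sorted(comparative_sentences.keywords())[:10]
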
diff --git a/nltk/corpus/reader/mte.py b/nltk/corpus/reader/mte.py
new file mode 100644
index 0000000..28d5767
--- /dev/null
+++ b/nltk/corpus/reader/mte.py
@@ -0,0 +1,300 @@
+"""
+A reader for corpora whose documents are in MTE format.
+"""
+import os
+from functools import reduce
+from nltk import compat
+from nltk.corpus.reader import concat, TaggedCorpusReader
+
+lxmlAvailable = False
+try:
+    from lxml import etree
+    lxmlAvailable = True
+except ImportError:
+    # first try the C implementation of ElementTree
+    try:
+        import xml.etree.cElementTree as etree
+    except ImportError:
+        import xml.etree.ElementTree as etree
+import re
+
+def xpath(root, path, ns):
+    if lxmlAvailable:
+        return root.xpath(path, namespaces=ns)
+    else:
+        return root.findall(path, ns)
+
+
+class MTEFileReader:
+    """
+    Class for loading the content of the multext-east corpus. It
+    parses the xml files and does some tag-filtering depending on the
+    given method parameters.
+    """
+    ns = {'tei': 'http://www.tei-c.org/ns/1.0', 'xml': 'http://www.w3.org/XML/1998/namespace'}
+    tag_ns = '{http://www.tei-c.org/ns/1.0}'
+    xml_ns = '{http://www.w3.org/XML/1998/namespace}'
+
+    def __init__(self, file_path):
+        tree = etree.parse(file_path)
+        self.__root = xpath(tree.getroot(), './tei:text/tei:body', self.ns)[0]
+
+    @classmethod
+    def _words(self, text_root):
+        return [w.text for w in xpath(text_root, './/*', self.ns) if
+                w.tag == self.tag_ns + "w" or w.tag == self.tag_ns + "c"]
+
+    @classmethod
+    def _sents(self, text_root):
+        return [MTEFileReader._words(s) for s in xpath(text_root, './/tei:s', self.ns)]
+
+    @classmethod
+    def _paras(self, text_root):
+        return [MTEFileReader._sents(p) for p in xpath(text_root, './/tei:p', self.ns)]
+
+    @classmethod
+    def _lemma_words(self, text_root):
+        return [(w.text, w.attrib['lemma']) for w in xpath(text_root, './/tei:w', self.ns)]
+
+    @classmethod
+    def _tagged_words(self, text_root, tags=""):
+        if tags is None or tags == "":
+            return [(w.text, w.attrib['ana']) for w in xpath(text_root, './/tei:w', self.ns)]
+
+        else:
+            tags = re.compile('^' + re.sub("-",".",tags) + '.*$')
+            return [(w.text, w.attrib['ana']) for w in xpath(text_root, './/tei:w', self.ns)
+                                              if tags.match(w.attrib['ana'])]
+
+    @classmethod
+    def _lemma_sents(self, text_root):
+        return [MTEFileReader._lemma_words(s) for s in xpath(text_root, './/tei:s', self.ns)]
+
+    @classmethod
+    def _tagged_sents(self, text_root, tags=""):
+        # double list comprehension to remove empty sentences in case there is a sentence only containing punctuation marks
+        return [t for t in [MTEFileReader._tagged_words(s, tags) for s in xpath(text_root, './/tei:s', self.ns)] if len(t) > 0]
+
+    @classmethod
+    def _lemma_paras(self, text_root):
+        return [MTEFileReader._lemma_sents(p) for p in xpath(text_root, './/tei:p', self.ns)]
+
+    @classmethod
+    def _tagged_paras(self, text_root, tags=""):
+        return [t for t in [MTEFileReader._tagged_sents(p, tags) for p in xpath(text_root, './/tei:p', self.ns)] if len(t) > 0]
+
+
+    def words(self):
+        return MTEFileReader._words(self.__root)
+
+    def sents(self):
+        return MTEFileReader._sents(self.__root)
+
+    def paras(self):
+        return MTEFileReader._paras(self.__root)
+
+    def lemma_words(self):
+        return MTEFileReader._lemma_words(self.__root)
+
+    def tagged_words(self, tags=""):
+        return MTEFileReader._tagged_words(self.__root, tags)
+
+    def lemma_sents(self):
+        return MTEFileReader._lemma_sents(self.__root)
+
+    def tagged_sents(self, tags=""):
+        return MTEFileReader._tagged_sents(self.__root)
+
+    def lemma_paras(self):
+        return MTEFileReader._lemma_paras(self.__root)
+
+    def tagged_paras(self, tags=""):
+        return MTEFileReader._tagged_paras(self.__root)
+
+class MTETagConverter:
+    """
+    Class for converting msd tags to universal tags, more conversion
+    options are currently not implemented.
+    """
+
+    mapping_msd_universal = {
+        'A': 'ADJ', 'S': 'ADP', 'R': 'ADV', 'C': 'CONJ',
+        'D': 'DET', 'N': 'NOUN', 'M': 'NUM', 'Q': 'PRT',
+        'P': 'PRON', 'V': 'VERB', '.': '.', '-': 'X'}
+
+    @staticmethod
+    def msd_to_universal(tag):
+        """
+        This function converts an annotation from the MULTEXT-East (MSD) tagset to the
+        universal tagset as described in Chapter 5 of the NLTK book.
+
+        Unknown tags are mapped to 'X'. Punctuation marks are not covered by the MSD
+        tagset, so they are handled via the dedicated '.' entry of the mapping table.
+        """
+        indicator = tag[0] if tag[0] != "#" else tag[1]
+
+        if indicator not in MTETagConverter.mapping_msd_universal:
+            indicator = '-'
+
+        return MTETagConverter.mapping_msd_universal[indicator]
+
+class MTECorpusReader(TaggedCorpusReader):
+    """
+    Reader for corpora following the TEI-P5 XML scheme, such as MULTEXT-East.
+    MULTEXT-East contains part-of-speech-tagged words using the fine-grained MSD
+    tagging scheme; these tags can be converted to the Universal tagset.
+    """
+
+    def __init__(self, root=None, fileids=None, encoding='utf8'):
+        """
+        Construct a new MTECorpusReader for a set of documents
+        located at the given root directory.  Example usage:
+
+            >>> root = '/...path to corpus.../'
+            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP
+
+        :param root: The root directory for this corpus. (default points to location in multext config file)
+        :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
+        :param encoding: The encoding of the given files (default is utf8)
+        """
+        TaggedCorpusReader.__init__(self, root, fileids, encoding)
+
+    def __fileids(self, fileids):
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, compat.string_types): fileids = [fileids]
+        # use list comprehensions (not filter objects) so the emptiness check below works on Python 3
+        # filter out invalid user input
+        fileids = [f for f in fileids if f in self._fileids]
+        # filter out MULTEXT-East source files that are not compatible with the TEI-P5 specification
+        fileids = [f for f in fileids if f not in ["oana-bg.xml", "oana-mk.xml"]]
+        if not fileids:
+            print("No valid multext-east file specified")
+        return fileids
+
+    def readme(self):
+        """
+        Return the contents of the README file attached to this corpus.
+
+        :return: the content of the attached README file
+        :rtype: str
+        """
+        return self.open("00README.txt").read()
+
+    def raw(self, fileids=None):
+        """
+	    :param fileids: A list specifying the fileids that should be used.
+        :return: the given file(s) as a single string.
+        :rtype: str
+        """
+        return concat([self.open(f).read() for f in self.__fileids(fileids)])
+
+    def words(self, fileids=None):
+        """
+	    :param fileids: A list specifying the fileids that should be used.
+        :return: the given file(s) as a list of words and punctuation symbols.
+        :rtype: list(str)
+        """
+        return  reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).words() for f in self.__fileids(fileids)], [])
+
+    def sents(self, fileids=None):
+        """
+	    :param fileids: A list specifying the fileids that should be used.
+        :return: the given file(s) as a list of sentences or utterances,
+                 each encoded as a list of word strings
+        :rtype: list(list(str))
+        """
+        return  reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).sents() for f in self.__fileids(fileids)], [])
+
+    def paras(self, fileids=None):
+        """
+	    :param fileids: A list specifying the fileids that should be used.
+        :return: the given file(s) as a list of paragraphs, each encoded as a list
+                 of sentences, which are in turn encoded as lists of word string
+        :rtype: list(list(list(str)))
+        """
+        return  reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).paras() for f in self.__fileids(fileids)], [])
+
+    def lemma_words(self, fileids=None):
+        """
+	    :param fileids: A list specifying the fileids that should be used.
+        :return: the given file(s) as a list of words, the corresponding lemmas
+                 and punctuation symbols, encoded as tuples (word, lemma)
+        :rtype: list(tuple(str,str))
+        """
+        return  reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).lemma_words() for f in self.__fileids(fileids)], [])
+
+    def tagged_words(self, fileids=None, tagset="msd", tags=None):
+        """
+	    :param fileids: A list specifying the fileids that should be used.
+        :param tagset: The tagset that should be used in the returned object,
+                       either "universal" or "msd", "msd" is the default
+        :param tags: An MSD Tag that is used to filter all parts of the used corpus
+                     that are not more precise or at least equal to the given tag
+        :return: the given file(s) as a list of tagged words and punctuation symbols
+                 encoded as tuples (word, tag)
+        :rtype: list(tuple(str, str))
+        """
+        words = reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).tagged_words(tags=tags) for f in self.__fileids(fileids)], [])
+        if tagset == "universal":
+            return map(lambda wt : (wt[0], MTETagConverter.msd_to_universal(wt[1])), words)
+        elif tagset == "msd":
+            return words
+        else:
+            print("Unknown tagset specified.")
+
+    def lemma_sents(self, fileids=None):
+        """
+	    :param fileids: A list specifying the fileids that should be used.
+        :return: the given file(s) as a list of sentences or utterances, each
+                 encoded as a list of tuples of the word and the corresponding
+                 lemma (word, lemma)
+        :rtype: list(list(tuple(str, str)))
+        """
+        return  reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).lemma_sents() for f in self.__fileids(fileids)], [])
+
+
+    def tagged_sents(self, fileids=None, tagset="msd", tags=None):
+        """
+	    :param fileids: A list specifying the fileids that should be used.
+        :param tagset: The tagset that should be used in the returned object,
+                       either "universal" or "msd", "msd" is the default
+        :param tags: An MSD Tag that is used to filter all parts of the used corpus
+                     that are not more precise or at least equal to the given tag
+        :return: the given file(s) as a list of sentences or utterances, each
+                 each encoded as a list of (word,tag) tuples
+        :rtype: list(list(tuple(str, str)))
+        """
+        sents = reduce(lambda a, b : a + b, [MTEFileReader(os.path.join(self._root, f)).tagged_sents(tags=tags) for f in self.__fileids(fileids)], [])
+        if tagset == "universal":
+            return map(lambda s : map (lambda wt : (wt[0], MTETagConverter.msd_to_universal(wt[1])), s), sents)
+        elif tagset == "msd":
+            return sents
+        else:
+            print("Unknown tagset specified.")
+
+    def lemma_paras(self, fileids=None):
+        """
+	    :param fileids: A list specifying the fileids that should be used.
+        :return: the given file(s) as a list of paragraphs, each encoded as a
+                 list of sentences, which are in turn encoded as a list of
+                 tuples of the word and the corresponding lemma (word, lemma)
+        :rtype: list(List(List(tuple(str, str))))
+        """
+        return reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).lemma_paras() for f in self.__fileids(fileids)], [])
+
+    def tagged_paras(self, fileids=None, tagset="msd", tags=None):
+        """
+	    :param fileids: A list specifying the fileids that should be used.
+        :param tagset: The tagset that should be used in the returned object,
+                       either "universal" or "msd", "msd" is the default
+        :param tags: An MSD Tag that is used to filter all parts of the used corpus
+                     that are not more precise or at least equal to the given tag
+        :return: the given file(s) as a list of paragraphs, each encoded as a
+                 list of sentences, which are in turn encoded as a list
+                 of (word,tag) tuples
+        :rtype: list(list(list(tuple(str, str))))
+        """
+        paras = reduce(lambda a, b : a + b, [MTEFileReader(os.path.join(self._root, f)).tagged_paras(tags=tags) for f in self.__fileids(fileids)], [])
+        if tagset == "universal":
+            return map(lambda p : map(lambda s : map (lambda wt : (wt[0], MTETagConverter.msd_to_universal(wt[0])), s), p), paras)
+        elif tagset == "msd":
+            return paras
+        else:
+            print("Unknown tagset specified.")
diff --git a/nltk/corpus/reader/opinion_lexicon.py b/nltk/corpus/reader/opinion_lexicon.py
new file mode 100644
index 0000000..baaf096
--- /dev/null
+++ b/nltk/corpus/reader/opinion_lexicon.py
@@ -0,0 +1,115 @@
+# Natural Language Toolkit: Opinion Lexicon Corpus Reader
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+CorpusReader for the Opinion Lexicon.
+
+- Opinion Lexicon information -
+Authors: Minqing Hu and Bing Liu, 2004.
+    Department of Computer Science
+    University of Illinois at Chicago
+
+Contact: Bing Liu, liub at cs.uic.edu
+        http://www.cs.uic.edu/~liub
+
+Distributed with permission.
+
+Related papers:
+- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
+    Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery
+    & Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA.
+
+- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and
+    Comparing Opinions on the Web". Proceedings of the 14th International World
+    Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.
+"""
+
+from nltk.compat import string_types
+from nltk.corpus.reader import WordListCorpusReader
+from nltk.corpus.reader.api import *
+
+class IgnoreReadmeCorpusView(StreamBackedCorpusView):
+    """
+    This CorpusView is used to skip the initial readme block of the corpus.
+    """
+    def __init__(self, *args, **kwargs):
+        StreamBackedCorpusView.__init__(self, *args, **kwargs)
+        # open self._stream
+        self._open()
+        # skip the readme block
+        read_blankline_block(self._stream)
+        # Set the initial position to the current stream position
+        self._filepos = [self._stream.tell()]
+
+
+class OpinionLexiconCorpusReader(WordListCorpusReader):
+    """
+    Reader for Liu and Hu opinion lexicon.  Blank lines and readme are ignored.
+
+        >>> from nltk.corpus import opinion_lexicon
+        >>> opinion_lexicon.words()
+        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]
+
+    The OpinionLexiconCorpusReader provides shortcuts to retrieve positive/negative
+    words:
+
+        >>> opinion_lexicon.negative()
+        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]
+
+    Note that words returned by the `words()` method are sorted by fileid, not alphabetically:
+
+        >>> opinion_lexicon.words()[0:10]
+        ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably',
+        'abominate', 'abomination', 'abort', 'aborted']
+        >>> sorted(opinion_lexicon.words())[0:10]
+        ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably',
+        'abominate', 'abomination', 'abort']
+    """
+
+    CorpusView = IgnoreReadmeCorpusView
+
+    def words(self, fileids=None):
+        """
+        Return all words in the opinion lexicon. Note that these words are not
+        sorted in alphabetical order.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            words have to be returned.
+        :return: the given file(s) as a list of words and punctuation symbols.
+        :rtype: list(str)
+        """
+        if fileids is None: fileids = self._fileids
+        elif isinstance(fileids, string_types): fileids = [fileids]
+        return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+            for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+    def positive(self):
+        """
+        Return all positive words in alphabetical order.
+
+        :return: a list of positive words.
+        :rtype: list(str)
+        """
+        return self.words('positive-words.txt')
+
+    def negative(self):
+        """
+        Return all negative words in alphabetical order.
+
+        :return: a list of negative words.
+        :rtype: list(str)
+        """
+        return self.words('negative-words.txt')
+
+    def _read_word_block(self, stream):
+        words = []
+        for i in range(20): # Read 20 lines at a time.
+            line = stream.readline()
+            if not line:
+                continue
+            words.append(line.strip())
+        return words
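
As a small sketch of how the lexicon reader above might be used, here is a naive word-counting polarity score. The `opinion_lexicon` loader name follows the doctest above; the scoring helper itself is illustrative and not part of NLTK, and it assumes the lexicon data and the punkt tokenizer models are installed.

    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import word_tokenize

    # Build lookup sets once; positive()/negative() return plain word lists.
    POSITIVE = set(opinion_lexicon.positive())
    NEGATIVE = set(opinion_lexicon.negative())

    def naive_polarity(text):
        """Count positive minus negative lexicon hits in a tokenized text."""
        tokens = [t.lower() for t in word_tokenize(text)]
        return sum(t in POSITIVE for t in tokens) - sum(t in NEGATIVE for t in tokens)

    print(naive_polarity("The zoom is excellent but the battery life is awful ."))
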
diff --git a/nltk/corpus/reader/pros_cons.py b/nltk/corpus/reader/pros_cons.py
new file mode 100644
index 0000000..5f5e2c6
--- /dev/null
+++ b/nltk/corpus/reader/pros_cons.py
@@ -0,0 +1,126 @@
+# Natural Language Toolkit: Pros and Cons Corpus Reader
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+CorpusReader for the Pros and Cons dataset.
+
+- Pros and Cons dataset information -
+
+Contact: Bing Liu, liub at cs.uic.edu
+        http://www.cs.uic.edu/~liub
+
+Distributed with permission.
+
+Related papers:
+
+- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
+    Proceedings of the 22nd International Conference on Computational Linguistics
+    (Coling-2008), Manchester, 18-22 August, 2008.
+
+- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
+    Opinions on the Web". Proceedings of the 14th international World Wide Web
+    conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
+"""
+import re
+
+from nltk.corpus.reader.api import *
+from nltk.tokenize import *
+
+
+class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
+    """
+    Reader for the Pros and Cons sentence dataset.
+
+        >>> from nltk.corpus import pros_cons
+        >>> pros_cons.sents(categories='Cons')
+        [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
+        'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
+        ...]
+        >>> pros_cons.words('IntegratedPros.txt')
+        ['Easy', 'to', 'use', ',', 'economical', '!', ...]
+    """
+    CorpusView = StreamBackedCorpusView
+
+    def __init__(self, root, fileids, word_tokenizer=WordPunctTokenizer(),
+                 encoding='utf8', **kwargs):
+        """
+        :param root: The root directory for the corpus.
+        :param fileids: a list or regexp specifying the fileids in the corpus.
+        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
+            into words. Default: `WordPunctTokenizer`
+        :param encoding: the encoding that should be used to read the corpus.
+        :param kwargs: additional parameters passed to CategorizedCorpusReader.
+        """
+
+        CorpusReader.__init__(self, root, fileids, encoding)
+        CategorizedCorpusReader.__init__(self, kwargs)
+        self._word_tokenizer = word_tokenizer
+
+    def sents(self, fileids=None, categories=None):
+        """
+        Return all sentences in the corpus or in the specified files/categories.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            sentences have to be returned.
+        :param categories: a list specifying the categories whose sentences
+            have to be returned.
+        :return: the given file(s) as a list of sentences. Each sentence is
+            tokenized using the specified word_tokenizer.
+        :rtype: list(list(str))
+        """
+        fileids = self._resolve(fileids, categories)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, compat.string_types):
+            fileids = [fileids]
+        return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
+            for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+    def words(self, fileids=None, categories=None):
+        """
+        Return all words and punctuation symbols in the corpus or in the specified
+        files/categories.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            words have to be returned.
+        :param categories: a list specifying the categories whose words have
+            to be returned.
+        :return: the given file(s) as a list of words and punctuation symbols.
+        :rtype: list(str)
+        """
+        fileids = self._resolve(fileids, categories)
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, compat.string_types):
+            fileids = [fileids]
+        return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+            for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+    def _read_sent_block(self, stream):
+        sents = []
+        for i in range(20): # Read 20 lines at a time.
+            line = stream.readline()
+            if not line:
+                continue
+            sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>", line)
+            if sent:
+                sents.append(self._word_tokenizer.tokenize(sent.group(2).strip()))
+        return sents
+
+    def _read_word_block(self, stream):
+        words = []
+        for sent in self._read_sent_block(stream):
+            words.extend(sent)
+        return words
+
+    def _resolve(self, fileids, categories):
+        if fileids is not None and categories is not None:
+            raise ValueError('Specify fileids or categories, not both')
+        if categories is not None:
+            return self.fileids(categories)
+        else:
+            return fileids
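
A short usage sketch for the Pros/Cons reader above. The category names and the 'IntegratedCons.txt' fileid are assumed to match the standard Pros and Cons distribution (the doctest above shows 'IntegratedPros.txt' and the 'Cons' category).

    from nltk.corpus import pros_cons

    # Sentences grouped by category; each sentence is a list of tokens.
    pros_sents = pros_cons.sents(categories='Pros')
    cons_sents = pros_cons.sents(categories='Cons')
    print(len(pros_sents), len(cons_sents))

    # Words from a single file.
    print(pros_cons.words('IntegratedCons.txt')[:10])
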
diff --git a/nltk/corpus/reader/reviews.py b/nltk/corpus/reader/reviews.py
new file mode 100644
index 0000000..accc294
--- /dev/null
+++ b/nltk/corpus/reader/reviews.py
@@ -0,0 +1,324 @@
+# Natural Language Toolkit: Product Reviews Corpus Reader
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+CorpusReader for reviews corpora (syntax based on Customer Review Corpus).
+
+- Customer Review Corpus information -
+Annotated by: Minqing Hu and Bing Liu, 2004.
+    Department of Computer Science
+    University of Illinois at Chicago
+
+Contact: Bing Liu, liub at cs.uic.edu
+        http://www.cs.uic.edu/~liub
+
+Distributed with permission.
+
+The "product_reviews_1" and "product_reviews_2" datasets respectively contain
+annotated customer reviews of 5 and 9 products from amazon.com.
+
+Related papers:
+
+- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
+    Proceedings of the ACM SIGKDD International Conference on Knowledge
+    Discovery & Data Mining (KDD-04), 2004.
+
+- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
+    Proceedings of the Nineteenth National Conference on Artificial Intelligence
+    (AAAI-2004), 2004.
+
+- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Approach to
+    Opinion Mining." Proceedings of First ACM International Conference on Web
+    Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
+    Stanford, California, USA.
+
+Symbols used in the annotated reviews:
+
+    [t] : the title of the review: Each [t] tag starts a review.
+    xxxx[+|-n]: xxxx is a product feature.
+    [+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
+          Note that the strength is quite subjective.
+          You may want to ignore it and only consider + and -.
+    [-n]: Negative opinion
+    ##  : start of each sentence. Each line is a sentence.
+    [u] : feature does not appear in the sentence.
+    [p] : feature does not appear in the sentence; pronoun resolution is needed.
+    [s] : suggestion or recommendation.
+    [cc]: comparison with a competing product from a different brand.
+    [cs]: comparison with a competing product from the same brand.
+
+Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
+    provide separation between different reviews. This is due to the fact that
+    the dataset was specifically designed for aspect/feature-based sentiment
+    analysis, for which sentence-level annotation is sufficient. For document-
+    level classification and analysis, this peculiarity should be taken into
+    consideration.
+"""
+import re
+
+from nltk.corpus.reader.api import *
+from nltk.tokenize import *
+
+TITLE = re.compile(r'^\[t\](.*)$') # [t] Title
+FEATURES = re.compile(r'((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]') # find 'feature' in feature[+3]
+NOTES = re.compile(r'\[(?!t)(p|u|s|cc|cs)\]') # find 'p' in camera[+2][p]
+SENT = re.compile(r'##(.*)$') # find tokenized sentence
+
+
+@compat.python_2_unicode_compatible
+class Review(object):
+    """
+    A Review is the main block of a ReviewsCorpusReader.
+    """
+    def __init__(self, title=None, review_lines=None):
+        """
+        :param title: the title of the review.
+        :param review_lines: the list of the ReviewLines that belong to the Review.
+        """
+        self.title = title
+        if review_lines is None:
+            self.review_lines = []
+        else:
+            self.review_lines = review_lines
+
+    def add_line(self, review_line):
+        """
+        Add a line (ReviewLine) to the review.
+
+        :param review_line: a ReviewLine instance that belongs to the Review.
+        """
+        assert isinstance(review_line, ReviewLine)
+        self.review_lines.append(review_line)
+
+    def features(self):
+        """
+        Return a list of features in the review. Each feature is a tuple made of
+        the specific item feature and the opinion strength about that feature.
+
+        :return: all features of the review as a list of tuples (feat, score).
+        :rtype: list(tuple)
+        """
+        features = []
+        for review_line in self.review_lines:
+            features.extend(review_line.features)
+        return features
+
+    def sents(self):
+        """
+        Return all tokenized sentences in the review.
+
+        :return: all sentences of the review as lists of tokens.
+        :rtype: list(list(str))
+        """
+        return [review_line.sent for review_line in self.review_lines]
+
+    def __repr__(self):
+        return 'Review(title=\"{}\", review_lines={})'.format(self.title, self.review_lines)
+
+
+@compat.python_2_unicode_compatible
+class ReviewLine(object):
+    """
+    A ReviewLine represents a sentence of the review, together with (optional)
+    annotations of its features and notes about the reviewed item.
+    """
+    def __init__(self, sent, features=None, notes=None):
+        self.sent = sent
+        if features is None:
+            self.features = []
+        else:
+            self.features = features
+
+        if notes is None:
+            self.notes = []
+        else:
+            self.notes = notes
+
+    def __repr__(self):
+        return ('ReviewLine(features={}, notes={}, sent={})'.format(
+            self.features, self.notes, self.sent))
+
+
+class ReviewsCorpusReader(CorpusReader):
+    """
+    Reader for the Customer Review Data dataset by Hu, Liu (2004).
+    Note: we are not applying any sentence tokenization at the moment, just word
+    tokenization.
+
+        >>> from nltk.corpus import product_reviews_1
+        >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
+        >>> review = camera_reviews[0]
+        >>> review.sents()[0]
+        ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
+        'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
+        >>> review.features()
+        [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
+        ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
+        ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
+        ('option', '+1')]
+
+    We can also reach the same information directly from the stream:
+
+        >>> product_reviews_1.features('Canon_G3.txt')
+        [('canon powershot g3', '+3'), ('use', '+2'), ...]
+
+    We can compute stats for specific product features:
+
+        >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
+        >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
+        >>> # We use float for backward compatibility with division in Python2.7
+        >>> mean = float(tot)/n_reviews
+        >>> print(n_reviews, tot, mean)
+        15 24 1.6
+    """
+    CorpusView = StreamBackedCorpusView
+
+    def __init__(self, root, fileids, word_tokenizer=WordPunctTokenizer(),
+                 encoding='utf8'):
+        """
+        :param root: The root directory for the corpus.
+        :param fileids: a list or regexp specifying the fileids in the corpus.
+        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
+            into words. Default: `WordPunctTokenizer`
+        :param encoding: the encoding that should be used to read the corpus.
+        """
+
+        CorpusReader.__init__(self, root, fileids, encoding)
+        self._word_tokenizer = word_tokenizer
+
+    def features(self, fileids=None):
+        """
+        Return a list of features. Each feature is a tuple made of the specific
+        item feature and the opinion strength about that feature.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            features have to be returned.
+        :return: all features for the item(s) in the given file(s).
+        :rtype: list(tuple)
+        """
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.CorpusView(fileid, self._read_features, encoding=enc)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def raw(self, fileids=None):
+        """
+        :param fileids: a list or regexp specifying the fileids of the files that
+            have to be returned as a raw string.
+        :return: the given file(s) as a single string.
+        :rtype: str
+        """
+        if fileids is None:
+            fileids = self._fileids
+        elif isinstance(fileids, string_types):
+            fileids = [fileids]
+        return concat([self.open(f).read() for f in fileids])
+
+    def readme(self):
+        """
+        Return the contents of the corpus README.txt file.
+        """
+        return self.open("README.txt").read()
+
+    def reviews(self, fileids=None):
+        """
+        Return all the reviews as a list of Review objects. If `fileids` is
+        specified, return all the reviews from each of the specified files.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            reviews have to be returned.
+        :return: the given file(s) as a list of reviews.
+        """
+        if fileids is None:
+            fileids = self._fileids
+        return concat([self.CorpusView(fileid, self._read_review_block, encoding=enc)
+                       for (fileid, enc) in self.abspaths(fileids, True)])
+
+    def sents(self, fileids=None):
+        """
+        Return all sentences in the corpus or in the specified files.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            sentences have to be returned.
+        :return: the given file(s) as a list of sentences, each encoded as a
+            list of word strings.
+        :rtype: list(list(str))
+        """
+        return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
+                       for (path, enc, fileid)
+                       in self.abspaths(fileids, True, True)])
+
+    def words(self, fileids=None):
+        """
+        Return all words and punctuation symbols in the corpus or in the specified
+        files.
+
+        :param fileids: a list or regexp specifying the ids of the files whose
+            words have to be returned.
+        :return: the given file(s) as a list of words and punctuation symbols.
+        :rtype: list(str)
+        """
+        return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
+                       for (path, enc, fileid)
+                       in self.abspaths(fileids, True, True)])
+
+    def _read_features(self, stream):
+        features = []
+        for i in range(20):
+            line = stream.readline()
+            if not line:
+                return features
+            features.extend(re.findall(FEATURES, line))
+        return features
+
+    def _read_review_block(self, stream):
+        while True:
+            line = stream.readline()
+            if not line:
+                return [] # end of file.
+            title_match = re.match(TITLE, line)
+            if title_match:
+                review = Review(title=title_match.group(1).strip()) # We create a new review
+                break
+
+        # Scan until we find another line matching the regexp, or EOF.
+        while True:
+            oldpos = stream.tell()
+            line = stream.readline()
+            # End of file:
+            if not line:
+                return [review]
+            # Start of a new review: backup to just before it starts, and
+            # return the review we've already collected.
+            if re.match(TITLE, line):
+                stream.seek(oldpos)
+                return [review]
+            # Anything else is part of the review line.
+            feats = re.findall(FEATURES, line)
+            notes = re.findall(NOTES, line)
+            sent = re.findall(SENT, line)
+            if sent:
+                sent = self._word_tokenizer.tokenize(sent[0])
+            review_line = ReviewLine(sent=sent, features=feats, notes=notes)
+            review.add_line(review_line)
+
+    def _read_sent_block(self, stream):
+        sents = []
+        for review in self._read_review_block(stream):
+            sents.extend([sent for sent in review.sents()])
+        return sents
+
+    def _read_word_block(self, stream):
+        words = []
+        for i in range(20): # Read 20 lines at a time.
+            line = stream.readline()
+            sent = re.findall(SENT, line)
+            if sent:
+                words.extend(self._word_tokenizer.tokenize(sent[0]))
+        return words
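
To illustrate the Review/ReviewLine API above, a sketch that averages the annotated opinion strength per feature in one file. The fileid follows the doctest above; the aggregation helper is illustrative and not part of NLTK.

    from collections import defaultdict
    from nltk.corpus import product_reviews_1

    def mean_feature_scores(fileid):
        """Average opinion strength per annotated feature in one review file."""
        scores = defaultdict(list)
        for feat, score in product_reviews_1.features(fileid):
            scores[feat].append(int(score))   # scores are strings like '+2' or '-1'
        return dict((feat, sum(vals) / float(len(vals))) for feat, vals in scores.items())

    means = mean_feature_scores('Canon_G3.txt')
    print(sorted(means.items(), key=lambda kv: kv[1], reverse=True)[:5])
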
diff --git a/nltk/corpus/reader/wordnet.py b/nltk/corpus/reader/wordnet.py
index b4ee146..7fec8d9 100644
--- a/nltk/corpus/reader/wordnet.py
+++ b/nltk/corpus/reader/wordnet.py
@@ -6,6 +6,10 @@
 #         Steven Bird <stevenbird1 at gmail.com>
 #         Edward Loper <edloper at gmail.com>
 #         Nitin Madnani <nmadnani at ets.org>
+#         Nasruddin A’aidil Shari
+#         Sim Wei Ying Geraldine
+#         Soe Lynn
+#         Francis Bond <bond at ieee.org>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
@@ -18,6 +22,11 @@ such as hypernyms, hyponyms, synonyms, antonyms etc.
 
 For details about WordNet see:
 http://wordnet.princeton.edu/
+
+This module also allows you to find lemmas in languages 
+other than English from the Open Multilingual Wordnet
+http://compling.hss.ntu.edu.sg/omw/
+
 """
 
 from __future__ import print_function, unicode_literals
@@ -252,7 +261,7 @@ class Lemma(_WordNetObject):
         self._frame_ids = []
         self._lexname_index = lexname_index
         self._lex_id = lex_id
-        self._lang = "en"
+        self._lang = 'eng'
 
         self._key = None # gets set later.
 
@@ -410,21 +419,22 @@ class Synset(_WordNetObject):
         elif self._pos == VERB:
             return True
 
-    def lemma_names(self, lang='en'):
+    def lemma_names(self, lang='eng'):
         '''Return all the lemma_names associated with the synset'''
-        if lang=='en':
+        if lang=='eng':
             return self._lemma_names
         else:
             self._wordnet_corpus_reader._load_lang_data(lang)
 
             i = self._wordnet_corpus_reader.ss2of(self)
-            for x in self._wordnet_corpus_reader._lang_data[lang][0].keys():
-                if x == i:
-                    return self._wordnet_corpus_reader._lang_data[lang][0][x]
+            if i in self._wordnet_corpus_reader._lang_data[lang][0]:
+                return self._wordnet_corpus_reader._lang_data[lang][0][i]
+            else:
+                return []
                 
-    def lemmas(self, lang='en'):
+    def lemmas(self, lang='eng'):
         '''Return all the lemma objects associated with the synset'''
-        if lang=='en':
+        if lang=='eng':
             return self._lemmas
         else:
             self._wordnet_corpus_reader._load_lang_data(lang)
@@ -1055,8 +1065,8 @@ class WordNetCorpusReader(CorpusReader):
         return self._synset_from_pos_and_offset(of[-1], int(of[:8]))      
 
     def ss2of(self, ss):
-        ''' return the ILI of the synset '''
-        return ( "0"*8 + str(ss.offset()) +"-"+ str(ss.pos()))[-10:]
+        ''' return the ID of the synset '''
+        return ("{:08d}-{}".format(ss.offset(), ss.pos()))
     
     def _load_lang_data(self, lang):
         ''' load the wordnet data of the requested language from the file to the cache, _lang_data '''
@@ -1084,7 +1094,7 @@ class WordNetCorpusReader(CorpusReader):
     def langs(self):
         ''' return a list of languages supported by Multilingual Wordnet '''
         import os
-        langs = []
+        langs = [ 'eng' ]
         fileids = self._omw_reader.fileids()
         for fileid in fileids:
             file_name, file_extension = os.path.splitext(fileid)
@@ -1175,7 +1185,7 @@ class WordNetCorpusReader(CorpusReader):
     # Loading Lemmas
     #////////////////////////////////////////////////////////////
 
-    def lemma(self, name, lang='en'):
+    def lemma(self, name, lang='eng'):
         '''Return lemma object that matches the name'''
         # cannot simply split on first '.', e.g.: '.45_caliber.a.01..45_caliber'
         separator = SENSENUM_RE.search(name).start()
@@ -1396,7 +1406,7 @@ class WordNetCorpusReader(CorpusReader):
     # Retrieve synsets and lemmas.
     #////////////////////////////////////////////////////////////
 
-    def synsets(self, lemma, pos=None, lang='en'):
+    def synsets(self, lemma, pos=None, lang='eng'):
         """Load all synsets with a given lemma and part of speech tag.
         If no pos is specified, all synsets for all parts of speech
         will be loaded. 
@@ -1405,7 +1415,7 @@ class WordNetCorpusReader(CorpusReader):
         """
         lemma = lemma.lower()
         
-        if lang == 'en':
+        if lang == 'eng':
             get_synset = self._synset_from_pos_and_offset
             index = self._lemma_pos_offset_map
             if pos is None:
@@ -1424,12 +1434,12 @@ class WordNetCorpusReader(CorpusReader):
                 synset_list.append(self.of2ss(l))
             return synset_list
 
-    def lemmas(self, lemma, pos=None, lang='en'):
+    def lemmas(self, lemma, pos=None, lang='eng'):
         """Return all Lemma objects with a name matching the specified lemma
         name and part of speech tag. Matches any part of speech tag if none is
         specified."""
 
-        if lang == 'en':
+        if lang == 'eng':
             lemma = lemma.lower()
             return [lemma_obj
                     for synset in self.synsets(lemma, pos)
@@ -1448,12 +1458,12 @@ class WordNetCorpusReader(CorpusReader):
                 lemmas.append(a)
             return lemmas
 
-    def all_lemma_names(self, pos=None, lang='en'):
+    def all_lemma_names(self, pos=None, lang='eng'):
         """Return all lemma names for all synsets for the given
         part of speech tag and language or languages. If pos is not specified, all synsets
         for all parts of speech will be used."""
 
-        if lang == 'en':
+        if lang == 'eng':
             if pos is None:
                 return iter(self._lemma_pos_offset_map)
             else:
@@ -1529,11 +1539,59 @@ class WordNetCorpusReader(CorpusReader):
             else:
                 data_file.close()
 
+    def words(self, lang='eng'):
+        """return lemmas of the given language as list of words"""
+        return self.all_lemma_names(lang=lang)
+
+    def license(self, lang='eng'):
+        """Return the contents of LICENSE (for omw)
+           use lang=lang to get the license for an individual language"""
+        if lang == 'eng':
+            return self.open("LICENSE").read()
+        elif lang in self.langs():
+            return self._omw_reader.open("{}/LICENSE".format(lang)).read()
+        elif lang == 'omw':
+            ### under the not unreasonable assumption you don't mean Omwunra-Toqura
+            return self._omw_reader.open("LICENSE").read()
+        else:
+            raise WordNetError("Language is not supported.")
+ 
+    def readme(self, lang='omw'):
+        """Return the contents of README (for omw)
+           use lang=lang to get the readme for an individual language"""
+        if lang == 'eng':
+            return self.open("README").read()
+        elif lang in self.langs():
+            return self._omw_reader.open("{}/README".format(lang)).read()
+        elif lang == 'omw':
+            ### under the not unreasonable assumption you don't mean Omwunra-Toqura
+            return self._omw_reader.open("README").read()
+        else:
+            raise WordNetError("Language is not supported.")
+
+    def citation(self, lang='omw'):
+        """Return the contents of citation.bib file (for omw)
+           use lang=lang to get the citation for an individual language"""
+        if lang == 'eng':
+            return self.open("citation.bib").read()
+        elif lang in self.langs():
+            return self._omw_reader.open("{}/citation.bib".format(lang)).read()
+        elif lang == 'omw':
+            ### under the not unreasonable assumption you don't mean Omwunra-Toqura
+            return self._omw_reader.open("citation.bib").read()
+        else:
+            raise WordNetError("Language is not supported.")
+
+
+
     #////////////////////////////////////////////////////////////
     # Misc
     #////////////////////////////////////////////////////////////
     def lemma_count(self, lemma):
         """Return the frequency count for this Lemma"""
+        # Currently, counts only work for English
+        if lemma._lang != 'eng':
+            return 0
         # open the count file if we haven't already
         if self._key_count_file is None:
             self._key_count_file = self.open('cntlist.rev')
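
A brief sketch of the multilingual API touched by the hunks above. It assumes the WordNet and Open Multilingual Wordnet data are installed (e.g. via nltk.download('wordnet') and nltk.download('omw')); the 'jpn' language code is only an example of an installed OMW language.

    from nltk.corpus import wordnet as wn

    print(wn.langs()[:5])               # language codes; the list now includes 'eng'
    dog = wn.synsets('dog', pos=wn.NOUN)[0]
    print(dog.lemma_names('eng'))       # the default language code is now 'eng', not 'en'
    print(dog.lemma_names('jpn'))       # lemmas from the Open Multilingual Wordnet
    print(wn.ss2of(dog))                # zero-padded '<offset>-<pos>' id, e.g. '02084071-n'
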
diff --git a/nltk/data.py b/nltk/data.py
index 93055a1..ffef466 100644
--- a/nltk/data.py
+++ b/nltk/data.py
@@ -72,7 +72,8 @@ path = []
    (e.g., in their home directory under ~/nltk_data)."""
 
 # User-specified locations:
-path += [d for d in os.environ.get('NLTK_DATA', str('')).split(os.pathsep) if d]
+_paths_from_env = os.environ.get('NLTK_DATA', str('')).split(os.pathsep)
+path += [d for d in _paths_from_env if d]
 if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
     path.append(os.path.expanduser(str('~/nltk_data')))
 
@@ -82,7 +83,8 @@ if sys.platform.startswith('win'):
         str(r'C:\nltk_data'), str(r'D:\nltk_data'), str(r'E:\nltk_data'),
         os.path.join(sys.prefix, str('nltk_data')),
         os.path.join(sys.prefix, str('lib'), str('nltk_data')),
-        os.path.join(os.environ.get(str('APPDATA'), str('C:\\')), str('nltk_data'))
+        os.path.join(
+            os.environ.get(str('APPDATA'), str('C:\\')), str('nltk_data'))
     ]
 else:
     # Common locations on UNIX & OS X:
@@ -101,9 +103,10 @@ else:
 def gzip_open_unicode(filename, mode="rb", compresslevel=9,
                       encoding='utf-8', fileobj=None, errors=None, newline=None):
     if fileobj is None:
-        fileobj=GzipFile(filename, mode, compresslevel, fileobj)
+        fileobj = GzipFile(filename, mode, compresslevel, fileobj)
     return io.TextIOWrapper(fileobj, encoding, errors, newline)
 
+
 def split_resource_url(resource_url):
     """
     Splits a resource url into "<protocol>:<path>".
@@ -130,6 +133,7 @@ def split_resource_url(resource_url):
         path_ = re.sub(r'^/{0,2}', '', path_)
     return protocol, path_
 
+
 def normalize_resource_url(resource_url):
     r"""
     Normalizes a resource url
@@ -185,6 +189,7 @@ def normalize_resource_url(resource_url):
         protocol += '://'
     return ''.join([protocol, name])
 
+
 def normalize_resource_name(resource_name, allow_relative=True, relative_path=None):
     """
     :type resource_name: str or unicode
@@ -222,7 +227,8 @@ def normalize_resource_name(resource_name, allow_relative=True, relative_path=No
     else:
         if relative_path is None:
             relative_path = os.curdir
-        resource_name = os.path.abspath(os.path.join(relative_path, resource_name))
+        resource_name = os.path.abspath(
+            os.path.join(relative_path, resource_name))
     resource_name = resource_name.replace('\\', '/').replace(os.path.sep, '/')
     if sys.platform.startswith('win') and os.path.isabs(resource_name):
         resource_name = '/' + resource_name
@@ -244,6 +250,7 @@ class PathPointer(object):
     identifies a file contained within a zipfile, that can be accessed
     by reading that zipfile.
     """
+
     def open(self, encoding=None):
         """
         Return a seekable read-only stream that can be used to read
@@ -335,7 +342,8 @@ class BufferedGzipFile(GzipFile):
     ``BufferedGzipFile`` is useful for loading large gzipped pickle objects
     as well as writing large encoded feature files for classifier training.
     """
-    SIZE = 2 * 2**20
+    MB = 2 ** 20
+    SIZE = 2 * MB
 
     @py3_data
     def __init__(self, filename=None, mode=None, compresslevel=9,
@@ -430,6 +438,7 @@ class GzipFileSystemPathPointer(FileSystemPathPointer):
     file located at a given absolute path.  ``GzipFileSystemPathPointer`` is
     appropriate for loading large gzip-compressed pickle objects efficiently.
     """
+
     def open(self, encoding=None):
         stream = BufferedGzipFile(self._path, 'rb')
         if encoding:
@@ -467,8 +476,8 @@ class ZipFilePathPointer(PathPointer):
                 # then check if the zipfile contains any files that
                 # are under the given directory.
                 if (entry.endswith('/') and
-                    [n for n in zipfile.namelist() if n.startswith(entry)]):
-                    pass # zipfile contains a file in that directory.
+                        [n for n in zipfile.namelist() if n.startswith(entry)]):
+                    pass  # zipfile contains a file in that directory.
                 else:
                     # Otherwise, complain.
                     raise IOError('Zipfile %r does not contain %r' %
@@ -525,6 +534,7 @@ _resource_cache = {}
 """A dictionary used to cache resources so that they won't
    need to be loaded more than once."""
 
+
 def find(resource_name, paths=None):
     """
     Find the given resource by searching through the directories and
@@ -564,7 +574,8 @@ def find(resource_name, paths=None):
     """
     resource_name = normalize_resource_name(resource_name, True)
 
-    # Resolve default paths at runtime in-case the user overrides nltk.data.path
+    # Resolve default paths at runtime in-case the user overrides
+    # nltk.data.path
     if paths is None:
         paths = path
 
@@ -606,7 +617,8 @@ def find(resource_name, paths=None):
     if zipfile is None:
         pieces = resource_name.split('/')
         for i in range(len(pieces)):
-            modified_name = '/'.join(pieces[:i]+[pieces[i]+'.zip']+pieces[i:])
+            modified_name = '/'.join(pieces[:i] +
+                                     [pieces[i] + '.zip'] + pieces[i:])
             try:
                 return find(modified_name, paths)
             except LookupError:
@@ -619,10 +631,11 @@ def find(resource_name, paths=None):
         (resource_name,), initial_indent='  ', subsequent_indent='  ',
         width=66)
     msg += '\n  Searched in:' + ''.join('\n    - %r' % d for d in paths)
-    sep = '*'*70
+    sep = '*' * 70
     resource_not_found = '\n%s\n%s\n%s' % (sep, msg, sep)
     raise LookupError(resource_not_found)
 
+
 def retrieve(resource_url, filename=None, verbose=True):
     """
     Copy the given resource to a local file.  If no filename is
@@ -653,9 +666,10 @@ def retrieve(resource_url, filename=None, verbose=True):
     # Copy infile -> outfile, using 64k blocks.
     with open(filename, "wb") as outfile:
         while True:
-            s = infile.read(1024*64) # 64k blocks.
+            s = infile.read(1024 * 64)  # 64k blocks.
             outfile.write(s)
-            if not s: break
+            if not s:
+                break
 
     infile.close()
 
@@ -696,6 +710,7 @@ AUTO_FORMATS = {
     'text': 'text',
 }
 
+
 def load(resource_url, format='auto', cache=True, verbose=False,
          logic_parser=None, fstruct_reader=None, encoding=None):
     """
@@ -847,6 +862,7 @@ def load(resource_url, format='auto', cache=True, verbose=False,
 
     return resource_val
 
+
 def show_cfg(resource_url, escape='##'):
     """
     Write out a grammar file, ignoring escaped and empty lines.
@@ -862,8 +878,10 @@ def show_cfg(resource_url, escape='##'):
     resource_val = load(resource_url, format='text', cache=False)
     lines = resource_val.splitlines()
     for l in lines:
-        if l.startswith(escape): continue
-        if re.match('^$', l): continue
+        if l.startswith(escape):
+            continue
+        if re.match('^$', l):
+            continue
         print(l)
 
 
@@ -874,6 +892,7 @@ def clear_cache():
     """
     _resource_cache.clear()
 
+
 def _open(resource_url):
     """
     Helper function that returns an open file object for a resource,
@@ -906,7 +925,9 @@ def _open(resource_url):
 # We shouldn't apply @python_2_unicode_compatible
 # decorator to LazyLoader, this is resource.__class__ responsibility.
 
+
 class LazyLoader(object):
+
     @py3_data
     def __init__(self, _path):
         self._path = _path
@@ -976,6 +997,7 @@ class OpenOnDemandZipFile(zipfile.ZipFile):
 #{ Seekable Unicode Stream Reader
 ######################################################################
 
+
 class SeekableUnicodeStreamReader(object):
     """
     A stream reader that automatically encodes the source byte stream
@@ -992,7 +1014,7 @@ class SeekableUnicodeStreamReader(object):
     this shouldn't cause a problem with any of python's builtin
     unicode encodings.
     """
-    DEBUG = True #: If true, then perform extra sanity checks.
+    DEBUG = True  # : If true, then perform extra sanity checks.
 
     @py3_data
     def __init__(self, stream, encoding, errors='strict'):
@@ -1111,13 +1133,14 @@ class SeekableUnicodeStreamReader(object):
             if len(lines) > 1:
                 line = lines[0]
                 self.linebuffer = lines[1:]
-                self._rewind_numchars = len(new_chars)-(len(chars)-len(line))
+                self._rewind_numchars = (len(new_chars) -
+                                         (len(chars) - len(line)))
                 self._rewind_checkpoint = startpos
                 break
             elif len(lines) == 1:
                 line0withend = lines[0]
                 line0withoutend = lines[0].splitlines(False)[0]
-                if line0withend != line0withoutend: # complete line
+                if line0withend != line0withoutend:  # complete line
                     line = line0withend
                     break
 
@@ -1145,8 +1168,10 @@ class SeekableUnicodeStreamReader(object):
     def next(self):
         """Return the next decoded line from the underlying stream."""
         line = self.readline()
-        if line: return line
-        else: raise StopIteration
+        if line:
+            return line
+        else:
+            raise StopIteration
 
     def __next__(self):
         return self.next()
@@ -1230,12 +1255,13 @@ class SeekableUnicodeStreamReader(object):
             bytes that will be neded to move forward by ``offset`` chars.
             Defaults to ``offset``.
         """
-        if est_bytes is None: est_bytes = offset
+        if est_bytes is None:
+            est_bytes = offset
         bytes = b''
 
         while True:
             # Read in a block of bytes.
-            newbytes = self.stream.read(est_bytes-len(bytes))
+            newbytes = self.stream.read(est_bytes - len(bytes))
             bytes += newbytes
 
             # Decode the bytes to characters.
@@ -1244,7 +1270,7 @@ class SeekableUnicodeStreamReader(object):
             # If we got the right number of characters, then seek
             # backwards over any truncated characters, and return.
             if len(chars) == offset:
-                self.stream.seek(-len(bytes)+bytes_decoded, 1)
+                self.stream.seek(-len(bytes) + bytes_decoded, 1)
                 return
 
             # If we went too far, then we can back-up until we get it
@@ -1252,9 +1278,9 @@ class SeekableUnicodeStreamReader(object):
             if len(chars) > offset:
                 while len(chars) > offset:
                     # Assume at least one byte/char.
-                    est_bytes += offset-len(chars)
+                    est_bytes += offset - len(chars)
                     chars, bytes_decoded = self._incr_decode(bytes[:est_bytes])
-                self.stream.seek(-len(bytes)+bytes_decoded, 1)
+                self.stream.seek(-len(bytes) + bytes_decoded, 1)
                 return
 
             # Otherwise, we haven't read enough bytes yet; loop again.
@@ -1278,11 +1304,11 @@ class SeekableUnicodeStreamReader(object):
         orig_filepos = self.stream.tell()
 
         # Calculate an estimate of where we think the newline is.
-        bytes_read = ( (orig_filepos-len(self.bytebuffer)) -
-                       self._rewind_checkpoint )
+        bytes_read = ((orig_filepos - len(self.bytebuffer)) -
+                      self._rewind_checkpoint)
         buf_size = sum(len(line) for line in self.linebuffer)
         est_bytes = int((bytes_read * self._rewind_numchars /
-                     (self._rewind_numchars + buf_size)))
+                         (self._rewind_numchars + buf_size)))
 
         self.stream.seek(self._rewind_checkpoint)
         self._char_seek_forward(self._rewind_numchars, est_bytes)
@@ -1312,7 +1338,8 @@ class SeekableUnicodeStreamReader(object):
         them using this reader's encoding, and return the resulting
         unicode string.  ``linebuffer`` is not included in the result.
         """
-        if size == 0: return ''
+        if size == 0:
+            return ''
 
         # Skip past the byte order marker, if present.
         if self._bom and self.stream.tell() == 0:
@@ -1332,7 +1359,8 @@ class SeekableUnicodeStreamReader(object):
         if (size is not None) and (not chars) and (len(new_bytes) > 0):
             while not chars:
                 new_bytes = self.stream.read(1)
-                if not new_bytes: break # end of file.
+                if not new_bytes:
+                    break  # end of file.
                 bytes += new_bytes
                 chars, bytes_decoded = self._incr_decode(bytes)
 
@@ -1382,7 +1410,7 @@ class SeekableUnicodeStreamReader(object):
                   (codecs.BOM_UTF32_BE, 'utf32-be')],
         'utf32le': [(codecs.BOM_UTF32_LE, None)],
         'utf32be': [(codecs.BOM_UTF32_BE, None)],
-        }
+    }
 
     def _check_bom(self):
         # Normalize our encoding name
@@ -1399,7 +1427,8 @@ class SeekableUnicodeStreamReader(object):
             # Check for each possible BOM.
             for (bom, new_encoding) in bom_info:
                 if bytes.startswith(bom):
-                    if new_encoding: self.encoding = new_encoding
+                    if new_encoding:
+                        self.encoding = new_encoding
                     return len(bom)
 
         return None
diff --git a/nltk/internals.py b/nltk/internals.py
index 4524405..41e378d 100644
--- a/nltk/internals.py
+++ b/nltk/internals.py
@@ -11,7 +11,7 @@ from __future__ import print_function
 import subprocess
 import os
 import fnmatch
-import re, sre_constants, sre_parse, sre_compile
+import re
 import warnings
 import textwrap
 import types
@@ -27,34 +27,6 @@ except ImportError:
 
 from nltk import __file__
 from nltk import compat
-######################################################################
-# Regular Expression Processing
-######################################################################
-
-def compile_regexp_to_noncapturing(pattern, flags=0):
-    """
-    Compile the regexp pattern after switching all grouping parentheses
-    in the given regexp pattern to non-capturing groups.
-
-    :type pattern: str
-    :rtype: str
-    """
-    def convert_regexp_to_noncapturing_parsed(parsed_pattern):
-        res_data = []
-        for key, value in parsed_pattern.data:
-            if key == sre_constants.SUBPATTERN:
-                index, subpattern = value
-                value = (None, convert_regexp_to_noncapturing_parsed(subpattern))
-            elif key == sre_constants.GROUPREF:
-                raise ValueError('Regular expressions with back-references are not supported: {0}'.format(pattern))
-            res_data.append((key, value))
-        parsed_pattern.data = res_data
-        parsed_pattern.pattern.groups = 1
-        parsed_pattern.pattern.groupdict = {}
-        return parsed_pattern
-
-    return sre_compile.compile(convert_regexp_to_noncapturing_parsed(sre_parse.parse(pattern)), flags=flags)
-
 
 ##########################################################################
 # Java Via Command-Line
diff --git a/nltk/parse/dependencygraph.py b/nltk/parse/dependencygraph.py
index 72016f4..e76cef8 100755
--- a/nltk/parse/dependencygraph.py
+++ b/nltk/parse/dependencygraph.py
@@ -36,7 +36,7 @@ class DependencyGraph(object):
     A container for the nodes and labelled edges of a dependency structure.
     """
 
-    def __init__(self, tree_str=None, cell_extractor=None, zero_based=False, cell_separator=None):
+    def __init__(self, tree_str=None, cell_extractor=None, zero_based=False, cell_separator=None, top_relation_label='ROOT'):
         """Dependency graph.
 
         We place a dummy `TOP` node with the index 0, since the root node is
@@ -50,6 +50,9 @@ class DependencyGraph(object):
         :param str cell_separator: the cell separator. If not provided, cells
         are split by whitespace.
 
+        :param str top_relation_label: the label by which the top relation is
+        identified, for example, `ROOT`, `null` or `TOP`.
+
         """
         self.nodes = defaultdict(lambda:  {'address': None,
                                            'word': None,
@@ -66,7 +69,6 @@ class DependencyGraph(object):
             {
                 'ctag': 'TOP',
                 'tag': 'TOP',
-                'rel': 'TOP',
                 'address': 0,
             }
         )
@@ -79,6 +81,7 @@ class DependencyGraph(object):
                 cell_extractor=cell_extractor,
                 zero_based=zero_based,
                 cell_separator=cell_separator,
+                top_relation_label=top_relation_label,
             )
 
     def remove_by_address(self, address):
@@ -216,13 +219,15 @@ class DependencyGraph(object):
         return "<DependencyGraph with {0} nodes>".format(len(self.nodes))
 
     @staticmethod
-    def load(filename, zero_based=False, cell_separator=None):
+    def load(filename, zero_based=False, cell_separator=None, top_relation_label='ROOT'):
         """
         :param filename: a name of a file in Malt-TAB format
         :param zero_based: nodes in the input file are numbered starting from 0
         rather than 1 (as produced by, e.g., zpar)
         :param str cell_separator: the cell separator. If not provided, cells
         are split by whitespace.
+        :param str top_relation_label: the label by which the top relation is
+        identified, for example, `ROOT`, `null` or `TOP`.
 
         :return: a list of DependencyGraphs
 
@@ -233,6 +238,7 @@ class DependencyGraph(object):
                     tree_str,
                     zero_based=zero_based,
                     cell_separator=cell_separator,
+                    top_relation_label=top_relation_label,
                 )
                 for tree_str in infile.read().split('\n\n')
             ]
@@ -259,7 +265,7 @@ class DependencyGraph(object):
         if not self.contains_address(node['address']):
             self.nodes[node['address']].update(node)
 
-    def _parse(self, input_, cell_extractor=None, zero_based=False, cell_separator=None):
+    def _parse(self, input_, cell_extractor=None, zero_based=False, cell_separator=None, top_relation_label='ROOT'):
         """Parse a sentence.
 
         :param extractor: a function that given a tuple of cells returns a
@@ -269,23 +275,41 @@ class DependencyGraph(object):
         :param str cell_separator: the cell separator. If not provided, cells
         are split by whitespace.
 
+        :param str top_relation_label: the label by which the top relation is
+        identified, for example, `ROOT`, `null` or `TOP`.
+
         """
 
-        def extract_3_cells(cells):
+        def extract_3_cells(cells, index):
             word, tag, head = cells
-            return word, word, tag, tag, '', head, ''
+            return index, word, word, tag, tag, '', head, ''
 
-        def extract_4_cells(cells):
+        def extract_4_cells(cells, index):
             word, tag, head, rel = cells
-            return word, word, tag, tag, '', head, rel
-
-        def extract_10_cells(cells):
-            _, word, lemma, ctag, tag, feats, head, rel, _, _ = cells
-            return word, lemma, ctag, tag, feats, head, rel
+            return index, word, word, tag, tag, '', head, rel
+
+        def extract_7_cells(cells, index):
+            line_index, word, lemma, tag, _, head, rel = cells
+            try:
+                index = int(line_index)
+            except ValueError:
+                # index can't be parsed as an integer, use default
+                pass
+            return index, word, lemma, tag, tag, '', head, rel
+
+        def extract_10_cells(cells, index):
+            line_index, word, lemma, ctag, tag, feats, head, rel, _, _ = cells
+            try:
+                index = int(line_index)
+            except ValueError:
+                # index can't be parsed as an integer, use default
+                pass
+            return index, word, lemma, ctag, tag, feats, head, rel
 
         extractors = {
             3: extract_3_cells,
             4: extract_4_cells,
+            7: extract_7_cells,
             10: extract_10_cells,
         }
 
@@ -312,7 +336,16 @@ class DependencyGraph(object):
                         'CoNLL(10) or Malt-Tab(4) format'.format(cell_number)
                     )
 
-            word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells)
+            try:
+                index, word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells, index)
+            except (TypeError, ValueError):
+                # cell_extractor doesn't take 2 arguments or doesn't return 8
+                # values; assume the cell_extractor is an older external
+                # extractor and doesn't accept or return an index.
+                word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells)
+
+            if head == '_':
+                continue
 
             head = int(head)
             if zero_based:
@@ -331,14 +364,15 @@ class DependencyGraph(object):
                 }
             )
 
-            # Make sure that he fake root node has labeled dependencies.
+            # Make sure that the fake root node has labeled dependencies.
             if (cell_number == 3) and (head == 0):
-                rel = 'ROOT'
+                rel = top_relation_label
             self.nodes[head]['deps'][rel].append(index)
 
-        if self.nodes[0]['deps']['ROOT']:
-            root_address = self.nodes[0]['deps']['ROOT'][0]
+        if self.nodes[0]['deps'][top_relation_label]:
+            root_address = self.nodes[0]['deps'][top_relation_label][0]
             self.root = self.nodes[root_address]
+            self.top_relation_label = top_relation_label
         else:
             warnings.warn(
                 "The graph doesn't contain a node "
diff --git a/nltk/parse/malt.py b/nltk/parse/malt.py
index 49f6b9e..41141fe 100644
--- a/nltk/parse/malt.py
+++ b/nltk/parse/malt.py
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*- 
+# -*- coding: utf-8 -*-
 # Natural Language Toolkit: Interface to MaltParser
 #
 # Author: Dan Garrette <dhgarrette at gmail.com>
@@ -9,15 +9,13 @@
 
 from __future__ import print_function
 from __future__ import unicode_literals
-from six import text_type
+from nltk.six import text_type
 
 import os
 import tempfile
 import subprocess
 import inspect
 
-from nltk.tokenize import word_tokenize
-from nltk.tag import pos_tag
 from nltk.data import ZipFilePathPointer
 from nltk.internals import find_dir, find_file, find_jars_within_path
 
@@ -65,12 +63,12 @@ def find_maltparser(parser_dirname):
         _malt_dir = parser_dirname
     else: # Try to find path to maltparser directory in environment variables.
         _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))
-    # Checks that that the found directory contains all the necessary .jar 
+    # Checks that the found directory contains all the necessary .jar files.
     malt_dependencies = ['','','']
     _malt_jars = set(find_jars_within_path(_malt_dir))
     _jars = set(jar.rpartition('/')[2] for jar in _malt_jars)
     malt_dependencies = set(['log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'])
-    
+
     assert malt_dependencies.issubset(_jars)
     assert any(filter(lambda i: i.startswith('maltparser-') and i.endswith('.jar'), _jars))
     return list(_malt_jars)
@@ -95,7 +93,7 @@ class MaltParser(ParserI):
     - (optionally) the path to a pre-trained MaltParser .mco model file
     - (optionally) the tagger to use for POS tagging before parsing
     - (optionally) additional Java arguments
-    
+
     Example:
         >>> from nltk.parse import malt
         >>> # With MALT_PARSER and MALT_MODEL environment set.
@@ -111,24 +109,24 @@ class MaltParser(ParserI):
         """
         An interface for parsing with the Malt Parser.
 
-        :param parser_dirname: The path to the maltparser directory that 
+        :param parser_dirname: The path to the maltparser directory that
         contains the maltparser-1.x.jar
         :type parser_dirname: str
-        :param model_filename: The name of the pre-trained model with .mco file 
+        :param model_filename: The name of the pre-trained model with .mco file
         extension. If provided, training will not be required.
-        (see http://www.maltparser.org/mco/mco.html and 
+        (see http://www.maltparser.org/mco/mco.html and
         see http://www.patful.com/chalk/node/185)
         :type model_filename: str
-        :param tagger: The tagger used to POS tag the raw string before 
+        :param tagger: The tagger used to POS tag the raw string before
         formatting to CONLL format. It should behave like `nltk.pos_tag`
         :type tagger: function
-        :param additional_java_args: This is the additional Java arguments that 
-        one can use when calling Maltparser, usually this is the heapsize 
+        :param additional_java_args: Additional Java arguments to use when
+        calling MaltParser, usually the heap size
         limits, e.g. `additional_java_args=['-Xmx1024m']`
         (see http://goo.gl/mpDBvQ)
         :type additional_java_args: list
         """
-        
+
         # Find all the necessary jar files for MaltParser.
         self.malt_jars = find_maltparser(parser_dirname)
         # Initialize additional java arguments.
@@ -138,90 +136,65 @@ class MaltParser(ParserI):
         self.model = find_malt_model(model_filename)
         self._trained = self.model != 'malt_temp.mco'
         # Set the working_dir parameters i.e. `-w` from MaltParser's option.
-        self.working_dir = tempfile.gettempdir() 
+        self.working_dir = tempfile.gettempdir()
         # Initialize POS tagger.
         self.tagger = tagger if tagger is not None else malt_regex_tagger()
 
-    def pretrained_model_sanity_checks(self, tree_str):
-        """
-        Performs sanity checks and replace oddities in pre-trained model
-        outputs from http://www.maltparser.org/mco/english_parser/engmalt.html
-        Note: This hack function should go away once nltk.parse.DependencyGraph 
-        handles optional TOP label!!!
-        :param tree_str: The CONLL output file for a single parse
-        :type tree_str: str
-        :return: str
-        """
-        # Checks for oddities in English pre-trained model.
-        if (
-        '\t0\tnull\t' in tree_str and 
-        self.model.endswith(('engmalt.linear-1.7.mco', 'engmalt.poly-1.7.mco'))
-        ):
-            tree_str = tree_str.replace('\t0\tnull\t','\t0\tROOT\t')
-        # Checks for oddities in French pre-trained model.
-        if '\t0\troot\t' in tree_str and \
-        self.model.endswith('fremalt-1.7.mco'):
-            tree_str = tree_str.replace('\t0\troot\t','\t0\tROOT\t')
-        return tree_str    
-    
-    def parse_tagged_sents(self, sentences, verbose=False):
+    def parse_tagged_sents(self, sentences, verbose=False, top_relation_label='null'):
         """
-        Use MaltParser to parse multiple POS tagged sentences. Takes multiple 
-        sentences where each sentence is a list of (word, tag) tuples. 
+        Use MaltParser to parse multiple POS tagged sentences. Takes multiple
+        sentences where each sentence is a list of (word, tag) tuples.
         The sentences must have already been tokenized and tagged.
 
         :param sentences: Input sentences to parse
         :type sentence: list(list(tuple(str, str)))
-        :return: iter(iter(``DependencyGraph``)) the dependency graph 
+        :return: iter(iter(``DependencyGraph``)) the dependency graph
         representation of each sentence
         """
         if not self._trained:
             raise Exception("Parser has not been trained. Call train() first.")
 
-
-        with tempfile.NamedTemporaryFile(prefix='malt_input.conll.', 
+        with tempfile.NamedTemporaryFile(prefix='malt_input.conll.',
               dir=self.working_dir, mode='w', delete=False) as input_file:
-              with tempfile.NamedTemporaryFile(prefix='malt_output.conll.', 
+              with tempfile.NamedTemporaryFile(prefix='malt_output.conll.',
                      dir=self.working_dir, mode='w', delete=False) as output_file:
                 # Convert list of sentences to CONLL format.
                 for line in taggedsents_to_conll(sentences):
                     input_file.write(text_type(line))
                 input_file.close()
-    
+
                 # Generate command to run maltparser.
-                cmd =self.generate_malt_command(input_file.name, 
+                cmd = self.generate_malt_command(input_file.name,
                                 output_file.name, mode="parse")
-    
-                # This is a maltparser quirk, it needs to be run 
+
+                # This is a MaltParser quirk: it needs to be run
                 # where the model file is. otherwise it goes into an awkward
                 # missing .jars or strange -w working_dir problem.
                 _current_path = os.getcwd() # Remembers the current path.
                 try: # Change to modelfile path
-                    os.chdir(os.path.split(self.model)[0]) 
+                    os.chdir(os.path.split(self.model)[0])
                 except:
                     pass
                 ret = self._execute(cmd, verbose) # Run command.
                 os.chdir(_current_path) # Change back to current path.
-    
+
                 if ret is not 0:
                     raise Exception("MaltParser parsing (%s) failed with exit "
                             "code %d" % (' '.join(cmd), ret))
-    
+
                 # Must return iter(iter(Tree))
                 with open(output_file.name) as infile:
                     for tree_str in infile.read().split('\n\n'):
-                        tree_str = self.pretrained_model_sanity_checks(tree_str)
-                        yield(iter([DependencyGraph(tree_str)]))
+                        yield(iter([DependencyGraph(tree_str, top_relation_label=top_relation_label)]))
 
         os.remove(input_file.name)
         os.remove(output_file.name)
 
-    
-    def parse_sents(self, sentences, verbose=False):
+    def parse_sents(self, sentences, verbose=False, top_relation_label='null'):
         """
-        Use MaltParser to parse multiple sentences. 
+        Use MaltParser to parse multiple sentences.
         Takes a list of sentences, where each sentence is a list of words.
-        Each sentence will be automatically tagged with this 
+        Each sentence will be automatically tagged with this
         MaltParser instance's tagger.
 
         :param sentences: Input sentences to parse
@@ -229,9 +202,8 @@ class MaltParser(ParserI):
         :return: iter(DependencyGraph)
         """
         tagged_sentences = (self.tagger(sentence) for sentence in sentences)
-        return self.parse_tagged_sents(tagged_sentences, verbose)
-        
-        
+        return self.parse_tagged_sents(tagged_sentences, verbose, top_relation_label=top_relation_label)
+
     def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
         """
         This function generates the maltparser command use at the terminal.
@@ -249,7 +221,7 @@ class MaltParser(ParserI):
 
         # Adds the model file.
         if os.path.exists(self.model): # when parsing
-            cmd+= ['-c', os.path.split(self.model)[-1]] 
+            cmd+= ['-c', os.path.split(self.model)[-1]]
         else: # when learning
             cmd+= ['-c', self.model]
 
@@ -272,7 +244,7 @@ class MaltParser(ParserI):
         :param depgraphs: list of ``DependencyGraph`` objects for training input data
         :type depgraphs: DependencyGraph
         """
-        
+
         # Write the conll_str to malt_train.conll file in /tmp/
         with tempfile.NamedTemporaryFile(prefix='malt_train.conll.',
              dir=self.working_dir, mode='w', delete=False) as input_file:
@@ -282,7 +254,7 @@ class MaltParser(ParserI):
         self.train_from_file(input_file.name, verbose=verbose)
         # Removes the malt_train.conll once training finishes.
         os.remove(input_file.name)
-            
+
     def train_from_file(self, conll_file, verbose=False):
         """
         Train MaltParser from a file
@@ -290,11 +262,11 @@ class MaltParser(ParserI):
         :type conll_file: str
         """
 
-        # If conll_file is a ZipFilePathPointer, 
+        # If conll_file is a ZipFilePathPointer,
         # then we need to do some extra massaging
         if isinstance(conll_file, ZipFilePathPointer):
             with tempfile.NamedTemporaryFile(prefix='malt_train.conll.',
-            dir=self.working_dir, mode='w', delete=False) as input_file: 
+            dir=self.working_dir, mode='w', delete=False) as input_file:
                 with conll_file.open() as conll_input_file:
                     conll_str = conll_input_file.read()
                     input_file.write(text_type(conll_str))
@@ -308,17 +280,18 @@ class MaltParser(ParserI):
                     "code %d" % (' '.join(cmd), ret))
         self._trained = True
 
-    
+
 if __name__ == '__main__':
     '''
     A demonstration function to show how NLTK users can use the malt parser API.
-    
+
+    >>> from nltk import pos_tag
     >>> assert 'MALT_PARSER' in os.environ, str(
     ... "Please set MALT_PARSER in your global environment, e.g.:\n"
     ... "$ export MALT_PARSER='/home/user/maltparser-1.7.2/'")
     >>>
     >>> assert 'MALT_MODEL' in os.environ, str(
-    ... "Please set MALT_MODEL in your global environment, e.g.:\n" 
+    ... "Please set MALT_MODEL in your global environment, e.g.:\n"
     ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'")
     >>>
     >>> _dg1_str = str("1    John    _    NNP   _    _    2    SUBJ    _    _\n"
@@ -326,8 +299,8 @@ if __name__ == '__main__':
     ...             "3    a       _    DT    _    _    4    SPEC    _    _\n"
     ...             "4    dog     _    NN    _    _    2    OBJ     _    _\n"
     ...             "5    .     _    .    _    _    2    PUNCT     _    _\n")
-    >>> 
-    >>> 
+    >>>
+    >>>
     >>> _dg2_str  = str("1    John    _    NNP   _    _    2    SUBJ    _    _\n"
     ...             "2    walks   _    VB    _    _    0    ROOT    _    _\n"
     ...             "3    .     _    .    _    _    2    PUNCT     _    _\n")
@@ -361,7 +334,7 @@ if __name__ == '__main__':
     >>> # Initialize a MaltParser object with an English pre-trained model.
     >>> parser_dirname = 'maltparser-1.7.2'
     >>> model_name = 'engmalt.linear-1.7.mco'
-    >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)    
+    >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)
     >>> sent1 = 'I shot an elephant in my pajamas .'.split()
     >>> sent2 = 'Time flies like banana .'.split()
     >>> # Parse a single sentence.
@@ -377,4 +350,3 @@ if __name__ == '__main__':
     '''
     import doctest
     doctest.testmod()
-
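
With the model-specific sanity check gone, the relation label emitted by a pre-trained model is handled by passing it through `top_relation_label` (which defaults to 'null', the label used by the engmalt models). A hedged sketch, assuming the jars and the pre-trained model from the doctest above are installed locally:

    from nltk.parse.malt import MaltParser

    # parser_dirname and model_filename are illustrative; point them at the
    # local MaltParser installation and .mco file (or rely on MALT_PARSER
    # and MALT_MODEL as in the doctest above).
    mp = MaltParser(parser_dirname='maltparser-1.7.2',
                    model_filename='engmalt.linear-1.7.mco')
    tagged = [[('John', 'NNP'), ('runs', 'VBZ'), ('.', '.')]]
    for graphs in mp.parse_tagged_sents(tagged, top_relation_label='null'):
        for dg in graphs:
            print(dg.tree())
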
diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py
index 11379f9..63b1a4f 100644
--- a/nltk/parse/stanford.py
+++ b/nltk/parse/stanford.py
@@ -12,107 +12,83 @@ from __future__ import unicode_literals
 import tempfile
 import os
 import re
+import warnings
 from subprocess import PIPE
+from io import StringIO
 
 from nltk import compat
 from nltk.internals import find_jar, find_jar_iter, config_java, java, _java_options
 
 from nltk.parse.api import ParserI
+from nltk.parse.dependencygraph import DependencyGraph
 from nltk.tree import Tree
 
 _stanford_url = 'http://nlp.stanford.edu/software/lex-parser.shtml'
 
-class StanfordParser(ParserI):
-    r"""
-    Interface to the Stanford Parser
+class GenericStanfordParser(ParserI):
+    """Interface to the Stanford Parser"""
 
-    >>> parser=StanfordParser(
-    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
-    ... )
-
-    >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog"))
-    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), 
-    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), 
-    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]
-
-    >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
-    ...     "the quick brown fox jumps over the lazy dog",
-    ...     "the quick grey wolf jumps over the lazy fox"
-    ... ))], [])
-    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
-    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
-    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
-    [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP',
-    [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),
-    Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]
-
-    >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
-    ...     "I 'm a dog".split(),
-    ...     "This is my friends ' cat ( the tabby )".split(),
-    ... ))], [])
-    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
-    Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
-    [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
-    Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', ['-LRB-']),
-    Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', ['-RRB-'])])])])])])]
-
-    >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
-    ...     (
-    ...         ("The", "DT"),
-    ...         ("quick", "JJ"),
-    ...         ("brown", "JJ"),
-    ...         ("fox", "NN"),
-    ...         ("jumped", "VBD"),
-    ...         ("over", "IN"),
-    ...         ("the", "DT"),
-    ...         ("lazy", "JJ"),
-    ...         ("dog", "NN"),
-    ...         (".", "."),
-    ...     ),
-    ... ))],[])
-    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
-    Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
-    [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
-    """
     _MODEL_JAR_PATTERN = r'stanford-parser-(\d+)(\.(\d+))+-models\.jar'
-    _JAR = 'stanford-parser.jar'
+    _JAR = r'stanford-parser\.jar'
+    _MAIN_CLASS = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser'
+
+    _USE_STDIN = False
+    _DOUBLE_SPACED_OUTPUT = False
 
     def __init__(self, path_to_jar=None, path_to_models_jar=None,
                  model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
-                 encoding='utf8', verbose=False, java_options='-mx1000m'):
+                 encoding='utf8', verbose=False,
+                 java_options='-mx1000m', corenlp_options=''):
 
-        self._stanford_jar = find_jar(
-            self._JAR, path_to_jar,
-            env_vars=('STANFORD_PARSER',),
-            searchpath=(), url=_stanford_url,
-            verbose=verbose
+        # find the most recent code and model jar
+        stanford_jar = max(
+            find_jar_iter(
+                self._JAR, path_to_jar,
+                env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'),
+                searchpath=(), url=_stanford_url,
+                verbose=verbose, is_regex=True
+            ),
+            key=lambda model_name: re.match(self._JAR, model_name)
         )
 
-        # find the most recent model
-        self._model_jar=max(
+        model_jar=max(
             find_jar_iter(
                 self._MODEL_JAR_PATTERN, path_to_models_jar,
-                env_vars=('STANFORD_MODELS',),
+                env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'),
                 searchpath=(), url=_stanford_url,
                 verbose=verbose, is_regex=True
             ),
             key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name)
         )
 
+        self._classpath = (stanford_jar, model_jar)
+
         self.model_path = model_path
         self._encoding = encoding
+        self.corenlp_options = corenlp_options
         self.java_options = java_options
 
-    @staticmethod
-    def _parse_trees_output(output_):
+    def _parse_trees_output(self, output_):
         res = []
         cur_lines = []
+        cur_trees = []
+        blank = False
         for line in output_.splitlines(False):
             if line == '':
-                res.append(iter([Tree.fromstring('\n'.join(cur_lines))]))
-                cur_lines = []
+                if blank:
+                    res.append(iter(cur_trees))
+                    cur_trees = []
+                    blank = False
+                elif self._DOUBLE_SPACED_OUTPUT:
+                    cur_trees.append(self._make_tree('\n'.join(cur_lines)))
+                    cur_lines = []
+                    blank = True
+                else:
+                    res.append(iter([self._make_tree('\n'.join(cur_lines))]))
+                    cur_lines = []
             else:
                 cur_lines.append(line)
+                blank = False
         return iter(res)
 
     def parse_sents(self, sentences, verbose=False):
@@ -129,10 +105,10 @@ class StanfordParser(ParserI):
         :rtype: iter(iter(Tree))
         """
         cmd = [
-            'edu.stanford.nlp.parser.lexparser.LexicalizedParser',
+            self._MAIN_CLASS,
             '-model', self.model_path,
             '-sentences', 'newline',
-            '-outputFormat', 'penn',
+            '-outputFormat', self._OUTPUT_FORMAT,
             '-tokenized',
             '-escaper', 'edu.stanford.nlp.process.PTBEscapingProcessor',
         ]
@@ -162,10 +138,10 @@ class StanfordParser(ParserI):
         :rtype: iter(iter(Tree))
         """
         cmd = [
-            'edu.stanford.nlp.parser.lexparser.LexicalizedParser',
+            self._MAIN_CLASS,
             '-model', self.model_path,
             '-sentences', 'newline',
-            '-outputFormat', 'penn',
+            '-outputFormat', self._OUTPUT_FORMAT,
         ]
         return self._parse_trees_output(self._execute(cmd, '\n'.join(sentences), verbose))
 
@@ -193,10 +169,10 @@ class StanfordParser(ParserI):
         """
         tag_separator = '/'
         cmd = [
-            'edu.stanford.nlp.parser.lexparser.LexicalizedParser',
+            self._MAIN_CLASS,
             '-model', self.model_path,
             '-sentences', 'newline',
-            '-outputFormat', 'penn',
+            '-outputFormat', self._OUTPUT_FORMAT,
             '-tokenized',
             '-tagSeparator', tag_separator,
             '-tokenizerFactory', 'edu.stanford.nlp.process.WhitespaceTokenizer',
@@ -209,6 +185,8 @@ class StanfordParser(ParserI):
     def _execute(self, cmd, input_, verbose=False):
         encoding = self._encoding
         cmd.extend(['-encoding', encoding])
+        if self.corenlp_options:
+            cmd.append(self.corenlp_options)
 
         default_options = ' '.join(_java_options)
 
@@ -223,11 +201,16 @@ class StanfordParser(ParserI):
             input_file.write(input_)
             input_file.flush()
 
-            cmd.append(input_file.name)
-
             # Run the tagger and get the output.
-            stdout, stderr = java(cmd, classpath=(self._stanford_jar, self._model_jar),
-                                  stdout=PIPE, stderr=PIPE)
+            if self._USE_STDIN:
+                input_file.seek(0)
+                stdout, stderr = java(cmd, classpath=self._classpath,
+                                      stdin=input_file, stdout=PIPE, stderr=PIPE)
+            else:
+                cmd.append(input_file.name)
+                stdout, stderr = java(cmd, classpath=self._classpath,
+                                      stdout=PIPE, stderr=PIPE)
+
             stdout = stdout.decode(encoding)
 
         os.unlink(input_file.name)
@@ -237,6 +220,173 @@ class StanfordParser(ParserI):
 
         return stdout
 
+class StanfordParser(GenericStanfordParser):
+    """
+    >>> parser=StanfordParser(
+    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
+    ... )
+
+    >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
+    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
+    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]
+
+    >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
+    ...     "the quick brown fox jumps over the lazy dog",
+    ...     "the quick grey wolf jumps over the lazy fox"
+    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
+    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
+    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
+    [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP',
+    [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),
+    Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]
+
+    >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
+    ...     "I 'm a dog".split(),
+    ...     "This is my friends ' cat ( the tabby )".split(),
+    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
+    Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
+    [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
+    Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', ['-LRB-']),
+    Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', ['-RRB-'])])])])])])]
+
+    >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
+    ...     (
+    ...         ("The", "DT"),
+    ...         ("quick", "JJ"),
+    ...         ("brown", "JJ"),
+    ...         ("fox", "NN"),
+    ...         ("jumped", "VBD"),
+    ...         ("over", "IN"),
+    ...         ("the", "DT"),
+    ...         ("lazy", "JJ"),
+    ...         ("dog", "NN"),
+    ...         (".", "."),
+    ...     ),
+    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
+    Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
+    [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
+    """
+
+    _OUTPUT_FORMAT = 'penn'
+
+    def _make_tree(self, result):
+        return Tree.fromstring(result)
+
+
+class StanfordDependencyParser(GenericStanfordParser):
+
+    """
+    >>> dep_parser=StanfordDependencyParser(
+    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
+    ... )
+
+    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
+    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]
+
+    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
+    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
+    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
+    ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
+    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
+
+    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
+    ...     "The quick brown fox jumps over the lazy dog.",
+    ...     "The quick grey wolf jumps over the lazy fox."
+    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),
+    Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]
+
+    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
+    ...     "I 'm a dog".split(),
+    ...     "This is my friends ' cat ( the tabby )".split(),
+    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]
+
+    >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents((
+    ...     (
+    ...         ("The", "DT"),
+    ...         ("quick", "JJ"),
+    ...         ("brown", "JJ"),
+    ...         ("fox", "NN"),
+    ...         ("jumped", "VBD"),
+    ...         ("over", "IN"),
+    ...         ("the", "DT"),
+    ...         ("lazy", "JJ"),
+    ...         ("dog", "NN"),
+    ...         (".", "."),
+    ...     ),
+    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE
+    [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
+    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
+    ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
+    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
+
+    """
+
+    _OUTPUT_FORMAT = 'conll2007'
+
+    def _make_tree(self, result):
+        return DependencyGraph(result, top_relation_label='root')
+
+
+class StanfordNeuralDependencyParser(GenericStanfordParser):
+    '''
+    >>> from nltk.parse.stanford import StanfordNeuralDependencyParser
+    >>> dep_parser=StanfordNeuralDependencyParser()
+
+    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
+    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]
+
+    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
+    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
+    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
+    ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
+    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
+
+    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
+    ...     "The quick brown fox jumps over the lazy dog.",
+    ...     "The quick grey wolf jumps over the lazy fox."
+    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),
+    Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]
+
+    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
+    ...     "I 'm a dog".split(),
+    ...     "This is my friends ' cat ( the tabby )".split(),
+    ... ))], []) # doctest: +NORMALIZE_WHITESPACE
+    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]
+    '''
+
+    _OUTPUT_FORMAT = 'conll'
+    _MAIN_CLASS = 'edu.stanford.nlp.pipeline.StanfordCoreNLP'
+    _JAR = r'stanford-corenlp-(\d+)(\.(\d+))+\.jar'
+    _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)(\.(\d+))+-models\.jar'
+    _USE_STDIN = True
+    _DOUBLE_SPACED_OUTPUT = True
+
+    def __init__(self, *args, **kwargs):
+        super(StanfordNeuralDependencyParser, self).__init__(*args, **kwargs)
+        self.corenlp_options += '-annotators tokenize,ssplit,pos,depparse'
+
+    def tagged_parse_sents(self, sentences, verbose=False):
+        '''
+        Currently unimplemented because the neural dependency parser (and
+        the StanfordCoreNLP pipeline class) doesn't support passing in pre-
+        tagged tokens.
+        '''
+        raise NotImplementedError(
+            'tagged_parse[_sents] is not supported by '
+            'StanfordNeuralDependencyParser; use '
+            'parse[_sents] or raw_parse[_sents] instead.'
+        )
+
+    def _make_tree(self, result):
+        return DependencyGraph(result, top_relation_label='ROOT')
+
 
 def setup_module(module):
     from nose import SkipTest
@@ -245,6 +395,6 @@ def setup_module(module):
         StanfordParser(
             model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
         )
+        StanfordNeuralDependencyParser()
     except LookupError:
-        raise SkipTest('doctests from nltk.parse.stanford are skipped because the stanford parser jar doesn\'t exist')
-    
+        raise SkipTest('doctests from nltk.parse.stanford are skipped because one of the stanford parser or CoreNLP jars doesn\'t exist')
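
A hedged sketch of the new dependency parser class, assuming the Stanford parser and models jars are discoverable via STANFORD_PARSER / STANFORD_MODELS (or STANFORD_CORENLP) as in the doctests above:

    from nltk.parse.stanford import StanfordDependencyParser

    dep_parser = StanfordDependencyParser(
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    # raw_parse() yields DependencyGraph objects built from the parser's
    # conll2007 output with top_relation_label='root'.
    for dg in dep_parser.raw_parse('The quick brown fox jumps over the lazy dog.'):
        print(dg.tree())
        print(list(dg.triples()))
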
diff --git a/nltk/parse/util.py b/nltk/parse/util.py
index 18c5824..c117fe0 100644
--- a/nltk/parse/util.py
+++ b/nltk/parse/util.py
@@ -84,12 +84,12 @@ def taggedsent_to_conll(sentence):
 	>>> text = "This is a foobar sentence."
 	>>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))):
 	... 	print(line, end="")
-	1    This    _    DT    DT    _    0    a    _    _
-	2    is    _    VBZ    VBZ    _    0    a    _    _
-	3    a    _    DT    DT    _    0    a    _    _
-	4    foobar    _    NN    NN    _    0    a    _    _
-	5    sentence    _    NN    NN    _    0    a    _    _
-	6    .    _    .    .    _    0    a    _    _
+        1	This	_	DT	DT	_	0	a	_	_
+        2	is	_	VBZ	VBZ	_	0	a	_	_
+        3	a	_	DT	DT	_	0	a	_	_
+        4	foobar	_	JJ	JJ	_	0	a	_	_
+        5	sentence	_	NN	NN	_	0	a	_	_
+        6	.		_	.	.	_	0	a	_	_
 	
 	:param sentence: A single input sentence to parse
 	:type sentence: list(tuple(str, str))
@@ -115,16 +115,20 @@ def taggedsents_to_conll(sentences):
 	>>> for line in taggedsents_to_conll(sentences):
         ...     if line:
 	...         print(line, end="")
-	1    This    _    DT    DT    _    0    a    _    _
-	2    is    _    VBZ    VBZ    _    0    a    _    _
-	3    a    _    DT    DT    _    0    a    _    _
-	4    foobar    _    NN    NN    _    0    a    _    _
-	5    sentence    _    NN    NN    _    0    a    _    _
-	6    .    _    .    .    _    0    a    _    _
-	1    Is    _    VBZ    VBZ    _    0    a    _    _
-	2    that    _    IN    IN    _    0    a    _    _
-	3    right    _    JJ    JJ    _    0    a    _    _
-	4    ?    _    .    .    _    0    a    _    _
+        1	This	_	DT	DT	_	0	a	_	_
+        2	is	_	VBZ	VBZ	_	0	a	_	_
+        3	a	_	DT	DT	_	0	a	_	_
+        4	foobar	_	JJ	JJ	_	0	a	_	_
+        5	sentence	_	NN	NN	_	0	a	_	_
+        6	.		_	.	.	_	0	a	_	_
+        <BLANKLINE>
+        <BLANKLINE>
+        1	Is	_	VBZ	VBZ	_	0	a	_	_
+        2	that	_	IN	IN	_	0	a	_	_
+        3	right	_	NN	NN	_	0	a	_	_
+        4	?	_	.	.	_	0	a	_	_
+        <BLANKLINE>
+        <BLANKLINE>
 
 	:param sentences: Input sentences to parse
 	:type sentence: list(list(tuple(str, str)))
diff --git a/nltk/sem/glue.py b/nltk/sem/glue.py
index 7b36fef..7ec94a2 100644
--- a/nltk/sem/glue.py
+++ b/nltk/sem/glue.py
@@ -12,7 +12,6 @@ import os
 import nltk
 from nltk.internals import Counter
 from nltk.compat import string_types
-from nltk.corpus import brown
 from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, RegexpTagger
 from nltk.sem.logic import (Expression, Variable, VariableExpression,
                             LambdaExpression, AbstractVariableExpression)
@@ -577,6 +576,7 @@ class Glue(object):
         return return_list
 
     def get_pos_tagger(self):
+        from nltk.corpus import brown
         regexp_tagger = RegexpTagger(
             [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
              (r'(The|the|A|a|An|an)$', 'AT'),   # articles
diff --git a/nltk/sentiment/__init__.py b/nltk/sentiment/__init__.py
new file mode 100644
index 0000000..33477f1
--- /dev/null
+++ b/nltk/sentiment/__init__.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Sentiment Analysis
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Ewan Klein <ewan at inf.ed.ac.uk>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+NLTK Sentiment Analysis Package
+
+"""
+from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer
+from nltk.sentiment.vader import SentimentIntensityAnalyzer
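
Both analyzers are importable from the package top level. A minimal sketch of the VADER entry point (the example sentence is illustrative, and the VADER lexicon resource is assumed to be available locally, e.g. via nltk.download()):

    from nltk.sentiment import SentimentIntensityAnalyzer

    sia = SentimentIntensityAnalyzer()
    # polarity_scores() returns a dict with 'neg', 'neu', 'pos' and a
    # normalized 'compound' score in [-1, 1].
    print(sia.polarity_scores('NLTK 3.1 ships a new sentiment package!'))
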
diff --git a/nltk/sentiment/sentiment_analyzer.py b/nltk/sentiment/sentiment_analyzer.py
new file mode 100644
index 0000000..094b8c7
--- /dev/null
+++ b/nltk/sentiment/sentiment_analyzer.py
@@ -0,0 +1,228 @@
+# coding: utf-8
+#
+# Natural Language Toolkit: Sentiment Analyzer
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A SentimentAnalyzer is a tool to implement and facilitate Sentiment Analysis tasks
+using NLTK features and classifiers, especially for teaching and demonstrative
+purposes.
+"""
+
+from __future__ import print_function
+from collections import defaultdict
+
+from nltk.classify.util import apply_features, accuracy as eval_accuracy
+from nltk.collocations import BigramCollocationFinder
+from nltk.metrics import (BigramAssocMeasures, precision as eval_precision,
+    recall as eval_recall, f_measure as eval_f_measure)
+
+from nltk.probability import FreqDist
+
+from nltk.sentiment.util import save_file, timer
+
+class SentimentAnalyzer(object):
+    """
+    A Sentiment Analysis tool based on machine learning approaches.
+    """
+    def __init__(self, classifier=None):
+        self.feat_extractors = defaultdict(list)
+        self.classifier = classifier
+
+    def all_words(self, documents, labeled=None):
+        """
+        Return all words/tokens from the documents (with duplicates).
+        :param documents: a list of (words, label) tuples.
+        :param labeled: if `True`, assume that each document is represented by a
+            (words, label) tuple: (list(str), str). If `False`, each document is
+            considered as being a simple list of strings: list(str).
+        :rtype: list(str)
+        :return: A list of all words/tokens in `documents`.
+        """
+        all_words = []
+        if labeled is None:
+            labeled = documents and isinstance(documents[0], tuple)
+        if labeled == True:
+            for words, sentiment in documents:
+                all_words.extend(words)
+        elif labeled == False:
+            for words in documents:
+                all_words.extend(words)
+        return all_words
+
+    def apply_features(self, documents, labeled=None):
+        """
+        Apply all feature extractor functions to the documents. This is a wrapper
+        around `nltk.classify.util.apply_features`.
+
+        If `labeled=False`, return featuresets as:
+            [feature_func(doc) for doc in documents]
+        If `labeled=True`, return featuresets as:
+            [(feature_func(tok), label) for (tok, label) in toks]
+
+        :param documents: a list of documents. If `labeled=True`, the method expects
+            a list of (words, label) tuples.
+        :rtype: LazyMap
+        """
+        return apply_features(self.extract_features, documents, labeled)
+
+    def unigram_word_feats(self, words, top_n=None, min_freq=0):
+        """
+        Return most common top_n word features.
+
+        :param words: a list of words/tokens.
+        :param top_n: number of best words/tokens to use, sorted by frequency.
+        :rtype: list(str)
+        :return: A list of `top_n` words/tokens (with no duplicates) sorted by
+            frequency.
+        """
+        # Stopwords are not removed
+        unigram_feats_freqs = FreqDist(word for word in words)
+        return [w for w, f in unigram_feats_freqs.most_common(top_n)
+                if unigram_feats_freqs[w] > min_freq]
+
+    def bigram_collocation_feats(self, documents, top_n=None, min_freq=3,
+                                 assoc_measure=BigramAssocMeasures.pmi):
+        """
+        Return `top_n` bigram features (using `assoc_measure`).
+        Note that this method is based on bigram collocations measures, and not
+        on simple bigram frequency.
+
+        :param documents: a list (or iterable) of tokens.
+        :param top_n: number of best words/tokens to use, sorted by association
+            measure.
+        :param assoc_measure: bigram association measure to use as score function.
+        :param min_freq: the minimum number of occurrences of bigrams to take
+            into consideration.
+
+        :return: `top_n` ngrams scored by the given association measure.
+        """
+        finder = BigramCollocationFinder.from_documents(documents)
+        finder.apply_freq_filter(min_freq)
+        return finder.nbest(assoc_measure, top_n)
+
+    def classify(self, instance):
+        """
+        Classify a single instance applying the features that have already been
+        stored in the SentimentAnalyzer.
+
+        :param instance: a list (or iterable) of tokens.
+        :return: the classification result given by applying the classifier.
+        """
+        instance_feats = self.apply_features([instance], labeled=False)
+        return self.classifier.classify(instance_feats[0])
+
+    def add_feat_extractor(self, function, **kwargs):
+        """
+        Add a new function to extract features from a document. This function will
+        be used in extract_features().
+        Important: in this step our kwargs are only representing additional parameters,
+        and NOT the document we have to parse. The document will always be the first
+        parameter in the parameter list, and it will be added in the extract_features()
+        function.
+
+        :param function: the extractor function to add to the list of feature extractors.
+        :param kwargs: additional parameters required by the `function` function.
+        """
+        self.feat_extractors[function].append(kwargs)
+
+    def extract_features(self, document):
+        """
+        Apply extractor functions (and their parameters) to the present document.
+        We pass `document` as the first parameter of the extractor functions.
+        If we want to use the same extractor function multiple times, we have to
+        add it to the extractors with `add_feat_extractor` using multiple sets of
+        parameters (one for each call of the extractor function).
+
+        :param document: the document that will be passed as argument to the
+            feature extractor functions.
+        :return: A dictionary of populated features extracted from the document.
+        :rtype: dict
+        """
+        all_features = {}
+        for extractor in self.feat_extractors:
+            for param_set in self.feat_extractors[extractor]:
+                feats = extractor(document, **param_set)
+                all_features.update(feats)
+        return all_features
+
+    def train(self, trainer, training_set, save_classifier=None, **kwargs):
+        """
+        Train classifier on the training set, optionally saving the output in the
+        file specified by `save_classifier`.
+        Additional arguments depend on the specific trainer used. For example,
+        a MaxentClassifier can use `max_iter` parameter to specify the number
+        of iterations, while a NaiveBayesClassifier cannot.
+
+        :param trainer: `train` method of a classifier.
+            E.g.: NaiveBayesClassifier.train
+        :param training_set: the training set to be passed as argument to the
+            classifier `train` method.
+        :param save_classifier: the filename of the file where the classifier
+            will be stored (optional).
+        :param kwargs: additional parameters that will be passed as arguments to
+            the classifier `train` function.
+        :return: A classifier instance trained on the training set.
+        """
+        print("Training classifier")
+        self.classifier = trainer(training_set, **kwargs)
+        if save_classifier:
+            save_file(self.classifier, save_classifier)
+
+        return self.classifier
+
+    def evaluate(self, test_set, classifier=None, accuracy=True, f_measure=True,
+                 precision=True, recall=True, verbose=False):
+        """
+        Evaluate and print classifier performance on the test set.
+
+        :param test_set: A list of (tokens, label) tuples to use as gold set.
+        :param classifier: a classifier instance (previously trained).
+        :param accuracy: if `True`, evaluate classifier accuracy.
+        :param f_measure: if `True`, evaluate classifier f_measure.
+        :param precision: if `True`, evaluate classifier precision.
+        :param recall: if `True`, evaluate classifier recall.
+        :return: evaluation results.
+        :rtype: dict
+        """
+        if classifier is None:
+            classifier = self.classifier
+        print("Evaluating {0} results...".format(type(classifier).__name__))
+        metrics_results = {}
+        if accuracy == True:
+            accuracy_score = eval_accuracy(classifier, test_set)
+            metrics_results['Accuracy'] = accuracy_score
+
+        gold_results = defaultdict(set)
+        test_results = defaultdict(set)
+        labels = set()
+        for i, (feats, label) in enumerate(test_set):
+            labels.add(label)
+            gold_results[label].add(i)
+            observed = classifier.classify(feats)
+            test_results[observed].add(i)
+
+        for label in labels:
+            if precision == True:
+                precision_score = eval_precision(gold_results[label],
+                    test_results[label])
+                metrics_results['Precision [{0}]'.format(label)] = precision_score
+            if recall == True:
+                recall_score = eval_recall(gold_results[label],
+                    test_results[label])
+                metrics_results['Recall [{0}]'.format(label)] = recall_score
+            if f_measure == True:
+                f_measure_score = eval_f_measure(gold_results[label],
+                    test_results[label])
+                metrics_results['F-measure [{0}]'.format(label)] = f_measure_score
+
+        # Print evaluation results (in alphabetical order)
+        if verbose == True:
+            for result in sorted(metrics_results):
+                print('{0}: {1}'.format(result, metrics_results[result]))
+
+        return metrics_results
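
Putting the pieces above together, a hedged end-to-end sketch on a toy hand-labelled dataset (a real experiment would read documents from a corpus and use the demo helpers in nltk.sentiment.util):

    from nltk.classify import NaiveBayesClassifier
    from nltk.sentiment import SentimentAnalyzer
    from nltk.sentiment.util import extract_unigram_feats

    train_docs = [(['great', 'movie'], 'pos'), (['awful', 'plot'], 'neg')]
    test_docs = [(['great', 'plot'], 'pos'), (['awful', 'movie'], 'neg')]

    sa = SentimentAnalyzer()
    # Collect unigram features from the training tokens and register the extractor.
    unigrams = sa.unigram_word_feats(sa.all_words(train_docs, labeled=True))
    sa.add_feat_extractor(extract_unigram_feats, unigrams=unigrams)

    training_set = sa.apply_features(train_docs)
    test_set = sa.apply_features(test_docs)
    sa.train(NaiveBayesClassifier.train, training_set)
    print(sa.evaluate(test_set))
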
diff --git a/nltk/sentiment/util.py b/nltk/sentiment/util.py
new file mode 100644
index 0000000..009348a
--- /dev/null
+++ b/nltk/sentiment/util.py
@@ -0,0 +1,752 @@
+# coding: utf-8
+#
+# Natural Language Toolkit: Sentiment Analyzer
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Utility methods for Sentiment Analysis.
+"""
+
+from copy import deepcopy
+import codecs
+import csv
+import json
+import pickle
+import random
+import re
+import sys
+import time
+
+import nltk
+from nltk.corpus import CategorizedPlaintextCorpusReader
+from nltk.data import load
+from nltk.tokenize.casual import EMOTICON_RE
+from nltk.twitter.common import outf_writer_compat, extract_fields
+
+#////////////////////////////////////////////////////////////
+#{ Regular expressions
+#////////////////////////////////////////////////////////////
+
+# Regular expression for negation by Christopher Potts
+NEGATION = r"""
+    (?:
+        ^(?:never|no|nothing|nowhere|noone|none|not|
+            havent|hasnt|hadnt|cant|couldnt|shouldnt|
+            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
+        )$
+    )
+    |
+    n't"""
+
+NEGATION_RE = re.compile(NEGATION, re.VERBOSE)
+
+CLAUSE_PUNCT = r'^[.:;!?]$'
+CLAUSE_PUNCT_RE = re.compile(CLAUSE_PUNCT)
+
+# Happy and sad emoticons
+
+HAPPY = set([
+    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
+    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
+    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
+    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
+    '<3'
+    ])
+
+SAD = set([
+    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
+    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
+    ':c', ':{', '>:\\', ';('
+    ])
+
+def timer(method):
+    """
+    A timer decorator to measure execution performance of methods.
+    """
+    def timed(*args, **kw):
+        start = time.time()
+        result = method(*args, **kw)
+        end = time.time()
+        tot_time = end - start
+        hours = int(tot_time / 3600)
+        mins = int((tot_time / 60) % 60)
+        # in Python 2.x round() will return a float, so we convert it to int
+        secs = int(round(tot_time % 60))
+        if hours == 0 and mins == 0 and secs < 10:
+            print('[TIMER] {0}(): {1:.3f} seconds'.format(method.__name__, tot_time))
+        else:
+            print('[TIMER] {0}(): {1}h {2}m {3}s'.format(method.__name__, hours, mins, secs))
+        return result
+    return timed
+
+#////////////////////////////////////////////////////////////
+#{ Feature extractor functions
+#////////////////////////////////////////////////////////////
+"""
+Feature extractor functions are declared outside the SentimentAnalyzer class.
+Users should have the possibility to create their own feature extractors
+without modifying SentimentAnalyzer.
+"""
+
+def extract_unigram_feats(document, unigrams, handle_negation=False):
+    """
+    Populate a dictionary of unigram features, reflecting the presence/absence in
+    the document of each of the tokens in `unigrams`.
+
+    :param document: a list of words/tokens.
+    :param unigrams: a list of words/tokens whose presence/absence has to be
+        checked in `document`.
+    :param handle_negation: if `handle_negation == True` apply `mark_negation`
+        method to `document` before checking for unigram presence/absence.
+    :return: a dictionary of unigram features {unigram : boolean}.
+
+    >>> words = ['ice', 'police', 'riot']
+    >>> document = 'ice is melting due to global warming'.split()
+    >>> sorted(extract_unigram_feats(document, words).items())
+    [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
+    """
+    features = {}
+    if handle_negation:
+        document = mark_negation(document)
+    for word in unigrams:
+        features['contains({0})'.format(word)] = word in set(document)
+    return features
+
+def extract_bigram_feats(document, bigrams):
+    """
+    Populate a dictionary of bigram features, reflecting the presence/absence in
+    the document of each of the tokens in `bigrams`. This extractor function only
+    considers contiguous bigrams obtained by `nltk.bigrams`.
+
+    :param document: a list of words/tokens.
+    :param bigrams: a list of bigrams whose presence/absence has to be
+        checked in `document`.
+    :return: a dictionary of bigram features {bigram : boolean}.
+
+    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
+    >>> document = 'ice is melting due to global warming'.split()
+    >>> sorted(extract_bigram_feats(document, bigrams).items())
+    [('contains(global - warming)', True), ('contains(love - you)', False),
+    ('contains(police - prevented)', False)]
+    """
+    features = {}
+    for bigr in bigrams:
+        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(document)
+    return features
+
+#////////////////////////////////////////////////////////////
+#{ Helper Functions
+#////////////////////////////////////////////////////////////
+
+def mark_negation(document, double_neg_flip=False, shallow=False):
+    """
+    Append _NEG suffix to words that appear in the scope between a negation
+    and a punctuation mark.
+
+    :param document: a list of words/tokens, or a tuple (words, label).
+    :param shallow: if True, the method will modify the original document in place.
+    :param double_neg_flip: if True, double negation is considered affirmation
+        (we activate/deactivate negation scope every time we find a negation).
+    :return: if `shallow == True` the method will modify the original document
+        and return it. If `shallow == False` the method will return a modified
+        document, leaving the original unmodified.
+
+    >>> sent = "I didn't like this movie . It was bad .".split()
+    >>> mark_negation(sent)
+    ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.']
+    """
+    if not shallow:
+        document = deepcopy(document)
+    # check if the document is labeled. If so, do not consider the label.
+    labeled = document and isinstance(document[0], (tuple, list))
+    if labeled:
+        doc = document[0]
+    else:
+        doc = document
+    neg_scope = False
+    for i, word in enumerate(doc):
+        if NEGATION_RE.search(word):
+            if not neg_scope or (neg_scope and double_neg_flip):
+                neg_scope = not neg_scope
+                continue
+            else:
+                doc[i] += '_NEG'
+        elif neg_scope and CLAUSE_PUNCT_RE.search(word):
+            neg_scope = not neg_scope
+        elif neg_scope and not CLAUSE_PUNCT_RE.search(word):
+            doc[i] += '_NEG'
+
+    return document
+
+def output_markdown(filename, **kwargs):
+    """
+    Write the output of an analysis to a file.
+    """
+    with codecs.open(filename, 'at') as outfile:
+        text = '\n*** \n\n'
+        text += '{0} \n\n'.format(time.strftime("%d/%m/%Y, %H:%M"))
+        for k in sorted(kwargs):
+            if isinstance(kwargs[k], dict):
+                dictionary = kwargs[k]
+                text += '  - **{0}:**\n'.format(k)
+                for entry in sorted(dictionary):
+                    text += '    - {0}: {1} \n'.format(entry, dictionary[entry])
+            elif isinstance(kwargs[k], list):
+                text += '  - **{0}:**\n'.format(k)
+                for entry in kwargs[k]:
+                    text += '    - {0}\n'.format(entry)
+            else:
+                text += '  - **{0}:** {1} \n'.format(k, kwargs[k])
+        outfile.write(text)
+
+def save_file(content, filename):
+    """
+    Store `content` in `filename`. Can be used to store a SentimentAnalyzer.
+    """
+    print("Saving", filename)
+    with codecs.open(filename, 'wb') as storage_file:
+        # The protocol=2 parameter is for python2 compatibility
+        pickle.dump(content, storage_file, protocol=2)
+
+def split_train_test(all_instances, n=None):
+    """
+    Randomly split `n` instances of the dataset into train and test sets.
+
+    :param all_instances: a list of instances (e.g. documents) that will be split.
+    :param n: the number of instances to consider (in case we want to use only a
+        subset).
+    :return: two lists of instances. Train set is 8/10 of the total and test set
+        is 2/10 of the total.
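+
+    For example, splitting 10 instances yields an 8/2 split:
+
+    >>> train_set, test_set = split_train_test(list(range(10)))
+    >>> len(train_set), len(test_set)
+    (8, 2)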
+    """
+    random.seed(12345)
+    random.shuffle(all_instances)
+    if not n or n > len(all_instances):
+        n = len(all_instances)
+    train_set = all_instances[:int(.8*n)]
+    test_set = all_instances[int(.8*n):n]
+
+    return train_set, test_set
+
+def _show_plot(x_values, y_values, x_labels=None, y_labels=None):
+    try:
+        import matplotlib.pyplot as plt
+    except ImportError:
+        raise ImportError('The plot function requires matplotlib to be installed. '
+                          'See http://matplotlib.org/')
+
+    plt.locator_params(axis='y', nbins=3)
+    axes = plt.axes()
+    axes.yaxis.grid()
+    plt.plot(x_values, y_values, 'ro', color='red')
+    plt.ylim(ymin=-1.2, ymax=1.2)
+    plt.tight_layout(pad=5)
+    if x_labels:
+        plt.xticks(x_values, x_labels, rotation='vertical')
+    if y_labels:
+        plt.yticks([-1, 0, 1], y_labels, rotation='horizontal')
+    # Pad margins so that markers are not clipped by the axes
+    plt.margins(0.2)
+    plt.show()
+
+#////////////////////////////////////////////////////////////
+#{ Parsing and conversion functions
+#////////////////////////////////////////////////////////////
+
+def json2csv_preprocess(json_file, outfile, fields, encoding='utf8', errors='replace',
+            gzip_compress=False, skip_retweets=True, skip_tongue_tweets=True,
+            skip_ambiguous_tweets=True, strip_off_emoticons=True, remove_duplicates=True,
+            limit=None):
+    """
+    Convert a json file to a csv file, preprocessing each row to obtain a
+    suitable dataset for tweet sentiment analysis.
+
+    :param json_file: the original json file containing tweets.
+    :param outfile: the output csv filename.
+    :param fields: a list of fields that will be extracted from the json file and
+        kept in the output csv file.
+    :param encoding: the encoding of the files.
+    :param errors: the error handling strategy for the output writer.
+    :param gzip_compress: if True, create a compressed GZIP file.
+
+    :param skip_retweets: if True, remove retweets.
+    :param skip_tongue_tweets: if True, remove tweets containing ":P" and ":-P"
+        emoticons.
+    :param skip_ambiguous_tweets: if True, remove tweets containing both happy
+        and sad emoticons.
+    :param strip_off_emoticons: if True, strip off emoticons from all tweets.
+    :param remove_duplicates: if True, remove tweets appearing more than once.
+    :param limit: an integer to set the number of tweets to convert. After the
+        limit is reached the conversion will stop. It can be useful to create
+        subsets of the original tweets json data.
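+
+    A typical call mirrors the demos below and assumes that the
+    `twitter_samples` corpus has been downloaded:
+
+    >>> from nltk.corpus import twitter_samples  # doctest: +SKIP
+    >>> json2csv_preprocess(twitter_samples.abspath('positive_tweets.json'),  # doctest: +SKIP
+    ...                     'positive_tweets.csv', ['id', 'text'], limit=100)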
+    """
+    with codecs.open(json_file, encoding=encoding) as fp:
+        (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
+        # write the list of fields as header
+        writer.writerow(fields)
+
+        if remove_duplicates == True:
+            tweets_cache = []
+        i = 0
+        for line in fp:
+            tweet = json.loads(line)
+            row = extract_fields(tweet, fields)
+            try:
+                text = row[fields.index('text')]
+                # Remove retweets
+                if skip_retweets == True:
+                    if re.search(r'\bRT\b', text):
+                        continue
+                # Remove tweets containing ":P" and ":-P" emoticons
+                if skip_tongue_tweets == True:
+                    if re.search(r'\:\-?P\b', text):
+                        continue
+                # Remove tweets containing both happy and sad emoticons
+                if skip_ambiguous_tweets == True:
+                    all_emoticons = EMOTICON_RE.findall(text)
+                    if all_emoticons:
+                        if (set(all_emoticons) & HAPPY) and (set(all_emoticons) & SAD):
+                            continue
+                # Strip off emoticons from all tweets
+                if strip_off_emoticons == True:
+                    row[fields.index('text')] = re.sub(r'(?!\n)\s+', ' ', EMOTICON_RE.sub('', text))
+                # Remove duplicate tweets
+                if remove_duplicates == True:
+                    if row[fields.index('text')] in tweets_cache:
+                        continue
+                    else:
+                        tweets_cache.append(row[fields.index('text')])
+            except ValueError:
+                pass
+            writer.writerow(row)
+            i += 1
+            if limit and i >= limit:
+                break
+        outf.close()
+
+def parse_tweets_set(filename, label, word_tokenizer=None, sent_tokenizer=None,
+                     skip_header=True):
+    """
+    Parse a csv file containing tweets and return a list of (text, label) tuples.
+
+    :param filename: the input csv filename.
+    :param label: the label to be appended to each tweet contained in the csv file.
+    :param word_tokenizer: the tokenizer instance that will be used to tokenize
+        each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()).
+        If no word_tokenizer is specified, tweets will not be tokenized.
+    :param sent_tokenizer: the tokenizer that will be used to split each tweet into
+        sentences.
+    :param skip_header: if True, skip the first line of the csv file (which usually
+        contains headers).
+
+    :return: a list of (text, label) tuples.
+    """
+    tweets = []
+    if not sent_tokenizer:
+        sent_tokenizer = load('tokenizers/punkt/english.pickle')
+
+    # If we use Python3.x we can proceed using the 'rt' flag
+    if sys.version_info[0] == 3:
+        with codecs.open(filename, 'rt') as csvfile:
+            reader = csv.reader(csvfile)
+            if skip_header == True:
+                next(reader, None) # skip the header
+            i = 0
+            for tweet_id, text in reader:
+                # text = text[1]
+                i += 1
+                sys.stdout.write('Loaded {0} tweets\r'.format(i))
+                # Apply sentence and word tokenizer to text
+                if word_tokenizer:
+                    tweet = [w for sent in sent_tokenizer.tokenize(text)
+                                       for w in word_tokenizer.tokenize(sent)]
+                else:
+                    tweet = text
+                tweets.append((tweet, label))
+    # If we use Python2.x we need to handle encoding problems
+    elif sys.version_info[0] < 3:
+        with codecs.open(filename) as csvfile:
+            reader = csv.reader(csvfile)
+            if skip_header == True:
+                next(reader, None) # skip the header
+            i = 0
+            for row in reader:
+                unicode_row = [x.decode('utf8') for x in row]
+                text = unicode_row[1]
+                i += 1
+                sys.stdout.write('Loaded {0} tweets\r'.format(i))
+                # Apply sentence and word tokenizer to text
+                if word_tokenizer:
+                    tweet = [w.encode('utf8') for sent in sent_tokenizer.tokenize(text)
+                                       for w in word_tokenizer.tokenize(sent)]
+                else:
+                    tweet = text
+                tweets.append((tweet, label))
+    print("Loaded {0} tweets".format(i))
+    return tweets
+
+#////////////////////////////////////////////////////////////
+#{ Demos
+#////////////////////////////////////////////////////////////
+
+def demo_tweets(trainer, n_instances=None, output=None):
+    """
+    Train and test a classifier on 10000 tweets, tokenized using
+    TweetTokenizer.
+    Features are composed of:
+        - 1000 most frequent unigrams
+        - 100 top bigrams (using BigramAssocMeasures.pmi)
+
+    :param trainer: `train` method of a classifier.
+    :param n_instances: the number of total tweets that have to be used for
+        training and testing. Tweets will be equally split between positive and
+        negative.
+    :param output: the output file where results have to be reported.
+    """
+    from nltk.tokenize import TweetTokenizer
+    from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer
+    from nltk.corpus import twitter_samples, stopwords
+
+    # Different customizations for the TweetTokenizer
+    tokenizer = TweetTokenizer(preserve_case=False)
+    # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
+    # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
+
+    if n_instances is not None:
+        n_instances = int(n_instances/2)
+
+    fields = ['id', 'text']
+    positive_json = twitter_samples.abspath("positive_tweets.json")
+    positive_csv = 'positive_tweets.csv'
+    json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)
+
+    negative_json = twitter_samples.abspath("negative_tweets.json")
+    negative_csv = 'negative_tweets.csv'
+    json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)
+
+    neg_docs = parse_tweets_set(negative_csv, label='neg', word_tokenizer=tokenizer)
+    pos_docs = parse_tweets_set(positive_csv, label='pos', word_tokenizer=tokenizer)
+
+    # We separately split positive and negative instances to keep a balanced
+    # uniform class distribution in both train and test sets.
+    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
+    train_neg_docs, test_neg_docs = split_train_test(neg_docs)
+
+    training_tweets = train_pos_docs+train_neg_docs
+    testing_tweets = test_pos_docs+test_neg_docs
+
+    sentim_analyzer = SentimentAnalyzer()
+    # stopwords = stopwords.words('english')
+    # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords]
+    all_words = [word for word in sentim_analyzer.all_words(training_tweets)]
+
+    # Add simple unigram word features
+    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
+    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
+
+    # Add bigram collocation features
+    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats([tweet[0] for tweet in training_tweets],
+        top_n=100, min_freq=12)
+    sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_collocs_feats)
+
+    training_set = sentim_analyzer.apply_features(training_tweets)
+    test_set = sentim_analyzer.apply_features(testing_tweets)
+
+    classifier = sentim_analyzer.train(trainer, training_set)
+    # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
+    try:
+        classifier.show_most_informative_features()
+    except AttributeError:
+        print('Your classifier does not provide a show_most_informative_features() method.')
+    results = sentim_analyzer.evaluate(test_set)
+
+    if output:
+        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
+        output_markdown(output, Dataset='labeled_tweets', Classifier=type(classifier).__name__,
+                        Tokenizer=tokenizer.__class__.__name__, Feats=extr,
+                        Results=results, Instances=n_instances)
+
+def demo_movie_reviews(trainer, n_instances=None, output=None):
+    """
+    Train and test a classifier on instances of the Movie Reviews dataset.
+    The corpus has been preprocessed using the default sentence tokenizer and
+    WordPunctTokenizer.
+    Features are composed of:
+        - most frequent unigrams
+
+    :param trainer: `train` method of a classifier.
+    :param n_instances: the number of total reviews that have to be used for
+        training and testing. Reviews will be equally split between positive and
+        negative.
+    :param output: the output file where results have to be reported.
+    """
+    from nltk.corpus import movie_reviews
+    from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer
+
+    if n_instances is not None:
+        n_instances = int(n_instances/2)
+
+    pos_docs = [(list(movie_reviews.words(pos_id)), 'pos') for pos_id in movie_reviews.fileids('pos')[:n_instances]]
+    neg_docs = [(list(movie_reviews.words(neg_id)), 'neg') for neg_id in movie_reviews.fileids('neg')[:n_instances]]
+    # We separately split positive and negative instances to keep a balanced
+    # uniform class distribution in both train and test sets.
+    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
+    train_neg_docs, test_neg_docs = split_train_test(neg_docs)
+
+    training_docs = train_pos_docs+train_neg_docs
+    testing_docs = test_pos_docs+test_neg_docs
+
+    sentim_analyzer = SentimentAnalyzer()
+    all_words = sentim_analyzer.all_words(training_docs)
+
+    # Add simple unigram word features
+    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
+    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
+    # Apply features to obtain a feature-value representation of our datasets
+    training_set = sentim_analyzer.apply_features(training_docs)
+    test_set = sentim_analyzer.apply_features(testing_docs)
+
+    classifier = sentim_analyzer.train(trainer, training_set)
+    try:
+        classifier.show_most_informative_features()
+    except AttributeError:
+        print('Your classifier does not provide a show_most_informative_features() method.')
+    results = sentim_analyzer.evaluate(test_set)
+
+    if output:
+        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
+        output_markdown(output, Dataset='Movie_reviews', Classifier=type(classifier).__name__,
+                        Tokenizer='WordPunctTokenizer', Feats=extr, Results=results,
+                        Instances=n_instances)
+
+def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
+    """
+    Train and test a classifier on instances of the Subjective Dataset by Pang and
+    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
+    All tokens (words and punctuation marks) are separated by whitespace, so
+    we use the basic WhitespaceTokenizer to parse the data.
+
+    :param trainer: `train` method of a classifier.
+    :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file.
+    :param n_instances: the number of total sentences that have to be used for
+        training and testing. Sentences will be equally split between
+        subjective and objective.
+    :param output: the output file where results have to be reported.
+    """
+    from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer
+    from nltk.corpus import subjectivity
+
+    if n_instances is not None:
+        n_instances = int(n_instances/2)
+
+    subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
+    obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
+
+    # We separately split subjective and objective instances to keep a balanced
+    # uniform class distribution in both train and test sets.
+    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
+    train_obj_docs, test_obj_docs = split_train_test(obj_docs)
+
+    training_docs = train_subj_docs+train_obj_docs
+    testing_docs = test_subj_docs+test_obj_docs
+
+    sentim_analyzer = SentimentAnalyzer()
+    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
+
+    # Add simple unigram word features handling negation
+    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
+    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
+
+    # Apply features to obtain a feature-value representation of our datasets
+    training_set = sentim_analyzer.apply_features(training_docs)
+    test_set = sentim_analyzer.apply_features(testing_docs)
+
+    classifier = sentim_analyzer.train(trainer, training_set)
+    try:
+        classifier.show_most_informative_features()
+    except AttributeError:
+        print('Your classifier does not provide a show_most_informative_features() method.')
+    results = sentim_analyzer.evaluate(test_set)
+
+    if save_analyzer == True:
+        save_file(sentim_analyzer, 'sa_subjectivity.pickle')
+
+    if output:
+        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
+        output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__,
+                        Tokenizer='WhitespaceTokenizer', Feats=extr,
+                        Instances=n_instances, Results=results)
+
+    return sentim_analyzer
+
+def demo_sent_subjectivity(text):
+    """
+    Classify a single sentence as subjective or objective using a stored
+    SentimentAnalyzer.
+
+    :param text: a sentence whose subjectivity has to be classified.
+    """
+    from nltk.classify import NaiveBayesClassifier
+    from nltk.tokenize import regexp
+    word_tokenizer = regexp.WhitespaceTokenizer()
+    try:
+        sentim_analyzer = load('sa_subjectivity.pickle')
+    except LookupError:
+        print('Cannot find the sentiment analyzer you want to load.')
+        print('Training a new one using NaiveBayesClassifier.')
+        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)
+
+    # Tokenize and convert to lower case
+    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
+    print(sentim_analyzer.classify(tokenized_text))
+
+def demo_liu_hu_lexicon(sentence, plot=False):
+    """
+    Basic example of sentiment classification using Liu and Hu opinion lexicon.
+    This function simply counts the number of positive, negative and neutral words
+    in the sentence and classifies it depending on which polarity is more represented.
+    Words that do not appear in the lexicon are considered neutral.
+
+    :param sentence: a sentence whose polarity has to be classified.
+    :param plot: if True, plot a visual representation of the sentence polarity.
+    """
+    from nltk.corpus import opinion_lexicon
+    from nltk.tokenize import treebank
+
+    tokenizer = treebank.TreebankWordTokenizer()
+    pos_words = 0
+    neg_words = 0
+    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]
+
+    x = list(range(len(tokenized_sent))) # x axis for the plot
+    y = []
+
+    for word in tokenized_sent:
+        if word in opinion_lexicon.positive():
+            pos_words += 1
+            y.append(1) # positive
+        elif word in opinion_lexicon.negative():
+            neg_words += 1
+            y.append(-1) # negative
+        else:
+            y.append(0) # neutral
+
+    if pos_words > neg_words:
+        print('Positive')
+    elif pos_words < neg_words:
+        print('Negative')
+    elif pos_words == neg_words:
+        print('Neutral')
+
+    if plot == True:
+        _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive'])
+
+def demo_vader_instance(text):
+    """
+    Output polarity scores for a text using the Vader approach.
+
+    :param text: a text whose polarity has to be evaluated.
+    """
+    from nltk.sentiment.vader import SentimentIntensityAnalyzer
+    vader_analyzer = SentimentIntensityAnalyzer()
+    print(vader_analyzer.polarity_scores(text))
+
+def demo_vader_tweets(n_instances=None, output=None):
+    """
+    Classify 10000 positive and negative tweets using the Vader approach.
+
+    :param n_instances: the number of total tweets that have to be classified.
+    :param output: the output file where results have to be reported.
+    """
+    from collections import defaultdict
+    from nltk.corpus import twitter_samples
+    from nltk.sentiment.vader import SentimentIntensityAnalyzer
+    from nltk.metrics import (accuracy as eval_accuracy, precision as eval_precision,
+        recall as eval_recall, f_measure as eval_f_measure)
+
+    if n_instances is not None:
+        n_instances = int(n_instances/2)
+
+    fields = ['id', 'text']
+    positive_json = twitter_samples.abspath("positive_tweets.json")
+    positive_csv = 'positive_tweets.csv'
+    json2csv_preprocess(positive_json, positive_csv, fields, strip_off_emoticons=False,
+                        limit=n_instances)
+
+    negative_json = twitter_samples.abspath("negative_tweets.json")
+    negative_csv = 'negative_tweets.csv'
+    json2csv_preprocess(negative_json, negative_csv, fields, strip_off_emoticons=False,
+                        limit=n_instances)
+
+    pos_docs = parse_tweets_set(positive_csv, label='pos')
+    neg_docs = parse_tweets_set(negative_csv, label='neg')
+
+    # We separately split positive and negative instances to keep a balanced
+    # uniform class distribution in both train and test sets.
+    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
+    train_neg_docs, test_neg_docs = split_train_test(neg_docs)
+
+    training_tweets = train_pos_docs+train_neg_docs
+    testing_tweets = test_pos_docs+test_neg_docs
+
+    vader_analyzer = SentimentIntensityAnalyzer()
+
+    gold_results = defaultdict(set)
+    test_results = defaultdict(set)
+    acc_gold_results = []
+    acc_test_results = []
+    labels = set()
+    num = 0
+    for i, (text, label) in enumerate(testing_tweets):
+        labels.add(label)
+        gold_results[label].add(i)
+        acc_gold_results.append(label)
+        score = vader_analyzer.polarity_scores(text)['compound']
+        if score > 0:
+            observed = 'pos'
+        else:
+            observed = 'neg'
+        num += 1
+        acc_test_results.append(observed)
+        test_results[observed].add(i)
+    metrics_results = {}
+    for label in labels:
+        accuracy_score = eval_accuracy(acc_gold_results,
+            acc_test_results)
+        metrics_results['Accuracy'] = accuracy_score
+        precision_score = eval_precision(gold_results[label],
+            test_results[label])
+        metrics_results['Precision [{0}]'.format(label)] = precision_score
+        recall_score = eval_recall(gold_results[label],
+            test_results[label])
+        metrics_results['Recall [{0}]'.format(label)] = recall_score
+        f_measure_score = eval_f_measure(gold_results[label],
+            test_results[label])
+        metrics_results['F-measure [{0}]'.format(label)] = f_measure_score
+
+    for result in sorted(metrics_results):
+            print('{0}: {1}'.format(result, metrics_results[result]))
+
+    if output:
+        output_markdown(output, Approach='Vader', Dataset='labeled_tweets',
+            Instances=n_instances, Results=metrics_results)
+
+if __name__ == '__main__':
+    from nltk.classify import NaiveBayesClassifier, MaxentClassifier
+    from nltk.classify.scikitlearn import SklearnClassifier
+    from sklearn.svm import LinearSVC
+
+    naive_bayes = NaiveBayesClassifier.train
+    svm = SklearnClassifier(LinearSVC()).train
+    maxent = MaxentClassifier.train
+
+    demo_tweets(naive_bayes)
+    # demo_movie_reviews(svm)
+    # demo_subjectivity(svm)
+    # demo_sent_subjectivity("she's an artist , but hasn't picked up a brush in a year . ")
+    # demo_liu_hu_lexicon("This movie was actually neither that funny, nor super witty.", plot=True)
+    # demo_vader_instance("This movie was actually neither that funny, nor super witty.")
+    # demo_vader_tweets()
diff --git a/nltk/sentiment/vader.py b/nltk/sentiment/vader.py
new file mode 100644
index 0000000..9f826ad
--- /dev/null
+++ b/nltk/sentiment/vader.py
@@ -0,0 +1,445 @@
+# coding: utf-8
+# Natural Language Toolkit: vader
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: C.J. Hutto <Clayton.Hutto at gtri.gatech.edu>
+#         Ewan Klein <ewan at inf.ed.ac.uk> (modifications)
+#         Pierpaolo Pantone <24alsecondo at gmail.com> (modifications)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+# Modifications to the original VADER code have been made in order to
+# integrate it into NLTK. These have involved changes to
+# ensure Python 3 compatibility, and refactoring to achieve greater modularity.
+
+"""
+If you use the VADER sentiment analysis tools, please cite:
+
+Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
+Sentiment Analysis of Social Media Text. Eighth International Conference on
+Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
+"""
+
+import codecs
+import math
+import os
+import re
+import string
+
+##Constants##
+
+# (empirically derived mean sentiment intensity rating increase for booster words)
+B_INCR = 0.293
+B_DECR = -0.293
+
+# (empirically derived mean sentiment intensity rating increase for using
+# ALLCAPs to emphasize a word)
+C_INCR = 0.733
+
+N_SCALAR = -0.74
+
+# for removing punctuation
+REGEX_REMOVE_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))
+
+PUNC_LIST = [".", "!", "?", ",", ";", ":", "-", "'", "\"",
+             "!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?"]
+NEGATE = \
+["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
+ "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
+ "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
+ "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
+ "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
+ "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
+ "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
+ "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]
+
+# booster/dampener 'intensifiers' or 'degree adverbs'
+# http://en.wiktionary.org/wiki/Category:English_degree_adverbs
+
+BOOSTER_DICT = \
+{"absolutely": B_INCR, "amazingly": B_INCR, "awfully": B_INCR, "completely": B_INCR, "considerably": B_INCR,
+ "decidedly": B_INCR, "deeply": B_INCR, "effing": B_INCR, "enormously": B_INCR,
+ "entirely": B_INCR, "especially": B_INCR, "exceptionally": B_INCR, "extremely": B_INCR,
+ "fabulously": B_INCR, "flipping": B_INCR, "flippin": B_INCR,
+ "fricking": B_INCR, "frickin": B_INCR, "frigging": B_INCR, "friggin": B_INCR, "fully": B_INCR, "fucking": B_INCR,
+ "greatly": B_INCR, "hella": B_INCR, "highly": B_INCR, "hugely": B_INCR, "incredibly": B_INCR,
+ "intensely": B_INCR, "majorly": B_INCR, "more": B_INCR, "most": B_INCR, "particularly": B_INCR,
+ "purely": B_INCR, "quite": B_INCR, "really": B_INCR, "remarkably": B_INCR,
+ "so": B_INCR, "substantially": B_INCR,
+ "thoroughly": B_INCR, "totally": B_INCR, "tremendously": B_INCR,
+ "uber": B_INCR, "unbelievably": B_INCR, "unusually": B_INCR, "utterly": B_INCR,
+ "very": B_INCR,
+ "almost": B_DECR, "barely": B_DECR, "hardly": B_DECR, "just enough": B_DECR,
+ "kind of": B_DECR, "kinda": B_DECR, "kindof": B_DECR, "kind-of": B_DECR,
+ "less": B_DECR, "little": B_DECR, "marginally": B_DECR, "occasionally": B_DECR, "partly": B_DECR,
+ "scarcely": B_DECR, "slightly": B_DECR, "somewhat": B_DECR,
+ "sort of": B_DECR, "sorta": B_DECR, "sortof": B_DECR, "sort-of": B_DECR}
+
+# check for special case idioms using a sentiment-laden keyword known to SAGE
+SPECIAL_CASE_IDIOMS = {"the shit": 3, "the bomb": 3, "bad ass": 1.5, "yeah right": -2,
+                       "cut the mustard": 2, "kiss of death": -1.5, "hand to mouth": -2}
+
+
+##Static methods##
+
+def negated(input_words, include_nt=True):
+    """
+    Determine if input contains negation words
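+
+    For example, "not" triggers negation while a plain sentence does not:
+
+    >>> negated(["not", "a", "good", "movie"])
+    True
+    >>> negated(["a", "good", "movie"])
+    False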
+    """
+    neg_words = []
+    neg_words.extend(NEGATE)
+    for word in neg_words:
+        if word in input_words:
+            return True
+    if include_nt:
+        for word in input_words:
+            if "n't" in word:
+                return True
+    if "least" in input_words:
+        i = input_words.index("least")
+        if i > 0 and input_words[i-1] != "at":
+            return True
+    return False
+
+
+def normalize(score, alpha=15):
+    """
+    Normalize the score to be between -1 and 1 using an alpha that
+    approximates the max expected value
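+
+    For example, a zero score stays at zero, while a raw score of 4 is
+    squashed into the (-1, 1) range:
+
+    >>> normalize(0)
+    0.0
+    >>> round(normalize(4), 4)
+    0.7184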
+    """
+    norm_score = score/math.sqrt((score*score) + alpha)
+    return norm_score
+
+
+def allcap_differential(words):
+    """
+    Check whether just some words in the input are ALL CAPS
+
+    :param list words: The words to inspect
+    :returns: `True` if some but not all items in `words` are ALL CAPS
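+
+    For example, mixed casing returns True, while uniform casing does not:
+
+    >>> allcap_differential(["VADER", "is", "smart"])
+    True
+    >>> allcap_differential(["VADER", "IS", "SMART"])
+    False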
+    """
+    is_different = False
+    allcap_words = 0
+    for word in words:
+        if word.isupper():
+            allcap_words += 1
+    cap_differential = len(words) - allcap_words
+    if cap_differential > 0 and cap_differential < len(words):
+        is_different = True
+    return is_different
+
+
+def scalar_inc_dec(word, valence, is_cap_diff):
+    """
+    Check if the preceding words increase, decrease, or negate/nullify the
+    valence
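+
+    For example, a lower-case booster contributes B_INCR, and an ALL-CAPS
+    booster in mixed-case text additionally contributes C_INCR:
+
+    >>> scalar_inc_dec("very", 1.9, False)
+    0.293
+    >>> round(scalar_inc_dec("VERY", 1.9, True), 3)
+    1.026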
+    """
+    scalar = 0.0
+    word_lower = word.lower()
+    if word_lower in BOOSTER_DICT:
+        scalar = BOOSTER_DICT[word_lower]
+        if valence < 0:
+            scalar *= -1
+        #check if booster/dampener word is in ALLCAPS (while others aren't)
+        if word.isupper() and is_cap_diff:
+            if valence > 0:
+                scalar += C_INCR
+            else: scalar -= C_INCR
+    return scalar
+
+class SentiText(object):
+    """
+    Identify sentiment-relevant string-level properties of input text.
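+
+    For example, trailing punctuation is stripped from words while the
+    original casing is kept:
+
+    >>> st = SentiText("VADER is smart, handsome, and funny!")
+    >>> st.words_and_emoticons
+    ['VADER', 'is', 'smart', 'handsome', 'and', 'funny']
+    >>> st.is_cap_diff
+    True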
+    """
+    def __init__(self, text):
+        if not isinstance(text, str):
+            text = str(text.encode('utf-8'))
+        self.text = text
+        self.words_and_emoticons = self._words_and_emoticons()
+        # does not separate words from adjacent punctuation
+        # (keeps emoticons & contractions)
+        self.is_cap_diff = allcap_differential(self.words_and_emoticons)
+
+    def _words_only(self):
+        text_mod = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
+        # removes punctuation (but loses emoticons & contractions)
+        words_only = text_mod.split()
+        # get rid of empty items or single letter "words" like 'a' and 'I'
+        words_only = [word for word in words_only if len(word) > 1]
+        return words_only
+
+    def _words_and_emoticons(self):
+        wes = self.text.split()
+
+        # get rid of residual empty items or single letter words
+        wes = [we for we in wes if len(we) > 1]
+
+        for word in self._words_only():
+            for punct in PUNC_LIST:
+                pword = punct + word
+                x1 = wes.count(pword)
+                while x1 > 0:
+                    i = wes.index(pword)
+                    wes.remove(pword)
+                    wes.insert(i, word)
+                    x1 = wes.count(pword)
+
+                wordp = word + punct
+                x2 = wes.count(wordp)
+                while x2 > 0:
+                    i = wes.index(wordp)
+                    wes.remove(wordp)
+                    wes.insert(i, word)
+                    x2 = wes.count(wordp)
+        return wes
+
+class SentimentIntensityAnalyzer(object):
+    """
+    Give a sentiment intensity score to sentences.
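+
+    A minimal usage sketch (skipped in doctests because it needs the bundled
+    `vader_lexicon.txt`; the exact scores shown are indicative and depend on
+    the lexicon version):
+
+    >>> sia = SentimentIntensityAnalyzer()  # doctest: +SKIP
+    >>> sia.polarity_scores("VADER is smart, handsome, and funny!")  # doctest: +SKIP
+    {'neg': 0.0, 'neu': 0.254, 'pos': 0.746, 'compound': 0.8316}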
+    """
+    def __init__(self, lexicon_file="vader_lexicon.txt"):
+        self.lexicon_file = os.path.join(os.path.dirname(__file__), lexicon_file)
+        self.lexicon = self.make_lex_dict()
+
+    def make_lex_dict(self):
+        """
+        Convert lexicon file to a dictionary
+        """
+        lex_dict = {}
+        with codecs.open(self.lexicon_file, encoding='utf8') as infile:
+            for line in infile:
+                (word, measure) = line.strip().split('\t')[0:2]
+                lex_dict[word] = float(measure)
+        return lex_dict
+
+    def polarity_scores(self, text):
+        """
+        Return a float for sentiment strength based on the input text.
+        Positive values are positive valence, negative values are negative
+        valence.
+        """
+        sentitext = SentiText(text)
+        #text, words_and_emoticons, is_cap_diff = self.preprocess(text)
+
+        sentiments = []
+        words_and_emoticons = sentitext.words_and_emoticons
+        # enumerate() keeps the correct position even when a token repeats
+        for i, item in enumerate(words_and_emoticons):
+            valence = 0
+            if (i < len(words_and_emoticons) - 1 and item.lower() == "kind" and \
+                words_and_emoticons[i+1].lower() == "of") or \
+                item.lower() in BOOSTER_DICT:
+                sentiments.append(valence)
+                continue
+
+            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)
+
+        sentiments = self._but_check(words_and_emoticons, sentiments)
+
+        return self.score_valence(sentiments, text)
+
+    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
+        is_cap_diff = sentitext.is_cap_diff
+        words_and_emoticons = sentitext.words_and_emoticons
+        item_lowercase = item.lower()
+        if item_lowercase in self.lexicon:
+            #get the sentiment valence
+            valence = self.lexicon[item_lowercase]
+
+            #check if sentiment laden word is in ALL CAPS (while others aren't)
+            if item.isupper() and is_cap_diff:
+                if valence > 0:
+                    valence += C_INCR
+                else:
+                    valence -= C_INCR
+
+            for start_i in range(0,3):
+                if i > start_i and words_and_emoticons[i-(start_i+1)].lower() not in self.lexicon:
+                    # dampen the scalar modifier of preceding words and emoticons
+                    # (excluding the ones that immediately preceed the item) based
+                    # on their distance from the current item.
+                    s = scalar_inc_dec(words_and_emoticons[i-(start_i+1)], valence, is_cap_diff)
+                    if start_i == 1 and s != 0:
+                        s = s*0.95
+                    if start_i == 2 and s != 0:
+                        s = s*0.9
+                    valence = valence+s
+                    valence = self._never_check(valence, words_and_emoticons, start_i, i)
+                    if start_i == 2:
+                        valence = self._idioms_check(valence, words_and_emoticons, i)
+
+                        # future work: consider other sentiment-laden idioms
+                        # other_idioms =
+                        # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
+                        #  "upper hand": 1, "break a leg": 2,
+                        #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
+                        #  "on the ball": 2,"under the weather": -2}
+
+            valence = self._least_check(valence, words_and_emoticons, i)
+
+        sentiments.append(valence)
+        return sentiments
+
+    def _least_check(self, valence, words_and_emoticons, i):
+        # check for negation case using "least"
+        if i > 1 and words_and_emoticons[i-1].lower() not in self.lexicon \
+           and words_and_emoticons[i-1].lower() == "least":
+            if words_and_emoticons[i-2].lower() != "at" and words_and_emoticons[i-2].lower() != "very":
+                valence = valence*N_SCALAR
+        elif i > 0 and words_and_emoticons[i-1].lower() not in self.lexicon \
+             and words_and_emoticons[i-1].lower() == "least":
+            valence = valence*N_SCALAR
+        return valence
+
+    def _but_check(self, words_and_emoticons, sentiments):
+        # check for modification in sentiment due to contrastive conjunction 'but'
+        if 'but' in words_and_emoticons or 'BUT' in words_and_emoticons:
+            try:
+                bi = words_and_emoticons.index('but')
+            except ValueError:
+                bi = words_and_emoticons.index('BUT')
+            # enumerate() keeps the correct position even when a value repeats
+            for si, sentiment in enumerate(sentiments):
+                if si < bi:
+                    sentiments[si] = sentiment*0.5
+                elif si > bi:
+                    sentiments[si] = sentiment*1.5
+        return sentiments
+
+    def _idioms_check(self, valence, words_and_emoticons, i):
+        onezero = "{0} {1}".format(words_and_emoticons[i-1], words_and_emoticons[i])
+
+        twoonezero = "{0} {1} {2}".format(words_and_emoticons[i-2],
+                                       words_and_emoticons[i-1], words_and_emoticons[i])
+
+        twoone = "{0} {1}".format(words_and_emoticons[i-2], words_and_emoticons[i-1])
+
+        threetwoone = "{0} {1} {2}".format(words_and_emoticons[i-3],
+                                        words_and_emoticons[i-2], words_and_emoticons[i-1])
+
+        threetwo = "{0} {1}".format(words_and_emoticons[i-3], words_and_emoticons[i-2])
+
+        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]
+
+        for seq in sequences:
+            if seq in SPECIAL_CASE_IDIOMS:
+                valence = SPECIAL_CASE_IDIOMS[seq]
+                break
+
+        if len(words_and_emoticons)-1 > i:
+            zeroone = "{0} {1}".format(words_and_emoticons[i], words_and_emoticons[i+1])
+            if zeroone in SPECIAL_CASE_IDIOMS:
+                valence = SPECIAL_CASE_IDIOMS[zeroone]
+        if len(words_and_emoticons)-1 > i+1:
+            zeroonetwo = "{0} {1} {2}".format(words_and_emoticons[i], words_and_emoticons[i+1], words_and_emoticons[i+2])
+            if zeroonetwo in SPECIAL_CASE_IDIOMS:
+                valence = SPECIAL_CASE_IDIOMS[zeroonetwo]
+
+        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
+        if threetwo in BOOSTER_DICT or twoone in BOOSTER_DICT:
+            valence = valence+B_DECR
+        return valence
+
+    def _never_check(self, valence, words_and_emoticons, start_i, i):
+        if start_i == 0:
+            if negated([words_and_emoticons[i-1]]):
+                valence = valence*N_SCALAR
+        if start_i == 1:
+            if words_and_emoticons[i-2] == "never" and\
+               (words_and_emoticons[i-1] == "so" or
+                words_and_emoticons[i-1] == "this"):
+                valence = valence*1.5
+            elif negated([words_and_emoticons[i-(start_i+1)]]):
+                valence = valence*N_SCALAR
+        if start_i == 2:
+            if words_and_emoticons[i-3] == "never" and \
+               (words_and_emoticons[i-2] == "so" or words_and_emoticons[i-2] == "this") or \
+               (words_and_emoticons[i-1] == "so" or words_and_emoticons[i-1] == "this"):
+                valence = valence*1.25
+            elif negated([words_and_emoticons[i-(start_i+1)]]):
+                valence = valence*N_SCALAR
+        return valence
+
+    def _punctuation_emphasis(self, sum_s, text):
+        # add emphasis from exclamation points and question marks
+        ep_amplifier = self._amplify_ep(text)
+        qm_amplifier = self._amplify_qm(text)
+        punct_emph_amplifier = ep_amplifier+qm_amplifier
+        return punct_emph_amplifier
+
+    def _amplify_ep(self, text):
+        # check for added emphasis resulting from exclamation points (up to 4 of them)
+        ep_count = text.count("!")
+        if ep_count > 4:
+            ep_count = 4
+        # (empirically derived mean sentiment intensity rating increase for
+        # exclamation points)
+        ep_amplifier = ep_count*0.292
+        return ep_amplifier
+
+    def _amplify_qm(self, text):
+        # check for added emphasis resulting from question marks (2 or 3+)
+        qm_count = text.count("?")
+        qm_amplifier = 0
+        if qm_count > 1:
+            if qm_count <= 3:
+                # (empirically derived mean sentiment intensity rating increase for
+                # question marks)
+                qm_amplifier = qm_count*0.18
+            else:
+                qm_amplifier = 0.96
+        return qm_amplifier
+
+    def _sift_sentiment_scores(self, sentiments):
+        # want separate positive versus negative sentiment scores
+        pos_sum = 0.0
+        neg_sum = 0.0
+        neu_count = 0
+        for sentiment_score in sentiments:
+            if sentiment_score > 0:
+                pos_sum += (float(sentiment_score) +1) # compensates for neutral words that are counted as 1
+            if sentiment_score < 0:
+                neg_sum += (float(sentiment_score) -1) # when used with math.fabs(), compensates for neutrals
+            if sentiment_score == 0:
+                neu_count += 1
+        return pos_sum, neg_sum, neu_count
+
+    def score_valence(self, sentiments, text):
+        if sentiments:
+            sum_s = float(sum(sentiments))
+            # compute and add emphasis from punctuation in text
+            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
+            if sum_s > 0:
+                sum_s += punct_emph_amplifier
+            elif sum_s < 0:
+                sum_s -= punct_emph_amplifier
+
+            compound = normalize(sum_s)
+            # discriminate between positive, negative and neutral sentiment scores
+            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)
+
+            if pos_sum > math.fabs(neg_sum):
+                pos_sum += (punct_emph_amplifier)
+            elif pos_sum < math.fabs(neg_sum):
+                neg_sum -= (punct_emph_amplifier)
+
+            total = pos_sum + math.fabs(neg_sum) + neu_count
+            pos = math.fabs(pos_sum / total)
+            neg = math.fabs(neg_sum / total)
+            neu = math.fabs(neu_count / total)
+
+        else:
+            compound = 0.0
+            pos = 0.0
+            neg = 0.0
+            neu = 0.0
+
+        sentiment_dict = \
+            {"neg" : round(neg, 3),
+             "neu" : round(neu, 3),
+             "pos" : round(pos, 3),
+             "compound" : round(compound, 4)}
+
+        return sentiment_dict
diff --git a/nltk/six.py b/nltk/six.py
new file mode 100644
index 0000000..190c023
--- /dev/null
+++ b/nltk/six.py
@@ -0,0 +1,868 @@
+"""Utilities for writing code that runs on Python 2 and 3"""
+
+# Copyright (c) 2010-2015 Benjamin Peterson
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from __future__ import absolute_import
+
+import functools
+import itertools
+import operator
+import sys
+import types
+
+__author__ = "Benjamin Peterson <benjamin at python.org>"
+__version__ = "1.10.0"
+
+
+# Useful for very coarse version differentiation.
+PY2 = sys.version_info[0] == 2
+PY3 = sys.version_info[0] == 3
+PY34 = sys.version_info[0:2] >= (3, 4)
+
+if PY3:
+    string_types = str,
+    integer_types = int,
+    class_types = type,
+    text_type = str
+    binary_type = bytes
+
+    MAXSIZE = sys.maxsize
+else:
+    string_types = basestring,
+    integer_types = (int, long)
+    class_types = (type, types.ClassType)
+    text_type = unicode
+    binary_type = str
+
+    if sys.platform.startswith("java"):
+        # Jython always uses 32 bits.
+        MAXSIZE = int((1 << 31) - 1)
+    else:
+        # It's possible to have sizeof(long) != sizeof(Py_ssize_t).
+        class X(object):
+
+            def __len__(self):
+                return 1 << 31
+        try:
+            len(X())
+        except OverflowError:
+            # 32-bit
+            MAXSIZE = int((1 << 31) - 1)
+        else:
+            # 64-bit
+            MAXSIZE = int((1 << 63) - 1)
+        del X
+
+
+def _add_doc(func, doc):
+    """Add documentation to a function."""
+    func.__doc__ = doc
+
+
+def _import_module(name):
+    """Import module, returning the module after the last dot."""
+    __import__(name)
+    return sys.modules[name]
+
+
+class _LazyDescr(object):
+
+    def __init__(self, name):
+        self.name = name
+
+    def __get__(self, obj, tp):
+        result = self._resolve()
+        setattr(obj, self.name, result)  # Invokes __set__.
+        try:
+            # This is a bit ugly, but it avoids running this again by
+            # removing this descriptor.
+            delattr(obj.__class__, self.name)
+        except AttributeError:
+            pass
+        return result
+
+
+class MovedModule(_LazyDescr):
+
+    def __init__(self, name, old, new=None):
+        super(MovedModule, self).__init__(name)
+        if PY3:
+            if new is None:
+                new = name
+            self.mod = new
+        else:
+            self.mod = old
+
+    def _resolve(self):
+        return _import_module(self.mod)
+
+    def __getattr__(self, attr):
+        _module = self._resolve()
+        value = getattr(_module, attr)
+        setattr(self, attr, value)
+        return value
+
+
+class _LazyModule(types.ModuleType):
+
+    def __init__(self, name):
+        super(_LazyModule, self).__init__(name)
+        self.__doc__ = self.__class__.__doc__
+
+    def __dir__(self):
+        attrs = ["__doc__", "__name__"]
+        attrs += [attr.name for attr in self._moved_attributes]
+        return attrs
+
+    # Subclasses should override this
+    _moved_attributes = []
+
+
+class MovedAttribute(_LazyDescr):
+
+    def __init__(self, name, old_mod, new_mod, old_attr=None, new_attr=None):
+        super(MovedAttribute, self).__init__(name)
+        if PY3:
+            if new_mod is None:
+                new_mod = name
+            self.mod = new_mod
+            if new_attr is None:
+                if old_attr is None:
+                    new_attr = name
+                else:
+                    new_attr = old_attr
+            self.attr = new_attr
+        else:
+            self.mod = old_mod
+            if old_attr is None:
+                old_attr = name
+            self.attr = old_attr
+
+    def _resolve(self):
+        module = _import_module(self.mod)
+        return getattr(module, self.attr)
+
+
+class _SixMetaPathImporter(object):
+
+    """
+    A meta path importer to import six.moves and its submodules.
+
+    This class implements a PEP302 finder and loader. It should be compatible
+    with Python 2.5 and all existing versions of Python3
+    """
+
+    def __init__(self, six_module_name):
+        self.name = six_module_name
+        self.known_modules = {}
+
+    def _add_module(self, mod, *fullnames):
+        for fullname in fullnames:
+            self.known_modules[self.name + "." + fullname] = mod
+
+    def _get_module(self, fullname):
+        return self.known_modules[self.name + "." + fullname]
+
+    def find_module(self, fullname, path=None):
+        if fullname in self.known_modules:
+            return self
+        return None
+
+    def __get_module(self, fullname):
+        try:
+            return self.known_modules[fullname]
+        except KeyError:
+            raise ImportError("This loader does not know module " + fullname)
+
+    def load_module(self, fullname):
+        try:
+            # in case of a reload
+            return sys.modules[fullname]
+        except KeyError:
+            pass
+        mod = self.__get_module(fullname)
+        if isinstance(mod, MovedModule):
+            mod = mod._resolve()
+        else:
+            mod.__loader__ = self
+        sys.modules[fullname] = mod
+        return mod
+
+    def is_package(self, fullname):
+        """
+        Return true, if the named module is a package.
+
+        We need this method to get correct spec objects with
+        Python 3.4 (see PEP451)
+        """
+        return hasattr(self.__get_module(fullname), "__path__")
+
+    def get_code(self, fullname):
+        """Return None
+
+        Required, if is_package is implemented"""
+        self.__get_module(fullname)  # eventually raises ImportError
+        return None
+    get_source = get_code  # same as get_code
+
+_importer = _SixMetaPathImporter(__name__)
+
+
+class _MovedItems(_LazyModule):
+
+    """Lazy loading of moved objects"""
+    __path__ = []  # mark as package
+
+
+_moved_attributes = [
+    MovedAttribute("cStringIO", "cStringIO", "io", "StringIO"),
+    MovedAttribute("filter", "itertools", "builtins", "ifilter", "filter"),
+    MovedAttribute("filterfalse", "itertools", "itertools", "ifilterfalse", "filterfalse"),
+    MovedAttribute("input", "__builtin__", "builtins", "raw_input", "input"),
+    MovedAttribute("intern", "__builtin__", "sys"),
+    MovedAttribute("map", "itertools", "builtins", "imap", "map"),
+    MovedAttribute("getcwd", "os", "os", "getcwdu", "getcwd"),
+    MovedAttribute("getcwdb", "os", "os", "getcwd", "getcwdb"),
+    MovedAttribute("range", "__builtin__", "builtins", "xrange", "range"),
+    MovedAttribute("reload_module", "__builtin__", "importlib" if PY34 else "imp", "reload"),
+    MovedAttribute("reduce", "__builtin__", "functools"),
+    MovedAttribute("shlex_quote", "pipes", "shlex", "quote"),
+    MovedAttribute("StringIO", "StringIO", "io"),
+    MovedAttribute("UserDict", "UserDict", "collections"),
+    MovedAttribute("UserList", "UserList", "collections"),
+    MovedAttribute("UserString", "UserString", "collections"),
+    MovedAttribute("xrange", "__builtin__", "builtins", "xrange", "range"),
+    MovedAttribute("zip", "itertools", "builtins", "izip", "zip"),
+    MovedAttribute("zip_longest", "itertools", "itertools", "izip_longest", "zip_longest"),
+    MovedModule("builtins", "__builtin__"),
+    MovedModule("configparser", "ConfigParser"),
+    MovedModule("copyreg", "copy_reg"),
+    MovedModule("dbm_gnu", "gdbm", "dbm.gnu"),
+    MovedModule("_dummy_thread", "dummy_thread", "_dummy_thread"),
+    MovedModule("http_cookiejar", "cookielib", "http.cookiejar"),
+    MovedModule("http_cookies", "Cookie", "http.cookies"),
+    MovedModule("html_entities", "htmlentitydefs", "html.entities"),
+    MovedModule("html_parser", "HTMLParser", "html.parser"),
+    MovedModule("http_client", "httplib", "http.client"),
+    MovedModule("email_mime_multipart", "email.MIMEMultipart", "email.mime.multipart"),
+    MovedModule("email_mime_nonmultipart", "email.MIMENonMultipart", "email.mime.nonmultipart"),
+    MovedModule("email_mime_text", "email.MIMEText", "email.mime.text"),
+    MovedModule("email_mime_base", "email.MIMEBase", "email.mime.base"),
+    MovedModule("BaseHTTPServer", "BaseHTTPServer", "http.server"),
+    MovedModule("CGIHTTPServer", "CGIHTTPServer", "http.server"),
+    MovedModule("SimpleHTTPServer", "SimpleHTTPServer", "http.server"),
+    MovedModule("cPickle", "cPickle", "pickle"),
+    MovedModule("queue", "Queue"),
+    MovedModule("reprlib", "repr"),
+    MovedModule("socketserver", "SocketServer"),
+    MovedModule("_thread", "thread", "_thread"),
+    MovedModule("tkinter", "Tkinter"),
+    MovedModule("tkinter_dialog", "Dialog", "tkinter.dialog"),
+    MovedModule("tkinter_filedialog", "FileDialog", "tkinter.filedialog"),
+    MovedModule("tkinter_scrolledtext", "ScrolledText", "tkinter.scrolledtext"),
+    MovedModule("tkinter_simpledialog", "SimpleDialog", "tkinter.simpledialog"),
+    MovedModule("tkinter_tix", "Tix", "tkinter.tix"),
+    MovedModule("tkinter_ttk", "ttk", "tkinter.ttk"),
+    MovedModule("tkinter_constants", "Tkconstants", "tkinter.constants"),
+    MovedModule("tkinter_dnd", "Tkdnd", "tkinter.dnd"),
+    MovedModule("tkinter_colorchooser", "tkColorChooser",
+                "tkinter.colorchooser"),
+    MovedModule("tkinter_commondialog", "tkCommonDialog",
+                "tkinter.commondialog"),
+    MovedModule("tkinter_tkfiledialog", "tkFileDialog", "tkinter.filedialog"),
+    MovedModule("tkinter_font", "tkFont", "tkinter.font"),
+    MovedModule("tkinter_messagebox", "tkMessageBox", "tkinter.messagebox"),
+    MovedModule("tkinter_tksimpledialog", "tkSimpleDialog",
+                "tkinter.simpledialog"),
+    MovedModule("urllib_parse", __name__ + ".moves.urllib_parse", "urllib.parse"),
+    MovedModule("urllib_error", __name__ + ".moves.urllib_error", "urllib.error"),
+    MovedModule("urllib", __name__ + ".moves.urllib", __name__ + ".moves.urllib"),
+    MovedModule("urllib_robotparser", "robotparser", "urllib.robotparser"),
+    MovedModule("xmlrpc_client", "xmlrpclib", "xmlrpc.client"),
+    MovedModule("xmlrpc_server", "SimpleXMLRPCServer", "xmlrpc.server"),
+]
+# Add windows specific modules.
+if sys.platform == "win32":
+    _moved_attributes += [
+        MovedModule("winreg", "_winreg"),
+    ]
+
+for attr in _moved_attributes:
+    setattr(_MovedItems, attr.name, attr)
+    if isinstance(attr, MovedModule):
+        _importer._add_module(attr, "moves." + attr.name)
+del attr
+
+_MovedItems._moved_attributes = _moved_attributes
+
+moves = _MovedItems(__name__ + ".moves")
+_importer._add_module(moves, "moves")
+
+
+class Module_six_moves_urllib_parse(_LazyModule):
+
+    """Lazy loading of moved objects in six.moves.urllib_parse"""
+
+
+_urllib_parse_moved_attributes = [
+    MovedAttribute("ParseResult", "urlparse", "urllib.parse"),
+    MovedAttribute("SplitResult", "urlparse", "urllib.parse"),
+    MovedAttribute("parse_qs", "urlparse", "urllib.parse"),
+    MovedAttribute("parse_qsl", "urlparse", "urllib.parse"),
+    MovedAttribute("urldefrag", "urlparse", "urllib.parse"),
+    MovedAttribute("urljoin", "urlparse", "urllib.parse"),
+    MovedAttribute("urlparse", "urlparse", "urllib.parse"),
+    MovedAttribute("urlsplit", "urlparse", "urllib.parse"),
+    MovedAttribute("urlunparse", "urlparse", "urllib.parse"),
+    MovedAttribute("urlunsplit", "urlparse", "urllib.parse"),
+    MovedAttribute("quote", "urllib", "urllib.parse"),
+    MovedAttribute("quote_plus", "urllib", "urllib.parse"),
+    MovedAttribute("unquote", "urllib", "urllib.parse"),
+    MovedAttribute("unquote_plus", "urllib", "urllib.parse"),
+    MovedAttribute("urlencode", "urllib", "urllib.parse"),
+    MovedAttribute("splitquery", "urllib", "urllib.parse"),
+    MovedAttribute("splittag", "urllib", "urllib.parse"),
+    MovedAttribute("splituser", "urllib", "urllib.parse"),
+    MovedAttribute("uses_fragment", "urlparse", "urllib.parse"),
+    MovedAttribute("uses_netloc", "urlparse", "urllib.parse"),
+    MovedAttribute("uses_params", "urlparse", "urllib.parse"),
+    MovedAttribute("uses_query", "urlparse", "urllib.parse"),
+    MovedAttribute("uses_relative", "urlparse", "urllib.parse"),
+]
+for attr in _urllib_parse_moved_attributes:
+    setattr(Module_six_moves_urllib_parse, attr.name, attr)
+del attr
+
+Module_six_moves_urllib_parse._moved_attributes = _urllib_parse_moved_attributes
+
+_importer._add_module(Module_six_moves_urllib_parse(__name__ + ".moves.urllib_parse"),
+                      "moves.urllib_parse", "moves.urllib.parse")
+
+
+class Module_six_moves_urllib_error(_LazyModule):
+
+    """Lazy loading of moved objects in six.moves.urllib_error"""
+
+
+_urllib_error_moved_attributes = [
+    MovedAttribute("URLError", "urllib2", "urllib.error"),
+    MovedAttribute("HTTPError", "urllib2", "urllib.error"),
+    MovedAttribute("ContentTooShortError", "urllib", "urllib.error"),
+]
+for attr in _urllib_error_moved_attributes:
+    setattr(Module_six_moves_urllib_error, attr.name, attr)
+del attr
+
+Module_six_moves_urllib_error._moved_attributes = _urllib_error_moved_attributes
+
+_importer._add_module(Module_six_moves_urllib_error(__name__ + ".moves.urllib.error"),
+                      "moves.urllib_error", "moves.urllib.error")
+
+
+class Module_six_moves_urllib_request(_LazyModule):
+
+    """Lazy loading of moved objects in six.moves.urllib_request"""
+
+
+_urllib_request_moved_attributes = [
+    MovedAttribute("urlopen", "urllib2", "urllib.request"),
+    MovedAttribute("install_opener", "urllib2", "urllib.request"),
+    MovedAttribute("build_opener", "urllib2", "urllib.request"),
+    MovedAttribute("pathname2url", "urllib", "urllib.request"),
+    MovedAttribute("url2pathname", "urllib", "urllib.request"),
+    MovedAttribute("getproxies", "urllib", "urllib.request"),
+    MovedAttribute("Request", "urllib2", "urllib.request"),
+    MovedAttribute("OpenerDirector", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPDefaultErrorHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPRedirectHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPCookieProcessor", "urllib2", "urllib.request"),
+    MovedAttribute("ProxyHandler", "urllib2", "urllib.request"),
+    MovedAttribute("BaseHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPPasswordMgr", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPPasswordMgrWithDefaultRealm", "urllib2", "urllib.request"),
+    MovedAttribute("AbstractBasicAuthHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPBasicAuthHandler", "urllib2", "urllib.request"),
+    MovedAttribute("ProxyBasicAuthHandler", "urllib2", "urllib.request"),
+    MovedAttribute("AbstractDigestAuthHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPDigestAuthHandler", "urllib2", "urllib.request"),
+    MovedAttribute("ProxyDigestAuthHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPSHandler", "urllib2", "urllib.request"),
+    MovedAttribute("FileHandler", "urllib2", "urllib.request"),
+    MovedAttribute("FTPHandler", "urllib2", "urllib.request"),
+    MovedAttribute("CacheFTPHandler", "urllib2", "urllib.request"),
+    MovedAttribute("UnknownHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPErrorProcessor", "urllib2", "urllib.request"),
+    MovedAttribute("urlretrieve", "urllib", "urllib.request"),
+    MovedAttribute("urlcleanup", "urllib", "urllib.request"),
+    MovedAttribute("URLopener", "urllib", "urllib.request"),
+    MovedAttribute("FancyURLopener", "urllib", "urllib.request"),
+    MovedAttribute("proxy_bypass", "urllib", "urllib.request"),
+]
+for attr in _urllib_request_moved_attributes:
+    setattr(Module_six_moves_urllib_request, attr.name, attr)
+del attr
+
+Module_six_moves_urllib_request._moved_attributes = _urllib_request_moved_attributes
+
+_importer._add_module(Module_six_moves_urllib_request(__name__ + ".moves.urllib.request"),
+                      "moves.urllib_request", "moves.urllib.request")
+
+
+class Module_six_moves_urllib_response(_LazyModule):
+
+    """Lazy loading of moved objects in six.moves.urllib_response"""
+
+
+_urllib_response_moved_attributes = [
+    MovedAttribute("addbase", "urllib", "urllib.response"),
+    MovedAttribute("addclosehook", "urllib", "urllib.response"),
+    MovedAttribute("addinfo", "urllib", "urllib.response"),
+    MovedAttribute("addinfourl", "urllib", "urllib.response"),
+]
+for attr in _urllib_response_moved_attributes:
+    setattr(Module_six_moves_urllib_response, attr.name, attr)
+del attr
+
+Module_six_moves_urllib_response._moved_attributes = _urllib_response_moved_attributes
+
+_importer._add_module(Module_six_moves_urllib_response(__name__ + ".moves.urllib.response"),
+                      "moves.urllib_response", "moves.urllib.response")
+
+
+class Module_six_moves_urllib_robotparser(_LazyModule):
+
+    """Lazy loading of moved objects in six.moves.urllib_robotparser"""
+
+
+_urllib_robotparser_moved_attributes = [
+    MovedAttribute("RobotFileParser", "robotparser", "urllib.robotparser"),
+]
+for attr in _urllib_robotparser_moved_attributes:
+    setattr(Module_six_moves_urllib_robotparser, attr.name, attr)
+del attr
+
+Module_six_moves_urllib_robotparser._moved_attributes = _urllib_robotparser_moved_attributes
+
+_importer._add_module(Module_six_moves_urllib_robotparser(__name__ + ".moves.urllib.robotparser"),
+                      "moves.urllib_robotparser", "moves.urllib.robotparser")
+
+
+class Module_six_moves_urllib(types.ModuleType):
+
+    """Create a six.moves.urllib namespace that resembles the Python 3 namespace"""
+    __path__ = []  # mark as package
+    parse = _importer._get_module("moves.urllib_parse")
+    error = _importer._get_module("moves.urllib_error")
+    request = _importer._get_module("moves.urllib_request")
+    response = _importer._get_module("moves.urllib_response")
+    robotparser = _importer._get_module("moves.urllib_robotparser")
+
+    def __dir__(self):
+        return ['parse', 'error', 'request', 'response', 'robotparser']
+
+_importer._add_module(Module_six_moves_urllib(__name__ + ".moves.urllib"),
+                      "moves.urllib")
+
+
+def add_move(move):
+    """Add an item to six.moves."""
+    setattr(_MovedItems, move.name, move)
+
+
+def remove_move(name):
+    """Remove item from six.moves."""
+    try:
+        delattr(_MovedItems, name)
+    except AttributeError:
+        try:
+            del moves.__dict__[name]
+        except KeyError:
+            raise AttributeError("no such move, %r" % (name,))
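+# Illustrative usage (a sketch, not part of upstream six): a rename that six
+# does not already know about can be registered at runtime, e.g.
+#
+#     add_move(MovedModule("mock", "mock", "unittest.mock"))
+#
+# after which ``moves.mock`` resolves to ``mock`` on Python 2 and to
+# ``unittest.mock`` on Python 3 (assuming the relevant package is importable).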
+
+
+if PY3:
+    _meth_func = "__func__"
+    _meth_self = "__self__"
+
+    _func_closure = "__closure__"
+    _func_code = "__code__"
+    _func_defaults = "__defaults__"
+    _func_globals = "__globals__"
+else:
+    _meth_func = "im_func"
+    _meth_self = "im_self"
+
+    _func_closure = "func_closure"
+    _func_code = "func_code"
+    _func_defaults = "func_defaults"
+    _func_globals = "func_globals"
+
+
+try:
+    advance_iterator = next
+except NameError:
+    def advance_iterator(it):
+        return it.next()
+next = advance_iterator
+
+
+try:
+    callable = callable
+except NameError:
+    def callable(obj):
+        return any("__call__" in klass.__dict__ for klass in type(obj).__mro__)
+
+
+if PY3:
+    def get_unbound_function(unbound):
+        return unbound
+
+    create_bound_method = types.MethodType
+
+    def create_unbound_method(func, cls):
+        return func
+
+    Iterator = object
+else:
+    def get_unbound_function(unbound):
+        return unbound.im_func
+
+    def create_bound_method(func, obj):
+        return types.MethodType(func, obj, obj.__class__)
+
+    def create_unbound_method(func, cls):
+        return types.MethodType(func, None, cls)
+
+    class Iterator(object):
+
+        def next(self):
+            return type(self).__next__(self)
+
+    callable = callable
+_add_doc(get_unbound_function,
+         """Get the function out of a possibly unbound function""")
+
+
+get_method_function = operator.attrgetter(_meth_func)
+get_method_self = operator.attrgetter(_meth_self)
+get_function_closure = operator.attrgetter(_func_closure)
+get_function_code = operator.attrgetter(_func_code)
+get_function_defaults = operator.attrgetter(_func_defaults)
+get_function_globals = operator.attrgetter(_func_globals)
+
+
+if PY3:
+    def iterkeys(d, **kw):
+        return iter(d.keys(**kw))
+
+    def itervalues(d, **kw):
+        return iter(d.values(**kw))
+
+    def iteritems(d, **kw):
+        return iter(d.items(**kw))
+
+    def iterlists(d, **kw):
+        return iter(d.lists(**kw))
+
+    viewkeys = operator.methodcaller("keys")
+
+    viewvalues = operator.methodcaller("values")
+
+    viewitems = operator.methodcaller("items")
+else:
+    def iterkeys(d, **kw):
+        return d.iterkeys(**kw)
+
+    def itervalues(d, **kw):
+        return d.itervalues(**kw)
+
+    def iteritems(d, **kw):
+        return d.iteritems(**kw)
+
+    def iterlists(d, **kw):
+        return d.iterlists(**kw)
+
+    viewkeys = operator.methodcaller("viewkeys")
+
+    viewvalues = operator.methodcaller("viewvalues")
+
+    viewitems = operator.methodcaller("viewitems")
+
+_add_doc(iterkeys, "Return an iterator over the keys of a dictionary.")
+_add_doc(itervalues, "Return an iterator over the values of a dictionary.")
+_add_doc(iteritems,
+         "Return an iterator over the (key, value) pairs of a dictionary.")
+_add_doc(iterlists,
+         "Return an iterator over the (key, [values]) pairs of a dictionary.")
+
+
+if PY3:
+    def b(s):
+        return s.encode("latin-1")
+
+    def u(s):
+        return s
+    unichr = chr
+    import struct
+    int2byte = struct.Struct(">B").pack
+    del struct
+    byte2int = operator.itemgetter(0)
+    indexbytes = operator.getitem
+    iterbytes = iter
+    import io
+    StringIO = io.StringIO
+    BytesIO = io.BytesIO
+    _assertCountEqual = "assertCountEqual"
+    if sys.version_info[1] <= 1:
+        _assertRaisesRegex = "assertRaisesRegexp"
+        _assertRegex = "assertRegexpMatches"
+    else:
+        _assertRaisesRegex = "assertRaisesRegex"
+        _assertRegex = "assertRegex"
+else:
+    def b(s):
+        return s
+    # Workaround for standalone backslash
+
+    def u(s):
+        return unicode(s.replace(r'\\', r'\\\\'), "unicode_escape")
+    unichr = unichr
+    int2byte = chr
+
+    def byte2int(bs):
+        return ord(bs[0])
+
+    def indexbytes(buf, i):
+        return ord(buf[i])
+    iterbytes = functools.partial(itertools.imap, ord)
+    import StringIO
+    StringIO = BytesIO = StringIO.StringIO
+    _assertCountEqual = "assertItemsEqual"
+    _assertRaisesRegex = "assertRaisesRegexp"
+    _assertRegex = "assertRegexpMatches"
+_add_doc(b, """Byte literal""")
+_add_doc(u, """Text literal""")
+
+
+def assertCountEqual(self, *args, **kwargs):
+    return getattr(self, _assertCountEqual)(*args, **kwargs)
+
+
+def assertRaisesRegex(self, *args, **kwargs):
+    return getattr(self, _assertRaisesRegex)(*args, **kwargs)
+
+
+def assertRegex(self, *args, **kwargs):
+    return getattr(self, _assertRegex)(*args, **kwargs)
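+# Illustrative usage (a sketch, not part of upstream six): inside a
+# ``unittest.TestCase`` method these wrappers take the test case as their
+# first argument, e.g.
+#
+#     assertRegex(self, "six-1.9.0", r"six-\d+\.\d+")
+#     assertCountEqual(self, [1, 2, 2], [2, 2, 1])
+#
+# and dispatch to assertRegexpMatches/assertItemsEqual on Python 2 and to
+# assertRegex/assertCountEqual on Python 3.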
+
+
+if PY3:
+    exec_ = getattr(moves.builtins, "exec")
+
+    def reraise(tp, value, tb=None):
+        if value is None:
+            value = tp()
+        if value.__traceback__ is not tb:
+            raise value.with_traceback(tb)
+        raise value
+
+else:
+    def exec_(_code_, _globs_=None, _locs_=None):
+        """Execute code in a namespace."""
+        if _globs_ is None:
+            frame = sys._getframe(1)
+            _globs_ = frame.f_globals
+            if _locs_ is None:
+                _locs_ = frame.f_locals
+            del frame
+        elif _locs_ is None:
+            _locs_ = _globs_
+        exec("""exec _code_ in _globs_, _locs_""")
+
+    exec_("""def reraise(tp, value, tb=None):
+    raise tp, value, tb
+""")
+
+
+if sys.version_info[:2] == (3, 2):
+    exec_("""def raise_from(value, from_value):
+    if from_value is None:
+        raise value
+    raise value from from_value
+""")
+elif sys.version_info[:2] > (3, 2):
+    exec_("""def raise_from(value, from_value):
+    raise value from from_value
+""")
+else:
+    def raise_from(value, from_value):
+        raise value
+
+
+print_ = getattr(moves.builtins, "print", None)
+if print_ is None:
+    def print_(*args, **kwargs):
+        """The new-style print function for Python 2.4 and 2.5."""
+        fp = kwargs.pop("file", sys.stdout)
+        if fp is None:
+            return
+
+        def write(data):
+            if not isinstance(data, basestring):
+                data = str(data)
+            # If the file has an encoding, encode unicode with it.
+            if (isinstance(fp, file) and
+                    isinstance(data, unicode) and
+                    fp.encoding is not None):
+                errors = getattr(fp, "errors", None)
+                if errors is None:
+                    errors = "strict"
+                data = data.encode(fp.encoding, errors)
+            fp.write(data)
+        want_unicode = False
+        sep = kwargs.pop("sep", None)
+        if sep is not None:
+            if isinstance(sep, unicode):
+                want_unicode = True
+            elif not isinstance(sep, str):
+                raise TypeError("sep must be None or a string")
+        end = kwargs.pop("end", None)
+        if end is not None:
+            if isinstance(end, unicode):
+                want_unicode = True
+            elif not isinstance(end, str):
+                raise TypeError("end must be None or a string")
+        if kwargs:
+            raise TypeError("invalid keyword arguments to print()")
+        if not want_unicode:
+            for arg in args:
+                if isinstance(arg, unicode):
+                    want_unicode = True
+                    break
+        if want_unicode:
+            newline = unicode("\n")
+            space = unicode(" ")
+        else:
+            newline = "\n"
+            space = " "
+        if sep is None:
+            sep = space
+        if end is None:
+            end = newline
+        for i, arg in enumerate(args):
+            if i:
+                write(sep)
+            write(arg)
+        write(end)
+if sys.version_info[:2] < (3, 3):
+    _print = print_
+
+    def print_(*args, **kwargs):
+        fp = kwargs.get("file", sys.stdout)
+        flush = kwargs.pop("flush", False)
+        _print(*args, **kwargs)
+        if flush and fp is not None:
+            fp.flush()
+
+_add_doc(reraise, """Reraise an exception.""")
+
+if sys.version_info[0:2] < (3, 4):
+    def wraps(wrapped, assigned=functools.WRAPPER_ASSIGNMENTS,
+              updated=functools.WRAPPER_UPDATES):
+        def wrapper(f):
+            f = functools.wraps(wrapped, assigned, updated)(f)
+            f.__wrapped__ = wrapped
+            return f
+        return wrapper
+else:
+    wraps = functools.wraps
+
+
+def with_metaclass(meta, *bases):
+    """Create a base class with a metaclass."""
+    # This requires a bit of explanation: the basic idea is to make a dummy
+    # metaclass for one level of class instantiation that replaces itself with
+    # the actual metaclass.
+    class metaclass(meta):
+
+        def __new__(cls, name, this_bases, d):
+            return meta(name, bases, d)
+    return type.__new__(metaclass, 'temporary_class', (), {})
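+# Illustrative usage (a sketch, not part of upstream six): a class whose
+# metaclass should be ``Meta`` on both Python 2 and 3 can be written as
+#
+#     class Meta(type):
+#         pass
+#
+#     class Base(with_metaclass(Meta, object)):
+#         pass
+#
+# ``type(Base)`` is then ``Meta`` under either interpreter.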
+
+
+def add_metaclass(metaclass):
+    """Class decorator for creating a class with a metaclass."""
+    def wrapper(cls):
+        orig_vars = cls.__dict__.copy()
+        slots = orig_vars.get('__slots__')
+        if slots is not None:
+            if isinstance(slots, str):
+                slots = [slots]
+            for slots_var in slots:
+                orig_vars.pop(slots_var)
+        orig_vars.pop('__dict__', None)
+        orig_vars.pop('__weakref__', None)
+        return metaclass(cls.__name__, cls.__bases__, orig_vars)
+    return wrapper
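+# Illustrative usage (a sketch, not part of upstream six): the decorator form
+# achieves the same effect as ``with_metaclass`` above without introducing a
+# synthetic intermediate base class:
+#
+#     @add_metaclass(Meta)
+#     class Base(object):
+#         pass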
+
+
+def python_2_unicode_compatible(klass):
+    """
+    A decorator that defines __unicode__ and __str__ methods under Python 2.
+    Under Python 3 it does nothing.
+
+    To support Python 2 and 3 with a single code base, define a __str__ method
+    returning text and apply this decorator to the class.
+    """
+    if PY2:
+        if '__str__' not in klass.__dict__:
+            raise ValueError("@python_2_unicode_compatible cannot be applied "
+                             "to %s because it doesn't define __str__()." %
+                             klass.__name__)
+        klass.__unicode__ = klass.__str__
+        klass.__str__ = lambda self: self.__unicode__().encode('utf-8')
+    return klass
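+# Illustrative usage (a sketch, not part of upstream six): define ``__str__``
+# to return text and decorate the class; on Python 2 the decorator installs
+# ``__unicode__`` and re-points ``__str__`` at a UTF-8-encoding wrapper:
+#
+#     @python_2_unicode_compatible
+#     class Greeting(object):
+#         def __str__(self):
+#             return u'h\xe9llo'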
+
+
+# Complete the moves implementation.
+# This code is at the end of this module to speed up module loading.
+# Turn this module into a package.
+__path__ = []  # required for PEP 302 and PEP 451
+__package__ = __name__  # see PEP 366 @ReservedAssignment
+if globals().get("__spec__") is not None:
+    __spec__.submodule_search_locations = []  # PEP 451 @UndefinedVariable
+# Remove other six meta path importers, since they cause problems. This can
+# happen if six is removed from sys.modules and then reloaded. (Setuptools does
+# this for some reason.)
+if sys.meta_path:
+    for i, importer in enumerate(sys.meta_path):
+        # Here's some real nastiness: Another "instance" of the six module might
+        # be floating around. Therefore, we can't use isinstance() to check for
+        # the six meta path importer, since the other six instance will have
+        # inserted an importer with different class.
+        if (type(importer).__name__ == "_SixMetaPathImporter" and
+                importer.name == __name__):
+            del sys.meta_path[i]
+            break
+    del i, importer
+# Finally, add the importer to the meta path import hook.
+sys.meta_path.append(_importer)
diff --git a/nltk/tag/__init__.py b/nltk/tag/__init__.py
index fccf506..847f6a7 100644
--- a/nltk/tag/__init__.py
+++ b/nltk/tag/__init__.py
@@ -21,18 +21,16 @@ the word ``'fly'`` with a noun part of speech tag (``'NN'``):
 
 An off-the-shelf tagger is available.  It uses the Penn Treebank tagset:
 
-    >>> from nltk.tag import pos_tag  # doctest: +SKIP
-    >>> from nltk.tokenize import word_tokenize # doctest: +SKIP
-    >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +SKIP
-    [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is',
-    'VBZ'), ("n't", 'RB'), ('all', 'DT'), ('that', 'DT'), ('bad', 'JJ'),
-    ('.', '.')]
-
-This package defines several taggers, which take a token list (typically a
-sentence), assign a tag to each token, and return the resulting list of
-tagged tokens.  Most of the taggers are built automatically based on a
-training corpus.  For example, the unigram tagger tags each word *w*
-by checking what the most frequent tag for *w* was in a training corpus:
+    >>> from nltk import pos_tag, word_tokenize
+    >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
+    [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
+    ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
+
+This package defines several taggers, which take a list of tokens,
+assign a tag to each one, and return the resulting list of tagged tokens.
+Most of the taggers are built automatically based on a training corpus.
+For example, the unigram tagger tags each word *w* by checking what
+the most frequent tag for *w* was in a training corpus:
 
     >>> from nltk.corpus import brown
     >>> from nltk.tag import UnigramTagger
@@ -76,41 +74,54 @@ from nltk.tag.hmm           import HiddenMarkovModelTagger, HiddenMarkovModelTra
 from nltk.tag.senna         import SennaTagger, SennaChunkTagger, SennaNERTagger
 from nltk.tag.mapping       import tagset_mapping, map_tag
 from nltk.tag.crf           import CRFTagger
+from nltk.tag.perceptron    import PerceptronTagger
 
 from nltk.data import load
 
-
-# Standard treebank POS tagger
-_POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
+def _pos_tag(tokens, tagset, tagger):
+    tagged_tokens = tagger.tag(tokens)
+    if tagset:
+        tagged_tokens = [(token, map_tag('en-ptb', tagset, tag)) for (token, tag) in tagged_tokens]
+    return tagged_tokens
 
 def pos_tag(tokens, tagset=None):
     """
     Use NLTK's currently recommended part of speech tagger to
     tag the given list of tokens.
 
-        >>> from nltk.tag import pos_tag # doctest: +SKIP
-        >>> from nltk.tokenize import word_tokenize # doctest: +SKIP
-        >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +SKIP
-        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is',
-        'VBZ'), ("n't", 'RB'), ('all', 'DT'), ('that', 'DT'), ('bad', 'JJ'),
-        ('.', '.')]
+        >>> from nltk.tag import pos_tag
+        >>> from nltk.tokenize import word_tokenize
+        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
+        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
+        ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
+        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')
+        [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
+        ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]
+
+    NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence.
 
     :param tokens: Sequence of tokens to be tagged
     :type tokens: list(str)
+    :param tagset: the tagset to be used, e.g. universal, wsj, brown
+    :type tagset: str
     :return: The tagged tokens
     :rtype: list(tuple(str, str))
     """
-    tagger = load(_POS_TAGGER)
-    if tagset:
-        return [(token, map_tag('en-ptb', tagset, tag)) for (token, tag) in tagger.tag(tokens)]
-    return tagger.tag(tokens)
+    tagger = PerceptronTagger()
+    return _pos_tag(tokens, tagset, tagger)
 
-def pos_tag_sents(sentences):
+
+def pos_tag_sents(sentences, tagset=None):
     """
     Use NLTK's currently recommended part of speech tagger to tag the
     given list of sentences, each consisting of a list of tokens.
-    """
-    tagger = load(_POS_TAGGER)
-    return tagger.tag_sents(sentences)
-
 
+    :param sentences: List of sentences to be tagged
+    :type sentences: list(list(str))
+    :param tagset: the tagset to be used, e.g. universal, wsj, brown
+    :type tagset: str
+    :return: The list of tagged sentences
+    :rtype: list(list(tuple(str, str)))
+    """
+    tagger = PerceptronTagger()
+    return [_pos_tag(sent, tagset, tagger) for sent in sentences]
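+# A minimal usage sketch (illustrative; not one of the doctests above): tagging
+# several tokenized sentences reuses a single PerceptronTagger instance, e.g.
+#
+#     from nltk.tag import pos_tag_sents
+#     from nltk.tokenize import word_tokenize
+#     pos_tag_sents([word_tokenize("The quick brown fox jumps."),
+#                    word_tokenize("So does the lazy dog.")])
+#
+# which returns one list of (word, tag) tuples per input sentence.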
diff --git a/nltk/tag/perceptron.py b/nltk/tag/perceptron.py
new file mode 100644
index 0000000..e1e3f39
--- /dev/null
+++ b/nltk/tag/perceptron.py
@@ -0,0 +1,318 @@
+# -*- coding: utf-8 -*-
+# This module is a port of the Textblob Averaged Perceptron Tagger
+# Author: Matthew Honnibal <honnibal+gh at gmail.com>, 
+#         Long Duong <longdt219 at gmail.com> (NLTK port)
+# URL: <https://github.com/sloria/textblob-aptagger>
+#      <http://nltk.org/>
+# Copyright 2013 Matthew Honnibal
+# NLTK modifications Copyright 2015 The NLTK Project
+#
+# This module is provided under the terms of the MIT License.
+
+from __future__ import absolute_import
+from __future__ import print_function, division
+
+import random
+from collections import defaultdict
+import pickle
+import logging
+
+from nltk.tag.api import TaggerI
+from nltk.data import find
+from nltk.compat import python_2_unicode_compatible
+
+PICKLE = "averaged_perceptron_tagger.pickle"
+
+class AveragedPerceptron(object):
+
+    '''An averaged perceptron, as implemented by Matthew Honnibal.
+
+    See more implementation details here:
+        http://spacy.io/blog/part-of-speech-POS-tagger-in-python/
+    '''
+
+    def __init__(self):
+        # Each feature gets its own weight vector, so weights is a dict-of-dicts
+        self.weights = {}
+        self.classes = set()
+        # The accumulated values, for the averaging. These will be keyed by
+        # (feature, class) tuples
+        self._totals = defaultdict(int)
+        # The last time the feature was changed, for the averaging. Also
+        # keyed by (feature, class) tuples
+        # (tstamps is short for timestamps)
+        self._tstamps = defaultdict(int)
+        # Number of instances seen
+        self.i = 0
+
+    def predict(self, features):
+        '''Dot-product the features and current weights and return the best label.'''
+        scores = defaultdict(float)
+        for feat, value in features.items():
+            if feat not in self.weights or value == 0:
+                continue
+            weights = self.weights[feat]
+            for label, weight in weights.items():
+                scores[label] += value * weight
+        # Do a secondary alphabetic sort, for stability
+        return max(self.classes, key=lambda label: (scores[label], label))
+
+    def update(self, truth, guess, features):
+        '''Update the feature weights.'''
+        def upd_feat(c, f, w, v):
+            param = (f, c)
+            self._totals[param] += (self.i - self._tstamps[param]) * w
+            self._tstamps[param] = self.i
+            self.weights[f][c] = w + v
+
+        self.i += 1
+        if truth == guess:
+            return None
+        for f in features:
+            weights = self.weights.setdefault(f, {})
+            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
+            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
+
+    def average_weights(self):
+        '''Average weights from all iterations.'''
+        for feat, weights in self.weights.items():
+            new_feat_weights = {}
+            for clas, weight in weights.items():
+                param = (feat, clas)
+                total = self._totals[param]
+                total += (self.i - self._tstamps[param]) * weight
+                averaged = round(total / float(self.i), 3)
+                if averaged:
+                    new_feat_weights[clas] = averaged
+            self.weights[feat] = new_feat_weights
+
+    def save(self, path):
+        '''Save the pickled model weights.'''
+        with open(path, 'wb') as fout:
+            return pickle.dump(dict(self.weights), fout)
+
+    def load(self, path):
+        '''Load the pickled model weights.'''
+        with open(path,'rb') as fin:
+            self.weights = pickle.load(fin)
+
+ at python_2_unicode_compatible
+class PerceptronTagger(TaggerI):
+
+    '''
+    Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
+    See more implementation details here:
+        http://spacy.io/blog/part-of-speech-POS-tagger-in-python/
+    
+    >>> from nltk.tag.perceptron import PerceptronTagger
+
+    Train the model 
+    
+    >>> tagger = PerceptronTagger(load=False)
+    
+    >>> tagger.train([[('today','NN'),('is','VBZ'),('good','JJ'),('day','NN')],
+    ... [('yes','NNS'),('it','PRP'),('beautiful','JJ')]])
+    
+    >>> tagger.tag(['today','is','a','beautiful','day'])
+    [('today', 'NN'), ('is', 'PRP'), ('a', 'PRP'), ('beautiful', 'JJ'), ('day', 'NN')]
+    
+    Use the pretrained model (the default constructor)
+    
+    >>> pretrain = PerceptronTagger()
+    
+    >>> pretrain.tag('The quick brown fox jumps over the lazy dog'.split())
+    [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]
+    
+    >>> pretrain.tag("The red cat".split())
+    [('The', 'DT'), ('red', 'JJ'), ('cat', 'NN')]
+    '''
+
+    START = ['-START-', '-START2-']
+    END = ['-END-', '-END2-']
+    
+    def __init__(self, load=True):
+        '''
+        :param load: Load the pickled model upon instantiation.
+        '''
+        self.model = AveragedPerceptron()
+        self.tagdict = {}
+        self.classes = set()
+        if load:
+            AP_MODEL_LOC = str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
+            self.load(AP_MODEL_LOC)
+
+    def tag(self, tokens):
+        '''
+        Tag a tokenized sentence.
+        :param tokens: list of words
+        :type tokens: list(str)
+        '''
+        prev, prev2 = self.START
+        output = []
+        
+        context = self.START + [self.normalize(w) for w in tokens] + self.END
+        for i, word in enumerate(tokens):
+            tag = self.tagdict.get(word)
+            if not tag:
+                features = self._get_features(i, word, context, prev, prev2)
+                tag = self.model.predict(features)
+            output.append((word, tag))
+            prev2 = prev
+            prev = tag
+
+        return output
+
+    def train(self, sentences, save_loc=None, nr_iter=5):
+        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
+        controls the number of Perceptron training iterations.
+
+        :param sentences: A list of sentences, each of which is a list of (word, tag) tuples.
+        :param save_loc: If not ``None``, saves a pickled model in this location.
+        :param nr_iter: Number of training iterations.
+        '''
+        self._make_tagdict(sentences)
+        self.model.classes = self.classes
+        for iter_ in range(nr_iter):
+            c = 0
+            n = 0
+            for sentence in sentences:
+                words = [word for word,tag in sentence]
+                tags  = [tag for word,tag in sentence]
+                
+                prev, prev2 = self.START
+                context = self.START + [self.normalize(w) for w in words] \
+                                                                    + self.END
+                for i, word in enumerate(words):
+                    guess = self.tagdict.get(word)
+                    if not guess:
+                        feats = self._get_features(i, word, context, prev, prev2)
+                        guess = self.model.predict(feats)
+                        self.model.update(tags[i], guess, feats)
+                    prev2 = prev
+                    prev = guess
+                    c += guess == tags[i]
+                    n += 1
+            random.shuffle(sentences)
+            logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)))
+        self.model.average_weights()
+        # Pickle as a binary file
+        if save_loc is not None:
+            with open(save_loc, 'wb') as fout:
+                pickle.dump((self.model.weights, self.tagdict, self.classes), fout, -1)
+        
+
+    def load(self, loc):
+        '''
+        :param loc: Path of the pickled model to load.
+        :type loc: str
+        '''
+        
+        with open(loc, 'rb') as fin:
+            w_td_c = pickle.load(fin)
+        
+        self.model.weights, self.tagdict, self.classes = w_td_c
+        self.model.classes = self.classes
+        
+
+    def normalize(self, word):
+        '''
+        Normalization used in pre-processing.
+        - All words are lowercased
+        - Hyphenated words not starting with '-' are represented as !HYPHEN
+        - Four-digit numbers are represented as !YEAR
+        - Other tokens starting with a digit are represented as !DIGITS
+
+        :rtype: str
+        '''
+        if '-' in word and word[0] != '-':
+            return '!HYPHEN'
+        elif word.isdigit() and len(word) == 4:
+            return '!YEAR'
+        elif word[0].isdigit():
+            return '!DIGITS'
+        else:
+            return word.lower()
+
+    def _get_features(self, i, word, context, prev, prev2):
+        '''Map tokens into a feature representation, implemented as a
+        {hashable: float} dict. If the features change, a new model must be
+        trained.
+        '''
+        def add(name, *args):
+            features[' '.join((name,) + tuple(args))] += 1
+
+        i += len(self.START)
+        features = defaultdict(int)
+        # It's useful to have a constant feature, which acts sort of like a prior
+        add('bias')
+        add('i suffix', word[-3:])
+        add('i pref1', word[0])
+        add('i-1 tag', prev)
+        add('i-2 tag', prev2)
+        add('i tag+i-2 tag', prev, prev2)
+        add('i word', context[i])
+        add('i-1 tag+i word', prev, context[i])
+        add('i-1 word', context[i-1])
+        add('i-1 suffix', context[i-1][-3:])
+        add('i-2 word', context[i-2])
+        add('i+1 word', context[i+1])
+        add('i+1 suffix', context[i+1][-3:])
+        add('i+2 word', context[i+2])
+        return features
+
+    def _make_tagdict(self, sentences):
+        '''
+        Make a tag dictionary for single-tag words.
+        :param sentences: A list of lists of (word, tag) tuples.
+        '''
+        counts = defaultdict(lambda: defaultdict(int))
+        for sentence in sentences:
+            for word, tag in sentence:
+                counts[word][tag] += 1
+                self.classes.add(tag)
+        freq_thresh = 20
+        ambiguity_thresh = 0.97
+        for word, tag_freqs in counts.items():
+            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
+            n = sum(tag_freqs.values())
+            # Don't add rare words to the tag dictionary
+            # Only add quite unambiguous words
+            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
+                self.tagdict[word] = tag
+
+
+def _pc(n, d):
+    return (float(n) / d) * 100
+
+def _load_data_conll_format(filename):
+    print('Read from file: ', filename)
+    with open(filename, 'r') as fin:
+        sentences = []
+        sentence = []
+        for line in fin.readlines():
+            line = line.strip()
+            if len(line) == 0:
+                sentences.append(sentence)
+                sentence = []
+                continue
+            tokens = line.split('\t')
+            word = tokens[1]
+            tag = tokens[4]
+            sentence.append((word, tag))
+        return sentences
+
+def _get_pretrain_model():
+    # Train and test on the English part of the CoNLL data (WSJ part of the Penn Treebank)
+    # Train: sections 2-11
+    # Test:  section 23
+    tagger = PerceptronTagger(load=False)  # train from scratch rather than from the shipped pickle
+    training = _load_data_conll_format('english_ptb_train.conll')
+    testing = _load_data_conll_format('english_ptb_test.conll')
+    print('Size of training and testing (sentences):', len(training), len(testing))
+    # Train and save the model
+    tagger.train(training, PICKLE)
+    print('Accuracy:', tagger.evaluate(testing))
+    
+if __name__ == '__main__':
+    #_get_pretrain_model()
+    pass
diff --git a/nltk/tag/senna.py b/nltk/tag/senna.py
index 84c474e..64b742c 100644
--- a/nltk/tag/senna.py
+++ b/nltk/tag/senna.py
@@ -86,7 +86,7 @@ class SennaChunkTagger(Senna):
         [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'),
         ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'),
         ('?', 'O')]
-        >>> list(chktagger.bio_to_chunks(tagged_sents, chunk_type='NP'))
+        >>> list(chktagger.bio_to_chunks(tagged_sent, chunk_type='NP'))
         [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')]
         
         :param tagged_sent: A list of tuples of word and BIO chunk tag.
diff --git a/nltk/test/bleu.doctest b/nltk/test/bleu.doctest
index 9827062..e5ed074 100644
--- a/nltk/test/bleu.doctest
+++ b/nltk/test/bleu.doctest
@@ -2,13 +2,13 @@
 BLEU tests
 ==========
 
->>> from nltk.align import bleu
+>>> from nltk.translate import bleu
 
 If the candidate has no alignment to any of the references, the BLEU score is 0.
 
 >>> bleu(
-...     'John loves Mary'.split(),
 ...     ['The candidate has no alignment to any of the references'.split()],
+...     'John loves Mary'.split(),
 ...     [1],
 ... )
 0
diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest
index 67a1a5b..cf38764 100644
--- a/nltk/test/corpus.doctest
+++ b/nltk/test/corpus.doctest
@@ -550,49 +550,49 @@ category assigned to a document is correct as follows:
 Other Corpora
 =============
 
-senseval
---------
-The Senseval 2 corpus is a word sense disambiguation corpus.  Each
-item in the corpus corresponds to a single ambiguous word.  For each
-of these words, the corpus contains a list of instances, corresponding
-to occurrences of that word.  Each instance provides the word; a list
-of word senses that apply to the word occurrence; and the word's
-context.
+comparative_sentences
+---------------------
+A list of sentences from various sources, especially reviews and articles. Each
+line contains one sentence; sentences were split using a sentence tokenizer.
+Comparative sentences have been annotated with their type, entities, features
+and keywords.
+
+    >>> from nltk.corpus import comparative_sentences
+    >>> comparison = comparative_sentences.comparisons()[0]
+    >>> comparison.text
+    ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
+    'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
+    'had', '.']
+    >>> comparison.entity_2
+    'models'
+    >>> (comparison.feature, comparison.keyword)
+    ('rewind', 'more')
+    >>> len(comparative_sentences.comparisons())
+    853
+
+opinion_lexicon
+---------------
+A list of positive and negative opinion words or sentiment words for English.
 
-    >>> from nltk.corpus import senseval
-    >>> senseval.fileids()
-    ['hard.pos', 'interest.pos', 'line.pos', 'serve.pos']
-    >>> senseval.instances('hard.pos')
-    ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-    [SensevalInstance(word='hard-a',
-        position=20,
-        context=[('``', '``'), ('he', 'PRP'), ...('hard', 'JJ'), ...],
-        senses=('HARD1',)),
-     SensevalInstance(word='hard-a',
-        position=10,
-        context=[('clever', 'NNP'), ...('hard', 'JJ'), ('time', 'NN'), ...],
-        senses=('HARD1',)), ...]
+    >>> from nltk.corpus import opinion_lexicon
+    >>> opinion_lexicon.words()[:4]
+    ['2-faced', '2-faces', 'abnormal', 'abolish']
 
-The following code looks at instances of the word 'interest', and
-displays their local context (2 words on each side) and word sense(s):
+The OpinionLexiconCorpusReader also provides shortcuts to retrieve positive/negative
+words:
 
-    >>> for inst in senseval.instances('interest.pos')[:10]:
-    ...     p = inst.position
-    ...     left = ' '.join(w for (w,t) in inst.context[p-2:p])
-    ...     word = ' '.join(w for (w,t) in inst.context[p:p+1])
-    ...     right = ' '.join(w for (w,t) in inst.context[p+1:p+3])
-    ...     senses = ' '.join(inst.senses)
-    ...     print('%20s |%10s | %-15s -> %s' % (left, word, right, senses))
-             declines in |  interest | rates .         -> interest_6
-      indicate declining |  interest | rates because   -> interest_6
-           in short-term |  interest | rates .         -> interest_6
-                     4 % |  interest | in this         -> interest_5
-            company with | interests | in the          -> interest_5
-                  , plus |  interest | .               -> interest_6
-                 set the |  interest | rate on         -> interest_6
-                  's own |  interest | , prompted      -> interest_4
-           principal and |  interest | is the          -> interest_6
-            increase its |  interest | to 70           -> interest_5
+    >>> opinion_lexicon.negative()[:4]
+    ['2-faced', '2-faces', 'abnormal', 'abolish']
+
+Note that the words returned by the `words()` method of opinion_lexicon are
+sorted by file id, not alphabetically:
+
+    >>> opinion_lexicon.words()[0:10]
+    ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably',
+    'abominate', 'abomination', 'abort', 'aborted']
+    >>> sorted(opinion_lexicon.words())[0:10]
+    ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably',
+    'abominate', 'abomination', 'abort']
 
 ppattach
 --------
@@ -613,6 +613,51 @@ corpus is encoded as a ``PPAttachment`` object:
     >>> inst.attachment
     'V'
 
+product_reviews_1 and product_reviews_2
+---------------------------------------
+These two datasets respectively contain annotated customer reviews of 5 and 9
+products from amazon.com.
+
+    >>> from nltk.corpus import product_reviews_1
+    >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
+    >>> review = camera_reviews[0]
+    >>> review.sents()[0]
+    ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
+    'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
+    >>> review.features()
+    [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
+    ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
+    ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
+    ('option', '+1')]
+
+It is also possible to access the same information directly from the corpus reader:
+
+    >>> product_reviews_1.features('Canon_G3.txt')
+    [('canon powershot g3', '+3'), ('use', '+2'), ...]
+
+We can compute stats for specific product features:
+
+    >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
+    >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
+    >>> # We use float for backward compatibility with division in Python 2.7
+    >>> mean = float(tot)/n_reviews
+    >>> print(n_reviews, tot, mean)
+    15 24 1.6
+
+pros_cons
+---------
+A list of pros/cons sentences for determining context (aspect) dependent
+sentiment words, which are then applied to sentiment analysis of comparative
+sentences.
+
+    >>> from nltk.corpus import pros_cons
+    >>> pros_cons.sents(categories='Cons')
+    [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
+    'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
+    ...]
+    >>> pros_cons.words('IntegratedPros.txt')
+    ['Easy', 'to', 'use', ',', 'economical', '!', ...]
+
 semcor
 ------
 The Brown Corpus, annotated with WordNet senses.
@@ -653,6 +698,66 @@ The Brown Corpus, annotated with WordNet senses.
     counter cmd=done lemma=counter lexsn=1:06:00:: pos=NN wnsn=1
     .
 
+senseval
+--------
+The Senseval 2 corpus is a word sense disambiguation corpus.  Each
+item in the corpus corresponds to a single ambiguous word.  For each
+of these words, the corpus contains a list of instances, corresponding
+to occurrences of that word.  Each instance provides the word; a list
+of word senses that apply to the word occurrence; and the word's
+context.
+
+    >>> from nltk.corpus import senseval
+    >>> senseval.fileids()
+    ['hard.pos', 'interest.pos', 'line.pos', 'serve.pos']
+    >>> senseval.instances('hard.pos')
+    ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+    [SensevalInstance(word='hard-a',
+        position=20,
+        context=[('``', '``'), ('he', 'PRP'), ...('hard', 'JJ'), ...],
+        senses=('HARD1',)),
+     SensevalInstance(word='hard-a',
+        position=10,
+        context=[('clever', 'NNP'), ...('hard', 'JJ'), ('time', 'NN'), ...],
+        senses=('HARD1',)), ...]
+
+The following code looks at instances of the word 'interest', and
+displays their local context (2 words on each side) and word sense(s):
+
+    >>> for inst in senseval.instances('interest.pos')[:10]:
+    ...     p = inst.position
+    ...     left = ' '.join(w for (w,t) in inst.context[p-2:p])
+    ...     word = ' '.join(w for (w,t) in inst.context[p:p+1])
+    ...     right = ' '.join(w for (w,t) in inst.context[p+1:p+3])
+    ...     senses = ' '.join(inst.senses)
+    ...     print('%20s |%10s | %-15s -> %s' % (left, word, right, senses))
+             declines in |  interest | rates .         -> interest_6
+      indicate declining |  interest | rates because   -> interest_6
+           in short-term |  interest | rates .         -> interest_6
+                     4 % |  interest | in this         -> interest_5
+            company with | interests | in the          -> interest_5
+                  , plus |  interest | .               -> interest_6
+                 set the |  interest | rate on         -> interest_6
+                  's own |  interest | , prompted      -> interest_4
+           principal and |  interest | is the          -> interest_6
+            increase its |  interest | to 70           -> interest_5
+
+sentence_polarity
+-----------------
+The Sentence Polarity dataset contains 5331 positive and 5331 negative processed
+sentences.
+
+    >>> from nltk.corpus import sentence_polarity
+    >>> sentence_polarity.sents()
+    [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish',
+    'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find',
+    'it', 'funny', '.'], ...]
+    >>> sentence_polarity.categories()
+    ['neg', 'pos']
+    >>> sentence_polarity.sents()[1]
+    ["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys',
+    'could', 'possibly', 'find', 'it', 'funny', '.']
+
 shakespeare
 -----------
 The Shakespeare corpus contains a set of Shakespeare plays, formatted
@@ -680,6 +785,20 @@ as XML files.  These corpora are returned as ElementTree objects:
      'Lion', 'MOTH', 'MUSTARDSEED', 'Moonshine', 'PEASEBLOSSOM',
      'Prologue', 'Pyramus', 'Thisbe', 'Wall']
 
+subjectivity
+------------
+The Subjectivity Dataset contains 5000 subjective and 5000 objective processed
+sentences.
+
+    >>> from nltk.corpus import subjectivity
+    >>> subjectivity.categories()
+    ['obj', 'subj']
+    >>> subjectivity.sents()[23]
+    ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits',
+    'happened', 'off', 'screen', '.']
+    >>> subjectivity.words(categories='subj')
+    ['smart', 'and', 'alert', ',', 'thirteen', ...]
+
 toolbox
 -------
 The Toolbox corpus distributed with NLTK contains a sample lexicon and
@@ -843,7 +962,6 @@ the `tokenized()` method returns a list of lists of tokens.
     [['RT', '@KirkKus', ':', 'Indirect', 'cost', 'of', 'the', 'UK', 'being', 'in', ...],
      ['VIDEO', ':', 'Sturgeon', 'on', 'post-election', 'deals', 'http://t.co/BTJwrpbmOY'], ...]
 
-
 rte
 ---
 The RTE (Recognizing Textual Entailment) corpus was derived from the
@@ -991,6 +1109,24 @@ the XML structure directly, as follows:
     >>> [t.attrib['pos'] + "/" + t.attrib['word'] for t in tokens]
     ['RB/now', 'PRP/im', 'VBD/left', 'IN/with', 'DT/this', 'JJ/gay', 'NN/name']
 
+multext_east
+------------
+
+The Multext-East Corpus consists of POS-tagged versions of George Orwell's book
+1984 in 12 languages: English, Czech, Hungarian, Macedonian, Slovenian, Serbian,
+Slovak, Romanian, Estonian, Farsi, Bulgarian and Polish.
+The corpus can be accessed using the usual methods for tagged corpora. The tagset
+can be transformed from the Multext-East specific MSD tags to the Universal tagset
+using the "tagset" parameter of all functions returning tagged parts of the corpus.
+
+    >>> print(nltk.corpus.multext_east.words("oana-en.xml"))
+    ['It', 'was', 'a', 'bright', ...]
+    >>> print(nltk.corpus.multext_east.tagged_words("oana-en.xml"))
+    [('It', '#Pp3ns'), ('was', '#Vmis3s'), ('a', '#Di'), ...]
+    >>> print(nltk.corpus.multext_east.tagged_sents("oana-en.xml", "universal"))
+    [[('It', 'PRON'), ('was', 'VERB'), ('a', 'DET'), ...]
+
+
 
 ---------------------
 Corpus Reader Classes
@@ -1079,7 +1215,7 @@ reader for it with::
     ...     '/usr/share/some-corpus', '.*\.txt') # doctest: +SKIP
 
 For a complete list of corpus reader subclasses, see the API
-documentation for `nltk.corpus.CorpusReader`.
+documentation for `nltk.corpus.reader`.
 
 Corpus Types
 ============
@@ -2073,4 +2209,3 @@ access to its tuples() method
 
 
 
-
diff --git a/nltk/test/dependency.doctest b/nltk/test/dependency.doctest
index 56b5625..0972a11 100755
--- a/nltk/test/dependency.doctest
+++ b/nltk/test/dependency.doctest
@@ -63,6 +63,33 @@ CoNLL Data
     (Nov., NNP), NMOD, (29, CD)
     (join, VB), VMOD, (., .)
 
+Using a custom cell extractor.
+
+    >>> def custom_extractor(cells):
+    ...     _, tag, head, rel = cells
+    ...     return 'spam', 'spam', tag, tag, '', head, rel
+    >>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor)
+    >>> dg.tree().pprint()
+    (spam
+      (spam spam spam (spam (spam spam)) spam)
+      (spam (spam spam) (spam (spam spam spam)) (spam spam) spam))
+
+A custom cell extractor can also accept the token index and include it in the tuple it returns.
+
+    >>> def custom_extractor(cells, index):
+    ...     word, tag, head, rel = cells
+    ...     return (index, '{}-{}'.format(word, index), word,
+    ...             tag, tag, '', head, rel)
+    >>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor)
+    >>> dg.tree().pprint()
+    (will-8
+      (Vinken-2 Pierre-1 ,-3 (old-6 (years-5 61-4)) ,-7)
+      (join-9
+        (board-11 the-10)
+        (as-12 (director-15 a-13 nonexecutive-14))
+        (Nov.-16 29-17)
+        .-18))
+
 Using the dependency-parsed version of the Penn Treebank corpus sample.
 
     >>> from nltk.corpus import dependency_treebank
@@ -181,3 +208,34 @@ Non-Projective Dependency Parsing
 
     >>> print(g.tree())
     (taught (man the) (play (dog his) to golf))
+
+Integration with MALT parser
+============================
+
+If the label of the top relation differs from the default, it can be set
+explicitly. The MALT parser, for instance, uses `'null'`.
+
+>>> dg_str = """1       I       _       NN      NN      _       2       nn      _       _
+... 2   shot    _       NN      NN      _       0       null    _       _
+... 3   an      _       AT      AT      _       2       dep     _       _
+... 4   elephant        _       NN      NN      _       7       nn      _       _
+... 5   in      _       NN      NN      _       7       nn      _       _
+... 6   my      _       NN      NN      _       7       nn      _       _
+... 7   pajamas _       NNS     NNS     _       3       dobj    _       _
+... """
+>>> dg = DependencyGraph(dg_str, top_relation_label='null')
+
+>>> len(dg.nodes)
+8
+
+>>> dg.root['word'], dg.root['address']
+('shot', 2)
+
+>>> print(dg.to_conll(10))  # doctest: +NORMALIZE_WHITESPACE
+1   I       _       NN      NN      _       2       nn      _       _
+2   shot    _       NN      NN      _       0       null    _       _
+3   an      _       AT      AT      _       2       dep     _       _
+4   elephant        _       NN      NN      _       7       nn      _       _
+5   in      _       NN      NN      _       7       nn      _       _
+6   my      _       NN      NN      _       7       nn      _       _
+7   pajamas _       NNS     NNS     _       3       dobj    _       _
diff --git a/nltk/test/gensim_fixt.py b/nltk/test/gensim_fixt.py
new file mode 100644
index 0000000..297e1c4
--- /dev/null
+++ b/nltk/test/gensim_fixt.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+def setup_module(module):
+    from nose import SkipTest
+    try:
+        import gensim
+    except ImportError:
+        raise SkipTest("Gensim doctest requires gensim")
diff --git a/nltk/test/sentiment.doctest b/nltk/test/sentiment.doctest
new file mode 100644
index 0000000..660653f
--- /dev/null
+++ b/nltk/test/sentiment.doctest
@@ -0,0 +1,233 @@
+.. Copyright (C) 2001-2015 NLTK Project
+.. For license information, see LICENSE.TXT
+
+===================
+Sentiment Analysis
+===================
+
+    >>> from nltk.classify import NaiveBayesClassifier
+    >>> from nltk.corpus import subjectivity
+    >>> from nltk.sentiment import SentimentAnalyzer
+    >>> from nltk.sentiment.util import *
+
+    >>> n_instances = 100
+    >>> subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
+    >>> obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
+    >>> len(subj_docs), len(obj_docs)
+    (100, 100)
+
+Each document is represented by a tuple (sentence, label). The sentence is tokenized,
+so it is represented by a list of strings:
+
+    >>> subj_docs[0]
+    (['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one',
+    'thing', 'is', 'a', 'small', 'gem', '.'], 'subj')
+
+We split the subjective and objective instances separately in order to keep a
+balanced class distribution in both the training and test sets.
+
+    >>> train_subj_docs = subj_docs[:80]
+    >>> test_subj_docs = subj_docs[80:100]
+    >>> train_obj_docs = obj_docs[:80]
+    >>> test_obj_docs = obj_docs[80:100]
+    >>> training_docs = train_subj_docs+train_obj_docs
+    >>> testing_docs = test_subj_docs+test_obj_docs
+
+    >>> sentim_analyzer = SentimentAnalyzer()
+    >>> all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
+
+We use simple unigram word features, handling negation:
+
+    >>> unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
+    >>> len(unigram_feats)
+    83
+    >>> sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
+
+We apply features to obtain a feature-value representation of our datasets:
+
+    >>> training_set = sentim_analyzer.apply_features(training_docs)
+    >>> test_set = sentim_analyzer.apply_features(testing_docs)
+
+We can now train our classifier on the training set, and subsequently output the
+evaluation results:
+
+    >>> trainer = NaiveBayesClassifier.train
+    >>> classifier = sentim_analyzer.train(trainer, training_set)
+    Training classifier
+    >>> for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
+    ...     print('{0}: {1}'.format(key, value))
+    Evaluating NaiveBayesClassifier results...
+    Accuracy: 0.8
+    F-measure [obj]: 0.8
+    F-measure [subj]: 0.8
+    Precision [obj]: 0.8
+    Precision [subj]: 0.8
+    Recall [obj]: 0.8
+    Recall [subj]: 0.8
+
+
+Vader
+------
+
+    >>> from nltk.sentiment.vader import SentimentIntensityAnalyzer
+    >>> sentences = ["VADER is smart, handsome, and funny.", # positive sentence example
+    ...    "VADER is smart, handsome, and funny!", # punctuation emphasis handled correctly (sentiment intensity adjusted)
+    ...    "VADER is very smart, handsome, and funny.",  # booster words handled correctly (sentiment intensity adjusted)
+    ...    "VADER is VERY SMART, handsome, and FUNNY.",  # emphasis for ALLCAPS handled
+    ...    "VADER is VERY SMART, handsome, and FUNNY!!!",# combination of signals - VADER appropriately adjusts intensity
+    ...    "VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!",# booster words & punctuation make this close to ceiling for score
+    ...    "The book was good.",         # positive sentence
+    ...    "The book was kind of good.", # qualified positive sentence is handled correctly (intensity adjusted)
+    ...    "The plot was good, but the characters are uncompelling and the dialog is not great.", # mixed negation sentence
+    ...    "A really bad, horrible book.",       # negative sentence with booster words
+    ...    "At least it isn't a horrible book.", # negated negative sentence with contraction
+    ...    ":) and :D",     # emoticons handled
+    ...    "",              # an empty string is correctly handled
+    ...    "Today sux",     #  negative slang handled
+    ...    "Today sux!",    #  negative slang with punctuation emphasis handled
+    ...    "Today SUX!",    #  negative slang with capitalization emphasis
+    ...    "Today kinda sux! But I'll get by, lol" # mixed sentiment example with slang and constrastive conjunction "but"
+    ... ]
+    >>> paragraph = "It was one of the worst movies I've seen, despite good reviews. \
+    ... Unbelievably bad acting!! Poor direction. VERY poor production. \
+    ... The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"
+
+    >>> from nltk import tokenize
+    >>> lines_list = tokenize.sent_tokenize(paragraph)
+    >>> sentences.extend(lines_list)
+
+    >>> tricky_sentences = [
+    ...    "Most automated sentiment analysis tools are shit.",
+    ...    "VADER sentiment analysis is the shit.",
+    ...    "Sentiment analysis has never been good.",
+    ...    "Sentiment analysis with VADER has never been this good.",
+    ...    "Warren Beatty has never been so entertaining.",
+    ...    "I won't say that the movie is astounding and I wouldn't claim that \
+    ...    the movie is too banal either.",
+    ...    "I like to hate Michael Bay films, but I couldn't fault this one",
+    ...    "It's one thing to watch an Uwe Boll film, but another thing entirely \
+    ...    to pay for it",
+    ...    "The movie was too good",
+    ...    "This movie was actually neither that funny, nor super witty.",
+    ...    "This movie doesn't care about cleverness, wit or any other kind of \
+    ...    intelligent humor.",
+    ...    "Those who find ugly meanings in beautiful things are corrupt without \
+    ...    being charming.",
+    ...    "There are slow and repetitive parts, BUT it has just enough spice to \
+    ...    keep it interesting.",
+    ...    "The script is not fantastic, but the acting is decent and the cinematography \
+    ...    is EXCELLENT!",
+    ...    "Roger Dodger is one of the most compelling variations on this theme.",
+    ...    "Roger Dodger is one of the least compelling variations on this theme.",
+    ...    "Roger Dodger is at least compelling as a variation on the theme.",
+    ...    "they fall in love with the product",
+    ...    "but then it breaks",
+    ...    "usually around the time the 90 day warranty expires",
+    ...    "the twin towers collapsed today",
+    ...    "However, Mr. Carter solemnly argues, his client carried out the kidnapping \
+    ...    under orders and in the ''least offensive way possible.''"
+    ... ]
+    >>> sentences.extend(tricky_sentences)
+    >>> sid = SentimentIntensityAnalyzer()
+    >>> for sentence in sentences:
+    ...     print(sentence)
+    ...     ss = sid.polarity_scores(sentence)
+    ...     for k in sorted(ss):
+    ...         print('{0}: {1}, '.format(k, ss[k]), end='')
+    ...     print()
+    VADER is smart, handsome, and funny.
+    compound: 0.8316, neg: 0.0, neu: 0.254, pos: 0.746,
+    VADER is smart, handsome, and funny!
+    compound: 0.8439, neg: 0.0, neu: 0.248, pos: 0.752,
+    VADER is very smart, handsome, and funny.
+    compound: 0.8545, neg: 0.0, neu: 0.299, pos: 0.701,
+    VADER is VERY SMART, handsome, and FUNNY.
+    compound: 0.9227, neg: 0.0, neu: 0.246, pos: 0.754,
+    VADER is VERY SMART, handsome, and FUNNY!!!
+    compound: 0.9342, neg: 0.0, neu: 0.233, pos: 0.767,
+    VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!
+    compound: 0.9469, neg: 0.0, neu: 0.294, pos: 0.706,
+    The book was good.
+    compound: 0.4404, neg: 0.0, neu: 0.508, pos: 0.492,
+    The book was kind of good.
+    compound: 0.3832, neg: 0.0, neu: 0.657, pos: 0.343,
+    The plot was good, but the characters are uncompelling and the dialog is not great.
+    compound: -0.7042, neg: 0.327, neu: 0.579, pos: 0.094,
+    A really bad, horrible book.
+    compound: -0.8211, neg: 0.791, neu: 0.209, pos: 0.0,
+    At least it isn't a horrible book.
+    compound: 0.431, neg: 0.0, neu: 0.637, pos: 0.363,
+    :) and :D
+    compound: 0.7925, neg: 0.0, neu: 0.124, pos: 0.876,
+    <BLANKLINE>
+    compound: 0.0, neg: 0.0, neu: 0.0, pos: 0.0,
+    Today sux
+    compound: -0.3612, neg: 0.714, neu: 0.286, pos: 0.0,
+    Today sux!
+    compound: -0.4199, neg: 0.736, neu: 0.264, pos: 0.0,
+    Today SUX!
+    compound: -0.5461, neg: 0.779, neu: 0.221, pos: 0.0,
+    Today kinda sux! But I'll get by, lol
+    compound: 0.2228, neg: 0.195, neu: 0.531, pos: 0.274,
+    It was one of the worst movies I've seen, despite good reviews.
+    compound: -0.7584, neg: 0.394, neu: 0.606, pos: 0.0,
+    Unbelievably bad acting!!
+    compound: -0.6572, neg: 0.686, neu: 0.314, pos: 0.0,
+    Poor direction.
+    compound: -0.4767, neg: 0.756, neu: 0.244, pos: 0.0,
+    VERY poor production.
+    compound: -0.6281, neg: 0.674, neu: 0.326, pos: 0.0,
+    The movie was bad.
+    compound: -0.5423, neg: 0.538, neu: 0.462, pos: 0.0,
+    Very bad movie.
+    compound: -0.5849, neg: 0.655, neu: 0.345, pos: 0.0,
+    VERY bad movie.
+    compound: -0.6732, neg: 0.694, neu: 0.306, pos: 0.0,
+    VERY BAD movie.
+    compound: -0.7398, neg: 0.724, neu: 0.276, pos: 0.0,
+    VERY BAD movie!
+    compound: -0.7616, neg: 0.735, neu: 0.265, pos: 0.0,
+    Most automated sentiment analysis tools are shit.
+    compound: -0.5574, neg: 0.375, neu: 0.625, pos: 0.0,
+    VADER sentiment analysis is the shit.
+    compound: 0.6124, neg: 0.0, neu: 0.556, pos: 0.444,
+    Sentiment analysis has never been good.
+    compound: -0.3412, neg: 0.325, neu: 0.675, pos: 0.0,
+    Sentiment analysis with VADER has never been this good.
+    compound: 0.5228, neg: 0.0, neu: 0.703, pos: 0.297,
+    Warren Beatty has never been so entertaining.
+    compound: 0.5777, neg: 0.0, neu: 0.616, pos: 0.384,
+    I won't say that the movie is astounding and I wouldn't claim that the movie is too banal either.
+    compound: 0.4215, neg: 0.0, neu: 0.851, pos: 0.149,
+    I like to hate Michael Bay films, but I couldn't fault this one
+    compound: 0.3153, neg: 0.157, neu: 0.534, pos: 0.309,
+    It's one thing to watch an Uwe Boll film, but another thing entirely to pay for it
+    compound: -0.2541, neg: 0.112, neu: 0.888, pos: 0.0,
+    The movie was too good
+    compound: 0.4404, neg: 0.0, neu: 0.58, pos: 0.42,
+    This movie was actually neither that funny, nor super witty.
+    compound: -0.6759, neg: 0.41, neu: 0.59, pos: 0.0,
+    This movie doesn't care about cleverness, wit or any other kind of intelligent humor.
+    compound: -0.1338, neg: 0.265, neu: 0.497, pos: 0.239,
+    Those who find ugly meanings in beautiful things are corrupt without being charming.
+    compound: -0.3553, neg: 0.314, neu: 0.493, pos: 0.192,
+    There are slow and repetitive parts, BUT it has just enough spice to keep it interesting.
+    compound: 0.4678, neg: 0.079, neu: 0.735, pos: 0.186,
+    The script is not fantastic, but the acting is decent and the cinematography is EXCELLENT!
+    compound: 0.7565, neg: 0.092, neu: 0.607, pos: 0.301,
+    Roger Dodger is one of the most compelling variations on this theme.
+    compound: 0.2944, neg: 0.0, neu: 0.834, pos: 0.166,
+    Roger Dodger is one of the least compelling variations on this theme.
+    compound: -0.1695, neg: 0.132, neu: 0.868, pos: 0.0,
+    Roger Dodger is at least compelling as a variation on the theme.
+    compound: 0.2263, neg: 0.0, neu: 0.84, pos: 0.16,
+    they fall in love with the product
+    compound: 0.6369, neg: 0.0, neu: 0.588, pos: 0.412,
+    but then it breaks
+    compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0,
+    usually around the time the 90 day warranty expires
+    compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0,
+    the twin towers collapsed today
+    compound: -0.2732, neg: 0.344, neu: 0.656, pos: 0.0,
+    However, Mr. Carter solemnly argues, his client carried out the kidnapping under orders and in the ''least offensive way possible.''
+    compound: -0.5859, neg: 0.23, neu: 0.697, pos: 0.074,
\ No newline at end of file
diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest
index 135a864..f271251 100644
--- a/nltk/test/tokenize.doctest
+++ b/nltk/test/tokenize.doctest
@@ -73,45 +73,39 @@ Some additional test strings.
     ['Alas', 'it has not rained today', 'When', 'do you think',
      'will it rain again']
 
-Make sure that grouping parentheses don't confuse the tokenizer:
+Take care to avoid using capturing groups:
 
-    >>> regexp_tokenize(s3, r'</?(b|p)>', gaps=False)
+    >>> regexp_tokenize(s3, r'</?[bp]>', gaps=False)
     ['<p>', '<b>', '</b>', '</p>']
-    >>> regexp_tokenize(s3, r'</?(b|p)>', gaps=True)
+    >>> regexp_tokenize(s3, r'</?(?:b|p)>', gaps=False)
+    ['<p>', '<b>', '</b>', '</p>']
+    >>> regexp_tokenize(s3, r'</?(?:b|p)>', gaps=True)
     ['Although this is ', 'not',
      ' the case here, we must not relax our vigilance!']
 
-Make sure that named groups don't confuse the tokenizer:
+Named groups are capturing groups, and confuse the tokenizer:
 
     >>> regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=False)
-    ['<p>', '<b>', '</b>', '</p>']
+    ['p', 'b', 'b', 'p']
     >>> regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=True)
-    ['Although this is ', 'not',
-     ' the case here, we must not relax our vigilance!']
+    ['p', 'Although this is ', 'b', 'not', 'b',
+     ' the case here, we must not relax our vigilance!', 'p']
 
 Make sure that nested groups don't confuse the tokenizer:
 
-    >>> regexp_tokenize(s2, r'(h|r|l)a(s|(i|n0))', gaps=False)
+    >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=False)
     ['las', 'has', 'rai', 'rai']
-    >>> regexp_tokenize(s2, r'(h|r|l)a(s|(i|n0))', gaps=True)
+    >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=True)
     ['A', ', it ', ' not ', 'ned today. When, do you think, will it ',
      'n again?']
 
-The tokenizer should reject any patterns with backreferences:
+Back-references require capturing groups, which the tokenizer does not support; the group contents are returned in place of the full matches:
 
-    >>> regexp_tokenize(s2, r'(.)\1')
-    Traceback (most recent call last):
-       ...
-    ValueError: Regular expressions with back-references are
-    not supported: '(.)\\1'
-    >>> regexp_tokenize(s2, r'(?P<foo>)(?P=foo)')
-    Traceback (most recent call last):
-       ...
-    ValueError: Regular expressions with back-references are
-    not supported: '(?P<foo>)(?P=foo)'
+    >>> regexp_tokenize("aabbbcccc", r'(.)\1')
+    ['a', 'b', 'c', 'c']
 
 A simple sentence tokenizer '\.(\s+|$)'
 
-    >>> regexp_tokenize(s, pattern=r'\.(\s+|$)', gaps=True)
+    >>> regexp_tokenize(s, pattern=r'\.(?:\s+|$)', gaps=True)
     ['Good muffins cost $3.88\nin New York',
      'Please buy me\ntwo of them', 'Thanks']
diff --git a/nltk/test/align.doctest b/nltk/test/translate.doctest
similarity index 79%
rename from nltk/test/align.doctest
rename to nltk/test/translate.doctest
index 9dbcfea..78887d9 100644
--- a/nltk/test/align.doctest
+++ b/nltk/test/translate.doctest
@@ -52,7 +52,7 @@ but they are easily inverted:
 We can create new alignments, but these need to be in the correct range of
 the corresponding sentences:
 
-    >>> from nltk.align import Alignment, AlignedSent
+    >>> from nltk.translate import Alignment, AlignedSent
     >>> als = AlignedSent(['Reprise', 'de', 'la', 'session'],
     ...                   ['Resumption', 'of', 'the', 'session'],
     ...                   Alignment([(0, 0), (1, 4), (2, 1), (3, 3)]))
@@ -64,7 +64,7 @@ the corresponding sentences:
 You can set alignments with any sequence of tuples, so long as the first two
 indexes of the tuple are the alignment indices:
 
-als.alignment = Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))])
+    >>> als.alignment = Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))])
 
     >>> Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))])
     Alignment([(0, 0), (1, 1), (2, 2, 'boat'), (3, 3, False, (1, 2))])
@@ -78,7 +78,7 @@ EM for IBM Model 1
 
 Here is an example from Koehn, 2010:
 
-    >>> from nltk.align import IBMModel1
+    >>> from nltk.translate import IBMModel1
     >>> corpus = [AlignedSent(['the', 'house'], ['das', 'Haus']),
     ...           AlignedSent(['the', 'book'], ['das', 'Buch']),
     ...           AlignedSent(['a', 'book'], ['ein', 'Buch'])]
@@ -132,15 +132,14 @@ Consider the following aligned sentence for evaluation:
 
     >>> my_als = AlignedSent(['Resumption', 'of', 'the', 'session'],
     ...     ['Reprise', 'de', 'la', 'session'],
-    ...     [(0, 0), (3, 3), (1, 2), (1, 1), (1, 3)])
+    ...     Alignment([(0, 0), (3, 3), (1, 2), (1, 1), (1, 3)]))
 
 Precision
 ~~~~~~~~~
 ``precision = |A∩P| / |A|``
 
-**Precision** is probably the most well known evaluation metric and there is
-already a set based implementation in NLTK as
-`nltk.metrics.scores.precision`_.  Since precision is simply interested in the
+**Precision** is probably the best-known evaluation metric, and it is implemented
+in `nltk.metrics.scores.precision`_.  Since precision is only concerned with the
 proportion of correct alignments, we calculate the ratio of the number of our
 test alignments (*A*) that match a possible alignment (*P*), over the number of
 test alignments provided. There is no penalty for missing a possible alignment
@@ -149,15 +148,19 @@ test alignment that is in *P* [OCH2000]_.
 
 Here are some examples:
 
-    >>> print(als.precision(set()))
+    >>> from nltk.metrics import precision
+    >>> als.alignment = Alignment([(0,0), (1,1), (2,2), (3,3)])
+    >>> precision(Alignment([]), als.alignment)
     0.0
-    >>> print(als.precision([(0,0), (1,1), (2,2), (3,3)]))
+    >>> precision(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment)
     1.0
-    >>> print(als.precision([(0,0), (3,3)]))
+    >>> precision(Alignment([(0,0), (3,3)]), als.alignment)
     0.5
-    >>> print(als.precision([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]))
+    >>> precision(Alignment.fromstring('0-0 3-3'), als.alignment)
+    0.5
+    >>> precision(Alignment([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]), als.alignment)
     1.0
-    >>> print(my_als.precision(als))
+    >>> precision(als.alignment, my_als.alignment)
     0.6
 
 
@@ -180,15 +183,18 @@ not [OCH2000]_.
 
 Here are some examples:
 
-    >>> print(als.recall(set()))
+    >>> from nltk.metrics import recall
+    >>> print(recall(Alignment([]), als.alignment))
     None
-    >>> print(als.recall([(0,0), (1,1), (2,2), (3,3)]))
+    >>> recall(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment)
+    1.0
+    >>> recall(Alignment.fromstring('0-0 3-3'), als.alignment)
     1.0
-    >>> print(als.recall([(0,0), (3,3)]))
+    >>> recall(Alignment([(0,0), (3,3)]), als.alignment)
     1.0
-    >>> als.recall([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)])
+    >>> recall(Alignment([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]), als.alignment)
     0.66666...
-    >>> print(my_als.recall(als))
+    >>> recall(als.alignment, my_als.alignment)
     0.75
 
 
@@ -212,14 +218,15 @@ possible alignments [MIHALCEA2003]_ [KOEHN2010]_.
 
 Here are some examples:
 
-    >>> print(als.alignment_error_rate(set()))
+    >>> from nltk.translate import alignment_error_rate
+    >>> alignment_error_rate(Alignment([]), als.alignment)
     1.0
-    >>> print(als.alignment_error_rate([(0,0), (1,1), (2,2), (3,3)]))
+    >>> alignment_error_rate(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment)
     0.0
-    >>> my_als.alignment_error_rate(als)
+    >>> alignment_error_rate(als.alignment, my_als.alignment)
     0.333333...
-    >>> my_als.alignment_error_rate(als,
-    ...     als.alignment | set([(1,2), (2,1)]))
+    >>> alignment_error_rate(als.alignment, my_als.alignment,
+    ...     als.alignment | Alignment([(1,2), (2,1)]))
     0.222222...
 
 
diff --git a/nltk/test/align_fixt.py b/nltk/test/translate_fixt.py
similarity index 100%
rename from nltk/test/align_fixt.py
rename to nltk/test/translate_fixt.py
diff --git a/nltk/test/unit/align/__init__.py b/nltk/test/unit/align/__init__.py
deleted file mode 100644
index 8b13789..0000000
--- a/nltk/test/unit/align/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/nltk/test/unit/test_corpora.py b/nltk/test/unit/test_corpora.py
index 8b39b45..1f0314b 100644
--- a/nltk/test/unit/test_corpora.py
+++ b/nltk/test/unit/test_corpora.py
@@ -40,11 +40,13 @@ class TestCess(unittest.TestCase):
         words = cess_cat.words()[:15]
         txt = "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna a quatre anys d' inhabilitació especial"
         self.assertEqual(words, txt.split())
+        self.assertEqual(cess_cat.tagged_sents()[0][34][0], "càrrecs")
 
     def test_esp(self):
         words = cess_esp.words()[:15]
         txt = "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del"
         self.assertEqual(words, txt.split())
+        self.assertEqual(cess_esp.words()[115], "años")
 
 
 class TestFloresta(unittest.TestCase):
diff --git a/nltk/test/unit/test_json2csv_corpus.py b/nltk/test/unit/test_json2csv_corpus.py
index 4194821..bcbff7e 100644
--- a/nltk/test/unit/test_json2csv_corpus.py
+++ b/nltk/test/unit/test_json2csv_corpus.py
@@ -17,7 +17,7 @@ from nltk.compat import TemporaryDirectory
 import unittest
 
 from nltk.corpus import twitter_samples
-from nltk.twitter.util import json2csv, json2csv_entities
+from nltk.twitter.common import json2csv, json2csv_entities
 from nltk.compat import izip
 
 
@@ -34,8 +34,8 @@ def are_files_identical(filename1, filename2, debug=False):
                     if debug:
                         print("Error while comparing files. " +
                               "First difference at line below.")
-                        print("=> Output file line: {}".format(lineA))
-                        print("=> Refer. file line: {}".format(lineB))
+                        print("=> Output file line: {0}".format(lineA))
+                        print("=> Refer. file line: {0}".format(lineB))
                     result = False
                     break
             return result
diff --git a/nltk/test/unit/translate/__init__.py b/nltk/test/unit/translate/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/nltk/test/unit/translate/test_bleu.py b/nltk/test/unit/translate/test_bleu.py
new file mode 100644
index 0000000..762bd73
--- /dev/null
+++ b/nltk/test/unit/translate/test_bleu.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for BLEU translation evaluation metric
+"""
+
+import unittest
+from nltk.translate.bleu_score import _modified_precision
+
+class TestBLEU(unittest.TestCase):
+    def test__modified_precision(self):
+        """
+        Examples from the original BLEU paper 
+        http://www.aclweb.org/anthology/P02-1040.pdf
+        """
+        # Example 1: the "the*" example.
+        # Reference sentences.
+        ref1 = 'the cat is on the mat'.split()
+        ref2 = 'there is a cat on the mat'.split()
+        # Hypothesis sentence(s).
+        hyp1 = 'the the the the the the the'.split()
+        
+        references = [ref1, ref2] 
+        
+        # Testing modified unigram precision.
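+        # "the" appears 7 times in hyp1, but at most twice in any single
+        # reference (ref1), so its clipped count is 2 and the modified
+        # unigram precision is 2 / 7 ~= 0.2857.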
+        hyp1_unigram_precision =  _modified_precision(references, hyp1, n=1) 
+        assert (round(hyp1_unigram_precision, 4) == 0.2857)
+        
+        # Testing modified bigram precision.
+        assert(_modified_precision(references, hyp1, n=2) == 0.0)
+        
+        
+        # Example 2: the "of the" example.
+        # Reference sentences
+        ref1 = str('It is a guide to action that ensures that the military '
+                   'will forever heed Party commands').split()
+        ref2 = str('It is the guiding principle which guarantees the military '
+                   'forces always being under the command of the Party').split()
+        ref3 = str('It is the practical guide for the army always to heed '
+                   'the directions of the party').split()
+        # Hypothesis sentence(s).
+        hyp1 = 'of the'.split()
+        
+        references = [ref1, ref2, ref3] 
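+        # Every hypothesis unigram ("of", "the") occurs in the references,
+        # and the bigram "of the" occurs in ref2 and ref3, so both the
+        # unigram and bigram precisions clip to 1.0.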
+        # Testing modified unigram precision.
+        assert (_modified_precision(references, hyp1, n=1) == 1.0)
+        
+        # Testing modified bigram precision.
+        assert(_modified_precision(references, hyp1, n=2) == 1.0)
+        
+
+        # Example 3: Proper MT outputs.
+        hyp1 = str('It is a guide to action which ensures that the military '
+                   'always obeys the commands of the party').split()
+        hyp2 = str('It is to insure the troops forever hearing the activity '
+                   'guidebook that party direct').split()
+        
+        references = [ref1, ref2, ref3]
+        
+        # Unigram precision.
+        hyp1_unigram_precision = _modified_precision(references, hyp1, n=1)
+        hyp2_unigram_precision = _modified_precision(references, hyp2, n=1)
+        # Test unigram precision without rounding.
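+        # hyp1 has 18 unigrams and all but "obeys" occur in the references
+        # (after clipping), hence 17 / 18 = 0.9444...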
+        assert (hyp1_unigram_precision == 0.9444444444444444)
+        assert (hyp2_unigram_precision == 0.5714285714285714)
+        # Test unigram precision with rounding.
+        assert (round(hyp1_unigram_precision, 4) == 0.9444)
+        assert (round(hyp2_unigram_precision, 4) == 0.5714)
+        
+        # Bigram precision
+        hyp1_bigram_precision = _modified_precision(references, hyp1, n=2)
+        hyp2_bigram_precision = _modified_precision(references, hyp2, n=2)
+        # Test bigram precision without rounding.
+        assert (hyp1_bigram_precision == 0.5882352941176471)
+        assert (hyp2_bigram_precision == 0.07692307692307693)
+        # Test bigram precision with rounding.
+        assert (round(hyp1_bigram_precision, 4) == 0.5882)
+        assert (round(hyp2_bigram_precision, 4) == 0.0769)
+        
+    def test_brevity_penalty(self):
+        pass
+    
\ No newline at end of file
diff --git a/nltk/test/unit/align/test_ibm1.py b/nltk/test/unit/translate/test_ibm1.py
similarity index 51%
rename from nltk/test/unit/align/test_ibm1.py
rename to nltk/test/unit/translate/test_ibm1.py
index cd76915..e703c05 100644
--- a/nltk/test/unit/align/test_ibm1.py
+++ b/nltk/test/unit/translate/test_ibm1.py
@@ -6,12 +6,45 @@ Tests for IBM Model 1 training methods
 import unittest
 
 from collections import defaultdict
-from nltk.align import AlignedSent
-from nltk.align.ibm_model import AlignmentInfo
-from nltk.align.ibm1 import IBMModel1
+from nltk.translate import AlignedSent
+from nltk.translate import IBMModel
+from nltk.translate import IBMModel1
+from nltk.translate.ibm_model import AlignmentInfo
 
 
 class TestIBMModel1(unittest.TestCase):
+    def test_set_uniform_translation_probabilities(self):
+        # arrange
+        corpus = [
+            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
+            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
+        ]
+        model1 = IBMModel1(corpus, 0)
+
+        # act
+        model1.set_uniform_probabilities(corpus)
+
+        # assert
+        # expected_prob = 1.0 / (target vocab size + 1)
+        self.assertEqual(model1.translation_table['ham']['eier'], 1.0 / 3)
+        self.assertEqual(model1.translation_table['eggs'][None], 1.0 / 3)
+
+    def test_set_uniform_translation_probabilities_of_non_domain_values(self):
+        # arrange
+        corpus = [
+            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
+            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
+        ]
+        model1 = IBMModel1(corpus, 0)
+
+        # act
+        model1.set_uniform_probabilities(corpus)
+
+        # assert
+        # examine target words that are not in the training data domain
+        self.assertEqual(model1.translation_table['parrot']['eier'],
+                         IBMModel.MIN_PROB)
+
     def test_prob_t_a_given_s(self):
         # arrange
         src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
diff --git a/nltk/test/unit/align/test_ibm2.py b/nltk/test/unit/translate/test_ibm2.py
similarity index 59%
rename from nltk/test/unit/align/test_ibm2.py
rename to nltk/test/unit/translate/test_ibm2.py
index 6f63bac..1ff01b9 100644
--- a/nltk/test/unit/align/test_ibm2.py
+++ b/nltk/test/unit/translate/test_ibm2.py
@@ -6,12 +6,45 @@ Tests for IBM Model 2 training methods
 import unittest
 
 from collections import defaultdict
-from nltk.align import AlignedSent
-from nltk.align.ibm_model import AlignmentInfo
-from nltk.align.ibm2 import IBMModel2
+from nltk.translate import AlignedSent
+from nltk.translate import IBMModel
+from nltk.translate import IBMModel2
+from nltk.translate.ibm_model import AlignmentInfo
 
 
 class TestIBMModel2(unittest.TestCase):
+    def test_set_uniform_alignment_probabilities(self):
+        # arrange
+        corpus = [
+            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
+            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
+        ]
+        model2 = IBMModel2(corpus, 0)
+
+        # act
+        model2.set_uniform_probabilities(corpus)
+
+        # assert
+        # expected_prob = 1.0 / (length of source sentence + 1)
+        self.assertEqual(model2.alignment_table[0][1][3][2], 1.0 / 4)
+        self.assertEqual(model2.alignment_table[2][4][2][4], 1.0 / 3)
+
+    def test_set_uniform_alignment_probabilities_of_non_domain_values(self):
+        # arrange
+        corpus = [
+            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
+            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
+        ]
+        model2 = IBMModel2(corpus, 0)
+
+        # act
+        model2.set_uniform_probabilities(corpus)
+
+        # assert
+        # examine i and j values that are not in the training data domain
+        self.assertEqual(model2.alignment_table[99][1][3][2], IBMModel.MIN_PROB)
+        self.assertEqual(model2.alignment_table[2][99][2][4], IBMModel.MIN_PROB)
+
     def test_prob_t_a_given_s(self):
         # arrange
         src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
diff --git a/nltk/test/unit/translate/test_ibm3.py b/nltk/test/unit/translate/test_ibm3.py
new file mode 100644
index 0000000..ee978b9
--- /dev/null
+++ b/nltk/test/unit/translate/test_ibm3.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for IBM Model 3 training methods
+"""
+
+import unittest
+
+from collections import defaultdict
+from nltk.translate import AlignedSent
+from nltk.translate import IBMModel
+from nltk.translate import IBMModel3
+from nltk.translate.ibm_model import AlignmentInfo
+
+
+class TestIBMModel3(unittest.TestCase):
+    def test_set_uniform_distortion_probabilities(self):
+        # arrange
+        corpus = [
+            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
+            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
+        ]
+        model3 = IBMModel3(corpus, 0)
+
+        # act
+        model3.set_uniform_probabilities(corpus)
+
+        # assert
+        # expected_prob = 1.0 / length of target sentence
+        self.assertEqual(model3.distortion_table[1][0][3][2], 1.0 / 2)
+        self.assertEqual(model3.distortion_table[4][2][2][4], 1.0 / 4)
+
+    def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
+        # arrange
+        corpus = [
+            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
+            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
+        ]
+        model3 = IBMModel3(corpus, 0)
+
+        # act
+        model3.set_uniform_probabilities(corpus)
+
+        # assert
+        # examine i and j values that are not in the training data domain
+        self.assertEqual(model3.distortion_table[0][0][3][2], IBMModel.MIN_PROB)
+        self.assertEqual(model3.distortion_table[9][2][2][4], IBMModel.MIN_PROB)
+        self.assertEqual(model3.distortion_table[2][9][2][4], IBMModel.MIN_PROB)
+
+    def test_prob_t_a_given_s(self):
+        # arrange
+        src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
+        trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
+        corpus = [AlignedSent(trg_sentence, src_sentence)]
+        alignment_info = AlignmentInfo((0, 1, 4, 0, 2, 5, 5),
+                                       [None] + src_sentence,
+                                       ['UNUSED'] + trg_sentence,
+                                       [[3], [1], [4], [], [2], [5, 6]])
+
+        distortion_table = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(
+                lambda: defaultdict(float))))
+        distortion_table[1][1][5][6] = 0.97  # i -> ich
+        distortion_table[2][4][5][6] = 0.97  # love -> gern
+        distortion_table[3][0][5][6] = 0.97  # to -> NULL
+        distortion_table[4][2][5][6] = 0.97  # eat -> esse
+        distortion_table[5][5][5][6] = 0.97  # smoked -> räucherschinken
+        distortion_table[6][5][5][6] = 0.97  # ham -> räucherschinken
+
+        translation_table = defaultdict(lambda: defaultdict(float))
+        translation_table['i']['ich'] = 0.98
+        translation_table['love']['gern'] = 0.98
+        translation_table['to'][None] = 0.98
+        translation_table['eat']['esse'] = 0.98
+        translation_table['smoked']['räucherschinken'] = 0.98
+        translation_table['ham']['räucherschinken'] = 0.98
+
+        fertility_table = defaultdict(lambda: defaultdict(float))
+        fertility_table[1]['ich'] = 0.99
+        fertility_table[1]['esse'] = 0.99
+        fertility_table[0]['ja'] = 0.99
+        fertility_table[1]['gern'] = 0.99
+        fertility_table[2]['räucherschinken'] = 0.999
+        fertility_table[1][None] = 0.99
+
+        probabilities = {
+            'p1': 0.167,
+            'translation_table': translation_table,
+            'distortion_table': distortion_table,
+            'fertility_table': fertility_table,
+            'alignment_table': None
+        }
+
+        model3 = IBMModel3(corpus, 0, probabilities)
+
+        # act
+        probability = model3.prob_t_a_given_s(alignment_info)
+
+        # assert
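+        # null_generation is C(m - phi_0, phi_0) * p1**phi_0 * (1 - p1)**(m - 2*phi_0)
+        # with m = 6 target words and phi_0 = 1 word generated from NULL;
+        # each fertility factor below is phi! * fertility_table[phi][source_word]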
+        null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
+        fertility = 1*0.99 * 1*0.99 * 1*0.99 * 1*0.99 * 2*0.999
+        lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
+        distortion = 0.97 * 0.97 * 0.97 * 0.97 * 0.97 * 0.97
+        expected_probability = (null_generation * fertility *
+                                lexical_translation * distortion)
+        self.assertEqual(round(probability, 4), round(expected_probability, 4))
diff --git a/nltk/test/unit/align/test_ibm4.py b/nltk/test/unit/translate/test_ibm4.py
similarity index 94%
rename from nltk/test/unit/align/test_ibm4.py
rename to nltk/test/unit/translate/test_ibm4.py
index 60f974d..70333b0 100644
--- a/nltk/test/unit/align/test_ibm4.py
+++ b/nltk/test/unit/translate/test_ibm4.py
@@ -6,10 +6,10 @@ Tests for IBM Model 4 training methods
 import unittest
 
 from collections import defaultdict
-from nltk.align import AlignedSent
-from nltk.align.ibm_model import AlignmentInfo
-from nltk.align.ibm_model import IBMModel
-from nltk.align.ibm4 import IBMModel4
+from nltk.translate import AlignedSent
+from nltk.translate import IBMModel
+from nltk.translate import IBMModel4
+from nltk.translate.ibm_model import AlignmentInfo
 
 
 class TestIBMModel4(unittest.TestCase):
@@ -24,7 +24,7 @@ class TestIBMModel4(unittest.TestCase):
         model4 = IBMModel4(corpus, 0, src_classes, trg_classes)
 
         # act
-        model4.set_uniform_distortion_probabilities(corpus)
+        model4.set_uniform_probabilities(corpus)
 
         # assert
         # number of displacement values =
@@ -48,7 +48,7 @@ class TestIBMModel4(unittest.TestCase):
         model4 = IBMModel4(corpus, 0, src_classes, trg_classes)
 
         # act
-        model4.set_uniform_distortion_probabilities(corpus)
+        model4.set_uniform_probabilities(corpus)
 
         # assert
         # examine displacement values that are not in the training data domain
diff --git a/nltk/test/unit/align/test_ibm5.py b/nltk/test/unit/translate/test_ibm5.py
similarity index 93%
rename from nltk/test/unit/align/test_ibm5.py
rename to nltk/test/unit/translate/test_ibm5.py
index be8e694..78b3b3f 100644
--- a/nltk/test/unit/align/test_ibm5.py
+++ b/nltk/test/unit/translate/test_ibm5.py
@@ -6,15 +6,15 @@ Tests for IBM Model 5 training methods
 import unittest
 
 from collections import defaultdict
-from nltk.align import AlignedSent
-from nltk.align.ibm_model import AlignmentInfo
-from nltk.align.ibm_model import IBMModel
-from nltk.align.ibm4 import IBMModel4
-from nltk.align.ibm5 import IBMModel5
+from nltk.translate import AlignedSent
+from nltk.translate import IBMModel
+from nltk.translate import IBMModel4
+from nltk.translate import IBMModel5
+from nltk.translate.ibm_model import AlignmentInfo
 
 
 class TestIBMModel5(unittest.TestCase):
-    def test_set_uniform_distortion_probabilities_of_max_displacements(self):
+    def test_set_uniform_vacancy_probabilities_of_max_displacements(self):
         # arrange
         src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
         trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
@@ -25,7 +25,7 @@ class TestIBMModel5(unittest.TestCase):
         model5 = IBMModel5(corpus, 0, src_classes, trg_classes)
 
         # act
-        model5.set_uniform_distortion_probabilities(corpus)
+        model5.set_uniform_probabilities(corpus)
 
         # assert
         # number of vacancy difference values =
@@ -38,7 +38,7 @@ class TestIBMModel5(unittest.TestCase):
         self.assertEqual(model5.non_head_vacancy_table[4][4][0], expected_prob)
         self.assertEqual(model5.non_head_vacancy_table[-3][1][2], expected_prob)
 
-    def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
+    def test_set_uniform_vacancy_probabilities_of_non_domain_values(self):
         # arrange
         src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
         trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
@@ -49,7 +49,7 @@ class TestIBMModel5(unittest.TestCase):
         model5 = IBMModel5(corpus, 0, src_classes, trg_classes)
 
         # act
-        model5.set_uniform_distortion_probabilities(corpus)
+        model5.set_uniform_probabilities(corpus)
 
         # assert
         # examine dv and max_v values that are not in the training data domain
diff --git a/nltk/test/unit/align/test_ibm_model.py b/nltk/test/unit/translate/test_ibm_model.py
similarity index 98%
rename from nltk/test/unit/align/test_ibm_model.py
rename to nltk/test/unit/translate/test_ibm_model.py
index e13b3ec..6ebabf6 100644
--- a/nltk/test/unit/align/test_ibm_model.py
+++ b/nltk/test/unit/translate/test_ibm_model.py
@@ -6,9 +6,9 @@ Tests for common methods of IBM translation models
 import unittest
 
 from collections import defaultdict
-from nltk.align import AlignedSent
-from nltk.align.ibm_model import AlignmentInfo
-from nltk.align.ibm_model import IBMModel
+from nltk.translate import AlignedSent
+from nltk.translate import IBMModel
+from nltk.translate.ibm_model import AlignmentInfo
 
 
 class TestIBMModel(unittest.TestCase):
diff --git a/nltk/test/unit/translate/test_stack_decoder.py b/nltk/test/unit/translate/test_stack_decoder.py
new file mode 100644
index 0000000..ea20e08
--- /dev/null
+++ b/nltk/test/unit/translate/test_stack_decoder.py
@@ -0,0 +1,300 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Stack decoder
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Tah Wei Hoon <hoon.tw at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Tests for stack decoder
+"""
+
+import unittest
+from collections import defaultdict
+from math import log
+from nltk.translate import PhraseTable
+from nltk.translate import StackDecoder
+from nltk.translate.stack_decoder import _Hypothesis, _Stack
+
+
+class TestStackDecoder(unittest.TestCase):
+    def test_find_all_src_phrases(self):
+        # arrange
+        phrase_table = TestStackDecoder.create_fake_phrase_table()
+        stack_decoder = StackDecoder(phrase_table, None)
+        sentence = ('my', 'hovercraft', 'is', 'full', 'of', 'eels')
+
+        # act
+        src_phrase_spans = stack_decoder.find_all_src_phrases(sentence)
+
+        # assert
+        self.assertEqual(src_phrase_spans[0], [2])  # 'my hovercraft'
+        self.assertEqual(src_phrase_spans[1], [2])  # 'hovercraft'
+        self.assertEqual(src_phrase_spans[2], [3])  # 'is'
+        self.assertEqual(src_phrase_spans[3], [5, 6])  # 'full of', 'full of eels'
+        self.assertFalse(src_phrase_spans[4])  # no entry starting with 'of'
+        self.assertEqual(src_phrase_spans[5], [6])  # 'eels'
+
+    def test_distortion_score(self):
+        # arrange
+        stack_decoder = StackDecoder(None, None)
+        stack_decoder.distortion_factor = 0.5
+        hypothesis = _Hypothesis()
+        hypothesis.src_phrase_span = (3, 5)
+
+        # act
+        score = stack_decoder.distortion_score(hypothesis, (8, 10))
+
+        # assert
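+        # the penalty grows with the jump from the end of the previously
+        # translated source span (position 5) to the start of the new one (8)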
+        expected_score = log(stack_decoder.distortion_factor) * (8 - 5)
+        self.assertEqual(score, expected_score)
+
+    def test_distortion_score_of_first_expansion(self):
+        # arrange
+        stack_decoder = StackDecoder(None, None)
+        stack_decoder.distortion_factor = 0.5
+        hypothesis = _Hypothesis()
+
+        # act
+        score = stack_decoder.distortion_score(hypothesis, (8, 10))
+
+        # assert
+        # expansion from empty hypothesis always has zero distortion cost
+        self.assertEqual(score, 0.0)
+
+    def test_compute_future_costs(self):
+        # arrange
+        phrase_table = TestStackDecoder.create_fake_phrase_table()
+        language_model = TestStackDecoder.create_fake_language_model()
+        stack_decoder = StackDecoder(phrase_table, language_model)
+        sentence = ('my', 'hovercraft', 'is', 'full', 'of', 'eels')
+
+        # act
+        future_scores = stack_decoder.compute_future_scores(sentence)
+
+        # assert
+        self.assertEqual(
+            future_scores[1][2],
+            (phrase_table.translations_for(('hovercraft',))[0].log_prob +
+             language_model.probability(('hovercraft',))))
+        self.assertEqual(
+            future_scores[0][2],
+            (phrase_table.translations_for(('my', 'hovercraft'))[0].log_prob +
+             language_model.probability(('my', 'hovercraft'))))
+
+    def test_compute_future_costs_for_phrases_not_in_phrase_table(self):
+        # arrange
+        phrase_table = TestStackDecoder.create_fake_phrase_table()
+        language_model = TestStackDecoder.create_fake_language_model()
+        stack_decoder = StackDecoder(phrase_table, language_model)
+        sentence = ('my', 'hovercraft', 'is', 'full', 'of', 'eels')
+
+        # act
+        future_scores = stack_decoder.compute_future_scores(sentence)
+
+        # assert
+        self.assertEqual(
+            future_scores[1][3],  # 'hovercraft is' is not in phrase table
+            future_scores[1][2] + future_scores[2][3])  # backoff
+
+    def test_future_score(self):
+        # arrange: sentence with 8 words; words 2, 3, 4 already translated
+        hypothesis = _Hypothesis()
+        hypothesis.untranslated_spans = lambda _: [(0, 2), (5, 8)]  # mock
+        future_score_table = defaultdict(lambda: defaultdict(float))
+        future_score_table[0][2] = 0.4
+        future_score_table[5][8] = 0.5
+        stack_decoder = StackDecoder(None, None)
+
+        # act
+        future_score = stack_decoder.future_score(
+            hypothesis, future_score_table, 8)
+
+        # assert
+        self.assertEqual(future_score, 0.4 + 0.5)
+
+    def test_valid_phrases(self):
+        # arrange
+        hypothesis = _Hypothesis()
+        # mock untranslated_spans method
+        hypothesis.untranslated_spans = lambda _: [
+            (0, 2),
+            (3, 6)
+        ]
+        all_phrases_from = [
+            [1, 4],
+            [2],
+            [],
+            [5],
+            [5, 6, 7],
+            [],
+            [7]
+        ]
+
+        # act
+        phrase_spans = StackDecoder.valid_phrases(all_phrases_from, hypothesis)
+
+        # assert
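+        # only phrase spans lying entirely inside an untranslated region
+        # survive; e.g. the candidate span (0, 4) is dropped because
+        # position 2 has already been translated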
+        self.assertEqual(phrase_spans, [(0, 1), (1, 2), (3, 5), (4, 5), (4, 6)])
+
+    @staticmethod
+    def create_fake_phrase_table():
+        phrase_table = PhraseTable()
+        phrase_table.add(('hovercraft',), ('',), 0.8)
+        phrase_table.add(('my', 'hovercraft'), ('', ''), 0.7)
+        phrase_table.add(('my', 'cheese'), ('', ''), 0.7)
+        phrase_table.add(('is',), ('',), 0.8)
+        phrase_table.add(('is',), ('',), 0.5)
+        phrase_table.add(('full', 'of'), ('', ''), 0.01)
+        phrase_table.add(('full', 'of', 'eels'), ('', '', ''), 0.5)
+        phrase_table.add(('full', 'of', 'spam'), ('', ''), 0.5)
+        phrase_table.add(('eels',), ('',), 0.5)
+        phrase_table.add(('spam',), ('',), 0.5)
+        return phrase_table
+
+    @staticmethod
+    def create_fake_language_model():
+        # nltk.model should be used here once it is implemented
+        language_prob = defaultdict(lambda: -999.0)
+        language_prob[('my',)] = log(0.1)
+        language_prob[('hovercraft',)] = log(0.1)
+        language_prob[('is',)] = log(0.1)
+        language_prob[('full',)] = log(0.1)
+        language_prob[('of',)] = log(0.1)
+        language_prob[('eels',)] = log(0.1)
+        language_prob[('my', 'hovercraft',)] = log(0.3)
+        language_model = type(
+            '', (object,),
+            {'probability': lambda _, phrase: language_prob[phrase]})()
+        return language_model
+
+
+class TestHypothesis(unittest.TestCase):
+    def setUp(self):
+        root = _Hypothesis()
+        child = _Hypothesis(
+            raw_score=0.5,
+            src_phrase_span=(3, 7),
+            trg_phrase=('hello', 'world'),
+            previous=root
+        )
+        grandchild = _Hypothesis(
+            raw_score=0.4,
+            src_phrase_span=(1, 2),
+            trg_phrase=('and', 'goodbye'),
+            previous=child
+        )
+        self.hypothesis_chain = grandchild
+
+    def test_translation_so_far(self):
+        # act
+        translation = self.hypothesis_chain.translation_so_far()
+
+        # assert
+        self.assertEqual(translation, ['hello', 'world', 'and', 'goodbye'])
+
+    def test_translation_so_far_for_empty_hypothesis(self):
+        # arrange
+        hypothesis = _Hypothesis()
+
+        # act
+        translation = hypothesis.translation_so_far()
+
+        # assert
+        self.assertEqual(translation, [])
+
+    def test_total_translated_words(self):
+        # act
+        total_translated_words = self.hypothesis_chain.total_translated_words()
+
+        # assert
+        self.assertEqual(total_translated_words, 5)
+
+    def test_translated_positions(self):
+        # act
+        translated_positions = self.hypothesis_chain.translated_positions()
+
+        # assert
+        translated_positions.sort()
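+        # span (3, 7) contributes positions 3-6 and span (1, 2) contributes 1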
+        self.assertEqual(translated_positions, [1, 3, 4, 5, 6])
+
+    def test_untranslated_spans(self):
+        # act
+        untranslated_spans = self.hypothesis_chain.untranslated_spans(10)
+
+        # assert
+        self.assertEqual(untranslated_spans, [(0, 1), (2, 3), (7, 10)])
+
+    def test_untranslated_spans_for_empty_hypothesis(self):
+        # arrange
+        hypothesis = _Hypothesis()
+
+        # act
+        untranslated_spans = hypothesis.untranslated_spans(10)
+
+        # assert
+        self.assertEqual(untranslated_spans, [(0, 10)])
+
+
+class TestStack(unittest.TestCase):
+    def test_push_bumps_off_worst_hypothesis_when_stack_is_full(self):
+        # arrange
+        stack = _Stack(3)
+        poor_hypothesis = _Hypothesis(0.01)
+
+        # act
+        stack.push(_Hypothesis(0.2))
+        stack.push(poor_hypothesis)
+        stack.push(_Hypothesis(0.1))
+        stack.push(_Hypothesis(0.3))
+
+        # assert
+        self.assertFalse(poor_hypothesis in stack)
+
+    def test_push_removes_hypotheses_that_fall_below_beam_threshold(self):
+        # arrange
+        stack = _Stack(3, 0.5)
+        poor_hypothesis = _Hypothesis(0.01)
+        worse_hypothesis = _Hypothesis(0.009)
+
+        # act
+        stack.push(poor_hypothesis)
+        stack.push(worse_hypothesis)
+        stack.push(_Hypothesis(0.9))  # greatly superior hypothesis
+
+        # assert
+        self.assertFalse(poor_hypothesis in stack)
+        self.assertFalse(worse_hypothesis in stack)
+
+    def test_push_does_not_add_hypothesis_that_falls_below_beam_threshold(self):
+        # arrange
+        stack = _Stack(3, 0.5)
+        poor_hypothesis = _Hypothesis(0.01)
+
+        # act
+        stack.push(_Hypothesis(0.9))  # greatly superior hypothesis
+        stack.push(poor_hypothesis)
+
+        # assert
+        self.assertFalse(poor_hypothesis in stack)
+
+    def test_best_returns_the_best_hypothesis(self):
+        # arrange
+        stack = _Stack(3)
+        best_hypothesis = _Hypothesis(0.99)
+
+        # act
+        stack.push(_Hypothesis(0.0))
+        stack.push(best_hypothesis)
+        stack.push(_Hypothesis(0.5))
+
+        # assert
+        self.assertEqual(stack.best(), best_hypothesis)
+
+    def test_best_returns_none_when_stack_is_empty(self):
+        # arrange
+        stack = _Stack(3)
+
+        # assert
+        self.assertEqual(stack.best(), None)
diff --git a/nltk/test/wordnet.doctest b/nltk/test/wordnet.doctest
index ea442de..7c72631 100644
--- a/nltk/test/wordnet.doctest
+++ b/nltk/test/wordnet.doctest
@@ -6,7 +6,7 @@ WordNet Interface
 =================
 
 WordNet is just another NLTK corpus reader, and can be imported like this:
-
+    >>> from __future__ import print_function, unicode_literals
     >>> from nltk.corpus import wordnet
 
 For more compact code, we recommend:
@@ -47,35 +47,37 @@ A synset is identified with a 3-part name of the form: word.pos.nn:
 The WordNet corpus reader gives access to the Open Multilingual
 WordNet, using ISO-639 language codes.
 
-    >>> sorted(wn.langs())
-    ['als', 'arb', 'cat', 'cmn', 'dan', 'eng', 'eus', 'fas',
-    'fin', 'fra', 'fre', 'glg', 'heb', 'ind', 'ita', 'jpn', 'nno',
-    'nob', 'pol', 'por', 'spa', 'tha', 'zsm']
+    >>> sorted(wn.langs()) # doctest: +NORMALIZE_WHITESPACE
+    ['als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eng', 'eus', 'fas', 'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 
+    'ita', 'jpn', 'nno', 'nob', 'pol', 'por', 'qcn', 'slv', 'spa', 'swe', 'tha', 'zsm']
     >>> wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn')
     [Synset('dog.n.01'), Synset('spy.n.01')]
-    >>> wn.synset('spy.n.01').lemma_names('jpn')
+    
+    wn.synset('spy.n.01').lemma_names('jpn') # doctest: +NORMALIZE_WHITESPACE
     ['\u3044\u306c', '\u307e\u308f\u3057\u8005', '\u30b9\u30d1\u30a4', '\u56de\u3057\u8005',
     '\u56de\u8005', '\u5bc6\u5075', '\u5de5\u4f5c\u54e1', '\u5efb\u3057\u8005',
     '\u5efb\u8005', '\u63a2', '\u63a2\u308a', '\u72ac', '\u79d8\u5bc6\u635c\u67fb\u54e1',
     '\u8adc\u5831\u54e1', '\u8adc\u8005', '\u9593\u8005', '\u9593\u8adc', '\u96a0\u5bc6']
+    
     >>> wn.synset('dog.n.01').lemma_names('ita')
     ['cane', 'Canis_familiaris']
-    >>> wn.lemmas('cane', lang='ita')
-    [Lemma('dog.n.01.cane'), Lemma('hammer.n.01.cane'), Lemma('cramp.n.02.cane'),
-    Lemma('bad_person.n.01.cane'), Lemma('incompetent.n.01.cane')]
-    >>> sorted(wn.synset('dog.n.01').lemmas('dan'))
+    >>> wn.lemmas('cane', lang='ita') # doctest: +NORMALIZE_WHITESPACE
+    [Lemma('dog.n.01.cane'), Lemma('cramp.n.02.cane'), Lemma('hammer.n.01.cane'), Lemma('bad_person.n.01.cane'), 
+    Lemma('incompetent.n.01.cane')]
+    >>> sorted(wn.synset('dog.n.01').lemmas('dan')) # doctest: +NORMALIZE_WHITESPACE
     [Lemma('dog.n.01.hund'), Lemma('dog.n.01.k\xf8ter'),
     Lemma('dog.n.01.vovhund'), Lemma('dog.n.01.vovse')]
-    >>> sorted(wn.synset('dog.n.01').lemmas('por'))
-    [Lemma('dog.n.01.cachorro'), Lemma('dog.n.01.c\xe3es'),
-    Lemma('dog.n.01.c\xe3o'), Lemma('dog.n.01.c\xe3o')]
+    
+    sorted(wn.synset('dog.n.01').lemmas('por'))
+    [Lemma('dog.n.01.cachorra'), Lemma('dog.n.01.cachorro'), Lemma('dog.n.01.cadela'), Lemma('dog.n.01.c\xe3o')]
+    
     >>> dog_lemma = wn.lemma(b'dog.n.01.c\xc3\xa3o'.decode('utf-8'), lang='por')
     >>> dog_lemma
     Lemma('dog.n.01.c\xe3o')
     >>> dog_lemma.lang()
     'por'
     >>> len(wordnet.all_lemma_names(pos='n', lang='jpn'))
-    66027
+    64797
 
 -------
 Synsets
@@ -413,17 +415,16 @@ Compute transitive closures of synsets
     True
     >>> list(dog.closure(hyper, depth=1)) == dog.hypernyms()
     True
-    >>> list(dog.closure(hypo))
+    >>> list(dog.closure(hypo)) # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
     [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'),
      Synset('dalmatian.n.02'), Synset('great_pyrenees.n.01'),
      Synset('griffon.n.02'), Synset('hunting_dog.n.01'), Synset('lapdog.n.01'),
      Synset('leonberg.n.01'), Synset('mexican_hairless.n.01'),
      Synset('newfoundland.n.01'), Synset('pooch.n.01'), Synset('poodle.n.01'), ...]
-    >>> list(dog.closure(hyper))
-    [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'),
-    Synset('animal.n.01'), Synset('placental.n.01'), Synset('organism.n.01'),
-    Synset('mammal.n.01'), Synset('living_thing.n.01'), Synset('vertebrate.n.01'),
-    Synset('whole.n.02'), Synset('chordate.n.01'), Synset('object.n.01'),
+    >>> list(dog.closure(hyper)) # doctest: +NORMALIZE_WHITESPACE
+    [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'), Synset('animal.n.01'), 
+    Synset('placental.n.01'), Synset('organism.n.01'), Synset('mammal.n.01'), Synset('living_thing.n.01'),
+    Synset('vertebrate.n.01'), Synset('whole.n.02'), Synset('chordate.n.01'), Synset('object.n.01'),
     Synset('physical_entity.n.01'), Synset('entity.n.01')]
 
 
diff --git a/nltk/text.py b/nltk/text.py
index 88dea81..90ac7be 100644
--- a/nltk/text.py
+++ b/nltk/text.py
@@ -544,7 +544,7 @@ class TextCollection(Text):
     Iterating over a TextCollection produces all the tokens of all the
     texts in order.
     """
-    def __init__(self, source, name=None):
+    def __init__(self, source):
         if hasattr(source, 'words'): # bridge to the text corpus reader
             source = [source.words(f) for f in source.fileids()]
 
@@ -552,11 +552,11 @@ class TextCollection(Text):
         Text.__init__(self, LazyConcatenation(source))
         self._idf_cache = {}
 
-    def tf(self, term, text, method=None):
+    def tf(self, term, text):
         """ The frequency of the term in text. """
         return text.count(term) / len(text)
 
-    def idf(self, term, method=None):
+    def idf(self, term):
         """ The number of texts in the corpus divided by the
         number of texts that the term appears in.
         If a term does not appear in the corpus, 0.0 is returned. """
diff --git a/nltk/tokenize/__init__.py b/nltk/tokenize/__init__.py
index 934c3fc..4e00346 100644
--- a/nltk/tokenize/__init__.py
+++ b/nltk/tokenize/__init__.py
@@ -71,7 +71,8 @@ from nltk.tokenize.sexpr    import SExprTokenizer, sexpr_tokenize
 from nltk.tokenize.treebank import TreebankWordTokenizer
 from nltk.tokenize.stanford import StanfordTokenizer
 from nltk.tokenize.texttiling import TextTilingTokenizer
-from nltk.tokenize.casual import (TweetTokenizer, casual_tokenize)
+from nltk.tokenize.casual   import (TweetTokenizer, casual_tokenize)
+from nltk.tokenize.mwe      import MWETokenizer
 
 # Standard sentence tokenizer.
 def sent_tokenize(text, language='english'):
diff --git a/nltk/tokenize/mwe.py b/nltk/tokenize/mwe.py
new file mode 100644
index 0000000..c13aabd
--- /dev/null
+++ b/nltk/tokenize/mwe.py
@@ -0,0 +1,112 @@
+# Multi-Word Expression tokenizer
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Rob Malouf <rmalouf at mail.sdsu.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Multi-Word Expression Tokenizer
+
+A ``MWETokenizer`` takes tokenized text (a list of strings) and retokenizes it,
+merging multi-word expressions into single tokens, using a lexicon of MWEs:
+
+
+    >>> from nltk.tokenize import MWETokenizer
+
+    >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
+    >>> tokenizer.add_mwe(('in', 'spite', 'of'))
+
+    >>> tokenizer.tokenize('Testing testing testing one two three'.split())
+    ['Testing', 'testing', 'testing', 'one', 'two', 'three']
+
+    >>> tokenizer.tokenize('This is a test in spite'.split())
+    ['This', 'is', 'a', 'test', 'in', 'spite']
+
+    >>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())
+    ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']
+
+"""
+
+from nltk.tokenize.api import TokenizerI
+
+
+class MWETokenizer(TokenizerI):
+    """
+    A tokenizer that processes tokenized text and merges multi-word expressions
+    into single tokens:
+
+        >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
+        >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
+        ['An', "hors+d'oeuvre", 'tonight,', 'sir?']
+
+    :type mwes: list(list(str))
+    :param mwes: A sequence of multi-word expressions to be merged, where
+        each MWE is a sequence of strings.
+    :type separator: str
+    :param separator: String that should be inserted between words in a multi-word
+        expression token.
+
+    """
+
+    def __init__(self, mwes=None, separator='_'):
+
+        if not mwes:
+            mwes = []
+        self._mwes = dict()
+        self._separator = separator
+        for mwe in mwes:
+            self.add_mwe(mwe)
+
+    def add_mwe(self, mwe, _trie=None):
+        """
+        Add a multi-word expression to the lexicon (stored as a word trie)
+
+        We represent the trie as a dict of dicts:
+
+            >>> tokenizer = MWETokenizer([('a', 'b'), ('a', 'b', 'c'), ('a', 'x')])
+            >>> tokenizer._mwes
+            {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
+
+        The key True marks the end of a valid MWE
+
+        """
+
+        if _trie is None:
+            _trie = self._mwes
+        if mwe:
+            if mwe[0] not in _trie:
+                _trie[mwe[0]] = dict()
+            self.add_mwe(mwe[1:], _trie=_trie[mwe[0]])
+        else:
+            _trie[True] = None
+
+    def tokenize(self, text):
+
+        i = 0
+        n = len(text)
+        result = []
+
+        while i < n:
+            if text[i] in self._mwes:
+                # possible MWE match
+                j = i
+                trie = self._mwes
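+                # walk the trie as far as the input allows; since this loop
+                # never breaks, the ``else`` clause below always runs when
+                # the walk stops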
+                while j < n and text[j] in trie:
+                    trie = trie[text[j]]
+                    j = j + 1
+                else:
+                    if True in trie:
+                        # success!
+                        result.append(self._separator.join(text[i:j]))
+                        i = j
+                    else:
+                        # no match, so backtrack
+                        result.append(text[i])
+                        i += 1
+            else:
+                result.append(text[i])
+                i += 1
+
+        return result
diff --git a/nltk/tokenize/regexp.py b/nltk/tokenize/regexp.py
index bfa4976..5385e1f 100644
--- a/nltk/tokenize/regexp.py
+++ b/nltk/tokenize/regexp.py
@@ -68,9 +68,7 @@ argument.  This differs from the conventions used by Python's
 from __future__ import unicode_literals
 
 import re
-import sre_constants
 
-from nltk.internals import compile_regexp_to_noncapturing
 from nltk.tokenize.api import TokenizerI
 from nltk.tokenize.util import regexp_span_tokenize
 from nltk.compat import python_2_unicode_compatible
@@ -114,13 +112,7 @@ class RegexpTokenizer(TokenizerI):
         
     def _check_regexp(self):
         if self._regexp is None:
-            try:
-                # Remove capturing parentheses -- if the regexp contains any
-                # capturing parentheses, then the behavior of re.findall and
-                # re.split will change.                 
-                self._regexp = compile_regexp_to_noncapturing(self._pattern, self._flags)
-            except re.error as e:
-                raise ValueError('Error in regular expression %r: %s' % (self._pattern, e))
+            self._regexp = re.compile(self._pattern, self._flags)
         
     def tokenize(self, text):
         self._check_regexp()
diff --git a/nltk/translate/__init__.py b/nltk/translate/__init__.py
new file mode 100644
index 0000000..4f83e17
--- /dev/null
+++ b/nltk/translate/__init__.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Machine Translation
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Steven Bird <stevenbird1 at gmail.com>, Tah Wei Hoon <hoon.tw at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Experimental features for machine translation.
+These interfaces are prone to change.
+"""
+
+from nltk.translate.api import AlignedSent, Alignment, PhraseTable
+from nltk.translate.ibm_model import IBMModel
+from nltk.translate.ibm1 import IBMModel1
+from nltk.translate.ibm2 import IBMModel2
+from nltk.translate.ibm3 import IBMModel3
+from nltk.translate.ibm4 import IBMModel4
+from nltk.translate.ibm5 import IBMModel5
+from nltk.translate.bleu_score import bleu
+from nltk.translate.metrics import alignment_error_rate
+from nltk.translate.stack_decoder import StackDecoder
diff --git a/nltk/align/api.py b/nltk/translate/api.py
similarity index 62%
rename from nltk/align/api.py
rename to nltk/translate/api.py
index 9d9f443..2302331 100644
--- a/nltk/align/api.py
+++ b/nltk/translate/api.py
@@ -1,18 +1,18 @@
-# Natural Language Toolkit: Aligned Sentences
+# Natural Language Toolkit: API for alignment and translation objects 
 #
 # Copyright (C) 2001-2015 NLTK Project
 # Author: Will Zhang <wilzzha at gmail.com>
 #         Guan Gui <ggui at student.unimelb.edu.au>
 #         Steven Bird <stevenbird1 at gmail.com>
+#         Tah Wei Hoon <hoon.tw at gmail.com>
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
 from __future__ import print_function, unicode_literals
-
-from nltk.compat import python_2_unicode_compatible, string_types
-from nltk.metrics import precision, recall
 import subprocess
+from collections import namedtuple
 
+from nltk.compat import python_2_unicode_compatible, string_types
 
 @python_2_unicode_compatible
 class AlignedSent(object):
@@ -20,17 +20,15 @@ class AlignedSent(object):
     Return an aligned sentence object, which encapsulates two sentences
     along with an ``Alignment`` between them.
 
-        >>> from nltk.align import AlignedSent
+        >>> from nltk.translate import AlignedSent, Alignment
         >>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'],
-        ...     ['the', 'house', 'is', 'small'], '0-2 1-3 2-1 3-0')
+        ...     ['the', 'house', 'is', 'small'], Alignment.fromstring('0-2 1-3 2-1 3-0'))
         >>> algnsent.words
         ['klein', 'ist', 'das', 'Haus']
         >>> algnsent.mots
         ['the', 'house', 'is', 'small']
         >>> algnsent.alignment
         Alignment([(0, 2), (1, 3), (2, 1), (3, 0)])
-        >>> algnsent.precision('0-2 1-3 2-1 3-3')
-        0.75
         >>> from nltk.corpus import comtrans
         >>> print(comtrans.aligned_sents()[54])
         <AlignedSent: 'Weshalb also sollten...' -> 'So why should EU arm...'>
@@ -46,10 +44,14 @@ class AlignedSent(object):
     :type alignment: Alignment
     """
 
-    def __init__(self, words=[], mots=[], alignment='', encoding='utf8'):
+    def __init__(self, words, mots, alignment=None):
         self._words = words
         self._mots = mots
-        self.alignment = alignment
+        if alignment is None:
+            self.alignment = Alignment([])
+        else:
+            assert type(alignment) is Alignment
+            self.alignment = alignment
 
     @property
     def words(self):
@@ -63,26 +65,10 @@ class AlignedSent(object):
         return self._alignment
         
     def _set_alignment(self, alignment):
-        if not isinstance(alignment, Alignment):
-            alignment = Alignment(alignment)
-        self._check_align(alignment)
+        _check_alignment(len(self.words), len(self.mots), alignment)
         self._alignment = alignment
     alignment = property(_get_alignment, _set_alignment)
 
-    def _check_align(self, a):
-        """
-        Check whether the alignments are legal.
-
-        :param a: alignment to be checked
-        :raise IndexError: if alignment is out of sentence boundary
-        :rtype: boolean
-        """
-        if not all(0 <= p[0] < len(self._words) for p in a):
-            raise IndexError("Alignment is outside boundary of words")
-        if not all(p[1] is None or 0 <= p[1] < len(self._mots) for p in a):
-            raise IndexError("Alignment is outside boundary of mots")
-        return True
-
     def __repr__(self):
         """
         Return a string representation for this ``AlignedSent``.
@@ -163,96 +149,6 @@ class AlignedSent(object):
         return AlignedSent(self._mots, self._words,
                                self._alignment.invert())
 
-    def precision(self, reference):
-        """
-        Return the precision of an aligned sentence with respect to a
-        "gold standard" reference ``AlignedSent``.
-
-        :type reference: AlignedSent or Alignment
-        :param reference: A "gold standard" reference aligned sentence.
-        :rtype: float or None
-        """
-        # Get alignments in set of 2-tuples form
-        # The "possible" precision is used since it doesn't penalize for finding
-        # an alignment that was marked as "possible" (NAACL corpus)
-
-        align = self.alignment
-        if isinstance(reference, AlignedSent):
-            possible = reference.alignment
-        else:
-            possible = Alignment(reference)
-
-        return precision(possible, align)
-
-
-    def recall(self, reference):
-        """
-        Return the recall of an aligned sentence with respect to a
-        "gold standard" reference ``AlignedSent``.
-
-        :type reference: AlignedSent or Alignment
-        :param reference: A "gold standard" reference aligned sentence.
-        :rtype: float or None
-        """
-        # Get alignments in set of 2-tuples form
-        # The "sure" recall is used so we don't penalize for missing an
-        # alignment that was only marked as "possible".
-
-        align = self.alignment
-        if isinstance(reference, AlignedSent):
-            sure = reference.alignment
-        else:
-            sure  = Alignment(reference)
-
-        # Call NLTKs existing functions for recall
-        return recall(sure, align)
-
-
-    def alignment_error_rate(self, reference, possible=None):
-        """
-        Return the Alignment Error Rate (AER) of an aligned sentence
-        with respect to a "gold standard" reference ``AlignedSent``.
-
-        Return an error rate between 0.0 (perfect alignment) and 1.0 (no
-        alignment).
-
-            >>> from nltk.align import AlignedSent
-            >>> s = AlignedSent(["the", "cat"], ["le", "chat"], [(0, 0), (1, 1)])
-            >>> s.alignment_error_rate(s)
-            0.0
-
-        :type reference: AlignedSent or Alignment
-        :param reference: A "gold standard" reference aligned sentence.
-        :type possible: AlignedSent or Alignment or None
-        :param possible: A "gold standard" reference of possible alignments
-            (defaults to *reference* if None)
-        :rtype: float or None
-        """
-        # Get alignments in set of 2-tuples form
-        align = self.alignment
-        if isinstance(reference, AlignedSent):
-            sure = reference.alignment
-        else:
-            sure = Alignment(reference)
-
-        if possible is not None:
-            # Set possible alignment
-            if isinstance(possible, AlignedSent):
-                possible = possible.alignment
-            else:
-                possible = Alignment(possible)
-        else:
-            # Possible alignment is just sure alignment
-            possible = sure
-
-        # Sanity check
-        assert(sure.issubset(possible))
-
-        # Return the Alignment Error Rate
-        return (1.0 - float(len(align & sure) + len(align & possible)) /
-                float(len(align) + len(sure)))
-
-
 @python_2_unicode_compatible
 class Alignment(frozenset):
     """
@@ -262,7 +158,7 @@ class Alignment(frozenset):
     j-th element of s2.  Tuples are extensible (they might contain
     additional data, such as a boolean to indicate sure vs possible alignments).
 
-        >>> from nltk.align import Alignment
+        >>> from nltk.translate import Alignment
         >>> a = Alignment([(0, 0), (0, 1), (1, 2), (2, 2)])
         >>> a.invert()
         Alignment([(0, 0), (1, 0), (2, 1), (2, 2)])
@@ -275,19 +171,33 @@ class Alignment(frozenset):
         >>> b = Alignment([(0, 0), (0, 1)])
         >>> b.issubset(a)
         True
-        >>> c = Alignment('0-0 0-1')
+        >>> c = Alignment.fromstring('0-0 0-1')
         >>> b == c
         True
     """
 
-    def __new__(cls, string_or_pairs):
-        if isinstance(string_or_pairs, string_types):
-            string_or_pairs = [_giza2pair(p) for p in string_or_pairs.split()]
-        self = frozenset.__new__(cls, string_or_pairs)
+    def __new__(cls, pairs):
+        self = frozenset.__new__(cls, pairs)
         self._len = (max(p[0] for p in self) if self != frozenset([]) else 0)
         self._index = None
         return self
 
+    @classmethod
+    def fromstring(cls, s):
+        """
+        Read a giza-formatted string and return an Alignment object.
+
+            >>> Alignment.fromstring('0-0 2-1 9-2 21-3 10-4 7-5')
+            Alignment([(0, 0), (2, 1), (7, 5), (9, 2), (10, 4), (21, 3)])
+
+        :type s: str
+        :param s: the positional alignments in giza format
+        :rtype: Alignment
+        :return: An Alignment object corresponding to the string representation ``s``.
+        """
+
+        return Alignment([_giza2pair(a) for a in s.split()])
+
     def __getitem__(self, key):
         """
         Look up the alignments that map from a given index or slice.
@@ -346,3 +256,66 @@ def _naacl2pair(pair_string):
     i, j, p = pair_string.split("-")
     return int(i), int(j)
 
+def _check_alignment(num_words, num_mots, alignment):
+    """
+    Check whether the alignments are legal.
+
+    :param num_words: the number of source language words
+    :type num_words: int
+    :param num_mots: the number of target language words
+    :type num_mots: int
+    :param alignment: alignment to be checked
+    :type alignment: Alignment
+    :raise IndexError: if alignment falls outside the sentence
+    """
+
+    assert type(alignment) is Alignment
+
+    if not all(0 <= pair[0] < num_words for pair in alignment):
+        raise IndexError("Alignment is outside boundary of words")
+    if not all(pair[1] is None or 0 <= pair[1] < num_mots for pair in alignment):
+        raise IndexError("Alignment is outside boundary of mots")
+
+
+PhraseTableEntry = namedtuple('PhraseTableEntry', ['trg_phrase', 'log_prob'])
+class PhraseTable(object):
+    """
+    In-memory store of translations for a given phrase, and the log
+    probability of those translations.
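+
+    Hypothetical usage sketch (the phrases below are invented for
+    illustration):
+
+        >>> phrase_table = PhraseTable()
+        >>> phrase_table.add(('niemand',), ('nobody',), log_prob=-1.1)
+        >>> phrase_table.add(('niemand',), ('no', 'one'), log_prob=-2.3)
+        >>> ('niemand',) in phrase_table
+        True
+        >>> phrase_table.translations_for(('niemand',))[0].log_prob
+        -1.1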
+    """
+    def __init__(self):
+        self.src_phrases = dict()
+
+    def translations_for(self, src_phrase):
+        """
+        Get the translations for a source language phrase
+
+        :param src_phrase: Source language phrase of interest
+        :type src_phrase: tuple(str)
+
+        :return: A list of target language phrases that are translations
+            of ``src_phrase``, ordered in decreasing order of
+            likelihood. Each list element is a tuple of the target
+            phrase and its log probability.
+        :rtype: list(PhraseTableEntry)
+        """
+        return self.src_phrases[src_phrase]
+
+    def add(self, src_phrase, trg_phrase, log_prob):
+        """
+        :type src_phrase: tuple(str)
+        :type trg_phrase: tuple(str)
+
+        :param log_prob: Log probability that given ``src_phrase``,
+            ``trg_phrase`` is its translation
+        :type log_prob: float
+        """
+        entry = PhraseTableEntry(trg_phrase=trg_phrase, log_prob=log_prob)
+        if src_phrase not in self.src_phrases:
+            self.src_phrases[src_phrase] = []
+        self.src_phrases[src_phrase].append(entry)
+        self.src_phrases[src_phrase].sort(key=lambda e: e.log_prob,
+                                          reverse=True)
+
+    def __contains__(self, src_phrase):
+        return src_phrase in self.src_phrases
diff --git a/nltk/translate/bleu_score.py b/nltk/translate/bleu_score.py
new file mode 100644
index 0000000..d028130
--- /dev/null
+++ b/nltk/translate/bleu_score.py
@@ -0,0 +1,244 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: BLEU Score
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
+# Contributors: Dmitrijs Milajevs
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+"""BLEU score implementation."""
+
+from __future__ import division
+
+import math
+
+from nltk.tokenize import word_tokenize
+from nltk.compat import Counter
+from nltk.util import ngrams
+
+
+def bleu(references, hypothesis, weights):
+    """
+    Calculate BLEU score (Bilingual Evaluation Understudy) from
+    Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
+    "BLEU: a method for automatic evaluation of machine translation." 
+    In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf
+
+
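+    The score computed below follows the paper's formulation,
+    BLEU = BP * exp(sum_n w_n * log p_n), where p_n is the modified n-gram
+    precision from ``_modified_precision``, w_n is its weight, and BP is
+    the brevity penalty from ``_brevity_penalty``.
+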
+    >>> weights = [0.25, 0.25, 0.25, 0.25]
+    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+    ...               'ensures', 'that', 'the', 'military', 'always',
+    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']
+
+    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
+    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
+    ...               'that', 'party', 'direct']
+
+    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
+    ...               'heed', 'Party', 'commands']
+
+    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+    ...               'guarantees', 'the', 'military', 'forces', 'always',
+    ...               'being', 'under', 'the', 'command', 'of', 'the',
+    ...               'Party']
+
+    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
+    ...               'of', 'the', 'party']
+
+    >>> bleu([reference1, reference2, reference3], hypothesis1, weights)
+    0.5045666840058485
+
+    >>> bleu([reference1, reference2, reference3], hypothesis2, weights)
+    0
+
+    :param references: reference sentences
+    :type references: list(list(str))
+    :param hypothesis: a hypothesis sentence
+    :type hypothesis: list(str)
+    :param weights: weights for unigrams, bigrams, trigrams and so on
+    :type weights: list(float)
+    """
+    p_ns = (
+        _modified_precision(references, hypothesis, i)
+        for i, _ in enumerate(weights, start=1)
+    )
+
+    try:
+        s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns))
+    except ValueError:
+        # at least one p_n is 0, so math.log raised ValueError
+        return 0
+
+    bp = _brevity_penalty(references, hypothesis)
+    return bp * math.exp(s)
+
+
+def _modified_precision(references, hypothesis, n):
+    """
+    Calculate modified ngram precision.
+
+    The normal precision measure can assign high scores to poor
+    translations: for example, a hypothesis that simply repeats a single
+    reference word many times receives very high precision.
+
+    The famous "the the the ..." example shows that unmodified precision
+    can be inflated just by duplicating high-frequency words.
+    
+        >>> reference1 = 'the cat is on the mat'.split()
+        >>> reference2 = 'there is a cat on the mat'.split()
+        >>> hypothesis1 = 'the the the the the the the'.split()
+        >>> references = [reference1, reference2]
+        >>> _modified_precision(references, hypothesis1, n=1)
+        0.2857142857142857
+    
+    In the modified n-gram precision, a reference word will be considered 
+    exhausted after a matching hypothesis word is identified, e.g.
+    
+        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+        ...               'ensures', 'that', 'the', 'military', 'will', 
+        ...               'forever', 'heed', 'Party', 'commands']
+        >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+        ...               'guarantees', 'the', 'military', 'forces', 'always',
+        ...               'being', 'under', 'the', 'command', 'of', 'the',
+        ...               'Party']
+        >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+        ...               'army', 'always', 'to', 'heed', 'the', 'directions',
+        ...               'of', 'the', 'party']
+        >>> hypothesis = 'of the'.split()
+        >>> references = [reference1, reference2, reference3]
+        >>> _modified_precision(references, hypothesis, n=1)
+        1.0
+        >>> _modified_precision(references, hypothesis, n=2)
+        1.0
+        
+    An example of a normal machine translation hypothesis:
+    
+        >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+        ...               'ensures', 'that', 'the', 'military', 'always',
+        ...               'obeys', 'the', 'commands', 'of', 'the', 'party']
+        
+        >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
+        ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
+        ...               'that', 'party', 'direct']
+    
+        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+        ...               'ensures', 'that', 'the', 'military', 'will', 
+        ...               'forever', 'heed', 'Party', 'commands']
+        
+        >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+        ...               'guarantees', 'the', 'military', 'forces', 'always',
+        ...               'being', 'under', 'the', 'command', 'of', 'the',
+        ...               'Party']
+        
+        >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+        ...               'army', 'always', 'to', 'heed', 'the', 'directions',
+        ...               'of', 'the', 'party']
+        >>> references = [reference1, reference2, reference3]
+        >>> _modified_precision(references, hypothesis1, n=1)
+        0.9444444444444444
+        >>> _modified_precision(references, hypothesis2, n=1)
+        0.5714285714285714
+        >>> _modified_precision(references, hypothesis1, n=2)
+        0.5882352941176471
+        >>> _modified_precision(references, hypothesis2, n=2)
+        0.07692307692307693
+
+    :param references: A list of reference translations.
+    :type references: list(list(str))
+    :param hypothesis: A hypothesis translation.
+    :type hypothesis: list(str)
+    :param n: The ngram order.
+    :type n: int
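+
+    Formally, the value returned below is the clipped-count ratio
+    sum_g min(count_hyp(g), max_r count_ref_r(g)) / sum_g count_hyp(g),
+    where g ranges over the n-grams of the hypothesis and r over the
+    references.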
+    """
+    counts = Counter(ngrams(hypothesis, n))
+
+    if not counts:
+        return 0
+
+    max_counts = {}
+    for reference in references:
+        reference_counts = Counter(ngrams(reference, n))
+        for ngram in counts:
+            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])
+
+    clipped_counts = dict((ngram, min(count, max_counts[ngram])) for ngram, count in counts.items())
+
+    return sum(clipped_counts.values()) / sum(counts.values())
+
+
+def _brevity_penalty(references, hypothesis):
+    """
+    Calculate brevity penalty.
+
+    Since modified n-gram precision alone still rewards overly short
+    hypotheses, a brevity penalty is used to scale the overall BLEU
+    score according to length.
+
+    An example from the paper: there are three references of lengths 12, 15
+    and 17, and a hypothesis of length 12. The brevity penalty is 1.
+
+        >>> reference1 = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
+        >>> reference2 = list('aaaaaaaaaaaaaaa')   # i.e. ['a'] * 15
+        >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17
+        >>> hypothesis = list('aaaaaaaaaaaa')      # i.e. ['a'] * 12
+        >>> references = [reference1, reference2, reference3]
+        >>> _brevity_penalty(references, hypothesis)
+        1.0
+
+    If the hypothesis translation is shorter than the references, a penalty
+    is applied.
+
+        >>> references = [['a'] * 28, ['a'] * 28]
+        >>> hypothesis = ['a'] * 12
+        >>> _brevity_penalty(references, hypothesis)
+        0.2635971381157267
+
+    The length of the closest reference is used to compute the penalty. If the
+    length of a hypothesis is 12, and the reference lengths are 13 and 2, the
+    penalty is applied because the hypothesis length (12) is less than the
+    closest reference length (13).
+
+        >>> references = [['a'] * 13, ['a'] * 2]
+        >>> hypothesis = ['a'] * 12
+        >>> _brevity_penalty(references, hypothesis)
+        0.9200444146293233
+
+    The brevity penalty doesn't depend on reference order. More importantly,
+    when two reference sentences are at the same distance, the shortest
+    reference sentence length is used.
+
+        >>> references = [['a'] * 13, ['a'] * 11]
+        >>> hypothesis = ['a'] * 12
+        >>> bp1 = _brevity_penalty(references, hypothesis)  
+        >>> bp2 = _brevity_penalty(reversed(references),hypothesis) 
+        >>> bp1 == bp2 == 1
+        True
+
+    A test example from mteval-v13a.pl (starting from the line 705):
+
+        >>> references = [['a'] * 11, ['a'] * 8]
+        >>> hypothesis = ['a'] * 7
+        >>> _brevity_penalty(references, hypothesis)
+        0.8668778997501817
+
+        >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
+        >>> hypothesis = ['a'] * 7
+        >>> _brevity_penalty(references, hypothesis)
+        1.0
+    
+    :param references: A list of reference translations.
+    :type references: list(list(str))
+    :param hypothesis: A hypothesis translation.
+    :type hypothesis: list(str)
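+
+    Formally, with hypothesis length c and closest reference length r, the
+    penalty computed below is 1 if c > r and exp(1 - r/c) otherwise.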
+    """
+    c = len(hypothesis)
+    ref_lens = (len(reference) for reference in references)
+    r = min(ref_lens, key=lambda ref_len: (abs(ref_len - c), ref_len))
+
+    if c > r:
+        return 1
+    else:
+        return math.exp(1 - r / c)
+
diff --git a/nltk/align/gale_church.py b/nltk/translate/gale_church.py
similarity index 100%
rename from nltk/align/gale_church.py
rename to nltk/translate/gale_church.py
diff --git a/nltk/align/gdfa.py b/nltk/translate/gdfa.py
similarity index 100%
rename from nltk/align/gdfa.py
rename to nltk/translate/gdfa.py
diff --git a/nltk/align/ibm1.py b/nltk/translate/ibm1.py
similarity index 61%
rename from nltk/align/ibm1.py
rename to nltk/translate/ibm1.py
index 43e14f8..c516cf1 100644
--- a/nltk/align/ibm1.py
+++ b/nltk/translate/ibm1.py
@@ -56,9 +56,10 @@ Translation: Parameter Estimation. Computational Linguistics, 19 (2),
 
 from __future__ import division
 from collections import defaultdict
-from nltk.align import AlignedSent
-from nltk.align import Alignment
-from nltk.align import IBMModel
+from nltk.translate import AlignedSent
+from nltk.translate import Alignment
+from nltk.translate import IBMModel
+from nltk.translate.ibm_model import Counts
 import warnings
 
 
@@ -76,14 +77,14 @@ class IBMModel1(IBMModel):
 
     >>> ibm1 = IBMModel1(bitext, 5)
 
-    >>> print('{0:.3f}'.format(ibm1.translation_table['buch']['book']))
-    0.889
-    >>> print('{0:.3f}'.format(ibm1.translation_table['das']['book']))
-    0.062
-    >>> print('{0:.3f}'.format(ibm1.translation_table['buch'][None]))
-    0.113
-    >>> print('{0:.3f}'.format(ibm1.translation_table['ja'][None]))
-    0.073
+    >>> print(ibm1.translation_table['buch']['book'])
+    0.889...
+    >>> print(ibm1.translation_table['das']['book'])
+    0.061...
+    >>> print(ibm1.translation_table['buch'][None])
+    0.113...
+    >>> print(ibm1.translation_table['ja'][None])
+    0.072...
 
     >>> test_sentence = bitext[2]
     >>> test_sentence.words
@@ -95,7 +96,8 @@ class IBMModel1(IBMModel):
 
     """
 
-    def __init__(self, sentence_aligned_corpus, iterations):
+    def __init__(self, sentence_aligned_corpus, iterations,
+                 probability_tables=None):
         """
         Train on ``sentence_aligned_corpus`` and create a lexical
         translation model.
@@ -108,52 +110,84 @@ class IBMModel1(IBMModel):
 
         :param iterations: Number of iterations to run training algorithm
         :type iterations: int
+
+        :param probability_tables: Optional. Use this to pass in custom
+            probability values. If not specified, probabilities will be
+            set to a uniform distribution, or some other sensible value.
+            If specified, the following entry must be present:
+            ``translation_table``.
+            See ``IBMModel`` for the type and purpose of this table.
+        :type probability_tables: dict[str]: object
         """
         super(IBMModel1, self).__init__(sentence_aligned_corpus)
 
-        # seed with a uniform distribution
-        initial_prob = 1 / len(self.trg_vocab)
-        if initial_prob > IBMModel.MIN_PROB:
-            for t in self.trg_vocab:
-                for s in self.src_vocab:
-                    self.translation_table[t][s] = initial_prob
+        if probability_tables is None:
+            self.set_uniform_probabilities(sentence_aligned_corpus)
         else:
-            warnings.warn("Target language vocabulary is too large. "
-                          "Results may be less accurate.")
+            # Set user-defined probabilities
+            self.translation_table = probability_tables['translation_table']
+
+        for n in range(0, iterations):
+            self.train(sentence_aligned_corpus)
 
-        self.train(sentence_aligned_corpus, iterations)
         self.__align_all(sentence_aligned_corpus)
 
-    def train(self, parallel_corpus, iterations):
-        for i in range(0, iterations):
-            count_t_given_s = defaultdict(lambda: defaultdict(lambda: 0.0))
-            count_any_t_given_s = defaultdict(lambda: 0.0)
-
-            for aligned_sentence in parallel_corpus:
-                trg_sentence = aligned_sentence.words
-                src_sentence = [None] + aligned_sentence.mots
-                total_count = defaultdict(lambda: 0.0)
-
-                # E step (a): Compute normalization factors to weigh counts
-                for t in trg_sentence:
-                    if total_count[t] == 0.0:
-                        for s in src_sentence:
-                            total_count[t] += self.translation_table[t][s]
-
-                # E step (b): Collect counts
-                for t in trg_sentence:
-                    for s in src_sentence:
-                        count = self.translation_table[t][s]
-                        normalized_count = count / total_count[t]
-                        count_t_given_s[t][s] += normalized_count
-                        count_any_t_given_s[s] += normalized_count
-
-            # M step: Update probabilities with maximum likelihood estimate
-            for s in self.src_vocab:
-                for t in self.trg_vocab:
-                    estimate = count_t_given_s[t][s] / count_any_t_given_s[s]
-                    self.translation_table[t][s] = max(estimate,
-                                                       IBMModel.MIN_PROB)
+    def set_uniform_probabilities(self, sentence_aligned_corpus):
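+        # Seed t(t|s) with a uniform distribution over the target vocabulary.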
+        initial_prob = 1 / len(self.trg_vocab)
+        if initial_prob < IBMModel.MIN_PROB:
+            warnings.warn("Target language vocabulary is too large (" +
+                          str(len(self.trg_vocab)) + " words). "
+                          "Results may be less accurate.")
+
+        for t in self.trg_vocab:
+            self.translation_table[t] = defaultdict(lambda: initial_prob)
+
+    def train(self, parallel_corpus):
+        counts = Counts()
+        for aligned_sentence in parallel_corpus:
+            trg_sentence = aligned_sentence.words
+            src_sentence = [None] + aligned_sentence.mots
+
+            # E step (a): Compute normalization factors to weigh counts
+            total_count = self.prob_all_alignments(src_sentence, trg_sentence)
+
+            # E step (b): Collect counts
+            for t in trg_sentence:
+                for s in src_sentence:
+                    count = self.prob_alignment_point(s, t)
+                    normalized_count = count / total_count[t]
+                    counts.t_given_s[t][s] += normalized_count
+                    counts.any_t_given_s[s] += normalized_count
+
+        # M step: Update probabilities with maximum likelihood estimate
+        self.maximize_lexical_translation_probabilities(counts)
+
+    def prob_all_alignments(self, src_sentence, trg_sentence):
+        """
+        Computes the probability of all possible word alignments,
+        expressed as a marginal distribution over target words t.
+
+        Each entry in the return value represents the contribution to
+        the total alignment probability by the target word t.
+
+        To obtain probability(alignment | src_sentence, trg_sentence),
+        simply sum the entries in the return value.
+
+        :return: Probability of t for all s in ``src_sentence``
+        :rtype: dict(str): float
+        """
+        alignment_prob_for_t = defaultdict(lambda: 0.0)
+        for t in trg_sentence:
+            for s in src_sentence:
+                alignment_prob_for_t[t] += self.prob_alignment_point(s, t)
+        return alignment_prob_for_t
+
+    def prob_alignment_point(self, s, t):
+        """
+        Probability that word ``t`` in the target sentence is aligned to
+        word ``s`` in the source sentence
+        """
+        return self.translation_table[t][s]
 
     def prob_t_a_given_s(self, alignment_info):
         """
diff --git a/nltk/translate/ibm2.py b/nltk/translate/ibm2.py
new file mode 100644
index 0000000..64b7fa4
--- /dev/null
+++ b/nltk/translate/ibm2.py
@@ -0,0 +1,308 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: IBM Model 2
+#
+# Copyright (C) 2001-2013 NLTK Project
+# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Lexical translation model that considers word order.
+
+IBM Model 2 improves on Model 1 by accounting for word order.
+An alignment probability is introduced, a(i | j,l,m), which predicts
+a source word position, given its aligned target word's position.
+
+The EM algorithm used in Model 2 is:
+E step - In the training data, collect counts, weighted by prior
+         probabilities.
+         (a) count how many times a source language word is translated
+             into a target language word
+         (b) count how many times a particular position in the source
+             sentence is aligned to a particular position in the target
+             sentence
+
+M step - Estimate new probabilities based on the counts from the E step
+
+
+Notations:
+i: Position in the source sentence
+    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
+j: Position in the target sentence
+    Valid values are 1, 2, ..., length of target sentence
+l: Number of words in the source sentence, excluding NULL
+m: Number of words in the target sentence
+s: A word in the source language
+t: A word in the target language
+
+
+References:
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
+
+Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
+Robert L. Mercer. 1993. The Mathematics of Statistical Machine
+Translation: Parameter Estimation. Computational Linguistics, 19 (2),
+263-311.
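+
+In this implementation, ``prob_t_a_given_s`` scores a target sentence t and
+an alignment a against a source sentence s as
+prod_j t(t_j | s_{a_j}) * a(a_j | j, l, m),
+floored at ``IBMModel.MIN_PROB``.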
+"""
+
+from __future__ import division
+from collections import defaultdict
+from nltk.translate import AlignedSent
+from nltk.translate import Alignment
+from nltk.translate import IBMModel
+from nltk.translate import IBMModel1
+from nltk.translate.ibm_model import Counts
+import warnings
+
+
+class IBMModel2(IBMModel):
+    """
+    Lexical translation model that considers word order
+
+    >>> bitext = []
+    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
+    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
+    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
+    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
+    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
+    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
+
+    >>> ibm2 = IBMModel2(bitext, 5)
+
+    >>> print(round(ibm2.translation_table['buch']['book'], 3))
+    1.0
+    >>> print(round(ibm2.translation_table['das']['book'], 3))
+    0.0
+    >>> print(round(ibm2.translation_table['buch'][None], 3))
+    0.0
+    >>> print(round(ibm2.translation_table['ja'][None], 3))
+    0.0
+
+    >>> print(ibm2.alignment_table[1][1][2][2])
+    0.938...
+    >>> print(round(ibm2.alignment_table[1][2][2][2], 3))
+    0.0
+    >>> print(round(ibm2.alignment_table[2][2][4][5], 3))
+    1.0
+
+    >>> test_sentence = bitext[2]
+    >>> test_sentence.words
+    ['das', 'buch', 'ist', 'ja', 'klein']
+    >>> test_sentence.mots
+    ['the', 'book', 'is', 'small']
+    >>> test_sentence.alignment
+    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
+
+    """
+
+    def __init__(self, sentence_aligned_corpus, iterations,
+                 probability_tables=None):
+        """
+        Train on ``sentence_aligned_corpus`` and create a lexical
+        translation model and an alignment model.
+
+        Translation direction is from ``AlignedSent.mots`` to
+        ``AlignedSent.words``.
+
+        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
+        :type sentence_aligned_corpus: list(AlignedSent)
+
+        :param iterations: Number of iterations to run training algorithm
+        :type iterations: int
+
+        :param probability_tables: Optional. Use this to pass in custom
+            probability values. If not specified, probabilities will be
+            set to a uniform distribution, or some other sensible value.
+            If specified, all the following entries must be present:
+            ``translation_table``, ``alignment_table``.
+            See ``IBMModel`` for the type and purpose of these tables.
+        :type probability_tables: dict[str]: object
+        """
+        super(IBMModel2, self).__init__(sentence_aligned_corpus)
+
+        if probability_tables is None:
+            # Get translation probabilities from IBM Model 1
+            # Run more iterations of training for Model 1, since it is
+            # faster than Model 2
+            ibm1 = IBMModel1(sentence_aligned_corpus, 2 * iterations)
+            self.translation_table = ibm1.translation_table
+            self.set_uniform_probabilities(sentence_aligned_corpus)
+        else:
+            # Set user-defined probabilities
+            self.translation_table = probability_tables['translation_table']
+            self.alignment_table = probability_tables['alignment_table']
+
+        for n in range(0, iterations):
+            self.train(sentence_aligned_corpus)
+
+        self.__align_all(sentence_aligned_corpus)
+
+    def set_uniform_probabilities(self, sentence_aligned_corpus):
+        # a(i | j,l,m) = 1 / (l+1) for all i, j, l, m
+        l_m_combinations = set()
+        for aligned_sentence in sentence_aligned_corpus:
+            l = len(aligned_sentence.mots)
+            m = len(aligned_sentence.words)
+            if (l, m) not in l_m_combinations:
+                l_m_combinations.add((l, m))
+                initial_prob = 1 / float(l + 1)
+                if initial_prob < IBMModel.MIN_PROB:
+                    warnings.warn("A source sentence is too long (" + str(l) +
+                                  " words). Results may be less accurate.")
+
+                for i in range(0, l + 1):
+                    for j in range(1, m + 1):
+                        self.alignment_table[i][j][l][m] = initial_prob
+
+    def train(self, parallel_corpus):
+        counts = Model2Counts()
+        for aligned_sentence in parallel_corpus:
+            src_sentence = [None] + aligned_sentence.mots
+            trg_sentence = ['UNUSED'] + aligned_sentence.words  # 1-indexed
+            l = len(aligned_sentence.mots)
+            m = len(aligned_sentence.words)
+
+            # E step (a): Compute normalization factors to weigh counts
+            total_count = self.prob_all_alignments(src_sentence, trg_sentence)
+
+            # E step (b): Collect counts
+            for j in range(1, m + 1):
+                t = trg_sentence[j]
+                for i in range(0, l + 1):
+                    s = src_sentence[i]
+                    count = self.prob_alignment_point(
+                        i, j, src_sentence, trg_sentence)
+                    normalized_count = count / total_count[t]
+
+                    counts.update_lexical_translation(normalized_count, s, t)
+                    counts.update_alignment(normalized_count, i, j, l, m)
+
+        # M step: Update probabilities with maximum likelihood estimates
+        self.maximize_lexical_translation_probabilities(counts)
+        self.maximize_alignment_probabilities(counts)
+
+    def maximize_alignment_probabilities(self, counts):
+        MIN_PROB = IBMModel.MIN_PROB
+        for i, j_s in counts.alignment.items():
+            for j, src_sentence_lengths in j_s.items():
+                for l, trg_sentence_lengths in src_sentence_lengths.items():
+                    for m in trg_sentence_lengths:
+                        estimate = (counts.alignment[i][j][l][m] /
+                                    counts.alignment_for_any_i[j][l][m])
+                        self.alignment_table[i][j][l][m] = max(estimate,
+                                                               MIN_PROB)
+
+    def prob_all_alignments(self, src_sentence, trg_sentence):
+        """
+        Computes the probability of all possible word alignments,
+        expressed as a marginal distribution over target words t.
+
+        Each entry in the return value represents the contribution to
+        the total alignment probability by the target word t.
+
+        To obtain probability(alignment | src_sentence, trg_sentence),
+        simply sum the entries in the return value.
+
+        :return: Probability of t for all s in ``src_sentence``
+        :rtype: dict(str): float
+        """
+        alignment_prob_for_t = defaultdict(lambda: 0.0)
+        for j in range(1, len(trg_sentence)):
+            t = trg_sentence[j]
+            for i in range(0, len(src_sentence)):
+                alignment_prob_for_t[t] += self.prob_alignment_point(
+                    i, j, src_sentence, trg_sentence)
+        return alignment_prob_for_t
+
+    def prob_alignment_point(self, i, j, src_sentence, trg_sentence):
+        """
+        Probability that position j in ``trg_sentence`` is aligned to
+        position i in the ``src_sentence``
+        """
+        l = len(src_sentence) - 1
+        m = len(trg_sentence) - 1
+        s = src_sentence[i]
+        t = trg_sentence[j]
+        return self.translation_table[t][s] * self.alignment_table[i][j][l][m]
+
+    def prob_t_a_given_s(self, alignment_info):
+        """
+        Probability of target sentence and an alignment given the
+        source sentence
+        """
+        prob = 1.0
+        l = len(alignment_info.src_sentence) - 1
+        m = len(alignment_info.trg_sentence) - 1
+
+        for j, i in enumerate(alignment_info.alignment):
+            if j == 0:
+                continue  # skip the dummy zeroth element
+            trg_word = alignment_info.trg_sentence[j]
+            src_word = alignment_info.src_sentence[i]
+            prob *= (self.translation_table[trg_word][src_word] *
+                     self.alignment_table[i][j][l][m])
+
+        return max(prob, IBMModel.MIN_PROB)
+
+    def __align_all(self, parallel_corpus):
+        for sentence_pair in parallel_corpus:
+            self.__align(sentence_pair)
+
+    def __align(self, sentence_pair):
+        """
+        Determines the best word alignment for one sentence pair from
+        the corpus that the model was trained on.
+
+        The best alignment will be set in ``sentence_pair`` when the
+        method returns. In contrast with the internal implementation of
+        IBM models, the word indices in the ``Alignment`` are zero-
+        indexed, not one-indexed.
+
+        :param sentence_pair: A sentence in the source language and its
+            counterpart sentence in the target language
+        :type sentence_pair: AlignedSent
+        """
+        best_alignment = []
+
+        l = len(sentence_pair.mots)
+        m = len(sentence_pair.words)
+
+        for j, trg_word in enumerate(sentence_pair.words):
+            # Initialize trg_word to align with the NULL token
+            best_prob = (self.translation_table[trg_word][None] *
+                         self.alignment_table[0][j + 1][l][m])
+            best_prob = max(best_prob, IBMModel.MIN_PROB)
+            best_alignment_point = None
+            for i, src_word in enumerate(sentence_pair.mots):
+                align_prob = (self.translation_table[trg_word][src_word] *
+                              self.alignment_table[i + 1][j + 1][l][m])
+                if align_prob >= best_prob:
+                    best_prob = align_prob
+                    best_alignment_point = i
+
+            best_alignment.append((j, best_alignment_point))
+
+        sentence_pair.alignment = Alignment(best_alignment)
+
+
+class Model2Counts(Counts):
+    """
+    Data object to store counts of various parameters during training.
+    Includes counts for alignment.
+    """
+    def __init__(self):
+        super(Model2Counts, self).__init__()
+        self.alignment = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+                lambda: 0.0))))
+        self.alignment_for_any_i = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+
+    def update_lexical_translation(self, count, s, t):
+        self.t_given_s[t][s] += count
+        self.any_t_given_s[s] += count
+
+    def update_alignment(self, count, i, j, l, m):
+        self.alignment[i][j][l][m] += count
+        self.alignment_for_any_i[j][l][m] += count
diff --git a/nltk/align/ibm3.py b/nltk/translate/ibm3.py
similarity index 52%
rename from nltk/align/ibm3.py
rename to nltk/translate/ibm3.py
index 542c024..2bac7d2 100644
--- a/nltk/align/ibm3.py
+++ b/nltk/translate/ibm3.py
@@ -76,10 +76,11 @@ Translation: Parameter Estimation. Computational Linguistics, 19 (2),
 from __future__ import division
 from collections import defaultdict
 from math import factorial
-from nltk.align import AlignedSent
-from nltk.align import Alignment
-from nltk.align import IBMModel
-from nltk.align import IBMModel2
+from nltk.translate import AlignedSent
+from nltk.translate import Alignment
+from nltk.translate import IBMModel
+from nltk.translate import IBMModel2
+from nltk.translate.ibm_model import Counts
 import warnings
 
 
@@ -90,7 +91,7 @@ class IBMModel3(IBMModel):
 
     >>> bitext = []
     >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
-    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
+    >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
     >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
     >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
     >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
@@ -101,27 +102,27 @@ class IBMModel3(IBMModel):
 
     >>> ibm3 = IBMModel3(bitext, 5)
 
-    >>> print('{0:.3f}'.format(ibm3.translation_table['buch']['book']))
-    1.000
-    >>> print('{0:.3f}'.format(ibm3.translation_table['das']['book']))
-    0.000
-    >>> print('{0:.3f}'.format(ibm3.translation_table['ja'][None]))
-    1.000
+    >>> print(round(ibm3.translation_table['buch']['book'], 3))
+    1.0
+    >>> print(round(ibm3.translation_table['das']['book'], 3))
+    0.0
+    >>> print(round(ibm3.translation_table['ja'][None], 3))
+    1.0
 
-    >>> print('{0:.3f}'.format(ibm3.distortion_table[1][1][2][2]))
-    1.000
-    >>> print('{0:.3f}'.format(ibm3.distortion_table[1][2][2][2]))
-    0.000
-    >>> print('{0:.3f}'.format(ibm3.distortion_table[2][2][4][5]))
-    0.750
+    >>> print(round(ibm3.distortion_table[1][1][2][2], 3))
+    1.0
+    >>> print(round(ibm3.distortion_table[1][2][2][2], 3))
+    0.0
+    >>> print(round(ibm3.distortion_table[2][2][4][5], 3))
+    0.75
 
-    >>> print('{0:.3f}'.format(ibm3.fertility_table[2]['summarize']))
-    1.000
-    >>> print('{0:.3f}'.format(ibm3.fertility_table[1]['book']))
-    1.000
+    >>> print(round(ibm3.fertility_table[2]['summarize'], 3))
+    1.0
+    >>> print(round(ibm3.fertility_table[1]['book'], 3))
+    1.0
 
-    >>> print('{0:.3f}'.format(ibm3.p1))
-    0.026
+    >>> print(ibm3.p1)
+    0.054...
 
     >>> test_sentence = bitext[2]
     >>> test_sentence.words
@@ -133,7 +134,8 @@ class IBMModel3(IBMModel):
 
     """
 
-    def __init__(self, sentence_aligned_corpus, iterations):
+    def __init__(self, sentence_aligned_corpus, iterations,
+                 probability_tables=None):
         """
         Train on ``sentence_aligned_corpus`` and create a lexical
         translation model, a distortion model, a fertility model, and a
@@ -142,17 +144,43 @@ class IBMModel3(IBMModel):
         Translation direction is from ``AlignedSent.mots`` to
         ``AlignedSent.words``.
 
-        Runs a few iterations of Model 2 training to initialize
-        model parameters.
-
         :param sentence_aligned_corpus: Sentence-aligned parallel corpus
         :type sentence_aligned_corpus: list(AlignedSent)
 
         :param iterations: Number of iterations to run training algorithm
         :type iterations: int
+
+        :param probability_tables: Optional. Use this to pass in custom
+            probability values. If not specified, probabilities will be
+            set to a uniform distribution, or some other sensible value.
+            If specified, all the following entries must be present:
+            ``translation_table``, ``alignment_table``,
+            ``fertility_table``, ``p1``, ``distortion_table``.
+            See ``IBMModel`` for the type and purpose of these tables.
+        :type probability_tables: dict[str]: object
         """
         super(IBMModel3, self).__init__(sentence_aligned_corpus)
-
+        self.reset_probabilities()
+
+        if probability_tables is None:
+            # Get translation and alignment probabilities from IBM Model 2
+            ibm2 = IBMModel2(sentence_aligned_corpus, iterations)
+            self.translation_table = ibm2.translation_table
+            self.alignment_table = ibm2.alignment_table
+            self.set_uniform_probabilities(sentence_aligned_corpus)
+        else:
+            # Set user-defined probabilities
+            self.translation_table = probability_tables['translation_table']
+            self.alignment_table = probability_tables['alignment_table']
+            self.fertility_table = probability_tables['fertility_table']
+            self.p1 = probability_tables['p1']
+            self.distortion_table = probability_tables['distortion_table']
+
+        for n in range(0, iterations):
+            self.train(sentence_aligned_corpus)
+
+    def reset_probabilities(self):
+        super(IBMModel3, self).reset_probabilities()
         self.distortion_table = defaultdict(
             lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
                 lambda: self.MIN_PROB))))
@@ -161,148 +189,85 @@ class IBMModel3(IBMModel):
         Values accessed as ``distortion_table[j][i][l][m]``.
         """
 
-        # Get the translation and alignment probabilities from IBM model 2
-        ibm2 = IBMModel2(sentence_aligned_corpus, iterations)
-        self.translation_table = ibm2.translation_table
-
-        # Alignment table is only used for hill climbing and is not part
-        # of the output of Model 3 training
-        self.alignment_table = ibm2.alignment_table
-
-        # Initialize the distribution of distortion probability,
+    def set_uniform_probabilities(self, sentence_aligned_corpus):
         # d(j | i,l,m) = 1 / m for all i, j, l, m
+        l_m_combinations = set()
         for aligned_sentence in sentence_aligned_corpus:
             l = len(aligned_sentence.mots)
             m = len(aligned_sentence.words)
-            initial_value = 1 / m
-            if initial_value > IBMModel.MIN_PROB:
-                for i in range(0, l + 1):
-                    for j in range(1, m + 1):
-                        self.distortion_table[j][i][l][m] = initial_value
-            else:
-                warnings.warn("Target sentence is too long (" + str(m) +
-                              " words). Results may be less accurate.")
-
-        self.train(sentence_aligned_corpus, iterations)
-
-    def train(self, parallel_corpus, iterations):
-        for k in range(0, iterations):
-            max_fertility = 0
-
-            # Reset all counts
-            count_t_given_s = defaultdict(lambda: defaultdict(lambda: 0.0))
-            count_any_t_given_s = defaultdict(lambda: 0.0)
-
-            distortion_count = defaultdict(
-                lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
-                    lambda: 0.0))))
-            distortion_count_for_any_j = defaultdict(
-                lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
-
-            count_p0 = 0.0
-            count_p1 = 0.0
-
-            fertility_count = defaultdict(lambda: defaultdict(lambda: 0.0))
-            fertility_count_for_any_phi = defaultdict(lambda: 0.0)
-
-            for aligned_sentence in parallel_corpus:
-                src_sentence = [None] + aligned_sentence.mots
-                trg_sentence = ['UNUSED'] + aligned_sentence.words  # 1-indexed
-                l = len(aligned_sentence.mots)
-                m = len(aligned_sentence.words)
-
-                # Sample the alignment space
-                sampled_alignments, best_alignment = self.sample(
-                    aligned_sentence)
-                # Record the most probable alignment
-                aligned_sentence.alignment = Alignment(
-                    best_alignment.zero_indexed_alignment())
-
-                total_count = 0.0
-
-                # E step (a): Compute normalization factors to weigh counts
-                for alignment_info in sampled_alignments:
-                    count = self.prob_t_a_given_s(alignment_info)
-                    total_count += count
-
-                # E step (b): Collect counts
-                for alignment_info in sampled_alignments:
-                    count = self.prob_t_a_given_s(alignment_info)
-                    normalized_count = count / total_count
-                    null_count = 0
-
-                    for j in range(1, m + 1):
-                        t = trg_sentence[j]
-                        i = alignment_info.alignment[j]
-                        s = src_sentence[i]
-
-                        # Lexical translation
-                        count_t_given_s[t][s] += normalized_count
-                        count_any_t_given_s[s] += normalized_count
-
-                        # Distortion
-                        distortion_count[j][i][l][m] += normalized_count
-                        distortion_count_for_any_j[i][l][m] += normalized_count
-
-                        if i == 0:
-                            null_count += 1
-
-                    # NULL-aligned words generation
-                    count_p1 += null_count * normalized_count
-                    count_p0 += (m - 2 * null_count) * normalized_count
-
-                    # Fertility
+            if (l, m) not in l_m_combinations:
+                l_m_combinations.add((l, m))
+                initial_prob = 1 / float(m)
+                if initial_prob < IBMModel.MIN_PROB:
+                    warnings.warn("A target sentence is too long (" + str(m) +
+                                  " words). Results may be less accurate.")
+                for j in range(1, m + 1):
                     for i in range(0, l + 1):
-                        fertility = 0
-
-                        for j in range(1, m + 1):
-                            if i == alignment_info.alignment[j]:
-                                fertility += 1
-
-                        s = src_sentence[i]
-                        fertility_count[fertility][s] += normalized_count
-                        fertility_count_for_any_phi[s] += normalized_count
-
-                        if fertility > max_fertility:
-                            max_fertility = fertility
-
-            # M step: Update probabilities with maximum likelihood estimates
-            # If any probability is less than MIN_PROB, clamp it to MIN_PROB
-            MIN_PROB = IBMModel.MIN_PROB
-
-            # Lexical translation
-            for s in self.src_vocab:
-                for t in self.trg_vocab:
-                    estimate = count_t_given_s[t][s] / count_any_t_given_s[s]
-                    self.translation_table[t][s] = max(estimate, MIN_PROB)
-
-            # Distortion
-            for aligned_sentence in parallel_corpus:
-                l = len(aligned_sentence.mots)
-                m = len(aligned_sentence.words)
-
-                for i in range(0, l + 1):
-                    for j in range(1, m + 1):
-                        estimate = (distortion_count[j][i][l][m] /
-                                    distortion_count_for_any_j[i][l][m])
+                        self.distortion_table[j][i][l][m] = initial_prob
+
+        # simple initialization, taken from GIZA++
+        self.fertility_table[0] = defaultdict(lambda: 0.2)
+        self.fertility_table[1] = defaultdict(lambda: 0.65)
+        self.fertility_table[2] = defaultdict(lambda: 0.1)
+        self.fertility_table[3] = defaultdict(lambda: 0.04)
+        MAX_FERTILITY = 10
+        initial_fert_prob = 0.01 / (MAX_FERTILITY - 4)
+        for phi in range(4, MAX_FERTILITY):
+            self.fertility_table[phi] = defaultdict(lambda: initial_fert_prob)
+
+        self.p1 = 0.5
+
+    def train(self, parallel_corpus):
+        counts = Model3Counts()
+        for aligned_sentence in parallel_corpus:
+            l = len(aligned_sentence.mots)
+            m = len(aligned_sentence.words)
+
+            # Sample the alignment space
+            sampled_alignments, best_alignment = self.sample(aligned_sentence)
+            # Record the most probable alignment
+            aligned_sentence.alignment = Alignment(
+                best_alignment.zero_indexed_alignment())
+
+            # E step (a): Compute normalization factors to weigh counts
+            total_count = self.prob_of_alignments(sampled_alignments)
+
+            # E step (b): Collect counts
+            for alignment_info in sampled_alignments:
+                count = self.prob_t_a_given_s(alignment_info)
+                normalized_count = count / total_count
+
+                for j in range(1, m + 1):
+                    counts.update_lexical_translation(
+                        normalized_count, alignment_info, j)
+                    counts.update_distortion(
+                        normalized_count, alignment_info, j, l, m)
+
+                counts.update_null_generation(normalized_count, alignment_info)
+                counts.update_fertility(normalized_count, alignment_info)
+
+        # M step: Update probabilities with maximum likelihood estimates
+        # If any probability is less than MIN_PROB, clamp it to MIN_PROB
+        existing_alignment_table = self.alignment_table
+        self.reset_probabilities()
+        self.alignment_table = existing_alignment_table  # don't retrain
+
+        self.maximize_lexical_translation_probabilities(counts)
+        self.maximize_distortion_probabilities(counts)
+        self.maximize_fertility_probabilities(counts)
+        self.maximize_null_generation_probabilities(counts)
+
+    def maximize_distortion_probabilities(self, counts):
+        MIN_PROB = IBMModel.MIN_PROB
+        for j, i_s in counts.distortion.items():
+            for i, src_sentence_lengths in i_s.items():
+                for l, trg_sentence_lengths in src_sentence_lengths.items():
+                    for m in trg_sentence_lengths:
+                        estimate = (counts.distortion[j][i][l][m] /
+                                    counts.distortion_for_any_j[i][l][m])
                         self.distortion_table[j][i][l][m] = max(estimate,
                                                                 MIN_PROB)
 
-            # Fertility
-            for fertility in range(0, max_fertility + 1):
-                for s in self.src_vocab:
-                    estimate = (fertility_count[fertility][s] /
-                                fertility_count_for_any_phi[s])
-                    self.fertility_table[fertility][s] = max(estimate, MIN_PROB)
-
-            # NULL-aligned words generation
-            p1_estimate = count_p1 / (count_p1 + count_p0)
-            p1_estimate = max(p1_estimate, MIN_PROB)
-
-            # Clip p1 if it is too large, because p0 = 1 - p1 should
-            # not be smaller than MIN_PROB
-            self.p1 = min(p1_estimate, 1 - MIN_PROB)
-
     def prob_t_a_given_s(self, alignment_info):
         """
         Probability of target sentence and an alignment given the
@@ -351,3 +316,22 @@ class IBMModel3(IBMModel):
                 return MIN_PROB
 
         return probability
+
+
+class Model3Counts(Counts):
+    """
+    Data object to store counts of various parameters during training.
+    Includes counts for distortion.
+    """
+    def __init__(self):
+        super(Model3Counts, self).__init__()
+        self.distortion = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+                lambda: 0.0))))
+        self.distortion_for_any_j = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+
+    def update_distortion(self, count, alignment_info, j, l, m):
+        i = alignment_info.alignment[j]
+        self.distortion[j][i][l][m] += count
+        self.distortion_for_any_j[i][l][m] += count
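
The M step above (``maximize_distortion_probabilities``) turns the fractional counts collected in the E step into conditional probabilities by normalizing over all target positions j for a fixed (i, l, m) and clamping from below. A minimal sketch of that normalization with made-up counts; a flat dict stands in for the nested defaultdicts, and the MIN_PROB value is only a placeholder for IBMModel.MIN_PROB:

    from collections import defaultdict

    MIN_PROB = 1.0e-12  # placeholder for IBMModel.MIN_PROB (assumed value)

    distortion = defaultdict(float)            # counts.distortion[j][i][l][m], flattened
    distortion_for_any_j = defaultdict(float)  # counts.distortion_for_any_j[i][l][m]

    # Two sampled alignments put target position j=1 against source position i=2
    # (sentence lengths l=4, m=5); one put j=3 against the same i=2.
    for j, c in [(1, 0.5), (1, 0.25), (3, 0.25)]:
        distortion[(j, 2, 4, 5)] += c
        distortion_for_any_j[(2, 4, 5)] += c

    # d(j=1 | i=2, l=4, m=5) = 0.75 / 1.0, clamped from below at MIN_PROB
    estimate = distortion[(1, 2, 4, 5)] / distortion_for_any_j[(2, 4, 5)]
    print(max(estimate, MIN_PROB))  # 0.75
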
diff --git a/nltk/align/ibm4.py b/nltk/translate/ibm4.py
similarity index 92%
rename from nltk/align/ibm4.py
rename to nltk/translate/ibm4.py
index 6726e0b..36cf197 100644
--- a/nltk/align/ibm4.py
+++ b/nltk/translate/ibm4.py
@@ -104,12 +104,12 @@ Translation: Parameter Estimation. Computational Linguistics, 19 (2),
 from __future__ import division
 from collections import defaultdict
 from math import factorial
-from nltk.align import AlignedSent
-from nltk.align import Alignment
-from nltk.align import IBMModel
-from nltk.align import IBMModel3
-from nltk.align.ibm_model import Counts
-from nltk.align.ibm_model import longest_target_sentence_length
+from nltk.translate import AlignedSent
+from nltk.translate import Alignment
+from nltk.translate import IBMModel
+from nltk.translate import IBMModel3
+from nltk.translate.ibm_model import Counts
+from nltk.translate.ibm_model import longest_target_sentence_length
 import warnings
 
 
@@ -120,7 +120,7 @@ class IBMModel4(IBMModel):
 
     >>> bitext = []
     >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
-    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
+    >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
     >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
     >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
     >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
@@ -128,32 +128,32 @@ class IBMModel4(IBMModel):
     >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
     >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
     >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
-    >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'i': 4, 'summarize': 5 }
-    >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 }
+    >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 }
+    >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 }
 
     >>> ibm4 = IBMModel4(bitext, 5, src_classes, trg_classes)
 
-    >>> print('{0:.3f}'.format(ibm4.translation_table['buch']['book']))
-    1.000
-    >>> print('{0:.3f}'.format(ibm4.translation_table['das']['book']))
-    0.000
-    >>> print('{0:.3f}'.format(ibm4.translation_table['ja'][None]))
-    1.000
+    >>> print(round(ibm4.translation_table['buch']['book'], 3))
+    1.0
+    >>> print(round(ibm4.translation_table['das']['book'], 3))
+    0.0
+    >>> print(round(ibm4.translation_table['ja'][None], 3))
+    1.0
 
-    >>> print('{0:.3f}'.format(ibm4.head_distortion_table[1][0][1]))
-    1.000
-    >>> print('{0:.3f}'.format(ibm4.head_distortion_table[2][0][1]))
-    0.000
-    >>> print('{0:.3f}'.format(ibm4.non_head_distortion_table[3][6]))
-    0.500
+    >>> print(round(ibm4.head_distortion_table[1][0][1], 3))
+    1.0
+    >>> print(round(ibm4.head_distortion_table[2][0][1], 3))
+    0.0
+    >>> print(round(ibm4.non_head_distortion_table[3][6], 3))
+    0.5
 
-    >>> print('{0:.3f}'.format(ibm4.fertility_table[2]['summarize']))
-    1.000
-    >>> print('{0:.3f}'.format(ibm4.fertility_table[1]['book']))
-    1.000
+    >>> print(round(ibm4.fertility_table[2]['summarize'], 3))
+    1.0
+    >>> print(round(ibm4.fertility_table[1]['book'], 3))
+    1.0
 
-    >>> print('{0:.3f}'.format(ibm4.p1))
-    0.033
+    >>> print(ibm4.p1)
+    0.033...
 
     >>> test_sentence = bitext[2]
     >>> test_sentence.words
@@ -176,9 +176,6 @@ class IBMModel4(IBMModel):
         Translation direction is from ``AlignedSent.mots`` to
         ``AlignedSent.words``.
 
-        Runs a few iterations of Model 3 training to initialize
-        model parameters.
-
         :param sentence_aligned_corpus: Sentence-aligned parallel corpus
         :type sentence_aligned_corpus: list(AlignedSent)
 
@@ -215,7 +212,7 @@ class IBMModel4(IBMModel):
             self.alignment_table = ibm3.alignment_table
             self.fertility_table = ibm3.fertility_table
             self.p1 = ibm3.p1
-            self.set_uniform_distortion_probabilities(sentence_aligned_corpus)
+            self.set_uniform_probabilities(sentence_aligned_corpus)
         else:
             # Set user-defined probabilities
             self.translation_table = probability_tables['translation_table']
@@ -227,7 +224,7 @@ class IBMModel4(IBMModel):
             self.non_head_distortion_table = probability_tables[
                 'non_head_distortion_table']
 
-        for k in range(0, iterations):
+        for n in range(0, iterations):
             self.train(sentence_aligned_corpus)
 
     def reset_probabilities(self):
@@ -248,7 +245,7 @@ class IBMModel4(IBMModel):
         Values accessed as ``distortion_table[dj][trg_class]``.
         """
 
-    def set_uniform_distortion_probabilities(self, sentence_aligned_corpus):
+    def set_uniform_probabilities(self, sentence_aligned_corpus):
         """
         Set distortion probabilities uniformly to
         1 / cardinality of displacement values
@@ -280,9 +277,7 @@ class IBMModel4(IBMModel):
                 lambda: initial_prob)
 
     def train(self, parallel_corpus):
-        # Reset all counts
         counts = Model4Counts()
-
         for aligned_sentence in parallel_corpus:
             m = len(aligned_sentence.words)
 
@@ -314,8 +309,7 @@ class IBMModel4(IBMModel):
         # If any probability is less than MIN_PROB, clamp it to MIN_PROB
         existing_alignment_table = self.alignment_table
         self.reset_probabilities()
-        # don't retrain alignment table
-        self.alignment_table = existing_alignment_table
+        self.alignment_table = existing_alignment_table  # don't retrain
 
         self.maximize_lexical_translation_probabilities(counts)
         self.maximize_distortion_probabilities(counts)
@@ -433,7 +427,7 @@ class IBMModel4(IBMModel):
 class Model4Counts(Counts):
     """
     Data object to store counts of various parameters during training.
-    Include counts for distortion.
+    Includes counts for distortion.
     """
     def __init__(self):
         super(Model4Counts, self).__init__()
diff --git a/nltk/align/ibm5.py b/nltk/translate/ibm5.py
similarity index 95%
rename from nltk/align/ibm5.py
rename to nltk/translate/ibm5.py
index e50129e..94fa631 100644
--- a/nltk/align/ibm5.py
+++ b/nltk/translate/ibm5.py
@@ -114,12 +114,12 @@ Translation: Parameter Estimation. Computational Linguistics, 19 (2),
 from __future__ import division
 from collections import defaultdict
 from math import factorial
-from nltk.align import AlignedSent
-from nltk.align import Alignment
-from nltk.align import IBMModel
-from nltk.align import IBMModel4
-from nltk.align.ibm_model import Counts
-from nltk.align.ibm_model import longest_target_sentence_length
+from nltk.translate import AlignedSent
+from nltk.translate import Alignment
+from nltk.translate import IBMModel
+from nltk.translate import IBMModel4
+from nltk.translate.ibm_model import Counts
+from nltk.translate.ibm_model import longest_target_sentence_length
 import warnings
 
 
@@ -130,7 +130,7 @@ class IBMModel5(IBMModel):
 
     >>> bitext = []
     >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
-    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
+    >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
     >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
     >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
     >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
@@ -138,25 +138,25 @@ class IBMModel5(IBMModel):
     >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
     >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
     >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
-    >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'i': 4, 'summarize': 5 }
-    >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 }
+    >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 }
+    >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 }
 
     >>> ibm5 = IBMModel5(bitext, 5, src_classes, trg_classes)
 
-    >>> print('{0:.3f}'.format(ibm5.head_vacancy_table[1][1][1]))
-    1.000
-    >>> print('{0:.3f}'.format(ibm5.head_vacancy_table[2][1][1]))
-    0.000
-    >>> print('{0:.3f}'.format(ibm5.non_head_vacancy_table[3][3][6]))
-    1.000
+    >>> print(round(ibm5.head_vacancy_table[1][1][1], 3))
+    1.0
+    >>> print(round(ibm5.head_vacancy_table[2][1][1], 3))
+    0.0
+    >>> print(round(ibm5.non_head_vacancy_table[3][3][6], 3))
+    1.0
 
-    >>> print('{0:.3f}'.format(ibm5.fertility_table[2]['summarize']))
-    1.000
-    >>> print('{0:.3f}'.format(ibm5.fertility_table[1]['book']))
-    1.000
+    >>> print(round(ibm5.fertility_table[2]['summarize'], 3))
+    1.0
+    >>> print(round(ibm5.fertility_table[1]['book'], 3))
+    1.0
 
-    >>> print('{0:.3f}'.format(ibm5.p1))
-    0.033
+    >>> print(ibm5.p1)
+    0.033...
 
     >>> test_sentence = bitext[2]
     >>> test_sentence.words
@@ -223,7 +223,7 @@ class IBMModel5(IBMModel):
             self.p1 = ibm4.p1
             self.head_distortion_table = ibm4.head_distortion_table
             self.non_head_distortion_table = ibm4.non_head_distortion_table
-            self.set_uniform_distortion_probabilities(sentence_aligned_corpus)
+            self.set_uniform_probabilities(sentence_aligned_corpus)
         else:
             # Set user-defined probabilities
             self.translation_table = probability_tables['translation_table']
@@ -239,7 +239,7 @@ class IBMModel5(IBMModel):
             self.non_head_vacancy_table = probability_tables[
                 'non_head_vacancy_table']
 
-        for k in range(0, iterations):
+        for n in range(0, iterations):
             self.train(sentence_aligned_corpus)
 
     def reset_probabilities(self):
@@ -260,7 +260,7 @@ class IBMModel5(IBMModel):
         Values accessed as ``non_head_vacancy_table[dv][v_max][trg_class]``.
         """
 
-    def set_uniform_distortion_probabilities(self, sentence_aligned_corpus):
+    def set_uniform_probabilities(self, sentence_aligned_corpus):
         """
         Set vacancy probabilities uniformly to
         1 / cardinality of vacancy difference values
@@ -292,9 +292,7 @@ class IBMModel5(IBMModel):
                     lambda: initial_prob)
 
     def train(self, parallel_corpus):
-        # Reset all counts
         counts = Model5Counts()
-
         for aligned_sentence in parallel_corpus:
             l = len(aligned_sentence.mots)
             m = len(aligned_sentence.words)
@@ -330,8 +328,7 @@ class IBMModel5(IBMModel):
         # If any probability is less than MIN_PROB, clamp it to MIN_PROB
         existing_alignment_table = self.alignment_table
         self.reset_probabilities()
-        # don't retrain alignment table
-        self.alignment_table = existing_alignment_table
+        self.alignment_table = existing_alignment_table  # don't retrain
 
         self.maximize_lexical_translation_probabilities(counts)
         self.maximize_vacancy_probabilities(counts)
@@ -556,7 +553,7 @@ class IBMModel5(IBMModel):
 class Model5Counts(Counts):
     """
     Data object to store counts of various parameters during training.
-    Include counts for vacancies.
+    Includes counts for vacancies.
     """
     def __init__(self):
         super(Model5Counts, self).__init__()
diff --git a/nltk/align/ibm_model.py b/nltk/translate/ibm_model.py
similarity index 97%
rename from nltk/align/ibm_model.py
rename to nltk/translate/ibm_model.py
index cec7246..c249826 100644
--- a/nltk/align/ibm_model.py
+++ b/nltk/translate/ibm_model.py
@@ -54,8 +54,7 @@ def longest_target_sentence_length(sentence_aligned_corpus):
     max_m = 0
     for aligned_sentence in sentence_aligned_corpus:
         m = len(aligned_sentence.words)
-        if m > max_m:
-            max_m = m
+        max_m = max(m, max_m)
     return max_m
 
 
@@ -106,6 +105,14 @@ class IBMModel(object):
         Used in model 3 and higher.
         """
 
+    def set_uniform_probabilities(self, sentence_aligned_corpus):
+        """
+        Initialize probability tables to a uniform distribution
+
+        Derived classes should implement this accordingly.
+        """
+        pass
+
     def init_vocab(self, sentence_aligned_corpus):
         src_vocab = set()
         trg_vocab = set()
@@ -374,8 +381,8 @@ class AlignmentInfo(object):
     alignment, cepts, and fertility.
 
     Warning: Alignments are one-indexed here, in contrast to
-    nltk.align.Alignment and nltk.align.AlignedSent, which are zero-
-    indexed. This class is not meant to be used outside of IBM models.
+    nltk.translate.Alignment and AlignedSent, which are zero-indexed.
+    This class is not meant to be used outside of IBM models.
     """
 
     def __init__(self, alignment, src_sentence, trg_sentence, cepts):
@@ -475,7 +482,7 @@ class AlignmentInfo(object):
     def zero_indexed_alignment(self):
         """
         :return: Zero-indexed alignment, suitable for use in external
-            ``nltk.align`` modules like ``nltk.align.Alignment``
+            ``nltk.translate`` modules like ``nltk.translate.Alignment``
         :rtype: list(tuple)
         """
         zero_indexed_alignment = []
@@ -521,6 +528,6 @@ class Counts(object):
     def update_fertility(self, count, alignment_info):
         for i in range(0, len(alignment_info.src_sentence)):
             s = alignment_info.src_sentence[i]
-            phi = len(alignment_info.cepts[i])
+            phi = alignment_info.fertility_of_i(i)
             self.fertility[phi][s] += count
             self.fertility_for_any_phi[s] += count
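
The new ``set_uniform_probabilities`` hook in IBMModel ties together the renames in models 4 and 5 above: each model keeps its own initialization (distortion tables for model 4, vacancy tables for model 5) but exposes it under one name that shared code can call without knowing the concrete model. A rough sketch of that pattern with stand-in classes (the names here are illustrative, not the NLTK ones):

    class Base(object):
        def set_uniform_probabilities(self, sentence_aligned_corpus):
            """Derived classes override this; callers need not know which model it is."""
            pass

    class ModelFour(Base):
        def set_uniform_probabilities(self, sentence_aligned_corpus):
            print("uniform head / non-head distortion tables")

    class ModelFive(Base):
        def set_uniform_probabilities(self, sentence_aligned_corpus):
            print("uniform head / non-head vacancy tables")

    for model in (ModelFour(), ModelFive()):
        model.set_uniform_probabilities(sentence_aligned_corpus=None)  # one call site
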
diff --git a/nltk/translate/metrics.py b/nltk/translate/metrics.py
new file mode 100644
index 0000000..27c55a5
--- /dev/null
+++ b/nltk/translate/metrics.py
@@ -0,0 +1,39 @@
+# Natural Language Toolkit: Translation metrics
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Will Zhang <wilzzha at gmail.com>
+#         Guan Gui <ggui at student.unimelb.edu.au>
+#         Steven Bird <stevenbird1 at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+def alignment_error_rate(reference, hypothesis, possible=None):
+    """
+    Return the Alignment Error Rate (AER) of an alignment
+    with respect to a "gold standard" reference alignment.
+    Return an error rate between 0.0 (perfect alignment) and 1.0 (no
+    alignment).
+
+        >>> from nltk.translate import Alignment
+        >>> ref = Alignment([(0, 0), (1, 1), (2, 2)])
+        >>> test = Alignment([(0, 0), (1, 2), (2, 1)])
+        >>> alignment_error_rate(ref, test) # doctest: +ELLIPSIS
+        0.6666666666666667
+
+    :type reference: Alignment
+    :param reference: A gold standard alignment (sure alignments)
+    :type hypothesis: Alignment
+    :param hypothesis: A hypothesis alignment (aka. candidate alignments)
+    :type possible: Alignment or None
+    :param possible: A gold standard reference of possible alignments
+        (defaults to *reference* if None)
+    :rtype: float
+    """
+
+    if possible is None:
+        possible = reference
+    else:
+        assert(reference.issubset(possible)) # sanity check
+
+    return (1.0 - float(len(hypothesis & reference) + len(hypothesis & possible)) /
+            float(len(hypothesis) + len(reference)))
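
Working through the doctest above by hand (plain sets stand in for Alignment objects; ``possible`` defaults to ``reference``, so the two intersections coincide):

    ref = {(0, 0), (1, 1), (2, 2)}           # sure alignments S
    test = {(0, 0), (1, 2), (2, 1)}          # hypothesis A
    possible = ref                           # P defaults to S

    overlap_sure = len(test & ref)           # 1
    overlap_possible = len(test & possible)  # 1
    aer = 1.0 - float(overlap_sure + overlap_possible) / float(len(test) + len(ref))
    print(aer)                               # 1 - 2/6 = 0.666...
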
diff --git a/nltk/align/phrase_based.py b/nltk/translate/phrase_based.py
similarity index 79%
rename from nltk/align/phrase_based.py
rename to nltk/translate/phrase_based.py
index 7c16043..bec641b 100644
--- a/nltk/align/phrase_based.py
+++ b/nltk/translate/phrase_based.py
@@ -2,13 +2,13 @@
 # Natural Language Toolkit: Phrase Extraction Algorithm
 #
 # Copyright (C) 2001-2015 NLTK Project
-# Authors: Liling Tan and Fredrik Hedman
+# Authors: Liling Tan, Fredrik Hedman, Petra Barancikova
 # URL: <http://nltk.org/>
 # For license information, see LICENSE.TXT
 
 def extract(f_start, f_end, e_start, e_end, 
-            alignment, e_aligned, f_aligned,
-            srctext, trgtext, srclen, trglen):
+            alignment, f_aligned,
+            srctext, trgtext, srclen, trglen, max_phrase_length):
     """
     This function checks for alignment point consistency and extracts 
     phrases using the chunk of consistent phrases.
@@ -29,61 +29,51 @@ def extract(f_start, f_end, e_start, e_end,
     
     :type f_start: int
     :param f_start: Starting index of the possible foreign language phrases
-    
     :type f_end: int
     :param f_end: Ending index of the possible foreign language phrases
-    
     :type e_start: int
     :param e_start: Starting index of the possible source language phrases
-    
     :type e_end: int
     :param e_end: Ending index of the possible source language phrases
-    
     :type srctext: list
     :param srctext: The source language tokens, a list of string.
-    
     :type trgtext: list
     :param trgtext: The target language tokens, a list of string.
-    
     :type srclen: int
     :param srclen: The number of tokens in the source language tokens.
-    
     :type trglen: int
     :param trglen: The number of tokens in the target language tokens.
     """
+
     if f_end < 0:  # 0-based indexing.
         return {}
-    # Check if alignement points are consistent.
+    # Check if alignment points are consistent.
     for e,f in alignment:
-        if ((f_start <= f <= f_end) and
-           (e < e_start or e > e_end)):
+        if ((f_start <= f <= f_end) and (e < e_start or e > e_end)):
             return {}
 
     # Add phrase pairs (incl. additional unaligned f)
     phrases = set()
     fs = f_start
     while True:
-        fe = f_end
+        fe = min(f_end, f_start + max_phrase_length - 1)
         while True:
             # add phrase pair ([e_start, e_end], [fs, fe]) to set E
             # Need to +1 in range  to include the end-point.
-            src_phrase = " ".join(srctext[i] for i in 
-                                  range(e_start,e_end+1))
-            trg_phrase = " ".join(trgtext[i] for i in range(fs,fe+1))
+            src_phrase = " ".join(srctext[e_start:e_end+1])
+            trg_phrase = " ".join(trgtext[fs:fe+1])
             # Include more data for later ordering.
             phrases.add(((e_start, e_end+1), (f_start, f_end+1), 
                          src_phrase, trg_phrase))
             fe += 1
-            # if fe is in word alignment or out-of-bounds
             if fe in f_aligned or fe == trglen:
                 break
         fs -=1 
-        # if fs is in word alignment or out-of-bounds
         if fs in f_aligned or fs < 0:
             break
     return phrases
 
-def phrase_extraction(srctext, trgtext, alignment):
+def phrase_extraction(srctext, trgtext, alignment, max_phrase_length=0):
     """
     Phrase extraction algorithm extracts all consistent phrase pairs from 
     a word-aligned sentence pair.
@@ -133,50 +123,52 @@ def phrase_extraction(srctext, trgtext, alignment):
     
     :type srctext: str
     :param srctext: The sentence string from the source language.
-    
     :type trgtext: str
     :param trgtext: The sentence string from the target language.
-    
     :type alignment: list(tuple)
     :param alignment: The word alignment outputs as list of tuples, where
-    the first elements of tuples are the source words' indices and
-    second elements are the target words' indices. This is also the output
-    format of nltk/align/ibm1.py
-    
+        the first elements of tuples are the source words' indices and
+        second elements are the target words' indices. This is also the output
+        format of nltk.translate.ibm1
     :rtype: list(tuple)
     :return: A list of tuples, each element in a list is a phrase and each 
-    phrase is a tuple made up of (i) its source location, (ii) its target 
-    location, (iii) the source phrase and (iii) the target phrase. The phrase
-    list of tuples represents all the possible phrases extracted from the 
-    word alignments. 
+        phrase is a tuple made up of (i) its source location, (ii) its target 
+        location, (iii) the source phrase and (iv) the target phrase. The phrase
+        list of tuples represents all the possible phrases extracted from the 
+        word alignments. 
+    :type max_phrase_length: int
+    :param max_phrase_length: maximal phrase length; if 0 or unspecified,
+        it defaults to the length of the longer sentence (srctext or trgtext).
     """
-    # Calculate no. of tokens in source and target texts.
+
     srctext = srctext.split()   # e
     trgtext = trgtext.split()   # f
     srclen = len(srctext)       # len(e)
     trglen = len(trgtext)       # len(f)
     # Keeps an index of which source/target words that are aligned.
-    e_aligned = [i for i,_ in alignment]
     f_aligned = [j for _,j in alignment]
+    max_phrase_length = max_phrase_length or max(srclen, trglen)
+
+    # set of phrase pairs BP
+    bp = set()
 
-    bp = set() # set of phrase pairs BP
-    # Index e_start from 0 to len(e) - 1
     for e_start in range(srclen):
-        # Index e_end from e_start to len(e) - 1
-        for e_end in range(e_start, srclen):
+        max_idx = min(srclen, e_start + max_phrase_length)
+        for e_end in range(e_start, max_idx):
             # // find the minimally matching foreign phrase
             # (f start , f end ) = ( length(f), 0 )
             # f_start ∈ [0, len(f) - 1]; f_end ∈ [0, len(f) - 1]
             f_start, f_end = trglen-1 , -1  #  0-based indexing
-            # for all (e,f) ∈ A do
+ 
             for e,f in alignment:
                 if e_start <= e <= e_end:
                     f_start = min(f, f_start)
                     f_end = max(f, f_end)
             # add extract (f start , f end , e start , e end ) to set BP
             phrases = extract(f_start, f_end, e_start, e_end, 
-                              alignment, e_aligned, f_aligned,
-                              srctext, trgtext, srclen, trglen)
+                              alignment, f_aligned,
+                              srctext, trgtext, srclen, trglen,
+                              max_phrase_length)
             if phrases:
                 bp.update(phrases)
     return bp
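
A small usage sketch for the new ``max_phrase_length`` parameter, assuming the renamed module path from this patch; the toy sentence pair and alignment are made up, and with the cap set to 2 no extracted phrase spans more than two source words:

    from nltk.translate.phrase_based import phrase_extraction

    srctext = "das haus ist klein"
    trgtext = "the house is small"
    alignment = [(0, 0), (1, 1), (2, 2), (3, 3)]   # (source index, target index)

    phrases = phrase_extraction(srctext, trgtext, alignment, max_phrase_length=2)
    for src_span, trg_span, src_phrase, trg_phrase in sorted(phrases):
        print(src_span, trg_span, src_phrase, trg_phrase)
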
diff --git a/nltk/translate/stack_decoder.py b/nltk/translate/stack_decoder.py
new file mode 100644
index 0000000..0db00c5
--- /dev/null
+++ b/nltk/translate/stack_decoder.py
@@ -0,0 +1,495 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Stack decoder
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Tah Wei Hoon <hoon.tw at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A decoder that uses stacks to implement phrase-based translation.
+
+In phrase-based translation, the source sentence is segmented into
+phrases of one or more words, and translations for those phrases are
+used to build the target sentence.
+
+Hypothesis data structures are used to keep track of the source words
+translated so far and the partial output. A hypothesis can be expanded
+by selecting an untranslated phrase, looking up its translation in a
+phrase table, and appending that translation to the partial output.
+Translation is complete when a hypothesis covers all source words.
+
+The search space is huge because the source sentence can be segmented
+in different ways, the source phrases can be selected in any order,
+and there could be multiple translations for the same source phrase in
+the phrase table. To make decoding tractable, stacks are used to limit
+the number of candidate hypotheses by doing histogram and/or threshold
+pruning.
+
+Hypotheses with the same number of words translated are placed in the
+same stack. In histogram pruning, each stack has a size limit, and
+the hypothesis with the lowest score is removed when the stack is full.
+In threshold pruning, hypotheses that score below a certain threshold
+of the best hypothesis in that stack are removed.
+
+Hypothesis scoring can include various factors such as phrase
+translation probability, language model probability, length of
+translation, cost of remaining words to be translated, and so on.
+
+
+References:
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
+"""
+
+import warnings
+from collections import defaultdict
+from math import log
+
+
+class StackDecoder(object):
+    """
+    Phrase-based stack decoder for machine translation
+
+    >>> from nltk.translate import PhraseTable
+    >>> phrase_table = PhraseTable()
+    >>> phrase_table.add(('niemand',), ('nobody',), log(0.8))
+    >>> phrase_table.add(('niemand',), ('no', 'one'), log(0.2))
+    >>> phrase_table.add(('erwartet',), ('expects',), log(0.8))
+    >>> phrase_table.add(('erwartet',), ('expecting',), log(0.2))
+    >>> phrase_table.add(('niemand', 'erwartet'), ('one', 'does', 'not', 'expect'), log(0.1))
+    >>> phrase_table.add(('die', 'spanische', 'inquisition'), ('the', 'spanish', 'inquisition'), log(0.8))
+    >>> phrase_table.add(('!',), ('!',), log(0.8))
+
+    >>> #  nltk.model should be used here once it is implemented
+    >>> from collections import defaultdict
+    >>> language_prob = defaultdict(lambda: -999.0)
+    >>> language_prob[('nobody',)] = log(0.5)
+    >>> language_prob[('expects',)] = log(0.4)
+    >>> language_prob[('the', 'spanish', 'inquisition')] = log(0.2)
+    >>> language_prob[('!',)] = log(0.1)
+    >>> language_model = type('',(object,),{'probability_change': lambda self, context, phrase: language_prob[phrase], 'probability': lambda self, phrase: language_prob[phrase]})()
+
+    >>> stack_decoder = StackDecoder(phrase_table, language_model)
+
+    >>> stack_decoder.translate(['niemand', 'erwartet', 'die', 'spanische', 'inquisition', '!'])
+    ['nobody', 'expects', 'the', 'spanish', 'inquisition', '!']
+
+    """
+    def __init__(self, phrase_table, language_model):
+        """
+        :param phrase_table: Table of translations for source language
+            phrases and the log probabilities for those translations.
+        :type phrase_table: PhraseTable
+
+        :param language_model: Target language model. Must define a
+            ``probability_change`` method that calculates the change in
+            log probability of a sentence, if a given string is appended
+            to it.
+            This interface is experimental and will likely be replaced
+            with nltk.model once it is implemented.
+        :type language_model: object
+        """
+        self.phrase_table = phrase_table
+        self.language_model = language_model
+
+        self.word_penalty = 0.0
+        """
+        float: Influences the translation length exponentially.
+            If positive, shorter translations are preferred.
+            If negative, longer translations are preferred.
+            If zero, no penalty is applied.
+        """
+
+        self.beam_threshold = 0.0
+        """
+        float: Hypotheses that score below this factor of the best
+            hypothesis in a stack are dropped from consideration.
+            Value between 0.0 and 1.0.
+        """
+
+        self.stack_size = 100
+        """
+        int: Maximum number of hypotheses to consider in a stack.
+            Higher values increase the likelihood of a good translation,
+            but increases processing time.
+        """
+
+        self.__distortion_factor = 0.5
+        self.__compute_log_distortion()
+
+    @property
+    def distortion_factor(self):
+        """
+        float: Amount of reordering of source phrases.
+            Lower values favour monotone translation, suitable when
+            word order is similar for both source and target languages.
+            Value between 0.0 and 1.0. Default 0.5.
+        """
+        return self.__distortion_factor
+
+    @distortion_factor.setter
+    def distortion_factor(self, d):
+        self.__distortion_factor = d
+        self.__compute_log_distortion()
+
+    def __compute_log_distortion(self):
+        # cache log(distortion_factor) so we don't have to recompute it
+        # when scoring hypotheses
+        if self.__distortion_factor == 0.0:
+            self.__log_distortion_factor = log(1e-9)  # 1e-9 is almost zero
+        else:
+            self.__log_distortion_factor = log(self.__distortion_factor)
+
+    def translate(self, src_sentence):
+        """
+        :param src_sentence: Sentence to be translated
+        :type src_sentence: list(str)
+
+        :return: Translated sentence
+        :rtype: list(str)
+        """
+        sentence = tuple(src_sentence)  # prevent accidental modification
+        sentence_length = len(sentence)
+        stacks = [_Stack(self.stack_size, self.beam_threshold)
+                  for _ in range(0, sentence_length + 1)]
+        empty_hypothesis = _Hypothesis()
+        stacks[0].push(empty_hypothesis)
+
+        all_phrases = self.find_all_src_phrases(sentence)
+        future_score_table = self.compute_future_scores(sentence)
+        for stack in stacks:
+            for hypothesis in stack:
+                possible_expansions = StackDecoder.valid_phrases(all_phrases,
+                                                                 hypothesis)
+                for src_phrase_span in possible_expansions:
+                    src_phrase = sentence[src_phrase_span[0]:src_phrase_span[1]]
+                    for translation_option in (self.phrase_table.
+                                               translations_for(src_phrase)):
+                        raw_score = self.expansion_score(
+                            hypothesis, translation_option, src_phrase_span)
+                        new_hypothesis = _Hypothesis(
+                            raw_score=raw_score,
+                            src_phrase_span=src_phrase_span,
+                            trg_phrase=translation_option.trg_phrase,
+                            previous=hypothesis
+                        )
+                        new_hypothesis.future_score = self.future_score(
+                            new_hypothesis, future_score_table, sentence_length)
+                        total_words = new_hypothesis.total_translated_words()
+                        stacks[total_words].push(new_hypothesis)
+
+        if not stacks[sentence_length]:
+            warnings.warn('Unable to translate all words. '
+                          'The source sentence contains words not in '
+                          'the phrase table')
+            # Instead of returning empty output, perhaps a partial
+            # translation could be returned
+            return []
+
+        best_hypothesis = stacks[sentence_length].best()
+        return best_hypothesis.translation_so_far()
+
+    def find_all_src_phrases(self, src_sentence):
+        """
+        Finds all subsequences in src_sentence that have a phrase
+        translation in the translation table
+
+        :type src_sentence: tuple(str)
+
+        :return: Subsequences that have a phrase translation,
+            represented as a table of lists of end positions.
+            For example, if result[2] is [5, 6, 9], then there are
+            three phrases starting from position 2 in ``src_sentence``,
+            ending at positions 5, 6, and 9 exclusive. The list of
+            ending positions is in ascending order.
+        :rtype: list(list(int))
+        """
+        sentence_length = len(src_sentence)
+        phrase_indices = [[] for _ in src_sentence]
+        for start in range(0, sentence_length):
+            for end in range(start + 1, sentence_length + 1):
+                potential_phrase = src_sentence[start:end]
+                if potential_phrase in self.phrase_table:
+                    phrase_indices[start].append(end)
+        return phrase_indices
+
+    def compute_future_scores(self, src_sentence):
+        """
+        Determines the approximate scores for translating every
+        subsequence in ``src_sentence``
+
+        Future scores can be used as a look-ahead to determine the
+        difficulty of translating the remaining parts of a src_sentence.
+
+        :type src_sentence: tuple(str)
+
+        :return: Scores of subsequences referenced by their start and
+            end positions. For example, result[2][5] is the score of the
+            subsequence covering positions 2, 3, and 4.
+        :rtype: dict(int: (dict(int): float))
+        """
+        scores = defaultdict(lambda: defaultdict(lambda: float('-inf')))
+        for seq_length in range(1, len(src_sentence) + 1):
+            for start in range(0, len(src_sentence) - seq_length + 1):
+                end = start + seq_length
+                phrase = src_sentence[start:end]
+                if phrase in self.phrase_table:
+                    score = self.phrase_table.translations_for(
+                        phrase)[0].log_prob  # pick best (first) translation
+                    # Warning: API of language_model is subject to change
+                    score += self.language_model.probability(phrase)
+                    scores[start][end] = score
+
+                # check if a better score can be obtained by combining
+                # two child subsequences
+                for mid in range(start + 1, end):
+                    combined_score = (scores[start][mid] +
+                                      scores[mid][end])
+                    if combined_score > scores[start][end]:
+                        scores[start][end] = combined_score
+        return scores
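
The inner loop above lets a cheaper segmentation overwrite a direct phrase-table estimate for the same span; a quick check with made-up log scores:

    scores = {0: {1: -0.25, 2: -1.0}, 1: {2: -0.5}}  # direct estimates per span

    start, end = 0, 2
    for mid in range(start + 1, end):
        combined = scores[start][mid] + scores[mid][end]
        if combined > scores[start][end]:
            scores[start][end] = combined
    print(scores[0][2])                              # -0.75: splitting at 1 beats -1.0
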
+
+    def future_score(self, hypothesis, future_score_table, sentence_length):
+        """
+        Determines the approximate score for translating the
+        untranslated words in ``hypothesis``
+        """
+        score = 0.0
+        for span in hypothesis.untranslated_spans(sentence_length):
+            score += future_score_table[span[0]][span[1]]
+        return score
+
+    def expansion_score(self, hypothesis, translation_option, src_phrase_span):
+        """
+        Calculate the score of expanding ``hypothesis`` with
+        ``translation_option``
+
+        :param hypothesis: Hypothesis being expanded
+        :type hypothesis: _Hypothesis
+
+        :param translation_option: Information about the proposed expansion
+        :type translation_option: PhraseTableEntry
+
+        :param src_phrase_span: Word position span of the source phrase
+        :type src_phrase_span: tuple(int, int)
+        """
+        score = hypothesis.raw_score
+        score += translation_option.log_prob
+        # The API of language_model is subject to change; it could accept
+        # a string, a list of words, and/or some other type
+        score += self.language_model.probability_change(
+            hypothesis, translation_option.trg_phrase)
+        score += self.distortion_score(hypothesis, src_phrase_span)
+        score -= self.word_penalty * len(translation_option.trg_phrase)
+        return score
+
+    def distortion_score(self, hypothesis, next_src_phrase_span):
+        if not hypothesis.src_phrase_span:
+            return 0.0
+        next_src_phrase_start = next_src_phrase_span[0]
+        prev_src_phrase_end = hypothesis.src_phrase_span[1]
+        distortion_distance = next_src_phrase_start - prev_src_phrase_end
+        return abs(distortion_distance) * self.__log_distortion_factor
+
+    @staticmethod
+    def valid_phrases(all_phrases_from, hypothesis):
+        """
+        Extract phrases from ``all_phrases_from`` that contain words
+        that have not been translated by ``hypothesis``
+
+        :param all_phrases_from: Phrases represented by their spans, in
+            the same format as the return value of
+            ``find_all_src_phrases``
+        :type all_phrases_from: list(list(int))
+
+        :type hypothesis: _Hypothesis
+
+        :return: A list of phrases, represented by their spans, that
+            cover untranslated positions.
+        :rtype: list(tuple(int, int))
+        """
+        untranslated_spans = hypothesis.untranslated_spans(
+            len(all_phrases_from))
+        valid_phrases = []
+        for available_span in untranslated_spans:
+            start = available_span[0]
+            available_end = available_span[1]
+            while start < available_end:
+                for phrase_end in all_phrases_from[start]:
+                    if phrase_end > available_end:
+                        # Subsequent elements in all_phrases_from[start]
+                        # will also be > available_end, since the
+                        # elements are in ascending order
+                        break
+                    valid_phrases.append((start, phrase_end))
+                start += 1
+        return valid_phrases
+
+
+class _Hypothesis(object):
+    """
+    Partial solution to a translation.
+
+    Records the word positions of the phrase being translated, its
+    translation, raw score, and the cost of the untranslated parts of
+    the sentence. When the next phrase is selected to build upon the
+    partial solution, a new _Hypothesis object is created, with a back
+    pointer to the previous hypothesis.
+
+    To find out which words have been translated so far, look at the
+    ``src_phrase_span`` in the hypothesis chain. Similarly, the
+    translation output can be found by traversing up the chain.
+    """
+    def __init__(self, raw_score=0.0, src_phrase_span=(), trg_phrase=(),
+                 previous=None, future_score=0.0):
+        """
+        :param raw_score: Likelihood of hypothesis so far.
+            Higher is better. Does not account for untranslated words.
+        :type raw_score: float
+
+        :param src_phrase_span: Span of word positions covered by the
+            source phrase in this hypothesis expansion. For example,
+            (2, 5) means that the phrase is from the second word up to,
+            but not including, the fifth word in the source sentence.
+        :type src_phrase_span: tuple(int)
+
+        :param trg_phrase: Translation of the source phrase in this
+            hypothesis expansion
+        :type trg_phrase: tuple(str)
+
+        :param previous: Previous hypothesis before expansion to this one
+        :type previous: _Hypothesis
+
+        :param future_score: Approximate score for translating the
+            remaining words not covered by this hypothesis. Higher means
+            that the remaining words are easier to translate.
+        :type future_score: float
+        """
+        self.raw_score = raw_score
+        self.src_phrase_span = src_phrase_span
+        self.trg_phrase = trg_phrase
+        self.previous = previous
+        self.future_score = future_score
+
+    def score(self):
+        """
+        Overall score of hypothesis after accounting for local and
+        global features
+        """
+        return self.raw_score + self.future_score
+
+    def untranslated_spans(self, sentence_length):
+        """
+        Starting from each untranslated word, find the longest
+        continuous span of untranslated positions
+
+        :param sentence_length: Length of source sentence being
+            translated by the hypothesis
+        :type sentence_length: int
+
+        :rtype: list(tuple(int, int))
+        """
+        translated_positions = self.translated_positions()
+        translated_positions.sort()
+        translated_positions.append(sentence_length)  # add sentinel position
+
+        untranslated_spans = []
+        start = 0
+        # each untranslated span must end in one of the translated_positions
+        for end in translated_positions:
+            if start < end:
+                untranslated_spans.append((start, end))
+            start = end + 1
+
+        return untranslated_spans
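
For instance, if the hypothesis chain has translated positions 2, 3 and 5 of a seven-word sentence, the sentinel turns the gaps into three spans (made-up values):

    translated_positions = [2, 3, 5]
    translated_positions.append(7)      # sentinel: sentence_length

    spans, start = [], 0
    for end in translated_positions:
        if start < end:
            spans.append((start, end))
        start = end + 1
    print(spans)                        # [(0, 2), (4, 5), (6, 7)]
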
+
+    def translated_positions(self):
+        """
+        List of positions in the source sentence of words already
+        translated. The list is not sorted.
+
+        :rtype: list(int)
+        """
+        translated_positions = []
+        current_hypothesis = self
+        while current_hypothesis.previous is not None:
+            translated_span = current_hypothesis.src_phrase_span
+            translated_positions.extend(range(translated_span[0],
+                                              translated_span[1]))
+            current_hypothesis = current_hypothesis.previous
+        return translated_positions
+
+    def total_translated_words(self):
+        return len(self.translated_positions())
+
+    def translation_so_far(self):
+        translation = []
+        self.__build_translation(self, translation)
+        return translation
+
+    def __build_translation(self, hypothesis, output):
+        if hypothesis.previous is None:
+            return
+        self.__build_translation(hypothesis.previous, output)
+        output.extend(hypothesis.trg_phrase)
+
+
+class _Stack(object):
+    """
+    Collection of _Hypothesis objects
+    """
+    def __init__(self, max_size=100, beam_threshold=0.0):
+        """
+        :param beam_threshold: Hypotheses that score less than this
+            factor of the best hypothesis are discarded from the stack.
+            Value must be between 0.0 and 1.0.
+        :type beam_threshold: float
+        """
+        self.max_size = max_size
+        self.items = []
+
+        if beam_threshold == 0.0:
+            self.__log_beam_threshold = float('-inf')
+        else:
+            self.__log_beam_threshold = log(beam_threshold)
+
+    def push(self, hypothesis):
+        """
+        Add ``hypothesis`` to the stack.
+        Removes lowest scoring hypothesis if the stack is full.
+        After insertion, hypotheses that score less than
+        ``beam_threshold`` times the score of the best hypothesis
+        are removed.
+        """
+        self.items.append(hypothesis)
+        self.items.sort(key=lambda h: h.score(), reverse=True)
+        while len(self.items) > self.max_size:
+            self.items.pop()
+        self.threshold_prune()
+
+    def threshold_prune(self):
+        if not self.items:
+            return
+        #  log(score * beam_threshold) = log(score) + log(beam_threshold)
+        threshold = self.items[0].score() + self.__log_beam_threshold
+        for hypothesis in reversed(self.items):
+            if hypothesis.score() < threshold:
+                self.items.pop()
+            else:
+                break
+
+    def best(self):
+        """
+        :return: Hypothesis with the highest score in the stack
+        :rtype: _Hypothesis
+        """
+        if self.items:
+            return self.items[0]
+        return None
+
+    def __iter__(self):
+        return iter(self.items)
+
+    def __contains__(self, hypothesis):
+        return hypothesis in self.items
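
``_Stack.threshold_prune`` above works in log space, using log(score * beam_threshold) = log(score) + log(beam_threshold). A quick check of that arithmetic with made-up scores:

    from math import log

    beam_threshold = 0.5
    best_score = log(0.4)                      # best hypothesis, as a log-probability
    cutoff = best_score + log(beam_threshold)  # equals log(0.4 * 0.5) = log(0.2)

    # a hypothesis scoring log(0.1) falls below the cutoff and would be pruned
    print(log(0.1) < cutoff)                   # True
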
diff --git a/nltk/twitter/__init__.py b/nltk/twitter/__init__.py
index ef83dfc..4ffb61f 100644
--- a/nltk/twitter/__init__.py
+++ b/nltk/twitter/__init__.py
@@ -17,9 +17,12 @@ try:
     import twython
 except ImportError:
     import warnings
-    warnings.warn("nltk.twitter package not loaded "
-                  "(please install twython library).")
+    warnings.warn("The twython library has not been installed. "
+                  "Some functionality from the twitter package will not be available.")
+else:
+    from nltk.twitter.util import Authenticate, credsfromfile
+    from nltk.twitter.twitterclient import Streamer, Query, Twitter,\
+         TweetViewer, TweetWriter
 
-from nltk.twitter.util import Authenticate, credsfromfile, json2csv
-from nltk.twitter.twitterclient import Streamer, Query, Twitter,\
-     TweetViewer, TweetWriter
+
+from nltk.twitter.common import json2csv
diff --git a/nltk/twitter/api.py b/nltk/twitter/api.py
index cda926d..1ff9be4 100644
--- a/nltk/twitter/api.py
+++ b/nltk/twitter/api.py
@@ -13,6 +13,7 @@ handling.
 """
 
 from datetime import tzinfo, timedelta, datetime
+from nltk.compat import UTC
 import time as _time
 
 
@@ -45,21 +46,29 @@ LOCAL = LocalTimezoneOffsetWithUTC()
 
 class BasicTweetHandler(object):
     """
-    Minimum implementation of TweetHandler
-    Counts the number of tweets and decides when the client shoud stop
-    fetching tweets
+    Minimal implementation of `TweetHandlerI`.
+
+    Counts the number of Tweets and decides when the client should stop
+    fetching them.
     """
     def __init__(self, limit=20):
         self.limit = limit
         self.counter = 0
-        
-        """A flag to indicate that to the client to stop for
-        a functional clause (e.g. date limit)"""
+
+        """
+        A flag to indicate to the client whether to stop fetching data given
+        some condition (e.g., reaching a date limit).
+        """
         self.do_stop = False
 
+        """
+        Stores the id of the last fetched Tweet to handle pagination.
+        """
+        self.max_id = None
+
     def do_continue(self):
         """
-        Returns false if the client should stop fetching tweets
+        Returns `False` if the client should stop fetching Tweets.
         """
         return self.counter < self.limit and not self.do_stop
 
@@ -68,22 +77,27 @@ class TweetHandlerI(BasicTweetHandler):
     Interface class whose subclasses should implement a handle method that
     Twitter clients can delegate to.
     """
-    def __init__(self, limit=20, date_limit=None):
+    def __init__(self, limit=20, upper_date_limit=None, lower_date_limit=None):
         """
-        :param int limit: The number of data items to process in the current round of\
-        processing.
+        :param int limit: The number of data items to process in the current\
+        round of processing.
 
-        :param tuple date_limit: The date at which to stop collecting new\
-        data. This should be entered as a tuple which can serve as the\
-        argument to `datetime.datetime`. E.g. `data_limit=(2015, 4, 1, 12,\
-        40)` for 12:30 pm on April 1 2015.
+        :param tuple upper_date_limit: The date at which to stop collecting\
+        new data. This should be entered as a tuple which can serve as the\
+        argument to `datetime.datetime`.\
+        E.g. `upper_date_limit=(2015, 4, 1, 12, 40)` for 12:40 pm on April 1 2015.
 
+        :param tuple lower_date_limit: The oldest date for which to collect new\
+        data; the client stops once it reaches Tweets older than this. See\
+        `upper_date_limit` for formatting.
         """
         BasicTweetHandler.__init__(self, limit)
 
-        self.date_limit = date_limit
-        if date_limit is not None:
-            self.date_limit = datetime(*date_limit, tzinfo=LOCAL)
+        self.upper_date_limit = None
+        self.lower_date_limit = None
+        if upper_date_limit:
+            self.upper_date_limit = datetime(*upper_date_limit, tzinfo=LOCAL)
+        if lower_date_limit:
+            self.lower_date_limit = datetime(*lower_date_limit, tzinfo=LOCAL)
 
         self.startingup = True
 
@@ -98,4 +112,25 @@ class TweetHandlerI(BasicTweetHandler):
         Actions when the tweet limit has been reached
         """
         raise NotImplementedError
-        
\ No newline at end of file
+
+    def check_date_limit(self, data, verbose=False):
+        """
+        Validate date limits.
+        """
+        if self.upper_date_limit or self.lower_date_limit:
+            date_fmt = '%a %b %d %H:%M:%S +0000 %Y'
+            tweet_date = \
+                datetime.strptime(data['created_at'],
+                                  date_fmt).replace(tzinfo=UTC)
+            if (self.upper_date_limit and tweet_date > self.upper_date_limit) or \
+               (self.lower_date_limit and tweet_date < self.lower_date_limit):
+                if self.upper_date_limit:
+                    message = "earlier"
+                    date_limit = self.upper_date_limit
+                else:
+                    message = "later"
+                    date_limit = self.lower_date_limit
+                if verbose:
+                    print("Date limit {0} is {1} than date of current tweet {2}".\
+                      format(date_limit, message, tweet_date))
+                self.do_stop = True
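
A rough sketch of the comparison ``check_date_limit`` performs, using the standard library's ``timezone.utc`` (Python 3) in place of ``nltk.compat.UTC`` and a made-up timestamp:

    from datetime import datetime, timezone

    date_fmt = '%a %b %d %H:%M:%S +0000 %Y'          # Twitter's created_at format
    created_at = 'Wed Apr 01 12:40:00 +0000 2015'    # made-up Tweet timestamp
    tweet_date = datetime.strptime(created_at, date_fmt).replace(tzinfo=timezone.utc)

    lower = datetime(2015, 4, 1, tzinfo=timezone.utc)
    upper = datetime(2015, 4, 2, tzinfo=timezone.utc)
    print(lower <= tweet_date <= upper)              # True: within both limits, keep fetching
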
diff --git a/nltk/twitter/util.py b/nltk/twitter/common.py
similarity index 69%
copy from nltk/twitter/util.py
copy to nltk/twitter/common.py
index d26d32c..9428c64 100644
--- a/nltk/twitter/util.py
+++ b/nltk/twitter/common.py
@@ -8,17 +8,17 @@
 # For license information, see LICENSE.TXT
 
 """
-Utility functions to accompany :module:`twitterclient`.
+Utility functions for the :module:`twitterclient` module which do not require
+the `twython` library to have been installed.
 """
 from __future__ import print_function
+
 import csv
+import gzip
 import json
-import os
-import pprint
+
 import nltk.compat as compat
-import gzip
 
-from twython import Twython
 
 HIER_SEPARATOR = "."
 
@@ -38,7 +38,6 @@ def extract_fields(tweet, fields):
             raise RuntimeError('Fatal error when extracting fields. Cannot find field ', field)
     return out
 
-
 def _add_field_to_out(json, field, out):
     if _is_composed_key(field):
         key, value = _get_key_value_composed(field)
@@ -126,6 +125,7 @@ def json2csv(fp, outfile, fields, encoding='utf8', errors='replace',
         writer.writerow(row)
     outf.close()
 
+
 def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
     """
     Identify appropriate CSV writer given the Python version
@@ -145,8 +145,6 @@ def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
     return (writer, outf)
 
 
-
-
 def json2csv_entities(tweets_file, outfile, main_fields, entity_type, entity_fields,
                       encoding='utf8', errors='replace', gzip_compress=False):
     """
@@ -258,132 +256,3 @@ def _write_to_file(object_fields, items, entity_fields, writer):
         row = object_fields + extract_fields(item, entity_fields)
         writer.writerow(row)
 
-
-def credsfromfile(creds_file=None, subdir=None, verbose=False):
-    """
-    Convenience function for authentication
-    """
-    return Authenticate().load_creds(creds_file=creds_file, subdir=subdir, verbose=verbose)
-
-
-class Authenticate(object):
-    """
-    Methods for authenticating with Twitter.
-    """
-    def __init__(self):
-        self.creds_file = 'credentials.txt'
-        self.creds_fullpath = None
-
-        self.oauth = {}
-        try:
-            self.twitter_dir = os.environ['TWITTER']
-            self.creds_subdir = self.twitter_dir
-        except KeyError:
-            self.twitter_dir = None
-            self.creds_subdir = None
-
-
-    def load_creds(self, creds_file=None, subdir=None, verbose=False):
-        """
-        Read OAuth credentials from a text file.
-
-        ::
-           File format for OAuth 1
-           =======================
-           app_key=YOUR_APP_KEY
-           app_secret=YOUR_APP_SECRET
-           oauth_token=OAUTH_TOKEN
-           oauth_token_secret=OAUTH_TOKEN_SECRET
-
-
-        ::
-           File format for OAuth 2
-           =======================
-
-           app_key=YOUR_APP_KEY
-           app_secret=YOUR_APP_SECRET
-           access_token=ACCESS_TOKEN
-
-        :param str file_name: File containing credentials. ``None`` (default) reads\
-        data from `TWITTER/'credentials.txt'`
-        """
-        if creds_file is not None:
-            self.creds_file = creds_file
-
-        if subdir is None:
-            if self.creds_subdir is None:
-                msg = "Supply a value to the 'subdir' parameter or" +\
-                      " set the TWITTER environment variable."
-                raise ValueError(msg)
-        else:
-            self.creds_subdir = subdir
-
-        self.creds_fullpath =\
-            os.path.normpath(os.path.join(self.creds_subdir, self.creds_file))
-
-        if not os.path.isfile(self.creds_fullpath):
-            raise OSError('Cannot find file {}'.format(self.creds_fullpath))
-
-        with open(self.creds_fullpath) as infile:
-            if verbose:
-                print('Reading credentials file {}'.format(self.creds_fullpath))
-
-            for line in infile:
-                if '=' in line:
-                    name, value = line.split('=', 1)
-                    self.oauth[name.strip()] = value.strip()
-
-        self._validate_creds_file(verbose=verbose)
-
-        return self.oauth
-
-    def _validate_creds_file(self, verbose=False):
-        """Check validity of a credentials file."""
-        oauth1 = False
-        oauth1_keys = ['app_key', 'app_secret', 'oauth_token', 'oauth_token_secret']
-        oauth2 = False
-        oauth2_keys = ['app_key', 'app_secret', 'access_token']
-        if all(k in self.oauth for k in oauth1_keys):
-            oauth1 = True
-        elif all(k in self.oauth for k in oauth2_keys):
-            oauth2 = True
-
-        if not (oauth1 or oauth2):
-            msg = 'Missing or incorrect entries in {}\n'.format(self.creds_file)
-            msg += pprint.pformat(self.oauth)
-            raise ValueError(msg)
-        elif verbose:
-            print('Credentials file "{}" looks good'.format(self.creds_file))
-
-
-def add_access_token(creds_file=None):
-    """
-    For OAuth 2, retrieve an access token for an app and append it to a
-    credentials file.
-    """
-    if creds_file is None:
-        path = os.path.dirname(__file__)
-        creds_file = os.path.join(path, 'credentials2.txt')
-    oauth2 = credsfromfile(creds_file=creds_file)
-    app_key = oauth2['app_key']
-    app_secret = oauth2['app_secret']
-
-    twitter = Twython(app_key, app_secret, oauth_version=2)
-    access_token = twitter.obtain_access_token()
-    tok = 'access_token={}\n'.format(access_token)
-    with open(creds_file, 'a') as infile:
-        print(tok, file=infile)
-
-
-def guess_path(pth):
-    """
-    If the path is not absolute, guess that it is a subdirectory of the
-    user's home directory.
-
-    :param str pth: The pathname of the directory where files of tweets should be written
-    """
-    if os.path.isabs(pth):
-        return pth
-    else:
-        return os.path.expanduser(os.path.join("~", pth))
-
diff --git a/nltk/twitter/twitter_demo.py b/nltk/twitter/twitter_demo.py
index 40ca855..6567ba3 100644
--- a/nltk/twitter/twitter_demo.py
+++ b/nltk/twitter/twitter_demo.py
@@ -13,7 +13,7 @@ Examples to demo the :py:mod:`twitterclient` code.
 These demo functions should all run, with the following caveats:
 
 * You must have obtained API keys from Twitter, and installed them according to
-  the instructions in `nltk/test/twitter.ipynb`.
+  the instructions in the `twitter HOWTO <http://www.nltk.org/howto/twitter.html>`_.
 
 * If you are on a slow network, some of the calls to the Twitter API may
   timeout.
@@ -30,7 +30,9 @@ For documentation about the Twitter APIs, see `The Streaming APIs Overview
 For error codes see Twitter's
 `Error Codes and Responses <https://dev.twitter.com/overview/api/response-codes>`
 """
+from __future__ import print_function
 
+import datetime
 from functools import wraps
 import json
 
@@ -53,14 +55,21 @@ def verbose(func):
         return func(*args, **kwargs)
     return with_formatting
 
+def yesterday():
+    """
+    Get yesterday's datetime as a 6-tuple.
+    """
+    date = datetime.datetime.now()
+    date -= datetime.timedelta(days=1)
+    date_tuple = date.timetuple()[:6]
+    return date_tuple
 
 def setup():
     """
     Initialize global variables for the demos.
     """
-    global DATE, USERIDS, FIELDS
+    global USERIDS, FIELDS
 
-    DATE = (2015, 4, 20, 16, 40)
     USERIDS = ['759251', '612473', '15108702', '6017542', '2673523800']
     # UserIDs corresponding to\
     #           @CNN,    @BBCNews, @ReutersLive, @BreakingNews, @AJELive
@@ -83,7 +92,7 @@ def twitterclass_demo():
     print("Follow two accounts in the public stream" +
           " -- be prepared to wait a few minutes\n")
     tw = Twitter()
-    tw.tweets(follow=['759251', '6017542'], stream=True, limit=10) #public stream
+    tw.tweets(follow=['759251', '6017542'], stream=True, limit=5) #public stream
 
 
 @verbose
@@ -172,14 +181,25 @@ def streamtofile_demo(limit=20):
 
 
 @verbose
-def limit_by_time_demo(limit=20):
+def limit_by_time_demo(keywords="nltk"):
     """
-    Sample from the Streaming API and send output to terminal.
+    Query the REST API for Tweets about NLTK since yesterday and send
+    the output to terminal.
+
+    This example makes the assumption that there are sufficient Tweets since
+    yesterday for the date to be an effective cut-off.
     """
+    date = yesterday()
+    dt_date = datetime.datetime(*date)
     oauth = credsfromfile()
-    client = Streamer(**oauth)
-    client.register(TweetWriter(limit=limit, date_limit=DATE))
-    client.sample()
+    client = Query(**oauth)
+    client.register(TweetViewer(limit=100, lower_date_limit=date))
+
+    print("Cutoff date: {}\n".format(dt_date))
+
+    for tweet in client.search_tweets(keywords=keywords):
+        print("{} ".format(tweet['created_at']), end='')
+        client.handler.handle(tweet)
 
 
 @verbose
@@ -217,7 +237,7 @@ def corpusreader_demo():
 def expand_tweetids_demo():
     """
     Given a file object containing a list of Tweet IDs, fetch the
-    corresponding full Tweets.
+    corresponding full Tweets, if available.
 
     """
     ids_f =\
@@ -237,11 +257,12 @@ def expand_tweetids_demo():
     hydrated = client.expand_tweetids(ids_f)
 
     for tweet in hydrated:
-        try:
             id_str = tweet['id_str']
-            print('id: {}\ntext: {}\n'.format(id_str, tweet['text']))
-        except IndexError:
-            pass
+            print('id: {}'.format(id_str))
+            text = tweet['text']
+            if text.startswith('@null'):
+                text = "[Tweet not available]"
+            print(text + '\n')
 
 
 
@@ -251,7 +272,7 @@ ALL = [twitterclass_demo, sampletoscreen_demo, tracktoscreen_demo,
 
 """
 Select demo functions to run. E.g. replace the following line with "DEMOS =
-ALL[8:]" to execute only the final two demos.
+ALL[8:]" to execute only the final three demos.
 """
 DEMOS = ALL[:]
 
@@ -265,4 +286,3 @@ if __name__ == "__main__":
     print("All demos completed")
     print(SPACER)
 
-
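
Condensing the reworked `limit_by_time_demo` above into a standalone sketch
(valid Twitter credentials are assumed to be readable by `credsfromfile`, and
the search keyword is arbitrary):

    import datetime

    from nltk.twitter.twitterclient import Query, TweetViewer
    from nltk.twitter.util import credsfromfile

    # Yesterday as a (year, month, day, hour, minute, second) tuple, used as
    # the lower date limit so only Tweets from the last day are displayed.
    date = (datetime.datetime.now() - datetime.timedelta(days=1)).timetuple()[:6]

    oauth = credsfromfile()
    client = Query(**oauth)
    client.register(TweetViewer(limit=100, lower_date_limit=date))
    for tweet in client.search_tweets(keywords='nltk'):
        client.handler.handle(tweet)
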
diff --git a/nltk/twitter/twitterclient.py b/nltk/twitter/twitterclient.py
index 7b69ea9..36abdc2 100644
--- a/nltk/twitter/twitterclient.py
+++ b/nltk/twitter/twitterclient.py
@@ -30,7 +30,6 @@ import os
 import requests
 import time
 import gzip
-from nltk.compat import UTC
 
 
 from twython import Twython, TwythonStreamer
@@ -182,12 +181,8 @@ class Query(Twython):
         :param str lang: language
         """
         while True:
-            if isinstance(self.handler, TweetWriter):
-                max_id = self.handler.max_id
-            else:
-                max_id = None
             tweets = self.search_tweets(keywords=keywords, limit=limit, lang=lang,
-                                        max_id=max_id)
+                                        max_id=self.handler.max_id)
             for tweet in tweets:
                 self.handler.handle(tweet)
             if not (self.handler.do_continue() and self.handler.repeat):
@@ -217,7 +212,9 @@ class Query(Twython):
             self.handler = BasicTweetHandler(limit=limit)
 
         count_from_query = 0
-        if not max_id:
+        if max_id:
+            self.handler.max_id = max_id
+        else:
             results = self.search(q=keywords, count=min(100, limit), lang=lang,
                                   result_type='recent')
             count = len(results['statuses'])
@@ -225,7 +222,7 @@ class Query(Twython):
                 print("No Tweets available through REST API for those keywords")
                 return
             count_from_query = count
-            max_id = results['statuses'][count - 1]['id'] - 1
+            self.handler.max_id = results['statuses'][count - 1]['id'] - 1
 
             for result in results['statuses']:
                 yield result
@@ -241,7 +238,7 @@ class Query(Twython):
             try:
                 mcount = min(100, limit-count_from_query)
                 results = self.search(q=keywords, count=mcount, lang=lang,
-                                      max_id=max_id, result_type='recent')
+                                      max_id=self.handler.max_id, result_type='recent')
             except TwythonRateLimitError as e:
                 print("Waiting for 15 minutes -{0}".format(e))
                 time.sleep(15*60) # wait 15 minutes
@@ -261,8 +258,7 @@ class Query(Twython):
             # results['search_metadata']['next_results'], but as part of a
             # query and difficult to fetch. This is doing the equivalent
             # (last tweet id minus one)
-            max_id = results['statuses'][count - 1]['id'] - 1
-            self.handler.max_id = max_id
+            self.handler.max_id = results['statuses'][count - 1]['id'] - 1
 
             for result in results['statuses']:
                 yield result
@@ -293,7 +289,8 @@ class Query(Twython):
         """
         data = self.get_user_timeline(screen_name=screen_name, count=limit,
                                       include_rts=include_rts)
-        self.handler.handle(data)
+        for item in data:
+            self.handler.handle(item)
 
 
 
@@ -322,28 +319,57 @@ class Twitter(object):
         :param bool stream: If `True`, use the live public stream,\
         otherwise search past public Tweets
 
-        :param int limit: Number of Tweets to process
-        :param tuple date_limit: The date at which to stop collecting new\
-        data. This should be entered as a tuple which can serve as the\
-        argument to `datetime.datetime`. E.g. `data_limit=(2015, 4, 1, 12,\
-        40)` for 12:30 pm on April 1 2015.\
-        Note that, in the case of streaming, it is the maximum date, i.e.\
+        :param int limit: The number of data items to process in the current\
+        round of processing.
+
+        :param tuple date_limit: The date at which to stop collecting\
+        new data. This should be entered as a tuple which can serve as the\
+        argument to `datetime.datetime`.\
+        E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:40 pm on April 1 2015.
+        Note that, in the case of streaming, this is the maximum date, i.e.\
         a date in the future; if not, it is the minimum date, i.e. a date\
         in the past
 
         :param str lang: language
 
-        :param bool repeat: flag to determine whether multiple files should be\
-        written. If `True`, the length of each file will be set by the value\
-        of `limit`. Use only if `to_screen` is `False`. See also :py:func:`handle`.
+        :param bool repeat: A flag to determine whether multiple files should\
+        be written. If `True`, the length of each file will be set by the\
+        value of `limit`. Use only if `to_screen` is `False`. See also
+        :py:func:`handle`.
 
-        :param gzip_compress: if `True`, ouput files are compressed with gzip
+        :param gzip_compress: if `True`, output files are compressed with gzip.
         """
+        if stream:
+            upper_date_limit = date_limit
+            lower_date_limit = None
+        else:
+            upper_date_limit = None
+            lower_date_limit = date_limit
+
+        if to_screen:
+            handler = TweetViewer(limit=limit,
+                                  upper_date_limit=upper_date_limit,
+                                  lower_date_limit=lower_date_limit)
+        else:
+            handler = TweetWriter(limit=limit,
+                                  upper_date_limit=upper_date_limit,
+                                  lower_date_limit=lower_date_limit, repeat=repeat,
+                                  gzip_compress=gzip_compress)
+
+
+
         if to_screen:
-            handler = TweetViewer(limit=limit, date_limit=date_limit)
+            handler = TweetViewer(limit=limit)
         else:
-            handler = TweetWriter(limit=limit, date_limit=date_limit,
-                                  stream=stream, repeat=repeat,
+            if stream:
+                upper_date_limit = date_limit
+                lower_date_limit = None
+            else:
+                upper_date_limit = None
+                lower_date_limit = date_limit
+
+            handler = TweetWriter(limit=limit, upper_date_limit=upper_date_limit,
+                                  lower_date_limit=lower_date_limit, repeat=repeat,
                                   gzip_compress=gzip_compress)
 
         if stream:
@@ -376,7 +402,10 @@ class TweetViewer(TweetHandlerI):
         """
         text = data['text']
         print(text)
-        self.counter += 1
+
+        self.check_date_limit(data)
+        if self.do_stop:
+            return
 
     def on_finish(self):
         print('Written {0} Tweets'.format(self.counter))
@@ -386,38 +415,44 @@ class TweetWriter(TweetHandlerI):
     """
     Handle data by writing it to a file.
     """
-    def __init__(self, limit=2000, date_limit=None, stream=True,
+    def __init__(self, limit=2000, upper_date_limit=None, lower_date_limit=None,
                  fprefix='tweets', subdir='twitter-files', repeat=False,
                  gzip_compress=False):
         """
+        Whether the upper or the lower date limit applies depends on whether
+        Tweets arrive in ascending date order (i.e. when streaming) or in
+        descending date order (i.e. when searching past Tweets).
+
         :param int limit: number of data items to process in the current\
-        round of processing
+        round of processing.
 
-        :param bool stream: If `True`, use the live public stream,\
-        otherwise search past public Tweets
+        :param tuple upper_date_limit: The date at which to stop collecting new\
+        data. This should be entered as a tuple which can serve as the\
+        argument to `datetime.datetime`. E.g. `upper_date_limit=(2015, 4, 1, 12,\
+        40)` for 12:40 pm on April 1 2015.
+
+        :param tuple lower_date_limit: The date at which to stop collecting new\
+        data. See `upper_date_limit` for formatting.
 
         :param str fprefix: The prefix to use in creating file names for Tweet\
-        collections
+        collections.
 
         :param str subdir: The name of the directory where Tweet collection\
-        files should be stored
+        files should be stored.
 
         :param bool repeat: flag to determine whether multiple files should be\
         written. If `True`, the length of each file will be set by the value\
         of `limit`. See also :py:func:`handle`.
 
-        :param gzip_compress: if `True`, ouput files are compressed with gzip
+        :param gzip_compress: if `True`, output files are compressed with gzip.
         """
         self.fprefix = fprefix
         self.subdir = guess_path(subdir)
         self.gzip_compress = gzip_compress
         self.fname = self.timestamped_file()
-        self.stream = stream
         self.repeat = repeat
-        # max_id stores the id of the older tweet fetched
-        self.max_id = None
         self.output = None
-        TweetHandlerI.__init__(self, limit, date_limit)
+        TweetHandlerI.__init__(self, limit, upper_date_limit, lower_date_limit)
 
 
     def timestamped_file(self):
@@ -462,19 +497,9 @@ class TweetWriter(TweetHandlerI):
         else:
             self.output.write(json_data + "\n")
 
-        if self.date_limit:
-            tweet_date = datetime.datetime.strptime(data['created_at'], '%a %b %d\
-            %H:%M:%S +0000 %Y').replace(tzinfo=UTC)
-            if (tweet_date > self.date_limit and self.stream == True) or \
-               (tweet_date < self.date_limit and self.stream == False):
-                if self.stream:
-                    message = "earlier"
-                else:
-                    message = "later"
-                print("Date limit {0} is {1} than date of current tweet {2}".\
-                      format(self.date_limit, message, tweet_date))
-                self.do_stop = True
-                return
+        self.check_date_limit(data)
+        if self.do_stop:
+            return
 
         self.startingup = False
 
@@ -503,4 +528,3 @@ class TweetWriter(TweetHandlerI):
         self.fname = self.timestamped_file()
         self.startingup = True
         self.counter = 0
-
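
The net effect of the handler selection in `Twitter.tweets` above is that a
single `date_limit` tuple changes meaning with the `stream` flag. A hedged
usage sketch (credentials and network access assumed; the `keywords` parameter
belongs to the part of the method signature not shown in this hunk):

    from nltk.twitter.twitterclient import Twitter

    tw = Twitter()

    # Streaming: date_limit is an upper bound, i.e. a future cut-off at which
    # collection stops; Tweets are echoed to the terminal via TweetViewer.
    tw.tweets(follow=['759251'], stream=True, limit=10,
              date_limit=(2015, 12, 31, 23, 59))

    # Searching past Tweets: the same tuple acts as a lower bound, and
    # to_screen=False routes output to timestamped files via TweetWriter.
    tw.tweets(keywords='nltk', stream=False, to_screen=False, limit=10,
              date_limit=(2015, 10, 1, 0, 0))
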
diff --git a/nltk/twitter/util.py b/nltk/twitter/util.py
index d26d32c..2aff979 100644
--- a/nltk/twitter/util.py
+++ b/nltk/twitter/util.py
@@ -8,257 +8,15 @@
 # For license information, see LICENSE.TXT
 
 """
-Utility functions to accompany :module:`twitterclient`.
+Authentication utilities to accompany :module:`twitterclient`.
 """
+
 from __future__ import print_function
-import csv
-import json
+
 import os
 import pprint
-import nltk.compat as compat
-import gzip
-
 from twython import Twython
 
-HIER_SEPARATOR = "."
-
-def extract_fields(tweet, fields):
-    """
-    Extract field values from a full tweet and return them as a list
-
-    :param json tweet: The tweet in JSON format
-    :param list fields: The fields to be extracted from the tweet
-    :rtype: list(str)
-    """
-    out = []
-    for field in fields:
-        try:
-            _add_field_to_out(tweet, field, out)
-        except TypeError:
-            raise RuntimeError('Fatal error when extracting fields. Cannot find field ', field)
-    return out
-
-
-def _add_field_to_out(json, field, out):
-    if _is_composed_key(field):
-        key, value = _get_key_value_composed(field)
-        _add_field_to_out(json[key], value, out)
-    else:
-        out += [json[field]]
-
-def _is_composed_key(field):
-    if HIER_SEPARATOR in field:
-        return True
-    return False
-
-def _get_key_value_composed(field):
-    out = field.split(HIER_SEPARATOR)
-    # there could be up to 3 levels
-    key = out[0]
-    value = HIER_SEPARATOR.join(out[1:])
-    return key, value
-
-def _get_entity_recursive(json, entity):
-    if not json:
-        return None
-    elif isinstance(json, dict):
-        for key, value in json.items():
-            if key == entity:
-                return value
-            # 'entities' and 'extended_entities' are wrappers in Twitter json
-            # structure that contain other Twitter objects. See:
-            # https://dev.twitter.com/overview/api/entities-in-twitter-objects
-
-            if key == 'entities' or key == 'extended_entities':
-                candidate = _get_entity_recursive(value, entity)
-                if candidate is not None:
-                    return candidate
-        return None
-    elif isinstance(json, list):
-        for item in json:
-            candidate = _get_entity_recursive(item, entity)
-            if candidate is not None:
-                return candidate
-        return None
-    else:
-        return None
-
-def json2csv(fp, outfile, fields, encoding='utf8', errors='replace',
-             gzip_compress=False):
-    """
-    Extract selected fields from a file of line-separated JSON tweets and
-    write to a file in CSV format.
-
-    This utility function allows a file of full tweets to be easily converted
-    to a CSV file for easier processing. For example, just TweetIDs or
-    just the text content of the Tweets can be extracted.
-
-    Additionally, the function allows combinations of fields of other Twitter
-    objects (mainly the users, see below).
-
-    For Twitter entities (e.g. hashtags of a Tweet), and for geolocation, see
-    `json2csv_entities`
-
-    :param str infile: The name of the file containing full tweets
-
-    :param str outfile: The name of the text file where results should be\
-    written
-
-    :param list fields: The list of fields to be extracted. Useful examples\
-    are 'id_str' for the tweetID and 'text' for the text of the tweet. See\
-    <https://dev.twitter.com/overview/api/tweets> for a full list of fields.\
-    e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']\
-    Additonally, it allows IDs from other Twitter objects, e. g.,\
-    ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count']
-
-    :param error: Behaviour for encoding errors, see\
-    https://docs.python.org/3/library/codecs.html#codec-base-classes
-
-    :param gzip_compress: if `True`, output files are compressed with gzip
-    """
-    (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
-    # write the list of fields as header
-    writer.writerow(fields)
-    # process the file
-    for line in fp:
-        tweet = json.loads(line)
-        row = extract_fields(tweet, fields)
-        writer.writerow(row)
-    outf.close()
-
-def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
-    """
-    Identify appropriate CSV writer given the Python version
-    """
-    if compat.PY3:
-        if gzip_compress:
-            outf = gzip.open(outfile, 'wt', encoding=encoding, errors=errors)
-        else:
-            outf = open(outfile, 'w', encoding=encoding, errors=errors)
-        writer = csv.writer(outf)
-    else:
-        if gzip_compress:
-            outf = gzip.open(outfile, 'wb')
-        else:
-            outf = open(outfile, 'wb')
-        writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
-    return (writer, outf)
-
-
-
-
-def json2csv_entities(tweets_file, outfile, main_fields, entity_type, entity_fields,
-                      encoding='utf8', errors='replace', gzip_compress=False):
-    """
-    Extract selected fields from a file of line-separated JSON tweets and
-    write to a file in CSV format.
-
-    This utility function allows a file of full Tweets to be easily converted
-    to a CSV file for easier processing of Twitter entities. For example, the
-    hashtags or media elements of a tweet can be extracted.
-
-    It returns one line per entity of a Tweet, e.g. if a tweet has two hashtags
-    there will be two lines in the output file, one per hashtag
-
-    :param tweets_file: the file-like object containing full Tweets
-
-    :param str outfile: The path of the text file where results should be\
-    written
-
-    :param list main_fields: The list of fields to be extracted from the main\
-    object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\
-    <https://dev.twitter.com/overview/api/tweets> for a full list of fields.
-    e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']
-    If `entity_type` is expressed with hierarchy, then it is the list of\
-    fields of the object that corresponds to the key of the entity_type,\
-    (e.g., for entity_type='user.urls', the fields in the main_fields list\
-    belong to the user object; for entity_type='place.bounding_box', the\
-    files in the main_field list belong to the place object of the tweet).
-
-    :param list entity_type: The name of the entity: 'hashtags', 'media',\
-    'urls' and 'user_mentions' for the tweet object. For a user object,\
-    this needs to be expressed with a hierarchy: `'user.urls'`. For the\
-    bounding box of the Tweet location, use `'place.bounding_box'`.
-
-    :param list entity_fields: The list of fields to be extracted from the\
-    entity. E.g. `['text']` (of the Tweet)
-
-    :param error: Behaviour for encoding errors, see\
-    https://docs.python.org/3/library/codecs.html#codec-base-classes
-
-    :param gzip_compress: if `True`, ouput files are compressed with gzip
-    """
-
-    (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
-    header = get_header_field_list(main_fields, entity_type, entity_fields)
-    writer.writerow(header)
-    for line in tweets_file:
-        tweet = json.loads(line)
-        if _is_composed_key(entity_type):
-            key, value = _get_key_value_composed(entity_type)
-            object_json = _get_entity_recursive(tweet, key)
-            if not object_json:
-                # this can happen in the case of "place"
-                continue
-            object_fields = extract_fields(object_json, main_fields)
-            items = _get_entity_recursive(object_json, value)
-            _write_to_file(object_fields, items, entity_fields, writer)
-        else:
-            tweet_fields = extract_fields(tweet, main_fields)
-            items = _get_entity_recursive(tweet, entity_type)
-            _write_to_file(tweet_fields, items, entity_fields, writer)
-    outf.close()
-
-def get_header_field_list(main_fields, entity_type, entity_fields):
-    if _is_composed_key(entity_type):
-        key, value = _get_key_value_composed(entity_type)
-        main_entity = key
-        sub_entity = value
-    else:
-        main_entity = None
-        sub_entity = entity_type
-
-    if main_entity:
-        output1 = [HIER_SEPARATOR.join([main_entity, x]) for x in main_fields]
-    else:
-        output1 = main_fields
-    output2 = [HIER_SEPARATOR.join([sub_entity, x]) for x in entity_fields]
-    return output1 + output2
-
-def _write_to_file(object_fields, items, entity_fields, writer):
-    if not items:
-        # it could be that the entity is just not present for the tweet
-        # e.g. tweet hashtag is always present, even as [], however
-        # tweet media may not be present
-        return
-    if isinstance(items, dict):
-        # this happens e.g. for "place" of a tweet
-        row = object_fields
-        # there might be composed keys in de list of required fields
-        entity_field_values = [x for x in entity_fields if not _is_composed_key(x)]
-        entity_field_composed = [x for x in entity_fields if _is_composed_key(x)]
-        for field in entity_field_values:
-            value = items[field]
-            if isinstance(value, list):
-                row += value
-            else:
-                row += [value]
-        # now check required dictionaries
-        for d in entity_field_composed:
-            kd, vd = _get_key_value_composed(d)
-            json_dict = items[kd]
-            if not isinstance(json_dict, dict):
-                raise RuntimeError("""Key {0} does not contain a dictionary
-                in the json file""".format(kd))
-            row += [json_dict[vd]]
-        writer.writerow(row)
-        return
-    # in general it is a list
-    for item in items:
-        row = object_fields + extract_fields(item, entity_fields)
-        writer.writerow(row)
-
-
 def credsfromfile(creds_file=None, subdir=None, verbose=False):
     """
     Convenience function for authentication
@@ -386,4 +144,3 @@ def guess_path(pth):
         return pth
     else:
         return os.path.expanduser(os.path.join("~", pth))
-
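
For context, the bulk of what this hunk removes is the CSV-export machinery
(`json2csv` and friends). Stripped of the gzip and dotted-field handling, the
core behaviour amounts to the following Python 3 sketch (file names and the
field list are hypothetical):

    import csv
    import json

    def tweets_to_csv(infile_path, outfile_path, fields):
        """Write one CSV row per line-delimited JSON Tweet, header row first."""
        with open(infile_path, encoding='utf8') as fp, \
             open(outfile_path, 'w', encoding='utf8', newline='') as outf:
            writer = csv.writer(outf)
            writer.writerow(fields)  # header: the selected field names
            for line in fp:
                tweet = json.loads(line)
                writer.writerow([tweet[field] for field in fields])

    # e.g. tweets_to_csv('tweets.jsonl', 'tweets.csv', ['id_str', 'text'])
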
diff --git a/nltk/util.py b/nltk/util.py
index eec59cd..eec7c3d 100644
--- a/nltk/util.py
+++ b/nltk/util.py
@@ -14,7 +14,7 @@ import pydoc
 import bisect
 import os
 
-from itertools import islice, chain
+from itertools import islice, chain, combinations
 from pprint import pprint
 from collections import defaultdict, deque
 from sys import version_info
@@ -461,6 +461,58 @@ def trigrams(sequence, **kwargs):
     for item in ngrams(sequence, 3, **kwargs):
         yield item
 
+def everygrams(sequence, min_len=1, max_len=-1):
+    """
+    Returns all possible ngrams generated from a sequence of items, as an iterator.
+    
+        >>> sent = 'a b c'.split()
+        >>> list(everygrams(sent))
+        [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c'), ('a', 'b', 'c')]
+        >>> list(everygrams(sent, max_len=2))
+        [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c')]
+        
+    :param sequence: the source data to be converted into ngrams
+    :type sequence: sequence or iter
+    :param min_len: minimum length of the ngrams, a.k.a. the n-gram order/degree
+    :type  min_len: int
+    :param max_len: maximum length of the ngrams (set to length of sequence by default)
+    :type  max_len: int
+    :rtype: iter(tuple)
+    """
+    if max_len == -1:
+        max_len = len(sequence)
+    for n in range(min_len, max_len+1):
+        for ng in ngrams(sequence, n):
+            yield ng
+
+def skipgrams(sequence, n, k):
+    """
+    Returns all possible skipgrams generated from a sequence of items, as an iterator.
+    Skipgrams are ngrams that allow tokens to be skipped.
+    Refer to http://homepages.inf.ed.ac.uk/ballison/pdf/lrec_skipgrams.pdf
+    
+        >>> sent = "Insurgents killed in ongoing fighting".split()
+        >>> list(skipgrams(sent, 2, 2))
+        [('Insurgents', 'killed'), ('Insurgents', 'in'), ('Insurgents', 'ongoing'), ('killed', 'in'), ('killed', 'ongoing'), ('killed', 'fighting'), ('in', 'ongoing'), ('in', 'fighting'), ('ongoing', 'fighting')]
+        >>> list(skipgrams(sent, 3, 2))
+        [('Insurgents', 'killed', 'in'), ('Insurgents', 'killed', 'ongoing'), ('Insurgents', 'killed', 'fighting'), ('Insurgents', 'in', 'ongoing'), ('Insurgents', 'in', 'fighting'), ('Insurgents', 'ongoing', 'fighting'), ('killed', 'in', 'ongoing'), ('killed', 'in', 'fighting'), ('killed', 'ongoing', 'fighting'), ('in', 'ongoing', 'fighting')]
+    
+    :param sequence: the source data to be converted into skipgrams
+    :type sequence: sequence or iter
+    :param n: the degree of the ngrams
+    :type n: int
+    :param k: the skip distance
+    :type  k: int
+    :rtype: iter(tuple)
+    """
+    for ngram in ngrams(sequence, n + k, pad_right=True):
+        head = ngram[:1]
+        tail = ngram[1:]
+        for skip_tail in combinations(tail, n - 1):
+            if skip_tail[-1] is None:
+                continue
+            yield head + skip_tail
+
 ##########################################################################
 # Ordered Dictionary
 ##########################################################################
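
As a quick illustration of how the two new helpers relate to the existing
`ngrams` function (an informal check, not one of the doctests above):

    from nltk.util import everygrams, ngrams, skipgrams

    sent = 'a b c d'.split()

    # With k=0 skips allowed, skipgrams collapses to plain ngrams.
    assert list(skipgrams(sent, 2, 0)) == list(ngrams(sent, 2))

    # Pinning min_len == max_len likewise reduces everygrams to a single order.
    assert list(everygrams(sent, min_len=2, max_len=2)) == list(ngrams(sent, 2))
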
diff --git a/setup.cfg b/setup.cfg
index ebbec92..00bb0ae 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [egg_info]
-tag_build = 
-tag_svn_revision = 0
 tag_date = 0
+tag_svn_revision = 0
+tag_build = 
 
diff --git a/setup.py b/setup.py
index d465f39..9c59e43 100644
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,7 @@ setup(
     url = "http://nltk.org/",
     long_description = """\
 The Natural Language Toolkit (NLTK) is a Python package for
-natural language processing.  NLTK requires Python 2.6, 2.7, or 3.2+.""",
+natural language processing.  NLTK requires Python 2.7 or 3.2+.""",
     license = "Apache License, Version 2.0",
     keywords = ['NLP', 'CL', 'natural language processing',
                 'computational linguistics', 'parsing', 'tagging',
@@ -54,7 +54,6 @@ natural language processing.  NLTK requires Python 2.6, 2.7, or 3.2+.""",
     'Intended Audience :: Science/Research',
     'License :: OSI Approved :: Apache Software License',
     'Operating System :: OS Independent',
-    'Programming Language :: Python :: 2.6',
     'Programming Language :: Python :: 2.7',
     'Programming Language :: Python :: 3.2',
     'Programming Language :: Python :: 3.3',
@@ -70,7 +69,7 @@ natural language processing.  NLTK requires Python 2.6, 2.7, or 3.2+.""",
     'Topic :: Text Processing :: Linguistic',
     ],
     package_data = {'nltk': ['test/*.doctest', 'VERSION']},
-    install_requires = ['six>=1.9.0'],
+#    install_requires = ['six>=1.9.0'],
     packages = find_packages(),
     zip_safe=False, # since normal files will be present too?
     )

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/nltk.git



More information about the debian-science-commits mailing list