[nltk] 01/05: New upstream version 3.2.4
Gianfranco Costamagna
locutusofborg at moszumanska.debian.org
Mon Jun 19 12:39:47 UTC 2017
This is an automated email from the git hooks/post-receive script.
locutusofborg pushed a commit to branch master
in repository nltk.
commit b743e0af932ae7186305eff86034f47f5d71f60c
Author: Gianfranco Costamagna <costamagnagianfranco at yahoo.it>
Date: Mon Jun 19 14:32:40 2017 +0200
New upstream version 3.2.4
---
LICENSE.txt | 2 +-
PKG-INFO | 7 +-
nltk.egg-info/PKG-INFO | 7 +-
nltk.egg-info/SOURCES.txt | 13 +-
nltk.egg-info/requires.txt | 31 +
nltk/VERSION | 2 +-
nltk/__init__.py | 6 +-
nltk/app/__init__.py | 5 +-
nltk/app/chartparser_app.py | 184 ++--
nltk/app/chunkparser_app.py | 24 +-
nltk/app/collocations_app.py | 19 +-
nltk/app/concordance_app.py | 20 +-
nltk/app/nemo_app.py | 35 +-
nltk/app/rdparser_app.py | 22 +-
nltk/app/srparser_app.py | 32 +-
nltk/app/wordfreq_app.py | 2 +-
nltk/app/wordnet_app.py | 14 +-
nltk/book.py | 2 +-
nltk/ccg/__init__.py | 2 +-
nltk/ccg/api.py | 114 ++-
nltk/ccg/chart.py | 12 +-
nltk/ccg/combinator.py | 53 +-
nltk/ccg/lexicon.py | 2 +-
nltk/ccg/logic.py | 2 +-
nltk/chat/__init__.py | 2 +-
nltk/chat/eliza.py | 2 +-
nltk/chat/iesha.py | 2 +-
nltk/chat/rude.py | 2 +-
nltk/chat/suntsu.py | 2 +-
nltk/chat/util.py | 22 +-
nltk/chat/zen.py | 2 +-
nltk/chunk/__init__.py | 2 +-
nltk/chunk/api.py | 2 +-
nltk/chunk/named_entity.py | 24 +-
nltk/chunk/regexp.py | 12 +-
nltk/chunk/util.py | 24 +-
nltk/classify/__init__.py | 2 +-
nltk/classify/api.py | 2 +-
nltk/classify/decisiontree.py | 28 +-
nltk/classify/maxent.py | 13 +-
nltk/classify/megam.py | 9 +-
nltk/classify/naivebayes.py | 4 +-
nltk/classify/rte_classify.py | 8 +-
nltk/classify/scikitlearn.py | 4 +-
nltk/classify/senna.py | 46 +-
nltk/classify/svm.py | 2 +-
nltk/classify/tadm.py | 7 +-
nltk/classify/textcat.py | 2 +-
nltk/classify/util.py | 2 +-
nltk/classify/weka.py | 13 +-
nltk/cluster/__init__.py | 2 +-
nltk/cluster/api.py | 13 +-
nltk/cluster/em.py | 2 +-
nltk/cluster/gaac.py | 6 +-
nltk/cluster/kmeans.py | 2 +-
nltk/cluster/util.py | 55 +-
nltk/collections.py | 688 ++++++++++++++
nltk/collocations.py | 4 +-
nltk/compat.py | 405 +-------
nltk/corpus/__init__.py | 53 +-
nltk/corpus/europarl_raw.py | 2 +-
nltk/corpus/reader/__init__.py | 6 +-
nltk/corpus/reader/aligned.py | 7 +-
nltk/corpus/reader/api.py | 22 +-
nltk/corpus/reader/bnc.py | 2 +-
nltk/corpus/reader/bracket_parse.py | 2 +-
nltk/corpus/reader/categorized_sents.py | 7 +-
nltk/corpus/reader/chasen.py | 9 +-
nltk/corpus/reader/childes.py | 17 +-
nltk/corpus/reader/chunked.py | 8 +-
nltk/corpus/reader/cmudict.py | 6 +-
nltk/corpus/reader/comparative_sents.py | 6 +-
nltk/corpus/reader/conll.py | 16 +-
nltk/corpus/reader/crubadan.py | 4 +-
nltk/corpus/reader/dependency.py | 2 +-
nltk/corpus/reader/framenet.py | 1187 ++++++++++++++++++++---
nltk/corpus/reader/ieer.py | 7 +-
nltk/corpus/reader/indian.py | 9 +-
nltk/corpus/reader/ipipan.py | 11 +-
nltk/corpus/reader/knbc.py | 4 +-
nltk/corpus/reader/lin.py | 2 +-
nltk/corpus/reader/mte.py | 9 +-
nltk/corpus/reader/nkjp.py | 7 +-
nltk/corpus/reader/nombank.py | 12 +-
nltk/corpus/reader/nps_chat.py | 2 +-
nltk/corpus/reader/opinion_lexicon.py | 6 +-
nltk/corpus/reader/panlex_lite.py | 2 +-
nltk/corpus/reader/pl196x.py | 520 ++++++-----
nltk/corpus/reader/plaintext.py | 7 +-
nltk/corpus/reader/ppattach.py | 7 +-
nltk/corpus/reader/propbank.py | 14 +-
nltk/corpus/reader/pros_cons.py | 8 +-
nltk/corpus/reader/reviews.py | 4 +-
nltk/corpus/reader/rte.py | 12 +-
nltk/corpus/reader/semcor.py | 2 +-
nltk/corpus/reader/senseval.py | 6 +-
nltk/corpus/reader/sentiwordnet.py | 2 +-
nltk/corpus/reader/sinica_treebank.py | 2 +-
nltk/corpus/reader/string_category.py | 7 +-
nltk/corpus/reader/switchboard.py | 2 +-
nltk/corpus/reader/tagged.py | 7 +-
nltk/corpus/reader/timit.py | 18 +-
nltk/corpus/reader/toolbox.py | 8 +-
nltk/corpus/reader/twitter.py | 10 +-
nltk/corpus/reader/util.py | 10 +-
nltk/corpus/reader/verbnet.py | 27 +-
nltk/corpus/reader/wordlist.py | 105 ++-
nltk/corpus/reader/wordnet.py | 483 ++++++----
nltk/corpus/reader/xmldocs.py | 15 +-
nltk/corpus/reader/ycoe.py | 49 +-
nltk/corpus/util.py | 27 +-
nltk/data.py | 52 +-
nltk/decorators.py | 6 +-
nltk/downloader.py | 111 +--
nltk/draw/__init__.py | 8 +-
nltk/draw/cfg.py | 13 +-
nltk/draw/dispersion.py | 2 +-
nltk/draw/table.py | 8 +-
nltk/draw/tree.py | 7 +-
nltk/draw/util.py | 34 +-
nltk/featstruct.py | 8 +-
nltk/grammar.py | 26 +-
nltk/help.py | 2 +-
nltk/inference/__init__.py | 2 +-
nltk/inference/api.py | 46 +-
nltk/inference/discourse.py | 36 +-
nltk/inference/nonmonotonic.py | 2 +-
nltk/inference/prover9.py | 2 +-
nltk/inference/resolution.py | 2 +-
nltk/inference/tableau.py | 2 +-
nltk/internals.py | 88 +-
nltk/jsontags.py | 2 +-
nltk/metrics/__init__.py | 3 +-
nltk/metrics/agreement.py | 38 +-
nltk/metrics/aline.py | 607 ++++++++++++
nltk/metrics/association.py | 12 +-
nltk/metrics/confusionmatrix.py | 2 +-
nltk/metrics/distance.py | 19 +-
nltk/metrics/paice.py | 2 +-
nltk/metrics/scores.py | 14 +-
nltk/metrics/segmentation.py | 9 +-
nltk/metrics/spearman.py | 2 +-
nltk/misc/__init__.py | 2 +-
nltk/misc/chomsky.py | 6 +-
nltk/misc/minimalset.py | 2 +-
nltk/misc/sort.py | 2 +-
nltk/misc/wordfinder.py | 2 +-
nltk/parse/__init__.py | 3 +-
nltk/parse/api.py | 2 +-
nltk/parse/bllip.py | 2 +-
nltk/parse/chart.py | 15 +-
nltk/parse/corenlp.py | 673 +++++++++++++
nltk/parse/dependencygraph.py | 6 +-
nltk/parse/earleychart.py | 7 +-
nltk/parse/evaluate.py | 2 +-
nltk/parse/featurechart.py | 10 +-
nltk/parse/generate.py | 19 +-
nltk/parse/malt.py | 4 +-
nltk/parse/nonprojectivedependencyparser.py | 6 +-
nltk/parse/pchart.py | 2 +-
nltk/parse/projectivedependencyparser.py | 11 +-
nltk/parse/recursivedescent.py | 2 +-
nltk/parse/shiftreduce.py | 2 +-
nltk/parse/stanford.py | 44 +-
nltk/parse/transitionparser.py | 20 +-
nltk/parse/util.py | 2 +-
nltk/parse/viterbi.py | 2 +-
nltk/probability.py | 86 +-
nltk/sem/__init__.py | 2 +-
nltk/sem/boxer.py | 30 +-
nltk/sem/chat80.py | 12 +-
nltk/sem/cooper_storage.py | 2 +-
nltk/sem/drt.py | 17 +-
nltk/sem/drt_glue_demo.py | 14 +-
nltk/sem/evaluate.py | 12 +-
nltk/sem/glue.py | 14 +-
nltk/sem/hole.py | 6 +-
nltk/sem/lfg.py | 6 +-
nltk/sem/linearlogic.py | 6 +-
nltk/sem/logic.py | 9 +-
nltk/sem/relextract.py | 29 +-
nltk/sem/skolemize.py | 2 +-
nltk/sem/util.py | 2 +-
nltk/sentiment/__init__.py | 2 +-
nltk/sentiment/sentiment_analyzer.py | 2 +-
nltk/sentiment/util.py | 7 +-
nltk/sentiment/vader.py | 59 +-
nltk/stem/__init__.py | 4 +-
nltk/stem/api.py | 11 +-
nltk/stem/isri.py | 2 +-
nltk/stem/lancaster.py | 51 +-
nltk/stem/porter.py | 1192 ++++++++++++------------
nltk/stem/regexp.py | 4 +-
nltk/stem/rslp.py | 2 +-
nltk/stem/snowball.py | 19 +-
nltk/stem/util.py | 2 +-
nltk/stem/wordnet.py | 2 +-
nltk/tag/__init__.py | 30 +-
nltk/tag/api.py | 21 +-
nltk/tag/brill.py | 10 +-
nltk/tag/crf.py | 2 +-
nltk/tag/hmm.py | 20 +-
nltk/tag/hunpos.py | 8 +-
nltk/tag/mapping.py | 2 +-
nltk/tag/perceptron.py | 31 +-
nltk/tag/senna.py | 48 +-
nltk/tag/sequential.py | 34 +-
nltk/tag/stanford.py | 90 +-
nltk/tag/tnt.py | 31 +-
nltk/tag/util.py | 2 +-
nltk/tbl/__init__.py | 2 +-
nltk/tbl/demo.py | 2 +-
nltk/tbl/erroranalysis.py | 2 +-
nltk/tbl/feature.py | 12 +-
nltk/tbl/rule.py | 12 +-
nltk/tbl/template.py | 41 +-
nltk/test/__init__.py | 2 +-
nltk/test/bnc.doctest | 2 +-
nltk/test/ccg.doctest | 2 +-
nltk/test/ccg_semantics.doctest | 2 +-
nltk/test/chat80.doctest | 2 +-
nltk/test/chunk.doctest | 2 +-
nltk/test/classify.doctest | 2 +-
nltk/test/collocations.doctest | 2 +-
nltk/test/compat.doctest | 4 -
nltk/test/corpus.doctest | 46 +-
nltk/test/crubadan.doctest | 2 +-
nltk/test/data.doctest | 2 +-
nltk/test/dependency.doctest | 2 +-
nltk/test/discourse.doctest | 2 +-
nltk/test/drt.doctest | 2 +-
nltk/test/featgram.doctest | 2 +-
nltk/test/featstruct.doctest | 5 +-
nltk/test/framenet.doctest | 105 ++-
nltk/test/generate.doctest | 2 +-
nltk/test/gensim.doctest | 13 +-
nltk/test/gluesemantics.doctest | 28 +-
nltk/test/gluesemantics_malt.doctest | 2 +-
nltk/test/grammar.doctest | 2 +-
nltk/test/grammartestsuites.doctest | 2 +-
nltk/test/index.doctest | 2 +-
nltk/test/inference.doctest | 2 +-
nltk/test/internals.doctest | 2 +-
nltk/test/japanese.doctest | 2 +-
nltk/test/logic.doctest | 2 +-
nltk/test/metrics.doctest | 6 +-
nltk/test/misc.doctest | 2 +-
nltk/test/nonmonotonic.doctest | 2 +-
nltk/test/parse.doctest | 2 +-
nltk/test/portuguese_en.doctest | 2 +-
nltk/test/probability.doctest | 2 +-
nltk/test/propbank.doctest | 2 +-
nltk/test/relextract.doctest | 2 +-
nltk/test/resolution.doctest | 2 +-
nltk/test/runtests.py | 18 +-
nltk/test/semantics.doctest | 2 +-
nltk/test/sentiment.doctest | 6 +-
nltk/test/sentiwordnet.doctest | 2 +-
nltk/test/simple.doctest | 2 +-
nltk/test/stem.doctest | 2 +-
nltk/test/tag.doctest | 2 +-
nltk/test/tokenize.doctest | 66 +-
nltk/test/toolbox.doctest | 2 +-
nltk/test/translate.doctest | 2 +-
nltk/test/tree.doctest | 2 +-
nltk/test/treeprettyprinter.doctest | 2 +-
nltk/test/treetransforms.doctest | 2 +-
nltk/test/unit/test_aline.py | 52 ++
nltk/test/unit/test_chunk.py | 49 +
nltk/test/unit/test_corpora.py | 19 +-
nltk/test/unit/test_json2csv_corpus.py | 7 +-
nltk/test/unit/test_senna.py | 74 ++
nltk/test/unit/test_stem.py | 71 ++
nltk/test/unit/test_tgrep.py | 6 +-
nltk/test/unit/test_tokenize.py | 88 +-
nltk/test/unit/translate/test_bleu.py | 178 +++-
nltk/test/unit/translate/test_stack_decoder.py | 2 +-
nltk/test/util.doctest | 2 +-
nltk/test/wordnet.doctest | 15 +-
nltk/test/wordnet_lch.doctest | 2 +-
nltk/test/wsd.doctest | 2 +-
nltk/text.py | 15 +-
nltk/tgrep.py | 15 +-
nltk/tokenize/__init__.py | 52 +-
nltk/tokenize/api.py | 7 +-
nltk/tokenize/casual.py | 13 +-
nltk/tokenize/moses.py | 613 ++++++++++++
nltk/tokenize/mwe.py | 2 +-
nltk/tokenize/punkt.py | 36 +-
nltk/tokenize/regexp.py | 2 +-
nltk/tokenize/repp.py | 151 +++
nltk/tokenize/sexpr.py | 2 +-
nltk/tokenize/simple.py | 2 +-
nltk/tokenize/stanford.py | 19 +-
nltk/tokenize/stanford_segmenter.py | 157 +++-
nltk/tokenize/texttiling.py | 2 +-
nltk/tokenize/toktok.py | 155 +++
nltk/tokenize/treebank.py | 232 ++++-
nltk/tokenize/util.py | 147 ++-
nltk/toolbox.py | 6 +-
nltk/translate/__init__.py | 2 +-
nltk/translate/api.py | 50 +-
nltk/translate/bleu_score.py | 314 ++++---
nltk/translate/chrf_score.py | 137 +++
nltk/translate/gale_church.py | 51 +-
nltk/translate/gdfa.py | 2 +-
nltk/translate/gleu_score.py | 193 ++++
nltk/translate/ibm4.py | 2 +-
nltk/translate/ibm5.py | 2 +-
nltk/translate/ibm_model.py | 2 +-
nltk/translate/metrics.py | 2 +-
nltk/translate/phrase_based.py | 2 +-
nltk/translate/ribes_score.py | 2 +-
nltk/translate/stack_decoder.py | 6 +-
nltk/tree.py | 18 +-
nltk/treeprettyprinter.py | 12 +-
nltk/twitter/__init__.py | 2 +-
nltk/twitter/api.py | 11 +-
nltk/twitter/common.py | 2 +-
nltk/twitter/twitter_demo.py | 2 +-
nltk/twitter/twitterclient.py | 2 +-
nltk/twitter/util.py | 2 +-
nltk/util.py | 695 +-------------
nltk/wsd.py | 2 +-
setup.cfg | 3 +-
setup.py | 36 +-
326 files changed, 8964 insertions(+), 3995 deletions(-)
diff --git a/LICENSE.txt b/LICENSE.txt
index 7a5a3aa..c98b6e4 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,4 @@
-Copyright (C) 2001-2016 NLTK Project
+Copyright (C) 2001-2017 NLTK Project
Licensed under the Apache License, Version 2.0 (the 'License');
you may not use this file except in compliance with the License.
diff --git a/PKG-INFO b/PKG-INFO
index a471bee..8f2b835 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,13 +1,13 @@
Metadata-Version: 1.1
Name: nltk
-Version: 3.2.1
+Version: 3.2.4
Summary: Natural Language Toolkit
Home-page: http://nltk.org/
Author: Steven Bird
Author-email: stevenbird1 at gmail.com
License: Apache License, Version 2.0
Description: The Natural Language Toolkit (NLTK) is a Python package for
- natural language processing. NLTK requires Python 2.7, or 3.2+.
+ natural language processing. NLTK requires Python 2.7, 3.4, or 3.5.
Keywords: NLP,CL,natural language processing,computational linguistics,parsing,tagging,tokenizing,syntax,linguistics,language,natural language,text analytics
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
@@ -18,9 +18,8 @@ Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 2.7
-Classifier: Programming Language :: Python :: 3.2
-Classifier: Programming Language :: Python :: 3.3
Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: 3.5
Classifier: Topic :: Scientific/Engineering
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Scientific/Engineering :: Human Machine Interfaces
diff --git a/nltk.egg-info/PKG-INFO b/nltk.egg-info/PKG-INFO
index a471bee..8f2b835 100644
--- a/nltk.egg-info/PKG-INFO
+++ b/nltk.egg-info/PKG-INFO
@@ -1,13 +1,13 @@
Metadata-Version: 1.1
Name: nltk
-Version: 3.2.1
+Version: 3.2.4
Summary: Natural Language Toolkit
Home-page: http://nltk.org/
Author: Steven Bird
Author-email: stevenbird1 at gmail.com
License: Apache License, Version 2.0
Description: The Natural Language Toolkit (NLTK) is a Python package for
- natural language processing. NLTK requires Python 2.7, or 3.2+.
+ natural language processing. NLTK requires Python 2.7, 3.4, or 3.5.
Keywords: NLP,CL,natural language processing,computational linguistics,parsing,tagging,tokenizing,syntax,linguistics,language,natural language,text analytics
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
@@ -18,9 +18,8 @@ Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 2.7
-Classifier: Programming Language :: Python :: 3.2
-Classifier: Programming Language :: Python :: 3.3
Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: 3.5
Classifier: Topic :: Scientific/Engineering
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Scientific/Engineering :: Human Machine Interfaces
diff --git a/nltk.egg-info/SOURCES.txt b/nltk.egg-info/SOURCES.txt
index 023c765..0ed3d0b 100644
--- a/nltk.egg-info/SOURCES.txt
+++ b/nltk.egg-info/SOURCES.txt
@@ -6,6 +6,7 @@ setup.py
nltk/VERSION
nltk/__init__.py
nltk/book.py
+nltk/collections.py
nltk/collocations.py
nltk/compat.py
nltk/data.py
@@ -18,7 +19,6 @@ nltk/internals.py
nltk/jsontags.py
nltk/lazyimport.py
nltk/probability.py
-nltk/six.py
nltk/text.py
nltk/tgrep.py
nltk/toolbox.py
@@ -31,6 +31,7 @@ nltk.egg-info/PKG-INFO
nltk.egg-info/SOURCES.txt
nltk.egg-info/dependency_links.txt
nltk.egg-info/not-zip-safe
+nltk.egg-info/requires.txt
nltk.egg-info/top_level.txt
nltk/app/__init__.py
nltk/app/chartparser_app.py
@@ -150,6 +151,7 @@ nltk/inference/resolution.py
nltk/inference/tableau.py
nltk/metrics/__init__.py
nltk/metrics/agreement.py
+nltk/metrics/aline.py
nltk/metrics/association.py
nltk/metrics/confusionmatrix.py
nltk/metrics/distance.py
@@ -167,6 +169,7 @@ nltk/parse/__init__.py
nltk/parse/api.py
nltk/parse/bllip.py
nltk/parse/chart.py
+nltk/parse/corenlp.py
nltk/parse/dependencygraph.py
nltk/parse/earleychart.py
nltk/parse/evaluate.py
@@ -309,6 +312,8 @@ nltk/test/wordnet_lch.doctest
nltk/test/wsd.doctest
nltk/test/unit/__init__.py
nltk/test/unit/test_2x_compat.py
+nltk/test/unit/test_aline.py
+nltk/test/unit/test_chunk.py
nltk/test/unit/test_classify.py
nltk/test/unit/test_collocations.py
nltk/test/unit/test_corpora.py
@@ -317,6 +322,7 @@ nltk/test/unit/test_hmm.py
nltk/test/unit/test_json2csv_corpus.py
nltk/test/unit/test_naivebayes.py
nltk/test/unit/test_seekable_unicode_stream_reader.py
+nltk/test/unit/test_senna.py
nltk/test/unit/test_stem.py
nltk/test/unit/test_tag.py
nltk/test/unit/test_tgrep.py
@@ -335,21 +341,26 @@ nltk/test/unit/translate/test_stack_decoder.py
nltk/tokenize/__init__.py
nltk/tokenize/api.py
nltk/tokenize/casual.py
+nltk/tokenize/moses.py
nltk/tokenize/mwe.py
nltk/tokenize/punkt.py
nltk/tokenize/regexp.py
+nltk/tokenize/repp.py
nltk/tokenize/sexpr.py
nltk/tokenize/simple.py
nltk/tokenize/stanford.py
nltk/tokenize/stanford_segmenter.py
nltk/tokenize/texttiling.py
+nltk/tokenize/toktok.py
nltk/tokenize/treebank.py
nltk/tokenize/util.py
nltk/translate/__init__.py
nltk/translate/api.py
nltk/translate/bleu_score.py
+nltk/translate/chrf_score.py
nltk/translate/gale_church.py
nltk/translate/gdfa.py
+nltk/translate/gleu_score.py
nltk/translate/ibm1.py
nltk/translate/ibm2.py
nltk/translate/ibm3.py
diff --git a/nltk.egg-info/requires.txt b/nltk.egg-info/requires.txt
new file mode 100644
index 0000000..12c2271
--- /dev/null
+++ b/nltk.egg-info/requires.txt
@@ -0,0 +1,31 @@
+six
+
+[all]
+requests
+twython
+gensim
+scipy
+pyparsing
+matplotlib
+python-crfsuite
+numpy
+scikit-learn
+
+[corenlp]
+requests
+
+[machine_learning]
+gensim
+numpy
+python-crfsuite
+scikit-learn
+scipy
+
+[plot]
+matplotlib
+
+[tgrep]
+pyparsing
+
+[twitter]
+twython
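
[Note on the new requires.txt: the section headers above are setuptools "extras". Presumably this file is generated from an extras_require mapping in setup.py (setup.py is also touched in this commit); the sketch below is only an illustration consistent with the sections shown, not the actual upstream setup.py.]

    # Hypothetical sketch of an extras_require layout matching requires.txt above.
    from setuptools import setup, find_packages

    extras_require = {
        'machine_learning': ['gensim', 'numpy', 'python-crfsuite',
                             'scikit-learn', 'scipy'],
        'plot': ['matplotlib'],
        'tgrep': ['pyparsing'],
        'twitter': ['twython'],
        'corenlp': ['requests'],
    }
    # The [all] section is the union of the optional groups.
    extras_require['all'] = sorted({dep for deps in extras_require.values()
                                    for dep in deps})

    setup(
        name='nltk',
        version='3.2.4',
        packages=find_packages(),
        install_requires=['six'],   # the single hard dependency listed above
        extras_require=extras_require,
    )

With a layout like this, "pip install nltk[machine_learning]" pulls in that group's optional dependencies, while a bare "pip install nltk" only requires six.
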
diff --git a/nltk/VERSION b/nltk/VERSION
index e4604e3..351227f 100644
--- a/nltk/VERSION
+++ b/nltk/VERSION
@@ -1 +1 @@
-3.2.1
+3.2.4
diff --git a/nltk/__init__.py b/nltk/__init__.py
index cfa1f32..c47f6be 100644
--- a/nltk/__init__.py
+++ b/nltk/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit (NLTK)
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
@@ -41,7 +41,7 @@ if __doc__ is not None: # fix for the ``python -OO``
# Copyright notice
__copyright__ = """\
-Copyright (C) 2001-2016 NLTK Project.
+Copyright (C) 2001-2017 NLTK Project.
Distributed and Licensed under the Apache License, Version 2.0,
which is included by reference.
@@ -159,7 +159,7 @@ else:
from nltk.downloader import download, download_shell
try:
- import tkinter
+ from six.moves import tkinter
except ImportError:
pass
else:
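
[Note: throughout this release, direct tkinter imports are replaced with six.moves so the same source runs on Python 2.7 (module name Tkinter) and Python 3 (tkinter). A minimal sketch of the pattern, assuming six is installed, as it now is a hard dependency:]

    # Py2/Py3-compatible Tkinter access via six.moves, as used in this release.
    try:
        from six.moves import tkinter           # resolves to Tkinter on Py2, tkinter on Py3
        from six.moves.tkinter_font import Font # resolves to tkFont / tkinter.font
    except ImportError:
        tkinter = None                          # GUI-dependent code degrades gracefully

    if tkinter is not None:
        root = tkinter.Tk()
        label = tkinter.Label(root, text='hello', font=Font(size=12))
        label.pack()
        # root.mainloop()  # uncomment to actually display the window
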
diff --git a/nltk/app/__init__.py b/nltk/app/__init__.py
index 882ffc2..b843d88 100644
--- a/nltk/app/__init__.py
+++ b/nltk/app/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Applications package
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
@@ -21,9 +21,8 @@ wordnet: WordNet Browser
# Import Tkinter-based modules if Tkinter is installed
-import nltk.compat
try:
- import tkinter
+ from six.moves import tkinter
except ImportError:
import warnings
warnings.warn("nltk.app package not loaded "
diff --git a/nltk/app/chartparser_app.py b/nltk/app/chartparser_app.py
index 7e2b970..bc68d88 100644
--- a/nltk/app/chartparser_app.py
+++ b/nltk/app/chartparser_app.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Chart Parser Application
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Jean Mark Gawron <gawron at mail.sdsu.edu>
# Steven Bird <stevenbird1 at gmail.com>
@@ -38,13 +38,14 @@ edge you wish to apply a rule to.
from __future__ import division
-import nltk.compat
import pickle
-from tkinter.filedialog import asksaveasfilename, askopenfilename
-import tkinter
-import math
import os.path
-import tkinter.font, tkinter.messagebox
+
+from six.moves.tkinter import (Button, Canvas, Checkbutton, Frame, IntVar,
+ Label, Menu, Scrollbar, Tk, Toplevel)
+from six.moves.tkinter_font import Font
+from six.moves.tkinter_messagebox import showerror, showinfo
+from six.moves.tkinter_tkfiledialog import asksaveasfilename, askopenfilename
from nltk.parse.chart import (BottomUpPredictCombineRule, BottomUpPredictRule,
Chart, LeafEdge, LeafInitRule, SingleEdgeFundamentalRule,
@@ -107,12 +108,12 @@ class ChartMatrixView(object):
self._selected_cell = None
if toplevel:
- self._root = tkinter.Toplevel(parent)
+ self._root = Toplevel(parent)
self._root.title(title)
self._root.bind('<Control-q>', self.destroy)
self._init_quit(self._root)
else:
- self._root = tkinter.Frame(parent)
+ self._root = Frame(parent)
self._init_matrix(self._root)
self._init_list(self._root)
@@ -128,18 +129,18 @@ class ChartMatrixView(object):
self.draw()
def _init_quit(self, root):
- quit = tkinter.Button(root, text='Quit', command=self.destroy)
+ quit = Button(root, text='Quit', command=self.destroy)
quit.pack(side='bottom', expand=0, fill='none')
def _init_matrix(self, root):
- cframe = tkinter.Frame(root, border=2, relief='sunken')
+ cframe = Frame(root, border=2, relief='sunken')
cframe.pack(expand=0, fill='none', padx=1, pady=3, side='top')
- self._canvas = tkinter.Canvas(cframe, width=200, height=200,
+ self._canvas = Canvas(cframe, width=200, height=200,
background='white')
self._canvas.pack(expand=0, fill='none')
def _init_numedges(self, root):
- self._numedges_label = tkinter.Label(root, text='0 edges')
+ self._numedges_label = Label(root, text='0 edges')
self._numedges_label.pack(expand=0, fill='none', side='top')
def _init_list(self, root):
@@ -331,21 +332,21 @@ class ChartResultsView(object):
self._selectbox = None
if toplevel:
- self._root = tkinter.Toplevel(parent)
+ self._root = Toplevel(parent)
self._root.title('Chart Parser Application: Results')
self._root.bind('<Control-q>', self.destroy)
else:
- self._root = tkinter.Frame(parent)
+ self._root = Frame(parent)
# Buttons
if toplevel:
- buttons = tkinter.Frame(self._root)
+ buttons = Frame(self._root)
buttons.pack(side='bottom', expand=0, fill='x')
- tkinter.Button(buttons, text='Quit',
+ Button(buttons, text='Quit',
command=self.destroy).pack(side='right')
- tkinter.Button(buttons, text='Print All',
+ Button(buttons, text='Print All',
command=self.print_all).pack(side='left')
- tkinter.Button(buttons, text='Print Selection',
+ Button(buttons, text='Print Selection',
command=self.print_selection).pack(side='left')
# Canvas frame.
@@ -408,7 +409,7 @@ class ChartResultsView(object):
def print_selection(self, *e):
if self._root is None: return
if self._selection is None:
- tkinter.messagebox.showerror('Print Error', 'No tree selected')
+ showerror('Print Error', 'No tree selected')
else:
c = self._cframe.canvas()
for widget in self._treewidgets:
@@ -513,7 +514,7 @@ class ChartComparer(object):
self._operator = None
# Set up the root window.
- self._root = tkinter.Tk()
+ self._root = Tk()
self._root.title('Chart Comparison')
self._root.bind('<Control-q>', self.destroy)
self._root.bind('<Control-x>', self.destroy)
@@ -544,10 +545,10 @@ class ChartComparer(object):
#////////////////////////////////////////////////////////////
def _init_menubar(self, root):
- menubar = tkinter.Menu(root)
+ menubar = Menu(root)
# File menu
- filemenu = tkinter.Menu(menubar, tearoff=0)
+ filemenu = Menu(menubar, tearoff=0)
filemenu.add_command(label='Load Chart', accelerator='Ctrl-o',
underline=0, command=self.load_chart_dialog)
filemenu.add_command(label='Save Output', accelerator='Ctrl-s',
@@ -558,7 +559,7 @@ class ChartComparer(object):
menubar.add_cascade(label='File', underline=0, menu=filemenu)
# Compare menu
- opmenu = tkinter.Menu(menubar, tearoff=0)
+ opmenu = Menu(menubar, tearoff=0)
opmenu.add_command(label='Intersection',
command=self._intersection,
accelerator='+')
@@ -577,18 +578,18 @@ class ChartComparer(object):
self._root.config(menu=menubar)
def _init_divider(self, root):
- divider = tkinter.Frame(root, border=2, relief='sunken')
+ divider = Frame(root, border=2, relief='sunken')
divider.pack(side='top', fill='x', ipady=2)
def _init_chartviews(self, root):
opfont=('symbol', -36) # Font for operator.
eqfont=('helvetica', -36) # Font for equals sign.
- frame = tkinter.Frame(root, background='#c0c0c0')
+ frame = Frame(root, background='#c0c0c0')
frame.pack(side='top', expand=1, fill='both')
# The left matrix.
- cv1_frame = tkinter.Frame(frame, border=3, relief='groove')
+ cv1_frame = Frame(frame, border=3, relief='groove')
cv1_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both')
self._left_selector = MutableOptionMenu(
cv1_frame, list(self._charts.keys()), command=self._select_left)
@@ -603,12 +604,12 @@ class ChartComparer(object):
self._left_matrix.inactivate()
# The operator.
- self._op_label = tkinter.Label(frame, text=' ', width=3,
+ self._op_label = Label(frame, text=' ', width=3,
background='#c0c0c0', font=opfont)
self._op_label.pack(side='left', padx=5, pady=5)
# The right matrix.
- cv2_frame = tkinter.Frame(frame, border=3, relief='groove')
+ cv2_frame = Frame(frame, border=3, relief='groove')
cv2_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both')
self._right_selector = MutableOptionMenu(
cv2_frame, list(self._charts.keys()), command=self._select_right)
@@ -623,13 +624,13 @@ class ChartComparer(object):
self._right_matrix.inactivate()
# The equals sign
- tkinter.Label(frame, text='=', width=3, background='#c0c0c0',
+ Label(frame, text='=', width=3, background='#c0c0c0',
font=eqfont).pack(side='left', padx=5, pady=5)
# The output matrix.
- out_frame = tkinter.Frame(frame, border=3, relief='groove')
+ out_frame = Frame(frame, border=3, relief='groove')
out_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both')
- self._out_label = tkinter.Label(out_frame, text='Output')
+ self._out_label = Label(out_frame, text='Output')
self._out_label.pack(side='top', pady=9)
self._out_matrix = ChartMatrixView(out_frame, self._emptychart,
toplevel=False,
@@ -641,19 +642,19 @@ class ChartComparer(object):
self._out_matrix.inactivate()
def _init_buttons(self, root):
- buttons = tkinter.Frame(root)
+ buttons = Frame(root)
buttons.pack(side='bottom', pady=5, fill='x', expand=0)
- tkinter.Button(buttons, text='Intersection',
+ Button(buttons, text='Intersection',
command=self._intersection).pack(side='left')
- tkinter.Button(buttons, text='Union',
+ Button(buttons, text='Union',
command=self._union).pack(side='left')
- tkinter.Button(buttons, text='Difference',
+ Button(buttons, text='Difference',
command=self._difference).pack(side='left')
- tkinter.Frame(buttons, width=20).pack(side='left')
- tkinter.Button(buttons, text='Swap Charts',
+ Frame(buttons, width=20).pack(side='left')
+ Button(buttons, text='Swap Charts',
command=self._swapcharts).pack(side='left')
- tkinter.Button(buttons, text='Detatch Output',
+ Button(buttons, text='Detatch Output',
command=self._detatch_out).pack(side='right')
def _init_bindings(self, root):
@@ -699,7 +700,7 @@ class ChartComparer(object):
with open(filename, 'wb') as outfile:
pickle.dump(self._out_chart, outfile)
except Exception as e:
- tkinter.messagebox.showerror('Error Saving Chart',
+ showerror('Error Saving Chart',
'Unable to open file: %r\n%s' %
(filename, e))
@@ -709,7 +710,7 @@ class ChartComparer(object):
if not filename: return
try: self.load_chart(filename)
except Exception as e:
- tkinter.messagebox.showerror('Error Loading Chart',
+ showerror('Error Loading Chart',
'Unable to open file: %r\n%s' %
(filename, e))
@@ -932,12 +933,12 @@ class ChartView(object):
# If they didn't provide a main window, then set one up.
if root is None:
- top = tkinter.Tk()
+ top = Tk()
top.title('Chart View')
def destroy1(e, top=top): top.destroy()
def destroy2(top=top): top.destroy()
top.bind('q', destroy1)
- b = tkinter.Button(top, text='Done', command=destroy2)
+ b = Button(top, text='Done', command=destroy2)
b.pack(side='bottom')
self._root = top
else:
@@ -953,9 +954,9 @@ class ChartView(object):
# Create the sentence canvas.
if draw_sentence:
- cframe = tkinter.Frame(self._root, relief='sunk', border=2)
+ cframe = Frame(self._root, relief='sunk', border=2)
cframe.pack(fill='both', side='bottom')
- self._sentence_canvas = tkinter.Canvas(cframe, height=50)
+ self._sentence_canvas = Canvas(cframe, height=50)
self._sentence_canvas['background'] = '#e0e0e0'
self._sentence_canvas.pack(fill='both')
#self._sentence_canvas['height'] = self._sentence_height
@@ -981,12 +982,12 @@ class ChartView(object):
self._chart_canvas.bind('<Configure>', self._configure)
def _init_fonts(self, root):
- self._boldfont = tkinter.font.Font(family='helvetica', weight='bold',
+ self._boldfont = Font(family='helvetica', weight='bold',
size=self._fontsize)
- self._font = tkinter.font.Font(family='helvetica',
+ self._font = Font(family='helvetica',
size=self._fontsize)
# See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
- self._sysfont = tkinter.font.Font(font=tkinter.Button()["font"])
+ self._sysfont = Font(font=Button()["font"])
root.option_add("*Font", self._sysfont)
def _sb_canvas(self, root, expand='y',
@@ -994,12 +995,12 @@ class ChartView(object):
"""
Helper for __init__: construct a canvas with a scrollbar.
"""
- cframe =tkinter.Frame(root, relief='sunk', border=2)
+ cframe = Frame(root, relief='sunk', border=2)
cframe.pack(fill=fill, expand=expand, side=side)
- canvas = tkinter.Canvas(cframe, background='#e0e0e0')
+ canvas = Canvas(cframe, background='#e0e0e0')
# Give the canvas a scrollbar.
- sb = tkinter.Scrollbar(cframe, orient='vertical')
+ sb = Scrollbar(cframe, orient='vertical')
sb.pack(side='right', fill='y')
canvas.pack(side='left', fill=fill, expand='yes')
@@ -1630,14 +1631,14 @@ class ChartParserApp(object):
self._root = None
try:
# Create the root window.
- self._root = tkinter.Tk()
+ self._root = Tk()
self._root.title(title)
self._root.bind('<Control-q>', self.destroy)
# Set up some frames.
- frame3 = tkinter.Frame(self._root)
- frame2 = tkinter.Frame(self._root)
- frame1 = tkinter.Frame(self._root)
+ frame3 = Frame(self._root)
+ frame2 = Frame(self._root)
+ frame1 = Frame(self._root)
frame3.pack(side='bottom', fill='none')
frame2.pack(side='bottom', fill='x')
frame1.pack(side='bottom', fill='both', expand=1)
@@ -1701,25 +1702,25 @@ class ChartParserApp(object):
def _init_fonts(self, root):
# See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
- self._sysfont = tkinter.font.Font(font=tkinter.Button()["font"])
+ self._sysfont = Font(font=Button()["font"])
root.option_add("*Font", self._sysfont)
# TWhat's our font size (default=same as sysfont)
- self._size = tkinter.IntVar(root)
+ self._size = IntVar(root)
self._size.set(self._sysfont.cget('size'))
- self._boldfont = tkinter.font.Font(family='helvetica', weight='bold',
+ self._boldfont = Font(family='helvetica', weight='bold',
size=self._size.get())
- self._font = tkinter.font.Font(family='helvetica',
+ self._font = Font(family='helvetica',
size=self._size.get())
def _init_animation(self):
# Are we stepping? (default=yes)
- self._step = tkinter.IntVar(self._root)
+ self._step = IntVar(self._root)
self._step.set(1)
# What's our animation speed (default=fast)
- self._animate = tkinter.IntVar(self._root)
+ self._animate = IntVar(self._root)
self._animate.set(3) # Default speed = fast
# Are we currently animating?
@@ -1733,59 +1734,59 @@ class ChartParserApp(object):
def _init_rulelabel(self, parent):
ruletxt = 'Last edge generated by:'
- self._rulelabel1 = tkinter.Label(parent,text=ruletxt,
+ self._rulelabel1 = Label(parent,text=ruletxt,
font=self._boldfont)
- self._rulelabel2 = tkinter.Label(parent, width=40,
+ self._rulelabel2 = Label(parent, width=40,
relief='groove', anchor='w',
font=self._boldfont)
self._rulelabel1.pack(side='left')
self._rulelabel2.pack(side='left')
- step = tkinter.Checkbutton(parent, variable=self._step,
+ step = Checkbutton(parent, variable=self._step,
text='Step')
step.pack(side='right')
def _init_buttons(self, parent):
- frame1 = tkinter.Frame(parent)
- frame2 = tkinter.Frame(parent)
+ frame1 = Frame(parent)
+ frame2 = Frame(parent)
frame1.pack(side='bottom', fill='x')
frame2.pack(side='top', fill='none')
- tkinter.Button(frame1, text='Reset\nParser',
+ Button(frame1, text='Reset\nParser',
background='#90c0d0', foreground='black',
command=self.reset).pack(side='right')
- #Tkinter.Button(frame1, text='Pause',
+ # Button(frame1, text='Pause',
# background='#90c0d0', foreground='black',
# command=self.pause).pack(side='left')
- tkinter.Button(frame1, text='Top Down\nStrategy',
+ Button(frame1, text='Top Down\nStrategy',
background='#90c0d0', foreground='black',
command=self.top_down_strategy).pack(side='left')
- tkinter.Button(frame1, text='Bottom Up\nStrategy',
+ Button(frame1, text='Bottom Up\nStrategy',
background='#90c0d0', foreground='black',
command=self.bottom_up_strategy).pack(side='left')
- tkinter.Button(frame1, text='Bottom Up\nLeft-Corner Strategy',
+ Button(frame1, text='Bottom Up\nLeft-Corner Strategy',
background='#90c0d0', foreground='black',
command=self.bottom_up_leftcorner_strategy).pack(side='left')
- tkinter.Button(frame2, text='Top Down Init\nRule',
+ Button(frame2, text='Top Down Init\nRule',
background='#90f090', foreground='black',
command=self.top_down_init).pack(side='left')
- tkinter.Button(frame2, text='Top Down Predict\nRule',
+ Button(frame2, text='Top Down Predict\nRule',
background='#90f090', foreground='black',
command=self.top_down_predict).pack(side='left')
- tkinter.Frame(frame2, width=20).pack(side='left')
+ Frame(frame2, width=20).pack(side='left')
- tkinter.Button(frame2, text='Bottom Up Predict\nRule',
+ Button(frame2, text='Bottom Up Predict\nRule',
background='#90f090', foreground='black',
command=self.bottom_up).pack(side='left')
- tkinter.Frame(frame2, width=20).pack(side='left')
+ Frame(frame2, width=20).pack(side='left')
- tkinter.Button(frame2, text='Bottom Up Left-Corner\nPredict Rule',
+ Button(frame2, text='Bottom Up Left-Corner\nPredict Rule',
background='#90f090', foreground='black',
command=self.bottom_up_leftcorner).pack(side='left')
- tkinter.Frame(frame2, width=20).pack(side='left')
+ Frame(frame2, width=20).pack(side='left')
- tkinter.Button(frame2, text='Fundamental\nRule',
+ Button(frame2, text='Fundamental\nRule',
background='#90f090', foreground='black',
command=self.fundamental).pack(side='left')
@@ -1819,9 +1820,9 @@ class ChartParserApp(object):
self._root.bind('s', lambda e,s=self._step:s.set(not s.get()))
def _init_menubar(self):
- menubar = tkinter.Menu(self._root)
+ menubar = Menu(self._root)
- filemenu = tkinter.Menu(menubar, tearoff=0)
+ filemenu = Menu(menubar, tearoff=0)
filemenu.add_command(label='Save Chart', underline=0,
command=self.save_chart, accelerator='Ctrl-s')
filemenu.add_command(label='Load Chart', underline=0,
@@ -1838,7 +1839,7 @@ class ChartParserApp(object):
command=self.destroy, accelerator='Ctrl-x')
menubar.add_cascade(label='File', underline=0, menu=filemenu)
- editmenu = tkinter.Menu(menubar, tearoff=0)
+ editmenu = Menu(menubar, tearoff=0)
editmenu.add_command(label='Edit Grammar', underline=5,
command=self.edit_grammar,
accelerator='Ctrl-g')
@@ -1847,14 +1848,14 @@ class ChartParserApp(object):
accelerator='Ctrl-t')
menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
- viewmenu = tkinter.Menu(menubar, tearoff=0)
+ viewmenu = Menu(menubar, tearoff=0)
viewmenu.add_command(label='Chart Matrix', underline=6,
command=self.view_matrix)
viewmenu.add_command(label='Results', underline=0,
command=self.view_results)
menubar.add_cascade(label='View', underline=0, menu=viewmenu)
- rulemenu = tkinter.Menu(menubar, tearoff=0)
+ rulemenu = Menu(menubar, tearoff=0)
rulemenu.add_command(label='Top Down Strategy', underline=0,
command=self.top_down_strategy,
accelerator='t')
@@ -1877,7 +1878,7 @@ class ChartParserApp(object):
command=self.fundamental)
menubar.add_cascade(label='Apply', underline=0, menu=rulemenu)
- animatemenu = tkinter.Menu(menubar, tearoff=0)
+ animatemenu = Menu(menubar, tearoff=0)
animatemenu.add_checkbutton(label="Step", underline=0,
variable=self._step,
accelerator='s')
@@ -1895,7 +1896,7 @@ class ChartParserApp(object):
accelerator='+')
menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
- zoommenu = tkinter.Menu(menubar, tearoff=0)
+ zoommenu = Menu(menubar, tearoff=0)
zoommenu.add_radiobutton(label='Tiny', variable=self._size,
underline=0, value=10, command=self.resize)
zoommenu.add_radiobutton(label='Small', variable=self._size,
@@ -1908,7 +1909,7 @@ class ChartParserApp(object):
underline=0, value=24, command=self.resize)
menubar.add_cascade(label='Zoom', underline=0, menu=zoommenu)
- helpmenu = tkinter.Menu(menubar, tearoff=0)
+ helpmenu = Menu(menubar, tearoff=0)
helpmenu.add_command(label='About', underline=0,
command=self.about)
helpmenu.add_command(label='Instructions', underline=0,
@@ -1983,7 +1984,7 @@ class ChartParserApp(object):
def about(self, *e):
ABOUT = ("NLTK Chart Parser Application\n"+
"Written by Edward Loper")
- tkinter.messagebox.showinfo('About: Chart Parser Application', ABOUT)
+ showinfo('About: Chart Parser Application', ABOUT)
#////////////////////////////////////////////////////////////
# File Menu
@@ -2011,7 +2012,7 @@ class ChartParserApp(object):
self._cp.set_chart(chart)
except Exception as e:
raise
- tkinter.messagebox.showerror('Error Loading Chart',
+ showerror('Error Loading Chart',
'Unable to open file: %r' % filename)
def save_chart(self, *args):
@@ -2024,7 +2025,7 @@ class ChartParserApp(object):
pickle.dump(self._chart, outfile)
except Exception as e:
raise
- tkinter.messagebox.showerror('Error Saving Chart',
+ showerror('Error Saving Chart',
'Unable to open file: %r' % filename)
def load_grammar(self, *args):
@@ -2041,7 +2042,7 @@ class ChartParserApp(object):
grammar = CFG.fromstring(infile.read())
self.set_grammar(grammar)
except Exception as e:
- tkinter.messagebox.showerror('Error Loading Grammar',
+ showerror('Error Loading Grammar',
'Unable to open file: %r' % filename)
def save_grammar(self, *args):
@@ -2060,7 +2061,7 @@ class ChartParserApp(object):
for prod in start: outfile.write('%s\n' % prod)
for prod in rest: outfile.write('%s\n' % prod)
except Exception as e:
- tkinter.messagebox.showerror('Error Saving Grammar',
+ showerror('Error Saving Grammar',
'Unable to open file: %r' % filename)
def reset(self, *args):
@@ -2273,4 +2274,3 @@ if __name__ == '__main__':
#p.strip_dirs().sort_stats('cum', 'time').print_stats(60)
__all__ = ['app']
-
diff --git a/nltk/app/chunkparser_app.py b/nltk/app/chunkparser_app.py
index 7559c40..5e08421 100644
--- a/nltk/app/chunkparser_app.py
+++ b/nltk/app/chunkparser_app.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Regexp Chunk Parser Application
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -16,16 +16,15 @@ parser ``nltk.chunk.RegexpChunkParser``.
# and what part of the data is being used as the development set.
from __future__ import division
-import nltk.compat
import time
import textwrap
import re
import random
-import tkinter.filedialog, tkinter.font
-from tkinter import (Button, Canvas, Checkbutton,
- Frame, IntVar, Label, Menu,
- Scrollbar, Text, Tk)
+from six.moves.tkinter import (Button, Canvas, Checkbutton, Frame, IntVar,
+ Label, Menu, Scrollbar, Text, Tk)
+from six.moves.tkinter_tkfiledialog import askopenfilename, asksaveasfilename
+from six.moves.tkinter_font import Font
from nltk.tree import Tree
from nltk.util import in_idle
@@ -376,9 +375,9 @@ class RegexpChunkApp(object):
# TWhat's our font size (default=same as sysfont)
self._size = IntVar(top)
self._size.set(20)
- self._font = tkinter.font.Font(family='helvetica',
+ self._font = Font(family='helvetica',
size=-self._size.get())
- self._smallfont = tkinter.font.Font(family='helvetica',
+ self._smallfont = Font(family='helvetica',
size=-(int(self._size.get()*14//20)))
def _init_menubar(self, parent):
@@ -1159,7 +1158,7 @@ class RegexpChunkApp(object):
if not filename:
ftypes = [('Chunk Gramamr', '.chunk'),
('All files', '*')]
- filename = tkinter.filedialog.asksaveasfilename(filetypes=ftypes,
+ filename = asksaveasfilename(filetypes=ftypes,
defaultextension='.chunk')
if not filename: return
if (self._history and self.normalized_grammar ==
@@ -1181,7 +1180,7 @@ class RegexpChunkApp(object):
if not filename:
ftypes = [('Chunk Gramamr', '.chunk'),
('All files', '*')]
- filename = tkinter.filedialog.askopenfilename(filetypes=ftypes,
+ filename = askopenfilename(filetypes=ftypes,
defaultextension='.chunk')
if not filename: return
self.grammarbox.delete('1.0', 'end')
@@ -1197,7 +1196,7 @@ class RegexpChunkApp(object):
if not filename:
ftypes = [('Chunk Gramamr History', '.txt'),
('All files', '*')]
- filename = tkinter.filedialog.asksaveasfilename(filetypes=ftypes,
+ filename = asksaveasfilename(filetypes=ftypes,
defaultextension='.txt')
if not filename: return
@@ -1226,7 +1225,7 @@ class RegexpChunkApp(object):
"Written by Edward Loper")
TITLE = 'About: Regular Expression Chunk Parser Application'
try:
- from tkinter.messagebox import Message
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except:
ShowText(self.top, TITLE, ABOUT)
@@ -1261,4 +1260,3 @@ if __name__ == '__main__':
app()
__all__ = ['app']
-
diff --git a/nltk/app/collocations_app.py b/nltk/app/collocations_app.py
index 7293b73..49cbb8b 100644
--- a/nltk/app/collocations_app.py
+++ b/nltk/app/collocations_app.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Collocations Application
# Much of the GUI code is imported from concordance.py; We intend to merge these tools together
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Sumukh Ghodke <sghodke at csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -9,16 +9,13 @@
from __future__ import division
-import nltk.compat
import threading
-import tkinter.font
-if nltk.compat.PY3:
- import queue as q
-else:
- import Queue as q
-from tkinter import (Button, END, Frame, IntVar, LEFT, Label, Menu,
- OptionMenu, SUNKEN, Scrollbar, StringVar,
- Text, Tk)
+
+from six.moves import queue as q
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (Button, END, Frame, IntVar, LEFT, Label, Menu,
+ OptionMenu, SUNKEN, Scrollbar, StringVar,
+ Text, Tk)
from nltk.corpus import (cess_cat, brown, nps_chat, treebank, sinica_treebank, alpino,
indian, floresta, mac_morpho, machado, cess_esp)
@@ -146,7 +143,7 @@ class CollocationsView:
vscrollbar = Scrollbar(i1, borderwidth=1)
hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
self.results_box = Text(i1,
- font=tkinter.font.Font(family='courier', size='16'),
+ font=Font(family='courier', size='16'),
state='disabled', borderwidth=1,
yscrollcommand=vscrollbar.set,
xscrollcommand=hscrollbar.set, wrap='none', width='40', height = '20', exportselection=1)
diff --git a/nltk/app/concordance_app.py b/nltk/app/concordance_app.py
index 0244612..53c7167 100755
--- a/nltk/app/concordance_app.py
+++ b/nltk/app/concordance_app.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Concordance Application
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Sumukh Ghodke <sghodke at csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -9,14 +9,12 @@
import nltk.compat
import re
import threading
-if nltk.compat.PY3:
- import queue as q
-else:
- import Queue as q
-import tkinter.font
-from tkinter import (Tk, Button, END, Entry, Frame, IntVar, LEFT,
- Label, Menu, OptionMenu, SUNKEN, Scrollbar,
- StringVar, Text)
+
+from six.moves import queue as q
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (Tk, Button, END, Entry, Frame, IntVar, LEFT,
+ Label, Menu, OptionMenu, SUNKEN, Scrollbar,
+ StringVar, Text)
from nltk.corpus import (cess_cat, brown, nps_chat, treebank, sinica_treebank,
alpino, indian, floresta, mac_morpho, cess_esp)
@@ -240,7 +238,7 @@ class ConcordanceSearchView(object):
vscrollbar = Scrollbar(i1, borderwidth=1)
hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
self.results_box = Text(i1,
- font=tkinter.font.Font(family='courier', size='16'),
+ font=Font(family='courier', size='16'),
state='disabled', borderwidth=1,
yscrollcommand=vscrollbar.set,
xscrollcommand=hscrollbar.set, wrap='none', width='40', height = '20', exportselection=1)
@@ -280,7 +278,7 @@ class ConcordanceSearchView(object):
ABOUT = ("NLTK Concordance Search Demo\n")
TITLE = 'About: NLTK Concordance Search Demo'
try:
- from tkinter.messagebox import Message
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE, parent=self.main_frame).show()
except:
ShowText(self.top, TITLE, ABOUT)
diff --git a/nltk/app/nemo_app.py b/nltk/app/nemo_app.py
index 0767539..4b142fc 100755
--- a/nltk/app/nemo_app.py
+++ b/nltk/app/nemo_app.py
@@ -7,8 +7,9 @@ Finding (and Replacing) Nemo
Instant Regular Expressions
Created by Aristide Grange
"""
-import nltk.compat
-import tkinter as tk
+
+from six.moves.tkinter import (Frame, Label, PhotoImage, Scrollbar, Text, Tk,
+ SEL_FIRST, SEL_LAST)
import re
import itertools
@@ -50,30 +51,30 @@ textParams = {
class Zone:
def __init__(self, image, initialField, initialText):
- frm = tk.Frame(root)
+ frm = Frame(root)
frm.config(background="white")
- self.image = tk.PhotoImage(format='gif',data=images[image.upper()])
- self.imageDimmed = tk.PhotoImage(format='gif',data=images[image])
- self.img = tk.Label(frm)
+ self.image = PhotoImage(format='gif',data=images[image.upper()])
+ self.imageDimmed = PhotoImage(format='gif',data=images[image])
+ self.img = Label(frm)
self.img.config(borderwidth=0)
self.img.pack(side = "left")
- self.fld = tk.Text(frm, **fieldParams)
+ self.fld = Text(frm, **fieldParams)
self.initScrollText(frm,self.fld,initialField)
- frm = tk.Frame(root)
- self.txt = tk.Text(frm, **textParams)
+ frm = Frame(root)
+ self.txt = Text(frm, **textParams)
self.initScrollText(frm,self.txt,initialText)
for i in range(2):
self.txt.tag_config(colors[i], background = colors[i])
self.txt.tag_config("emph"+colors[i], foreground = emphColors[i])
def initScrollText(self,frm,txt,contents):
- scl = tk.Scrollbar(frm)
+ scl = Scrollbar(frm)
scl.config(command = txt.yview)
scl.pack(side="right",fill="y")
txt.pack(side = "left", expand=True, fill="x")
txt.config(yscrollcommand = scl.set)
txt.insert("1.0",contents)
frm.pack(fill = "x")
- tk.Frame(height=2, bd=1, relief="ridge").pack(fill="x")
+ Frame(height=2, bd=1, relief="ridge").pack(fill="x")
def refresh(self):
self.colorCycle = itertools.cycle(colors)
try:
@@ -99,12 +100,12 @@ class FindZone(Zone):
self.rex = re.compile("") # default value in case of misformed regexp
self.rex = re.compile(self.fld.get("1.0","end")[:-1],re.MULTILINE)
try:
- re.compile("(?P<emph>%s)" % self.fld.get(tk.SEL_FIRST,
- tk.SEL_LAST))
+ re.compile("(?P<emph>%s)" % self.fld.get(SEL_FIRST,
+ SEL_LAST))
self.rexSel = re.compile("%s(?P<emph>%s)%s" % (
- self.fld.get("1.0",tk.SEL_FIRST),
- self.fld.get(tk.SEL_FIRST,tk.SEL_LAST),
- self.fld.get(tk.SEL_LAST,"end")[:-1],
+ self.fld.get("1.0",SEL_FIRST),
+ self.fld.get(SEL_FIRST,SEL_LAST),
+ self.fld.get(SEL_LAST,"end")[:-1],
),re.MULTILINE)
except:
self.rexSel = self.rex
@@ -134,7 +135,7 @@ def launchRefresh(_):
def app():
global root, sz, rz, rex0
- root = tk.Tk()
+ root = Tk()
root.resizable(height=False,width=True)
root.title(windowTitle)
root.minsize(width=250,height=0)
diff --git a/nltk/app/rdparser_app.py b/nltk/app/rdparser_app.py
index 962dc40..b791767 100644
--- a/nltk/app/rdparser_app.py
+++ b/nltk/app/rdparser_app.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Recursive Descent Parser Application
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -64,10 +64,10 @@ Keyboard Shortcuts::
[q]\t Quit
"""
from __future__ import division
-import nltk.compat
-import tkinter.font
-from tkinter import (Listbox, IntVar, Button,
- Frame, Label, Menu, Scrollbar, Tk)
+
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (Listbox, IntVar, Button, Frame, Label, Menu,
+ Scrollbar, Tk)
from nltk.tree import Tree
from nltk.util import in_idle
@@ -129,20 +129,20 @@ class RecursiveDescentApp(object):
def _init_fonts(self, root):
# See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
- self._sysfont = tkinter.font.Font(font=Button()["font"])
+ self._sysfont = Font(font=Button()["font"])
root.option_add("*Font", self._sysfont)
# TWhat's our font size (default=same as sysfont)
self._size = IntVar(root)
self._size.set(self._sysfont.cget('size'))
- self._boldfont = tkinter.font.Font(family='helvetica', weight='bold',
+ self._boldfont = Font(family='helvetica', weight='bold',
size=self._size.get())
- self._font = tkinter.font.Font(family='helvetica',
+ self._font = Font(family='helvetica',
size=self._size.get())
if self._size.get() < 0: big = self._size.get()-2
else: big = self._size.get()+2
- self._bigfont = tkinter.font.Font(family='helvetica', weight='bold',
+ self._bigfont = Font(family='helvetica', weight='bold',
size=big)
def _init_grammar(self, parent):
@@ -585,7 +585,7 @@ class RecursiveDescentApp(object):
"Written by Edward Loper")
TITLE = 'About: Recursive Descent Parser Application'
try:
- from tkinter.messagebox import Message
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except:
ShowText(self._top, TITLE, ABOUT)
@@ -890,5 +890,3 @@ if __name__ == '__main__':
app()
__all__ = ['app']
-
-
diff --git a/nltk/app/srparser_app.py b/nltk/app/srparser_app.py
index 6ae5455..0dd5786 100644
--- a/nltk/app/srparser_app.py
+++ b/nltk/app/srparser_app.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Shift-Reduce Parser Application
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -59,8 +59,19 @@ Keyboard Shortcuts::
[h]\t Help
[Ctrl-p]\t Print
[q]\t Quit
+
"""
+from six.moves.tkinter_font import Font
+from six.moves.tkinter import (IntVar, Listbox, Button, Frame, Label, Menu,
+ Scrollbar, Tk)
+
+from nltk.tree import Tree
+from nltk.parse import SteppingShiftReduceParser
+from nltk.util import in_idle
+from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget
+from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment
+
"""
Possible future improvements:
- button/window to change and/or select text. Just pop up a window
@@ -73,17 +84,6 @@ Possible future improvements:
responsible for that.
"""
-import nltk.compat
-import tkinter.font
-from tkinter import (IntVar, Listbox, Button, Frame, Label, Menu,
- Scrollbar, Tk)
-
-from nltk.tree import Tree
-from nltk.parse import SteppingShiftReduceParser
-from nltk.util import in_idle
-from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget
-from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment
-
class ShiftReduceApp(object):
"""
A graphical tool for exploring the shift-reduce parser. The tool
@@ -138,16 +138,16 @@ class ShiftReduceApp(object):
def _init_fonts(self, root):
# See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
- self._sysfont = tkinter.font.Font(font=Button()["font"])
+ self._sysfont = Font(font=Button()["font"])
root.option_add("*Font", self._sysfont)
# TWhat's our font size (default=same as sysfont)
self._size = IntVar(root)
self._size.set(self._sysfont.cget('size'))
- self._boldfont = tkinter.font.Font(family='helvetica', weight='bold',
+ self._boldfont = Font(family='helvetica', weight='bold',
size=self._size.get())
- self._font = tkinter.font.Font(family='helvetica',
+ self._font = Font(family='helvetica',
size=self._size.get())
def _init_grammar(self, parent):
@@ -552,7 +552,7 @@ class ShiftReduceApp(object):
"Written by Edward Loper")
TITLE = 'About: Shift-Reduce Parser Application'
try:
- from tkinter.messagebox import Message
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except:
ShowText(self._top, TITLE, ABOUT)
diff --git a/nltk/app/wordfreq_app.py b/nltk/app/wordfreq_app.py
index c5cb8cf..23bc796 100644
--- a/nltk/app/wordfreq_app.py
+++ b/nltk/app/wordfreq_app.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Wordfreq Application
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Sumukh Ghodke <sghodke at csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/app/wordnet_app.py b/nltk/app/wordnet_app.py
index a59b9eb..13807bc 100644
--- a/nltk/app/wordnet_app.py
+++ b/nltk/app/wordnet_app.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: WordNet Browser Application
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Jussi Salmela <jtsalmela at users.sourceforge.net>
# Paul Bone <pbone at students.csse.unimelb.edu.au>
# URL: <http://nltk.org/>
@@ -63,6 +63,8 @@ import base64
import pickle
import copy
+from six.moves.urllib.parse import unquote_plus
+
from nltk import compat
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset, Lemma
@@ -95,7 +97,7 @@ class MyServerHandler(BaseHTTPRequestHandler):
def do_GET(self):
global firstClient
sp = self.path[1:]
- if compat.unquote_plus(sp) == 'SHUTDOWN THE SERVER':
+ if unquote_plus(sp) == 'SHUTDOWN THE SERVER':
if server_mode:
page = "Server must be killed with SIGTERM."
type = "text/plain"
@@ -114,7 +116,7 @@ class MyServerHandler(BaseHTTPRequestHandler):
elif sp.endswith('.html'): # Trying to fetch a HTML file TODO:
type = 'text/html'
- usp = compat.unquote_plus(sp)
+ usp = unquote_plus(sp)
if usp == 'NLTK Wordnet Browser Database Info.html':
word = '* Database Info *'
if os.path.isfile(usp):
@@ -799,7 +801,7 @@ def get_static_web_help_page():
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
- Copyright (C) 2001-2016 NLTK Project
+ Copyright (C) 2001-2017 NLTK Project
Author: Jussi Salmela <jtsalmela at users.sourceforge.net>
URL: <http://nltk.org/>
For license information, see LICENSE.TXT -->
@@ -870,7 +872,7 @@ def get_static_index_page(with_shutdown):
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/html4/frameset.dtd">
<HTML>
<!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
- Copyright (C) 2001-2016 NLTK Project
+ Copyright (C) 2001-2017 NLTK Project
Author: Jussi Salmela <jtsalmela at users.sourceforge.net>
URL: <http://nltk.org/>
For license information, see LICENSE.TXT -->
@@ -904,7 +906,7 @@ def get_static_upper_page(with_shutdown):
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
- Copyright (C) 2001-2016 NLTK Project
+ Copyright (C) 2001-2017 NLTK Project
Author: Jussi Salmela <jtsalmela at users.sourceforge.net>
URL: <http://nltk.org/>
For license information, see LICENSE.TXT -->
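
[Note: the WordNet browser switches from the nltk.compat.unquote_plus indirection to six.moves.urllib.parse, which resolves to urllib on Python 2 and urllib.parse on Python 3. A small sketch of the equivalent call:]

    # URL decoding via six.moves, matching the wordnet_app change above.
    from six.moves.urllib.parse import unquote_plus

    path = 'NLTK+Wordnet+Browser+Database+Info.html'
    print(unquote_plus(path))   # -> 'NLTK Wordnet Browser Database Info.html'
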
diff --git a/nltk/book.py b/nltk/book.py
index d1f315c..7e006d2 100644
--- a/nltk/book.py
+++ b/nltk/book.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
#
# URL: <http://nltk.org/>
diff --git a/nltk/ccg/__init__.py b/nltk/ccg/__init__.py
index 3b9ac14..630c182 100644
--- a/nltk/ccg/__init__.py
+++ b/nltk/ccg/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Graeme Gange <ggange at csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/ccg/api.py b/nltk/ccg/api.py
index fb7ddb6..79c6b77 100644
--- a/nltk/ccg/api.py
+++ b/nltk/ccg/api.py
@@ -1,47 +1,63 @@
# Natural Language Toolkit: CCG Categories
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Graeme Gange <ggange at csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import unicode_literals
+from functools import total_ordering
+
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
from nltk.internals import raise_unorderable_types
-from nltk.compat import (total_ordering, python_2_unicode_compatible,
- unicode_repr)
+from nltk.compat import (python_2_unicode_compatible, unicode_repr)
+
+ at add_metaclass(ABCMeta)
@total_ordering
class AbstractCCGCategory(object):
'''
Interface for categories in combinatory grammars.
'''
- # Returns true if the category is primitive
+ @abstractmethod
def is_primitive(self):
- raise NotImplementedError()
+ """
+ Returns true if the category is primitive.
+ """
- # Returns true if the category is a function application
+ @abstractmethod
def is_function(self):
- raise NotImplementedError()
+ """
+ Returns true if the category is a function application.
+ """
- # Returns true if the category is a variable
+ @abstractmethod
def is_var(self):
- raise NotImplementedError()
+ """
+ Returns true if the category is a variable.
+ """
- # Takes a set of (var, category) substitutions, and replaces every
- # occurrence of the variable with the corresponding category
- def substitute(self,substitutions):
- raise NotImplementedError()
+ @abstractmethod
+ def substitute(self, substitutions):
+ """
+ Takes a set of (var, category) substitutions, and replaces every
+ occurrence of the variable with the corresponding category.
+ """
- # Determines whether two categories can be unified.
- # - Returns None if they cannot be unified
- # - Returns a list of necessary substitutions if they can.'''
- def can_unify(self,other):
- raise NotImplementedError()
+ @abstractmethod
+ def can_unify(self, other):
+ """
+ Determines whether two categories can be unified.
+ - Returns None if they cannot be unified
+ - Returns a list of necessary substitutions if they can.
+ """
# Utility functions: comparison, strings and hashing.
-
+ @abstractmethod
def __str__(self):
- raise NotImplementedError()
+ pass
def __eq__(self, other):
return (self.__class__ is other.__class__ and
@@ -78,7 +94,8 @@ class CCGVar(AbstractCCGCategory):
def __init__(self, prim_only=False):
"""Initialize a variable (selects a new identifier)
- :param prim_only: a boolean that determines whether the variable is restricted to primitives
+ :param prim_only: a boolean that determines whether the variable is
+ restricted to primitives
:type prim_only: bool
"""
self._id = self.new_id()
@@ -87,7 +104,9 @@ class CCGVar(AbstractCCGCategory):
@classmethod
def new_id(cls):
- """A class method allowing generation of unique variable identifiers."""
+ """
+ A class method allowing generation of unique variable identifiers.
+ """
cls._maxID = cls._maxID + 1
return cls._maxID - 1
@@ -108,9 +127,9 @@ class CCGVar(AbstractCCGCategory):
"""If there is a substitution corresponding to this variable,
return the substituted category.
"""
- for (var,cat) in substitutions:
+ for (var, cat) in substitutions:
if var == self:
- return cat
+ return cat
return self
def can_unify(self, other):
@@ -118,7 +137,7 @@ class CCGVar(AbstractCCGCategory):
a substitution is returned.
"""
if other.is_primitive() or not self._prim_only:
- return [(self,other)]
+ return [(self, other)]
return None
def id(self):
@@ -127,6 +146,7 @@ class CCGVar(AbstractCCGCategory):
def __str__(self):
return "_var" + str(self._id)
+
@total_ordering
@python_2_unicode_compatible
class Direction(object):
@@ -135,7 +155,7 @@ class Direction(object):
Also contains maintains information as to which combinators
may be used with the category.
'''
- def __init__(self,dir,restrictions):
+ def __init__(self, dir, restrictions):
self._dir = dir
self._restrs = restrictions
self._comparison_key = (dir, tuple(restrictions))
@@ -143,6 +163,7 @@ class Direction(object):
# Testing the application direction
def is_forward(self):
return self._dir == '/'
+
def is_backward(self):
return self._dir == '\\'
@@ -164,31 +185,31 @@ class Direction(object):
# Unification and substitution of variable directions.
# Used only if type-raising is implemented as a unary rule, as it
# must inherit restrictions from the argument category.
- def can_unify(self,other):
+ def can_unify(self, other):
if other.is_variable():
- return [('_',self.restrs())]
+ return [('_', self.restrs())]
elif self.is_variable():
- return [('_',other.restrs())]
+ return [('_', other.restrs())]
else:
if self.restrs() == other.restrs():
return []
return None
- def substitute(self,subs):
+ def substitute(self, subs):
if not self.is_variable():
return self
for (var, restrs) in subs:
if var == '_':
- return Direction(self._dir,restrs)
+ return Direction(self._dir, restrs)
return self
# Testing permitted combinators
def can_compose(self):
- return not ',' in self._restrs
+ return (',' not in self._restrs)
def can_cross(self):
- return not '.' in self._restrs
+ return ('.' not in self._restrs)
def __eq__(self, other):
return (self.__class__ is other.__class__ and
@@ -221,9 +242,9 @@ class Direction(object):
# The negation operator reverses the direction of the application
def __neg__(self):
if self._dir == '/':
- return Direction('\\',self._restrs)
+ return Direction('\\', self._restrs)
else:
- return Direction('/',self._restrs)
+ return Direction('/', self._restrs)
@python_2_unicode_compatible
@@ -254,17 +275,17 @@ class PrimitiveCategory(AbstractCCGCategory):
return self._categ
# Substitution does nothing to a primitive category
- def substitute(self,subs):
+ def substitute(self, subs):
return self
# A primitive can be unified with a class of the same
# base category, given that the other category shares all
# of its subclasses, or with a variable.
- def can_unify(self,other):
+ def can_unify(self, other):
if not other.is_primitive():
return None
if other.is_var():
- return [(other,self)]
+ return [(other, self)]
if other.categ() == self.categ():
for restr in self._restrs:
if restr not in other.restrs():
@@ -303,24 +324,25 @@ class FunctionalCategory(AbstractCCGCategory):
# Substitution returns the category consisting of the
# substitution applied to each of its constituents.
- def substitute(self,subs):
+ def substitute(self, subs):
sub_res = self._res.substitute(subs)
sub_dir = self._dir.substitute(subs)
sub_arg = self._arg.substitute(subs)
- return FunctionalCategory(sub_res,sub_arg,self._dir)
+ return FunctionalCategory(sub_res, sub_arg, self._dir)
# A function can unify with another function, so long as its
# constituents can unify, or with an unrestricted variable.
- def can_unify(self,other):
+ def can_unify(self, other):
if other.is_var():
- return [(other,self)]
+ return [(other, self)]
if other.is_function():
sa = self._res.can_unify(other.res())
sd = self._dir.can_unify(other.dir())
if sa is not None and sd is not None:
- sb = self._arg.substitute(sa).can_unify(other.arg().substitute(sa))
- if sb is not None:
- return sa + sb
+ sb = self._arg.substitute(sa).can_unify(
+ other.arg().substitute(sa))
+ if sb is not None:
+ return sa + sb
return None
# Constituent accessors
@@ -335,5 +357,3 @@ class FunctionalCategory(AbstractCCGCategory):
def __str__(self):
return "(%s%s%s)" % (self._res, self._dir, self._arg)
-
-
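The hunks above replace comment-only documentation and `raise NotImplementedError()` stubs with genuine abstract methods, registered through six's `add_metaclass` so the ABCMeta machinery works on both Python 2 and 3. A minimal sketch of that idiom, using hypothetical class names rather than the real CCG categories:

    from abc import ABCMeta, abstractmethod
    from six import add_metaclass

    @add_metaclass(ABCMeta)        # portable spelling of "metaclass=ABCMeta"
    class Category(object):
        """Hypothetical interface in the style of AbstractCCGCategory."""

        @abstractmethod
        def is_primitive(self):
            """Return True if the category is primitive."""

        @abstractmethod
        def substitute(self, substitutions):
            """Apply (var, category) substitutions and return the result."""

    class Primitive(Category):
        def is_primitive(self):
            return True

        def substitute(self, substitutions):
            return self

    print(Primitive().is_primitive())   # True
    try:
        Category()                      # abstract classes cannot be instantiated
    except TypeError as err:
        print(err)

Unlike the old NotImplementedError stubs, an incomplete subclass now fails at instantiation time rather than when the missing method is first called.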
diff --git a/nltk/ccg/chart.py b/nltk/ccg/chart.py
index f3214ef..e2f04b1 100644
--- a/nltk/ccg/chart.py
+++ b/nltk/ccg/chart.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Graeme Gange <ggange at csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -33,6 +33,8 @@ from __future__ import print_function, division, unicode_literals
import itertools
+from six import string_types
+
from nltk.parse import ParserI
from nltk.parse.chart import AbstractChartRule, EdgeI, Chart
from nltk.tree import Tree
@@ -42,7 +44,7 @@ from nltk.ccg.combinator import (ForwardT, BackwardT, ForwardApplication,
BackwardApplication, ForwardComposition,
BackwardComposition, ForwardSubstitution,
BackwardBx, BackwardSx)
-from nltk.compat import python_2_unicode_compatible, string_types
+from nltk.compat import python_2_unicode_compatible
from nltk.ccg.combinator import *
from nltk.ccg.logic import *
from nltk.sem.logic import *
@@ -264,11 +266,11 @@ class CCGChart(Chart):
memo[edge] = trees
return trees
-
+
def compute_semantics(children, edge):
if children[0].label()[0].semantics() is None:
return None
-
+
if len(children) is 2:
if isinstance(edge.rule(), BackwardCombinator):
children = [children[1],children[0]]
@@ -335,7 +337,7 @@ def printCCGTree(lwidth,tree):
(token, op) = tree.label()
- if op == u'Leaf':
+ if op == 'Leaf':
return rwidth
# Pad to the left with spaces, followed by a sequence of '-'
diff --git a/nltk/ccg/combinator.py b/nltk/ccg/combinator.py
index d617b42..1fecd5c 100644
--- a/nltk/ccg/combinator.py
+++ b/nltk/ccg/combinator.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Graeme Gange <ggange at csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -9,10 +9,14 @@ CCG Combinators
"""
from __future__ import unicode_literals
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
from nltk.compat import python_2_unicode_compatible
from nltk.ccg.api import FunctionalCategory
+
+@add_metaclass(ABCMeta)
class UndirectedBinaryCombinator(object):
"""
Abstract class for representing a binary combinator.
@@ -24,12 +28,16 @@ class UndirectedBinaryCombinator(object):
of the combinators; these restrictions must be added in the rule
class.
"""
+ @abstractmethod
def can_combine(self, function, argument):
- raise NotImplementedError()
+ pass
+
+ @abstractmethod
+ def combine(self, function, argument):
+ pass
- def combine (self, function, argument):
- raise NotImplementedError()
+@add_metaclass(ABCMeta)
class DirectedBinaryCombinator(object):
"""
Wrapper for the undirected binary combinator.
@@ -37,11 +45,14 @@ class DirectedBinaryCombinator(object):
the function, and which the argument.
It then decides whether or not they can be combined.
"""
+ @abstractmethod
def can_combine(self, left, right):
- raise NotImplementedError()
+ pass
+ @abstractmethod
def combine(self, left, right):
- raise NotImplementedError()
+ pass
+
@python_2_unicode_compatible
class ForwardCombinator(DirectedBinaryCombinator):
@@ -67,6 +78,7 @@ class ForwardCombinator(DirectedBinaryCombinator):
def __str__(self):
return ">%s%s" % (self._combinator, self._suffix)
+
@python_2_unicode_compatible
class BackwardCombinator(DirectedBinaryCombinator):
"""
@@ -80,6 +92,7 @@ class BackwardCombinator(DirectedBinaryCombinator):
def can_combine(self, left, right):
return (self._combinator.can_combine(right, left) and
self._predicate(left, right))
+
def combine(self, left, right):
for cat in self._combinator.combine(right, left):
yield cat
@@ -87,6 +100,7 @@ class BackwardCombinator(DirectedBinaryCombinator):
def __str__(self):
return "<%s%s" % (self._combinator, self._suffix)
+
@python_2_unicode_compatible
class UndirectedFunctionApplication(UndirectedBinaryCombinator):
"""
@@ -122,10 +136,12 @@ class UndirectedFunctionApplication(UndirectedBinaryCombinator):
def forwardOnly(left, right):
return left.dir().is_forward()
+
# Ensures the right functor takes an argument on the left
def backwardOnly(left, right):
return right.dir().is_backward()
+
# Application combinator instances
ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(),
forwardOnly)
@@ -155,25 +171,29 @@ class UndirectedComposition(UndirectedBinaryCombinator):
return
if function.dir().can_compose() and argument.dir().can_compose():
subs = function.arg().can_unify(argument.res())
- if not subs is None:
+ if subs is not None:
yield FunctionalCategory(function.res().substitute(subs),
- argument.arg().substitute(subs), argument.dir())
+ argument.arg().substitute(subs),
+ argument.dir())
def __str__(self):
return 'B'
+
# Predicates for restricting application of straight composition.
def bothForward(left, right):
return left.dir().is_forward() and right.dir().is_forward()
+
def bothBackward(left, right):
return left.dir().is_backward() and right.dir().is_backward()
-# Predicates for crossed composition
+# Predicates for crossed composition
def crossedDirs(left, right):
return left.dir().is_forward() and right.dir().is_backward()
+
def backwardBxConstraint(left, right):
# The functors must be crossed inwards
if not crossedDirs(left, right):
@@ -184,6 +204,7 @@ def backwardBxConstraint(left, right):
# The resulting argument category is restricted to be primitive
return left.arg().is_primitive()
+
# Straight composition combinators
ForwardComposition = ForwardCombinator(UndirectedComposition(),
forwardOnly)
@@ -194,6 +215,7 @@ BackwardComposition = BackwardCombinator(UndirectedComposition(),
BackwardBx = BackwardCombinator(UndirectedComposition(), backwardBxConstraint,
suffix='x')
+
@python_2_unicode_compatible
class UndirectedSubstitution(UndirectedBinaryCombinator):
"""
@@ -219,17 +241,20 @@ class UndirectedSubstitution(UndirectedBinaryCombinator):
def combine(self, function, argument):
if self.can_combine(function, argument):
- yield FunctionalCategory(function.res().res(), argument.arg(), argument.dir())
+ yield FunctionalCategory(function.res().res(), argument.arg(),
+ argument.dir())
def __str__(self):
return 'S'
+
# Predicate for forward substitution
def forwardSConstraint(left, right):
if not bothForward(left, right):
return False
return left.res().dir().is_forward() and left.arg().is_primitive()
+
# Predicate for backward crossed substitution
def backwardSxConstraint(left, right):
if not left.dir().can_cross() and right.dir().can_cross():
@@ -238,6 +263,7 @@ def backwardSxConstraint(left, right):
return False
return right.res().dir().is_backward() and right.arg().is_primitive()
+
# Instances of substitution combinators
ForwardSubstitution = ForwardCombinator(UndirectedSubstitution(),
forwardSConstraint)
@@ -252,6 +278,7 @@ def innermostFunction(categ):
categ = categ.res()
return categ
+
@python_2_unicode_compatible
class UndirectedTypeRaise(UndirectedBinaryCombinator):
"""
@@ -288,12 +315,14 @@ class UndirectedTypeRaise(UndirectedBinaryCombinator):
if subs is not None:
xcat = arg.res().substitute(subs)
yield FunctionalCategory(xcat,
- FunctionalCategory(xcat, function, arg.dir()),
+ FunctionalCategory(xcat, function,
+ arg.dir()),
-(arg.dir()))
def __str__(self):
return 'T'
+
# Predicates for type-raising
# The direction of the innermost category must be towards
# the primary functor.
@@ -303,10 +332,12 @@ def forwardTConstraint(left, right):
arg = innermostFunction(right)
return arg.dir().is_backward() and arg.res().is_primitive()
+
def backwardTConstraint(left, right):
arg = innermostFunction(left)
return arg.dir().is_forward() and arg.res().is_primitive()
+
# Instances of type-raising combinators
ForwardT = ForwardCombinator(UndirectedTypeRaise(), forwardTConstraint)
BackwardT = BackwardCombinator(UndirectedTypeRaise(), backwardTConstraint)
diff --git a/nltk/ccg/lexicon.py b/nltk/ccg/lexicon.py
index 9ff0a9d..699dd87 100644
--- a/nltk/ccg/lexicon.py
+++ b/nltk/ccg/lexicon.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Graeme Gange <ggange at csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/ccg/logic.py b/nltk/ccg/logic.py
index 85652ff..39d2ba2 100644
--- a/nltk/ccg/logic.py
+++ b/nltk/ccg/logic.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Combinatory Categorial Grammar
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tanin Na Nakorn (@tanin)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/chat/__init__.py b/nltk/chat/__init__.py
index e35ac1d..574d770 100644
--- a/nltk/chat/__init__.py
+++ b/nltk/chat/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Chatbots
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/chat/eliza.py b/nltk/chat/eliza.py
index 2f37e84..c550306 100644
--- a/nltk/chat/eliza.py
+++ b/nltk/chat/eliza.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Eliza
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/chat/iesha.py b/nltk/chat/iesha.py
index f4321e3..68d52be 100644
--- a/nltk/chat/iesha.py
+++ b/nltk/chat/iesha.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Teen Chatbot
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Selina Dennis <sjmd at csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/chat/rude.py b/nltk/chat/rude.py
index cac31e9..0e571d7 100644
--- a/nltk/chat/rude.py
+++ b/nltk/chat/rude.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Rude Chatbot
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Peter Spiller <pspiller at csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/chat/suntsu.py b/nltk/chat/suntsu.py
index 7189b63..f2f1e1b 100644
--- a/nltk/chat/suntsu.py
+++ b/nltk/chat/suntsu.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Sun Tsu-Bot
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Sam Huston 2007
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/chat/util.py b/nltk/chat/util.py
index f19374c..c38b90a 100644
--- a/nltk/chat/util.py
+++ b/nltk/chat/util.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Chatbot Utilities
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -11,7 +11,9 @@ from __future__ import print_function
import re
import random
-from nltk import compat
+
+from six.moves import input
+
reflections = {
"i am" : "you are",
@@ -109,12 +111,12 @@ class Chat(object):
# Hold a conversation with a chatbot
def converse(self, quit="quit"):
- input = ""
- while input != quit:
- input = quit
- try: input = compat.raw_input(">")
+ user_input = ""
+ while user_input != quit:
+ user_input = quit
+ try: user_input = input(">")
except EOFError:
- print(input)
- if input:
- while input[-1] in "!.": input = input[:-1]
- print(self.respond(input))
+ print(user_input)
+ if user_input:
+ while user_input[-1] in "!.": user_input = user_input[:-1]
+ print(self.respond(user_input))
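The rewritten converse() loop reads through six.moves.input, which resolves to raw_input on Python 2 and input on Python 3, and renames the local variable so it no longer shadows the builtin. A small stand-alone sketch of the same loop (the prompt and quit word are arbitrary choices here, not taken from the module):

    from __future__ import print_function

    from six.moves import input   # raw_input on Python 2, input on Python 3

    def echo_until_quit(quit_word="quit"):
        """Echo trimmed user input until the quit word is typed."""
        user_input = ""
        while user_input != quit_word:
            user_input = quit_word
            try:
                user_input = input("> ")
            except EOFError:
                print(user_input)
            if user_input:
                # Drop trailing '!' and '.', as Chat.converse() does.
                while user_input and user_input[-1] in "!.":
                    user_input = user_input[:-1]
                print(user_input)

    # echo_until_quit()   # interactive; uncomment to try it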
diff --git a/nltk/chat/zen.py b/nltk/chat/zen.py
index ae119c4..c06a122 100644
--- a/nltk/chat/zen.py
+++ b/nltk/chat/zen.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Zen Chatbot
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Amy Holland <amyrh at csse.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/chunk/__init__.py b/nltk/chunk/__init__.py
index d54759b..8520202 100644
--- a/nltk/chunk/__init__.py
+++ b/nltk/chunk/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Chunkers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/chunk/api.py b/nltk/chunk/api.py
index 677ec8b..5e41f7a 100644
--- a/nltk/chunk/api.py
+++ b/nltk/chunk/api.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Chunk parsing API
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com> (minor additions)
# URL: <http://nltk.org/>
diff --git a/nltk/chunk/named_entity.py b/nltk/chunk/named_entity.py
index 91d3f4f..9867b0a 100644
--- a/nltk/chunk/named_entity.py
+++ b/nltk/chunk/named_entity.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Chunk parsing API
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -101,9 +101,9 @@ class NEChunkParserTagger(ClassifierBasedTagger):
'nextpos': nextpos,
'prevword': prevword,
'nextword': nextword,
- 'word+nextpos': '%s+%s' % (word.lower(), nextpos),
- 'pos+prevtag': '%s+%s' % (pos, prevtag),
- 'shape+prevtag': '%s+%s' % (prevshape, prevtag),
+ 'word+nextpos': '{0}+{1}'.format(word.lower(), nextpos),
+ 'pos+prevtag': '{0}+{1}'.format(pos, prevtag),
+ 'shape+prevtag': '{0}+{1}'.format(prevshape, prevtag),
}
return features
@@ -159,9 +159,9 @@ class NEChunkParser(ChunkParserI):
if len(child) == 0:
print("Warning -- empty chunk in sentence")
continue
- toks.append((child[0], 'B-%s' % child.label()))
+ toks.append((child[0], 'B-{0}'.format(child.label())))
for tok in child[1:]:
- toks.append((tok, 'I-%s' % child.label()))
+ toks.append((tok, 'I-{0}'.format(child.label())))
else:
toks.append((child, 'O'))
return toks
@@ -210,7 +210,7 @@ def load_ace_data(roots, fmt='binary', skip_bnews=True):
yield sent
def load_ace_file(textfile, fmt):
- print(' - %s' % os.path.split(textfile)[1])
+ print(' - {0}'.format(os.path.split(textfile)[1]))
annfile = textfile+'.tmx.rdc.xml'
# Read the xml file, and get a list of entities
@@ -281,12 +281,12 @@ def cmp_chunks(correct, guessed):
for (w, ct), (w, gt) in zip(correct, guessed):
if ct == gt == 'O':
if not ellipsis:
- print(" %-15s %-15s %s" % (ct, gt, w))
- print(' %-15s %-15s %s' % ('...', '...', '...'))
+ print(" {:15} {:15} {2}".format(ct, gt, w))
+ print(' {:15} {:15} {2}'.format('...', '...', '...'))
ellipsis = True
else:
ellipsis = False
- print(" %-15s %-15s %s" % (ct, gt, w))
+ print(" {:15} {:15} {2}".format(ct, gt, w))
def build_model(fmt='binary'):
print('Loading training data...')
@@ -313,8 +313,8 @@ def build_model(fmt='binary'):
if i < 3: cmp_chunks(correct, guess)
print(chunkscore)
- outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
- print('Saving chunker to %s...' % outfilename)
+ outfilename = '/tmp/ne_chunker_{0}.pickle'.format(fmt)
+ print('Saving chunker to {0}...'.format(outfilename))
with open(outfilename, 'wb') as outfile:
pickle.dump(cp, outfile, -1)
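This file, like several below, swaps %-style interpolation for str.format. For reference, the conversions used in these hunks line up roughly as follows; the variables are placeholders chosen for illustration:

    word, pos = "Dusseldorf", "NNP"

    assert '%s+%s' % (word, pos) == '{0}+{1}'.format(word, pos)
    assert 'B-%s' % pos == 'B-{0}'.format(pos)
    assert '%-15s|' % word == '{:<15}|'.format(word)      # left-aligned, min width 15
    assert '%6.4f' % 0.12345 == '{:6.4f}'.format(0.12345)
    print("all format conversions agree")

Plain strings left-align by default under str.format, which is why a bare {:15} field behaves like %-15s.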
diff --git a/nltk/chunk/regexp.py b/nltk/chunk/regexp.py
index 4ccdb9e..63855b0 100644
--- a/nltk/chunk/regexp.py
+++ b/nltk/chunk/regexp.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Regular Expression Chunkers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com> (minor additions)
# URL: <http://nltk.org/>
@@ -10,9 +10,11 @@ from __future__ import division
import re
+from six import string_types
+
from nltk.tree import Tree
from nltk.chunk.api import ChunkParserI
-from nltk.compat import python_2_unicode_compatible, string_types, unicode_repr
+from nltk.compat import python_2_unicode_compatible, unicode_repr
##//////////////////////////////////////////////////////
## ChunkString
@@ -829,9 +831,13 @@ class ChunkRuleWithContext(RegexpChunkRule):
# this should probably be made more strict than it is -- e.g., it
# currently accepts 'foo'.
CHUNK_TAG_PATTERN = re.compile(r'^((%s|<%s>)*)$' %
- ('[^\{\}<>]+',
+ ('([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+',
'[^\{\}<>]+'))
+
+
+
+
def tag_pattern2re_pattern(tag_pattern):
"""
Convert a tag pattern to a regular expression pattern. A "tag
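The relaxed CHUNK_TAG_PATTERN above additionally admits curly-brace quantifiers ({n}, {n,}, {,m}, {n,m}) outside of angle-bracketed tags, which appears intended to let chunk rules quantify tags directly. Assuming that reading, a grammar along these lines should now be accepted (the sentence is an invented example, not from the test suite):

    import nltk

    # '<JJ>{0,2}' uses a curly-brace quantifier on a tag, the construct the
    # widened pattern seems meant to allow (assumption based on the regex change).
    grammar = "NP: {<DT>?<JJ>{0,2}<NN.*>+}"
    cp = nltk.RegexpParser(grammar)

    sent = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
            ("dog", "NN"), ("barked", "VBD")]
    print(cp.parse(sent))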
diff --git a/nltk/chunk/util.py b/nltk/chunk/util.py
index 0ef7a6d..0a99dc6 100644
--- a/nltk/chunk/util.py
+++ b/nltk/chunk/util.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Chunk format conversions
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com> (minor additions)
# URL: <http://nltk.org/>
@@ -288,10 +288,10 @@ class ChunkScore(object):
:rtype: str
"""
return ("ChunkParse score:\n" +
- (" IOB Accuracy: %5.1f%%\n" % (self.accuracy()*100)) +
- (" Precision: %5.1f%%\n" % (self.precision()*100)) +
- (" Recall: %5.1f%%\n" % (self.recall()*100))+
- (" F-Measure: %5.1f%%" % (self.f_measure()*100)))
+ (" IOB Accuracy: {:5.1f}%%\n".format(self.accuracy()*100)) +
+ (" Precision: {:5.1f}%%\n".format(self.precision()*100)) +
+ (" Recall: {:5.1f}%%\n".format(self.recall()*100))+
+ (" F-Measure: {:5.1f}%%".format(self.f_measure()*100)))
# extract chunks, and assign unique id, the absolute position of
# the first word of the chunk
@@ -334,13 +334,13 @@ def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/',
text = match.group()
if text[0] == '[':
if len(stack) != 1:
- raise ValueError('Unexpected [ at char %d' % match.start())
+ raise ValueError('Unexpected [ at char {:d}'.format(match.start()))
chunk = Tree(chunk_label, [])
stack[-1].append(chunk)
stack.append(chunk)
elif text[0] == ']':
if len(stack) != 2:
- raise ValueError('Unexpected ] at char %d' % match.start())
+ raise ValueError('Unexpected ] at char {:d}'.format(match.start()))
stack.pop()
else:
if sep is None:
@@ -352,7 +352,7 @@ def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/',
stack[-1].append((word, tag))
if len(stack) != 1:
- raise ValueError('Expected ] at char %d' % len(s))
+ raise ValueError('Expected ] at char {:d}'.format(len(s)))
return stack[0]
### CONLL
@@ -384,7 +384,7 @@ def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"):
# Decode the line.
match = _LINE_RE.match(line)
if match is None:
- raise ValueError('Error on line %d' % lineno)
+ raise ValueError('Error on line {:d}'.format(lineno))
(word, tag, state, chunk_type) = match.groups()
# If it's a chunk type we don't care about, treat it as O.
@@ -461,7 +461,7 @@ def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
elif chunktag == 'O':
tree.append((word,postag))
else:
- raise ValueError("Bad conll tag %r" % chunktag)
+ raise ValueError("Bad conll tag {0!r}".format(chunktag))
return tree
def tree2conllstr(t):
@@ -512,8 +512,8 @@ def _ieer_read_text(s, root_label):
else:
stack[-1].append(piece)
except (IndexError, ValueError):
- raise ValueError('Bad IEER string (error at character %d)' %
- piece_m.start())
+ raise ValueError('Bad IEER string (error at character {:d})'.format \
+ (piece_m.start()))
if len(stack) != 1:
raise ValueError('Bad IEER string')
return stack[0]
diff --git a/nltk/classify/__init__.py b/nltk/classify/__init__.py
index 5e2bfda..2acfbfa 100644
--- a/nltk/classify/__init__.py
+++ b/nltk/classify/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Classifiers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/classify/api.py b/nltk/classify/api.py
index 2e70a23..fc32b0d 100644
--- a/nltk/classify/api.py
+++ b/nltk/classify/api.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Classifier Interface
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com> (minor additions)
# URL: <http://nltk.org/>
diff --git a/nltk/classify/decisiontree.py b/nltk/classify/decisiontree.py
index ab43796..27897ab 100644
--- a/nltk/classify/decisiontree.py
+++ b/nltk/classify/decisiontree.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Decision Tree Classifiers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -81,17 +81,17 @@ class DecisionTreeClassifier(ClassifierI):
# [xx] display default!!
if self._fname is None:
n = width-len(prefix)-15
- return '%s%s %s\n' % (prefix, '.'*n, self._label)
+ return '{0}{1} {2}\n'.format(prefix, '.'*n, self._label)
s = ''
for i, (fval, result) in enumerate(sorted(self._decisions.items())):
- hdr = '%s%s=%s? ' % (prefix, self._fname, fval)
+ hdr = '{0}{1}={2}? '.format(prefix, self._fname, fval)
n = width-15-len(hdr)
- s += '%s%s %s\n' % (hdr, '.'*(n), result._label)
+ s += '{0}{1} {2}\n'.format(hdr, '.'*(n), result._label)
if result._fname is not None and depth>1:
s += result.pretty_format(width, prefix+' ', depth-1)
if self._default is not None:
n = width-len(prefix)-21
- s += '%selse: %s %s\n' % (prefix, '.'*n, self._default._label)
+ s += '{0}else: {1} {2}\n'.format(prefix, '.'*n, self._default._label)
if self._default._fname is not None and depth>1:
s += self._default.pretty_format(width, prefix+' ', depth-1)
return s
@@ -103,24 +103,24 @@ class DecisionTreeClassifier(ClassifierI):
if statements.
"""
if self._fname is None:
- return "%sreturn %r\n" % (prefix, self._label)
+ return "{0}return {1!r}\n".format(prefix, self._label)
s = ''
for (fval, result) in sorted(self._decisions.items()):
- s += '%sif %s == %r: ' % (prefix, self._fname, fval)
+ s += '{0}if {1} == {2!r}: '.format(prefix, self._fname, fval)
if result._fname is not None and depth>1:
s += '\n'+result.pseudocode(prefix+' ', depth-1)
else:
- s += 'return %r\n' % result._label
+ s += 'return {0!r}\n'.format(result._label)
if self._default is not None:
if len(self._decisions) == 1:
- s += '%sif %s != %r: '% (prefix, self._fname,
+ s += '{0}if {1} != {2!r}: '.format(prefix, self._fname,
list(self._decisions.keys())[0])
else:
- s += '%selse: ' % (prefix,)
+ s += '{0}else: '.format(prefix)
if self._default._fname is not None and depth>1:
s += '\n'+self._default.pseudocode(prefix+' ', depth-1)
else:
- s += 'return %r\n' % self._default._label
+ s += 'return {0!r}\n'.format(self._default._label)
return s
def __str__(self):
@@ -224,7 +224,7 @@ class DecisionTreeClassifier(ClassifierI):
best_error = stump_error
best_stump = stump
if verbose:
- print(('best stump for %6d toks uses %-20s err=%6.4f' %
+ print(('best stump for {:6d} toks uses {:20} err={:6.4f}'.format \
(len(labeled_featuresets), best_stump._fname, best_error)))
return best_stump
@@ -267,12 +267,12 @@ class DecisionTreeClassifier(ClassifierI):
best_error = stump_error
best_stump = stump
if best_stump._decisions:
- descr = '%s=%s' % (best_stump._fname,
+ descr = '{0}={1}'.format(best_stump._fname,
list(best_stump._decisions.keys())[0])
else:
descr = '(default)'
if verbose:
- print(('best stump for %6d toks uses %-20s err=%6.4f' %
+ print(('best stump for {:6d} toks uses {:20} err={:6.4f}'.format \
(len(labeled_featuresets), descr, best_error)))
return best_stump
diff --git a/nltk/classify/maxent.py b/nltk/classify/maxent.py
index c42bce1..f067394 100644
--- a/nltk/classify/maxent.py
+++ b/nltk/classify/maxent.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Maximum Entropy Classifiers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Dmitry Chichkov <dchichkov at gmail.com> (TypedMaxentFeatureEncoding)
# URL: <http://nltk.org/>
@@ -52,7 +52,6 @@ performed by classes that implement the ``MaxentFeatureEncodingI``
interface.
"""
from __future__ import print_function, unicode_literals
-__docformat__ = 'epytext en'
try:
import numpy
@@ -63,6 +62,8 @@ import tempfile
import os
from collections import defaultdict
+from six import integer_types
+
from nltk import compat
from nltk.data import gzip_open_unicode
from nltk.util import OrderedDict
@@ -74,6 +75,8 @@ from nltk.classify.megam import (call_megam,
write_megam_file, parse_megam_weights)
from nltk.classify.tadm import call_tadm, write_tadm_file, parse_tadm_weights
+__docformat__ = 'epytext en'
+
######################################################################
#{ Classifier Model
######################################################################
@@ -547,7 +550,7 @@ class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
def describe(self, f_id):
# Inherit docs.
- if not isinstance(f_id, compat.integer_types):
+ if not isinstance(f_id, integer_types):
raise TypeError('describe() expected an int')
try:
self._inv_mapping
@@ -854,7 +857,7 @@ class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
# Convert input-features to joint-features:
for fname, fval in featureset.items():
- if isinstance(fval, (compat.integer_types, float)):
+ if isinstance(fval, (integer_types, float)):
# Known feature name & value:
if (fname, type(fval), label) in self._mapping:
encoding.append((self._mapping[fname, type(fval),
@@ -884,7 +887,7 @@ class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
def describe(self, f_id):
# Inherit docs.
- if not isinstance(f_id, compat.integer_types):
+ if not isinstance(f_id, integer_types):
raise TypeError('describe() expected an int')
try:
self._inv_mapping
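describe() now validates its argument against six.integer_types, i.e. (int, long) on Python 2 and (int,) on Python 3, instead of the copy that used to live in nltk.compat. The check in isolation (the function below is a simplified stand-in, not the real method):

    from six import integer_types

    def describe(f_id):
        """Reject anything that is not a plain integer feature id."""
        if not isinstance(f_id, integer_types):
            raise TypeError('describe() expected an int')
        return 'feature #%d' % f_id

    print(describe(3))          # feature #3
    try:
        describe(3.0)           # floats are rejected
    except TypeError as err:
        print(err)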
diff --git a/nltk/classify/megam.py b/nltk/classify/megam.py
index b144f31..2db484d 100644
--- a/nltk/classify/megam.py
+++ b/nltk/classify/megam.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Interface to Megam Classifier
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -26,6 +26,8 @@ from __future__ import print_function
import subprocess
+from six import string_types
+
from nltk import compat
from nltk.internals import find_binary
try:
@@ -155,7 +157,7 @@ def call_megam(args):
"""
Call the ``megam`` binary with the given arguments.
"""
- if isinstance(args, compat.string_types):
+ if isinstance(args, string_types):
raise TypeError('args should be a list of strings')
if _megam_bin is None:
config_megam()
@@ -171,8 +173,7 @@ def call_megam(args):
print(stderr)
raise OSError('megam command failed!')
- if isinstance(stdout, compat.string_types):
+ if isinstance(stdout, string_types):
return stdout
else:
return stdout.decode('utf-8')
-
diff --git a/nltk/classify/naivebayes.py b/nltk/classify/naivebayes.py
index 6719df1..22f0861 100644
--- a/nltk/classify/naivebayes.py
+++ b/nltk/classify/naivebayes.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Naive Bayes Classifiers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -22,7 +22,7 @@ independent, given the label:
| P(features)
Rather than computing P(featues) explicitly, the algorithm just
-calculates the denominator for each label, and normalizes them so they
+calculates the numerator for each label, and normalizes them so they
sum to one:
| P(label) * P(f1|label) * ... * P(fn|label)
diff --git a/nltk/classify/rte_classify.py b/nltk/classify/rte_classify.py
index a4a98fe..1693560 100644
--- a/nltk/classify/rte_classify.py
+++ b/nltk/classify/rte_classify.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: RTE Classifier
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -58,10 +58,10 @@ class RTEFeatureExtractor(object):
self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
'denied'])
- # Try to tokenize so that abbreviations like U.S.and monetary amounts
- # like "$23.00" are kept as tokens.
+ # Try to tokenize so that abbreviations, monetary amounts, email
+ # addresses, URLs are single tokens.
from nltk.tokenize import RegexpTokenizer
- tokenizer = RegexpTokenizer('([A-Z]\.)+|\w+|\$[\d\.]+')
+ tokenizer = RegexpTokenizer('[\w.@:/]+|\w+|\$[\d.]+')
#Get the set of word types for text and hypothesis
self.text_tokens = tokenizer.tokenize(rtepair.text)
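The replacement tokenizer pattern is meant to keep abbreviations, dollar amounts, e-mail addresses and URLs together as single tokens. A quick check on an invented sentence shows roughly what it produces:

    from nltk.tokenize import RegexpTokenizer

    tokenizer = RegexpTokenizer('[\w.@:/]+|\w+|\$[\d.]+')
    print(tokenizer.tokenize(
        "The U.S. firm paid $23.00 via http://pay.example.com"))
    # roughly: ['The', 'U.S.', 'firm', 'paid', '$23.00', 'via',
    #           'http://pay.example.com']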
diff --git a/nltk/classify/scikitlearn.py b/nltk/classify/scikitlearn.py
index 387adbf..8a3fd22 100644
--- a/nltk/classify/scikitlearn.py
+++ b/nltk/classify/scikitlearn.py
@@ -32,6 +32,8 @@ best 1000 features:
"""
from __future__ import print_function, unicode_literals
+from six.moves import zip
+
from nltk.classify.api import ClassifierI
from nltk.probability import DictionaryProbDist
from nltk import compat
@@ -111,7 +113,7 @@ class SklearnClassifier(ClassifierI):
numbers, booleans or strings.
"""
- X, y = list(compat.izip(*labeled_featuresets))
+ X, y = list(zip(*labeled_featuresets))
X = self._vectorizer.fit_transform(X)
y = self._encoder.fit_transform(y)
self._clf.fit(X, y)
diff --git a/nltk/classify/senna.py b/nltk/classify/senna.py
index 87f55fd..d8d71db 100644
--- a/nltk/classify/senna.py
+++ b/nltk/classify/senna.py
@@ -1,7 +1,7 @@
# encoding: utf-8
# Natural Language Toolkit: Senna Interface
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Rami Al-Rfou' <ralrfou at cs.stonybrook.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -22,17 +22,19 @@ system specific binary should be rebuilt. Otherwise this could introduce
misalignment errors.
The input is:
-- path to the directory that contains SENNA executables. If the path is incorrect,
+- path to the directory that contains SENNA executables. If the path is incorrect,
Senna will automatically search for executable file specified in SENNA environment variable
- List of the operations needed to be performed.
- (optionally) the encoding of the input data (default:utf-8)
+Note: Unit tests for this module can be found in test/unit/test_senna.py
+
>>> from __future__ import unicode_literals
>>> from nltk.classify import Senna
- >>> pipeline = Senna('/usr/share/senna-v2.0', ['pos', 'chk', 'ner'])
+ >>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
>>> sent = 'Dusseldorf is an international business center'.split()
- >>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)]
- [('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
+ >>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP
+ [('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
"""
@@ -42,8 +44,10 @@ from os import path, sep, environ
from subprocess import Popen, PIPE
from platform import architecture, system
+from six import text_type
+
from nltk.tag.api import TaggerI
-from nltk.compat import text_type, python_2_unicode_compatible
+from nltk.compat import python_2_unicode_compatible
_senna_url = 'http://ml.nec-labs.com/senna/'
@@ -55,29 +59,29 @@ class Senna(TaggerI):
def __init__(self, senna_path, operations, encoding='utf-8'):
self._encoding = encoding
- self._path = path.normpath(senna_path) + sep
-
- # Verifies the existence of the executable on the self._path first
+ self._path = path.normpath(senna_path) + sep
+
+ # Verifies the existence of the executable on the self._path first
#senna_binary_file_1 = self.executable(self._path)
exe_file_1 = self.executable(self._path)
if not path.isfile(exe_file_1):
- # Check for the system environment
+ # Check for the system environment
if 'SENNA' in environ:
- #self._path = path.join(environ['SENNA'],'')
- self._path = path.normpath(environ['SENNA']) + sep
+ #self._path = path.join(environ['SENNA'],'')
+ self._path = path.normpath(environ['SENNA']) + sep
exe_file_2 = self.executable(self._path)
if not path.isfile(exe_file_2):
raise OSError("Senna executable expected at %s or %s but not found" % (exe_file_1,exe_file_2))
-
+
self.operations = operations
-
+
def executable(self, base_path):
"""
The function that determines the system specific binary that should be
used in the pipeline. In case, the system is not known the default senna binary will
be used.
- """
+ """
os_name = system()
if os_name == 'Linux':
bits = architecture()[0]
@@ -89,7 +93,7 @@ class Senna(TaggerI):
if os_name == 'Darwin':
return path.join(base_path, 'senna-osx')
return path.join(base_path, 'senna')
-
+
def _map(self):
"""
A method that calculates the order of the columns that SENNA pipeline
@@ -116,11 +120,11 @@ class Senna(TaggerI):
calculated annotations/tags.
"""
encoding = self._encoding
-
+
if not path.isfile(self.executable(self._path)):
raise OSError("Senna executable expected at %s but not found" % self.executable(self._path))
-
-
+
+
# Build the senna command to run the tagger
_senna_cmd = [self.executable(self._path), '-path', self._path, '-usrtokens', '-iobtags']
_senna_cmd.extend(['-'+op for op in self.operations])
@@ -174,8 +178,6 @@ class Senna(TaggerI):
def setup_module(module):
from nose import SkipTest
try:
- tagger = Senna('/usr/share/senna-v2.0', ['pos', 'chk', 'ner'])
+ tagger = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
except OSError:
raise SkipTest("Senna executable not found")
-
-
diff --git a/nltk/classify/svm.py b/nltk/classify/svm.py
index c1d1616..98a4008 100644
--- a/nltk/classify/svm.py
+++ b/nltk/classify/svm.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: SVM-based classifier
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Leon Derczynski <leon at dcs.shef.ac.uk>
#
# URL: <http://nltk.org/>
diff --git a/nltk/classify/tadm.py b/nltk/classify/tadm.py
index 4ca101c..615523c 100644
--- a/nltk/classify/tadm.py
+++ b/nltk/classify/tadm.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Interface to TADM Classifier
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Joseph Frazee <jfrazee at mail.utexas.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -9,7 +9,8 @@ from __future__ import print_function, unicode_literals
import sys
import subprocess
-from nltk import compat
+from six import string_types
+
from nltk.internals import find_binary
try:
import numpy
@@ -73,7 +74,7 @@ def call_tadm(args):
"""
Call the ``tadm`` binary with the given arguments.
"""
- if isinstance(args, compat.string_types):
+ if isinstance(args, string_types):
raise TypeError('args should be a list of strings')
if _tadm_bin is None:
config_tadm()
diff --git a/nltk/classify/textcat.py b/nltk/classify/textcat.py
index b672e16..10c3ad2 100644
--- a/nltk/classify/textcat.py
+++ b/nltk/classify/textcat.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language ID module using TextCat algorithm
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Avital Pekker <avital.pekker at utoronto.ca>
#
# URL: <http://nltk.org/>
diff --git a/nltk/classify/util.py b/nltk/classify/util.py
index e14707b..4741948 100644
--- a/nltk/classify/util.py
+++ b/nltk/classify/util.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Classifier Utility Functions
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com> (minor additions)
# URL: <http://nltk.org/>
diff --git a/nltk/classify/weka.py b/nltk/classify/weka.py
index 3fb9353..2c0ab4b 100644
--- a/nltk/classify/weka.py
+++ b/nltk/classify/weka.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Interface to Weka Classsifiers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -15,9 +15,10 @@ import os
import subprocess
import re
import zipfile
-
from sys import stdin
-from nltk import compat
+
+from six import integer_types, string_types
+
from nltk.probability import DictionaryProbDist
from nltk.internals import java, config_java
@@ -265,9 +266,9 @@ class ARFF_Formatter:
for (fname, fval) in tok.items():
if issubclass(type(fval), bool):
ftype = '{True, False}'
- elif issubclass(type(fval), (compat.integer_types, float, bool)):
+ elif issubclass(type(fval), (integer_types, float, bool)):
ftype = 'NUMERIC'
- elif issubclass(type(fval), compat.string_types):
+ elif issubclass(type(fval), string_types):
ftype = 'STRING'
elif fval is None:
continue # can't tell the type.
@@ -329,7 +330,7 @@ class ARFF_Formatter:
def _fmt_arff_val(self, fval):
if fval is None:
return '?'
- elif isinstance(fval, (bool, compat.integer_types)):
+ elif isinstance(fval, (bool, integer_types)):
return '%s' % fval
elif isinstance(fval, float):
return '%r' % fval
diff --git a/nltk/cluster/__init__.py b/nltk/cluster/__init__.py
index 21ec307..bf37a77 100644
--- a/nltk/cluster/__init__.py
+++ b/nltk/cluster/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Clusterers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn at cs.mu.oz.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/cluster/api.py b/nltk/cluster/api.py
index 79c91ae..bf2f4ad 100644
--- a/nltk/cluster/api.py
+++ b/nltk/cluster/api.py
@@ -1,31 +1,34 @@
# Natural Language Toolkit: Clusterer Interfaces
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn at cs.mu.oz.au>
# Porting: Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
from nltk.probability import DictionaryProbDist
+@add_metaclass(ABCMeta)
class ClusterI(object):
"""
Interface covering basic clustering functionality.
"""
-
+ @abstractmethod
def cluster(self, vectors, assign_clusters=False):
"""
Assigns the vectors to clusters, learning the clustering parameters
from the data. Returns a cluster identifier for each vector.
"""
- raise NotImplementedError()
+ @abstractmethod
def classify(self, token):
"""
Classifies the token into a cluster, setting the token's CLUSTER
parameter to that cluster identifier.
"""
- raise NotImplementedError()
def likelihood(self, vector, label):
"""
@@ -51,11 +54,11 @@ class ClusterI(object):
likelihoods[cluster] /= sum
return DictionaryProbDist(likelihoods)
+ @abstractmethod
def num_clusters(self):
"""
Returns the number of clusters.
"""
- raise NotImplementedError()
def cluster_names(self):
"""
diff --git a/nltk/cluster/em.py b/nltk/cluster/em.py
index 0ac2a2c..54b42f5 100644
--- a/nltk/cluster/em.py
+++ b/nltk/cluster/em.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Expectation Maximization Clusterer
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn at cs.mu.oz.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/cluster/gaac.py b/nltk/cluster/gaac.py
index 729cc52..2ec63c4 100644
--- a/nltk/cluster/gaac.py
+++ b/nltk/cluster/gaac.py
@@ -1,10 +1,10 @@
# Natural Language Toolkit: Group Average Agglomerative Clusterer
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn at cs.mu.oz.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-from __future__ import print_function, unicode_literals
+from __future__ import print_function, unicode_literals, division
try:
import numpy
@@ -109,7 +109,7 @@ class GAAClusterer(VectorSpaceClusterer):
centroid += self._normalise(vector)
else:
centroid += vector
- centroid /= float(len(cluster))
+ centroid /= len(cluster)
self._centroids.append(centroid)
self._num_clusters = len(self._centroids)
diff --git a/nltk/cluster/kmeans.py b/nltk/cluster/kmeans.py
index 2b28b57..2da6c7c 100644
--- a/nltk/cluster/kmeans.py
+++ b/nltk/cluster/kmeans.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: K-Means Clusterer
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn at cs.mu.oz.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/cluster/util.py b/nltk/cluster/util.py
index ad46242..08930aa 100644
--- a/nltk/cluster/util.py
+++ b/nltk/cluster/util.py
@@ -1,10 +1,12 @@
# Natural Language Toolkit: Clusterer Utilities
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn at cs.mu.oz.au>
+# Contributor: J Richard Snape
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-from __future__ import print_function, unicode_literals
+from __future__ import print_function, unicode_literals, division
+from abc import abstractmethod
import copy
from sys import stdout
@@ -18,6 +20,7 @@ except ImportError:
from nltk.cluster.api import ClusterI
from nltk.compat import python_2_unicode_compatible
+
class VectorSpaceClusterer(ClusterI):
"""
Abstract clusterer which takes tokens and maps them into a vector space.
@@ -45,11 +48,12 @@ class VectorSpaceClusterer(ClusterI):
# use SVD to reduce the dimensionality
if self._svd_dimensions and self._svd_dimensions < len(vectors[0]):
- [u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors)))
+ [u, d, vt] = numpy.linalg.svd(numpy.transpose(
+ numpy.array(vectors)))
S = d[:self._svd_dimensions] * \
numpy.identity(self._svd_dimensions, numpy.float64)
- T = u[:,:self._svd_dimensions]
- Dt = vt[:self._svd_dimensions,:]
+ T = u[:, :self._svd_dimensions]
+ Dt = vt[:self._svd_dimensions, :]
vectors = numpy.transpose(numpy.dot(S, Dt))
self._Tt = numpy.transpose(T)
@@ -60,11 +64,11 @@ class VectorSpaceClusterer(ClusterI):
if assign_clusters:
return [self.classify(vector) for vector in vectors]
+ @abstractmethod
def cluster_vectorspace(self, vectors, trace):
"""
Finds the clusters using the given set of vectors.
"""
- raise NotImplementedError()
def classify(self, vector):
if self._should_normalise:
@@ -74,11 +78,11 @@ class VectorSpaceClusterer(ClusterI):
cluster = self.classify_vectorspace(vector)
return self.cluster_name(cluster)
+ @abstractmethod
def classify_vectorspace(self, vector):
"""
Returns the index of the appropriate cluster for the vector.
"""
- raise NotImplementedError()
def likelihood(self, vector, label):
if self._should_normalise:
@@ -110,6 +114,7 @@ class VectorSpaceClusterer(ClusterI):
"""
return vector / sqrt(numpy.dot(vector, vector))
+
def euclidean_distance(u, v):
"""
Returns the euclidean distance between vectors u and v. This is equivalent
@@ -118,12 +123,15 @@ def euclidean_distance(u, v):
diff = u - v
return sqrt(numpy.dot(diff, diff))
+
def cosine_distance(u, v):
"""
- Returns 1 minus the cosine of the angle between vectors v and u. This is equal to
- 1 - (u.v / |u||v|).
+ Returns 1 minus the cosine of the angle between vectors v and u. This is
+ equal to 1 - (u.v / |u||v|).
"""
- return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))
+ return 1 - (numpy.dot(u, v) / (
+ sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))
+
class _DendrogramNode(object):
""" Tree node of a dendrogram. """
@@ -164,6 +172,9 @@ class _DendrogramNode(object):
groups.append(node.leaves())
return groups
+ def __lt__(self, comparator):
+ return cosine_distance(self._value, comparator._value) < 0
+
@python_2_unicode_compatible
class Dendrogram(object):
@@ -214,7 +225,8 @@ class Dendrogram(object):
def show(self, leaf_labels=[]):
"""
Print the dendrogram in ASCII art to standard out.
- :param leaf_labels: an optional list of strings to use for labeling the leaves
+ :param leaf_labels: an optional list of strings to use for labeling the
+ leaves
:type leaf_labels: list
"""
@@ -235,30 +247,35 @@ class Dendrogram(object):
# find the bottom row and the best cell width
width = max(map(len, last_row)) + 1
- lhalf = width / 2
- rhalf = width - lhalf - 1
+ lhalf = width // 2
+ rhalf = int(width - lhalf - 1)
# display functions
def format(centre, left=' ', right=' '):
return '%s%s%s' % (lhalf*left, centre, right*rhalf)
+
def display(str):
stdout.write(str)
# for each merge, top down
queue = [(root._value, root)]
- verticals = [ format(' ') for leaf in leaves ]
+ verticals = [format(' ') for leaf in leaves]
while queue:
priority, node = queue.pop()
- child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children))
+ child_left_leaf = list(map(
+ lambda c: c.leaves(False)[0], node._children))
indices = list(map(leaves.index, child_left_leaf))
if child_left_leaf:
min_idx = min(indices)
max_idx = max(indices)
for i in range(len(leaves)):
if leaves[i] in child_left_leaf:
- if i == min_idx: display(format(JOIN, ' ', HLINK))
- elif i == max_idx: display(format(JOIN, HLINK, ' '))
- else: display(format(JOIN, HLINK, HLINK))
+ if i == min_idx:
+ display(format(JOIN, ' ', HLINK))
+ elif i == max_idx:
+ display(format(JOIN, HLINK, ' '))
+ else:
+ display(format(JOIN, HLINK, HLINK))
verticals[i] = format(VLINK)
elif min_idx <= i <= max_idx:
display(format(HLINK, HLINK, HLINK))
@@ -285,5 +302,3 @@ class Dendrogram(object):
root = self._items[0]
leaves = root.leaves(False)
return '<Dendrogram with %d leaves>' % len(leaves)
-
-
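nltk/cluster/gaac.py and nltk/cluster/util.py above both add division to their __future__ imports, so / means true division even on Python 2; integer quantities such as the dendrogram cell width therefore switch to floor division. The distinction in miniature:

    from __future__ import division

    width = 7
    lhalf = width // 2       # floor division: 3, suitable for widths and indices
    half = width / 2         # true division under the __future__ import: 3.5
    print(lhalf, half)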
diff --git a/nltk/collections.py b/nltk/collections.py
new file mode 100644
index 0000000..d915c1f
--- /dev/null
+++ b/nltk/collections.py
@@ -0,0 +1,688 @@
+# Natural Language Toolkit: Collections
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Steven Bird <stevenbird1 at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+from __future__ import print_function, absolute_import
+
+import locale
+import re
+import types
+import textwrap
+import pydoc
+import bisect
+import os
+from itertools import islice, chain, combinations
+from functools import total_ordering
+from collections import defaultdict, deque, Counter
+
+from six import text_type
+
+from nltk.internals import slice_bounds, raise_unorderable_types
+from nltk.compat import python_2_unicode_compatible
+
+
+##########################################################################
+# Ordered Dictionary
+##########################################################################
+
+class OrderedDict(dict):
+ def __init__(self, data=None, **kwargs):
+ self._keys = self.keys(data, kwargs.get('keys'))
+ self._default_factory = kwargs.get('default_factory')
+ if data is None:
+ dict.__init__(self)
+ else:
+ dict.__init__(self, data)
+
+ def __delitem__(self, key):
+ dict.__delitem__(self, key)
+ self._keys.remove(key)
+
+ def __getitem__(self, key):
+ try:
+ return dict.__getitem__(self, key)
+ except KeyError:
+ return self.__missing__(key)
+
+ def __iter__(self):
+ return (key for key in self.keys())
+
+ def __missing__(self, key):
+ if not self._default_factory and key not in self._keys:
+ raise KeyError()
+ return self._default_factory()
+
+ def __setitem__(self, key, item):
+ dict.__setitem__(self, key, item)
+ if key not in self._keys:
+ self._keys.append(key)
+
+ def clear(self):
+ dict.clear(self)
+ self._keys.clear()
+
+ def copy(self):
+ d = dict.copy(self)
+ d._keys = self._keys
+ return d
+
+ def items(self):
+ # returns iterator under python 3 and list under python 2
+ return zip(self.keys(), self.values())
+
+ def keys(self, data=None, keys=None):
+ if data:
+ if keys:
+ assert isinstance(keys, list)
+ assert len(data) == len(keys)
+ return keys
+ else:
+ assert isinstance(data, dict) or \
+ isinstance(data, OrderedDict) or \
+ isinstance(data, list)
+ if isinstance(data, dict) or isinstance(data, OrderedDict):
+ return data.keys()
+ elif isinstance(data, list):
+ return [key for (key, value) in data]
+ elif '_keys' in self.__dict__:
+ return self._keys
+ else:
+ return []
+
+ def popitem(self):
+ if not self._keys:
+ raise KeyError()
+
+ key = self._keys.pop()
+ value = self[key]
+ del self[key]
+ return (key, value)
+
+ def setdefault(self, key, failobj=None):
+ dict.setdefault(self, key, failobj)
+ if key not in self._keys:
+ self._keys.append(key)
+
+ def update(self, data):
+ dict.update(self, data)
+ for key in self.keys(data):
+ if key not in self._keys:
+ self._keys.append(key)
+
+ def values(self):
+ # returns iterator under python 3
+ return map(self.get, self._keys)
+
+######################################################################
+# Lazy Sequences
+######################################################################
+
+@total_ordering
+@python_2_unicode_compatible
+class AbstractLazySequence(object):
+ """
+ An abstract base class for read-only sequences whose values are
+ computed as needed. Lazy sequences act like tuples -- they can be
+ indexed, sliced, and iterated over; but they may not be modified.
+
+ The most common application of lazy sequences in NLTK is for
+ corpus view objects, which provide access to the contents of a
+ corpus without loading the entire corpus into memory, by loading
+ pieces of the corpus from disk as needed.
+
+ The result of modifying a mutable element of a lazy sequence is
+ undefined. In particular, the modifications made to the element
+ may or may not persist, depending on whether and when the lazy
+ sequence caches that element's value or reconstructs it from
+ scratch.
+
+ Subclasses are required to define two methods: ``__len__()``
+ and ``iterate_from()``.
+ """
+ def __len__(self):
+ """
+ Return the number of tokens in the corpus file underlying this
+ corpus view.
+ """
+ raise NotImplementedError('should be implemented by subclass')
+
+ def iterate_from(self, start):
+ """
+ Return an iterator that generates the tokens in the corpus
+ file underlying this corpus view, starting at the token number
+ ``start``. If ``start>=len(self)``, then this iterator will
+ generate no tokens.
+ """
+ raise NotImplementedError('should be implemented by subclass')
+
+ def __getitem__(self, i):
+ """
+ Return the *i* th token in the corpus file underlying this
+ corpus view. Negative indices and spans are both supported.
+ """
+ if isinstance(i, slice):
+ start, stop = slice_bounds(self, i)
+ return LazySubsequence(self, start, stop)
+ else:
+ # Handle negative indices
+ if i < 0: i += len(self)
+ if i < 0: raise IndexError('index out of range')
+ # Use iterate_from to extract it.
+ try:
+ return next(self.iterate_from(i))
+ except StopIteration:
+ raise IndexError('index out of range')
+
+ def __iter__(self):
+ """Return an iterator that generates the tokens in the corpus
+ file underlying this corpus view."""
+ return self.iterate_from(0)
+
+ def count(self, value):
+ """Return the number of times this list contains ``value``."""
+ return sum(1 for elt in self if elt==value)
+
+ def index(self, value, start=None, stop=None):
+ """Return the index of the first occurrence of ``value`` in this
+ list that is greater than or equal to ``start`` and less than
+ ``stop``. Negative start and stop values are treated like negative
+ slice bounds -- i.e., they count from the end of the list."""
+ start, stop = slice_bounds(self, slice(start, stop))
+ for i, elt in enumerate(islice(self, start, stop)):
+ if elt == value: return i+start
+ raise ValueError('index(x): x not in list')
+
+ def __contains__(self, value):
+ """Return true if this list contains ``value``."""
+ return bool(self.count(value))
+
+ def __add__(self, other):
+ """Return a list concatenating self with other."""
+ return LazyConcatenation([self, other])
+
+ def __radd__(self, other):
+ """Return a list concatenating other with self."""
+ return LazyConcatenation([other, self])
+
+ def __mul__(self, count):
+ """Return a list concatenating self with itself ``count`` times."""
+ return LazyConcatenation([self] * count)
+
+ def __rmul__(self, count):
+ """Return a list concatenating self with itself ``count`` times."""
+ return LazyConcatenation([self] * count)
+
+ _MAX_REPR_SIZE = 60
+ def __repr__(self):
+ """
+ Return a string representation for this corpus view that is
+ similar to a list's representation; but if it would be more
+ than 60 characters long, it is truncated.
+ """
+ pieces = []
+ length = 5
+ for elt in self:
+ pieces.append(repr(elt))
+ length += len(pieces[-1]) + 2
+ if length > self._MAX_REPR_SIZE and len(pieces) > 2:
+ return '[%s, ...]' % text_type(', ').join(pieces[:-1])
+ else:
+ return '[%s]' % text_type(', ').join(pieces)
+
+ def __eq__(self, other):
+ return (type(self) == type(other) and list(self) == list(other))
+
+ def __ne__(self, other):
+ return not self == other
+
+ def __lt__(self, other):
+ if type(other) != type(self):
+ raise_unorderable_types("<", self, other)
+ return list(self) < list(other)
+
+ def __hash__(self):
+ """
+ :raise ValueError: Corpus view objects are unhashable.
+ """
+ raise ValueError('%s objects are unhashable' %
+ self.__class__.__name__)
+
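A hedged sketch of the two-method contract described in the ``AbstractLazySequence`` docstring above; ``LazyRange`` is a made-up demo subclass, not part of NLTK:

    >>> from nltk.collections import AbstractLazySequence
    >>> class LazyRange(AbstractLazySequence):   # hypothetical subclass
    ...     def __init__(self, n):
    ...         self._n = n
    ...     def __len__(self):
    ...         return self._n
    ...     def iterate_from(self, start):
    ...         return iter(range(start, self._n))
    >>> seq = LazyRange(5)
    >>> seq[2], len(seq), list(seq)
    (2, 5, [0, 1, 2, 3, 4])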
+
+class LazySubsequence(AbstractLazySequence):
+ """
+ A subsequence produced by slicing a lazy sequence. This slice
+ keeps a reference to its source sequence, and generates its values
+ by looking them up in the source sequence.
+ """
+
+ MIN_SIZE = 100
+ """
+ The minimum size for which lazy slices should be created. If
+ ``LazySubsequence()`` is called with a subsequence that is
+ shorter than ``MIN_SIZE``, then a plain list is returned instead.
+ """
+
+ def __new__(cls, source, start, stop):
+ """
+ Construct a new slice from a given underlying sequence. The
+ ``start`` and ``stop`` indices should be absolute indices --
+ i.e., they should not be negative (for indexing from the back
+ of a list) or greater than the length of ``source``.
+ """
+ # If the slice is small enough, just use a tuple.
+ if stop-start < cls.MIN_SIZE:
+ return list(islice(source.iterate_from(start), stop-start))
+ else:
+ return object.__new__(cls)
+
+ def __init__(self, source, start, stop):
+ self._source = source
+ self._start = start
+ self._stop = stop
+
+ def __len__(self):
+ return self._stop - self._start
+
+ def iterate_from(self, start):
+ return islice(self._source.iterate_from(start+self._start),
+ max(0, len(self)-start))
+
+
+class LazyConcatenation(AbstractLazySequence):
+ """
+ A lazy sequence formed by concatenating a list of lists. This
+ underlying list of lists may itself be lazy. ``LazyConcatenation``
+ maintains an index that it uses to keep track of the relationship
+ between offsets in the concatenated lists and offsets in the
+ sublists.
+ """
+ def __init__(self, list_of_lists):
+ self._list = list_of_lists
+ self._offsets = [0]
+
+ def __len__(self):
+ if len(self._offsets) <= len(self._list):
+ for tok in self.iterate_from(self._offsets[-1]): pass
+ return self._offsets[-1]
+
+ def iterate_from(self, start_index):
+ if start_index < self._offsets[-1]:
+ sublist_index = bisect.bisect_right(self._offsets, start_index)-1
+ else:
+ sublist_index = len(self._offsets)-1
+
+ index = self._offsets[sublist_index]
+
+ # Construct an iterator over the sublists.
+ if isinstance(self._list, AbstractLazySequence):
+ sublist_iter = self._list.iterate_from(sublist_index)
+ else:
+ sublist_iter = islice(self._list, sublist_index, None)
+
+ for sublist in sublist_iter:
+ if sublist_index == (len(self._offsets)-1):
+ assert index+len(sublist) >= self._offsets[-1], (
+ 'offsets not monotonically increasing!')
+ self._offsets.append(index+len(sublist))
+ else:
+ assert self._offsets[sublist_index+1] == index+len(sublist), (
+ 'inconsistent list value (num elts)')
+
+ for value in sublist[max(0, start_index-index):]:
+ yield value
+
+ index += len(sublist)
+ sublist_index += 1
+
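To illustrate how the ``MIN_SIZE`` cut-off in ``LazySubsequence.__new__`` interacts with slicing a ``LazyConcatenation`` (a sketch with made-up data, not from the patch):

    >>> from nltk.collections import LazyConcatenation
    >>> lc = LazyConcatenation([list(range(150)), list(range(150, 300))])
    >>> lc[149], lc[150]            # indexing crosses the sublist boundary on demand
    (149, 150)
    >>> type(lc[2:5]).__name__      # slices shorter than MIN_SIZE are plain lists
    'list'
    >>> type(lc[0:200]).__name__    # longer slices stay lazy
    'LazySubsequence'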
+
+class LazyMap(AbstractLazySequence):
+ """
+ A lazy sequence whose elements are formed by applying a given
+ function to each element in one or more underlying lists. The
+ function is applied lazily -- i.e., when you read a value from the
+ list, ``LazyMap`` will calculate that value by applying its
+ function to the underlying lists' value(s). ``LazyMap`` is
+ essentially a lazy version of the Python primitive function
+ ``map``. In particular, the following two expressions are
+ equivalent:
+
+ >>> from nltk.collections import LazyMap
+ >>> function = str
+ >>> sequence = [1,2,3]
+ >>> map(function, sequence) # doctest: +SKIP
+ ['1', '2', '3']
+ >>> list(LazyMap(function, sequence))
+ ['1', '2', '3']
+
+ Like the Python ``map`` primitive, if the source lists do not have
+ equal size, then the value None will be supplied for the
+ 'missing' elements.
+
+ Lazy maps can be useful for conserving memory, in cases where
+ individual values take up a lot of space. This is especially true
+ if the underlying list's values are constructed lazily, as is the
+ case with many corpus readers.
+
+ A typical example of a use case for this class is performing
+ feature detection on the tokens in a corpus. Since featuresets
+ are encoded as dictionaries, which can take up a lot of memory,
+ using a ``LazyMap`` can significantly reduce memory usage when
+ training and running classifiers.
+ """
+ def __init__(self, function, *lists, **config):
+ """
+ :param function: The function that should be applied to
+ elements of ``lists``. It should take as many arguments
+ as there are ``lists``.
+ :param lists: The underlying lists.
+ :param cache_size: Determines the size of the cache used
+ by this lazy map. (default=5)
+ """
+ if not lists:
+ raise TypeError('LazyMap requires at least two args')
+
+ self._lists = lists
+ self._func = function
+ self._cache_size = config.get('cache_size', 5)
+ self._cache = ({} if self._cache_size > 0 else None)
+
+ # Note: don't simplify this to bool(sum(...)): that would make
+ # _all_lazy true whenever at least one list is an AbstractLazySequence,
+ # whereas it should be true only when *all* of the lists are lazy.
+ self._all_lazy = sum(isinstance(lst, AbstractLazySequence)
+ for lst in lists) == len(lists)
+
+ def iterate_from(self, index):
+ # Special case: one lazy sublist
+ if len(self._lists) == 1 and self._all_lazy:
+ for value in self._lists[0].iterate_from(index):
+ yield self._func(value)
+ return
+
+ # Special case: one non-lazy sublist
+ elif len(self._lists) == 1:
+ while True:
+ try: yield self._func(self._lists[0][index])
+ except IndexError: return
+ index += 1
+
+ # Special case: n lazy sublists
+ elif self._all_lazy:
+ iterators = [lst.iterate_from(index) for lst in self._lists]
+ while True:
+ elements = []
+ for iterator in iterators:
+ try: elements.append(next(iterator))
+ except: elements.append(None)
+ if elements == [None] * len(self._lists):
+ return
+ yield self._func(*elements)
+ index += 1
+
+ # general case
+ else:
+ while True:
+ try: elements = [lst[index] for lst in self._lists]
+ except IndexError:
+ elements = [None] * len(self._lists)
+ for i, lst in enumerate(self._lists):
+ try: elements[i] = lst[index]
+ except IndexError: pass
+ if elements == [None] * len(self._lists):
+ return
+ yield self._func(*elements)
+ index += 1
+
+ def __getitem__(self, index):
+ if isinstance(index, slice):
+ sliced_lists = [lst[index] for lst in self._lists]
+ return LazyMap(self._func, *sliced_lists)
+ else:
+ # Handle negative indices
+ if index < 0: index += len(self)
+ if index < 0: raise IndexError('index out of range')
+ # Check the cache
+ if self._cache is not None and index in self._cache:
+ return self._cache[index]
+ # Calculate the value
+ try: val = next(self.iterate_from(index))
+ except StopIteration:
+ raise IndexError('index out of range')
+ # Update the cache
+ if self._cache is not None:
+ if len(self._cache) > self._cache_size:
+ self._cache.popitem() # discard random entry
+ self._cache[index] = val
+ # Return the value
+ return val
+
+ def __len__(self):
+ return max(len(lst) for lst in self._lists)
+
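A sketch of the feature-detection use case mentioned in the ``LazyMap`` docstring above; ``extract_features`` is a made-up feature detector, not an NLTK function:

    >>> from nltk.collections import LazyMap
    >>> def extract_features(token):            # hypothetical feature detector
    ...     return {'length': len(token), 'first': token[0]}
    >>> tokens = ['lazy', 'sequences', 'save', 'memory']
    >>> featuresets = LazyMap(extract_features, tokens)
    >>> featuresets[1] == {'length': 9, 'first': 's'}   # computed on demand
    True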
+
+class LazyZip(LazyMap):
+ """
+ A lazy sequence whose elements are tuples, each containing the i-th
+ element from each of the argument sequences. The returned list is
+ truncated in length to the length of the shortest argument sequence. The
+ tuples are constructed lazily -- i.e., when you read a value from the
+ list, ``LazyZip`` will calculate that value by forming a tuple from
+ the i-th element of each of the argument sequences.
+
+ ``LazyZip`` is essentially a lazy version of the Python primitive function
+ ``zip``. In particular, an evaluated LazyZip is equivalent to a zip:
+
+ >>> from nltk.collections import LazyZip
+ >>> sequence1, sequence2 = [1, 2, 3], ['a', 'b', 'c']
+ >>> zip(sequence1, sequence2) # doctest: +SKIP
+ [(1, 'a'), (2, 'b'), (3, 'c')]
+ >>> list(LazyZip(sequence1, sequence2))
+ [(1, 'a'), (2, 'b'), (3, 'c')]
+ >>> sequences = [sequence1, sequence2, [6,7,8,9]]
+ >>> list(zip(*sequences)) == list(LazyZip(*sequences))
+ True
+
+ Lazy zips can be useful for conserving memory in cases where the argument
+ sequences are particularly long.
+
+ A typical example of a use case for this class is combining long sequences
+ of gold standard and predicted values in a classification or tagging task
+ in order to calculate accuracy. By constructing tuples lazily and
+ avoiding the creation of an additional long sequence, memory usage can be
+ significantly reduced.
+ """
+ def __init__(self, *lists):
+ """
+ :param lists: the underlying lists
+ :type lists: list(list)
+ """
+ LazyMap.__init__(self, lambda *elts: elts, *lists)
+
+ def iterate_from(self, index):
+ iterator = LazyMap.iterate_from(self, index)
+ while index < len(self):
+ yield next(iterator)
+ index += 1
+ return
+
+ def __len__(self):
+ return min(len(lst) for lst in self._lists)
+
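A sketch of the gold-vs-predicted accuracy use case described in the ``LazyZip`` docstring; the tag sequences below are invented for illustration:

    >>> from nltk.collections import LazyZip
    >>> gold = ['DT', 'NN', 'VB', 'DT', 'NN']
    >>> predicted = ['DT', 'NN', 'NN', 'DT', 'NN']
    >>> matches = sum(1 for g, p in LazyZip(gold, predicted) if g == p)
    >>> matches / float(len(gold))
    0.8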
+
+class LazyEnumerate(LazyZip):
+ """
+ A lazy sequence whose elements are tuples, each containing a count (from
+ zero) and a value yielded by the underlying sequence. ``LazyEnumerate`` is
+ useful for obtaining an indexed list. The tuples are constructed lazily
+ -- i.e., when you read a value from the list, ``LazyEnumerate`` will
+ calculate that value by forming a tuple from the count of the i-th
+ element and the i-th element of the underlying sequence.
+
+ ``LazyEnumerate`` is essentially a lazy version of the Python primitive
+ function ``enumerate``. In particular, the following two expressions are
+ equivalent:
+
+ >>> from nltk.collections import LazyEnumerate
+ >>> sequence = ['first', 'second', 'third']
+ >>> list(enumerate(sequence))
+ [(0, 'first'), (1, 'second'), (2, 'third')]
+ >>> list(LazyEnumerate(sequence))
+ [(0, 'first'), (1, 'second'), (2, 'third')]
+
+ Lazy enumerations can be useful for conserving memory in cases where the
+ argument sequences are particularly long.
+
+ A typical example of a use case for this class is obtaining an indexed
+ list for a long sequence of values. By constructing tuples lazily and
+ avoiding the creation of an additional long sequence, memory usage can be
+ significantly reduced.
+ """
+
+ def __init__(self, lst):
+ """
+ :param lst: the underlying list
+ :type lst: list
+ """
+ LazyZip.__init__(self, range(len(lst)), lst)
+
+class LazyIteratorList(AbstractLazySequence):
+ """
+ Wraps an iterator, loading its elements on demand
+ and making them subscriptable.
+ __repr__ displays only the first few elements.
+ """
+ def __init__(self, it, known_len=None):
+ self._it = it
+ self._len = known_len
+ self._cache = []
+
+ def __len__(self):
+ if self._len:
+ return self._len
+ for x in self.iterate_from(len(self._cache)):
+ pass
+ self._len = len(self._cache)
+ return self._len
+
+ def iterate_from(self, start):
+ """Create a new iterator over this list starting at the given offset."""
+ while len(self._cache)<start:
+ v = next(self._it)
+ self._cache.append(v)
+ i = start
+ while i<len(self._cache):
+ yield self._cache[i]
+ i += 1
+ while True:
+ v = next(self._it)
+ self._cache.append(v)
+ yield v
+ i += 1
+
+ def __add__(self, other):
+ """Return a list concatenating self with other."""
+ return type(self)(chain(self, other))
+
+ def __radd__(self, other):
+ """Return a list concatenating other with self."""
+ return type(self)(chain(other, self))
+
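A small sketch of the on-demand caching done by ``LazyIteratorList`` (illustrative only):

    >>> from nltk.collections import LazyIteratorList
    >>> lazy = LazyIteratorList(iter(range(100)))
    >>> lazy[3], lazy[7]    # the wrapped iterator is consumed only as far as needed
    (3, 7)
    >>> lazy[2]             # already cached; the iterator is not advanced again
    2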
+######################################################################
+# Trie Implementation
+######################################################################
+class Trie(defaultdict):
+ """A Trie implementation for strings"""
+ LEAF = True
+
+ def __init__(self, strings=None):
+ """Builds a Trie object, which is built around a ``defaultdict``
+
+ If ``strings`` is provided, each string in the ``strings`` list
+ is added to the Trie. Otherwise, an empty Trie is constructed.
+
+ :param strings: List of strings to insert into the trie
+ (Default is ``None``)
+ :type strings: list(str)
+
+ """
+ defaultdict.__init__(self, Trie)
+ if strings:
+ for string in strings:
+ self.insert(string)
+
+ def insert(self, string):
+ """Inserts ``string`` into the Trie
+
+ :param string: String to insert into the trie
+ :type string: str
+
+ :Example:
+
+ >>> from nltk.collections import Trie
+ >>> trie = Trie(["ab"])
+ >>> trie
+ defaultdict(<class 'nltk.collections.Trie'>, {'a': defaultdict(<class 'nltk.collections.Trie'>, {'b': defaultdict(<class 'nltk.collections.Trie'>, {True: None})})})
+
+ """
+ if len(string):
+ self[string[0]].insert(string[1:])
+ else:
+ # mark that the string is complete
+ self[Trie.LEAF] = None
+
+ def __str__(self):
+ return str(self.as_dict())
+
+ def as_dict(self, d=None):
+ """Convert ``defaultdict`` to common ``dict`` representation.
+
+ :param d: A defaultdict containing strings mapped to nested defaultdicts.
+ This is the structure of the trie. (Default is None)
+ :type d: defaultdict(str -> defaultdict)
+ :return: Even though ``defaultdict`` is a subclass of ``dict`` and thus
+ can be converted to a simple ``dict`` using ``dict()``, in our case
+ it is a nested ``defaultdict``, so the conversion is done recursively
+ to give the plain ``dict`` representation of the ``Trie`` without the
+ ``defaultdict(<class 'nltk.collections.Trie'>, ...`` wrappers
+ :rtype: dict(str -> dict(bool -> None))
+ Note: there can be an arbitrarily deeply nested
+ ``dict(str -> dict(str -> dict(..))``, but the last
+ level will have ``dict(str -> dict(bool -> None))``
+
+ :Example:
+
+ >>> from nltk.collections import Trie
+ >>> trie = Trie(["abc", "def"])
+ >>> expected = {'a': {'b': {'c': {True: None}}}, 'd': {'e': {'f': {True: None}}}}
+ >>> trie.as_dict() == expected
+ True
+
+ """
+ def _default_to_regular(d):
+ """
+ Source: http://stackoverflow.com/a/26496899/4760801
+
+ :param d: Nested ``defaultdict`` to convert to regular ``dict``
+ :type d: defaultdict(str -> defaultdict(...))
+ :return: A dict representation of the defaultdict
+ :rtype: dict(str -> dict(str -> ...))
+
+ :Example:
+
+ >>> from collections import defaultdict
+ >>> d = defaultdict(defaultdict)
+ >>> d["one"]["two"] = "three"
+ >>> d
+ defaultdict(<type 'collections.defaultdict'>, {'one': defaultdict(None, {'two': 'three'})})
+ >>> _default_to_regular(d)
+ {'one': {'two': 'three'}}
+
+ """
+ if isinstance(d, defaultdict):
+ d = {k: _default_to_regular(v) for k, v in d.items()}
+ return d
+
+ return _default_to_regular(self)
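A brief sketch of checking membership against the ``Trie`` above via ``Trie.LEAF`` (illustrative; note that because ``Trie`` subclasses ``defaultdict``, indexing with an unseen character silently creates an empty node):

    >>> from nltk.collections import Trie
    >>> trie = Trie(['cat', 'car'])
    >>> Trie.LEAF in trie['c']['a']['t']    # 'cat' was inserted as a complete string
    True
    >>> Trie.LEAF in trie['c']['a']         # 'ca' is only a prefix
    False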
diff --git a/nltk/collocations.py b/nltk/collocations.py
index 18f7569..0b5a1f5 100644
--- a/nltk/collocations.py
+++ b/nltk/collocations.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Collocations and Association Measures
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Joel Nothman <jnothman at student.usyd.edu.au>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
@@ -32,7 +32,7 @@ from __future__ import print_function
# and unigram counts (raw_freq, pmi, student_t)
import itertools as _itertools
-from nltk.compat import iteritems
+from six import iteritems
from nltk.probability import FreqDist
from nltk.util import ngrams
diff --git a/nltk/compat.py b/nltk/compat.py
index b4c269a..8efda7e 100755
--- a/nltk/compat.py
+++ b/nltk/compat.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Compatibility
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -9,89 +9,32 @@
from __future__ import absolute_import, print_function
import os
import sys
-import types
-from functools import wraps
+from functools import update_wrapper, wraps
import fractions
+import unicodedata
+
+from six import string_types, text_type
# Python 2/3 compatibility layer. Based on six.
PY3 = sys.version_info[0] == 3
-PY26 = sys.version_info[:2] == (2, 6)
if PY3:
- def b(s):
- return s.encode("latin-1")
-
- def u(s):
- return s
-
- string_types = str,
- integer_types = int,
- class_types = type,
- text_type = str
- binary_type = bytes
-
- MAXSIZE = sys.maxsize
- get_im_class = lambda meth: meth.__self__.__class__
- xrange = range
- _iterkeys = "keys"
- _itervalues = "values"
- _iteritems = "items"
- from imp import reload
- raw_input = input
-
- imap = map
- izip = zip
+ def get_im_class(meth):
+ return meth.__self__.__class__
import io
StringIO = io.StringIO
BytesIO = io.BytesIO
- import html.entities as htmlentitydefs
- from urllib.request import (urlopen, ProxyHandler, build_opener,
- install_opener, getproxies, HTTPPasswordMgrWithDefaultRealm,
- ProxyBasicAuthHandler, ProxyDigestAuthHandler, Request,
- url2pathname)
- from urllib.error import HTTPError, URLError
- from urllib.parse import quote_plus, unquote_plus, urlencode
-
- from collections import Counter
-
from datetime import timezone
UTC = timezone.utc
from tempfile import TemporaryDirectory
- unichr = chr
- if sys.version_info[1] <= 1:
- def int2byte(i):
- return bytes((i,))
- else:
- # This is about 2x faster than the implementation above on 3.2+
- import operator
- int2byte = operator.methodcaller("to_bytes", 1, "big")
-
else:
- def b(s):
- return s
-
- def u(s):
- return unicode(s, "unicode_escape")
-
- string_types = basestring,
- integer_types = (int, long)
- class_types = (type, types.ClassType)
- text_type = unicode
- binary_type = str
- get_im_class = lambda meth: meth.im_class
- xrange = xrange
- _iterkeys = "iterkeys"
- _itervalues = "itervalues"
- _iteritems = "iteritems"
- reload = reload
- raw_input = raw_input
-
- from itertools import imap, izip
+ def get_im_class(meth):
+ return meth.im_class
try:
from cStringIO import StringIO
@@ -99,49 +42,6 @@ else:
from StringIO import StringIO
BytesIO = StringIO
- import htmlentitydefs
- from urllib2 import (urlopen, HTTPError, URLError,
- ProxyHandler, build_opener, install_opener,
- HTTPPasswordMgrWithDefaultRealm, ProxyBasicAuthHandler,
- ProxyDigestAuthHandler, Request)
- from urllib import getproxies, quote_plus, unquote_plus, urlencode, url2pathname
-
- # Maps py2 tkinter package structure to py3 using import hook (PEP 302)
- class TkinterPackage(object):
- def __init__(self):
- self.mod = __import__("Tkinter")
- self.__path__ = ["nltk_py2_tkinter_package_path"]
-
- def __getattr__(self, name):
- return getattr(self.mod, name)
-
- class TkinterLoader(object):
- def __init__(self):
- # module name mapping from py3 to py2
- self.module_map = {
- "tkinter": "Tkinter",
- "tkinter.filedialog": "tkFileDialog",
- "tkinter.font": "tkFont",
- "tkinter.messagebox": "tkMessageBox",
- }
-
- def find_module(self, name, path=None):
- # we are only interested in tkinter modules listed
- # in self.module_map
- if name in self.module_map:
- return self
-
- def load_module(self, name):
- if name not in sys.modules:
- if name == 'tkinter':
- mod = TkinterPackage()
- else:
- mod = __import__(self.module_map[name])
- sys.modules[name] = mod
- return sys.modules[name]
-
- sys.meta_path.insert(0, TkinterLoader())
-
from datetime import tzinfo, timedelta
ZERO = timedelta(0)
@@ -162,9 +62,6 @@ else:
UTC = UTC()
- unichr = unichr
- int2byte = chr
-
import csv
import codecs
import cStringIO
@@ -176,7 +73,8 @@ else:
see https://docs.python.org/2/library/csv.html
"""
- def __init__(self, f, dialect=csv.excel, encoding="utf-8", errors='replace', **kwds):
+ def __init__(self, f, dialect=csv.excel, encoding="utf-8",
+ errors='replace', **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
@@ -185,7 +83,7 @@ else:
self.encoder = encoder_cls(errors=errors)
def encode(self, data):
- if isinstance(data, basestring):
+ if isinstance(data, string_types):
return data.encode("utf-8")
else:
return data
@@ -241,19 +139,20 @@ else:
# up due to missing globals
if "None" not in str(ex):
raise
- print("ERROR: {!r} while cleaning up {!r}".format(ex, self,),
+ print("ERROR: {!r} while cleaning up {!r}".format(ex,
+ self),
file=sys.stderr)
return
self._closed = True
if _warn:
self._warn("Implicitly cleaning up {!r}".format(self),
- ResourceWarning)
+ Warning)
def __exit__(self, exc, value, tb):
self.cleanup()
def __del__(self):
- # Issue a ResourceWarning if implicit cleanup needed
+ # Issue a Warning if implicit cleanup needed
self.cleanup(_warn=True)
# XXX (ncoghlan): The following code attempts to make
@@ -274,7 +173,8 @@ else:
for name in self._listdir(path):
fullname = self._path_join(path, name)
try:
- isdir = self._isdir(fullname) and not self._islink(fullname)
+ isdir = (self._isdir(fullname) and not
+ self._islink(fullname))
except OSError:
isdir = False
if isdir:
@@ -289,251 +189,10 @@ else:
except OSError:
pass
- if PY26:
- from operator import itemgetter
- from heapq import nlargest
- from itertools import repeat, ifilter
-
- class Counter(dict):
- '''Dict subclass for counting hashable objects. Sometimes called a bag
- or multiset. Elements are stored as dictionary keys and their counts
- are stored as dictionary values.
-
- >>> Counter('zyzygy')
- Counter({'y': 3, 'z': 2, 'g': 1})
-
- '''
-
- def __init__(self, iterable=None, **kwds):
- '''Create a new, empty Counter object. And if given, count elements
- from an input iterable. Or, initialize the count from another mapping
- of elements to their counts.
-
- >>> Counter() # a new, empty counter
- >>> Counter('gallahad') # a new counter from an iterable
- >>> Counter({'a': 4, 'b': 2}) # a new counter from a mapping
- >>> Counter(a=4, b=2) # a new counter from keyword args
-
- '''
- self.update(iterable, **kwds)
-
- def __missing__(self, key):
- return 0
-
- def most_common(self, n=None):
- '''List the n most common elements and their counts from the most
- common to the least. If n is None, then list all element counts.
-
- >>> Counter('abracadabra').most_common(3)
- [('a', 5), ('r', 2), ('b', 2)]
-
- '''
- if n is None:
- return sorted(self.iteritems(), key=itemgetter(1), reverse=True)
- return nlargest(n, self.iteritems(), key=itemgetter(1))
-
- def elements(self):
- '''Iterator over elements repeating each as many times as its count.
-
- >>> c = Counter('ABCABC')
- >>> sorted(c.elements())
- ['A', 'A', 'B', 'B', 'C', 'C']
-
- If an element's count has been set to zero or is a negative number,
- elements() will ignore it.
-
- '''
- for elem, count in self.iteritems():
- for _ in repeat(None, count):
- yield elem
-
- # Override dict methods where the meaning changes for Counter
- # objects.
-
- @classmethod
- def fromkeys(cls, iterable, v=None):
- raise NotImplementedError(
- 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.')
-
- def update(self, iterable=None, **kwds):
- '''Like dict.update() but add counts instead of replacing them.
-
- Source can be an iterable, a dictionary, or another Counter instance.
-
- >>> c = Counter('which')
- >>> c.update('witch') # add elements from another iterable
- >>> d = Counter('watch')
- >>> c.update(d) # add elements from another counter
- >>> c['h'] # four 'h' in which, witch, and watch
- 4
-
- '''
- if iterable is not None:
- if hasattr(iterable, 'iteritems'):
- if self:
- self_get = self.get
- for elem, count in iterable.iteritems():
- self[elem] = self_get(elem, 0) + count
- else:
- # fast path when counter is empty
- dict.update(self, iterable)
- else:
- self_get = self.get
- for elem in iterable:
- self[elem] = self_get(elem, 0) + 1
- if kwds:
- self.update(kwds)
-
- def copy(self):
- 'Like dict.copy() but returns a Counter instance instead of a dict.'
- return Counter(self)
-
- def __delitem__(self, elem):
- 'Like dict.__delitem__() but does not raise KeyError for missing values.'
- if elem in self:
- dict.__delitem__(self, elem)
-
- def __repr__(self):
- if not self:
- return '%s()' % self.__class__.__name__
- items = ', '.join(map('%r: %r'.__mod__, self.most_common()))
- return '%s({%s})' % (self.__class__.__name__, items)
-
- # Multiset-style mathematical operations discussed in:
- # Knuth TAOCP Volume II section 4.6.3 exercise 19
- # and at http://en.wikipedia.org/wiki/Multiset
- #
- # Outputs guaranteed to only include positive counts.
- #
- # To strip negative and zero counts, add-in an empty counter:
- # c += Counter()
-
- def __add__(self, other):
- '''Add counts from two counters.
-
- >>> Counter('abbb') + Counter('bcc')
- Counter({'b': 4, 'c': 2, 'a': 1})
-
-
- '''
- if not isinstance(other, Counter):
- return NotImplemented
- result = Counter()
- for elem in set(self) | set(other):
- newcount = self[elem] + other[elem]
- if newcount > 0:
- result[elem] = newcount
- return result
-
- def __sub__(self, other):
- ''' Subtract count, but keep only results with positive counts.
-
- >>> Counter('abbbc') - Counter('bccd')
- Counter({'b': 2, 'a': 1})
-
- '''
- if not isinstance(other, Counter):
- return NotImplemented
- result = Counter()
- for elem in set(self) | set(other):
- newcount = self[elem] - other[elem]
- if newcount > 0:
- result[elem] = newcount
- return result
-
- def __or__(self, other):
- '''Union is the maximum of value in either of the input counters.
-
- >>> Counter('abbb') | Counter('bcc')
- Counter({'b': 3, 'c': 2, 'a': 1})
-
- '''
- if not isinstance(other, Counter):
- return NotImplemented
- _max = max
- result = Counter()
- for elem in set(self) | set(other):
- newcount = _max(self[elem], other[elem])
- if newcount > 0:
- result[elem] = newcount
- return result
-
- def __and__(self, other):
- ''' Intersection is the minimum of corresponding counts.
-
- >>> Counter('abbb') & Counter('bcc')
- Counter({'b': 1})
-
- '''
- if not isinstance(other, Counter):
- return NotImplemented
- _min = min
- result = Counter()
- if len(self) < len(other):
- self, other = other, self
- for elem in ifilter(self.__contains__, other):
- newcount = _min(self[elem], other[elem])
- if newcount > 0:
- result[elem] = newcount
- return result
-
- else:
- from collections import Counter
-
-
-def iterkeys(d):
- """Return an iterator over the keys of a dictionary."""
- return getattr(d, _iterkeys)()
-
-
-def itervalues(d):
- """Return an iterator over the values of a dictionary."""
- return getattr(d, _itervalues)()
-
-
-def iteritems(d):
- """Return an iterator over the (key, value) pairs of a dictionary."""
- return getattr(d, _iteritems)()
-
-try:
- from functools import total_ordering
-except ImportError: # python 2.6
- def total_ordering(cls):
- """Class decorator that fills in missing ordering methods"""
- convert = {
- '__lt__': [('__gt__', lambda self, other: not (self < other or self == other)),
- ('__le__', lambda self, other: self < other or self == other),
- ('__ge__', lambda self, other: not self < other)],
- '__le__': [('__ge__', lambda self, other: not self <= other or self == other),
- ('__lt__', lambda self, other: self <= other and not self == other),
- ('__gt__', lambda self, other: not self <= other)],
- '__gt__': [('__lt__', lambda self, other: not (self > other or self == other)),
- ('__ge__', lambda self, other: self > other or self == other),
- ('__le__', lambda self, other: not self > other)],
- '__ge__': [('__le__', lambda self, other: (not self >= other) or self == other),
- ('__gt__', lambda self, other: self >= other and not self == other),
- ('__lt__', lambda self, other: not self >= other)]
- }
- roots = set(dir(cls)) & set(convert)
- if not roots:
- raise ValueError(
- 'must define at least one ordering operation: < > <= >=')
- root = max(roots) # prefer __lt__ to __le__ to __gt__ to __ge__
- for opname, opfunc in convert[root]:
- if opname not in roots:
- opfunc.__name__ = opname
- opfunc.__doc__ = getattr(int, opname).__doc__
- setattr(cls, opname, opfunc)
- return cls
-
-
# ======= Compatibility for datasets that care about Python versions ========
# The following datasets have a /PY3 subdirectory containing
# a full copy of the data which has been re-encoded or repickled.
-
-import os.path
-
DATA_UPDATES = [("chunkers", "maxent_ne_chunker"),
("help", "tagsets"),
("taggers", "maxent_treebank_pos_tagger"),
@@ -541,6 +200,7 @@ DATA_UPDATES = [("chunkers", "maxent_ne_chunker"),
_PY3_DATA_UPDATES = [os.path.join(*path_list) for path_list in DATA_UPDATES]
+
def add_py3_data(path):
if PY3:
for item in _PY3_DATA_UPDATES:
@@ -561,12 +221,8 @@ def py3_data(init_func):
return init_func(*args, **kwargs)
return wraps(init_func)(_decorator)
-# ======= Compatibility layer for __str__ and __repr__ ==========
-
-import unicodedata
-import functools
-
+# ======= Compatibility layer for __str__ and __repr__ ==========
def remove_accents(text):
if isinstance(text, bytes):
@@ -577,6 +233,7 @@ def remove_accents(text):
c for c in unicodedata.normalize('NFKD', text) if category(c) != 'Mn'
)
+
# Select the best transliteration method:
try:
# Older versions of Unidecode are licensed under Artistic License;
@@ -645,7 +302,7 @@ def unicode_repr(obj):
if hasattr(obj, 'unicode_repr'):
return obj.unicode_repr()
- if isinstance(obj, unicode):
+ if isinstance(obj, text_type):
return repr(obj)[1:] # strip "u" letter from output
return repr(obj)
@@ -655,7 +312,7 @@ def _transliterated(method):
def wrapper(self):
return transliterate(method(self))
- functools.update_wrapper(wrapper, method, ["__name__", "__doc__"])
+ update_wrapper(wrapper, method, ["__name__", "__doc__"])
if hasattr(method, "_nltk_compat_7bit"):
wrapper._nltk_compat_7bit = method._nltk_compat_7bit
@@ -667,10 +324,12 @@ def _7bit(method):
def wrapper(self):
return method(self).encode('ascii', 'backslashreplace')
- functools.update_wrapper(wrapper, method, ["__name__", "__doc__"])
+ update_wrapper(wrapper, method, ["__name__", "__doc__"])
if hasattr(method, "_nltk_compat_transliterated"):
- wrapper._nltk_compat_transliterated = method._nltk_compat_transliterated
+ wrapper._nltk_compat_transliterated = (
+ method._nltk_compat_transliterated
+ )
wrapper._nltk_compat_7bit = True
return wrapper
@@ -683,16 +342,16 @@ def _was_fixed(method):
class Fraction(fractions.Fraction):
"""
- This is a simplified backwards compatible version of fractions.Fraction from
- Python >=3.5. It adds the `_normalize` parameter such that it does
+ This is a simplified backwards compatible version of fractions.Fraction
+ from Python >=3.5. It adds the `_normalize` parameter such that it does
not normalize the denominator to the Greatest Common Divisor (gcd) when
the numerator is 0.
-
+
This is most probably only used by the nltk.translate.bleu_score.py where
numerator and denominator of the different ngram precisions are mutable.
- But the idea of "mutable" fraction might not be applicable to other usages,
+ But the idea of "mutable" fraction might not be applicable to other usages,
See http://stackoverflow.com/questions/34561265
-
+
This object should be deprecated once NLTK stops supporting Python < 3.5
See https://github.com/nltk/nltk/issues/1330
"""
diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py
index 3da480e..d9ccb54 100644
--- a/nltk/corpus/__init__.py
+++ b/nltk/corpus/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Corpus Readers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -106,8 +106,10 @@ dependency_treebank = LazyCorpusLoader(
floresta = LazyCorpusLoader(
'floresta', BracketParseCorpusReader, r'(?!\.).*\.ptb', '#',
tagset='unknown', encoding='ISO-8859-15')
-framenet = LazyCorpusLoader(
+framenet15 = LazyCorpusLoader(
'framenet_v15', FramenetCorpusReader, ['frRelation.xml','frameIndex.xml','fulltextIndex.xml','luIndex.xml','semTypes.xml'])
+framenet = LazyCorpusLoader(
+ 'framenet_v17', FramenetCorpusReader, ['frRelation.xml','frameIndex.xml','fulltextIndex.xml','luIndex.xml','semTypes.xml'])
gazetteers = LazyCorpusLoader(
'gazetteers', WordListCorpusReader, r'(?!LICENSE|\.).*\.txt',
encoding='ISO-8859-2')
@@ -118,9 +120,6 @@ genesis = LazyCorpusLoader(
('.*', 'utf_8')])
gutenberg = LazyCorpusLoader(
'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1')
-# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
-#hebrew_treebank = LazyCorpusLoader(
-# 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
ieer = LazyCorpusLoader(
'ieer', IEERCorpusReader, r'(?!README|\.).*')
inaugural = LazyCorpusLoader(
@@ -129,8 +128,7 @@ inaugural = LazyCorpusLoader(
indian = LazyCorpusLoader(
'indian', IndianCorpusReader, r'(?!\.).*\.pos',
tagset='unknown', encoding='utf8')
-ipipan = LazyCorpusLoader(
- 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
+
jeita = LazyCorpusLoader(
'jeita', ChasenCorpusReader, r'.*\.chasen', encoding='utf-8')
knbc = LazyCorpusLoader(
@@ -154,18 +152,11 @@ multext_east = LazyCorpusLoader(
'mte_teip5', MTECorpusReader, r'(oana).*\.xml', encoding="utf-8")
names = LazyCorpusLoader(
'names', WordListCorpusReader, r'(?!\.).*\.txt', encoding='ascii')
-nkjp = LazyCorpusLoader(
- 'nkjp', NKJPCorpusReader, r'', encoding='utf8')
nps_chat = LazyCorpusLoader(
'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml', tagset='wsj')
opinion_lexicon = LazyCorpusLoader(
'opinion_lexicon', OpinionLexiconCorpusReader, r'(\w+)\-words\.txt',
encoding='ISO-8859-2')
-panlex_lite = LazyCorpusLoader(
- 'panlex_lite', PanLexLiteCorpusReader)
-pl196x = LazyCorpusLoader(
- 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
- cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
ppattach = LazyCorpusLoader(
'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset'])
product_reviews_1 = LazyCorpusLoader(
@@ -250,8 +241,7 @@ wordnet_ic = LazyCorpusLoader(
'wordnet_ic', WordNetICCorpusReader, '.*\.dat')
words = LazyCorpusLoader(
'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii')
-ycoe = LazyCorpusLoader(
- 'ycoe', YCOECorpusReader)
+
# defined after treebank
propbank = LazyCorpusLoader(
'propbank', PropbankCorpusReader,
@@ -277,6 +267,37 @@ semcor = LazyCorpusLoader(
'semcor', SemcorCorpusReader, r'brown./tagfiles/br-.*\.xml',
wordnet) # Must be defined *after* wordnet corpus.
+nonbreaking_prefixes = LazyCorpusLoader(
+ 'nonbreaking_prefixes', NonbreakingPrefixesCorpusReader, r'(?!README|\.).*', encoding='utf8')
+perluniprops = LazyCorpusLoader(
+ 'perluniprops', UnicharsCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
+
+# mwa_ppdb = LazyCorpusLoader(
+# 'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
+
+# See https://github.com/nltk/nltk/issues/1579
+# and https://github.com/nltk/nltk/issues/1716
+#
+# pl196x = LazyCorpusLoader(
+# 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
+# cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
+#
+# ipipan = LazyCorpusLoader(
+# 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
+#
+# nkjp = LazyCorpusLoader(
+# 'nkjp', NKJPCorpusReader, r'', encoding='utf8')
+#
+#panlex_lite = LazyCorpusLoader(
+# 'panlex_lite', PanLexLiteCorpusReader)
+#
+# ycoe = LazyCorpusLoader(
+# 'ycoe', YCOECorpusReader)
+#
+# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
+#hebrew_treebank = LazyCorpusLoader(
+# 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
+
def demo():
# This is out-of-date:
diff --git a/nltk/corpus/europarl_raw.py b/nltk/corpus/europarl_raw.py
index 7a1cb8e..a8e62a5 100644
--- a/nltk/corpus/europarl_raw.py
+++ b/nltk/corpus/europarl_raw.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Europarl Corpus Readers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Nitin Madnani <nmadnani at umiacs.umd.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/corpus/reader/__init__.py b/nltk/corpus/reader/__init__.py
index ebaac13..77e0eb0 100644
--- a/nltk/corpus/reader/__init__.py
+++ b/nltk/corpus/reader/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Corpus Readers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
@@ -141,5 +141,7 @@ __all__ = [
'TwitterCorpusReader', 'NKJPCorpusReader', 'CrubadanCorpusReader',
'MTECorpusReader', 'ReviewsCorpusReader', 'OpinionLexiconCorpusReader',
'ProsConsCorpusReader', 'CategorizedSentencesCorpusReader',
- 'ComparativeSentencesCorpusReader', 'PanLexLiteCorpusReader'
+ 'ComparativeSentencesCorpusReader', 'PanLexLiteCorpusReader',
+ 'NonbreakingPrefixesCorpusReader', 'UnicharsCorpusReader',
+ 'MWAPPDBCorpusReader',
]
diff --git a/nltk/corpus/reader/aligned.py b/nltk/corpus/reader/aligned.py
index 00804c1..0b341c9 100644
--- a/nltk/corpus/reader/aligned.py
+++ b/nltk/corpus/reader/aligned.py
@@ -1,11 +1,12 @@
# Natural Language Toolkit: Aligned Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# Author: Steven Bird <stevenbird1 at gmail.com>
# For license information, see LICENSE.TXT
-from nltk import compat
+from six import string_types
+
from nltk.tokenize import WhitespaceTokenizer, RegexpTokenizer
from nltk.translate import AlignedSent, Alignment
@@ -45,7 +46,7 @@ class AlignedCorpusReader(CorpusReader):
:rtype: str
"""
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def words(self, fileids=None):
diff --git a/nltk/corpus/reader/api.py b/nltk/corpus/reader/api.py
index ea0a4ab..fae5a11 100644
--- a/nltk/corpus/reader/api.py
+++ b/nltk/corpus/reader/api.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: API for Corpus Readers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
@@ -14,6 +14,9 @@ from __future__ import unicode_literals
import os
import re
from collections import defaultdict
+from itertools import chain
+
+from six import string_types
from nltk import compat
from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
@@ -72,7 +75,7 @@ class CorpusReader(object):
tagged_...() methods.
"""
# Convert the root to a path pointer, if necessary.
- if isinstance(root, compat.string_types) and not isinstance(root, PathPointer):
+ if isinstance(root, string_types) and not isinstance(root, PathPointer):
m = re.match('(.*\.zip)/?(.*)$|', root)
zipfile, zipentry = m.groups()
if zipfile:
@@ -83,7 +86,7 @@ class CorpusReader(object):
raise TypeError('CorpusReader: expected a string or a PathPointer')
# If `fileids` is a regexp, then expand it.
- if isinstance(fileids, compat.string_types):
+ if isinstance(fileids, string_types):
fileids = find_corpus_fileids(root, fileids)
self._fileids = fileids
@@ -184,7 +187,7 @@ class CorpusReader(object):
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, compat.string_types):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
paths = [self._root.join(f) for f in fileids]
@@ -338,7 +341,7 @@ class CategorizedCorpusReader(object):
self._init()
if fileids is None:
return sorted(self._c2f)
- if isinstance(fileids, compat.string_types):
+ if isinstance(fileids, string_types):
fileids = [fileids]
return sorted(set.union(*[self._f2c[d] for d in fileids]))
@@ -349,7 +352,7 @@ class CategorizedCorpusReader(object):
"""
if categories is None:
return super(CategorizedCorpusReader, self).fileids()
- elif isinstance(categories, compat.string_types):
+ elif isinstance(categories, string_types):
if self._f2c is None:
self._init()
if categories in self._c2f:
@@ -391,7 +394,7 @@ class SyntaxCorpusReader(CorpusReader):
def raw(self, fileids=None):
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def parsed_sents(self, fileids=None):
@@ -426,10 +429,10 @@ class SyntaxCorpusReader(CorpusReader):
#{ Block Readers
def _read_word_block(self, stream):
- return sum(self._read_sent_block(stream), [])
+ return list(chain(*self._read_sent_block(stream)))
def _read_tagged_word_block(self, stream, tagset=None):
- return sum(self._read_tagged_sent_block(stream, tagset), [])
+ return list(chain(*self._read_tagged_sent_block(stream, tagset)))
def _read_sent_block(self, stream):
return list(filter(None, [self._word(t) for t in self._read_block(stream)]))
@@ -443,4 +446,3 @@ class SyntaxCorpusReader(CorpusReader):
#} End of Block Readers
#------------------------------------------------------------
-
diff --git a/nltk/corpus/reader/bnc.py b/nltk/corpus/reader/bnc.py
index 847f145..01ad9a1 100644
--- a/nltk/corpus/reader/bnc.py
+++ b/nltk/corpus/reader/bnc.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Plaintext Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/corpus/reader/bracket_parse.py b/nltk/corpus/reader/bracket_parse.py
index dcbc97f..0944075 100644
--- a/nltk/corpus/reader/bracket_parse.py
+++ b/nltk/corpus/reader/bracket_parse.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Penn Treebank Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/corpus/reader/categorized_sents.py b/nltk/corpus/reader/categorized_sents.py
index 501bbe2..fa139c2 100644
--- a/nltk/corpus/reader/categorized_sents.py
+++ b/nltk/corpus/reader/categorized_sents.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Categorized Sentences Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -34,6 +34,7 @@ Related papers:
sentiment categorization with respect to rating scales". Proceedings of the
ACL, 2005.
"""
+from six import string_types
from nltk.corpus.reader.api import *
from nltk.tokenize import *
@@ -133,7 +134,7 @@ class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
fileids = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, compat.string_types):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
for (path, enc, fileid) in self.abspaths(fileids, True, True)])
@@ -153,7 +154,7 @@ class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
fileids = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, compat.string_types):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
for (path, enc, fileid) in self.abspaths(fileids, True, True)])
diff --git a/nltk/corpus/reader/chasen.py b/nltk/corpus/reader/chasen.py
index aa927de..eaf85dc 100644
--- a/nltk/corpus/reader/chasen.py
+++ b/nltk/corpus/reader/chasen.py
@@ -1,5 +1,5 @@
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Masato Hagiwara <hagisan at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -9,9 +9,10 @@ from __future__ import print_function
import sys
+from six import string_types
+
from nltk.corpus.reader import util
-from nltk import compat
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
@@ -23,7 +24,7 @@ class ChasenCorpusReader(CorpusReader):
def raw(self, fileids=None):
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def words(self, fileids=None):
@@ -132,7 +133,7 @@ def test():
jeita = LazyCorpusLoader(
'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
- assert isinstance(jeita.tagged_words()[0][1], compat.string_types)
+ assert isinstance(jeita.tagged_words()[0][1], string_types)
if __name__ == '__main__':
demo()
diff --git a/nltk/corpus/reader/childes.py b/nltk/corpus/reader/childes.py
index 0f96e4c..0b092f1 100644
--- a/nltk/corpus/reader/childes.py
+++ b/nltk/corpus/reader/childes.py
@@ -1,6 +1,6 @@
# CHILDES XML Corpus Reader
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tomonori Nagano <tnagano at gc.cuny.edu>
# Alexis Dimitriadis <A.Dimitriadis at uu.nl>
# URL: <http://nltk.org/>
@@ -9,15 +9,15 @@
"""
Corpus reader for the XML version of the CHILDES corpus.
"""
-from __future__ import print_function
+from __future__ import print_function, division
__docformat__ = 'epytext en'
import re
from collections import defaultdict
+from six import string_types
from nltk.util import flatten, LazyMap, LazyConcatenation
-from nltk.compat import string_types
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
@@ -123,7 +123,7 @@ class CHILDESCorpusReader(XMLCorpusReader):
if not self._lazy:
return [self._get_words(fileid, speaker, sent, stem, relation,
pos, strip_space, replace) for fileid in self.abspaths(fileids)]
-
+
get_words = lambda fileid: self._get_words(fileid, speaker, sent, stem, relation,
pos, strip_space, replace)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
@@ -153,7 +153,7 @@ class CHILDESCorpusReader(XMLCorpusReader):
if not self._lazy:
return [self._get_words(fileid, speaker, sent, stem, relation,
pos, strip_space, replace) for fileid in self.abspaths(fileids)]
-
+
get_words = lambda fileid: self._get_words(fileid, speaker, sent, stem, relation,
pos, strip_space, replace)
return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
@@ -278,9 +278,9 @@ class CHILDESCorpusReader(XMLCorpusReader):
thisWordList = flatten(results)
# count number of morphemes
# (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
- numWords = float(len(flatten([word.split('-')
- for word in thisWordList]))) - numFillers
- numSents = float(len(results)) - sentDiscount
+ numWords = len(flatten([word.split('-')
+ for word in thisWordList])) - numFillers
+ numSents = len(results) - sentDiscount
mlu = numWords/numSents
except ZeroDivisionError:
mlu = 0
@@ -520,4 +520,3 @@ def demo(corpus_root=None):
if __name__ == "__main__":
demo()
-
diff --git a/nltk/corpus/reader/chunked.py b/nltk/corpus/reader/chunked.py
index f4a079f..b87ae06 100644
--- a/nltk/corpus/reader/chunked.py
+++ b/nltk/corpus/reader/chunked.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Chunked Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
@@ -13,9 +13,10 @@ documents.
import os.path, codecs
+from six import string_types
+
import nltk
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
-from nltk import compat
from nltk.tree import Tree
from nltk.tokenize import *
from nltk.chunk import tagstr2tree
@@ -53,7 +54,7 @@ class ChunkedCorpusReader(CorpusReader):
:rtype: str
"""
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def words(self, fileids=None):
@@ -209,4 +210,3 @@ class ChunkedCorpusView(StreamBackedCorpusView):
else:
raise ValueError('expected child to be Tree or tuple')
return tree
-
diff --git a/nltk/corpus/reader/cmudict.py b/nltk/corpus/reader/cmudict.py
index 6ccee07..6009dad 100644
--- a/nltk/corpus/reader/cmudict.py
+++ b/nltk/corpus/reader/cmudict.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -47,6 +47,8 @@ ZH seizure S IY ZH ER
import codecs
+from six import string_types
+
from nltk import compat
from nltk.util import Index
@@ -68,7 +70,7 @@ class CMUDictCorpusReader(CorpusReader):
:return: the cmudict lexicon as a raw string.
"""
fileids = self._fileids
- if isinstance(fileids, compat.string_types):
+ if isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
diff --git a/nltk/corpus/reader/comparative_sents.py b/nltk/corpus/reader/comparative_sents.py
index 0ea92a5..1d81049 100644
--- a/nltk/corpus/reader/comparative_sents.py
+++ b/nltk/corpus/reader/comparative_sents.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Comparative Sentence Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -35,6 +35,8 @@ Related papers:
"""
import re
+from six import string_types
+
from nltk.corpus.reader.api import *
from nltk.tokenize import *
@@ -120,7 +122,7 @@ class ComparativeSentencesCorpusReader(CorpusReader):
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, compat.string_types):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.CorpusView(path, self._read_comparison_block, encoding=enc)
for (path, enc, fileid) in self.abspaths(fileids, True, True)])
diff --git a/nltk/corpus/reader/conll.py b/nltk/corpus/reader/conll.py
index 4b68e61..34d559f 100644
--- a/nltk/corpus/reader/conll.py
+++ b/nltk/corpus/reader/conll.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: CONLL Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
@@ -16,6 +16,8 @@ import os
import codecs
import textwrap
+from six import string_types
+
from nltk import compat
from nltk.tree import Tree
from nltk.util import LazyMap, LazyConcatenation
@@ -72,7 +74,7 @@ class ConllCorpusReader(CorpusReader):
for columntype in columntypes:
if columntype not in self.COLUMN_TYPES:
raise ValueError('Bad column type %r' % columntype)
- if isinstance(chunk_types, compat.string_types):
+ if isinstance(chunk_types, string_types):
chunk_types = [chunk_types]
self._chunk_types = chunk_types
self._colmap = dict((c,i) for (i,c) in enumerate(columntypes))
@@ -89,7 +91,7 @@ class ConllCorpusReader(CorpusReader):
def raw(self, fileids=None):
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def words(self, fileids=None):
@@ -280,16 +282,16 @@ class ConllCorpusReader(CorpusReader):
right = right.count(')')*')' # only keep ')'.
treestr += '%s (%s %s) %s' % (left, pos_tag, word, right)
try:
- tree = self._tree_class.parse(treestr)
+ tree = self._tree_class.fromstring(treestr)
except (ValueError, IndexError):
- tree = self._tree_class.parse('(%s %s)' %
+ tree = self._tree_class.fromstring('(%s %s)' %
(self._root_label, treestr))
if not pos_in_tree:
for subtree in tree.subtrees():
for i, child in enumerate(subtree):
if (isinstance(child, Tree) and len(child)==1 and
- isinstance(child[0], compat.string_types)):
+ isinstance(child[0], string_types)):
subtree[i] = (child[0], child.label())
return tree
@@ -492,7 +494,7 @@ class ConllSRLInstanceList(list):
def _tree2conll(self, tree, wordnum, words, pos, synt):
assert isinstance(tree, Tree)
- if len(tree) == 1 and isinstance(tree[0], compat.string_types):
+ if len(tree) == 1 and isinstance(tree[0], string_types):
pos[wordnum] = tree.label()
assert words[wordnum] == tree[0]
return wordnum+1
diff --git a/nltk/corpus/reader/crubadan.py b/nltk/corpus/reader/crubadan.py
index 4642353..84f603e 100644
--- a/nltk/corpus/reader/crubadan.py
+++ b/nltk/corpus/reader/crubadan.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: An Crubadan N-grams Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Avital Pekker <avital.pekker at utoronto.ca>
#
# URL: <http://nltk.org/>
@@ -93,7 +93,7 @@ class CrubadanCorpusReader(CorpusReader):
ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')
if not path.isfile(ngram_file):
- raise Runtime("No N-gram file found for requested language.")
+ raise RuntimeError("No N-gram file found for requested language.")
counts = FreqDist()
if PY3:
diff --git a/nltk/corpus/reader/dependency.py b/nltk/corpus/reader/dependency.py
index c72358f..c8a3a39 100644
--- a/nltk/corpus/reader/dependency.py
+++ b/nltk/corpus/reader/dependency.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Dependency Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Kepa Sarasola <kepa.sarasola at ehu.es>
# Iker Manterola <returntothehangar at hotmail.com>
#
diff --git a/nltk/corpus/reader/framenet.py b/nltk/corpus/reader/framenet.py
index 9548781..26fa96e 100644
--- a/nltk/corpus/reader/framenet.py
+++ b/nltk/corpus/reader/framenet.py
@@ -1,28 +1,56 @@
# Natural Language Toolkit: Framenet Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Chuck Wooters <wooters at icsi.berkeley.edu>,
-# Nathan Schneider <nschneid at cs.cmu.edu>
+# Nathan Schneider <nathan.schneider at georgetown.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-from __future__ import print_function, unicode_literals
+
"""
-Corpus reader for the Framenet 1.5 Corpus.
+Corpus reader for the FrameNet 1.7 lexicon and corpus.
"""
-
-__docformat__ = 'epytext en'
+from __future__ import print_function, unicode_literals
import os, sys
import re
import textwrap
-from collections import defaultdict
+import itertools
+import types
+
+from six import string_types, text_type
+from six.moves import zip_longest
+
+from collections import defaultdict, OrderedDict
from pprint import pprint, pformat
from nltk.internals import ElementWrapper
from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView
-from nltk.compat import text_type, string_types, python_2_unicode_compatible
-from nltk.util import AbstractLazySequence, LazyMap
+from nltk.compat import python_2_unicode_compatible
+from nltk.util import AbstractLazySequence, LazyConcatenation, LazyMap, LazyIteratorList
+__docformat__ = 'epytext en'
+
+def mimic_wrap(lines, wrap_at=65, **kwargs):
+ """
+ Wrap the first of 'lines' with textwrap and the remaining lines at exactly the same
+ positions as the first.
+ """
+ l0 = textwrap.fill(lines[0], wrap_at, drop_whitespace=False).split('\n')
+ yield l0
+
+ def _(line):
+ il0 = 0
+ while line and il0<len(l0)-1:
+ yield line[:len(l0[il0])]
+ line = line[len(l0[il0]):]
+ il0 += 1
+ if line: # Remaining stuff on this line past the end of the mimicked line.
+ # So just textwrap this line.
+ for ln in textwrap.fill(line, wrap_at, drop_whitespace=False).split('\n'):
+ yield ln
+
+ for l in lines[1:]:
+ yield list(_(l))
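
A rough usage sketch of the helper above (illustrative only): the first line is
wrapped by textwrap, and each following line is cut at the same positions so that
annotation markup stays aligned with the sentence it annotates.

    >>> sent = 'When he was inside the house , Culley noticed the characteristic smell'
    >>> underline = ' '*18 + '-'*7
    >>> pieces = list(mimic_wrap([sent, underline], wrap_at=30))
    >>> len(pieces) == 2    # one list of wrapped chunks per input line
    True
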
def _pretty_longstring(defstr, prefix='', wrap_at=65):
@@ -163,11 +191,373 @@ def _pretty_lu(lu):
if 'semTypes' in lukeys:
outstr += "\n[semTypes] {0} semantic types\n".format(len(lu.semTypes))
outstr += " "*(len(lu.semTypes)>0) + ", ".join('{0}({1})'.format(x.name, x.ID) for x in lu.semTypes) + '\n'*(len(lu.semTypes)>0)
+ if 'URL' in lukeys:
+ outstr += "\n[URL] {0}\n".format(lu.URL)
if 'subCorpus' in lukeys:
subc = [x.name for x in lu.subCorpus]
outstr += "\n[subCorpus] {0} subcorpora\n".format(len(lu.subCorpus))
for line in textwrap.fill(", ".join(sorted(subc)), 60).split('\n'):
outstr += " {0}\n".format(line)
+ if 'exemplars' in lukeys:
+ outstr += "\n[exemplars] {0} sentences across all subcorpora\n".format(len(lu.exemplars))
+
+ return outstr
+
+def _pretty_exemplars(exemplars, lu):
+ """
+ Helper function for pretty-printing a list of exemplar sentences for a lexical unit.
+
+ :param exemplars: The list of exemplar sentences to be printed.
+ :type exemplars: list(AttrDict)
+ :return: An index of the text of the exemplar sentences.
+ :rtype: str
+ """
+
+ outstr = ""
+ outstr += "exemplar sentences for {0.name} in {0.frame.name}:\n\n".format(lu)
+ for i,sent in enumerate(exemplars):
+ outstr += "[{0}] {1}\n".format(i, sent.text)
+ outstr += "\n"
+ return outstr
+
+def _pretty_fulltext_sentences(sents):
+ """
+ Helper function for pretty-printing a list of annotated sentences for a full-text document.
+
+ :param sents: The full-text document whose sentences are to be printed.
+ :type sents: AttrDict
+ :return: An index of the text of the sentences.
+ :rtype: str
+ """
+
+ outstr = ""
+ outstr += "full-text document ({0.ID}) {0.name}:\n\n".format(sents)
+ outstr += "[corpid] {0.corpid}\n[corpname] {0.corpname}\n[description] {0.description}\n[URL] {0.URL}\n\n".format(sents)
+ outstr += "[sentence]\n".format(sents)
+ for i,sent in enumerate(sents.sentence):
+ outstr += "[{0}] {1}\n".format(i, sent.text)
+ outstr += "\n"
+ return outstr
+
+def _pretty_fulltext_sentence(sent):
+ """
+ Helper function for pretty-printing an annotated sentence from a full-text document.
+
+ :param sent: The sentence to be printed.
+ :type sent: AttrDict
+ :return: The text of the sentence with annotation set indices on frame targets.
+ :rtype: str
+ """
+
+ outstr = ""
+ outstr += "full-text sentence ({0.ID}) in {1}:\n\n".format(sent, sent.doc.get('name',sent.doc.description))
+ outstr += "\n[POS] {0} tags\n".format(len(sent.POS))
+ outstr += "\n[POS_tagset] {0}\n\n".format(sent.POS_tagset)
+ outstr += "[text] + [annotationSet]\n\n"
+ outstr += sent._ascii() # -> _annotation_ascii()
+ outstr += "\n"
+ return outstr
+
+def _pretty_pos(aset):
+ """
+ Helper function for pretty-printing a sentence with its POS tags.
+
+ :param aset: The POS annotation set of the sentence to be printed.
+ :type aset: AttrDict
+ :return: The text of the sentence and its POS tags.
+ :rtype: str
+ """
+
+ outstr = ""
+ outstr += "POS annotation set ({0.ID}) {0.POS_tagset} in sentence {0.sent.ID}:\n\n".format(aset)
+
+ # list the target spans and their associated aset index
+ overt = sorted(aset.POS)
+
+ sent = aset.sent
+ s0 = sent.text
+ s1 = ''
+ s2 = ''
+ i = 0
+ adjust = 0
+ for j,k,lbl in overt:
+ assert j>=i,('Overlapping targets?',(j,k,lbl))
+ s1 += ' '*(j-i) + '-'*(k-j)
+ if len(lbl)>(k-j):
+ # add space in the sentence to make room for the annotation index
+ amt = len(lbl)-(k-j)
+ s0 = s0[:k+adjust]+ '~'*amt + s0[k+adjust:] # '~' to prevent line wrapping
+ s1 = s1[:k+adjust]+ ' '*amt + s1[k+adjust:]
+ adjust += amt
+ s2 += ' '*(j-i) + lbl.ljust(k-j)
+ i = k
+
+ long_lines = [s0, s1, s2]
+
+ outstr += '\n\n'.join(map('\n'.join, zip_longest(*mimic_wrap(long_lines), fillvalue=' '))).replace('~',' ')
+ outstr += "\n"
+ return outstr
+
+def _pretty_annotation(sent, aset_level=False):
+ """
+ Helper function for pretty-printing an exemplar sentence for a lexical unit.
+
+ :param sent: An annotation set or exemplar sentence to be printed.
+ :param aset_level: If True, 'sent' is actually an annotation set within a sentence.
+ :type sent: AttrDict
+ :return: A nicely formatted string representation of the exemplar sentence
+ with its target, frame, and FE annotations.
+ :rtype: str
+ """
+
+ sentkeys = sent.keys()
+ outstr = "annotation set" if aset_level else "exemplar sentence"
+ outstr += " ({0.ID}):\n".format(sent)
+ if aset_level: # TODO: any UNANN exemplars?
+ outstr += "\n[status] {0}\n".format(sent.status)
+ for k in ('corpID', 'docID', 'paragNo', 'sentNo', 'aPos'):
+ if k in sentkeys:
+ outstr += "[{0}] {1}\n".format(k, sent[k])
+ outstr += "\n[LU] ({0.ID}) {0.name} in {0.frame.name}\n".format(sent.LU) if sent.LU else '\n[LU] Not found!'
+ outstr += "\n[frame] ({0.ID}) {0.name}\n".format(sent.frame) # redundant with above, but .frame is convenient
+ if not aset_level:
+ outstr += "\n[annotationSet] {0} annotation sets\n".format(len(sent.annotationSet))
+ outstr += "\n[POS] {0} tags\n".format(len(sent.POS))
+ outstr += "\n[POS_tagset] {0}\n".format(sent.POS_tagset)
+ outstr += "\n[GF] {0} relation{1}\n".format(len(sent.GF), "s" if len(sent.GF)!=1 else "")
+ outstr += "\n[PT] {0} phrase{1}\n".format(len(sent.PT), "s" if len(sent.PT)!=1 else "")
+ """
+ Special Layers
+ --------------
+
+ The 'NER' layer contains, for some of the data, named entity labels.
+
+ The 'WSL' (word status layer) contains, for some of the data,
+ spans which should not in principle be considered targets (NT).
+
+ The 'Other' layer records relative clause constructions (Rel=relativizer, Ant=antecedent),
+ pleonastic 'it' (Null), and existential 'there' (Exist).
+ On occasion they are duplicated by accident (e.g., annotationSet 1467275 in lu6700.xml).
+
+ The 'Sent' layer appears to contain labels that the annotator has flagged the
+ sentence with for their convenience: values include
+ 'sense1', 'sense2', 'sense3', etc.;
+ 'Blend', 'Canonical', 'Idiom', 'Metaphor', 'Special-Sent',
+ 'keepS', 'deleteS', 'reexamine'
+ (sometimes they are duplicated for no apparent reason).
+
+ The POS-specific layers may contain the following kinds of spans:
+ Asp (aspectual particle), Non-Asp (non-aspectual particle),
+ Cop (copula), Supp (support), Ctrlr (controller),
+ Gov (governor), X. Gov and X always cooccur.
+
+ >>> from nltk.corpus import framenet as fn
+ >>> def f(luRE, lyr, ignore=set()):
+ ...   for i,ex in enumerate(fn.exemplars(luRE)):
+ ...     if lyr in ex and ex[lyr] and set(zip(*ex[lyr])[2]) - ignore:
+ ...       print(i,ex[lyr])
+
+ - Verb: Asp, Non-Asp
+ - Noun: Cop, Supp, Ctrlr, Gov, X
+ - Adj: Cop, Supp, Ctrlr, Gov, X
+ - Prep: Cop, Supp, Ctrlr
+ - Adv: Ctrlr
+ - Scon: (none)
+ - Art: (none)
+ """
+ for lyr in ('NER', 'WSL', 'Other', 'Sent'):
+ if lyr in sent and sent[lyr]:
+ outstr += "\n[{0}] {1} entr{2}\n".format(lyr, len(sent[lyr]), "ies" if len(sent[lyr])!=1 else "y")
+ outstr += "\n[text] + [Target] + [FE]"
+ # POS-specific layers: syntactically important words that are neither the target
+ # nor the FEs. Include these along with the first FE layer but with '^' underlining.
+ for lyr in ('Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art'):
+ if lyr in sent and sent[lyr]:
+ outstr += " + [{0}]".format(lyr)
+ if 'FE2' in sentkeys:
+ outstr += " + [FE2]"
+ if 'FE3' in sentkeys:
+ outstr += " + [FE3]"
+ outstr += "\n\n"
+ outstr += sent._ascii() # -> _annotation_ascii()
+ outstr += "\n"
+
+ return outstr
+
+def _annotation_ascii(sent):
+ '''
+ Given a sentence or FE annotation set, construct the width-limited string showing
+ an ASCII visualization of the sentence's annotations, calling either
+ _annotation_ascii_frames() or _annotation_ascii_FEs() as appropriate.
+ This will be attached as a method to appropriate AttrDict instances
+ and called in the full pretty-printing of the instance.
+ '''
+ if sent._type=='fulltext_sentence' or ('annotationSet' in sent and len(sent.annotationSet)>2):
+ # a full-text sentence OR sentence with multiple targets.
+ # (multiple targets = >2 annotation sets, because the first annotation set is POS.)
+ return _annotation_ascii_frames(sent)
+ else: # an FE annotation set, or an LU sentence with 1 target
+ return _annotation_ascii_FEs(sent)
+
+def _annotation_ascii_frames(sent):
+ '''
+ ASCII string rendering of the sentence along with its targets and frame names.
+ Called for all full-text sentences, as well as the few LU sentences with multiple
+ targets (e.g., fn.lu(6412).exemplars[82] has two want.v targets).
+ Line-wrapped to limit the display width.
+ '''
+ # list the target spans and their associated aset index
+ overt = []
+ for a,aset in enumerate(sent.annotationSet[1:]):
+ for j,k in aset.Target:
+ indexS = "[{0}]".format(a+1)
+ if aset.status=='UNANN' or aset.LU.status=='Problem':
+ indexS += " "
+ if aset.status=='UNANN':
+ indexS += "!" # warning indicator that there is a frame annotation but no FE annotation
+ if aset.LU.status=='Problem':
+ indexS += "?" # warning indicator that there is a missing LU definition (because the LU has Problem status)
+ overt.append((j,k,aset.LU.frame.name,indexS))
+ overt = sorted(overt)
+
+ duplicates = set()
+ for o,(j,k,fname,asetIndex) in enumerate(overt):
+ if o>0 and j<=overt[o-1][1]:
+ # multiple annotation sets on the same target
+ # (e.g. due to a coordination construction or multiple annotators)
+ if overt[o-1][:2]==(j,k) and overt[o-1][2]==fname: # same target, same frame
+ # splice indices together
+ combinedIndex = overt[o-1][3] + asetIndex # e.g., '[1][2]', '[1]! [2]'
+ combinedIndex = combinedIndex.replace(' !', '! ').replace(' ?', '? ')
+ overt[o-1] = overt[o-1][:3]+(combinedIndex,)
+ duplicates.add(o)
+ else: # different frames, same or overlapping targets
+ s = sent.text
+ for j,k,fname,asetIndex in overt:
+ s += '\n' + asetIndex + ' ' + sent.text[j:k] + ' :: ' + fname
+ s += '\n(Unable to display sentence with targets marked inline due to overlap)'
+ return s
+ for o in reversed(sorted(duplicates)):
+ del overt[o]
+
+ s0 = sent.text
+ s1 = ''
+ s11 = ''
+ s2 = ''
+ i = 0
+ adjust = 0
+ fAbbrevs = OrderedDict()
+ for j,k,fname,asetIndex in overt:
+ if not j>=i:
+ assert j>=i,('Overlapping targets?'+(' UNANN' if any(aset.status=='UNANN' for aset in sent.annotationSet[1:]) else ''),(j,k,asetIndex))
+ s1 += ' '*(j-i) + '*'*(k-j)
+ short = fname[:k-j]
+ if (k-j)<len(fname):
+ r = 0
+ while short in fAbbrevs:
+ if fAbbrevs[short]==fname:
+ break
+ r += 1
+ short = fname[:k-j-1] + str(r)
+ else: # short not in fAbbrevs
+ fAbbrevs[short] = fname
+ s11 += ' '*(j-i) + short.ljust(k-j)
+ if len(asetIndex)>(k-j):
+ # add space in the sentence to make room for the annotation index
+ amt = len(asetIndex)-(k-j)
+ s0 = s0[:k+adjust]+ '~'*amt + s0[k+adjust:] # '~' to prevent line wrapping
+ s1 = s1[:k+adjust]+ ' '*amt + s1[k+adjust:]
+ s11 = s11[:k+adjust]+ ' '*amt + s11[k+adjust:]
+ adjust += amt
+ s2 += ' '*(j-i) + asetIndex.ljust(k-j)
+ i = k
+
+ long_lines = [s0, s1, s11, s2]
+
+ outstr = '\n\n'.join(map('\n'.join, zip_longest(*mimic_wrap(long_lines), fillvalue=' '))).replace('~',' ')
+ outstr += '\n'
+ if fAbbrevs:
+ outstr += ' ('+', '.join('='.join(pair) for pair in fAbbrevs.items())+')'
+ assert len(fAbbrevs)==len(dict(fAbbrevs)),'Abbreviation clash'
+
+ return outstr
+
+def _annotation_ascii_FE_layer(overt, ni, feAbbrevs):
+ '''Helper for _annotation_ascii_FEs().'''
+ s1 = ''
+ s2 = ''
+ i = 0
+ for j,k,fename in overt:
+ s1 += ' '*(j-i) + ('^' if fename.islower() else '-')*(k-j)
+ short = fename[:k-j]
+ if len(fename)>len(short):
+ r = 0
+ while short in feAbbrevs:
+ if feAbbrevs[short]==fename:
+ break
+ r += 1
+ short = fename[:k-j-1] + str(r)
+ else: # short not in feAbbrevs
+ feAbbrevs[short] = fename
+ s2 += ' '*(j-i) + short.ljust(k-j)
+ i = k
+
+ sNI = ''
+ if ni:
+ sNI += ' ['+', '.join(':'.join(x) for x in sorted(ni.items()))+']'
+ return [s1,s2,sNI]
+
+def _annotation_ascii_FEs(sent):
+ '''
+ ASCII string rendering of the sentence along with a single target and its FEs.
+ Secondary and tertiary FE layers are included if present.
+ 'sent' can be an FE annotation set or an LU sentence with a single target.
+ Line-wrapped to limit the display width.
+ '''
+ feAbbrevs = OrderedDict()
+ posspec = [] # POS-specific layer spans (e.g., Supp[ort], Cop[ula])
+ posspec_separate = False
+ for lyr in ('Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art'):
+ if lyr in sent and sent[lyr]:
+ for a,b,lbl in sent[lyr]:
+ if lbl=='X': # skip this, which covers an entire phrase typically containing the target and all its FEs
+ # (but do display the Gov)
+ continue
+ if any(1 for x,y,felbl in sent.FE[0] if x<=a<y or a<=x<b):
+ # overlap between one of the POS-specific layers and first FE layer
+ posspec_separate = True # show POS-specific layers on a separate line
+ posspec.append((a,b,lbl.lower().replace('-',''))) # lowercase Cop=>cop, Non-Asp=>nonasp, etc. to distinguish from FE names
+ if posspec_separate:
+ POSSPEC = _annotation_ascii_FE_layer(posspec, {}, feAbbrevs)
+ FE1 = _annotation_ascii_FE_layer(sorted(sent.FE[0] + (posspec if not posspec_separate else [])), sent.FE[1], feAbbrevs)
+ FE2 = FE3 = None
+ if 'FE2' in sent:
+ FE2 = _annotation_ascii_FE_layer(sent.FE2[0], sent.FE2[1], feAbbrevs)
+ if 'FE3' in sent:
+ FE3 = _annotation_ascii_FE_layer(sent.FE3[0], sent.FE3[1], feAbbrevs)
+
+ for i,j in sent.Target:
+ FE1span, FE1name, FE1exp = FE1
+ if len(FE1span)<j:
+ FE1span += ' '*(j-len(FE1span))
+ if len(FE1name)<j:
+ FE1name += ' '*(j-len(FE1name))
+ FE1[1] = FE1name
+ FE1[0] = FE1span[:i] + FE1span[i:j].replace(' ','*').replace('-','=') + FE1span[j:]
+ long_lines = [sent.text]
+ if posspec_separate:
+ long_lines.extend(POSSPEC[:2])
+ long_lines.extend([FE1[0], FE1[1]+FE1[2]]) # lines with no length limit
+ if FE2:
+ long_lines.extend([FE2[0], FE2[1]+FE2[2]])
+ if FE3:
+ long_lines.extend([FE3[0], FE3[1]+FE3[2]])
+ long_lines.append('')
+ outstr = '\n'.join(map('\n'.join, zip_longest(*mimic_wrap(long_lines), fillvalue=' ')))
+ if feAbbrevs:
+ outstr += '('+', '.join('='.join(pair) for pair in feAbbrevs.items())+')'
+ assert len(feAbbrevs)==len(dict(feAbbrevs)),'Abbreviation clash'
+ outstr += "\n"
return outstr
@@ -225,6 +615,7 @@ def _pretty_frame(frame):
outstr = ""
outstr += "frame ({0.ID}): {0.name}\n\n".format(frame)
+ outstr += "[URL] {0}\n\n".format(frame.URL)
outstr += "[definition]\n"
outstr += _pretty_longstring(frame.definition, ' ') + '\n'
@@ -301,8 +692,11 @@ class AttrDict(dict):
return self.__repr__()
try:
return "<{0} ID={1} name={2}>".format(self['_type'], self['ID'], self['name'])
- except KeyError: # no ID--e.g., for _type=lusubcorpus
- return "<{0} name={1}>".format(self['_type'], self['name'])
+ except KeyError:
+ try: # no ID--e.g., for _type=lusubcorpus
+ return "<{0} name={1}>".format(self['_type'], self['name'])
+ except KeyError: # no name--e.g., for _type=lusentence
+ return "<{0} ID={1}>".format(self['_type'], self['ID'])
else:
return self.__repr__()
@@ -317,6 +711,18 @@ class AttrDict(dict):
outstr = _pretty_fe(self)
elif self['_type'] == 'lu':
outstr = _pretty_lu(self)
+ elif self['_type'] == 'luexemplars': # list of ALL exemplars for LU
+ outstr = _pretty_exemplars(self, self[0].LU)
+ elif self['_type'] == 'fulltext_annotation': # list of all sentences for full-text doc
+ outstr = _pretty_fulltext_sentences(self)
+ elif self['_type'] == 'lusentence':
+ outstr = _pretty_annotation(self)
+ elif self['_type'] == 'fulltext_sentence':
+ outstr = _pretty_fulltext_sentence(self)
+ elif self['_type'] in ('luannotationset', 'fulltext_annotationset'):
+ outstr = _pretty_annotation(self, aset_level=True)
+ elif self['_type'] == 'posannotationset':
+ outstr = _pretty_pos(self)
elif self['_type'] == 'semtype':
outstr = _pretty_semtype(self)
elif self['_type'] == 'framerelationtype':
@@ -340,6 +746,33 @@ class AttrDict(dict):
def __repr__(self):
return self.__str__()
+@python_2_unicode_compatible
+class SpecialList(list):
+ """
+ A list subclass which adds a '_type' attribute for special printing
+ (similar to an AttrDict, though this is NOT an AttrDict subclass).
+ """
+ def __init__(self, typ, *args, **kwargs):
+ super(SpecialList,self).__init__(*args, **kwargs)
+ self._type = typ
+
+ def _str(self):
+ outstr = ""
+
+ assert self._type
+ if len(self)==0:
+ outstr = "[]"
+ elif self._type == 'luexemplars': # list of ALL exemplars for LU
+ outstr = _pretty_exemplars(self, self[0].LU)
+ else:
+ assert False,self._type
+ return outstr
+
+ def __str__(self):
+ return self._str()
+ def __repr__(self):
+ return self.__str__()
+
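A minimal sketch of the behaviour defined above (illustrative): the '_type' tag
controls how the list renders.

    >>> exs = SpecialList('luexemplars', [])
    >>> exs._type
    'luexemplars'
    >>> str(exs)    # empty lists render as '[]'; non-empty ones go through _pretty_exemplars()
    '[]'
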
class Future(object):
"""
Wraps and acts as a proxy for a value to be loaded lazily (on demand).
@@ -376,7 +809,6 @@ class Future(object):
def __repr__(self):
return self._data().__repr__()
-
@python_2_unicode_compatible
class PrettyDict(AttrDict):
"""
@@ -448,6 +880,61 @@ class PrettyLazyMap(LazyMap):
else:
return "[%s]" % text_type(', ').join(pieces)
+@python_2_unicode_compatible
+class PrettyLazyIteratorList(LazyIteratorList):
+ """
+ Displays an abbreviated repr of only the first several elements, not the whole list.
+ """
+ # from nltk.util
+ _MAX_REPR_SIZE = 60
+ def __repr__(self):
+ """
+ Return a string representation for this corpus view that is
+ similar to a list's representation; but if it would be more
+ than 60 characters long, it is truncated.
+ """
+ pieces = []
+ length = 5
+ for elt in self:
+ pieces.append(elt._short_repr()) # key difference from inherited version: call to _short_repr()
+ length += len(pieces[-1]) + 2
+ if length > self._MAX_REPR_SIZE and len(pieces) > 2:
+ return "[%s, ...]" % text_type(', ').join(pieces[:-1])
+ else:
+ return "[%s]" % text_type(', ').join(pieces)
+
+@python_2_unicode_compatible
+class PrettyLazyConcatenation(LazyConcatenation):
+ """
+ Displays an abbreviated repr of only the first several elements, not the whole list.
+ """
+ # from nltk.util
+ _MAX_REPR_SIZE = 60
+ def __repr__(self):
+ """
+ Return a string representation for this corpus view that is
+ similar to a list's representation; but if it would be more
+ than 60 characters long, it is truncated.
+ """
+ pieces = []
+ length = 5
+ for elt in self:
+ pieces.append(elt._short_repr()) # key difference from inherited version: call to _short_repr()
+ length += len(pieces[-1]) + 2
+ if length > self._MAX_REPR_SIZE and len(pieces) > 2:
+ return "[%s, ...]" % text_type(', ').join(pieces[:-1])
+ else:
+ return "[%s]" % text_type(', ').join(pieces)
+
+ def __add__(self, other):
+ """Return a list concatenating self with other."""
+ return PrettyLazyIteratorList(itertools.chain(self, other))
+
+ def __radd__(self, other):
+ """Return a list concatenating other with self."""
+ return PrettyLazyIteratorList(itertools.chain(other, self))
+
+
class FramenetCorpusReader(XMLCorpusReader):
"""A corpus reader for the Framenet Corpus.
@@ -468,6 +955,17 @@ class FramenetCorpusReader(XMLCorpusReader):
in the XML index.
"""
+ _warnings = False
+
+ def warnings(self, v):
+ """Enable or disable warnings of data integrity issues as they are encountered.
+ If v is truthy, warnings will be enabled.
+
+ (This is a function rather than just an attribute/property to ensure that if
+ enabling warnings is the first action taken, the corpus reader is instantiated first.)
+ """
+ self._warnings = v
+
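For instance (illustrative; assumes the FrameNet data is installed):

    >>> from nltk.corpus import framenet as fn
    >>> fn.warnings(True)    # subsequent loads report data-integrity issues on stderr
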
def __init__(self, root, fileids):
XMLCorpusReader.__init__(self, root, fileids)
@@ -479,6 +977,9 @@ class FramenetCorpusReader(XMLCorpusReader):
# sub dir containing the xml files for fulltext annotation files
self._fulltext_dir = "fulltext"
+ # location of latest development version of FrameNet
+ self._fnweb_url = "https://framenet2.icsi.berkeley.edu/fnReports/data"
+
# Indexes used for faster look-ups
self._frame_idx = None
self._cached_frames = {} # name -> ID
@@ -490,6 +991,87 @@ class FramenetCorpusReader(XMLCorpusReader):
self._ferel_idx = None # FE-to-FE relation instances
self._frel_f_idx = None # frame-to-frame relations associated with each frame
+ def help(self, attrname=None):
+ """Display help information summarizing the main methods."""
+
+ if attrname is not None:
+ return help(self.__getattribute__(attrname))
+
+ # No need to mention frame_by_name() or frame_by_id(),
+ # as it's easier to just call frame().
+ # Also not mentioning lu_basic().
+
+
+ msg = """
+Use the following methods to access data in FrameNet.
+Provide a method name to `help()` for more information.
+
+FRAMES
+======
+
+frame() to look up a frame by its exact name or ID
+frames() to get frames matching a name pattern
+frames_by_lemma() to get frames containing an LU matching a name pattern
+frame_ids_and_names() to get a mapping from frame IDs to names
+
+FRAME ELEMENTS
+==============
+
+fes() to get frame elements (a.k.a. roles) matching a name pattern, optionally constrained
+ by a frame name pattern
+
+LEXICAL UNITS
+=============
+
+lu() to look up a lexical unit by its ID
+lus() to get lexical units matching a name pattern, optionally constrained by frame
+lu_ids_and_names() to get a mapping from LU IDs to names
+
+RELATIONS
+=========
+
+frame_relation_types() to get the different kinds of frame-to-frame relations
+ (Inheritance, Subframe, Using, etc.).
+frame_relations() to get the relation instances, optionally constrained by
+ frame(s) or relation type
+fe_relations() to get the frame element pairs belonging to a frame-to-frame relation
+
+SEMANTIC TYPES
+==============
+
+semtypes() to get the different kinds of semantic types that can be applied to
+ FEs, LUs, and entire frames
+semtype() to look up a particular semtype by name, ID, or abbreviation
+semtype_inherits() to check whether two semantic types have a subtype-supertype
+ relationship in the semtype hierarchy
+propagate_semtypes() to apply inference rules that distribute semtypes over relations
+ between FEs
+
+ANNOTATIONS
+===========
+
+annotations() to get annotation sets, in which a token in a sentence is annotated
+ with a lexical unit in a frame, along with its frame elements and their syntactic properties;
+ can be constrained by LU name pattern and limited to lexicographic exemplars or full-text.
+ Sentences of full-text annotation can have multiple annotation sets.
+sents() to get annotated sentences illustrating one or more lexical units
+exemplars() to get sentences of lexicographic annotation, most of which have
+ just 1 annotation set; can be constrained by LU name pattern, frame, and overt FE(s)
+doc() to look up a document of full-text annotation by its ID
+docs() to get documents of full-text annotation that match a name pattern
+docs_metadata() to get metadata about all full-text documents without loading them
+ft_sents() to iterate over sentences of full-text annotation
+
+UTILITIES
+=========
+
+buildindexes() loads metadata about all frames, LUs, etc. into memory to avoid
+ delay when one is accessed for the first time. It does not load annotations.
+readme() gives the text of the FrameNet README file
+warnings(True) to display corpus consistency warnings when loading data
+ """
+ print(msg)
+
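An illustrative session combining several of the methods listed above (outputs
omitted; assumes the FrameNet data is installed):

    >>> from nltk.corpus import framenet as fn
    >>> fn.help('frames')                       # full docstring for one method
    >>> f = fn.frame('Expectation')             # look up a frame by exact name
    >>> lus = fn.lus(r'(?i)foresee', frame=f)   # LUs matching a pattern, restricted to that frame
    >>> rels = fn.frame_relations(f)            # frame-to-frame relations involving it
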
def _buildframeindex(self):
# The total number of Frames in Framenet is fairly small (~1200) so
# this index should not be very large
@@ -546,6 +1128,11 @@ class FramenetCorpusReader(XMLCorpusReader):
self._ferel_idx[ferel.ID] = ferel
#print('...done building relation index', file=sys.stderr)
+ def _warn(self, *message, **kwargs):
+ if self._warnings:
+ kwargs.setdefault('file', sys.stderr)
+ print(*message, **kwargs)
+
def readme(self):
"""
Return the contents of the corpus README.txt (or README) file.
@@ -568,7 +1155,7 @@ class FramenetCorpusReader(XMLCorpusReader):
# frame and FE relations
self._buildrelationindex()
- def annotated_document(self, fn_docid):
+ def doc(self, fn_docid):
"""
Returns the annotated document whose id number is
``fn_docid``. This id number can be obtained by calling the
@@ -630,7 +1217,11 @@ class FramenetCorpusReader(XMLCorpusReader):
# Grab the top-level xml element containing the fulltext annotation
elt = XMLCorpusView(locpath, 'fullTextAnnotation')[0]
- return self._handle_fulltextannotation_elt(elt)
+ info = self._handle_fulltextannotation_elt(elt)
+ # add metadata
+ for k,v in self._fulltext_idx[fn_docid].items():
+ info[k] = v
+ return info
def frame_by_id(self, fn_fid, ignorekeys=[]):
"""
@@ -719,11 +1310,13 @@ class FramenetCorpusReader(XMLCorpusReader):
fentry = self._handle_frame_elt(elt, ignorekeys)
assert fentry
+ fentry.URL = self._fnweb_url + '/' + self._frame_dir + '/' + fn_fname + '.xml'
+
# INFERENCE RULE: propagate lexical semtypes from the frame to all its LUs
for st in fentry.semTypes:
if st.rootType.name=='Lexical_type':
for lu in fentry.lexUnit.values():
- if st not in lu.semTypes:
+ if not any(x is st for x in lu.semTypes): # identity containment check
lu.semTypes.append(st)
@@ -827,8 +1420,8 @@ class FramenetCorpusReader(XMLCorpusReader):
search through ALL of the frame XML files in the db.
>>> from nltk.corpus import framenet as fn
- >>> fn.frames_by_lemma(r'(?i)a little')
- [<frame ID=189 name=Quantity>, <frame ID=2001 name=Degree>]
+ >>> fn.frames_by_lemma(r'(?i)a little') # doctest: +ELLIPSIS
+ [<frame ID=189 name=Quanti...>, <frame ID=2001 name=Degree>]
:return: A list of frame objects.
:rtype: list(AttrDict)
@@ -842,17 +1435,23 @@ class FramenetCorpusReader(XMLCorpusReader):
``lu()`` function with "subCorpus" info excluded.
>>> from nltk.corpus import framenet as fn
- >>> PrettyDict(fn.lu_basic(256), breakLines=True)
+ >>> lu = PrettyDict(fn.lu_basic(256), breakLines=True)
+ >>> # ellipses account for differences between FN 1.5 and 1.7
+ >>> lu # doctest: +ELLIPSIS
{'ID': 256,
'POS': 'V',
+ 'URL': u'https://framenet2.icsi.berkeley.edu/fnReports/data/lu/lu256.xml',
'_type': 'lu',
+ 'cBy': ...,
+ 'cDate': '02/08/2001 01:27:50 PST Thu',
'definition': 'COD: be aware of beforehand; predict.',
+ 'definitionMarkup': 'COD: be aware of beforehand; predict.',
'frame': <frame ID=26 name=Expectation>,
'lemmaID': 15082,
'lexemes': [{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}],
'name': 'foresee.v',
'semTypes': [],
- 'sentenceCount': {'annotated': 44, 'total': 227},
+ 'sentenceCount': {'annotated': ..., 'total': ...},
'status': 'FN1_Sent'}
:param fn_luid: The id number of the desired LU
@@ -860,15 +1459,15 @@ class FramenetCorpusReader(XMLCorpusReader):
:return: Basic information about the lexical unit
:rtype: dict
"""
- return self.lu(fn_luid, ignorekeys=['subCorpus'])
+ return self.lu(fn_luid, ignorekeys=['subCorpus', 'exemplars'])
- def lu(self, fn_luid, ignorekeys=[]):
+ def lu(self, fn_luid, ignorekeys=[], luName=None, frameID=None, frameName=None):
"""
- Get information about a specific Lexical Unit using the id number
- ``fn_luid``. This function reads the LU information from the xml
- file on disk each time it is called. You may want to cache this
- info if you plan to call this function with the same id number
- multiple times.
+ Access a lexical unit by its ID. luName, frameID, and frameName are used
+ only in the event that the LU does not have a file in the database
+ (which is the case for LUs with "Problem" status); in this case,
+ a placeholder LU is created which just contains its name, ID, and frame.
+
Usage examples:
@@ -882,6 +1481,40 @@ class FramenetCorpusReader(XMLCorpusReader):
>>> pprint(list(map(PrettyDict, fn.lu(256).lexemes)))
[{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}]
+ >>> fn.lu(227).exemplars[23]
+ exemplar sentence (352962):
+ [sentNo] 0
+ [aPos] 59699508
+ <BLANKLINE>
+ [LU] (227) guess.v in Coming_to_believe
+ <BLANKLINE>
+ [frame] (23) Coming_to_believe
+ <BLANKLINE>
+ [annotationSet] 2 annotation sets
+ <BLANKLINE>
+ [POS] 18 tags
+ <BLANKLINE>
+ [POS_tagset] BNC
+ <BLANKLINE>
+ [GF] 3 relations
+ <BLANKLINE>
+ [PT] 3 phrases
+ <BLANKLINE>
+ [Other] 1 entry
+ <BLANKLINE>
+ [text] + [Target] + [FE]
+ <BLANKLINE>
+ When he was inside the house , Culley noticed the characteristic
+ ------------------
+ Content
+ <BLANKLINE>
+ he would n't have guessed at .
+ -- ******* --
+ Co C1 [Evidence:INI]
+ (Co=Cognizer, C1=Content)
+ <BLANKLINE>
+ <BLANKLINE>
+
The dict that is returned from this function will contain most of the
following information about the LU. Note that some LUs do not contain
all of these pieces of information - particularly 'totalAnnotated' and
@@ -959,8 +1592,20 @@ class FramenetCorpusReader(XMLCorpusReader):
# look for this LU in cache
if not self._lu_idx:
self._buildluindex()
- luinfo = self._lu_idx[fn_luid]
- if '_type' not in luinfo:
+ OOV = object()
+ luinfo = self._lu_idx.get(fn_luid, OOV)
+ if luinfo is OOV:
+ # LU not in the index. We create a placeholder by falling back to
+ # luName, frameID, and frameName. However, this will not be listed
+ # among the LUs for its frame.
+ self._warn('LU ID not found: {0} ({1}) in {2} ({3})'.format(luName, fn_luid, frameName, frameID))
+ luinfo = AttrDict({'_type': 'lu', 'ID': fn_luid, 'name': luName,
+ 'frameID': frameID, 'status': 'Problem'})
+ f = self.frame_by_id(luinfo.frameID)
+ assert f.name==frameName,(f.name,frameName)
+ luinfo['frame'] = f
+ self._lu_idx[fn_luid] = luinfo
+ elif '_type' not in luinfo:
# we only have an index entry for the LU. loading the frame will replace this.
f = self.frame_by_id(luinfo.frameID)
luinfo = self._lu_idx[fn_luid]
@@ -988,9 +1633,18 @@ class FramenetCorpusReader(XMLCorpusReader):
raise FramenetError('Unknown LU id: {0}'.format(fn_luid))
lu2 = self._handle_lexunit_elt(elt, ignorekeys)
+ lu.URL = self._fnweb_url + '/' + self._lu_dir + '/' + fname
lu.subCorpus = lu2.subCorpus
+ lu.exemplars = SpecialList('luexemplars',
+ [sent for subc in lu.subCorpus for sent in subc.sentence])
+ for sent in lu.exemplars:
+ sent['LU'] = lu
+ sent['frame'] = lu.frame
+ for aset in sent.annotationSet:
+ aset['LU'] = lu
+ aset['frame'] = lu.frame
- return lu.subCorpus
+ return lu
def _loadsemtypes(self):
"""Create the semantic types index."""
@@ -1034,11 +1688,11 @@ class FramenetCorpusReader(XMLCorpusReader):
to traverse the neighboring relations on demand for each FE semtype.)
>>> from nltk.corpus import framenet as fn
- >>> sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType)
- 4241
+ >>> x = sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType)
>>> fn.propagate_semtypes()
- >>> sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType)
- 5252
+ >>> y = sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType)
+ >>> y-x > 1000
+ True
"""
if not self._semtypes:
self._loadsemtypes()
@@ -1124,13 +1778,15 @@ class FramenetCorpusReader(XMLCorpusReader):
Obtain details for a specific frame.
>>> from nltk.corpus import framenet as fn
- >>> len(fn.frames())
- 1019
- >>> PrettyList(fn.frames(r'(?i)medical'), maxReprSize=0, breakLines=True)
- [<frame ID=256 name=Medical_specialties>,
- <frame ID=257 name=Medical_instruments>,
- <frame ID=255 name=Medical_professionals>,
- <frame ID=239 name=Medical_conditions>]
+ >>> len(fn.frames()) in (1019, 1221) # FN 1.5 and 1.7, resp.
+ True
+ >>> x = PrettyList(fn.frames(r'(?i)crim'), maxReprSize=0, breakLines=True)
+ >>> x.sort(key=lambda f: f.ID)
+ >>> x
+ [<frame ID=200 name=Criminal_process>,
+ <frame ID=500 name=Criminal_investigation>,
+ <frame ID=692 name=Crime_scenario>,
+ <frame ID=700 name=Committing_crime>]
A brief intro to Frames (excerpted from "FrameNet II: Extended
Theory and Practice" by Ruppenhofer et. al., 2010):
@@ -1198,28 +1854,33 @@ class FramenetCorpusReader(XMLCorpusReader):
self._buildframeindex()
return dict((fID, finfo.name) for fID,finfo in self._frame_idx.items() if name is None or re.search(name, finfo.name) is not None)
- def fes(self, name=None):
+ def fes(self, name=None, frame=None):
'''
- Lists frame element objects. If 'name' is provided, this is treated as
- a case-insensitive regular expression to filter by frame name.
- (Case-insensitivity is because casing of frame element names is not always
- consistent across frames.)
-
+ Lists frame element objects. If 'name' is provided, this is treated as
+ a case-insensitive regular expression to filter by frame name.
+ (Case-insensitivity is because casing of frame element names is not always
+ consistent across frames.) Specify 'frame' to filter by a frame name pattern,
+ ID, or object.
+
>>> from nltk.corpus import framenet as fn
>>> fn.fes('Noise_maker')
[<fe ID=6043 name=Noise_maker>]
>>> sorted([(fe.frame.name,fe.name) for fe in fn.fes('sound')])
- [('Cause_to_make_noise', 'Sound_maker'), ('Make_noise', 'Sound'),
- ('Make_noise', 'Sound_source'), ('Sound_movement', 'Location_of_sound_source'),
- ('Sound_movement', 'Sound'), ('Sound_movement', 'Sound_source'),
- ('Sounds', 'Component_sound'), ('Sounds', 'Location_of_sound_source'),
- ('Sounds', 'Sound_source'), ('Vocalizations', 'Location_of_sound_source'),
+ [('Cause_to_make_noise', 'Sound_maker'), ('Make_noise', 'Sound'),
+ ('Make_noise', 'Sound_source'), ('Sound_movement', 'Location_of_sound_source'),
+ ('Sound_movement', 'Sound'), ('Sound_movement', 'Sound_source'),
+ ('Sounds', 'Component_sound'), ('Sounds', 'Location_of_sound_source'),
+ ('Sounds', 'Sound_source'), ('Vocalizations', 'Location_of_sound_source'),
('Vocalizations', 'Sound_source')]
+ >>> sorted([(fe.frame.name,fe.name) for fe in fn.fes('sound',r'(?i)make_noise')])
+ [('Cause_to_make_noise', 'Sound_maker'),
+ ('Make_noise', 'Sound'),
+ ('Make_noise', 'Sound_source')]
>>> sorted(set(fe.name for fe in fn.fes('^sound')))
['Sound', 'Sound_maker', 'Sound_source']
>>> len(fn.fes('^sound$'))
2
-
+
:param name: A regular expression pattern used to match against
frame element names. If 'name' is None, then a list of all
frame elements will be returned.
@@ -1227,19 +1888,34 @@ class FramenetCorpusReader(XMLCorpusReader):
:return: A list of matching frame elements
:rtype: list(AttrDict)
'''
- return PrettyList(fe for f in self.frames() for fename,fe in f.FE.items() if name is None or re.search(name, fename, re.I))
+ # what frames are we searching in?
+ if frame is not None:
+ if isinstance(frame, int):
+ frames = [self.frame(frame)]
+ elif isinstance(frame, string_types):
+ frames = self.frames(frame)
+ else:
+ frames = [frame]
+ else:
+ frames = self.frames()
+
+ return PrettyList(fe for f in frames for fename,fe in f.FE.items() if name is None or re.search(name, fename, re.I))
- def lus(self, name=None):
+ def lus(self, name=None, frame=None):
"""
- Obtain details for a specific lexical unit.
+ Obtain details for lexical units.
+ Optionally restrict by lexical unit name pattern, and/or to a certain frame
+ or frames whose name matches a pattern.
>>> from nltk.corpus import framenet as fn
- >>> len(fn.lus())
- 11829
+ >>> len(fn.lus()) in (11829, 13572) # FN 1.5 and 1.7, resp.
+ True
>>> PrettyList(fn.lus(r'(?i)a little'), maxReprSize=0, breakLines=True)
[<lu ID=14744 name=a little bit.adv>,
<lu ID=14733 name=a little.n>,
<lu ID=14743 name=a little.adv>]
+ >>> fn.lus(r'interest', r'(?i)stimulus')
+ [<lu ID=14920 name=interesting.a>, <lu ID=14894 name=interested.a>]
A brief intro to Lexical Units (excerpted from "FrameNet II:
Extended Theory and Practice" by Ruppenhofer et. al., 2010):
@@ -1324,21 +2000,39 @@ class FramenetCorpusReader(XMLCorpusReader):
scon - subordinating conjunction
:type name: str
+ :type frame: str or int or frame
:return: A list of selected (or all) lexical units
:rtype: list of LU objects (dicts). See the lu() function for info
about the specifics of LU objects.
"""
- try:
- luIDs = list(self._lu_idx.keys())
- except AttributeError:
+ if not self._lu_idx:
self._buildluindex()
- luIDs = list(self._lu_idx.keys())
- if name is not None:
- return PrettyList(self.lu(luID) for luID,luName in self.lu_ids_and_names(name).items())
- else:
- return PrettyLazyMap(self.lu, luIDs)
+
+
+ if name is not None: # match LUs, then restrict by frame
+ result = PrettyList(self.lu(luID) for luID,luName in self.lu_ids_and_names(name).items())
+ if frame is not None:
+ if isinstance(frame, int):
+ frameIDs = {frame}
+ elif isinstance(frame, string_types):
+ frameIDs = {f.ID for f in self.frames(frame)}
+ else:
+ frameIDs = {frame.ID}
+ result = PrettyList(lu for lu in result if lu.frame.ID in frameIDs)
+ elif frame is not None: # all LUs in matching frames
+ if isinstance(frame, int):
+ frames = [self.frame(frame)]
+ elif isinstance(frame, string_types):
+ frames = self.frames(frame)
+ else:
+ frames = [frame]
+ result = PrettyLazyIteratorList(iter(LazyConcatenation(list(f.lexUnit.values()) for f in frames)))
+ else: # all LUs
+ luIDs = [luID for luID,lu in self._lu_idx.items() if lu.status not in self._bad_statuses]
+ result = PrettyLazyMap(self.lu, luIDs)
+ return result
def lu_ids_and_names(self, name=None):
"""
@@ -1347,20 +2041,22 @@ class FramenetCorpusReader(XMLCorpusReader):
"""
if not self._lu_idx:
self._buildluindex()
- return dict((luID, luinfo.name) for luID,luinfo in self._lu_idx.items() if name is None or re.search(name, luinfo.name) is not None)
+ return {luID: luinfo.name for luID,luinfo in self._lu_idx.items()
+ if luinfo.status not in self._bad_statuses
+ and (name is None or re.search(name, luinfo.name) is not None)}
- def documents(self, name=None):
+ def docs_metadata(self, name=None):
"""
- Return a list of the annotated documents in Framenet.
+ Return an index of the annotated documents in Framenet.
Details for a specific annotated document can be obtained using this
- class's annotated_document() function and pass it the value of the 'ID' field.
+ class's doc() function, passing it the value of the 'ID' field.
>>> from nltk.corpus import framenet as fn
- >>> len(fn.documents())
- 78
- >>> set([x.corpname for x in fn.documents()])==set(['ANC', 'C-4', 'KBEval', \
- 'LUCorpus-v0.3', 'Miscellaneous', 'NTI', 'PropBank', 'QA', 'SemAnno'])
+ >>> len(fn.docs()) in (78, 107) # FN 1.5 and 1.7, resp.
+ True
+ >>> set([x.corpname for x in fn.docs_metadata()])>=set(['ANC', 'KBEval', \
+ 'LUCorpus-v0.3', 'Miscellaneous', 'NTI', 'PropBank'])
True
:param name: A regular expression pattern used to search the
@@ -1394,6 +2090,152 @@ class FramenetCorpusReader(XMLCorpusReader):
else:
return PrettyList(x for x in ftlist if re.search(name, x['filename']) is not None)
+ def docs(self, name=None):
+ """
+ Return a list of the annotated full-text documents in FrameNet,
+ optionally filtered by a regex to be matched against the document name.
+ """
+ return PrettyLazyMap((lambda x: self.doc(x.ID)), self.docs_metadata(name))
+
+ def sents(self, exemplars=True, full_text=True):
+ """
+ Annotated sentences matching the specified criteria.
+ """
+ if exemplars:
+ if full_text:
+ return self.exemplars() + self.ft_sents()
+ else:
+ return self.exemplars()
+ elif full_text:
+ return self.ft_sents()
+
+ def annotations(self, luNamePattern=None, exemplars=True, full_text=True):
+ """
+ Frame annotation sets matching the specified criteria.
+ """
+
+ if exemplars:
+ epart = PrettyLazyIteratorList(sent.frameAnnotation for sent in self.exemplars(luNamePattern))
+ else:
+ epart = []
+
+ if full_text:
+ if luNamePattern is not None:
+ matchedLUIDs = set(self.lu_ids_and_names(luNamePattern).keys())
+ ftpart = PrettyLazyIteratorList(aset for sent in self.ft_sents() for aset in sent.annotationSet[1:] if luNamePattern is None or aset.get('luID','CXN_ASET') in matchedLUIDs)
+ else:
+ ftpart = []
+
+ if exemplars:
+ if full_text:
+ return epart + ftpart
+ else:
+ return epart
+ elif full_text:
+ return ftpart
+
+ def exemplars(self, luNamePattern=None, frame=None, fe=None, fe2=None):
+ """
+ Lexicographic exemplar sentences, optionally filtered by LU name and/or 1-2 FEs that
+ are realized overtly. 'frame' may be a name pattern, frame ID, or frame instance.
+ 'fe' may be a name pattern or FE instance; if specified, 'fe2' may also
+ be specified to retrieve sentences with both overt FEs (in either order).
+ """
+ if fe is None and fe2 is not None:
+ raise FramenetError('exemplars(..., fe=None, fe2=<value>) is not allowed')
+ elif fe is not None and fe2 is not None:
+ if not isinstance(fe2, string_types):
+ if isinstance(fe, string_types):
+ # fe2 is specific to a particular frame. swap fe and fe2 so fe is always used to determine the frame.
+ fe, fe2 = fe2, fe
+ elif fe.frame is not fe2.frame: # ensure frames match
+ raise FramenetError('exemplars() call with inconsistent `fe` and `fe2` specification (frames must match)')
+ if frame is None and fe is not None and not isinstance(fe, string_types):
+ frame = fe.frame
+
+ # narrow down to frames matching criteria
+
+ lusByFrame = defaultdict(list) # frame name -> matching LUs, if luNamePattern is specified
+ if frame is not None or luNamePattern is not None:
+ if frame is None or isinstance(frame, string_types):
+ if luNamePattern is not None:
+ frames = set()
+ for lu in self.lus(luNamePattern, frame=frame):
+ frames.add(lu.frame.ID)
+ lusByFrame[lu.frame.name].append(lu)
+ frames = LazyMap(self.frame, list(frames))
+ else:
+ frames = self.frames(frame)
+ else:
+ if isinstance(frame,int):
+ frames = [self.frame(frame)]
+ else: # frame object
+ frames = [frame]
+
+ if luNamePattern is not None:
+ lusByFrame = {frame.name: self.lus(luNamePattern, frame=frame)}
+
+ if fe is not None: # narrow to frames that define this FE
+ if isinstance(fe, string_types):
+ frames = PrettyLazyIteratorList(f for f in frames if fe in f.FE or any(re.search(fe, ffe, re.I) for ffe in f.FE.keys()))
+ else:
+ if fe.frame not in frames:
+ raise FramenetError('exemplars() call with inconsistent `frame` and `fe` specification')
+ frames = [fe.frame]
+
+ if fe2 is not None: # narrow to frames that ALSO define this FE
+ if isinstance(fe2, string_types):
+ frames = PrettyLazyIteratorList(f for f in frames if fe2 in f.FE or any(re.search(fe2, ffe, re.I) for ffe in f.FE.keys()))
+ # else we already narrowed it to a single frame
+ else: # frame, luNamePattern are None. fe, fe2 are None or strings
+ if fe is not None:
+ frames = {ffe.frame.ID for ffe in self.fes(fe)}
+ if fe2 is not None:
+ frames2 = {ffe.frame.ID for ffe in self.fes(fe2)}
+ frames = frames & frames2
+ frames = LazyMap(self.frame, list(frames))
+ else:
+ frames = self.frames()
+
+ # we've narrowed down 'frames'
+ # now get exemplars for relevant LUs in those frames
+
+ def _matching_exs():
+ for f in frames:
+ fes = fes2 = None # FEs of interest
+ if fe is not None:
+ fes = {ffe for ffe in f.FE.keys() if re.search(fe, ffe, re.I)} if isinstance(fe, string_types) else {fe.name}
+ if fe2 is not None:
+ fes2 = {ffe for ffe in f.FE.keys() if re.search(fe2, ffe, re.I)} if isinstance(fe2, string_types) else {fe2.name}
+
+ for lu in lusByFrame[f.name] if luNamePattern is not None else f.lexUnit.values():
+ for ex in lu.exemplars:
+ if (fes is None or self._exemplar_of_fes(ex, fes)) and (fes2 is None or self._exemplar_of_fes(ex, fes2)):
+ yield ex
+
+ return PrettyLazyIteratorList(_matching_exs())
+
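A brief sketch of the filters described above (illustrative; the frame and FE
names are taken from the doctests elsewhere in this module):

    >>> from nltk.corpus import framenet as fn
    >>> exs = fn.exemplars(r'(?i)guess', frame='Coming_to_believe')   # by LU name and frame
    >>> both = fn.exemplars(frame='Coming_to_believe', fe='Cognizer', fe2='Content')   # both FEs overt
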
+ def _exemplar_of_fes(self, ex, fes=None):
+ """
+ Given an exemplar sentence and a set of FE names, return the subset of FE names
+ that are realized overtly in the sentence on the FE, FE2, or FE3 layer.
+
+ If 'fes' is None, returns all overt FE names.
+ """
+ overtNames = set(list(zip(*ex.FE[0]))[2]) if ex.FE[0] else set()
+ if 'FE2' in ex:
+ overtNames |= set(list(zip(*ex.FE2[0]))[2]) if ex.FE2[0] else set()
+ if 'FE3' in ex:
+ overtNames |= set(list(zip(*ex.FE3[0]))[2]) if ex.FE3[0] else set()
+ return overtNames & fes if fes is not None else overtNames
+
+ def ft_sents(self, docNamePattern=None):
+ """
+ Full-text annotation sentences, optionally filtered by document name.
+ """
+ return PrettyLazyIteratorList(sent for d in self.docs(docNamePattern) for sent in d.sentence)
+
+
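Taken together, a rough sketch of how these accessors relate (illustrative):

    >>> from nltk.corpus import framenet as fn
    >>> ft = fn.ft_sents('ANC')                                   # full-text sentences from matching documents
    >>> ex_asets = fn.annotations(r'(?i)guess', full_text=False)  # exemplar annotation sets only
    >>> everything = fn.sents()                                   # exemplar + full-text sentences
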
def frame_relation_types(self):
"""
Obtain a list of frame relation types.
@@ -1402,8 +2244,8 @@ class FramenetCorpusReader(XMLCorpusReader):
>>> frts = list(fn.frame_relation_types())
>>> isinstance(frts, list)
True
- >>> len(frts)
- 9
+ >>> len(frts) in (9, 10) # FN 1.5 and 1.7, resp.
+ True
>>> PrettyDict(frts[0], breakLines=True)
{'ID': 1,
'_type': 'framerelationtype',
@@ -1435,15 +2277,15 @@ class FramenetCorpusReader(XMLCorpusReader):
>>> frels = fn.frame_relations()
>>> isinstance(frels, list)
True
- >>> len(frels)
- 1676
+ >>> len(frels) in (1676, 2070) # FN 1.5 and 1.7, resp.
+ True
>>> PrettyList(fn.frame_relations('Cooking_creation'), maxReprSize=0, breakLines=True)
[<Parent=Intentionally_create -- Inheritance -> Child=Cooking_creation>,
<Parent=Apply_heat -- Using -> Child=Cooking_creation>,
<MainEntry=Apply_heat -- See_also -> ReferringEntry=Cooking_creation>]
- >>> PrettyList(fn.frame_relations(373), breakLines=True)
- [<Parent=Topic -- Using -> Child=Communication>,
- <Source=Discussion -- ReFraming_Mapping -> Target=Topic>, ...]
+ >>> PrettyList(fn.frame_relations(274), breakLines=True)
+ [<Parent=Avoiding -- Inheritance -> Child=Dodging>,
+ <Parent=Avoiding -- Inheritance -> Child=Evading>, ...]
>>> PrettyList(fn.frame_relations(fn.frame('Cooking_creation')), breakLines=True)
[<Parent=Intentionally_create -- Inheritance -> Child=Cooking_creation>,
<Parent=Apply_heat -- Using -> Child=Cooking_creation>, ...]
@@ -1510,8 +2352,8 @@ class FramenetCorpusReader(XMLCorpusReader):
>>> ferels = fn.fe_relations()
>>> isinstance(ferels, list)
True
- >>> len(ferels)
- 10020
+ >>> len(ferels) in (10020, 12393) # FN 1.5 and 1.7, resp.
+ True
>>> PrettyDict(ferels[0], breakLines=True)
{'ID': 14642,
'_type': 'ferelation',
@@ -1541,10 +2383,10 @@ class FramenetCorpusReader(XMLCorpusReader):
>>> from nltk.corpus import framenet as fn
>>> stypes = fn.semtypes()
- >>> len(stypes)
- 73
+ >>> len(stypes) in (73, 109) # FN 1.5 and 1.7, resp.
+ True
>>> sorted(stypes[0].keys())
- ['ID', '_type', 'abbrev', 'definition', 'name', 'rootType', 'subTypes', 'superType']
+ ['ID', '_type', 'abbrev', 'definition', 'definitionMarkup', 'name', 'rootType', 'subTypes', 'superType']
:return: A list of all of the semantic types in framenet
:rtype: list(dict)
@@ -1577,8 +2419,8 @@ class FramenetCorpusReader(XMLCorpusReader):
return d
# Ignore these attributes when loading attributes from an xml node
- ignore_attrs = ['cBy', 'cDate', 'mDate', 'xsi',
- 'schemaLocation', 'xmlns', 'bgColor', 'fgColor']
+ ignore_attrs = [ #'cBy', 'cDate', 'mDate', # <-- annotation metadata that could be of interest
+ 'xsi', 'schemaLocation', 'xmlns', 'bgColor', 'fgColor']
for attr in attr_dict:
@@ -1602,6 +2444,13 @@ class FramenetCorpusReader(XMLCorpusReader):
"""
try:
+ '''
+ # Look for boundary issues in markup. (Sometimes FEs are pluralized in definitions.)
+ m = re.search(r'\w[<][^/]|[<][/][^>]+[>](s\w|[a-rt-z0-9])', data)
+ if m:
+ print('Markup boundary:', data[max(0,m.start(0)-10):m.end(0)+10].replace('\n',' '), file=sys.stderr)
+ '''
+
data = data.replace('<t>', '')
data = data.replace('</t>', '')
data = re.sub('<fex name="[^"]+">', '', data)
@@ -1657,6 +2506,7 @@ class FramenetCorpusReader(XMLCorpusReader):
else:
docname = doc.description
doc.filename = "{0}__{1}.xml".format(corpname, docname)
+ doc.URL = self._fnweb_url + '/' + self._fulltext_dir + '/' + doc.filename
doc.corpname = corpname
doc.corpid = corpid
retlist.append(doc)
@@ -1664,11 +2514,12 @@ class FramenetCorpusReader(XMLCorpusReader):
return retlist
def _handle_frame_elt(self, elt, ignorekeys=[]):
- """Load the info for a Frame from an frame xml file"""
+ """Load the info for a Frame from a frame xml file"""
frinfo = self._load_xml_attributes(AttrDict(), elt)
frinfo['_type'] = 'frame'
frinfo['definition'] = ""
+ frinfo['definitionMarkup'] = ""
frinfo['FE'] = PrettyDict()
frinfo['FEcoreSets'] = []
frinfo['lexUnit'] = PrettyDict()
@@ -1679,6 +2530,7 @@ class FramenetCorpusReader(XMLCorpusReader):
for sub in elt:
if sub.tag.endswith('definition') and 'definition' not in ignorekeys:
+ frinfo['definitionMarkup'] = sub.text
frinfo['definition'] = self._strip_tags(sub.text)
elif sub.tag.endswith('FE') and 'FE' not in ignorekeys:
feinfo = self._handle_fe_elt(sub)
@@ -1694,7 +2546,9 @@ class FramenetCorpusReader(XMLCorpusReader):
# problematic LU entry; ignore it
continue
luentry['frame'] = frinfo
- luentry['subCorpus'] = Future((lambda lu: lambda: self._lu_file(lu))(luentry))
+ luentry['URL'] = self._fnweb_url + '/' + self._lu_dir + '/' + "lu{0}.xml".format(luentry['ID'])
+ luentry['subCorpus'] = Future((lambda lu: lambda: self._lu_file(lu).subCorpus)(luentry))
+ luentry['exemplars'] = Future((lambda lu: lambda: self._lu_file(lu).exemplars)(luentry))
frinfo['lexUnit'][luentry.name] = luentry
if not self._lu_idx:
self._buildluindex()
@@ -1765,7 +2619,7 @@ class FramenetCorpusReader(XMLCorpusReader):
element (which we ignore here) and a bunch of 'sentence'
elements."""
info = AttrDict()
- info['_type'] = 'fulltextannotation'
+ info['_type'] = 'fulltext_annotation'
info['sentence'] = []
for sub in elt:
@@ -1773,40 +2627,59 @@ class FramenetCorpusReader(XMLCorpusReader):
continue # not used
elif sub.tag.endswith('sentence'):
s = self._handle_fulltext_sentence_elt(sub)
+ s.doc = info
info['sentence'].append(s)
return info
def _handle_fulltext_sentence_elt(self, elt):
"""Load information from the given 'sentence' element. Each
- 'sentence' element contains a "text" and an "annotationSet" sub
- element."""
+ 'sentence' element contains a "text" and "annotationSet" sub
+ elements."""
info = self._load_xml_attributes(AttrDict(), elt)
- info['_type'] = "sentence"
+ info['_type'] = "fulltext_sentence"
info['annotationSet'] = []
+ info['targets'] = []
+ target_spans = set()
+ info['_ascii'] = types.MethodType(_annotation_ascii, info) # attach a method for this instance
info['text'] = ""
for sub in elt:
if sub.tag.endswith('text'):
info['text'] = self._strip_tags(sub.text)
elif sub.tag.endswith('annotationSet'):
- a = self._handle_fulltextannotationset_elt(sub)
+ a = self._handle_fulltextannotationset_elt(sub, is_pos=(len(info['annotationSet'])==0))
+ if 'cxnID' in a: # ignoring construction annotations for now
+ continue
+ a.sent = info
+ a.text = info.text
info['annotationSet'].append(a)
-
+ if 'Target' in a:
+ for tspan in a.Target:
+ if tspan in target_spans:
+ self._warn('Duplicate target span "{0}"'.format(info.text[slice(*tspan)]),
+ tspan, 'in sentence',info['ID'], info.text)
+ # this can happen in cases like "chemical and biological weapons"
+ # being annotated as "chemical weapons" and "biological weapons"
+ else:
+ target_spans.add(tspan)
+ info['targets'].append((a.Target, a.luName, a.frameName))
+
+ assert info['annotationSet'][0].status=='UNANN'
+ info['POS'] = info['annotationSet'][0].POS
+ info['POS_tagset'] = info['annotationSet'][0].POS_tagset
return info
- def _handle_fulltextannotationset_elt(self, elt):
+ def _handle_fulltextannotationset_elt(self, elt, is_pos=False):
"""Load information from the given 'annotationSet' element. Each
'annotationSet' contains several "layer" elements."""
- info = self._load_xml_attributes(AttrDict(), elt)
- info['_type'] = "annotationset"
- info['layer'] = []
-
- for sub in elt:
- if sub.tag.endswith('layer'):
- l = self._handle_fulltextlayer_elt(sub)
- info['layer'].append(l)
+ info = self._handle_luannotationset_elt(elt, is_pos=is_pos)
+ if not is_pos:
+ info['_type'] = 'fulltext_annotationset'
+ if 'cxnID' not in info: # ignoring construction annotations for now
+ info['LU'] = self.lu(info.luID, luName=info.luName, frameID=info.frameID, frameName=info.frameName)
+ info['frame'] = info.LU.frame
return info
def _handle_fulltextlayer_elt(self, elt):
@@ -1829,22 +2702,33 @@ class FramenetCorpusReader(XMLCorpusReader):
luinfo['_type'] = 'lu'
luinfo = self._load_xml_attributes(luinfo, elt)
luinfo["definition"] = ""
+ luinfo["definitionMarkup"] = ""
luinfo["sentenceCount"] = PrettyDict()
luinfo['lexemes'] = PrettyList() # multiword LUs have multiple lexemes
luinfo['semTypes'] = PrettyList() # an LU can have multiple semtypes
for sub in elt:
if sub.tag.endswith('definition'):
+ luinfo['definitionMarkup'] = sub.text
luinfo['definition'] = self._strip_tags(sub.text)
elif sub.tag.endswith('sentenceCount'):
luinfo['sentenceCount'] = self._load_xml_attributes(
PrettyDict(), sub)
elif sub.tag.endswith('lexeme'):
- luinfo['lexemes'].append(self._load_xml_attributes(PrettyDict(), sub))
+ lexemeinfo = self._load_xml_attributes(PrettyDict(), sub)
+ if not isinstance(lexemeinfo.name, string_types):
+ # some lexeme names are ints by default: e.g.,
+ # thousand.num has lexeme with name="1000"
+ lexemeinfo.name = str(lexemeinfo.name)
+ luinfo['lexemes'].append(lexemeinfo)
elif sub.tag.endswith('semType'):
semtypeinfo = self._load_xml_attributes(PrettyDict(), sub)
luinfo['semTypes'].append(self.semtype(semtypeinfo.ID))
+ # sort lexemes by 'order' attribute
+ # otherwise, e.g., 'write down.v' may have lexemes in wrong order
+ luinfo['lexemes'].sort(key=lambda x: x.order)
+
return luinfo
def _handle_lexunit_elt(self, elt, ignorekeys):
@@ -1856,6 +2740,7 @@ class FramenetCorpusReader(XMLCorpusReader):
luinfo = self._load_xml_attributes(AttrDict(), elt)
luinfo['_type'] = 'lu'
luinfo['definition'] = ""
+ luinfo['definitionMarkup'] = ""
luinfo['subCorpus'] = PrettyList()
luinfo['lexemes'] = PrettyList() # multiword LUs have multiple lexemes
luinfo['semTypes'] = PrettyList() # an LU can have multiple semtypes
@@ -1869,6 +2754,7 @@ class FramenetCorpusReader(XMLCorpusReader):
elif sub.tag.endswith('valences'):
continue # not used
elif sub.tag.endswith('definition') and 'definition' not in ignorekeys:
+ luinfo['definitionMarkup'] = sub.text
luinfo['definition'] = self._strip_tags(sub.text)
elif sub.tag.endswith('subCorpus') and 'subCorpus' not in ignorekeys:
sc = self._handle_lusubcorpus_elt(sub)
@@ -1886,7 +2772,7 @@ class FramenetCorpusReader(XMLCorpusReader):
"""Load a subcorpus of a lexical unit from the given xml."""
sc = AttrDict()
try:
- sc['name'] = str(elt.get('name'))
+ sc['name'] = elt.get('name')
except AttributeError:
return None
sc['_type'] = "lusubcorpus"
@@ -1905,25 +2791,99 @@ class FramenetCorpusReader(XMLCorpusReader):
info = self._load_xml_attributes(AttrDict(), elt)
info['_type'] = 'lusentence'
info['annotationSet'] = []
+ info['_ascii'] = types.MethodType(_annotation_ascii, info) # attach a method for this instance
for sub in elt:
if sub.tag.endswith('text'):
info['text'] = self._strip_tags(sub.text)
elif sub.tag.endswith('annotationSet'):
- annset = self._handle_luannotationset_elt(sub)
+ annset = self._handle_luannotationset_elt(sub, is_pos=(len(info['annotationSet'])==0))
if annset is not None:
+ assert annset.status=='UNANN' or 'FE' in annset,annset
+ if annset.status!='UNANN':
+ info['frameAnnotation'] = annset
+ # copy layer info up to current level
+ for k in ('Target', 'FE', 'FE2', 'FE3', 'GF', 'PT', 'POS', 'POS_tagset',
+ 'Other', 'Sent', 'Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art'):
+ if k in annset:
+ info[k] = annset[k]
info['annotationSet'].append(annset)
+ annset['sent'] = info
+ annset['text'] = info.text
return info
- def _handle_luannotationset_elt(self, elt):
+ def _handle_luannotationset_elt(self, elt, is_pos=False):
"""Load an annotation set from a sentence in an subcorpus of an LU"""
info = self._load_xml_attributes(AttrDict(), elt)
- info['_type'] = 'luannotationset'
+ info['_type'] = 'posannotationset' if is_pos else 'luannotationset'
info['layer'] = []
+ info['_ascii'] = types.MethodType(_annotation_ascii, info) # attach a method for this instance
+
+ if 'cxnID' in info: # ignoring construction annotations for now.
+ return info
+
for sub in elt:
if sub.tag.endswith('layer'):
l = self._handle_lulayer_elt(sub)
if l is not None:
+ overt = []
+ ni = {} # null instantiations
+
info['layer'].append(l)
+ for lbl in l.label:
+ if 'start' in lbl:
+ thespan = (lbl.start,lbl.end+1,lbl.name)
+ if l.name not in ('Sent','Other'): # 'Sent' and 'Other' layers sometimes contain accidental duplicate spans
+ assert thespan not in overt,(info.ID,l.name,thespan)
+ overt.append(thespan)
+ else: # null instantiation
+ if lbl.name in ni:
+ self._warn('FE with multiple NI entries:', lbl.name, ni[lbl.name], lbl.itype)
+ else:
+ ni[lbl.name] = lbl.itype
+ overt = sorted(overt)
+
+ if l.name=='Target':
+ if not overt:
+ self._warn('Skipping empty Target layer in annotation set ID={0}'.format(info.ID))
+ continue
+ assert all(lblname=='Target' for i,j,lblname in overt)
+ if 'Target' in info:
+ self._warn('Annotation set {0} has multiple Target layers'.format(info.ID))
+ else:
+ info['Target'] = [(i,j) for (i,j,_) in overt]
+ elif l.name=='FE':
+ if l.rank==1:
+ assert 'FE' not in info
+ info['FE'] = (overt, ni)
+ #assert False,info
+ else:
+ # sometimes there are 3 FE layers! e.g. Change_position_on_a_scale.fall.v
+ assert 2<=l.rank<=3,l.rank
+ k = 'FE'+str(l.rank)
+ assert k not in info
+ info[k] = (overt, ni)
+ elif l.name in ('GF', 'PT'):
+ assert l.rank==1
+ info[l.name] = overt
+ elif l.name in ('BNC', 'PENN'):
+ assert l.rank==1
+ info['POS'] = overt
+ info['POS_tagset'] = l.name
+ else:
+ if is_pos:
+ if l.name not in ('NER', 'WSL'):
+ self._warn('Unexpected layer in sentence annotationset:', l.name)
+ else:
+ if l.name not in ('Sent', 'Verb', 'Noun', 'Adj', 'Adv', 'Prep', 'Scon', 'Art', 'Other'):
+ self._warn('Unexpected layer in frame annotationset:', l.name)
+ info[l.name] = overt
+ if not is_pos and 'cxnID' not in info:
+ if 'Target' not in info:
+ self._warn('Missing target in annotation set ID={0}'.format(info.ID))
+ assert 'FE' in info
+ if 'FE3' in info:
+ assert 'FE2' in info
+
return info
def _handle_lulayer_elt(self, elt):
@@ -1943,11 +2903,13 @@ class FramenetCorpusReader(XMLCorpusReader):
feinfo = self._load_xml_attributes(AttrDict(), elt)
feinfo['_type'] = 'fe'
feinfo['definition'] = ""
+ feinfo['definitionMarkup'] = ""
feinfo['semType'] = None
feinfo['requiresFE'] = None
feinfo['excludesFE'] = None
for sub in elt:
if sub.tag.endswith('definition'):
+ feinfo['definitionMarkup'] = sub.text
feinfo['definition'] = self._strip_tags(sub.text)
elif sub.tag.endswith('semType'):
stinfo = self._load_xml_attributes(AttrDict(), sub)
@@ -1966,6 +2928,7 @@ class FramenetCorpusReader(XMLCorpusReader):
semt['subTypes'] = PrettyList()
for sub in elt:
if sub.text is not None:
+ semt['definitionMarkup'] = sub.text
semt['definition'] = self._strip_tags(sub.text)
else:
supertypeinfo = self._load_xml_attributes(AttrDict(), sub)
@@ -1994,7 +2957,7 @@ def demo():
#
print('Number of Frames:', len(fn.frames()))
print('Number of Lexical Units:', len(fn.lus()))
- print('Number of annotated documents:', len(fn.documents()))
+ print('Number of annotated documents:', len(fn.docs()))
print()
#
@@ -2065,14 +3028,14 @@ def demo():
# Get a list of all of the corpora used for fulltext annotation
#
print('\nNames of all of the corpora used for fulltext annotation:')
- allcorpora = set([x.corpname for x in fn.documents()])
+ allcorpora = set([x.corpname for x in fn.docs_metadata()])
pprint(list(allcorpora))
#
# Get the names of the annotated documents in the first corpus
#
firstcorp = list(allcorpora)[0]
- firstcorp_docs = fn.documents(firstcorp)
+ firstcorp_docs = fn.docs(firstcorp)
print(
'\nNames of the annotated documents in the "{0}" corpus:'.format(firstcorp))
pprint([x.filename for x in firstcorp_docs])
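The framenet changes above keep the raw XML alongside the stripped definitions (definitionMarkup), copy Target/FE/POS layer information up onto each exemplar sentence, and reach full-text annotation through docs()/docs_metadata(). A minimal sketch, not part of the patch, assuming the FrameNet data package is installed; LU ID 256 is used purely as an example:

    from nltk.corpus import framenet as fn

    lu = fn.lu(256)                      # 256 is just an illustrative LU ID
    print(lu.definition)                 # tag-stripped definition, as before
    print(lu.definitionMarkup)           # raw XML definition kept by this change

    # exemplar sentences now carry 'Target', 'FE', 'POS', ... copied up from
    # their frame annotation set, so spans can be read off the sentence itself

    print('documents:', len(fn.docs()))  # full-text docs, renamed from documents()
    print('corpora:', sorted(set(d.corpname for d in fn.docs_metadata())))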
diff --git a/nltk/corpus/reader/ieer.py b/nltk/corpus/reader/ieer.py
index 977b285..91b9425 100644
--- a/nltk/corpus/reader/ieer.py
+++ b/nltk/corpus/reader/ieer.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: IEER Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
@@ -22,6 +22,8 @@ APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.
"""
from __future__ import unicode_literals
+from six import string_types
+
import nltk
from nltk import compat
from nltk.corpus.reader.api import *
@@ -66,7 +68,7 @@ class IEERCorpusReader(CorpusReader):
"""
def raw(self, fileids=None):
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def docs(self, fileids=None):
@@ -108,4 +110,3 @@ class IEERCorpusReader(CorpusReader):
if line.strip() == '</DOC>': break
# Return the document
return ['\n'.join(out)]
-
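This and most of the remaining readers get the same one-line treatment: string_types now comes from six rather than nltk.compat, inside the usual "accept one fileid or a list" normalization. A stand-alone sketch of that idiom; normalize_fileids is a hypothetical helper, not an NLTK function:

    from six import string_types

    def normalize_fileids(fileids, default_fileids):
        """Accept None, a single fileid string, or a list of fileids."""
        if fileids is None:
            return list(default_fileids)
        if isinstance(fileids, string_types):   # str on Py3, str/unicode on Py2
            return [fileids]
        return list(fileids)

    print(normalize_fileids('APW_19980429', ['a', 'b']))   # ['APW_19980429']
    print(normalize_fileids(None, ['a', 'b']))             # ['a', 'b']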
diff --git a/nltk/corpus/reader/indian.py b/nltk/corpus/reader/indian.py
index f7dee59..1c50547 100644
--- a/nltk/corpus/reader/indian.py
+++ b/nltk/corpus/reader/indian.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
@@ -18,7 +18,8 @@ Contents:
- Telugu: IIIT Hyderabad
"""
-from nltk import compat
+from six import string_types
+
from nltk.tag import str2tuple, map_tag
from nltk.corpus.reader.util import *
@@ -58,7 +59,7 @@ class IndianCorpusReader(CorpusReader):
def raw(self, fileids=None):
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
@@ -82,5 +83,3 @@ class IndianCorpusView(StreamBackedCorpusView):
return [sent]
else:
return sent
-
-
diff --git a/nltk/corpus/reader/ipipan.py b/nltk/corpus/reader/ipipan.py
index fa62e48..bf9b73e 100644
--- a/nltk/corpus/reader/ipipan.py
+++ b/nltk/corpus/reader/ipipan.py
@@ -1,13 +1,14 @@
# Natural Language Toolkit: IPI PAN Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Konrad Goluchowski <kodie at mimuw.edu.pl>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import functools
-from nltk import compat
+from six import string_types
+
from nltk.corpus.reader.util import StreamBackedCorpusView, concat
from nltk.corpus.reader.api import CorpusReader
@@ -93,11 +94,11 @@ class IPIPANCorpusReader(CorpusReader):
if channels is None and domains is None and \
categories is None:
return CorpusReader.fileids(self)
- if isinstance(channels, compat.string_types):
+ if isinstance(channels, string_types):
channels = [channels]
- if isinstance(domains, compat.string_types):
+ if isinstance(domains, string_types):
domains = [domains]
- if isinstance(categories, compat.string_types):
+ if isinstance(categories, string_types):
categories = [categories]
if channels:
return self._list_morph_files_by('channel', channels)
diff --git a/nltk/corpus/reader/knbc.py b/nltk/corpus/reader/knbc.py
index e280fc9..8ad90a7 100644
--- a/nltk/corpus/reader/knbc.py
+++ b/nltk/corpus/reader/knbc.py
@@ -1,6 +1,6 @@
#! /usr/bin/env python
# KNB Corpus reader
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Masato Hagiwara <hagisan at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -9,8 +9,8 @@
from __future__ import print_function
import re
+from six import string_types
-from nltk.compat import string_types
from nltk.parse import DependencyGraph
from nltk.corpus.reader.util import (
diff --git a/nltk/corpus/reader/lin.py b/nltk/corpus/reader/lin.py
index cb25822..49d8a93 100644
--- a/nltk/corpus/reader/lin.py
+++ b/nltk/corpus/reader/lin.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Lin's Thesaurus
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Dan Blanchard <dblanchard at ets.org>
# URL: <http://nltk.org/>
# For license information, see LICENSE.txt
diff --git a/nltk/corpus/reader/mte.py b/nltk/corpus/reader/mte.py
index 71dd1a8..cd443a1 100644
--- a/nltk/corpus/reader/mte.py
+++ b/nltk/corpus/reader/mte.py
@@ -2,13 +2,14 @@
A reader for corpora whose documents are in MTE format.
"""
import os
+import re
from functools import reduce
-from nltk import compat
+
+from six import string_types
+
from nltk.corpus.reader import concat, TaggedCorpusReader
from nltk.corpus.reader.xmldocs import XMLCorpusView
-import xml.etree.ElementTree as etree
-import re
def xpath(root, path, ns):
return root.findall(path, ns)
@@ -179,7 +180,7 @@ class MTECorpusReader(TaggedCorpusReader):
def __fileids(self, fileids):
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
# filter wrong userinput
fileids = filter(lambda x : x in self._fileids, fileids)
# filter multext-east sourcefiles that are not compatible to the teip5 specification
diff --git a/nltk/corpus/reader/nkjp.py b/nltk/corpus/reader/nkjp.py
index a6af20b..6f141a2 100644
--- a/nltk/corpus/reader/nkjp.py
+++ b/nltk/corpus/reader/nkjp.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: NKJP Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Gabriela Kaczka
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -8,7 +8,8 @@
import functools
import os
import tempfile
-from nltk import compat
+
+from six import string_types
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
@@ -53,7 +54,7 @@ class NKJPCorpusReader(XMLCorpusReader):
x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
"""
- if isinstance(fileids, compat.string_types):
+ if isinstance(fileids, string_types):
XMLCorpusReader.__init__(self, root, fileids + '.*/header.xml')
else:
XMLCorpusReader.__init__(self, root, [fileid + '/header.xml' for fileid in fileids])
diff --git a/nltk/corpus/reader/nombank.py b/nltk/corpus/reader/nombank.py
index 8b5db56..e1427ac 100644
--- a/nltk/corpus/reader/nombank.py
+++ b/nltk/corpus/reader/nombank.py
@@ -1,17 +1,20 @@
# Natural Language Toolkit: NomBank Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Paul Bedaride <paul.bedaride at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import unicode_literals
+from xml.etree import ElementTree
+from functools import total_ordering
+
+from six import string_types
from nltk.tree import Tree
-from xml.etree import ElementTree
from nltk.internals import raise_unorderable_types
-from nltk.compat import total_ordering, python_2_unicode_compatible, string_types
+from nltk.compat import python_2_unicode_compatible
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
@@ -66,7 +69,7 @@ class NombankCorpusReader(CorpusReader):
:return: the text contents of the given fileids, as a single string.
"""
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def instances(self, baseform=None):
@@ -418,4 +421,3 @@ class NombankTreePointer(NombankPointer):
else:
wordnum += 1
stack.pop()
-
diff --git a/nltk/corpus/reader/nps_chat.py b/nltk/corpus/reader/nps_chat.py
index cf45466..a2da13c 100644
--- a/nltk/corpus/reader/nps_chat.py
+++ b/nltk/corpus/reader/nps_chat.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: NPS Chat Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/corpus/reader/opinion_lexicon.py b/nltk/corpus/reader/opinion_lexicon.py
index a436eee..0c70278 100644
--- a/nltk/corpus/reader/opinion_lexicon.py
+++ b/nltk/corpus/reader/opinion_lexicon.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Opinion Lexicon Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -27,8 +27,8 @@ Related papers:
Comparing Opinions on the Web". Proceedings of the 14th International World
Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.
"""
+from six import string_types
-from nltk.compat import string_types
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus.reader.api import *
@@ -83,7 +83,7 @@ class OpinionLexiconCorpusReader(WordListCorpusReader):
:rtype: list(str)
"""
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
for (path, enc, fileid) in self.abspaths(fileids, True, True)])
diff --git a/nltk/corpus/reader/panlex_lite.py b/nltk/corpus/reader/panlex_lite.py
index 62cc6b2..08d3399 100644
--- a/nltk/corpus/reader/panlex_lite.py
+++ b/nltk/corpus/reader/panlex_lite.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: PanLex Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: David Kamholz <kamholz at panlex.org>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/corpus/reader/pl196x.py b/nltk/corpus/reader/pl196x.py
index e782785..93b8b19 100644
--- a/nltk/corpus/reader/pl196x.py
+++ b/nltk/corpus/reader/pl196x.py
@@ -1,278 +1,292 @@
# Natural Language Toolkit:
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Piotr Kasprzyk <p.j.kasprzyk at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-import os
-import re
+from six import string_types
-from nltk import compat
-from nltk import tokenize, tree
-
-from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader
-# (?:something) -- non-capturing parentheses!
PARA = re.compile(r'<p(?: [^>]*){0,1}>(.*?)</p>')
SENT = re.compile(r'<s(?: [^>]*){0,1}>(.*?)</s>')
TAGGEDWORD = re.compile(r'<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>')
-WORD = re.compile(r'<[wc](?: [^>]*){0,1}>(.*?)</[wc]>')
+WORD = re.compile(r'<[wc](?: [^>]*){0,1}>(.*?)</[wc]>')
TYPE = re.compile(r'type="(.*?)"')
-ANA = re.compile(r'ana="(.*?)"')
+ANA = re.compile(r'ana="(.*?)"')
TEXTID = re.compile(r'text id="(.*?)"')
class TEICorpusView(StreamBackedCorpusView):
- def __init__(self, corpus_file,
- tagged, group_by_sent, group_by_para,
- tagset=None, headLen=0, textids=None):
- self._tagged = tagged
- self._textids = textids
-
- self._group_by_sent = group_by_sent
- self._group_by_para = group_by_para
- # WARNING -- skip header
- StreamBackedCorpusView.__init__(self, corpus_file, startpos=headLen)
-
- _pagesize = 4096
-
- def read_block(self, stream):
- block = stream.readlines(self._pagesize)
- block = concat(block)
- while (block.count('<text id') > block.count('</text>')) \
- or block.count('<text id') == 0:
- tmp = stream.readline()
- if len(tmp) <= 0:
- break
- block += tmp
-
- block = block.replace('\n','')
-
- textids = TEXTID.findall(block)
- if self._textids:
- for tid in textids:
- if tid not in self._textids:
- beg = block.find(tid)-1
- end = block[beg: ].find('</text>')+len('</text>')
- block = block[ :beg]+block[beg+end: ]
-
- output = []
- for para_str in PARA.findall(block):
- para = []
- for sent_str in SENT.findall(para_str):
- if not self._tagged:
- sent = WORD.findall(sent_str)
- else:
- sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str)))
- if self._group_by_sent:
- para.append(sent)
- else:
- para.extend(sent)
- if self._group_by_para:
- output.append(para)
- else:
- output.extend(para)
- return output
-
- def _parse_tag(self, tag_word_tuple):
- (tag, word) = tag_word_tuple
- if tag.startswith('w'):
- tag = ANA.search(tag).group(1)
- else: # tag.startswith('c')
- tag = TYPE.search(tag).group(1)
- return (word, tag)
+ def __init__(self, corpus_file,
+ tagged, group_by_sent, group_by_para,
+ tagset=None, head_len=0, textids=None):
+
+ self._tagged = tagged
+ self._textids = textids
+
+ self._group_by_sent = group_by_sent
+ self._group_by_para = group_by_para
+ # WARNING -- skip header
+ StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len)
+
+ _pagesize = 4096
+
+ def read_block(self, stream):
+ block = stream.readlines(self._pagesize)
+ block = concat(block)
+ while (block.count('<text id') > block.count('</text>')) \
+ or block.count('<text id') == 0:
+ tmp = stream.readline()
+ if len(tmp) <= 0:
+ break
+ block += tmp
+
+ block = block.replace('\n', '')
+
+ textids = TEXTID.findall(block)
+ if self._textids:
+ for tid in textids:
+ if tid not in self._textids:
+ beg = block.find(tid) - 1
+ end = block[beg:].find('</text>') + len('</text>')
+ block = block[:beg] + block[beg + end:]
+
+ output = []
+ for para_str in PARA.findall(block):
+ para = []
+ for sent_str in SENT.findall(para_str):
+ if not self._tagged:
+ sent = WORD.findall(sent_str)
+ else:
+ sent = list(
+ map(self._parse_tag, TAGGEDWORD.findall(sent_str)))
+ if self._group_by_sent:
+ para.append(sent)
+ else:
+ para.extend(sent)
+ if self._group_by_para:
+ output.append(para)
+ else:
+ output.extend(para)
+ return output
+
+ def _parse_tag(self, tag_word_tuple):
+ (tag, word) = tag_word_tuple
+ if tag.startswith('w'):
+ tag = ANA.search(tag).group(1)
+ else: # tag.startswith('c')
+ tag = TYPE.search(tag).group(1)
+ return word, tag
class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
-
- headLen = 2770
-
- def __init__(self, *args, **kwargs):
- if 'textid_file' in kwargs: self._textids = kwargs['textid_file']
- else: self._textids = None
-
- XMLCorpusReader.__init__(self, *args)
- CategorizedCorpusReader.__init__(self, kwargs)
-
- self._init_textids()
-
- def _init_textids(self):
- self._f2t = defaultdict(list)
- self._t2f = defaultdict(list)
- if self._textids is not None:
- for line in self.open(self._textids).readlines():
- line = line.strip()
- file_id, text_ids = line.split(' ', 1)
- if file_id not in self.fileids():
- raise ValueError('In text_id mapping file %s: %s '
- 'not found' % (catfile, file_id))
- for text_id in text_ids.split(self._delimiter):
- self._add_textids(file_id, text_id)
-
- def _add_textids(self, file_id, text_id):
- self._f2t[file_id].append(text_id)
- self._t2f[text_id].append(file_id)
-
- def _resolve(self, fileids, categories, textids=None):
- tmp = None
- if fileids is not None:
- if not tmp:
- tmp = fileids, None
- else:
- raise ValueError('Specify only fileids, categories or textids')
- if categories is not None:
- if not tmp:
- tmp = self.fileids(categories), None
- else:
- raise ValueError('Specify only fileids, categories or textids')
- if textids is not None:
- if not tmp:
- if isinstance(textids, compat.string_types): textids = [textids]
- files = sum((self._t2f[t] for t in textids), [])
- tdict = dict()
- for f in files:
- tdict[f] = (set(self._f2t[f]) & set(textids))
- tmp = files, tdict
- else:
- raise ValueError('Specify only fileids, categories or textids')
- return None, None
-
- def decode_tag(self, tag):
- # to be implemented
- return tag
-
- def textids(self, fileids=None, categories=None):
- """
- In the pl196x corpus each category is stored in single
- file and thus both methods provide identical functionality. In order
- to accommodate finer granularity, a non-standard textids() method was
- implemented. All the main functions can be supplied with a list
- of required chunks---giving much more control to the user.
- """
- fileids, _ = self._resolve(fileids, categories)
- if fileids is None: return sorted(self._t2f)
-
- if isinstance(fileids, compat.string_types):
- fileids = [fileids]
- return sorted(sum((self._f2t[d] for d in fileids), []))
-
- def words(self, fileids=None, categories=None, textids=None):
- fileids, textids = self._resolve(fileids, categories, textids)
- if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
-
- if textids:
- return concat([TEICorpusView(self.abspath(fileid),
- False, False, False,
- headLen=self.headLen,
- textids=textids[fileid])
- for fileid in fileids])
- else:
- return concat([TEICorpusView(self.abspath(fileid),
- False, False, False,
- headLen=self.headLen)
- for fileid in fileids])
-
- def sents(self, fileids=None, categories=None, textids=None):
- fileids, textids = self._resolve(fileids, categories, textids)
- if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
-
- if textids:
- return concat([TEICorpusView(self.abspath(fileid),
- False, True, False,
- headLen=self.headLen,
- textids=textids[fileid])
- for fileid in fileids])
- else:
- return concat([TEICorpusView(self.abspath(fileid),
- False, True, False,
- headLen=self.headLen)
- for fileid in fileids])
-
- def paras(self, fileids=None, categories=None, textids=None):
- fileids, textids = self._resolve(fileids, categories, textids)
- if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
-
- if textids:
- return concat([TEICorpusView(self.abspath(fileid),
- False, True, True,
- headLen=self.headLen,
- textids=textids[fileid])
- for fileid in fileids])
- else:
- return concat([TEICorpusView(self.abspath(fileid),
- False, True, True,
- headLen=self.headLen)
- for fileid in fileids])
-
- def tagged_words(self, fileids=None, categories=None, textids=None):
- fileids, textids = self._resolve(fileids, categories, textids)
- if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
-
- if textids:
- return concat([TEICorpusView(self.abspath(fileid),
- True, False, False,
- headLen=self.headLen,
- textids=textids[fileid])
- for fileid in fileids])
- else:
- return concat([TEICorpusView(self.abspath(fileid),
- True, False, False,
- headLen=self.headLen)
- for fileid in fileids])
-
- def tagged_sents(self, fileids=None, categories=None, textids=None):
- fileids, textids = self._resolve(fileids, categories, textids)
- if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
-
- if textids:
- return concat([TEICorpusView(self.abspath(fileid),
- True, True, False,
- headLen=self.headLen,
- textids=textids[fileid])
- for fileid in fileids])
- else:
- return concat([TEICorpusView(self.abspath(fileid),
- True, True, False,
- headLen=self.headLen)
- for fileid in fileids])
-
- def tagged_paras(self, fileids=None, categories=None, textids=None):
- fileids, textids = self._resolve(fileids, categories, textids)
- if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
-
- if textids:
- return concat([TEICorpusView(self.abspath(fileid),
- True, True, True,
- headLen=self.headLen,
- textids=textids[fileid])
- for fileid in fileids])
- else:
- return concat([TEICorpusView(self.abspath(fileid),
- True, True, True,
- headLen=self.headLen)
- for fileid in fileids])
-
- def xml(self, fileids=None, categories=None):
- fileids, _ = self._resolve(fileids, categories)
- if len(fileids) == 1: return XMLCorpusReader.xml(self, fileids[0])
- else: raise TypeError('Expected a single file')
-
- def raw(self, fileids=None, categories=None):
- fileids, _ = self._resolve(fileids, categories)
- if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
- return concat([self.open(f).read() for f in fileids])
-
+ head_len = 2770
+
+ def __init__(self, *args, **kwargs):
+ if 'textid_file' in kwargs:
+ self._textids = kwargs['textid_file']
+ else:
+ self._textids = None
+
+ XMLCorpusReader.__init__(self, *args)
+ CategorizedCorpusReader.__init__(self, kwargs)
+
+ self._init_textids()
+
+ def _init_textids(self):
+ self._f2t = defaultdict(list)
+ self._t2f = defaultdict(list)
+ if self._textids is not None:
+ with open(self._textids) as fp:
+ for line in fp:
+ line = line.strip()
+ file_id, text_ids = line.split(' ', 1)
+ if file_id not in self.fileids():
+ raise ValueError(
+ 'In text_id mapping file %s: %s not found'
+ % (self._textids, file_id)
+ )
+ for text_id in text_ids.split(self._delimiter):
+ self._add_textids(file_id, text_id)
+
+ def _add_textids(self, file_id, text_id):
+ self._f2t[file_id].append(text_id)
+ self._t2f[text_id].append(file_id)
+
+ def _resolve(self, fileids, categories, textids=None):
+ tmp = None
+ if len(filter(lambda accessor: accessor is None,
+ (fileids, categories, textids))) != 1:
+
+ raise ValueError('Specify exactly one of: fileids, '
+ 'categories or textids')
+
+ if fileids is not None:
+ return fileids, None
+
+ if categories is not None:
+ return self.fileids(categories), None
+
+ if textids is not None:
+ if isinstance(textids, string_types):
+ textids = [textids]
+ files = sum((self._t2f[t] for t in textids), [])
+ tdict = dict()
+ for f in files:
+ tdict[f] = (set(self._f2t[f]) & set(textids))
+ return files, tdict
+
+ def decode_tag(self, tag):
+ # to be implemented
+ return tag
+
+ def textids(self, fileids=None, categories=None):
+ """
+ In the pl196x corpus each category is stored in a single
+ file and thus both methods provide identical functionality. In order
+ to accommodate finer granularity, a non-standard textids() method was
+ implemented. All the main functions can be supplied with a list
+ of required chunks---giving much more control to the user.
+ """
+ fileids, _ = self._resolve(fileids, categories)
+ if fileids is None: return sorted(self._t2f)
+
+ if isinstance(fileids, string_types):
+ fileids = [fileids]
+ return sorted(sum((self._f2t[d] for d in fileids), []))
+
+ def words(self, fileids=None, categories=None, textids=None):
+ fileids, textids = self._resolve(fileids, categories, textids)
+ if fileids is None:
+ fileids = self._fileids
+ elif isinstance(fileids, string_types):
+ fileids = [fileids]
+
+ if textids:
+ return concat([TEICorpusView(self.abspath(fileid),
+ False, False, False,
+ head_len=self.head_len,
+ textids=textids[fileid])
+ for fileid in fileids])
+ else:
+ return concat([TEICorpusView(self.abspath(fileid),
+ False, False, False,
+ head_len=self.head_len)
+ for fileid in fileids])
+
+ def sents(self, fileids=None, categories=None, textids=None):
+ fileids, textids = self._resolve(fileids, categories, textids)
+ if fileids is None:
+ fileids = self._fileids
+ elif isinstance(fileids, string_types):
+ fileids = [fileids]
+
+ if textids:
+ return concat([TEICorpusView(self.abspath(fileid),
+ False, True, False,
+ head_len=self.head_len,
+ textids=textids[fileid])
+ for fileid in fileids])
+ else:
+ return concat([TEICorpusView(self.abspath(fileid),
+ False, True, False,
+ head_len=self.head_len)
+ for fileid in fileids])
+
+ def paras(self, fileids=None, categories=None, textids=None):
+ fileids, textids = self._resolve(fileids, categories, textids)
+ if fileids is None:
+ fileids = self._fileids
+ elif isinstance(fileids, string_types):
+ fileids = [fileids]
+
+ if textids:
+ return concat([TEICorpusView(self.abspath(fileid),
+ False, True, True,
+ head_len=self.head_len,
+ textids=textids[fileid])
+ for fileid in fileids])
+ else:
+ return concat([TEICorpusView(self.abspath(fileid),
+ False, True, True,
+ head_len=self.head_len)
+ for fileid in fileids])
+
+ def tagged_words(self, fileids=None, categories=None, textids=None):
+ fileids, textids = self._resolve(fileids, categories, textids)
+ if fileids is None:
+ fileids = self._fileids
+ elif isinstance(fileids, string_types):
+ fileids = [fileids]
+
+ if textids:
+ return concat([TEICorpusView(self.abspath(fileid),
+ True, False, False,
+ head_len=self.head_len,
+ textids=textids[fileid])
+ for fileid in fileids])
+ else:
+ return concat([TEICorpusView(self.abspath(fileid),
+ True, False, False,
+ head_len=self.head_len)
+ for fileid in fileids])
+
+ def tagged_sents(self, fileids=None, categories=None, textids=None):
+ fileids, textids = self._resolve(fileids, categories, textids)
+ if fileids is None:
+ fileids = self._fileids
+ elif isinstance(fileids, string_types):
+ fileids = [fileids]
+
+ if textids:
+ return concat([TEICorpusView(self.abspath(fileid),
+ True, True, False,
+ head_len=self.head_len,
+ textids=textids[fileid])
+ for fileid in fileids])
+ else:
+ return concat([TEICorpusView(self.abspath(fileid),
+ True, True, False,
+ head_len=self.head_len)
+ for fileid in fileids])
+
+ def tagged_paras(self, fileids=None, categories=None, textids=None):
+ fileids, textids = self._resolve(fileids, categories, textids)
+ if fileids is None:
+ fileids = self._fileids
+ elif isinstance(fileids, string_types):
+ fileids = [fileids]
+
+ if textids:
+ return concat([TEICorpusView(self.abspath(fileid),
+ True, True, True,
+ head_len=self.head_len,
+ textids=textids[fileid])
+ for fileid in fileids])
+ else:
+ return concat([TEICorpusView(self.abspath(fileid),
+ True, True, True,
+ head_len=self.head_len)
+ for fileid in fileids])
+
+ def xml(self, fileids=None, categories=None):
+ fileids, _ = self._resolve(fileids, categories)
+ if len(fileids) == 1:
+ return XMLCorpusReader.xml(self, fileids[0])
+ else:
+ raise TypeError('Expected a single file')
+
+ def raw(self, fileids=None, categories=None):
+ fileids, _ = self._resolve(fileids, categories)
+ if fileids is None:
+ fileids = self._fileids
+ elif isinstance(fileids, string_types):
+ fileids = [fileids]
+ return concat([self.open(f).read() for f in fileids])
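The module-level regexes above drive all of the extraction in TEICorpusView.read_block. A self-contained toy run of the untagged path, not part of the patch; the TEI-like fragment is invented for illustration:

    import re

    PARA = re.compile(r'<p(?: [^>]*){0,1}>(.*?)</p>')
    SENT = re.compile(r'<s(?: [^>]*){0,1}>(.*?)</s>')
    WORD = re.compile(r'<[wc](?: [^>]*){0,1}>(.*?)</[wc]>')

    block = ('<p><s><w ana="subst">Ala</w><w ana="verb">ma</w>'
             '<w ana="subst">kota</w><c type="interp">.</c></s></p>')

    for para_str in PARA.findall(block):
        for sent_str in SENT.findall(para_str):
            print(WORD.findall(sent_str))       # ['Ala', 'ma', 'kota', '.']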
diff --git a/nltk/corpus/reader/plaintext.py b/nltk/corpus/reader/plaintext.py
index f834b7c..332b6aa 100644
--- a/nltk/corpus/reader/plaintext.py
+++ b/nltk/corpus/reader/plaintext.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Plaintext Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# Nitin Madnani <nmadnani at umiacs.umd.edu>
@@ -11,10 +11,10 @@
A reader for corpora that consist of plaintext documents.
"""
+from six import string_types
import codecs
import nltk.data
-from nltk.compat import string_types
from nltk.tokenize import *
from nltk.corpus.reader.util import *
@@ -75,7 +75,7 @@ class PlaintextCorpusReader(CorpusReader):
for f in fileids:
_fin = self.open(f)
raw_texts.append(_fin.read())
- _fin.close()
+ _fin.close()
return concat(raw_texts)
def words(self, fileids=None):
@@ -230,4 +230,3 @@ class EuroparlCorpusReader(PlaintextCorpusReader):
def paras(self, fileids=None):
raise NotImplementedError('The Europarl corpus reader does not support paragraphs. Please use chapters() instead.')
-
diff --git a/nltk/corpus/reader/ppattach.py b/nltk/corpus/reader/ppattach.py
index d4f4563..9c0ac65 100644
--- a/nltk/corpus/reader/ppattach.py
+++ b/nltk/corpus/reader/ppattach.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: PP Attachment Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
@@ -39,6 +39,8 @@ of the author.
"""
from __future__ import unicode_literals
+from six import string_types
+
from nltk import compat
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
@@ -76,7 +78,7 @@ class PPAttachmentCorpusReader(CorpusReader):
def raw(self, fileids=None):
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def _read_tuple_block(self, stream):
@@ -92,4 +94,3 @@ class PPAttachmentCorpusReader(CorpusReader):
return [PPAttachment(*line.split())]
else:
return []
-
diff --git a/nltk/corpus/reader/propbank.py b/nltk/corpus/reader/propbank.py
index d672e97..320c75a 100644
--- a/nltk/corpus/reader/propbank.py
+++ b/nltk/corpus/reader/propbank.py
@@ -1,18 +1,19 @@
# Natural Language Toolkit: PropBank Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import unicode_literals
import re
+from functools import total_ordering
from xml.etree import ElementTree
-from nltk import compat
+from six import string_types
+
from nltk.tree import Tree
from nltk.internals import raise_unorderable_types
-from nltk.compat import total_ordering
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
@@ -48,7 +49,7 @@ class PropbankCorpusReader(CorpusReader):
necessary to resolve the tree pointers used by propbank.
"""
# If framefiles is specified as a regexp, expand it.
- if isinstance(framefiles, compat.string_types):
+ if isinstance(framefiles, string_types):
framefiles = find_corpus_fileids(root, framefiles)
framefiles = list(framefiles)
# Initialze the corpus reader.
@@ -67,7 +68,7 @@ class PropbankCorpusReader(CorpusReader):
:return: the text contents of the given fileids, as a single string.
"""
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def instances(self, baseform=None):
@@ -472,10 +473,9 @@ class PropbankInflection(object):
@staticmethod
def parse(s):
- if not isinstance(s, compat.string_types):
+ if not isinstance(s, string_types):
raise TypeError('expected a string')
if (len(s) != 5 or
not PropbankInflection._VALIDATE.match(s)):
raise ValueError('Bad propbank inflection string %r' % s)
return PropbankInflection(*s)
-
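propbank (like nombank and wordnet in this patch) now imports total_ordering from functools instead of nltk.compat. A stand-alone sketch of what the decorator provides; the Pointer class is illustrative only, not NLTK code:

    from functools import total_ordering

    @total_ordering
    class Pointer(object):
        """Defining __eq__ and __lt__ is enough; the rest is synthesized."""
        def __init__(self, wordnum):
            self.wordnum = wordnum
        def __eq__(self, other):
            return self.wordnum == other.wordnum
        def __lt__(self, other):
            return self.wordnum < other.wordnum

    print(Pointer(2) >= Pointer(1))   # True, via the synthesized __ge__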
diff --git a/nltk/corpus/reader/pros_cons.py b/nltk/corpus/reader/pros_cons.py
index 6bb2eb6..61e904e 100644
--- a/nltk/corpus/reader/pros_cons.py
+++ b/nltk/corpus/reader/pros_cons.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Pros and Cons Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -27,6 +27,8 @@ Related papers:
"""
import re
+from six import string_types
+
from nltk.corpus.reader.api import *
from nltk.tokenize import *
@@ -75,7 +77,7 @@ class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
fileids = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, compat.string_types):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
for (path, enc, fileid) in self.abspaths(fileids, True, True)])
@@ -95,7 +97,7 @@ class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
fileids = self._resolve(fileids, categories)
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, compat.string_types):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.CorpusView(path, self._read_word_block, encoding=enc)
for (path, enc, fileid) in self.abspaths(fileids, True, True)])
diff --git a/nltk/corpus/reader/reviews.py b/nltk/corpus/reader/reviews.py
index 783ac5c..1ce3d25 100644
--- a/nltk/corpus/reader/reviews.py
+++ b/nltk/corpus/reader/reviews.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Product Reviews Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -61,6 +61,8 @@ Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
from __future__ import division
+from six import string_types
+
import re
from nltk.corpus.reader.api import *
diff --git a/nltk/corpus/reader/rte.py b/nltk/corpus/reader/rte.py
index 98f39f5..66c702d 100644
--- a/nltk/corpus/reader/rte.py
+++ b/nltk/corpus/reader/rte.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: RTE Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -33,6 +33,9 @@ file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
challenge number and 'n' is the pair ID.
"""
from __future__ import unicode_literals
+
+from six import string_types
+
from nltk import compat
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
@@ -137,10 +140,5 @@ class RTECorpusReader(XMLCorpusReader):
:type: list
:rtype: list(RTEPair)
"""
- if isinstance(fileids, compat.string_types): fileids = [fileids]
+ if isinstance(fileids, string_types): fileids = [fileids]
return concat([self._read_etree(self.xml(fileid)) for fileid in fileids])
-
-
-
-
-
diff --git a/nltk/corpus/reader/semcor.py b/nltk/corpus/reader/semcor.py
index 10811e6..826439f 100644
--- a/nltk/corpus/reader/semcor.py
+++ b/nltk/corpus/reader/semcor.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: SemCor Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Nathan Schneider <nschneid at cs.cmu.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/corpus/reader/senseval.py b/nltk/corpus/reader/senseval.py
index fa59e90..e8a0f3e 100644
--- a/nltk/corpus/reader/senseval.py
+++ b/nltk/corpus/reader/senseval.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Senseval 2 Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn at cs.mu.oz.au>
# Steven Bird <stevenbird1 at gmail.com> (modifications)
# URL: <http://nltk.org/>
@@ -23,6 +23,8 @@ is tagged with a sense identifier, and supplied with context.
"""
from __future__ import print_function, unicode_literals
+from six import string_types
+
import re
from xml.etree import ElementTree
@@ -56,7 +58,7 @@ class SensevalCorpusReader(CorpusReader):
:return: the text contents of the given fileids, as a single string.
"""
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def _entry(self, tree):
diff --git a/nltk/corpus/reader/sentiwordnet.py b/nltk/corpus/reader/sentiwordnet.py
index c501f8d..afb398b 100644
--- a/nltk/corpus/reader/sentiwordnet.py
+++ b/nltk/corpus/reader/sentiwordnet.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: SentiWordNet
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Christopher Potts <cgpotts at stanford.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/corpus/reader/sinica_treebank.py b/nltk/corpus/reader/sinica_treebank.py
index 06d609a..c63f7ad 100644
--- a/nltk/corpus/reader/sinica_treebank.py
+++ b/nltk/corpus/reader/sinica_treebank.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Sinica Treebank Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/corpus/reader/string_category.py b/nltk/corpus/reader/string_category.py
index b3d9087..2afd080 100644
--- a/nltk/corpus/reader/string_category.py
+++ b/nltk/corpus/reader/string_category.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: String Category Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
@@ -18,6 +18,7 @@ NUM:date When did Hawaii become a state ?
"""
# based on PPAttachmentCorpusReader
+from six import string_types
from nltk import compat
from nltk.corpus.reader.util import *
@@ -38,7 +39,7 @@ class StringCategoryCorpusReader(CorpusReader):
def tuples(self, fileids=None):
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([StreamBackedCorpusView(fileid, self._read_tuple_block,
encoding=enc)
for (fileid, enc) in self.abspaths(fileids, True)])
@@ -48,7 +49,7 @@ class StringCategoryCorpusReader(CorpusReader):
:return: the text contents of the given fileids, as a single string.
"""
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def _read_tuple_block(self, stream):
diff --git a/nltk/corpus/reader/switchboard.py b/nltk/corpus/reader/switchboard.py
index 1b5555f..f07e2f6 100644
--- a/nltk/corpus/reader/switchboard.py
+++ b/nltk/corpus/reader/switchboard.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Switchboard Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/corpus/reader/tagged.py b/nltk/corpus/reader/tagged.py
index 2a3bf75..d7f563d 100644
--- a/nltk/corpus/reader/tagged.py
+++ b/nltk/corpus/reader/tagged.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Tagged Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# Jacob Perkins <japerk at gmail.com>
@@ -13,7 +13,8 @@ A reader for corpora whose documents contain part-of-speech-tagged words.
import os
-from nltk import compat
+from six import string_types
+
from nltk.tag import str2tuple, map_tag
from nltk.tokenize import *
@@ -65,7 +66,7 @@ class TaggedCorpusReader(CorpusReader):
:rtype: str
"""
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
def words(self, fileids=None):
diff --git a/nltk/corpus/reader/timit.py b/nltk/corpus/reader/timit.py
index 9d922b9..b8346df 100644
--- a/nltk/corpus/reader/timit.py
+++ b/nltk/corpus/reader/timit.py
@@ -126,6 +126,8 @@ import re
import tempfile
import time
+from six import string_types
+
from nltk import compat
from nltk.tree import Tree
from nltk.internals import import_from_stdlib
@@ -162,7 +164,7 @@ class TimitCorpusReader(CorpusReader):
:param root: The root directory for this corpus.
"""
# Ensure that wave files don't get treated as unicode data:
- if isinstance(encoding, compat.string_types):
+ if isinstance(encoding, string_types):
encoding = [('.*\.wav', None), ('.*', encoding)]
CorpusReader.__init__(self, root,
@@ -205,11 +207,11 @@ class TimitCorpusReader(CorpusReader):
region, gender, sentence type, or sentence number, if
specified.
"""
- if isinstance(dialect, compat.string_types): dialect = [dialect]
- if isinstance(sex, compat.string_types): sex = [sex]
- if isinstance(spkrid, compat.string_types): spkrid = [spkrid]
- if isinstance(sent_type, compat.string_types): sent_type = [sent_type]
- if isinstance(sentid, compat.string_types): sentid = [sentid]
+ if isinstance(dialect, string_types): dialect = [dialect]
+ if isinstance(sex, string_types): sex = [sex]
+ if isinstance(spkrid, string_types): spkrid = [spkrid]
+ if isinstance(sent_type, string_types): sent_type = [sent_type]
+ if isinstance(sentid, string_types): sentid = [sentid]
utterances = self._utterances[:]
if dialect is not None:
@@ -307,7 +309,7 @@ class TimitCorpusReader(CorpusReader):
def phone_trees(self, utterances=None):
if utterances is None: utterances = self._utterances
- if isinstance(utterances, compat.string_types): utterances = [utterances]
+ if isinstance(utterances, string_types): utterances = [utterances]
trees = []
for utterance in utterances:
@@ -372,7 +374,7 @@ class TimitCorpusReader(CorpusReader):
def _utterance_fileids(self, utterances, extension):
if utterances is None: utterances = self._utterances
- if isinstance(utterances, compat.string_types): utterances = [utterances]
+ if isinstance(utterances, string_types): utterances = [utterances]
return ['%s%s' % (u, extension) for u in utterances]
def play(self, utterance, start=0, end=None):
diff --git a/nltk/corpus/reader/toolbox.py b/nltk/corpus/reader/toolbox.py
index bef15ca..169ed02 100644
--- a/nltk/corpus/reader/toolbox.py
+++ b/nltk/corpus/reader/toolbox.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Toolbox Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Greg Aumann <greg_aumann at sil.org>
# Stuart Robinson <Stuart.Robinson at mpi.nl>
# Steven Bird <stevenbird1 at gmail.com>
@@ -16,9 +16,9 @@ import os
import re
import codecs
-from nltk import compat
-from nltk.toolbox import ToolboxData
+from six import string_types
+from nltk.toolbox import ToolboxData
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
@@ -57,7 +57,7 @@ class ToolboxCorpusReader(CorpusReader):
def raw(self, fileids):
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
diff --git a/nltk/corpus/reader/twitter.py b/nltk/corpus/reader/twitter.py
index 813f022..5b48dcf 100644
--- a/nltk/corpus/reader/twitter.py
+++ b/nltk/corpus/reader/twitter.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Twitter Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -13,7 +13,8 @@ have been serialised into line-delimited JSON.
import json
import os
-from nltk import compat
+from six import string_types
+
from nltk.tokenize import TweetTokenizer
from nltk.corpus.reader.util import StreamBackedCorpusView, concat, ZipFilePathPointer
@@ -135,7 +136,7 @@ class TwitterCorpusReader(CorpusReader):
"""
if fileids is None:
fileids = self._fileids
- elif isinstance(fileids, compat.string_types):
+ elif isinstance(fileids, string_types):
fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
@@ -152,6 +153,3 @@ class TwitterCorpusReader(CorpusReader):
tweet = json.loads(line)
tweets.append(tweet)
return tweets
-
-
-
diff --git a/nltk/corpus/reader/util.py b/nltk/corpus/reader/util.py
index 4357645..cf44eb9 100644
--- a/nltk/corpus/reader/util.py
+++ b/nltk/corpus/reader/util.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Corpus Reader Utilities
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
@@ -10,6 +10,7 @@ import os
import bisect
import re
import tempfile
+from six import string_types, text_type
from functools import reduce
try:
import cPickle as pickle
@@ -20,7 +21,6 @@ except ImportError:
try: from xml.etree import cElementTree as ElementTree
except ImportError: from xml.etree import ElementTree
-from nltk.compat import string_types, text_type
from nltk.tokenize import wordpunct_tokenize
from nltk.internals import slice_bounds
from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
@@ -281,6 +281,11 @@ class StreamBackedCorpusView(AbstractLazySequence):
if self._stream is None:
self._open()
+ # If the file is empty, the while loop will never run.
+ # This *seems* to be all the state we need to set:
+ if self._eofpos == 0:
+ self._len = 0
+
# Each iteration through this loop, we read a single block
# from the stream.
while filepos < self._eofpos:
@@ -797,4 +802,3 @@ def tagged_treebank_para_block_reader(stream):
# Content line:
else:
para += line
-
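The guard added to StreamBackedCorpusView above covers views over zero-byte files. A hedged sketch of that case, not part of the patch, using read_line_block from this same module:

    import os
    import tempfile

    from nltk.corpus.reader.util import StreamBackedCorpusView, read_line_block

    with tempfile.NamedTemporaryFile(suffix='.txt', delete=False) as f:
        empty_path = f.name                  # zero bytes written on purpose

    view = StreamBackedCorpusView(empty_path, read_line_block)
    print(len(view))                         # 0 once _len is set at EOF position 0
    print(list(view))                        # []

    os.remove(empty_path)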
diff --git a/nltk/corpus/reader/verbnet.py b/nltk/corpus/reader/verbnet.py
index 370fd2a..6a34113 100644
--- a/nltk/corpus/reader/verbnet.py
+++ b/nltk/corpus/reader/verbnet.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Verbnet Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -17,17 +17,18 @@ import re
import textwrap
from collections import defaultdict
-from nltk import compat
+from six import string_types
+
from nltk.corpus.reader.xmldocs import XMLCorpusReader
class VerbnetCorpusReader(XMLCorpusReader):
"""
An NLTK interface to the VerbNet verb lexicon.
-
- From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest
- on-line verb lexicon currently available for English. It is a hierarchical
- domain-independent, broad-coverage verb lexicon with mappings to other
- lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), Xtag
+
+ From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest
+ on-line verb lexicon currently available for English. It is a hierarchical
+ domain-independent, broad-coverage verb lexicon with mappings to other
+ lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), Xtag
(XTAG Research Group, 2001), and FrameNet (Baker et al., 1998)."
For details about VerbNet see:
@@ -165,7 +166,7 @@ class VerbnetCorpusReader(XMLCorpusReader):
"""
if vnclass_ids is None:
return self._fileids
- elif isinstance(vnclass_ids, compat.string_types):
+ elif isinstance(vnclass_ids, string_types):
return [self._class_to_fileid[self.longid(vnclass_ids)]]
else:
return [self._class_to_fileid[self.longid(vnclass_id)]
@@ -269,7 +270,7 @@ class VerbnetCorpusReader(XMLCorpusReader):
:param vnclass: A verbnet class identifier; or an ElementTree
containing the xml contents of a verbnet class.
"""
- if isinstance(vnclass, compat.string_types):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
s = vnclass.get('ID') + '\n'
@@ -290,7 +291,7 @@ class VerbnetCorpusReader(XMLCorpusReader):
:param vnclass: A verbnet class identifier; or an ElementTree
containing the xml contents of a verbnet class.
"""
- if isinstance(vnclass, compat.string_types):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
subclasses = [subclass.get('ID') for subclass in
@@ -308,7 +309,7 @@ class VerbnetCorpusReader(XMLCorpusReader):
:param vnclass: A verbnet class identifier; or an ElementTree
containing the xml contents of a verbnet class.
"""
- if isinstance(vnclass, compat.string_types):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
members = [member.get('name') for member in
@@ -326,7 +327,7 @@ class VerbnetCorpusReader(XMLCorpusReader):
:param vnclass: A verbnet class identifier; or an ElementTree
containing the xml contents of a verbnet class.
"""
- if isinstance(vnclass, compat.string_types):
+ if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
pieces = []
@@ -404,5 +405,3 @@ class VerbnetCorpusReader(XMLCorpusReader):
args = [arg.get('value') for arg in pred.findall('ARGS/ARG')]
pieces.append('%s(%s)' % (pred.get('value'), ', '.join(args)))
return '\n'.join('%s* %s' % (indent, piece) for piece in pieces)
-
-
diff --git a/nltk/corpus/reader/wordlist.py b/nltk/corpus/reader/wordlist.py
index a8b2fcf..85f529e 100644
--- a/nltk/corpus/reader/wordlist.py
+++ b/nltk/corpus/reader/wordlist.py
@@ -1,27 +1,30 @@
+# -*- coding: utf-8 -*-
# Natural Language Toolkit: Word List Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from six import string_types
-from nltk import compat
from nltk.tokenize import line_tokenize
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
+
class WordListCorpusReader(CorpusReader):
"""
List of words, one per line. Blank lines are ignored.
"""
- def words(self, fileids=None):
- return line_tokenize(self.raw(fileids))
+ def words(self, fileids=None, ignore_lines_startswith='\n'):
+ return [line for line in line_tokenize(self.raw(fileids))
+ if not line.startswith(ignore_lines_startswith)]
def raw(self, fileids=None):
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
@@ -35,3 +38,95 @@ class SwadeshCorpusReader(WordListCorpusReader):
wordlists = [self.words(f) for f in fileids]
return list(zip(*wordlists))
+
+
+class NonbreakingPrefixesCorpusReader(WordListCorpusReader):
+ """
+ This is a class to read the nonbreaking prefixes textfiles from the
+ Moses Machine Translation toolkit. These lists are used in the Python port
+ of the Moses word tokenizer.
+ """
+ available_langs = {'catalan': 'ca', 'czech': 'cs', 'german': 'de',
+ 'greek': 'el', 'english': 'en', 'spanish': 'es',
+ 'finnish': 'fi', 'french': 'fr', 'hungarian': 'hu',
+ 'icelandic': 'is', 'italian': 'it', 'latvian': 'lv',
+ 'dutch': 'nl', 'polish': 'pl', 'portuguese': 'pt',
+ 'romanian': 'ro', 'russian': 'ru', 'slovak': 'sk',
+ 'slovenian': 'sl', 'swedish': 'sv', 'tamil': 'ta'}
+ # Also, add the lang IDs as the keys.
+ available_langs.update({v:v for v in available_langs.values()})
+
+ def words(self, lang=None, fileids=None, ignore_lines_startswith='#'):
+ """
+ This method returns a list of nonbreaking prefixes for the specified
+ language(s).
+
+ >>> from nltk.corpus import nonbreaking_prefixes as nbp
+ >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']
+ True
+ >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89']
+ True
+
+ :return: a list of words for the specified language(s).
+ """
+ # If *lang* in list of languages available, allocate apt fileid.
+ # Otherwise, the function returns non-breaking prefixes for
+ # all languages when fileids==None.
+ if lang in self.available_langs:
+ lang = self.available_langs[lang]
+ fileids = ['nonbreaking_prefix.'+lang]
+ return [line for line in line_tokenize(self.raw(fileids))
+ if not line.startswith(ignore_lines_startswith)]
+
+class UnicharsCorpusReader(WordListCorpusReader):
+ """
+ This class is used to read lists of characters from the Perl Unicode
+ Properties (see http://perldoc.perl.org/perluniprops.html).
+ The files in the perluniprop.zip are extracted using the Unicode::Tussle
+ module from http://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
+ """
+ # These are categories similar to the Perl Unicode Properties
+ available_categories = ['Close_Punctuation', 'Currency_Symbol',
+ 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc',
+ 'IsSo', 'Open_Punctuation']
+
+ def chars(self, category=None, fileids=None):
+ """
+ This method returns a list of characters from the Perl Unicode Properties.
+ They are very useful when porting Perl tokenizers to Python.
+
+ >>> from nltk.corpus import perluniprops as pup
+ >>> pup.chars('Open_Punctuation')[:5] == [u'(', u'[', u'{', u'\u0f3a', u'\u0f3c']
+ True
+ >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5']
+ True
+ >>> pup.available_categories
+ ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'Open_Punctuation']
+
+ :return: a list of characters given the specific unicode character category
+ """
+ if category in self.available_categories:
+ fileids = [category+'.txt']
+ return list(self.raw(fileids).strip())
+
+
+class MWAPPDBCorpusReader(WordListCorpusReader):
+ """
+ This class is used to read the list of word pairs from the subset of lexical
+ pairs of The Paraphrase Database (PPDB) XXXL used in the Monolingual Word
+ Alignment (MWA) algorithm described in Sultan et al. (2014a, 2014b, 2015):
+ - http://acl2014.org/acl2014/Q14/pdf/Q14-1017
+ - http://www.aclweb.org/anthology/S14-2039
+ - http://www.aclweb.org/anthology/S15-2027
+
+ The original source of the full PPDB corpus can be found on
+ http://www.cis.upenn.edu/~ccb/ppdb/
+
+ :return: a list of tuples of similar lexical terms.
+ """
+ mwa_ppdb_xxxl_file = 'ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs'
+ def entries(self, fileids=mwa_ppdb_xxxl_file):
+ """
+ :return: a tuple of synonym word pairs.
+ """
+ return [tuple(line.split('\t')) for line in line_tokenize(self.raw(fileids))]
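
The readers above are reached through nltk.corpus once the corresponding data
packages ('nonbreaking_prefixes', 'perluniprops') are installed. A minimal
usage sketch, mirroring the doctests above (the slices and expected values are
taken directly from those doctests; MWAPPDBCorpusReader.entries() likewise
yields tuples of paraphrase pairs):

    >>> from nltk.corpus import nonbreaking_prefixes as nbp, perluniprops as pup
    >>> nbp.words('en')[:4] == [u'A', u'B', u'C', u'D']
    True
    >>> pup.chars('Currency_Symbol')[:3] == [u'$', u'\xa2', u'\xa3']
    True
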
diff --git a/nltk/corpus/reader/wordnet.py b/nltk/corpus/reader/wordnet.py
index 845e447..3069a83 100644
--- a/nltk/corpus/reader/wordnet.py
+++ b/nltk/corpus/reader/wordnet.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: WordNet
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bethard <Steven.Bethard at colorado.edu>
# Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
@@ -23,7 +23,7 @@ such as hypernyms, hyponyms, synonyms, antonyms etc.
For details about WordNet see:
http://wordnet.princeton.edu/
-This module also allows you to find lemmas in languages
+This module also allows you to find lemmas in languages
other than English from the Open Multilingual Wordnet
http://compling.hss.ntu.edu.sg/omw/
@@ -34,42 +34,46 @@ from __future__ import print_function, unicode_literals
import math
import re
from itertools import islice, chain
-from operator import itemgetter, attrgetter
+from functools import total_ordering
+from operator import itemgetter
from collections import defaultdict, deque
+from six import iteritems
+from six.moves import range
+
from nltk.corpus.reader import CorpusReader
from nltk.util import binary_search_file as _binary_search_file
from nltk.probability import FreqDist
-from nltk.compat import (iteritems, python_2_unicode_compatible,
- total_ordering, xrange)
+from nltk.compat import python_2_unicode_compatible
+from nltk.internals import deprecated
######################################################################
-## Table of Contents
+# Table of Contents
######################################################################
-## - Constants
-## - Data Classes
-## - WordNetError
-## - Lemma
-## - Synset
-## - WordNet Corpus Reader
-## - WordNet Information Content Corpus Reader
-## - Similarity Metrics
-## - Demo
+# - Constants
+# - Data Classes
+# - WordNetError
+# - Lemma
+# - Synset
+# - WordNet Corpus Reader
+# - WordNet Information Content Corpus Reader
+# - Similarity Metrics
+# - Demo
######################################################################
-## Constants
+# Constants
######################################################################
#: Positive infinity (for similarity functions)
_INF = 1e300
-#{ Part-of-speech constants
+# { Part-of-speech constants
ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
-#}
+# }
POS_LIST = [NOUN, VERB, ADJ, ADV]
-#: A table of strings that are used to express verb frames.
+# A table of strings that are used to express verb frames.
VERB_FRAME_STRINGS = (
None,
"Something %s",
@@ -110,10 +114,12 @@ VERB_FRAME_STRINGS = (
SENSENUM_RE = re.compile(r'\.\d\d\.')
+
######################################################################
-## Data Classes
+# Data Classes
######################################################################
+
class WordNetError(Exception):
"""An exception class for wordnet-related errors."""
@@ -263,7 +269,7 @@ class Lemma(_WordNetObject):
self._lex_id = lex_id
self._lang = 'eng'
- self._key = None # gets set later.
+ self._key = None # gets set later.
def name(self):
return self._name
@@ -291,10 +297,12 @@ class Lemma(_WordNetObject):
return "%s('%s.%s')" % tup
def _related(self, relation_symbol):
- get_synset = self._wordnet_corpus_reader._synset_from_pos_and_offset
- return sorted([get_synset(pos, offset)._lemmas[lemma_index]
- for pos, offset, lemma_index
- in self._synset._lemma_pointers[self._name, relation_symbol]])
+ get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset
+ return sorted([
+ get_synset(pos, offset)._lemmas[lemma_index]
+ for pos, offset, lemma_index
+ in self._synset._lemma_pointers[self._name, relation_symbol]
+ ])
def count(self):
"""Return the frequency count for this Lemma"""
@@ -383,7 +391,7 @@ class Synset(_WordNetObject):
self._lemma_names = []
self._definition = None
self._examples = []
- self._lexname = None # lexicographer name
+ self._lexname = None # lexicographer name
self._all_hypernyms = None
self._pointers = defaultdict(set)
@@ -421,7 +429,7 @@ class Synset(_WordNetObject):
def lemma_names(self, lang='eng'):
'''Return all the lemma_names associated with the synset'''
- if lang=='eng':
+ if lang == 'eng':
return self._lemma_names
else:
self._wordnet_corpus_reader._load_lang_data(lang)
@@ -431,21 +439,30 @@ class Synset(_WordNetObject):
return self._wordnet_corpus_reader._lang_data[lang][0][i]
else:
return []
-
+
def lemmas(self, lang='eng'):
'''Return all the lemma objects associated with the synset'''
- if lang=='eng':
+ if lang == 'eng':
return self._lemmas
else:
self._wordnet_corpus_reader._load_lang_data(lang)
lemmark = []
lemmy = self.lemma_names(lang)
for lem in lemmy:
- temp= Lemma(self._wordnet_corpus_reader, self, lem, self._wordnet_corpus_reader._lexnames.index(self.lexname()), 0, None)
- temp._lang=lang
+ temp = Lemma(
+ self._wordnet_corpus_reader,
+ self,
+ lem,
+ self._wordnet_corpus_reader._lexnames.index(
+ self.lexname()
+ ),
+ 0,
+ None
+ )
+ temp._lang = lang
lemmark.append(temp)
return lemmark
-
+
def root_hypernyms(self):
"""Get the topmost hypernyms of this synset in WordNet."""
@@ -555,32 +572,40 @@ class Synset(_WordNetObject):
:return: The synsets that are hypernyms of both synsets.
"""
if not self._all_hypernyms:
- self._all_hypernyms = set(self_synset
- for self_synsets in self._iter_hypernym_lists()
- for self_synset in self_synsets)
+ self._all_hypernyms = set(
+ self_synset
+ for self_synsets in self._iter_hypernym_lists()
+ for self_synset in self_synsets
+ )
if not other._all_hypernyms:
- other._all_hypernyms = set(other_synset
- for other_synsets in other._iter_hypernym_lists()
- for other_synset in other_synsets)
+ other._all_hypernyms = set(
+ other_synset
+ for other_synsets in other._iter_hypernym_lists()
+ for other_synset in other_synsets
+ )
return list(self._all_hypernyms.intersection(other._all_hypernyms))
- def lowest_common_hypernyms(self, other, simulate_root=False, use_min_depth=False):
+ def lowest_common_hypernyms(
+ self, other, simulate_root=False, use_min_depth=False
+ ):
"""
Get a list of lowest synset(s) that both synsets have as a hypernym.
- When `use_min_depth == False` this means that the synset which appears as a
- hypernym of both `self` and `other` with the lowest maximum depth is returned
- or if there are multiple such synsets at the same depth they are all returned
+ When `use_min_depth == False` this means that the synset which appears
+ as a hypernym of both `self` and `other` with the lowest maximum depth
+ is returned or if there are multiple such synsets at the same depth
+ they are all returned
- However, if `use_min_depth == True` then the synset(s) which has/have the lowest
- minimum depth and appear(s) in both paths is/are returned.
+ However, if `use_min_depth == True` then the synset(s) which has/have
+ the lowest minimum depth and appear(s) in both paths is/are returned.
- By setting the use_min_depth flag to True, the behavior of NLTK2 can be preserved.
- This was changed in NLTK3 to give more accurate results in a small set of cases,
- generally with synsets concerning people. (eg: 'chef.n.01', 'fireman.n.01', etc.)
+ By setting the use_min_depth flag to True, the behavior of NLTK2 can be
+ preserved. This was changed in NLTK3 to give more accurate results in a
+ small set of cases, generally with synsets concerning people. (eg:
+ 'chef.n.01', 'fireman.n.01', etc.)
- This method is an implementation of Ted Pedersen's "Lowest Common Subsumer" method
- from the Perl Wordnet module. It can return either "self" or "other" if they are a
- hypernym of the other.
+ This method is an implementation of Ted Pedersen's "Lowest Common
+ Subsumer" method from the Perl Wordnet module. It can return either
+ "self" or "other" if they are a hypernym of the other.
:type other: Synset
:param other: other input synset
@@ -594,11 +619,13 @@ class Synset(_WordNetObject):
If you are using wordnet 1.6, a fake root will need to be added
for nouns as well.
:type use_min_depth: bool
- :param use_min_depth: This setting mimics older (v2) behavior of NLTK wordnet
- If True, will use the min_depth function to calculate the lowest common
- hypernyms. This is known to give strange results for some synset pairs
- (eg: 'chef.n.01', 'fireman.n.01') but is retained for backwards compatibility
- :return: The synsets that are the lowest common hypernyms of both synsets
+ :param use_min_depth: This setting mimics older (v2) behavior of NLTK
+ wordnet If True, will use the min_depth function to calculate the
+ lowest common hypernyms. This is known to give strange results for
+ some synset pairs (eg: 'chef.n.01', 'fireman.n.01') but is retained
+ for backwards compatibility
+ :return: The synsets that are the lowest common hypernyms of both
+ synsets
"""
synsets = self.common_hypernyms(other)
if simulate_root:
@@ -611,10 +638,14 @@ class Synset(_WordNetObject):
try:
if use_min_depth:
max_depth = max(s.min_depth() for s in synsets)
- unsorted_lch = [s for s in synsets if s.min_depth() == max_depth]
+ unsorted_lch = [
+ s for s in synsets if s.min_depth() == max_depth
+ ]
else:
max_depth = max(s.max_depth() for s in synsets)
- unsorted_lch = [s for s in synsets if s.max_depth() == max_depth]
+ unsorted_lch = [
+ s for s in synsets if s.max_depth() == max_depth
+ ]
return sorted(unsorted_lch)
except ValueError:
return []
@@ -633,7 +664,10 @@ class Synset(_WordNetObject):
"""
distances = set([(self, distance)])
for hypernym in self._hypernyms() + self._instance_hypernyms():
- distances |= hypernym.hypernym_distances(distance+1, simulate_root=False)
+ distances |= hypernym.hypernym_distances(
+ distance+1,
+ simulate_root=False
+ )
if simulate_root:
fake_synset = Synset(None)
fake_synset._name = '*ROOT*'
@@ -762,7 +796,10 @@ class Synset(_WordNetObject):
itself.
"""
- distance = self.shortest_path_distance(other, simulate_root=simulate_root and self._needs_root())
+ distance = self.shortest_path_distance(
+ other,
+ simulate_root=simulate_root and self._needs_root()
+ )
if distance is None or distance < 0:
return None
return 1.0 / (distance + 1)
@@ -795,18 +832,25 @@ class Synset(_WordNetObject):
"""
if self._pos != other._pos:
- raise WordNetError('Computing the lch similarity requires ' + \
- '%s and %s to have the same part of speech.' % \
- (self, other))
+ raise WordNetError(
+ 'Computing the lch similarity requires '
+ '%s and %s to have the same part of speech.' %
+ (self, other)
+ )
need_root = self._needs_root()
if self._pos not in self._wordnet_corpus_reader._max_depth:
- self._wordnet_corpus_reader._compute_max_depth(self._pos, need_root)
+ self._wordnet_corpus_reader._compute_max_depth(
+ self._pos, need_root
+ )
depth = self._wordnet_corpus_reader._max_depth[self._pos]
- distance = self.shortest_path_distance(other, simulate_root=simulate_root and need_root)
+ distance = self.shortest_path_distance(
+ other,
+ simulate_root=simulate_root and need_root
+ )
if distance is None or distance < 0 or depth == 0:
return None
@@ -842,9 +886,9 @@ class Synset(_WordNetObject):
there is usually a default root except for WordNet version 1.6.
If you are using wordnet 1.6, a fake root will be added for nouns
as well.
- :return: A float score denoting the similarity of the two ``Synset`` objects,
- normally greater than zero. If no connecting path between the two
- senses can be found, None is returned.
+ :return: A float score denoting the similarity of the two ``Synset``
+ objects, normally greater than zero. If no connecting path between
+ the two senses can be found, None is returned.
"""
@@ -852,7 +896,10 @@ class Synset(_WordNetObject):
# Note that to preserve behavior from NLTK2 we set use_min_depth=True
# It is possible that more accurate results could be obtained by
# removing this setting and it should be tested later on
- subsumers = self.lowest_common_hypernyms(other, simulate_root=simulate_root and need_root, use_min_depth=True)
+ subsumers = self.lowest_common_hypernyms(
+ other,
+ simulate_root=simulate_root and need_root, use_min_depth=True
+ )
# If no LCS was found return None
if len(subsumers) == 0:
@@ -867,16 +914,22 @@ class Synset(_WordNetObject):
depth = subsumer.max_depth() + 1
# Note: No need for an additional add-one correction for non-nouns
- # to account for an imaginary root node because that is now automatically
- # handled by simulate_root
+ # to account for an imaginary root node because that is now
+ # automatically handled by simulate_root
# if subsumer._pos != NOUN:
# depth += 1
# Get the shortest path from the LCS to each of the synsets it is
# subsuming. Add this to the LCS path length to get the path
# length from each synset to the root.
- len1 = self.shortest_path_distance(subsumer, simulate_root=simulate_root and need_root)
- len2 = other.shortest_path_distance(subsumer, simulate_root=simulate_root and need_root)
+ len1 = self.shortest_path_distance(
+ subsumer,
+ simulate_root=simulate_root and need_root
+ )
+ len2 = other.shortest_path_distance(
+ subsumer,
+ simulate_root=simulate_root and need_root
+ )
if len1 is None or len2 is None:
return None
len1 += depth
@@ -893,10 +946,11 @@ class Synset(_WordNetObject):
:type other: Synset
:param other: The ``Synset`` that this ``Synset`` is being compared to.
:type ic: dict
- :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
- :return: A float score denoting the similarity of the two ``Synset`` objects.
- Synsets whose LCS is the root node of the taxonomy will have a
- score of 0 (e.g. N['dog'][0] and N['table'][0]).
+ :param ic: an information content object (as returned by
+ ``nltk.corpus.wordnet_ic.ic()``).
+ :return: A float score denoting the similarity of the two ``Synset``
+ objects. Synsets whose LCS is the root node of the taxonomy will
+ have a score of 0 (e.g. N['dog'][0] and N['table'][0]).
"""
ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
@@ -913,8 +967,10 @@ class Synset(_WordNetObject):
:type other: Synset
:param other: The ``Synset`` that this ``Synset`` is being compared to.
:type ic: dict
- :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
- :return: A float score denoting the similarity of the two ``Synset`` objects.
+ :param ic: an information content object (as returned by
+ ``nltk.corpus.wordnet_ic.ic()``).
+ :return: A float score denoting the similarity of the two ``Synset``
+ objects.
"""
if self == other:
@@ -945,9 +1001,10 @@ class Synset(_WordNetObject):
:type other: Synset
:param other: The ``Synset`` that this ``Synset`` is being compared to.
:type ic: dict
- :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
- :return: A float score denoting the similarity of the two ``Synset`` objects,
- in the range 0 to 1.
+ :param ic: an information content object (as returned by
+ ``nltk.corpus.wordnet_ic.ic()``).
+ :return: A float score denoting the similarity of the two ``Synset``
+ objects, in the range 0 to 1.
"""
ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
@@ -966,15 +1023,16 @@ class Synset(_WordNetObject):
yield todo
todo = [hypernym
for synset in todo
- for hypernym in (synset.hypernyms() +
- synset.instance_hypernyms())
+ for hypernym in (
+ synset.hypernyms() + synset.instance_hypernyms()
+ )
if hypernym not in seen]
def __repr__(self):
return "%s('%s')" % (type(self).__name__, self._name)
def _related(self, relation_symbol, sort=True):
- get_synset = self._wordnet_corpus_reader._synset_from_pos_and_offset
+ get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset
pointer_tuples = self._pointers[relation_symbol]
r = [get_synset(pos, offset) for pos, offset in pointer_tuples]
if sort:
@@ -983,7 +1041,7 @@ class Synset(_WordNetObject):
######################################################################
-## WordNet Corpus Reader
+# WordNet Corpus Reader
######################################################################
class WordNetCorpusReader(CorpusReader):
@@ -993,18 +1051,18 @@ class WordNetCorpusReader(CorpusReader):
_ENCODING = 'utf8'
- #{ Part-of-speech constants
+ # { Part-of-speech constants
ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
- #}
+ # }
- #{ Filename constants
+ # { Filename constants
_FILEMAP = {ADJ: 'adj', ADV: 'adv', NOUN: 'noun', VERB: 'verb'}
- #}
+ # }
- #{ Part of speech constants
+ # { Part of speech constants
_pos_numbers = {NOUN: 1, VERB: 2, ADJ: 3, ADV: 4, ADJ_SAT: 5}
_pos_names = dict(tup[::-1] for tup in _pos_numbers.items())
- #}
+ # }
#: A list of file identifiers for all the fileids used by this
#: corpus reader.
@@ -1062,48 +1120,38 @@ class WordNetCorpusReader(CorpusReader):
def of2ss(self, of):
''' take an id and return the synsets '''
- return self._synset_from_pos_and_offset(of[-1], int(of[:8]))
+ return self.synset_from_pos_and_offset(of[-1], int(of[:8]))
def ss2of(self, ss):
''' return the ID of the synset '''
return ("{:08d}-{}".format(ss.offset(), ss.pos()))
-
- def _load_lang_data(self, lang):
- ''' load the wordnet data of the requested language from the file to the cache, _lang_data '''
- if lang not in self.langs():
- raise WordNetError("Language is not supported.")
+ def _load_lang_data(self, lang):
+ ''' load the wordnet data of the requested language from the file to
+ the cache, _lang_data '''
if lang in self._lang_data.keys():
return
- f = self._omw_reader.open('{0:}/wn-data-{0:}.tab'.format(lang))
+ if lang not in self.langs():
+ raise WordNetError("Language is not supported.")
- self._lang_data[lang].append(defaultdict(list))
- self._lang_data[lang].append(defaultdict(list))
-
- for l in f.readlines():
- l = l.replace('\n', '')
- l = l.replace(' ', '_')
- if l[0] != '#':
- word = l.split('\t')
- self._lang_data[lang][0][word[0]].append(word[2])
- self._lang_data[lang][1][word[2]].append(word[0])
+ f = self._omw_reader.open('{0:}/wn-data-{0:}.tab'.format(lang))
+ self.custom_lemmas(f, lang)
f.close()
def langs(self):
''' return a list of languages supported by Multilingual Wordnet '''
import os
- langs = [ 'eng' ]
+ langs = ['eng']
fileids = self._omw_reader.fileids()
for fileid in fileids:
file_name, file_extension = os.path.splitext(fileid)
if file_extension == '.tab':
langs.append(file_name.split('-')[-1])
-
+
return langs
-
def _load_lemma_pos_offset_map(self):
for suffix in self._FILEMAP.values():
@@ -1113,7 +1161,9 @@ class WordNetCorpusReader(CorpusReader):
continue
_iter = iter(line.split())
- _next_token = lambda: next(_iter)
+
+ def _next_token(): return next(_iter)
+
try:
# get the lemma and part-of-speech
@@ -1124,19 +1174,23 @@ class WordNetCorpusReader(CorpusReader):
n_synsets = int(_next_token())
assert n_synsets > 0
- # get the pointer symbols for all synsets of this lemma
+ # get and ignore the pointer symbols for all synsets of
+ # this lemma
n_pointers = int(_next_token())
- _ = [_next_token() for _ in xrange(n_pointers)]
+ [_next_token() for _ in range(n_pointers)]
# same as number of synsets
n_senses = int(_next_token())
assert n_synsets == n_senses
- # get number of senses ranked according to frequency
- _ = int(_next_token())
+ # get and ignore number of senses ranked according to
+ # frequency
+ _next_token()
# get synset offsets
- synset_offsets = [int(_next_token()) for _ in xrange(n_synsets)]
+ synset_offsets = [
+ int(_next_token()) for _ in range(n_synsets)
+ ]
# raise more informative error with file name and line number
except (AssertionError, ValueError) as e:
@@ -1181,13 +1235,14 @@ class WordNetCorpusReader(CorpusReader):
fh.seek(0)
return version
- #////////////////////////////////////////////////////////////
+ #############################################################
# Loading Lemmas
- #////////////////////////////////////////////////////////////
+ #############################################################
def lemma(self, name, lang='eng'):
'''Return lemma object that matches the name'''
- # cannot simply split on first '.', e.g.: '.45_caliber.a.01..45_caliber'
+ # cannot simply split on first '.',
+ # e.g.: '.45_caliber.a.01..45_caliber'
separator = SENSENUM_RE.search(name).start()
synset_name, lemma_name = name[:separator+3], name[separator+4:]
synset = self.synset(synset_name)
@@ -1213,7 +1268,7 @@ class WordNetCorpusReader(CorpusReader):
if not synset_line:
raise WordNetError("No synset found for key %r" % key)
offset = int(synset_line.split()[1])
- synset = self._synset_from_pos_and_offset(pos, offset)
+ synset = self.synset_from_pos_and_offset(pos, offset)
# return the corresponding lemma
for lemma in synset._lemmas:
@@ -1221,9 +1276,9 @@ class WordNetCorpusReader(CorpusReader):
return lemma
raise WordNetError("No lemma found for for key %r" % key)
- #////////////////////////////////////////////////////////////
+ #############################################################
# Loading Synsets
- #////////////////////////////////////////////////////////////
+ #############################################################
def synset(self, name):
# split name into lemma, part of speech and synset number
lemma, pos, synset_index_str = name.lower().rsplit('.', 2)
@@ -1245,7 +1300,7 @@ class WordNetCorpusReader(CorpusReader):
raise WordNetError(message % tup)
# load synset information from the appropriate file
- synset = self._synset_from_pos_and_offset(pos, offset)
+ synset = self.synset_from_pos_and_offset(pos, offset)
# some basic sanity checks on loaded attributes
if pos == 's' and synset._pos == 'a':
@@ -1269,7 +1324,7 @@ class WordNetCorpusReader(CorpusReader):
self._data_file_map[pos] = self.open(fileid)
return self._data_file_map[pos]
- def _synset_from_pos_and_offset(self, pos, offset):
+ def synset_from_pos_and_offset(self, pos, offset):
# Check to see if the synset is in the cache
if offset in self._synset_offset_cache[pos]:
return self._synset_offset_cache[pos][offset]
@@ -1282,6 +1337,15 @@ class WordNetCorpusReader(CorpusReader):
self._synset_offset_cache[pos][offset] = synset
return synset
+ @deprecated('Use public method synset_from_pos_and_offset() instead')
+ def _synset_from_pos_and_offset(self, *args, **kwargs):
+ """
+ Hack to help people like the readers of
+ http://stackoverflow.com/a/27145655/1709587
+ who were using this function before it was officially a public method
+ """
+ return self.synset_from_pos_and_offset(*args, **kwargs)
+
def _synset_from_pos_and_line(self, pos, data_file_line):
# Construct a new (empty) synset.
synset = Synset(self)
@@ -1303,7 +1367,8 @@ class WordNetCorpusReader(CorpusReader):
# split the other info into fields
_iter = iter(columns_str.split())
- _next_token = lambda: next(_iter)
+
+ def _next_token(): return next(_iter)
# get the offset
synset._offset = int(_next_token())
@@ -1317,7 +1382,7 @@ class WordNetCorpusReader(CorpusReader):
# create Lemma objects for each lemma
n_lemmas = int(_next_token(), 16)
- for _ in xrange(n_lemmas):
+ for _ in range(n_lemmas):
# get the lemma name
lemma_name = _next_token()
# get the lex_id (used for sense_keys)
@@ -1333,7 +1398,7 @@ class WordNetCorpusReader(CorpusReader):
# collect the pointer tuples
n_pointers = int(_next_token())
- for _ in xrange(n_pointers):
+ for _ in range(n_pointers):
symbol = _next_token()
offset = int(_next_token())
pos = _next_token()
@@ -1354,7 +1419,7 @@ class WordNetCorpusReader(CorpusReader):
except StopIteration:
pass
else:
- for _ in xrange(frame_count):
+ for _ in range(frame_count):
# read the plus sign
plus = _next_token()
assert plus == '+'
@@ -1367,14 +1432,16 @@ class WordNetCorpusReader(CorpusReader):
synset._frame_ids.append(frame_number)
for lemma in synset._lemmas:
lemma._frame_ids.append(frame_number)
- lemma._frame_strings.append(frame_string_fmt %
- lemma._name)
+ lemma._frame_strings.append(
+ frame_string_fmt % lemma._name
+ )
# only a specific word in the synset
else:
lemma = synset._lemmas[lemma_number - 1]
lemma._frame_ids.append(frame_number)
- lemma._frame_strings.append(frame_string_fmt %
- lemma._name)
+ lemma._frame_strings.append(
+ frame_string_fmt % lemma._name
+ )
# raise a more informative error with line text
except ValueError as e:
@@ -1402,27 +1469,27 @@ class WordNetCorpusReader(CorpusReader):
return synset
- #////////////////////////////////////////////////////////////
+ #############################################################
# Retrieve synsets and lemmas.
- #////////////////////////////////////////////////////////////
+ #############################################################
- def synsets(self, lemma, pos=None, lang='eng'):
+ def synsets(self, lemma, pos=None, lang='eng', check_exceptions=True):
"""Load all synsets with a given lemma and part of speech tag.
If no pos is specified, all synsets for all parts of speech
- will be loaded.
+ will be loaded.
If lang is specified, all the synsets associated with the lemma name
of that language will be returned.
"""
lemma = lemma.lower()
-
+
if lang == 'eng':
- get_synset = self._synset_from_pos_and_offset
+ get_synset = self.synset_from_pos_and_offset
index = self._lemma_pos_offset_map
if pos is None:
pos = POS_LIST
return [get_synset(p, offset)
for p in pos
- for form in self._morphy(lemma, p)
+ for form in self._morphy(lemma, p, check_exceptions)
for offset in index[form].get(p, [])]
else:
@@ -1439,8 +1506,8 @@ class WordNetCorpusReader(CorpusReader):
name and part of speech tag. Matches any part of speech tag if none is
specified."""
+ lemma = lemma.lower()
if lang == 'eng':
- lemma = lemma.lower()
return [lemma_obj
for synset in self.synsets(lemma, pos)
for lemma_obj in synset.lemmas()
@@ -1453,23 +1520,25 @@ class WordNetCorpusReader(CorpusReader):
for s in syn:
if pos is not None and s.pos() != pos:
continue
- a = Lemma(self, s, lemma, self._lexnames.index(s.lexname()), 0, None)
- a._lang = lang
- lemmas.append(a)
+ for lemma_obj in s.lemmas(lang=lang):
+ if lemma_obj.name().lower() == lemma:
+ lemmas.append(lemma_obj)
return lemmas
def all_lemma_names(self, pos=None, lang='eng'):
"""Return all lemma names for all synsets for the given
- part of speech tag and language or languages. If pos is not specified, all synsets
- for all parts of speech will be used."""
+ part of speech tag and language or languages. If pos is
+ not specified, all synsets for all parts of speech will
+ be used."""
if lang == 'eng':
if pos is None:
return iter(self._lemma_pos_offset_map)
else:
- return (lemma
- for lemma in self._lemma_pos_offset_map
- if pos in self._lemma_pos_offset_map[lemma])
+ return (
+ lemma for lemma in self._lemma_pos_offset_map
+ if pos in self._lemma_pos_offset_map[lemma]
+ )
else:
self._load_lang_data(lang)
lemma = []
@@ -1477,10 +1546,10 @@ class WordNetCorpusReader(CorpusReader):
if pos is not None and i[-1] != pos:
continue
lemma.extend(self._lang_data[lang][0][i])
-
+
lemma = list(set(lemma))
return lemma
-
+
def all_synsets(self, pos=None):
"""Iterate over all synsets with a given part of speech tag.
If no pos is specified, all synsets for all parts of speech
@@ -1551,11 +1620,15 @@ class WordNetCorpusReader(CorpusReader):
elif lang in self.langs():
return self._omw_reader.open("{}/LICENSE".format(lang)).read()
elif lang == 'omw':
- ### under the not unreasonable assumption you don't mean Omwunra-Toqura
+ # under the assumption you don't mean Omwunra-Toqura
return self._omw_reader.open("LICENSE").read()
+ elif lang in self._lang_data:
+ raise WordNetError(
+ "Cannot determine license for user-provided tab file"
+ )
else:
raise WordNetError("Language is not supported.")
-
+
def readme(self, lang='omw'):
"""Return the contents of README (for omw)
use lang=lang to get the readme for an individual language"""
@@ -1564,8 +1637,10 @@ class WordNetCorpusReader(CorpusReader):
elif lang in self.langs():
return self._omw_reader.open("{}/README".format(lang)).read()
elif lang == 'omw':
- ### under the not unreasonable assumption you don't mean Omwunra-Toqura
+ # under the assumption you don't mean Omwunra-Toqura
return self._omw_reader.open("README").read()
+ elif lang in self._lang_data:
+ raise WordNetError("No README for user-provided tab file")
else:
raise WordNetError("Language is not supported.")
@@ -1577,16 +1652,16 @@ class WordNetCorpusReader(CorpusReader):
elif lang in self.langs():
return self._omw_reader.open("{}/citation.bib".format(lang)).read()
elif lang == 'omw':
- ### under the not unreasonable assumption you don't mean Omwunra-Toqura
+ # under the assumption you don't mean Omwunra-Toqura
return self._omw_reader.open("citation.bib").read()
+ elif lang in self._lang_data:
+ raise WordNetError("citation not known for user-provided tab file")
else:
raise WordNetError("Language is not supported.")
-
-
- #////////////////////////////////////////////////////////////
+ #############################################################
# Misc
- #////////////////////////////////////////////////////////////
+ #############################################################
def lemma_count(self, lemma):
"""Return the frequency count for this Lemma"""
# Currently, count only works for English
@@ -1602,15 +1677,21 @@ class WordNetCorpusReader(CorpusReader):
else:
return 0
- def path_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
+ def path_similarity(
+ self, synset1, synset2, verbose=False, simulate_root=True
+ ):
return synset1.path_similarity(synset2, verbose, simulate_root)
path_similarity.__doc__ = Synset.path_similarity.__doc__
- def lch_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
+ def lch_similarity(
+ self, synset1, synset2, verbose=False, simulate_root=True
+ ):
return synset1.lch_similarity(synset2, verbose, simulate_root)
lch_similarity.__doc__ = Synset.lch_similarity.__doc__
- def wup_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
+ def wup_similarity(
+ self, synset1, synset2, verbose=False, simulate_root=True
+ ):
return synset1.wup_similarity(synset2, verbose, simulate_root)
wup_similarity.__doc__ = Synset.wup_similarity.__doc__
@@ -1626,11 +1707,11 @@ class WordNetCorpusReader(CorpusReader):
return synset1.lin_similarity(synset2, ic, verbose)
lin_similarity.__doc__ = Synset.lin_similarity.__doc__
- #////////////////////////////////////////////////////////////
+ #############################################################
# Morphy
- #////////////////////////////////////////////////////////////
+ #############################################################
# Morphy, adapted from Oliver Steele's pywordnet
- def morphy(self, form, pos=None):
+ def morphy(self, form, pos=None, check_exceptions=True):
"""
Find a possible base form for the given form, with the given
part of speech, by checking WordNet's list of exceptional
@@ -1656,7 +1737,7 @@ class WordNetCorpusReader(CorpusReader):
morphy = self._morphy
analyses = chain(a for p in POS_LIST for a in morphy(form, p))
else:
- analyses = self._morphy(form, pos)
+ analyses = self._morphy(form, pos, check_exceptions)
# get the first one we find
first = list(islice(analyses, 1))
@@ -1676,7 +1757,7 @@ class WordNetCorpusReader(CorpusReader):
MORPHOLOGICAL_SUBSTITUTIONS[ADJ_SAT] = MORPHOLOGICAL_SUBSTITUTIONS[ADJ]
- def _morphy(self, form, pos):
+ def _morphy(self, form, pos, check_exceptions=True):
# from jordanbg:
# Given an original string x
# 1. Apply rules once to the input to get y1, y2, y3, etc.
@@ -1705,8 +1786,9 @@ class WordNetCorpusReader(CorpusReader):
return result
# 0. Check the exception lists
- if form in exceptions:
- return filter_forms([form] + exceptions[form])
+ if check_exceptions:
+ if form in exceptions:
+ return filter_forms([form] + exceptions[form])
# 1. Apply rules once to the input to get y1, y2, y3, etc.
forms = apply_rules([form])
@@ -1726,10 +1808,10 @@ class WordNetCorpusReader(CorpusReader):
# Return an empty list if we can't find anything
return []
- #////////////////////////////////////////////////////////////
+ #############################################################
# Create information content from corpus
- #////////////////////////////////////////////////////////////
- def ic(self, corpus, weight_senses_equally = False, smoothing = 1.0):
+ #############################################################
+ def ic(self, corpus, weight_senses_equally=False, smoothing=1.0):
"""
Creates an information content lookup dictionary from a corpus.
@@ -1783,9 +1865,37 @@ class WordNetCorpusReader(CorpusReader):
ic[pos][0] += weight
return ic
+ def custom_lemmas(self, tab_file, lang):
+ """
+ Reads a custom tab file containing mappings of lemmas in the given
+ language to Princeton WordNet 3.0 synset offsets, allowing NLTK's
+ WordNet functions to then be used with that language.
+
+ See the "Tab files" section at http://compling.hss.ntu.edu.sg/omw/ for
+ documentation on the Multilingual WordNet tab file format.
+
+ :param tab_file: Tab file as a file or file-like object
+ :type lang: str
+ :param lang: ISO 639-3 code of the language of the tab file
+ """
+ if len(lang) != 3:
+ raise ValueError('lang should be a (3 character) ISO 639-3 code')
+ self._lang_data[lang] = [defaultdict(list), defaultdict(list)]
+ for l in tab_file.readlines():
+ if isinstance(l, bytes):
+ # Support byte-stream files (e.g. as returned by Python 2's
+ # open() function) as well as text-stream ones
+ l = l.decode('utf-8')
+ l = l.replace('\n', '')
+ l = l.replace(' ', '_')
+ if l[0] != '#':
+ word = l.split('\t')
+ self._lang_data[lang][0][word[0]].append(word[2])
+ self._lang_data[lang][1][word[2].lower()].append(word[0])
+
######################################################################
-## WordNet Information Content Corpus Reader
+# WordNet Information Content Corpus Reader
######################################################################
class WordNetICCorpusReader(CorpusReader):
@@ -1815,7 +1925,7 @@ class WordNetICCorpusReader(CorpusReader):
ic[NOUN] = defaultdict(float)
ic[VERB] = defaultdict(float)
for num, line in enumerate(self.open(icfile)):
- if num == 0: # skip the header
+ if num == 0: # skip the header
continue
fields = line.split()
offset = int(fields[0][:-1])
@@ -1841,31 +1951,33 @@ class WordNetICCorpusReader(CorpusReader):
def path_similarity(synset1, synset2, verbose=False, simulate_root=True):
return synset1.path_similarity(synset2, verbose, simulate_root)
-path_similarity.__doc__ = Synset.path_similarity.__doc__
def lch_similarity(synset1, synset2, verbose=False, simulate_root=True):
return synset1.lch_similarity(synset2, verbose, simulate_root)
-lch_similarity.__doc__ = Synset.lch_similarity.__doc__
def wup_similarity(synset1, synset2, verbose=False, simulate_root=True):
return synset1.wup_similarity(synset2, verbose, simulate_root)
-wup_similarity.__doc__ = Synset.wup_similarity.__doc__
def res_similarity(synset1, synset2, ic, verbose=False):
return synset1.res_similarity(synset2, verbose)
-res_similarity.__doc__ = Synset.res_similarity.__doc__
def jcn_similarity(synset1, synset2, ic, verbose=False):
return synset1.jcn_similarity(synset2, verbose)
-jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__
def lin_similarity(synset1, synset2, ic, verbose=False):
return synset1.lin_similarity(synset2, verbose)
+
+
+path_similarity.__doc__ = Synset.path_similarity.__doc__
+lch_similarity.__doc__ = Synset.lch_similarity.__doc__
+wup_similarity.__doc__ = Synset.wup_similarity.__doc__
+res_similarity.__doc__ = Synset.res_similarity.__doc__
+jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__
lin_similarity.__doc__ = Synset.lin_similarity.__doc__
@@ -1887,9 +1999,11 @@ def _lcs_ic(synset1, synset2, ic, verbose=False):
informative subsumer
"""
if synset1._pos != synset2._pos:
- raise WordNetError('Computing the least common subsumer requires ' + \
- '%s and %s to have the same part of speech.' % \
- (synset1, synset2))
+ raise WordNetError(
+ 'Computing the least common subsumer requires '
+ '%s and %s to have the same part of speech.' %
+ (synset1, synset2)
+ )
ic1 = information_content(synset1, ic)
ic2 = information_content(synset2, ic)
@@ -1930,7 +2044,10 @@ def _get_pos(field):
elif field[-1] == 'v':
return VERB
else:
- msg = "Unidentified part of speech in WordNet Information Content file for field %s" % field
+ msg = (
+ "Unidentified part of speech in WordNet Information Content file "
+ "for field %s" % field
+ )
raise ValueError(msg)
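
Two of the user-visible changes above are the promotion of
synset_from_pos_and_offset() to a public method and the new custom_lemmas()
hook that now backs _load_lang_data(). A hedged sketch of how they combine
(the 'qqq' language code, the single tab line, and the 02084071-n offset for
dog.n.01 are illustrative assumptions; the wordnet data package must be
installed):

    import io
    from nltk.corpus import wordnet as wn

    # Public replacement for the old _synset_from_pos_and_offset(); the
    # underscore name still works but now goes through @deprecated.
    dog = wn.synset_from_pos_and_offset('n', 2084071)  # assumed dog.n.01 offset

    # Register lemmas for a made-up language 'qqq' from an OMW-style tab file
    # (three tab-separated columns: offset-pos, type, lemma).
    tab = io.StringIO(u'02084071-n\tqqq:lemma\thund\n')
    wn.custom_lemmas(tab, lang='qqq')

    # The custom language is now usable by the ordinary lookup functions.
    print(wn.synsets('hund', lang='qqq'))   # expected: [Synset('dog.n.01')]

    # morphy()/synsets() also grew a check_exceptions flag to skip the
    # exceptional-forms lists during lemmatization.
    wn.morphy('geese', wn.NOUN, check_exceptions=False)
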
diff --git a/nltk/corpus/reader/xmldocs.py b/nltk/corpus/reader/xmldocs.py
index 786b8ec..295e91e 100644
--- a/nltk/corpus/reader/xmldocs.py
+++ b/nltk/corpus/reader/xmldocs.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: XML Corpus Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -18,7 +18,8 @@ import codecs
try: from xml.etree import cElementTree as ElementTree
except ImportError: from xml.etree import ElementTree
-from nltk import compat
+from six import string_types
+
from nltk.data import SeekableUnicodeStreamReader
from nltk.tokenize import WordPunctTokenizer
from nltk.internals import ElementWrapper
@@ -42,7 +43,7 @@ class XMLCorpusReader(CorpusReader):
# Make sure we have exactly one file -- no concatenating XML.
if fileid is None and len(self._fileids) == 1:
fileid = self._fileids[0]
- if not isinstance(fileid, compat.string_types):
+ if not isinstance(fileid, string_types):
raise TypeError('Expected a single file identifier string')
# Read the XML in using ElementTree.
elt = ElementTree.parse(self.abspath(fileid).open()).getroot()
@@ -79,7 +80,7 @@ class XMLCorpusReader(CorpusReader):
def raw(self, fileids=None):
if fileids is None: fileids = self._fileids
- elif isinstance(fileids, compat.string_types): fileids = [fileids]
+ elif isinstance(fileids, string_types): fileids = [fileids]
return concat([self.open(f).read() for f in fileids])
@@ -156,7 +157,11 @@ class XMLCorpusView(StreamBackedCorpusView):
def _detect_encoding(self, fileid):
if isinstance(fileid, PathPointer):
- s = fileid.open().readline()
+ try:
+ infile = fileid.open()
+ s = infile.readline()
+ finally:
+ infile.close()
else:
with open(fileid, 'rb') as infile:
s = infile.readline()
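
xmldocs.py above (and ycoe.py below) swap nltk.compat.string_types for
six.string_types in the usual "accept one fileid or a list of fileids" idiom.
A minimal sketch of that normalization, assuming six is available (the helper
name is illustrative):

    from six import string_types

    def normalize_fileids(fileids, default):
        # None -> all fileids; a bare string -> one-element list; else as-is.
        if fileids is None:
            return list(default)
        if isinstance(fileids, string_types):
            return [fileids]
        return list(fileids)

    print(normalize_fileids('a.xml', ['a.xml', 'b.xml']))   # ['a.xml']
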
diff --git a/nltk/corpus/reader/ycoe.py b/nltk/corpus/reader/ycoe.py
index be6716c..a8870b1 100644
--- a/nltk/corpus/reader/ycoe.py
+++ b/nltk/corpus/reader/ycoe.py
@@ -22,7 +22,8 @@ to the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
import os
import re
-from nltk import compat
+from six import string_types
+
from nltk.tokenize import RegexpTokenizer
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.corpus.reader.tagged import TaggedCorpusReader
@@ -63,7 +64,7 @@ class YCOECorpusReader(CorpusReader):
"""
if fileids is None:
return self._documents
- if isinstance(fileids, compat.string_types):
+ if isinstance(fileids, string_types):
fileids = [fileids]
for f in fileids:
if f not in self._fileids:
@@ -78,7 +79,7 @@ class YCOECorpusReader(CorpusReader):
"""
if documents is None:
return self._fileids
- elif isinstance(documents, compat.string_types):
+ elif isinstance(documents, string_types):
documents = [documents]
return sorted(set(['%s.pos' % doc for doc in documents] +
['%s.psd' % doc for doc in documents]))
@@ -91,7 +92,7 @@ class YCOECorpusReader(CorpusReader):
if documents is None:
documents = self._documents
else:
- if isinstance(documents, compat.string_types):
+ if isinstance(documents, string_types):
documents = [documents]
for document in documents:
if document not in self._documents:
@@ -140,8 +141,8 @@ class YCOETaggedCorpusReader(TaggedCorpusReader):
#: A list of all documents and their titles in ycoe.
documents = {
'coadrian.o34': 'Adrian and Ritheus',
- 'coaelhom.o3': '�lfric, Supplemental Homilies',
- 'coaelive.o3': '�lfric\'s Lives of Saints',
+ 'coaelhom.o3': 'Ælfric, Supplemental Homilies',
+ 'coaelive.o3': 'Ælfric\'s Lives of Saints',
'coalcuin': 'Alcuin De virtutibus et vitiis',
'coalex.o23': 'Alexander\'s Letter to Aristotle',
'coapollo.o3': 'Apollonius of Tyre',
@@ -153,8 +154,8 @@ documents = {
'cobyrhtf.o3': 'Byrhtferth\'s Manual',
'cocanedgD': 'Canons of Edgar (D)',
'cocanedgX': 'Canons of Edgar (X)',
- 'cocathom1.o3': '�lfric\'s Catholic Homilies I',
- 'cocathom2.o3': '�lfric\'s Catholic Homilies II',
+ 'cocathom1.o3': 'Ælfric\'s Catholic Homilies I',
+ 'cocathom2.o3': 'Ælfric\'s Catholic Homilies II',
'cochad.o24': 'Saint Chad',
'cochdrul': 'Chrodegang of Metz, Rule',
'cochristoph': 'Saint Christopher',
@@ -173,7 +174,7 @@ documents = {
'codocu4.o24': 'Documents 4 (O2/O4)',
'coeluc1': 'Honorius of Autun, Elucidarium 1',
'coeluc2': 'Honorius of Autun, Elucidarium 1',
- 'coepigen.o3': '�lfric\'s Epilogue to Genesis',
+ 'coepigen.o3': 'Ælfric\'s Epilogue to Genesis',
'coeuphr': 'Saint Euphrosyne',
'coeust': 'Saint Eustace and his companions',
'coexodusP': 'Exodus (P)',
@@ -188,8 +189,8 @@ documents = {
'colaece.o2': 'Leechdoms',
'colaw1cn.o3': 'Laws, Cnut I',
'colaw2cn.o3': 'Laws, Cnut II',
- 'colaw5atr.o3': 'Laws, �thelred V',
- 'colaw6atr.o3': 'Laws, �thelred VI',
+ 'colaw5atr.o3': 'Laws, Æthelred V',
+ 'colaw6atr.o3': 'Laws, Æthelred VI',
'colawaf.o2': 'Laws, Alfred',
'colawafint.o2': 'Alfred\'s Introduction to Laws',
'colawger.o34': 'Laws, Gerefa',
@@ -197,14 +198,14 @@ documents = {
'colawnorthu.o3': 'Northumbra Preosta Lagu',
'colawwllad.o4': 'Laws, William I, Lad',
'coleofri.o4': 'Leofric',
- 'colsigef.o3': '�lfric\'s Letter to Sigefyrth',
- 'colsigewB': '�lfric\'s Letter to Sigeweard (B)',
- 'colsigewZ.o34': '�lfric\'s Letter to Sigeweard (Z)',
- 'colwgeat': '�lfric\'s Letter to Wulfgeat',
- 'colwsigeT': '�lfric\'s Letter to Wulfsige (T)',
- 'colwsigeXa.o34': '�lfric\'s Letter to Wulfsige (Xa)',
- 'colwstan1.o3': '�lfric\'s Letter to Wulfstan I',
- 'colwstan2.o3': '�lfric\'s Letter to Wulfstan II',
+ 'colsigef.o3': 'Ælfric\'s Letter to Sigefyrth',
+ 'colsigewB': 'Ælfric\'s Letter to Sigeweard (B)',
+ 'colsigewZ.o34': 'Ælfric\'s Letter to Sigeweard (Z)',
+ 'colwgeat': 'Ælfric\'s Letter to Wulfgeat',
+ 'colwsigeT': 'Ælfric\'s Letter to Wulfsige (T)',
+ 'colwsigeXa.o34': 'Ælfric\'s Letter to Wulfsige (Xa)',
+ 'colwstan1.o3': 'Ælfric\'s Letter to Wulfstan I',
+ 'colwstan2.o3': 'Ælfric\'s Letter to Wulfstan II',
'comargaC.o34': 'Saint Margaret (C)',
'comargaT': 'Saint Margaret (T)',
'comart1': 'Martyrology, I',
@@ -219,11 +220,11 @@ documents = {
'conicodE': 'Gospel of Nicodemus (E)',
'coorosiu.o2': 'Orosius',
'cootest.o3': 'Heptateuch',
- 'coprefcath1.o3': '�lfric\'s Preface to Catholic Homilies I',
- 'coprefcath2.o3': '�lfric\'s Preface to Catholic Homilies II',
+ 'coprefcath1.o3': 'Ælfric\'s Preface to Catholic Homilies I',
+ 'coprefcath2.o3': 'Ælfric\'s Preface to Catholic Homilies II',
'coprefcura.o2': 'Preface to the Cura Pastoralis',
- 'coprefgen.o3': '�lfric\'s Preface to Genesis',
- 'copreflives.o3': '�lfric\'s Preface to Lives of Saints',
+ 'coprefgen.o3': 'Ælfric\'s Preface to Genesis',
+ 'copreflives.o3': 'Ælfric\'s Preface to Lives of Saints',
'coprefsolilo': 'Preface to Augustine\'s Soliloquies',
'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus',
'corood': 'History of the Holy Rood-Tree',
@@ -231,7 +232,7 @@ documents = {
'cosolilo': 'St. Augustine\'s Soliloquies',
'cosolsat1.o4': 'Solomon and Saturn I',
'cosolsat2': 'Solomon and Saturn II',
- 'cotempo.o3': '�lfric\'s De Temporibus Anni',
+ 'cotempo.o3': 'Ælfric\'s De Temporibus Anni',
'coverhom': 'Vercelli Homilies',
'coverhomE': 'Vercelli Homilies (E)',
'coverhomL': 'Vercelli Homilies (L)',
diff --git a/nltk/corpus/util.py b/nltk/corpus/util.py
index 7e67d05..d23c561 100644
--- a/nltk/corpus/util.py
+++ b/nltk/corpus/util.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Corpus Reader Utility Functions
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -38,12 +38,29 @@ class LazyCorpusLoader(object):
NLTK data package. Once they've properly installed the data
package (or modified ``nltk.data.path`` to point to its location),
they can then use the corpus object without restarting python.
+
+ :param name: The name of the corpus
+ :type name: str
+ :param reader_cls: The specific CorpusReader class, e.g. PlaintextCorpusReader, WordListCorpusReader
+ :type reader_cls: nltk.corpus.reader.api.CorpusReader
+ :param nltk_data_subdir: The subdirectory where the corpus is stored.
+ :type nltk_data_subdir: str
+ :param *args: Any other non-keyword arguments that `reader_cls` might need.
+ :param *kwargs: Any other keyword arguments that `reader_cls` might need.
"""
def __init__(self, name, reader_cls, *args, **kwargs):
from nltk.corpus.reader.api import CorpusReader
assert issubclass(reader_cls, CorpusReader)
self.__name = self.__name__ = name
self.__reader_cls = reader_cls
+ # If nltk_data_subdir is set explicitly
+ if 'nltk_data_subdir' in kwargs:
+ # Use the specified subdirectory path
+ self.subdir = kwargs['nltk_data_subdir']
+ # Pop the `nltk_data_subdir` argument; we don't need it anymore.
+ kwargs.pop('nltk_data_subdir', None)
+ else: # Otherwise use 'nltk_data/corpora'
+ self.subdir = 'corpora'
self.__args = args
self.__kwargs = kwargs
@@ -52,15 +69,15 @@ class LazyCorpusLoader(object):
zip_name = re.sub(r'(([^/]*)(/.*)?)', r'\2.zip/\1/', self.__name)
if TRY_ZIPFILE_FIRST:
try:
- root = nltk.data.find('corpora/%s' % zip_name)
+ root = nltk.data.find('{}/{}'.format(self.subdir, zip_name))
except LookupError as e:
- try: root = nltk.data.find('corpora/%s' % self.__name)
+ try: root = nltk.data.find('{}/{}'.format(self.subdir, self.__name))
except LookupError: raise e
else:
try:
- root = nltk.data.find('corpora/%s' % self.__name)
+ root = nltk.data.find('{}/{}'.format(self.subdir, self.__name))
except LookupError as e:
- try: root = nltk.data.find('corpora/%s' % zip_name)
+ try: root = nltk.data.find('{}/{}'.format(self.subdir, zip_name))
except LookupError: raise e
# Load the corpus.
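
The new nltk_data_subdir keyword lets a LazyCorpusLoader look somewhere other
than nltk_data/corpora. A minimal sketch, assuming UnicharsCorpusReader is
exported from nltk.corpus.reader as the reader/__init__.py changes in this
patch suggest (the 'misc' subdirectory and the fileid regex are illustrative,
not prescribed here):

    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus.reader import UnicharsCorpusReader

    # Bind the reader lazily; the data is only located (under
    # nltk_data/misc/perluniprops, or a perluniprops.zip there) on first use.
    perluniprops = LazyCorpusLoader(
        'perluniprops', UnicharsCorpusReader, r'(?!\.).*\.txt',
        nltk_data_subdir='misc')
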
diff --git a/nltk/data.py b/nltk/data.py
index 2feb5b2..4f4e375 100644
--- a/nltk/data.py
+++ b/nltk/data.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Utility functions
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -32,6 +32,8 @@ to a local file.
"""
from __future__ import print_function, unicode_literals
from __future__ import division
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
import sys
import io
@@ -53,12 +55,12 @@ try:
except ImportError:
import pickle
+from six import string_types, text_type
+from six.moves.urllib.request import urlopen, url2pathname
+
# this import should be more specific:
import nltk
-
-from nltk.compat import py3_data, add_py3_data
-from nltk.compat import text_type, string_types, BytesIO, urlopen, url2pathname
-
+from nltk.compat import py3_data, add_py3_data, BytesIO
######################################################################
# Search Path
@@ -100,8 +102,8 @@ else:
# Util Functions
######################################################################
-def gzip_open_unicode(filename, mode="rb", compresslevel=9,
- encoding='utf-8', fileobj=None, errors=None, newline=None):
+def gzip_open_unicode(filename, mode="rb", compresslevel=9, encoding='utf-8',
+ fileobj=None, errors=None, newline=None):
if fileobj is None:
fileobj = GzipFile(filename, mode, compresslevel, fileobj)
return io.TextIOWrapper(fileobj, encoding, errors, newline)
@@ -241,6 +243,7 @@ def normalize_resource_name(resource_name, allow_relative=True, relative_path=No
# Path Pointers
######################################################################
+ at add_metaclass(ABCMeta)
class PathPointer(object):
"""
An abstract base class for 'path pointers,' used by NLTK's data
@@ -251,6 +254,7 @@ class PathPointer(object):
by reading that zipfile.
"""
+ @abstractmethod
def open(self, encoding=None):
"""
Return a seekable read-only stream that can be used to read
@@ -259,8 +263,8 @@ class PathPointer(object):
:raise IOError: If the path specified by this pointer does
not contain a readable file.
"""
- raise NotImplementedError('abstract base class')
+ @abstractmethod
def file_size(self):
"""
Return the size of the file pointed to by this path pointer,
@@ -269,8 +273,8 @@ class PathPointer(object):
:raise IOError: If the path specified by this pointer does
not contain a readable file.
"""
- raise NotImplementedError('abstract base class')
+ @abstractmethod
def join(self, fileid):
"""
Return a new path pointer formed by starting at the path
@@ -279,7 +283,6 @@ class PathPointer(object):
should be separated by forward slashes, regardless of
the underlying file system's path seperator character.
"""
- raise NotImplementedError('abstract base class')
class FileSystemPathPointer(PathPointer, text_type):
@@ -369,11 +372,6 @@ class BufferedGzipFile(GzipFile):
"""
GzipFile.__init__(self, filename, mode, compresslevel, fileobj)
self._size = kwargs.get('size', self.SIZE)
- # Note: In > Python3.5, GzipFile is already using a
- # buffered reader in the backend which has a variable self._buffer
- # See https://github.com/nltk/nltk/issues/1308
- if sys.version.startswith('3.5'):
- sys.stderr.write("Use the native Python gzip.GzipFile instead.")
self._nltk_buffer = BytesIO()
# cStringIO does not support len.
self._len = 0
@@ -445,7 +443,13 @@ class GzipFileSystemPathPointer(FileSystemPathPointer):
"""
def open(self, encoding=None):
- stream = BufferedGzipFile(self._path, 'rb')
+ # Note: In >= Python3.5, GzipFile is already using a
+ # buffered reader in the backend which has a variable self._buffer
+ # See https://github.com/nltk/nltk/issues/1308
+ if sys.version.startswith('2.7') or sys.version.startswith('3.4'):
+ stream = BufferedGzipFile(self._path, 'rb')
+ else:
+ stream = GzipFile(self._path, 'rb')
if encoding:
stream = SeekableUnicodeStreamReader(stream, encoding)
return stream
@@ -510,7 +514,13 @@ class ZipFilePathPointer(PathPointer):
data = self._zipfile.read(self._entry)
stream = BytesIO(data)
if self._entry.endswith('.gz'):
- stream = BufferedGzipFile(self._entry, fileobj=stream)
+ # Note: In >= Python3.5, GzipFile is already using a
+ # buffered reader in the backend which has a variable self._buffer
+ # See https://github.com/nltk/nltk/issues/1308
+ if sys.version.startswith('2.7') or sys.version.startswith('3.4'):
+ stream = BufferedGzipFile(self._entry, fileobj=stream)
+ else:
+ stream = GzipFile(self._entry, fileobj=stream)
elif encoding is not None:
stream = SeekableUnicodeStreamReader(stream, encoding)
return stream
@@ -527,7 +537,9 @@ class ZipFilePathPointer(PathPointer):
self._zipfile.filename, self._entry)
def __str__(self):
- return os.path.normpath(os.path.join(self._zipfile.filename, self._entry))
+ return os.path.normpath(os.path.join(self._zipfile.filename,
+ self._entry))
+
######################################################################
# Access Functions
@@ -678,6 +690,7 @@ def retrieve(resource_url, filename=None, verbose=True):
infile.close()
+
#: A dictionary describing the formats that are supported by NLTK's
#: load() method. Keys are format names, and values are format
#: descriptions.
@@ -979,7 +992,7 @@ class OpenOnDemandZipFile(zipfile.ZipFile):
zipfile.ZipFile.__init__(self, filename)
assert self.filename == filename
self.close()
- # After closing a ZipFile object, the _fileRefCnt needs to be cleared
+ # After closing a ZipFile object, the _fileRefCnt needs to be cleared
# for Python2and3 compatible code.
self._fileRefCnt = 0
@@ -1444,6 +1457,7 @@ class SeekableUnicodeStreamReader(object):
return None
+
__all__ = ['path', 'PathPointer', 'FileSystemPathPointer', 'BufferedGzipFile',
'GzipFileSystemPathPointer', 'GzipFileSystemPathPointer',
'find', 'retrieve', 'FORMATS', 'AUTO_FORMATS', 'load',
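
PathPointer is now a genuine abstract base class: six.add_metaclass(ABCMeta)
plus @abstractmethod replaces the old "raise NotImplementedError" bodies, so an
incomplete subclass fails at instantiation time rather than at call time. A
self-contained sketch of the same pattern (the class names are illustrative):

    import io
    from abc import ABCMeta, abstractmethod
    from six import add_metaclass

    @add_metaclass(ABCMeta)
    class Pointer(object):
        @abstractmethod
        def open(self, encoding=None):
            """Return a readable stream for the resource."""

    class InMemoryPointer(Pointer):
        def __init__(self, text):
            self._text = text

        def open(self, encoding=None):
            return io.StringIO(self._text)

    # Pointer() raises TypeError (abstract method 'open' not implemented);
    # InMemoryPointer(u'data').open().read() returns u'data'.
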
diff --git a/nltk/decorators.py b/nltk/decorators.py
index 0c36943..6350eae 100644
--- a/nltk/decorators.py
+++ b/nltk/decorators.py
@@ -56,7 +56,11 @@ def getinfo(func):
'self, x, y, *args, **kw'
"""
assert inspect.ismethod(func) or inspect.isfunction(func)
- regargs, varargs, varkwargs, defaults = inspect.getargspec(func)
+ if sys.version_info[0] >= 3:
+ argspec = inspect.getfullargspec(func)
+ else:
+ argspec = inspect.getargspec(func)
+ regargs, varargs, varkwargs, defaults = argspec[:4]
argnames = list(regargs)
if varargs:
argnames.append(varargs)
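
inspect.getargspec() is unavailable in newer Python 3 releases, so getinfo()
now branches on the interpreter version and consumes only the first four
argspec fields, which the two APIs share. A standalone sketch of that
version-gated introspection (helper and example names are illustrative):

    import sys
    import inspect

    def basic_argspec(func):
        # getfullargspec() (Python 3) and getargspec() (Python 2) both start
        # with: regular args, *varargs name, **kwargs name, defaults.
        if sys.version_info[0] >= 3:
            spec = inspect.getfullargspec(func)
        else:
            spec = inspect.getargspec(func)
        return tuple(spec[:4])

    def example(self, x, y=1, *args, **kw):
        pass

    print(basic_argspec(example))
    # -> (['self', 'x', 'y'], 'args', 'kw', (1,))
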
diff --git a/nltk/downloader.py b/nltk/downloader.py
index d4ea94c..7beb2c4 100644
--- a/nltk/downloader.py
+++ b/nltk/downloader.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Corpus & Model Downloader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -159,14 +159,14 @@ they didn't download that model.
default: unzip or not?
"""
-import time, os, zipfile, sys, textwrap, threading, itertools
+import time, os, zipfile, sys, textwrap, threading, itertools, shutil
from hashlib import md5
try:
TKINTER = True
- from tkinter import (Tk, Frame, Label, Entry, Button, Canvas, Menu, IntVar,
- TclError)
- from tkinter.messagebox import showerror
+ from six.moves.tkinter import (Tk, Frame, Label, Entry, Button, Canvas,
+ Menu, IntVar, TclError)
+ from six.moves.tkinter_messagebox import showerror
from nltk.draw.table import Table
from nltk.draw.util import ShowText
except:
@@ -174,8 +174,14 @@ except:
TclError = ValueError
from xml.etree import ElementTree
+
+from six import string_types, text_type
+from six.moves import input
+from six.moves.urllib.request import urlopen
+from six.moves.urllib.error import HTTPError, URLError
+
import nltk
-from nltk import compat
+from nltk.compat import python_2_unicode_compatible
#urllib2 = nltk.internals.import_from_stdlib('urllib2')
@@ -183,7 +189,7 @@ from nltk import compat
# Directory entry objects (from the data server's index file)
######################################################################
- at compat.python_2_unicode_compatible
+ at python_2_unicode_compatible
class Package(object):
"""
A directory entry for a downloadable package. These entries are
@@ -253,10 +259,10 @@ class Package(object):
@staticmethod
def fromxml(xml):
- if isinstance(xml, compat.string_types):
+ if isinstance(xml, string_types):
xml = ElementTree.parse(xml)
for key in xml.attrib:
- xml.attrib[key] = compat.text_type(xml.attrib[key])
+ xml.attrib[key] = text_type(xml.attrib[key])
return Package(**xml.attrib)
def __lt__(self, other):
@@ -265,7 +271,7 @@ class Package(object):
def __repr__(self):
return '<Package %s>' % self.id
- at compat.python_2_unicode_compatible
+ at python_2_unicode_compatible
class Collection(object):
"""
A directory entry for a collection of downloadable packages.
@@ -292,10 +298,10 @@ class Collection(object):
@staticmethod
def fromxml(xml):
- if isinstance(xml, compat.string_types):
+ if isinstance(xml, string_types):
xml = ElementTree.parse(xml)
for key in xml.attrib:
- xml.attrib[key] = compat.text_type(xml.attrib[key])
+ xml.attrib[key] = text_type(xml.attrib[key])
children = [child.get('ref') for child in xml.findall('item')]
return Collection(children=children, **xml.attrib)
@@ -472,7 +478,7 @@ class Downloader(object):
print(' [%s] %s %s' % (prefix, info.id.ljust(20, '.'), name))
lines += len(name.split('\n')) # for more_prompt
if more_prompt and lines > 20:
- user_input = compat.raw_input("Hit Enter to continue: ")
+ user_input = input("Hit Enter to continue: ")
if (user_input.lower() in ('x', 'q')): return
lines = 0
print()
@@ -504,7 +510,7 @@ class Downloader(object):
#/////////////////////////////////////////////////////////////////
def _info_or_id(self, info_or_id):
- if isinstance(info_or_id, compat.string_types):
+ if isinstance(info_or_id, string_types):
return self.info(info_or_id)
else:
return info_or_id
@@ -610,7 +616,7 @@ class Downloader(object):
yield StartDownloadMessage(info)
yield ProgressMessage(5)
try:
- infile = compat.urlopen(info.url)
+ infile = urlopen(info.url)
with open(filepath, 'wb') as outfile:
#print info.size
num_blocks = max(1, info.size/(1024*16))
@@ -672,7 +678,7 @@ class Downloader(object):
self._errors = True
if not quiet:
print("Error installing package. Retry? [n/y/e]")
- choice = compat.raw_input().strip()
+ choice = input().strip()
if choice in ['y', 'Y']:
if not self.download(msg.package.id, download_dir,
quiet, force, prefix,
@@ -756,7 +762,7 @@ class Downloader(object):
else:
filepath = os.path.join(download_dir, info.filename)
if download_dir != self._download_dir:
- status = self._pkg_status(info, filepath)
+ return self._pkg_status(info, filepath)
else:
if info.id not in self._status_cache:
self._status_cache[info.id] = self._pkg_status(info,
@@ -822,7 +828,7 @@ class Downloader(object):
# Download the index file.
self._index = nltk.internals.ElementWrapper(
- ElementTree.parse(compat.urlopen(self._url)).getroot())
+ ElementTree.parse(urlopen(self._url)).getroot())
self._index_timestamp = time.time()
# Build a dictionary of packages.
@@ -853,8 +859,10 @@ class Downloader(object):
for child in queue:
if isinstance(child, Collection):
queue.extend(child.children)
- else:
+ elif isinstance(child, Package):
packages[child.id] = child
+ else:
+ pass
collection.packages = packages.values()
# Flush the status cache
@@ -995,7 +1003,7 @@ class DownloaderShell(object):
while True:
self._simple_interactive_menu(
'd) Download', 'l) List', ' u) Update', 'c) Config', 'h) Help', 'q) Quit')
- user_input = compat.raw_input('Downloader> ').strip()
+ user_input = input('Downloader> ').strip()
if not user_input: print(); continue
command = user_input.lower().split()[0]
args = user_input.split()[1:]
@@ -1016,9 +1024,9 @@ class DownloaderShell(object):
self._simple_interactive_update()
else:
print('Command %r unrecognized' % user_input)
- except compat.HTTPError as e:
+ except HTTPError as e:
print('Error reading from server: %s'%e)
- except compat.URLError as e:
+ except URLError as e:
print('Error connecting to server: %s'%e.reason)
# try checking if user_input is a package name, &
# downloading it?
@@ -1033,7 +1041,7 @@ class DownloaderShell(object):
while True:
print()
print('Download which package (l=list; x=cancel)?')
- user_input = compat.raw_input(' Identifier> ')
+ user_input = input(' Identifier> ')
if user_input.lower()=='l':
self._ds.list(self._ds.download_dir, header=False,
more_prompt=True, skip_installed=True)
@@ -1063,7 +1071,7 @@ class DownloaderShell(object):
print(' [ ] %s %s' % (pid.ljust(20, '.'), name))
print()
- user_input = compat.raw_input(' Identifier> ')
+ user_input = input(' Identifier> ')
if user_input.lower()=='o':
for pid, pname in stale_packages:
try: self._ds.download(pid, prefix=' ')
@@ -1101,11 +1109,11 @@ class DownloaderShell(object):
self._simple_interactive_menu(
's) Show Config', 'u) Set Server URL',
'd) Set Data Dir', 'm) Main Menu')
- user_input = compat.raw_input('Config> ').strip().lower()
+ user_input = input('Config> ').strip().lower()
if user_input == 's':
self._show_config()
elif user_input == 'd':
- new_dl_dir = compat.raw_input(' New Directory> ').strip()
+ new_dl_dir = input(' New Directory> ').strip()
if new_dl_dir in ('', 'x', 'q', 'X', 'Q'):
print(' Cancelled!')
elif os.path.isdir(new_dl_dir):
@@ -1114,7 +1122,7 @@ class DownloaderShell(object):
print(('Directory %r not found! Create it first.' %
new_dl_dir))
elif user_input == 'u':
- new_url = compat.raw_input(' New URL> ').strip()
+ new_url = input(' New URL> ').strip()
if new_url in ('', 'x', 'q', 'X', 'Q'):
print(' Cancelled!')
else:
@@ -1232,9 +1240,9 @@ class DownloaderGUI(object):
self._init_menu()
try:
self._fill_table()
- except compat.HTTPError as e:
+ except HTTPError as e:
showerror('Error reading from server', e)
- except compat.URLError as e:
+ except URLError as e:
showerror('Error connecting to server', e.reason)
self._show_info()
@@ -1414,9 +1422,9 @@ class DownloaderGUI(object):
self._ds.clear_status_cache()
try:
self._fill_table()
- except compat.HTTPError as e:
+ except HTTPError as e:
showerror('Error reading from server', e)
- except compat.URLError as e:
+ except URLError as e:
showerror('Error connecting to server', e.reason)
self._table.select(0)
@@ -1441,7 +1449,7 @@ class DownloaderGUI(object):
def _table_reprfunc(self, row, col, val):
if self._table.column_names[col].endswith('Size'):
- if isinstance(val, compat.string_types): return ' %s' % val
+ if isinstance(val, string_types): return ' %s' % val
elif val < 1024**2: return ' %.1f KB' % (val/1024.**1)
elif val < 1024**3: return ' %.1f MB' % (val/1024.**2)
else: return ' %.1f GB' % (val/1024.**3)
@@ -1467,9 +1475,9 @@ class DownloaderGUI(object):
self._ds.download_dir = download_dir
try:
self._fill_table()
- except compat.HTTPError as e:
+ except HTTPError as e:
showerror('Error reading from server', e)
- except compat.URLError as e:
+ except URLError as e:
showerror('Error connecting to server', e.reason)
self._show_info()
@@ -1489,9 +1497,9 @@ class DownloaderGUI(object):
self._tab = self._tab_names[i-1].lower()
try:
return self._fill_table()
- except compat.HTTPError as e:
+ except HTTPError as e:
showerror('Error reading from server', e)
- except compat.URLError as e:
+ except URLError as e:
showerror('Error connecting to server', e.reason)
def _next_tab(self, *e):
@@ -1500,18 +1508,18 @@ class DownloaderGUI(object):
self._tab = self._tab_names[i+1].lower()
try:
return self._fill_table()
- except compat.HTTPError as e:
+ except HTTPError as e:
showerror('Error reading from server', e)
- except compat.URLError as e:
+ except URLError as e:
showerror('Error connecting to server', e.reason)
def _select_tab(self, event):
self._tab = event.widget['text'].lower()
try:
self._fill_table()
- except compat.HTTPError as e:
+ except HTTPError as e:
showerror('Error reading from server', e)
- except compat.URLError as e:
+ except URLError as e:
showerror('Error connecting to server', e.reason)
_tab = 'collections'
@@ -1753,14 +1761,10 @@ class DownloaderGUI(object):
"Written by Edward Loper")
TITLE = 'About: NLTK Downloader'
try:
- from tkMessageBox import Message
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except ImportError:
- try:
- from tkinter.messagebox import Message
- Message(message=ABOUT, title=TITLE).show()
- except ImportError:
- ShowText(self.top, TITLE, ABOUT)
+ ShowText(self.top, TITLE, ABOUT)
#/////////////////////////////////////////////////////////////////
# Progress Bar
@@ -1965,7 +1969,7 @@ def md5_hexdigest(file):
Calculate and return the MD5 checksum for a given file.
``file`` may either be a filename or an open stream.
"""
- if isinstance(file, compat.string_types):
+ if isinstance(file, string_types):
with open(file, 'rb') as infile:
return _md5_hexdigest(infile)
return _md5_hexdigest(file)
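As the docstring above notes, md5_hexdigest accepts either a filename or an already-open binary stream. A small self-contained sketch (the temporary file exists only for the demonstration):

    import tempfile

    from nltk.downloader import md5_hexdigest

    # Write a throwaway file so the example has something to hash.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(b'hello nltk')
        path = tmp.name

    by_name = md5_hexdigest(path)            # pass a filename ...
    with open(path, 'rb') as stream:
        by_stream = md5_hexdigest(stream)    # ... or an open binary stream
    assert by_name == by_stream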
@@ -2030,13 +2034,12 @@ def _unzip_iter(filename, root, verbose=True):
for i, filename in enumerate(filelist):
filepath = os.path.join(root, *filename.split('/'))
- with open(filepath, 'wb') as outfile:
- try:
- contents = zf.read(filename)
- except Exception as e:
- yield ErrorMessage(filename, e)
- return
- outfile.write(contents)
+ try:
+ with open(filepath, 'wb') as dstfile, zf.open(filename) as srcfile:
+ shutil.copyfileobj(srcfile, dstfile)
+ except Exception as e:
+ yield ErrorMessage(filename, e)
+ return
if verbose and (i*10/len(filelist) > (i-1)*10/len(filelist)):
sys.stdout.write('.')
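The reworked extraction loop above copies each archive member to disk with shutil.copyfileobj rather than reading the whole member into memory first. A minimal stand-alone sketch of the same pattern (the archive and member names are hypothetical):

    import os
    import shutil
    import zipfile

    def extract_member(zip_path, member, dest_root):
        """Stream a single zip member to dest_root without loading it whole."""
        dest = os.path.join(dest_root, *member.split('/'))
        with zipfile.ZipFile(zip_path) as zf:
            # ZipFile.open() returns a file-like object, so copyfileobj can
            # move the data in fixed-size chunks.
            with zf.open(member) as src, open(dest, 'wb') as dst:
                shutil.copyfileobj(src, dst)

    # extract_member('packages/punkt.zip', 'punkt/README', '/tmp')  # hypothetical names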
diff --git a/nltk/draw/__init__.py b/nltk/draw/__init__.py
index 78088aa..fdc6678 100644
--- a/nltk/draw/__init__.py
+++ b/nltk/draw/__init__.py
@@ -1,15 +1,14 @@
# Natural Language Toolkit: graphical representations package
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# Import Tkinter-based modules if Tkinter is installed
-import nltk.compat
try:
- import tkinter
+ from six.moves import tkinter
except ImportError:
import warnings
warnings.warn("nltk.draw package not loaded "
@@ -18,9 +17,10 @@ else:
from nltk.draw.cfg import ProductionList, CFGEditor, CFGDemo
from nltk.draw.tree import (TreeSegmentWidget, tree_to_treesegment,
TreeWidget, TreeView, draw_trees)
- from nltk.draw.dispersion import dispersion_plot
from nltk.draw.table import Table
+from nltk.draw.dispersion import dispersion_plot
+
# skip doctests from this package
def setup_module(module):
from nose import SkipTest
diff --git a/nltk/draw/cfg.py b/nltk/draw/cfg.py
index 2cdd7b1..3038f9f 100644
--- a/nltk/draw/cfg.py
+++ b/nltk/draw/cfg.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: CFG visualization
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -46,13 +46,11 @@ Visualization tools for CFGs.
# - disconnect top & bottom -- right click
# - if connected to top & bottom, then disconnect
-
-
-import nltk.compat
import re
-from tkinter import (Button, Canvas, Entry, Frame, IntVar, Label,
- Scrollbar, Text, Tk, Toplevel)
+from six import string_types
+from six.moves.tkinter import (Button, Canvas, Entry, Frame, IntVar, Label,
+ Scrollbar, Text, Tk, Toplevel)
from nltk.grammar import (CFG, _read_cfg_production,
Nonterminal, nonterminals)
@@ -60,7 +58,6 @@ from nltk.tree import Tree
from nltk.draw.tree import TreeSegmentWidget, tree_to_treesegment
from nltk.draw.util import (CanvasFrame, ColorizedList, ShowText,
SymbolWidget, TextWidget)
-from nltk import compat
######################################################################
# Production List
@@ -617,7 +614,7 @@ class CFGDemo(object):
isinstance(widget, TreeSegmentWidget) and
node.symbol == widget.label().text()):
pass # matching nonterminal
- elif (isinstance(node, compat.string_types) and
+ elif (isinstance(node, string_types) and
isinstance(widget, TextWidget) and
node == widget.text()):
pass # matching nonterminal
diff --git a/nltk/draw/dispersion.py b/nltk/draw/dispersion.py
index 206c05f..5f3a568 100644
--- a/nltk/draw/dispersion.py
+++ b/nltk/draw/dispersion.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Dispersion Plots
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/draw/table.py b/nltk/draw/table.py
index 898496e..7894f8e 100644
--- a/nltk/draw/table.py
+++ b/nltk/draw/table.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Table widget
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -11,10 +11,10 @@ Tkinter widgets for displaying multi-column listboxes and tables.
from __future__ import division
-import nltk.compat
+
import operator
-from tkinter import (Frame, Label, Listbox, Scrollbar, Tk)
+from six.moves.tkinter import (Frame, Label, Listbox, Scrollbar, Tk)
######################################################################
@@ -808,7 +808,7 @@ class Table(object):
"""
Delete the ``row_index``th row from this table.
"""
- if isinstance(index, slice):
+ if isinstance(row_index, slice):
raise ValueError('Slicing not supported')
if isinstance(row_index, tuple) and len(row_index)==2:
raise ValueError('Cannot delete a single cell!')
diff --git a/nltk/draw/tree.py b/nltk/draw/tree.py
index 8b4e37e..f421d13 100644
--- a/nltk/draw/tree.py
+++ b/nltk/draw/tree.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Graphical Representations for Trees
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -9,10 +9,7 @@
Graphically display a Tree.
"""
-import nltk.compat
-import sys
-
-from tkinter import IntVar, Menu, Tk
+from six.moves.tkinter import IntVar, Menu, Tk
from nltk.util import in_idle
from nltk.tree import Tree
diff --git a/nltk/draw/util.py b/nltk/draw/util.py
index c5a2e4e..e4006bc 100644
--- a/nltk/draw/util.py
+++ b/nltk/draw/util.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Drawing utilities
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -33,13 +33,12 @@ structures. For more information, see the CLIG
homepage (http://www.ags.uni-sb.de/~konrad/clig.html).
"""
-
-
-import nltk.compat
-from tkinter import (Button, Canvas, Entry, Frame, Label, Menu, Menubutton,
- RAISED, Scrollbar, StringVar, Text, Tk, Toplevel, Widget)
-
-import tkinter.font, tkinter.messagebox, tkinter.filedialog
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+from six.moves.tkinter import (Button, Canvas, Entry, Frame, Label, Menu,
+ Menubutton, Scrollbar, StringVar, Text, Tk,
+ Toplevel, Widget, RAISED)
+from six.moves.tkinter_tkfiledialog import asksaveasfilename
from nltk.util import in_idle
@@ -47,6 +46,8 @@ from nltk.util import in_idle
## CanvasWidget
##//////////////////////////////////////////////////////
+
+ at add_metaclass(ABCMeta)
class CanvasWidget(object):
"""
A collection of graphical elements and bindings used to display a
@@ -664,6 +665,7 @@ class CanvasWidget(object):
## Defined by subclass
##//////////////////////////////////////////////////////
+ @abstractmethod
def _tags(self):
"""
:return: a list of canvas tags for all graphical elements
@@ -671,7 +673,6 @@ class CanvasWidget(object):
elements managed by its child widgets.
:rtype: list of int
"""
- raise NotImplementedError()
def _manage(self):
"""
@@ -682,7 +683,6 @@ class CanvasWidget(object):
:rtype: None
"""
- pass
def _update(self, child):
"""
@@ -693,7 +693,6 @@ class CanvasWidget(object):
:type child: CanvasWidget
:rtype: None
"""
- pass
##//////////////////////////////////////////////////////
## Basic widgets.
@@ -1705,18 +1704,21 @@ class CanvasFrame(object):
:rtype: None
"""
if filename is None:
- from tkinter.filedialog import asksaveasfilename
ftypes = [('Postscript files', '.ps'),
('All files', '*')]
filename = asksaveasfilename(filetypes=ftypes,
defaultextension='.ps')
if not filename: return
(x0, y0, w, h) = self.scrollregion()
- self._canvas.postscript(file=filename, x=x0, y=y0,
+ postscript = self._canvas.postscript(x=x0, y=y0,
width=w+2, height=h+2,
pagewidth=w+2, # points = 1/72 inch
pageheight=h+2, # points = 1/72 inch
pagex=0, pagey=0)
+ # workaround for bug in Tk font handling
+ postscript = postscript.replace(' 0 scalefont ', ' 9 scalefont ')
+ with open(filename, 'wb') as f:
+ f.write(postscript.encode('utf8'))
def scrollregion(self):
"""
@@ -2034,15 +2036,15 @@ class ColorizedList(object):
#////////////////////////////////////////////////////////////
# Abstract methods
#////////////////////////////////////////////////////////////
-
+ @abstractmethod
def _init_colortags(self, textwidget, options):
"""
Set up any colortags that will be used by this colorized list.
E.g.:
>>> textwidget.tag_config('terminal', foreground='black')
"""
- raise NotImplementedError()
+ @abstractmethod
def _item_repr(self, item):
"""
Return a list of (text, colortag) tuples that make up the
@@ -2050,7 +2052,6 @@ class ColorizedList(object):
representations may not span multiple lines. I.e., the text
strings returned may not contain newline characters.
"""
- raise NotImplementedError()
#////////////////////////////////////////////////////////////
# Item Access
@@ -2353,4 +2354,3 @@ def demo():
if __name__ == '__main__':
demo()
-
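CanvasWidget and ColorizedList above now declare their extension points with abc and six instead of raising NotImplementedError by hand. A minimal sketch of that pattern, with hypothetical widget names:

    from abc import ABCMeta, abstractmethod

    from six import add_metaclass

    @add_metaclass(ABCMeta)        # portable across Python 2 and 3
    class BaseWidget(object):
        @abstractmethod
        def _tags(self):
            """Return the canvas tags managed by this widget."""

    class BoxWidget(BaseWidget):
        def _tags(self):
            return []

    BoxWidget()      # fine: the abstract method is implemented
    # BaseWidget() would raise TypeError because _tags is abstract.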
diff --git a/nltk/featstruct.py b/nltk/featstruct.py
index 81bf6b3..7795286 100644
--- a/nltk/featstruct.py
+++ b/nltk/featstruct.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Feature Structures
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>,
# Rob Speer,
# Steven Bird <stevenbird1 at gmail.com>
@@ -92,12 +92,14 @@ from __future__ import print_function, unicode_literals, division
import re
import copy
+from functools import total_ordering
+
+from six import integer_types, string_types
from nltk.internals import read_str, raise_unorderable_types
from nltk.sem.logic import (Variable, Expression, SubstituteBindingsI,
LogicParser, LogicalExpressionException)
-from nltk.compat import (string_types, integer_types, total_ordering,
- python_2_unicode_compatible, unicode_repr)
+from nltk.compat import python_2_unicode_compatible, unicode_repr
######################################################################
# Feature Structure
diff --git a/nltk/grammar.py b/nltk/grammar.py
index 8ae8d45..6c09500 100644
--- a/nltk/grammar.py
+++ b/nltk/grammar.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Context Free Grammars
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# Jason Narad <jason.narad at gmail.com>
@@ -68,13 +68,15 @@ The operation of replacing the left hand side (*lhs*) of a production
with the right hand side (*rhs*) in a tree (*tree*) is known as
"expanding" *lhs* to *rhs* in *tree*.
"""
-from __future__ import print_function, unicode_literals, division
+from __future__ import print_function, unicode_literals, division
import re
+from functools import total_ordering
+
+from six import string_types
from nltk.util import transitive_closure, invert_graph
-from nltk.compat import (string_types, total_ordering, text_type,
- python_2_unicode_compatible, unicode_repr)
+from nltk.compat import python_2_unicode_compatible, unicode_repr
from nltk.internals import raise_unorderable_types
from nltk.probability import ImmutableProbabilisticMixIn
@@ -179,6 +181,21 @@ class Nonterminal(object):
"""
return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))
+
+ def __truediv__(self, rhs):
+ """
+ Return a new nonterminal whose symbol is ``A/B``, where ``A`` is
+ the symbol for this nonterminal, and ``B`` is the symbol for rhs.
+ This function allows use of the slash ``/`` operator when
+ true division is in effect (``from __future__ import division``).
+
+ :param rhs: The nonterminal used to form the right hand side
+ of the new nonterminal.
+ :type rhs: Nonterminal
+ :rtype: Nonterminal
+ """
+ return self.__div__(rhs)
+
def nonterminals(symbols):
"""
Given a string containing a list of symbol names, return a list of
@@ -1513,4 +1530,3 @@ __all__ = ['Nonterminal', 'nonterminals',
'DependencyGrammar', 'DependencyProduction',
'ProbabilisticDependencyGrammar',
'induce_pcfg', 'read_grammar']
-
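The added __truediv__ just delegates to __div__, so the slash operator on nonterminals keeps working under true division (Python 3, or Python 2 with the future import). For example:

    from __future__ import division

    from nltk.grammar import Nonterminal

    np, pp = Nonterminal('NP'), Nonterminal('PP')
    slashed = np / pp              # dispatches to __truediv__
    print(slashed.symbol())        # prints: NP/PP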
diff --git a/nltk/help.py b/nltk/help.py
index cad41d3..2d2f516 100644
--- a/nltk/help.py
+++ b/nltk/help.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit (NLTK) Help
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/inference/__init__.py b/nltk/inference/__init__.py
index a5d89c4..94581e5 100644
--- a/nltk/inference/__init__.py
+++ b/nltk/inference/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Inference
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Dan Garrette <dhgarrette at gmail.com>
# Ewan Klein <ewan at inf.ed.ac.uk>
#
diff --git a/nltk/inference/api.py b/nltk/inference/api.py
index d8185ef..ca03a96 100644
--- a/nltk/inference/api.py
+++ b/nltk/inference/api.py
@@ -18,10 +18,14 @@ goal *G*, the model builder tries to find a counter-model, in the sense of a mod
the assumptions plus the negation of *G*.
"""
from __future__ import print_function
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
import threading
import time
+
+ at add_metaclass(ABCMeta)
class Prover(object):
"""
Interface for trying to prove a goal from assumptions. Both the goal and
@@ -34,13 +38,15 @@ class Prover(object):
"""
return self._prove(goal, assumptions, verbose)[0]
+ @abstractmethod
def _prove(self, goal=None, assumptions=None, verbose=False):
"""
:return: Whether the proof was successful or not, along with the proof
:rtype: tuple: (bool, str)
"""
- raise NotImplementedError()
+
+ at add_metaclass(ABCMeta)
class ModelBuilder(object):
"""
Interface for trying to build a model of set of formulas.
@@ -56,20 +62,22 @@ class ModelBuilder(object):
"""
return self._build_model(goal, assumptions, verbose)[0]
+ @abstractmethod
def _build_model(self, goal=None, assumptions=None, verbose=False):
"""
Perform the actual model building.
:return: Whether a model was generated, and the model itself
:rtype: tuple(bool, sem.Valuation)
"""
- raise NotImplementedError()
+ at add_metaclass(ABCMeta)
class TheoremToolCommand(object):
"""
This class holds a goal and a list of assumptions to be used in proving
or model building.
"""
+ @abstractmethod
def add_assumptions(self, new_assumptions):
"""
Add new assumptions to the assumption list.
@@ -77,8 +85,8 @@ class TheoremToolCommand(object):
:param new_assumptions: new assumptions
:type new_assumptions: list(sem.Expression)
"""
- raise NotImplementedError()
+ @abstractmethod
def retract_assumptions(self, retracted, debug=False):
"""
Retract assumptions from the assumption list.
@@ -89,29 +97,28 @@ class TheoremToolCommand(object):
:param retracted: assumptions to be retracted
:type retracted: list(sem.Expression)
"""
- raise NotImplementedError()
+ @abstractmethod
def assumptions(self):
"""
List the current assumptions.
:return: list of ``Expression``
"""
- raise NotImplementedError()
+ @abstractmethod
def goal(self):
"""
Return the goal
:return: ``Expression``
"""
- raise NotImplementedError()
+ @abstractmethod
def print_assumptions(self):
"""
Print the list of the current assumptions.
"""
- raise NotImplementedError()
class ProverCommand(TheoremToolCommand):
@@ -119,26 +126,26 @@ class ProverCommand(TheoremToolCommand):
This class holds a ``Prover``, a goal, and a list of assumptions. When
prove() is called, the ``Prover`` is executed with the goal and assumptions.
"""
+ @abstractmethod
def prove(self, verbose=False):
"""
Perform the actual proof.
"""
- raise NotImplementedError()
+ @abstractmethod
def proof(self, simplify=True):
"""
Return the proof string
:param simplify: bool simplify the proof?
:return: str
"""
- raise NotImplementedError()
+ @abstractmethod
def get_prover(self):
"""
Return the prover object
:return: ``Prover``
"""
- raise NotImplementedError()
class ModelBuilderCommand(TheoremToolCommand):
@@ -147,14 +154,15 @@ class ModelBuilderCommand(TheoremToolCommand):
When build_model() is called, the ``ModelBuilder`` is executed with the goal
and assumptions.
"""
+ @abstractmethod
def build_model(self, verbose=False):
"""
Perform the actual model building.
:return: A model if one is generated; None otherwise.
:rtype: sem.Valuation
"""
- raise NotImplementedError()
+ @abstractmethod
def model(self, format=None):
"""
Return a string representation of the model
@@ -162,14 +170,13 @@ class ModelBuilderCommand(TheoremToolCommand):
:param simplify: bool simplify the proof?
:return: str
"""
- raise NotImplementedError()
+ @abstractmethod
def get_model_builder(self):
"""
Return the model builder object
:return: ``ModelBuilder``
"""
- raise NotImplementedError()
class BaseTheoremToolCommand(TheoremToolCommand):
@@ -369,8 +376,8 @@ class TheoremToolCommandDecorator(TheoremToolCommand):
"""
self._command = command
- #The decorator has its own versions of 'result' different from the
- #underlying command
+ # The decorator has its own versions of 'result' different from the
+ # underlying command
self._result = None
def assumptions(self):
@@ -402,8 +409,8 @@ class ProverCommandDecorator(TheoremToolCommandDecorator, ProverCommand):
"""
TheoremToolCommandDecorator.__init__(self, proverCommand)
- #The decorator has its own versions of 'result' and 'proof'
- #because they may be different from the underlying command
+ # The decorator has its own versions of 'result' and 'proof'
+ # because they may be different from the underlying command
self._proof = None
def prove(self, verbose=False):
@@ -449,8 +456,8 @@ class ModelBuilderCommandDecorator(TheoremToolCommandDecorator, ModelBuilderComm
"""
TheoremToolCommandDecorator.__init__(self, modelBuilderCommand)
- #The decorator has its own versions of 'result' and 'valuation'
- #because they may be different from the underlying command
+ # The decorator has its own versions of 'result' and 'valuation'
+ # because they may be different from the underlying command
self._model = None
def build_model(self, verbose=False):
@@ -528,6 +535,7 @@ class ParallelProverBuilder(Prover, ModelBuilder):
else:
return None
+
class ParallelProverBuilderCommand(BaseProverCommand, BaseModelBuilderCommand):
"""
This command stores both a prover and a model builder and when either
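With Prover now an abstract base class, a concrete prover only has to implement _prove and inherits prove(), which returns the boolean half of the (success, proof) pair. A toy sketch, not one of NLTK's real provers:

    from nltk.inference.api import Prover

    class TrivialProver(Prover):
        """Toy prover: a goal counts as proved iff it is literally an assumption."""
        def _prove(self, goal=None, assumptions=None, verbose=False):
            assumptions = assumptions or []
            success = goal in assumptions
            proof = 'goal found among assumptions' if success else ''
            return (success, proof)

    # TrivialProver().prove(goal, assumptions) returns just the boolean part.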
diff --git a/nltk/inference/discourse.py b/nltk/inference/discourse.py
index ee5aed2..a04d360 100644
--- a/nltk/inference/discourse.py
+++ b/nltk/inference/discourse.py
@@ -43,6 +43,8 @@ The set of all threads for a discourse is the Cartesian product of all the readi
those threads which are consistent (taking into account any background assumptions).
"""
from __future__ import print_function
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
import os
from operator import and_, add
@@ -60,13 +62,14 @@ from nltk.inference.mace import MaceCommand
from nltk.inference.prover9 import Prover9Command
+ at add_metaclass(ABCMeta)
class ReadingCommand(object):
+ @abstractmethod
def parse_to_readings(self, sentence):
"""
:param sentence: the sentence to read
:type sentence: str
"""
- raise NotImplementedError()
def process_thread(self, sentence_readings):
"""
@@ -80,6 +83,7 @@ class ReadingCommand(object):
"""
return sentence_readings
+ @abstractmethod
def combine_readings(self, readings):
"""
:param readings: readings to combine
@@ -87,8 +91,8 @@ class ReadingCommand(object):
:return: one combined reading
:rtype: Expression
"""
- raise NotImplementedError()
+ @abstractmethod
def to_fol(self, expression):
"""
Convert this expression into a First-Order Logic expression.
@@ -98,7 +102,6 @@ class ReadingCommand(object):
:return: a FOL version of the input expression
:rtype: Expression
"""
- raise NotImplementedError()
class CfgReadingCommand(ReadingCommand):
@@ -299,7 +302,6 @@ class DiscourseTester(object):
if (tid, True) in consistency_checked:
self._filtered_threads[tid] = thread
-
def _show_readings(self, sentence=None):
"""
Print out the readings for the discourse (or a single sentence).
@@ -492,6 +494,7 @@ class DiscourseTester(object):
#L2 = ['a', 'b', 'c']
#print multiply(L1,L2)
+
def load_fol(s):
"""
Temporarily duplicated from ``nltk.sem.util``.
@@ -505,17 +508,18 @@ def load_fol(s):
statements = []
for linenum, line in enumerate(s.splitlines()):
line = line.strip()
- if line.startswith('#') or line=='': continue
+ if line.startswith('#') or line == '':
+ continue
try:
statements.append(Expression.fromstring(line))
except Exception:
raise ValueError('Unable to parse line %s: %s' % (linenum, line))
return statements
+
###############################
# Demo
###############################
-
def discourse_demo(reading_command=None):
"""
Illustrate the various methods of ``DiscourseTester``
@@ -524,7 +528,7 @@ def discourse_demo(reading_command=None):
reading_command)
dt.models()
print()
- #dt.grammar()
+ # dt.grammar()
print()
dt.sentences()
print()
@@ -555,12 +559,12 @@ def discourse_demo(reading_command=None):
dt.add_sentence('A person dances', informchk=True)
dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer',
'Vincent is married', 'Fido barks'],
- reading_command)
+ reading_command)
dt.readings(filter=True)
import nltk.data
background_file = os.path.join('grammars', 'book_grammars', 'background.fol')
background = nltk.data.load(background_file)
-
+
print()
dt.add_background(background, verbose=False)
dt.background()
@@ -590,19 +594,19 @@ def drt_discourse_demo(reading_command=None):
def spacer(num=30):
print('-' * num)
+
def demo():
discourse_demo()
- tagger = RegexpTagger(
- [('^(chases|runs)$', 'VB'),
- ('^(a)$', 'ex_quant'),
- ('^(every)$', 'univ_quant'),
- ('^(dog|boy)$', 'NN'),
- ('^(he)$', 'PRP')
- ])
+ tagger = RegexpTagger([('^(chases|runs)$', 'VB'),
+ ('^(a)$', 'ex_quant'),
+ ('^(every)$', 'univ_quant'),
+ ('^(dog|boy)$', 'NN'),
+ ('^(he)$', 'PRP')])
depparser = MaltParser(tagger=tagger)
drt_discourse_demo(DrtGlueReadingCommand(remove_duplicates=False,
depparser=depparser))
+
if __name__ == '__main__':
demo()
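load_fol above simply skips blank and '#' lines and hands the rest to Expression.fromstring. A small usage sketch against an in-memory string:

    from nltk.inference.discourse import load_fol

    BACKGROUND = """
    # toy background knowledge
    all x.(boxer(x) -> person(x))
    all x.(dog(x) -> -person(x))
    """

    for expr in load_fol(BACKGROUND):
        print(expr)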
diff --git a/nltk/inference/nonmonotonic.py b/nltk/inference/nonmonotonic.py
index 0659448..60c9cd8 100644
--- a/nltk/inference/nonmonotonic.py
+++ b/nltk/inference/nonmonotonic.py
@@ -2,7 +2,7 @@
#
# Author: Daniel H. Garrette <dhgarrette at gmail.com>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
diff --git a/nltk/inference/prover9.py b/nltk/inference/prover9.py
index 83d8ae3..cfeeb1e 100644
--- a/nltk/inference/prover9.py
+++ b/nltk/inference/prover9.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Interface to the Prover9 Theorem Prover
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Dan Garrette <dhgarrette at gmail.com>
# Ewan Klein <ewan at inf.ed.ac.uk>
#
diff --git a/nltk/inference/resolution.py b/nltk/inference/resolution.py
index 95d757e..eb38d73 100755
--- a/nltk/inference/resolution.py
+++ b/nltk/inference/resolution.py
@@ -2,7 +2,7 @@
#
# Author: Dan Garrette <dhgarrette at gmail.com>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
diff --git a/nltk/inference/tableau.py b/nltk/inference/tableau.py
index 996faa5..02e769c 100644
--- a/nltk/inference/tableau.py
+++ b/nltk/inference/tableau.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: First-Order Tableau Theorem Prover
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Dan Garrette <dhgarrette at gmail.com>
#
# URL: <http://nltk.org/>
diff --git a/nltk/internals.py b/nltk/internals.py
index 5f95cc8..84e28cf 100644
--- a/nltk/internals.py
+++ b/nltk/internals.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Internal utility functions
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# Nitin Madnani <nmadnani at ets.org>
@@ -25,6 +25,8 @@ try:
except ImportError:
from xml.etree import ElementTree
+from six import string_types
+
from nltk import __file__
from nltk import compat
@@ -35,7 +37,7 @@ from nltk import compat
_java_bin = None
_java_options = []
# [xx] add classpath option to config_java?
-def config_java(bin=None, options=None, verbose=True):
+def config_java(bin=None, options=None, verbose=False):
"""
Configure nltk's java interface, by letting nltk know where it can
find the Java binary, and what extra options (if any) should be
@@ -56,7 +58,7 @@ def config_java(bin=None, options=None, verbose=True):
_java_bin = find_binary('java', bin, env_vars=['JAVAHOME', 'JAVA_HOME'], verbose=verbose, binary_names=['java.exe'])
if options is not None:
- if isinstance(options, compat.string_types):
+ if isinstance(options, string_types):
options = options.split()
_java_options = list(options)
@@ -104,7 +106,7 @@ def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None,
if stdin == 'pipe': stdin = subprocess.PIPE
if stdout == 'pipe': stdout = subprocess.PIPE
if stderr == 'pipe': stderr = subprocess.PIPE
- if isinstance(cmd, compat.string_types):
+ if isinstance(cmd, string_types):
raise TypeError('cmd should be a list of strings')
# Make sure we know where a java binary is.
@@ -112,7 +114,7 @@ def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None,
config_java()
# Set up the classpath.
- if isinstance(classpath, compat.string_types):
+ if isinstance(classpath, string_types):
classpaths=[classpath]
else:
classpaths=list(classpath)
@@ -173,21 +175,21 @@ def read_str(s, start_position):
containing the value of the string literal and the position where
it ends. Otherwise, raise a ``ReadError``.
- :param s: A string that will be checked to see if within which a
+ :param s: A string that will be checked to see if a
Python string literal exists.
:type s: str
-
+
:param start_position: The specified beginning position of the string ``s``
to begin regex matching.
:type start_position: int
-
- :return: A tuple containing the matched string literal evaluated as a
+
+ :return: A tuple containing the matched string literal evaluated as a
string and the end position of the string literal.
:rtype: tuple(str, int)
:raise ReadError: If the ``_STRING_START_RE`` regex doesn't return a
- match in ``s`` at ``start_position``, i.e., open quote. If the
- ``_STRING_END_RE`` regex doesn't return a match in ``s`` at the
+ match in ``s`` at ``start_position``, i.e., open quote. If the
+ ``_STRING_END_RE`` regex doesn't return a match in ``s`` at the
end of the first match, i.e., close quote.
:raise ValueError: If an invalid string (i.e., contains an invalid
escape sequence) is passed into the ``eval``.
@@ -227,14 +229,14 @@ def read_int(s, start_position):
value of the integer and the position where it ends. Otherwise,
raise a ``ReadError``.
- :param s: A string that will be checked to see if within which a
+ :param s: A string that will be checked to see if a
Python integer exists.
:type s: str
-
+
:param start_position: The specified beginning position of the string ``s``
to begin regex matching.
:type start_position: int
-
+
:return: A tuple containing the matched integer casted to an int,
and the end position of the int in ``s``.
:rtype: tuple(int, int)
@@ -246,7 +248,7 @@ def read_int(s, start_position):
>>> from nltk.internals import read_int
>>> read_int('42 is the answer', 0)
(42, 2)
-
+
"""
m = _READ_INT_RE.match(s, start_position)
if not m: raise ReadError('integer', start_position)
@@ -260,14 +262,14 @@ def read_number(s, start_position):
containing the value of the number and the position where it ends.
Otherwise, raise a ``ReadError``.
- :param s: A string that will be checked to see if within which a
+ :param s: A string that will be checked to see if a
Python number exists.
:type s: str
-
+
:param start_position: The specified beginning position of the string ``s``
to begin regex matching.
:type start_position: int
-
+
:return: A tuple containing the matched number casted to a ``float``,
and the end position of the number in ``s``.
:rtype: tuple(float, int)
@@ -279,7 +281,7 @@ def read_number(s, start_position):
>>> from nltk.internals import read_number
>>> read_number('Pi is 3.14159', 6)
(3.14159, 13)
-
+
"""
m = _READ_NUMBER_VALUE.match(s, start_position)
if not m or not (m.group(1) or m.group(2)):
@@ -451,7 +453,7 @@ class Counter:
##########################################################################
def find_file_iter(filename, env_vars=(), searchpath=(),
- file_names=None, url=None, verbose=True, finding_dir=False):
+ file_names=None, url=None, verbose=False, finding_dir=False):
"""
Search for a file to be used by nltk.
@@ -463,10 +465,10 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
:param verbose: Whether or not to print path when a file is found.
"""
file_names = [filename] + (file_names or [])
- assert isinstance(filename, compat.string_types)
- assert not isinstance(file_names, compat.string_types)
- assert not isinstance(searchpath, compat.string_types)
- if isinstance(env_vars, compat.string_types):
+ assert isinstance(filename, string_types)
+ assert not isinstance(file_names, string_types)
+ assert not isinstance(searchpath, string_types)
+ if isinstance(env_vars, string_types):
env_vars = env_vars.split()
yielded = False
@@ -498,7 +500,7 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
if finding_dir: # This is to find a directory instead of a file
yielded = True
yield os.environ[env_var]
-
+
for env_dir in os.environ[env_var].split(os.pathsep):
# Check if the environment variable contains a direct path to the bin
if os.path.isfile(env_dir):
@@ -568,19 +570,19 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
def find_file(filename, env_vars=(), searchpath=(),
- file_names=None, url=None, verbose=True):
+ file_names=None, url=None, verbose=False):
return next(find_file_iter(filename, env_vars, searchpath,
file_names, url, verbose))
def find_dir(filename, env_vars=(), searchpath=(),
- file_names=None, url=None, verbose=True):
+ file_names=None, url=None, verbose=False):
return next(find_file_iter(filename, env_vars, searchpath,
file_names, url, verbose, finding_dir=True))
def find_binary_iter(name, path_to_bin=None, env_vars=(), searchpath=(),
- binary_names=None, url=None, verbose=True):
+ binary_names=None, url=None, verbose=False):
"""
Search for a file to be used by nltk.
@@ -597,12 +599,12 @@ def find_binary_iter(name, path_to_bin=None, env_vars=(), searchpath=(),
yield file
def find_binary(name, path_to_bin=None, env_vars=(), searchpath=(),
- binary_names=None, url=None, verbose=True):
+ binary_names=None, url=None, verbose=False):
return next(find_binary_iter(name, path_to_bin, env_vars, searchpath,
binary_names, url, verbose))
def find_jar_iter(name_pattern, path_to_jar=None, env_vars=(),
- searchpath=(), url=None, verbose=True, is_regex=False):
+ searchpath=(), url=None, verbose=False, is_regex=False):
"""
Search for a jar that is used by nltk.
@@ -615,9 +617,9 @@ def find_jar_iter(name_pattern, path_to_jar=None, env_vars=(),
:param is_regex: Whether name is a regular expression.
"""
- assert isinstance(name_pattern, compat.string_types)
- assert not isinstance(searchpath, compat.string_types)
- if isinstance(env_vars, compat.string_types):
+ assert isinstance(name_pattern, string_types)
+ assert not isinstance(searchpath, string_types)
+ if isinstance(env_vars, string_types):
env_vars = env_vars.split()
yielded = False
@@ -648,7 +650,7 @@ def find_jar_iter(name_pattern, path_to_jar=None, env_vars=(),
print('[Found %s: %s]' % (name_pattern, cp))
yielded = True
yield cp
- # The case where user put directory containing the jar file in the classpath
+ # The case where the user put a directory containing the jar file in the classpath
if os.path.isdir(cp):
if not is_regex:
if os.path.isfile(os.path.join(cp,name_pattern)):
@@ -657,14 +659,14 @@ def find_jar_iter(name_pattern, path_to_jar=None, env_vars=(),
yielded = True
yield os.path.join(cp,name_pattern)
else:
- # Look for file using regular expression
+ # Look for file using regular expression
for file_name in os.listdir(cp):
if re.match(name_pattern,file_name):
if verbose:
print('[Found %s: %s]' % (name_pattern, os.path.join(cp,file_name)))
yielded = True
yield os.path.join(cp,file_name)
-
+
else:
jar_env = os.environ[env_var]
jar_iter = ((os.path.join(jar_env, path_to_jar) for path_to_jar in os.listdir(jar_env))
@@ -714,21 +716,21 @@ def find_jar_iter(name_pattern, path_to_jar=None, env_vars=(),
raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
def find_jar(name_pattern, path_to_jar=None, env_vars=(),
- searchpath=(), url=None, verbose=True, is_regex=False):
+ searchpath=(), url=None, verbose=False, is_regex=False):
return next(find_jar_iter(name_pattern, path_to_jar, env_vars,
searchpath, url, verbose, is_regex))
-
+
def find_jars_within_path(path_to_jars):
- return [os.path.join(root, filename)
- for root, dirnames, filenames in os.walk(path_to_jars)
+ return [os.path.join(root, filename)
+ for root, dirnames, filenames in os.walk(path_to_jars)
for filename in fnmatch.filter(filenames, '*.jar')]
def _decode_stdoutdata(stdoutdata):
""" Convert data read from stdout/stderr to unicode """
if not isinstance(stdoutdata, bytes):
return stdoutdata
-
+
encoding = getattr(sys.__stdout__, "encoding", locale.getpreferredencoding())
if encoding is None:
return stdoutdata.decode()
@@ -795,7 +797,7 @@ class ElementWrapper(object):
<Element "<?xml version='1.0' encoding='utf8'?>\n<test />">
"""
- if isinstance(etree, compat.string_types):
+ if isinstance(etree, string_types):
etree = ElementTree.fromstring(etree)
self.__dict__['_etree'] = etree
@@ -978,5 +980,3 @@ def is_writable(path):
def raise_unorderable_types(ordering, a, b):
raise TypeError("unorderable types: %s() %s %s()" % (type(a).__name__, ordering, type(b).__name__))
-
-
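The finder helpers above now default to verbose=False, so locating external tools is quiet unless explicitly requested. For example (assuming a Java binary is actually installed):

    from nltk.internals import config_java, find_binary

    # Pass verbose=True to get back the '[Found ...]' messages that
    # used to be printed unconditionally.
    java_path = find_binary('java', env_vars=['JAVAHOME', 'JAVA_HOME'],
                            verbose=True)
    config_java(bin=java_path)     # quiet by default after this change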
diff --git a/nltk/jsontags.py b/nltk/jsontags.py
index 28c7ec0..3f74b12 100644
--- a/nltk/jsontags.py
+++ b/nltk/jsontags.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: JSON Encoder/Decoder Helpers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Xu <xxu at student.unimelb.edu.au>
#
# URL: <http://nltk.org/>
diff --git a/nltk/metrics/__init__.py b/nltk/metrics/__init__.py
index e835385..2205cc6 100644
--- a/nltk/metrics/__init__.py
+++ b/nltk/metrics/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Metrics
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
@@ -27,3 +27,4 @@ from nltk.metrics.association import (NgramAssocMeasures, BigramAssocMeasure
TrigramAssocMeasures, ContingencyMeasures)
from nltk.metrics.spearman import (spearman_correlation, ranks_from_sequence,
ranks_from_scores)
+from nltk.metrics.aline import align
diff --git a/nltk/metrics/agreement.py b/nltk/metrics/agreement.py
index 78b6958..887ad76 100644
--- a/nltk/metrics/agreement.py
+++ b/nltk/metrics/agreement.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Agreement Metrics
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tom Lippincott <tom at cs.columbia.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -74,9 +74,11 @@ import logging
from itertools import groupby
from operator import itemgetter
+from six import iteritems
+
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.internals import deprecated
-from nltk.compat import python_2_unicode_compatible, iteritems
+from nltk.compat import python_2_unicode_compatible
from nltk.metrics.distance import binary_distance
@@ -97,8 +99,15 @@ class AnnotationTask(object):
"""
def __init__(self, data=None, distance=binary_distance):
- """Initialize an empty annotation task.
+ """Initialize an annotation task.
+
+ The data argument can be None (to create an empty annotation task) or a sequence of 3-tuples,
+ each representing a coder's labeling of an item:
+ (coder,item,label)
+ The distance argument is a function taking two arguments (labels) and producing a numerical distance.
+ The distance from a label to itself should be zero:
+ distance(l,l) = 0
"""
self.distance = distance
self.I = set()
@@ -114,9 +123,9 @@ class AnnotationTask(object):
",".join(x['labels'])), self.data))
def load_array(self, array):
- """Load the results of annotation.
+ """Load an sequence of annotation results, appending to any data already loaded.
- The argument is a list of 3-tuples, each representing a coder's labeling of an item:
+ The argument is a sequence of 3-tuples, each representing a coder's labeling of an item:
(coder,item,label)
"""
for coder, item, labels in array:
@@ -304,6 +313,15 @@ class AnnotationTask(object):
"""Krippendorff 1980
"""
+ # check for degenerate cases
+ if len(self.K)==0:
+ raise ValueError("Cannot calculate alpha, no data present!")
+ if len(self.K) == 1:
+ log.debug("Only one annotation value, allpha returning 1.")
+ return 1
+ if len(self.C)==1 and len(self.I) == 1:
+ raise ValueError("Cannot calculate alpha, only one coder and item present!")
+
De = 0.0
label_freqs = FreqDist(x['labels'] for x in self.data)
@@ -311,9 +329,12 @@ class AnnotationTask(object):
nj = label_freqs[j]
for l in self.K:
De += float(nj * label_freqs[l]) * self.distance(j, l)
- De = (1.0 / (len(self.I) * len(self.C) * (len(self.I) * len(self.C) - 1))) * De
- log.debug("Expected disagreement: %f", De)
- ret = 1.0 - (self.Do_alpha() / De)
+ try:
+ De = (1.0 / (len(self.I) * len(self.C) * (len(self.I) * len(self.C) - 1))) * De
+ log.debug("Expected disagreement: %f", De)
+ ret = 1.0 - (self.Do_alpha() / De)
+ except ZeroDivisionError:
+ raise ValueError("Cannot calculate alpha, expected disagreement zero, check the distance function!")
return ret
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
@@ -398,4 +419,3 @@ if __name__ == '__main__':
print(getattr(task, options.agreement)())
logging.shutdown()
-
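The expanded docstrings above spell out the (coder, item, label) triples that AnnotationTask expects, and alpha() now rejects the degenerate cases explicitly. A minimal sketch with two coders and made-up labels:

    from nltk.metrics.agreement import AnnotationTask

    data = [
        ('coder_a', 'item1', 'pos'),
        ('coder_b', 'item1', 'pos'),
        ('coder_a', 'item2', 'neg'),
        ('coder_b', 'item2', 'pos'),
    ]
    task = AnnotationTask(data=data)    # binary_distance is the default
    print(task.alpha())                 # Krippendorff's alpha for this toy data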
diff --git a/nltk/metrics/aline.py b/nltk/metrics/aline.py
new file mode 100644
index 0000000..aa1da0d
--- /dev/null
+++ b/nltk/metrics/aline.py
@@ -0,0 +1,607 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: ALINE
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Author: Greg Kondrak <gkondrak at ualberta.ca>
+# Geoff Bacon <bacon at berkeley.edu> (Python port)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+ALINE
+http://webdocs.cs.ualberta.ca/~kondrak/
+Copyright 2002 by Grzegorz Kondrak.
+
+ALINE is an algorithm for aligning phonetic sequences, described in [1].
+This module is a port of Kondrak's (2002) ALINE. It provides functions for
+phonetic sequence alignment and similarity analysis. These are useful in
+historical linguistics, sociolinguistics and synchronic phonology.
+
+ALINE has parameters that can be tuned for desired output. These parameters are:
+- C_skip, C_sub, C_exp, C_vwl
+- Salience weights
+- Segmental features
+
+In this implementation, some parameters have been changed from their default
+values as described in [1], in order to replicate published results. All changes
+are noted in comments.
+
+Example usage
+-------------
+
+# Get optimal alignment of two phonetic sequences
+
+>>> align('θin', 'tenwis') # doctest: +SKIP
+[[('θ', 't'), ('i', 'e'), ('n', 'n'), ('-', 'w'), ('-', 'i'), ('-', 's')]]
+
+[1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation,
+University of Toronto.
+"""
+
+from __future__ import unicode_literals
+
+try:
+ import numpy as np
+except ImportError:
+ np = None
+
+# === Constants ===
+
+inf = float('inf')
+
+# Default values for maximum similarity scores (Kondrak 2002: 54)
+C_skip = 10 # Indels
+C_sub = 35 # Substitutions
+C_exp = 45 # Expansions/compressions
+C_vwl = 5 # Vowel/consonant relative weight (decreased from 10)
+
+consonants = ['B', 'N', 'R', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
+ 'n', 'p', 'q', 'r', 's', 't', 'v', 'x', 'z', 'ç', 'ð', 'ħ',
+ 'ŋ', 'ɖ', 'ɟ', 'ɢ', 'ɣ', 'ɦ', 'ɬ', 'ɮ', 'ɰ', 'ɱ', 'ɲ', 'ɳ', 'ɴ',
+ 'ɸ', 'ɹ', 'ɻ', 'ɽ', 'ɾ', 'ʀ', 'ʁ', 'ʂ', 'ʃ', 'ʈ', 'ʋ', 'ʐ ', 'ʒ',
+ 'ʔ', 'ʕ', 'ʙ', 'ʝ', 'β', 'θ', 'χ', 'ʐ', 'w']
+
+# Relevant features for comparing consonants and vowels
+R_c = ['aspirated', 'lateral', 'manner', 'nasal', 'place', 'retroflex',
+ 'syllabic', 'voice']
+# 'high' taken out of R_v because same as manner
+R_v = ['back', 'lateral', 'long', 'manner', 'nasal', 'place',
+ 'retroflex', 'round', 'syllabic', 'voice']
+
+# Flattened feature matrix (Kondrak 2002: 56)
+similarity_matrix = {
+ #place
+ 'bilabial': 1.0, 'labiodental': 0.95, 'dental': 0.9,
+ 'alveolar': 0.85, 'retroflex': 0.8, 'palato-alveolar': 0.75,
+ 'palatal': 0.7, 'velar': 0.6, 'uvular': 0.5, 'pharyngeal': 0.3,
+ 'glottal': 0.1, 'labiovelar': 1.0, 'vowel': -1.0, # added 'vowel'
+ #manner
+ 'stop': 1.0, 'affricate': 0.9, 'fricative': 0.85, # increased fricative from 0.8
+ 'trill': 0.7, 'tap': 0.65, 'approximant': 0.6, 'high vowel': 0.4,
+ 'mid vowel': 0.2, 'low vowel': 0.0, 'vowel2': 0.5, # added vowel
+ #high
+ 'high': 1.0, 'mid': 0.5, 'low': 0.0,
+ #back
+ 'front': 1.0, 'central': 0.5, 'back': 0.0,
+ #binary features
+ 'plus': 1.0, 'minus': 0.0
+}
+
+# Relative weights of phonetic features (Kondrak 2002: 55)
+salience = {
+ 'syllabic': 5,
+ 'place': 40,
+ 'manner': 50,
+ 'voice': 5, # decreased from 10
+ 'nasal': 20, # increased from 10
+ 'retroflex': 10,
+ 'lateral': 10,
+ 'aspirated': 5,
+ 'long': 0, # decreased from 1
+ 'high': 3, # decreased from 5
+ 'back': 2, # decreased from 5
+ 'round': 2 # decreased from 5
+}
+
+# (Kondrak 2002: 59-60)
+feature_matrix = {
+# Consonants
+'p': {'place': 'bilabial', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'b': {'place': 'bilabial', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'t': {'place': 'alveolar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'d': {'place': 'alveolar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʈ': {'place': 'retroflex', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɖ': {'place': 'retroflex', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'c': {'place': 'palatal', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɟ': {'place': 'palatal', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'k': {'place': 'velar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'g': {'place': 'velar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'q': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɢ': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʔ': {'place': 'glottal', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'m': {'place': 'bilabial', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɱ': {'place': 'labiodental', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'n': {'place': 'alveolar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɳ': {'place': 'retroflex', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɲ': {'place': 'palatal', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ŋ': {'place': 'velar', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɴ': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'N': {'place': 'uvular', 'manner': 'stop', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'plus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʙ': {'place': 'bilabial', 'manner': 'trill', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'B': {'place': 'bilabial', 'manner': 'trill', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'r': {'place': 'alveolar', 'manner': 'trill', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʀ': {'place': 'uvular', 'manner': 'trill', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'R': {'place': 'uvular', 'manner': 'trill', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɾ': {'place': 'alveolar', 'manner': 'tap', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɽ': {'place': 'retroflex', 'manner': 'tap', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɸ': {'place': 'bilabial', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'β': {'place': 'bilabial', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'f': {'place': 'labiodental', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'v': {'place': 'labiodental', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'θ': {'place': 'dental', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ð': {'place': 'dental', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'s': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'z': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʃ': {'place': 'palato-alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʒ': {'place': 'palato-alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʂ': {'place': 'retroflex', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʐ': {'place': 'retroflex', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ç': {'place': 'palatal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʝ': {'place': 'palatal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'x': {'place': 'velar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɣ': {'place': 'velar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'χ': {'place': 'uvular', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʁ': {'place': 'uvular', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ħ': {'place': 'pharyngeal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ʕ': {'place': 'pharyngeal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'h': {'place': 'glottal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɦ': {'place': 'glottal', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɬ': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'minus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'plus', 'aspirated': 'minus'},
+
+'ɮ': {'place': 'alveolar', 'manner': 'fricative', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'plus', 'aspirated': 'minus'},
+
+'ʋ': {'place': 'labiodental', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɹ': {'place': 'alveolar', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɻ': {'place': 'retroflex', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'plus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'j': {'place': 'palatal', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'ɰ': {'place': 'velar', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+'l': {'place': 'alveolar', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'plus', 'aspirated': 'minus'},
+
+'w': {'place': 'labiovelar', 'manner': 'approximant', 'syllabic': 'minus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'aspirated': 'minus'},
+
+# Vowels
+
+'i': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'front','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'y': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'front','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'e': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'front','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'E': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'front','round': 'minus', 'long': 'plus', 'aspirated': 'minus'},
+
+'ø': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'front','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'ɛ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'front','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'œ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'front','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'æ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'low',
+'back': 'front','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'a': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'low',
+'back': 'front','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'A': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'low',
+'back': 'front','round': 'minus', 'long': 'plus', 'aspirated': 'minus'},
+
+'ɨ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'central','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'ʉ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'central','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'ə': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'central','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'u': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'back','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'U': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'back','round': 'plus', 'long': 'plus', 'aspirated': 'minus'},
+
+'o': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'back','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'O': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'back','round': 'plus', 'long': 'plus', 'aspirated': 'minus'},
+
+'ɔ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'mid',
+'back': 'back','round': 'plus', 'long': 'minus', 'aspirated': 'minus'},
+
+'ɒ': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'low',
+'back': 'back','round': 'minus', 'long': 'minus', 'aspirated': 'minus'},
+
+'I': {'place': 'vowel', 'manner': 'vowel2', 'syllabic': 'plus', 'voice': 'plus',
+'nasal': 'minus', 'retroflex': 'minus', 'lateral': 'minus', 'high': 'high',
+'back': 'front','round': 'minus', 'long': 'plus', 'aspirated': 'minus'},
+
+}
+
+# === Algorithm ===
+
+def align(str1, str2, epsilon=0):
+ """
+ Compute the alignment of two phonetic strings.
+
+ :type str1, str2: str
+ :param str1, str2: Two strings to be aligned
+ :type epsilon: float (0.0 to 1.0)
+ :param epsilon: Adjusts threshold similarity score for near-optimal alignments
+
+ :rtype: list(list(tuple(str, str)))
+ :return: Alignment(s) of str1 and str2
+
+ (Kondrak 2002: 51)
+ """
+ if np is None:
+ raise ImportError('You need numpy in order to use the align function')
+
+ assert 0.0 <= epsilon <= 1.0, "Epsilon must be between 0.0 and 1.0."
+ m = len(str1)
+ n = len(str2)
+ # This includes Kondrak's initialization of row 0 and column 0 to all 0s.
+ S = np.zeros((m+1, n+1), dtype=float)
+
+ # If i <= 1 or j <= 1, don't allow expansions: they don't make sense there
+ # and would break array and string indices. Make sure they never get chosen
+ # by setting them to -inf.
+ for i in range(1, m+1):
+ for j in range(1, n+1):
+ edit1 = S[i-1, j] + sigma_skip(str1[i-1])
+ edit2 = S[i, j-1] + sigma_skip(str2[j-1])
+ edit3 = S[i-1, j-1] + sigma_sub(str1[i-1], str2[j-1])
+ if i > 1:
+ edit4 = S[i-2, j-1] + sigma_exp(str2[j-1], str1[i-2:i])
+ else:
+ edit4 = -inf
+ if j > 1:
+ edit5 = S[i-1, j-2] + sigma_exp(str1[i-1], str2[j-2:j])
+ else:
+ edit5 = -inf
+ S[i, j] = max(edit1, edit2, edit3, edit4, edit5, 0)
+
+ T = (1-epsilon)*np.amax(S) # Threshold score for near-optimal alignments
+
+ alignments = []
+ for i in range(1, m+1):
+ for j in range(1, n+1):
+ if S[i,j] >= T:
+ alignments.append(_retrieve(i, j, 0, S, T, str1, str2, []))
+ return alignments
+
+def _retrieve(i, j, s, S, T, str1, str2, out):
+ """
+ Retrieve the path through the similarity matrix S starting at (i, j).
+
+ :rtype: list(tuple(str, str))
+ :return: Alignment of str1 and str2
+ """
+ if S[i, j] == 0:
+ return out
+ else:
+ if j > 1 and S[i-1, j-2] + sigma_exp(str1[i-1], str2[j-2:j]) + s >= T:
+ out.insert(0, (str1[i-1], str2[j-2:j]))
+ _retrieve(i-1, j-2, s+sigma_exp(str1[i-1], str2[j-2:j]), S, T, str1, str2, out)
+ elif i > 1 and S[i-2, j-1] + sigma_exp(str2[j-1], str1[i-2:i]) + s >= T:
+ out.insert(0, (str1[i-2:i], str2[j-1]))
+ _retrieve(i-2, j-1, s+sigma_exp(str2[j-1], str1[i-2:i]), S, T, str1, str2, out)
+ elif S[i, j-1] + sigma_skip(str2[j-1]) + s >= T:
+ out.insert(0, ('-', str2[j-1]))
+ _retrieve(i, j-1, s+sigma_skip(str2[j-1]), S, T, str1, str2, out)
+ elif S[i-1, j] + sigma_skip(str1[i-1]) + s >= T:
+ out.insert(0, (str1[i-1], '-'))
+ _retrieve(i-1, j, s+sigma_skip(str1[i-1]), S, T, str1, str2, out)
+ elif S[i-1, j-1] + sigma_sub(str1[i-1], str2[j-1]) + s >= T:
+ out.insert(0, (str1[i-1], str2[j-1]))
+ _retrieve(i-1, j-1, s+sigma_sub(str1[i-1], str2[j-1]), S, T, str1, str2, out)
+ return out
+
+def sigma_skip(p):
+ """
+ Returns score of an indel of P.
+
+ (Kondrak 2002: 54)
+ """
+ return C_skip
+
+def sigma_sub(p, q):
+ """
+ Returns score of a substitution of P with Q.
+
+ (Kondrak 2002: 54)
+ """
+ return C_sub - delta(p, q) - V(p) - V(q)
+
+def sigma_exp(p, q):
+ """
+ Returns score of an expansion/compression.
+
+ (Kondrak 2002: 54)
+ """
+ q1 = q[0]
+ q2 = q[1]
+ return C_exp - delta(p, q1) - delta(p, q2) - V(p) - max(V(q1), V(q2))
+
+def delta(p, q):
+ """
+ Return the weighted sum of feature differences between P and Q.
+
+ (Kondrak 2002: 54)
+ """
+ features = R(p, q)
+ total = 0
+ for f in features:
+ total += diff(p, q, f) * salience[f]
+ return total
+
+def diff(p, q, f):
+ """
+ Returns difference between phonetic segments P and Q for feature F.
+
+ (Kondrak 2002: 52, 54)
+ """
+ p_features, q_features = feature_matrix[p], feature_matrix[q]
+ return abs(similarity_matrix[p_features[f]] - similarity_matrix[q_features[f]])
+
+def R(p, q):
+ """
+ Return relevant features for segment comparison.
+
+ (Kondrak 2002: 54)
+ """
+ if p in consonants or q in consonants:
+ return R_c
+ return R_v
+
+def V(p):
+ """
+ Return vowel weight if P is vowel.
+
+ (Kondrak 2002: 54)
+ """
+ if p in consonants:
+ return 0
+ return C_vwl
+
+# === Test ===
+
+def demo():
+ """
+ A demonstration of the result of aligning phonetic sequences
+ used in Kondrak's (2002) dissertation.
+ """
+ data = [pair.split(',') for pair in cognate_data.split('\n')]
+ for pair in data:
+ alignment = align(pair[0], pair[1])[0]
+ alignment = ['({}, {})'.format(a[0], a[1]) for a in alignment]
+ alignment = ' '.join(alignment)
+ print('{} ~ {} : {}'.format(pair[0], pair[1], alignment))
+
+cognate_data = """jo,ʒə
+tu,ty
+nosotros,nu
+kjen,ki
+ke,kwa
+todos,tu
+una,ən
+dos,dø
+tres,trwa
+ombre,om
+arbol,arbrə
+pluma,plym
+kabeθa,kap
+boka,buʃ
+pje,pje
+koraθon,kœr
+ber,vwar
+benir,vənir
+deθir,dir
+pobre,povrə
+ðis,dIzes
+ðæt,das
+wat,vas
+nat,nixt
+loŋ,laŋ
+mæn,man
+fleʃ,flajʃ
+bləd,blyt
+feðər,fEdər
+hær,hAr
+ir,Or
+aj,awgə
+nowz,nAzə
+mawθ,munt
+təŋ,tsuŋə
+fut,fys
+nij,knI
+hænd,hant
+hart,herts
+livər,lEbər
+ænd,ante
+æt,ad
+blow,flAre
+ir,awris
+ijt,edere
+fiʃ,piʃkis
+flow,fluere
+staɾ,stella
+ful,plenus
+græs,gramen
+hart,kordis
+horn,korny
+aj,ego
+nij,genU
+məðər,mAter
+mawntən,mons
+nejm,nomen
+njuw,nowus
+wən,unus
+rawnd,rotundus
+sow,suere
+sit,sedere
+θrij,tres
+tuwθ,dentis
+θin,tenwis
+kinwawa,kenuaʔ
+nina,nenah
+napewa,napɛw
+wapimini,wapemen
+namesa,namɛʔs
+okimawa,okemaw
+ʃiʃipa,seʔsep
+ahkohkwa,ahkɛh
+pematesiweni,pematesewen
+asenja,aʔsɛn"""
+
+if __name__ == '__main__':
+ demo()
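
A quick way to exercise the aligner defined above without running the full demo(): the sketch below assumes the module is importable as nltk.metrics.aline (the import path is an assumption, it is not shown in this hunk) and that numpy is installed, since align() raises ImportError without it.

    # Minimal usage sketch for align(); kabeθa/kap is one of the cognate
    # pairs from the demo data above.
    from nltk.metrics import aline

    alignments = aline.align('kabeθa', 'kap')
    print(alignments[0])   # best-scoring alignment as (segment, segment) pairs

    # A nonzero epsilon lowers the acceptance threshold T = (1 - epsilon) * max(S),
    # so near-optimal alignments are returned alongside the optimal one.
    print(len(aline.align('kabeθa', 'kap', epsilon=0.2)))
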
diff --git a/nltk/metrics/association.py b/nltk/metrics/association.py
index b5677e7..3e012f6 100644
--- a/nltk/metrics/association.py
+++ b/nltk/metrics/association.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Ngram Association Measures
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Joel Nothman <jnothman at student.usyd.edu.au>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
@@ -12,6 +12,8 @@ generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
"""
from __future__ import division
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
import math as _math
from functools import reduce
_log2 = lambda x: _math.log(x, 2.0)
@@ -39,6 +41,7 @@ TOTAL = -1
"""Marginals index for the number of words in the data"""
+ at add_metaclass(ABCMeta)
class NgramAssocMeasures(object):
"""
An abstract class defining a collection of generic association measures.
@@ -61,16 +64,18 @@ class NgramAssocMeasures(object):
_n = 0
@staticmethod
+ @abstractmethod
def _contingency(*marginals):
"""Calculates values of a contingency table from marginal values."""
raise NotImplementedError("The contingency table is not available"
- "in the general ngram case")
+ "in the general ngram case")
@staticmethod
+ @abstractmethod
def _marginals(*contingency):
"""Calculates values of contingency table marginals from its values."""
raise NotImplementedError("The contingency table is not available"
- "in the general ngram case")
+ "in the general ngram case")
@classmethod
def _expected_values(cls, cont):
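
The decorators added above make NgramAssocMeasures an abstract base class on both Python 2 and 3. A minimal sketch of the same six/ABCMeta pattern (illustrative class names only, not NLTK code) shows the intended effect under Python 3: the base class cannot be instantiated until the abstract static methods are overridden.

    from abc import ABCMeta, abstractmethod
    from six import add_metaclass

    @add_metaclass(ABCMeta)          # portable metaclass declaration
    class Measures(object):
        @staticmethod
        @abstractmethod
        def _contingency(*marginals):
            """Subclasses must build a contingency table from marginals."""

    class PairMeasures(Measures):
        @staticmethod
        def _contingency(n_ii, n_io, n_oi, n_oo):
            # Toy 2x2 layout; real n-gram measure classes define their own.
            return (n_ii, n_io, n_oi, n_oo)

    try:
        Measures()                   # Python 3: TypeError, abstract method missing
    except TypeError as err:
        print(err)

    print(PairMeasures._contingency(1, 2, 3, 4))   # concrete subclass works
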
@@ -407,4 +412,3 @@ class ContingencyMeasures(object):
res.__doc__ = old_fn.__doc__
res.__name__ = old_fn.__name__
return res
-
diff --git a/nltk/metrics/confusionmatrix.py b/nltk/metrics/confusionmatrix.py
index b00118e..611d82a 100644
--- a/nltk/metrics/confusionmatrix.py
+++ b/nltk/metrics/confusionmatrix.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Confusion Matrices
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/metrics/distance.py b/nltk/metrics/distance.py
index bd72a3d..e8957bf 100644
--- a/nltk/metrics/distance.py
+++ b/nltk/metrics/distance.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Distance Metrics
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# Tom Lippincott <tom at cs.columbia.edu>
@@ -34,7 +34,7 @@ def _edit_dist_init(len1, len2):
return lev
-def _edit_dist_step(lev, i, j, s1, s2, transpositions=False):
+def _edit_dist_step(lev, i, j, s1, s2, substitution_cost=1, transpositions=False):
c1 = s1[i - 1]
c2 = s2[j - 1]
@@ -43,7 +43,7 @@ def _edit_dist_step(lev, i, j, s1, s2, transpositions=False):
# skipping a character in s2
b = lev[i][j - 1] + 1
# substitution
- c = lev[i - 1][j - 1] + (c1 != c2)
+ c = lev[i - 1][j - 1] + (substitution_cost if c1 != c2 else 0)
# transposition
d = c + 1 # never picked by default
@@ -55,7 +55,7 @@ def _edit_dist_step(lev, i, j, s1, s2, transpositions=False):
lev[i][j] = min(a, b, c, d)
-def edit_distance(s1, s2, transpositions=False):
+def edit_distance(s1, s2, substitution_cost=1, transpositions=False):
"""
Calculate the Levenshtein edit-distance between two strings.
The edit distance is the number of characters that need to be
@@ -65,6 +65,9 @@ def edit_distance(s1, s2, transpositions=False):
"rain" -> "sain" -> "shin" -> "shine". These operations could have
been done in other orders, but at least three steps are needed.
+ Allows specifying the cost of substitution edits (e.g., "a" -> "b"),
+ because sometimes it makes sense to assign greater penalties to substitutions.
+
This also optionally allows transposition edits (e.g., "ab" -> "ba"),
though this is disabled by default.
@@ -72,6 +75,7 @@ def edit_distance(s1, s2, transpositions=False):
:param transpositions: Whether to allow transposition edits
:type s1: str
:type s2: str
+ :type substitution_cost: int
:type transpositions: bool
:rtype int
"""
@@ -83,7 +87,8 @@ def edit_distance(s1, s2, transpositions=False):
# iterate over the array
for i in range(len1):
for j in range(len2):
- _edit_dist_step(lev, i + 1, j + 1, s1, s2, transpositions=transpositions)
+ _edit_dist_step(lev, i + 1, j + 1, s1, s2,
+ substitution_cost=substitution_cost, transpositions=transpositions)
return lev[len1][len2]
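
As an illustration of the new substitution_cost parameter (a sketch, not part of the patch): with the default cost a substitution counts as a single edit, while a higher cost makes the insert-plus-delete path competitive.

    from nltk.metrics.distance import edit_distance

    # Default: one substitution, distance 1 ("rain" -> "sain").
    print(edit_distance('rain', 'sain'))                       # 1

    # With substitution_cost=2 a substitution costs as much as a deletion
    # followed by an insertion, so the distance becomes 2.
    print(edit_distance('rain', 'sain', substitution_cost=2))  # 2

    # Transpositions remain opt-in, as before.
    print(edit_distance('ab', 'ba', transpositions=True))      # 1
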
@@ -116,7 +121,7 @@ def masi_distance(label1, label2):
>>> from nltk.metrics import masi_distance
>>> masi_distance(set([1, 2]), set([1, 2, 3, 4]))
- 0.665...
+ 0.335
Passonneau 2006, Measuring Agreement on Set-Valued Items (MASI)
for Semantic and Pragmatic Annotation.
@@ -135,7 +140,7 @@ def masi_distance(label1, label2):
else:
m = 0
- return 1 - (len_intersection / len_union) * m
+ return (1 - (len_intersection / float(len_union))) * m
def interval_distance(label1,label2):
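
The corrected doctest value can be checked by hand from the code above: the intersection of {1, 2} and {1, 2, 3, 4} has size 2, the union size 4, and because one set is a proper subset of the other the weight m is 0.67, giving (1 - 2/4) * 0.67 = 0.335. A one-line check (illustrative only):

    from nltk.metrics.distance import masi_distance

    # (1 - |A & B| / |A | B|) * m, with m = 0.67 when one set strictly contains the other.
    print(masi_distance(set([1, 2]), set([1, 2, 3, 4])))   # 0.335
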
diff --git a/nltk/metrics/paice.py b/nltk/metrics/paice.py
index 727a204..d23e4b5 100644
--- a/nltk/metrics/paice.py
+++ b/nltk/metrics/paice.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Agreement Metrics
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Lauri Hallila <laurihallila at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/metrics/scores.py b/nltk/metrics/scores.py
index 9113fbc..ad78cc8 100644
--- a/nltk/metrics/scores.py
+++ b/nltk/metrics/scores.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Evaluation
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
@@ -12,12 +12,13 @@ import operator
from random import shuffle
from functools import reduce
+from six.moves import range, zip
+
try:
from scipy.stats.stats import betai
except ImportError:
betai = None
-from nltk.compat import xrange, izip
from nltk.util import LazyConcatenation, LazyMap
def accuracy(reference, test):
@@ -37,7 +38,7 @@ def accuracy(reference, test):
"""
if len(reference) != len(test):
raise ValueError("Lists must have the same length.")
- return sum(x == y for x, y in izip(reference, test)) / len(test)
+ return sum(x == y for x, y in zip(reference, test)) / len(test)
def precision(reference, test):
"""
@@ -132,7 +133,7 @@ def log_likelihood(reference, test):
# Return the average value of dist.logprob(val).
total_likelihood = sum(dist.logprob(val)
- for (val, dist) in izip(reference, test))
+ for (val, dist) in zip(reference, test))
return total_likelihood / len(reference)
def approxrand(a, b, **kwargs):
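
The recurring import change in this and the following hunks swaps the old nltk.compat shims for six.moves, which exposes the lazy Python 3 built-ins under both interpreters. A tiny sketch of what that buys (not part of the patch):

    from six.moves import range, zip

    # Both names are iterators on Python 2 and 3 alike, so no xrange/izip is needed.
    pairs = zip(range(3), (x * x for x in range(3)))
    print(list(pairs))   # [(0, 0), (1, 1), (2, 4)]
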
@@ -159,7 +160,7 @@ def approxrand(a, b, **kwargs):
shuffles = kwargs.get('shuffles', 999)
# there's no point in trying to shuffle beyond all possible permutations
shuffles = \
- min(shuffles, reduce(operator.mul, xrange(1, len(a) + len(b) + 1)))
+ min(shuffles, reduce(operator.mul, range(1, len(a) + len(b) + 1)))
stat = kwargs.get('statistic', lambda lst: sum(lst) / len(lst))
verbose = kwargs.get('verbose', False)
@@ -176,7 +177,7 @@ def approxrand(a, b, **kwargs):
lst = LazyConcatenation([a, b])
indices = list(range(len(a) + len(b)))
- for i in xrange(shuffles):
+ for i in range(shuffles):
if verbose and i % 10 == 0:
print('shuffle: %d' % i)
@@ -225,4 +226,3 @@ def demo():
if __name__ == '__main__':
demo()
-
diff --git a/nltk/metrics/segmentation.py b/nltk/metrics/segmentation.py
index a0e6298..668f68e 100644
--- a/nltk/metrics/segmentation.py
+++ b/nltk/metrics/segmentation.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Text Segmentation Metrics
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# David Doukhan <david.doukhan at gmail.com>
@@ -46,7 +46,8 @@ try:
except ImportError:
pass
-from nltk.compat import xrange
+from six.moves import range
+
def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
"""
@@ -213,7 +214,7 @@ def pk(ref, hyp, k=None, boundary='1'):
k = int(round(len(ref) / (ref.count(boundary) * 2.)))
err = 0
- for i in xrange(len(ref)-k +1):
+ for i in range(len(ref)-k +1):
r = ref[i:i+k].count(boundary) > 0
h = hyp[i:i+k].count(boundary) > 0
if r != h:
@@ -228,5 +229,3 @@ def setup_module(module):
import numpy
except ImportError:
raise SkipTest("numpy is required for nltk.metrics.segmentation")
-
-
diff --git a/nltk/metrics/spearman.py b/nltk/metrics/spearman.py
index 7caf055..07b158e 100644
--- a/nltk/metrics/spearman.py
+++ b/nltk/metrics/spearman.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Spearman Rank Correlation
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Joel Nothman <jnothman at student.usyd.edu.au>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
diff --git a/nltk/misc/__init__.py b/nltk/misc/__init__.py
index b382bc1..e03dc4e 100644
--- a/nltk/misc/__init__.py
+++ b/nltk/misc/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Miscellaneous modules
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/misc/chomsky.py b/nltk/misc/chomsky.py
index 68b4dbf..9cfb5c2 100644
--- a/nltk/misc/chomsky.py
+++ b/nltk/misc/chomsky.py
@@ -118,7 +118,9 @@ scope of a complex symbol.
import textwrap, random
from itertools import chain, islice
-from nltk.compat import izip
+
+from six.moves import zip
+
def generate_chomsky(times=5, line_length=72):
parts = []
@@ -126,7 +128,7 @@ def generate_chomsky(times=5, line_length=72):
phraselist = list(map(str.strip, part.splitlines()))
random.shuffle(phraselist)
parts.append(phraselist)
- output = chain(*islice(izip(*parts), 0, times))
+ output = chain(*islice(zip(*parts), 0, times))
print(textwrap.fill(" ".join(output), line_length))
if __name__ == '__main__':
diff --git a/nltk/misc/minimalset.py b/nltk/misc/minimalset.py
index 4704589..1299f19 100644
--- a/nltk/misc/minimalset.py
+++ b/nltk/misc/minimalset.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Minimal Sets
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
diff --git a/nltk/misc/sort.py b/nltk/misc/sort.py
index 6c993fe..8e1dd38 100644
--- a/nltk/misc/sort.py
+++ b/nltk/misc/sort.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: List Sorting
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/misc/wordfinder.py b/nltk/misc/wordfinder.py
index 7c28e5c..c2d5449 100644
--- a/nltk/misc/wordfinder.py
+++ b/nltk/misc/wordfinder.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Word Finder
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/parse/__init__.py b/nltk/parse/__init__.py
index ef230e6..66441de 100644
--- a/nltk/parse/__init__.py
+++ b/nltk/parse/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Parsers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
@@ -78,3 +78,4 @@ from nltk.parse.malt import MaltParser
from nltk.parse.evaluate import DependencyEvaluator
from nltk.parse.transitionparser import TransitionParser
from nltk.parse.bllip import BllipParser
+from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser
diff --git a/nltk/parse/api.py b/nltk/parse/api.py
index 6dc0205..6ddd9aa 100644
--- a/nltk/parse/api.py
+++ b/nltk/parse/api.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Parser API
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/parse/bllip.py b/nltk/parse/bllip.py
index 8da15ba..06d0051 100644
--- a/nltk/parse/bllip.py
+++ b/nltk/parse/bllip.py
@@ -2,7 +2,7 @@
#
# Author: David McClosky <dmcc at bigasterisk.com>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/parse/chart.py b/nltk/parse/chart.py
index 5853e0e..79c3bd4 100644
--- a/nltk/parse/chart.py
+++ b/nltk/parse/chart.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: A Chart Parser
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# Jean Mark Gawron <gawron at mail.sdsu.edu>
@@ -40,14 +40,15 @@ from __future__ import print_function, division, unicode_literals
import itertools
import re
import warnings
+from functools import total_ordering
+
+from six.moves import range
-from nltk import compat
from nltk.tree import Tree
from nltk.grammar import PCFG, is_nonterminal, is_terminal
from nltk.util import OrderedDict
from nltk.internals import raise_unorderable_types
-from nltk.compat import (total_ordering, python_2_unicode_compatible,
- unicode_repr)
+from nltk.compat import python_2_unicode_compatible, unicode_repr
from nltk.parse.api import ParserI
@@ -1147,7 +1148,7 @@ class EmptyPredictRule(AbstractChartRule):
NUM_EDGES = 0
def apply(self, chart, grammar):
for prod in grammar.productions(empty=True):
- for index in compat.xrange(chart.num_leaves() + 1):
+ for index in range(chart.num_leaves() + 1):
new_edge = TreeEdge.from_production(prod, index)
if chart.insert(new_edge, ()):
yield new_edge
@@ -1313,7 +1314,7 @@ class ChartParser(ParserI):
# Width, for printing trace edges.
trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
if trace: print(chart.pretty_format_leaves(trace_edge_width))
-
+
if self._use_agenda:
# Use an agenda-based algorithm.
for axiom in self._axioms:
@@ -1630,7 +1631,7 @@ def demo(choice=None,
t = time.time()
chart = cp.chart_parse(tokens)
parses = list(chart.parses(grammar.start()))
-
+
times[strategies[strategy][0]] = time.time()-t
print("Nr edges in chart:", len(chart.edges()))
if numparses:
diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py
new file mode 100644
index 0000000..49c428b
--- /dev/null
+++ b/nltk/parse/corenlp.py
@@ -0,0 +1,673 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Interface to the CoreNLP REST API.
+#
+# Copyright (C) 2001-2016 NLTK Project
+# Author: Dmitrijs Milajevs <dimazest at gmail.com>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import unicode_literals
+
+import re
+import json
+import time
+import socket
+
+from nltk.internals import find_jar_iter, config_java, java, _java_options
+
+from nltk.parse.api import ParserI
+from nltk.tokenize.api import TokenizerI
+from nltk.parse.dependencygraph import DependencyGraph
+from nltk.tree import Tree
+
+_stanford_url = 'http://stanfordnlp.github.io/CoreNLP/'
+
+
+class CoreNLPServerError(EnvironmentError):
+ """Exceptions associated with the Core NLP server."""
+
+
+def try_port(port=0):
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ sock.bind(('', port))
+
+ p = sock.getsockname()[1]
+ sock.close()
+
+ return p
+
+
+class CoreNLPServer(object):
+
+ _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar'
+ _JAR = r'stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar'
+
+ def __init__(
+ self, path_to_jar=None, path_to_models_jar=None, verbose=False,
+ java_options=None, corenlp_options=None, port=None,
+ ):
+
+ if corenlp_options is None:
+ corenlp_options = [
+ '-preload', 'tokenize,ssplit,pos,lemma,parse,depparse',
+ ]
+
+ jars = list(find_jar_iter(
+ self._JAR,
+ path_to_jar,
+ env_vars=('CORENLP', ),
+ searchpath=(),
+ url=_stanford_url,
+ verbose=verbose,
+ is_regex=True,
+ ))
+
+ # find the most recent code and model jar
+ stanford_jar = max(
+ jars,
+ key=lambda model_name: re.match(self._JAR, model_name)
+ )
+
+ if port is None:
+ try:
+ port = try_port(9000)
+ except socket.error:
+ port = try_port()
+ corenlp_options.append(str(port))
+ else:
+ try_port(port)
+
+ self.url = 'http://localhost:{}'.format(port)
+
+ model_jar = max(
+ find_jar_iter(
+ self._MODEL_JAR_PATTERN,
+ path_to_models_jar,
+ env_vars=('CORENLP_MODELS', ),
+ searchpath=(),
+ url=_stanford_url,
+ verbose=verbose,
+ is_regex=True,
+ ),
+ key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name)
+ )
+
+ self.verbose = verbose
+
+ self._classpath = stanford_jar, model_jar
+
+ self.corenlp_options = corenlp_options
+ self.java_options = java_options or ['-mx2g']
+
+ def start(self):
+ import requests
+
+ cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLPServer']
+
+ if self.corenlp_options:
+ cmd.extend(self.corenlp_options)
+
+ # Configure java.
+ default_options = ' '.join(_java_options)
+ config_java(options=self.java_options, verbose=self.verbose)
+
+ try:
+ # TODO: it's probably a bad idea to pipe stdout, as it will
+ # accumulate when lots of text is being parsed.
+ self.popen = java(
+ cmd,
+ classpath=self._classpath,
+ blocking=False,
+ stdout='pipe',
+ stderr='pipe',
+ )
+ finally:
+ # Return java configurations to their default values.
+ config_java(options=default_options, verbose=self.verbose)
+
+ # Check that the server is still running.
+ returncode = self.popen.poll()
+ if returncode is not None:
+ _, stderrdata = self.popen.communicate()
+ raise CoreNLPServerError(
+ returncode,
+ 'Could not start the server. '
+ 'The error was: {}'.format(stderrdata.decode('ascii'))
+ )
+
+ for i in range(30):
+ try:
+ response = requests.get(requests.compat.urljoin(self.url, 'live'))
+ except requests.exceptions.ConnectionError:
+ time.sleep(1)
+ else:
+ if response.ok:
+ break
+ else:
+ raise CoreNLPServerError(
+ 'Could not connect to the server.'
+ )
+
+ for i in range(60):
+ try:
+ response = requests.get(requests.compat.urljoin(self.url, 'ready'))
+ except requests.exceptions.ConnectionError:
+ time.sleep(1)
+ else:
+ if response.ok:
+ break
+ else:
+ raise CoreNLPServerError(
+ 'The server is not ready.'
+ )
+
+ def stop(self):
+ self.popen.terminate()
+ self.popen.wait()
+
+ def __enter__(self):
+ self.start()
+
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self.stop()
+ return False
+
+
+class GenericCoreNLPParser(ParserI, TokenizerI):
+ """Interface to the CoreNLP Parser."""
+
+ def __init__(self, url='http://localhost:9000', encoding='utf8'):
+ import requests
+
+ self.url = url
+ self.encoding = encoding
+
+ self.session = requests.Session()
+
+ def parse_sents(self, sentences, *args, **kwargs):
+ """Parse multiple sentences.
+
+ Takes multiple sentences as a list where each sentence is a list of
+ words. Each sentence will be automatically tagged with this
+ CoreNLPParser instance's tagger.
+
+ If whitespace exists inside a token, the token will be split into
+ several tokens.
+
+ :param sentences: Input sentences to parse
+ :type sentences: list(list(str))
+ :rtype: iter(iter(Tree))
+ """
+
+ sentences = (' '.join(words) for words in sentences)
+ return self.raw_parse_sents(sentences, *args, **kwargs)
+
+ def raw_parse(self, sentence, properties=None, *args, **kwargs):
+ """Parse a sentence.
+
+ Takes a sentence as a string; before parsing, it will be automatically
+ tokenized and tagged by the CoreNLP Parser.
+
+ :param sentence: Input sentence to parse
+ :type sentence: str
+ :rtype: iter(Tree)
+ """
+ default_properties = {
+ 'tokenize.whitespace': 'false',
+ }
+ default_properties.update(properties or {})
+
+ return next(
+ self.raw_parse_sents(
+ [sentence],
+ properties=default_properties,
+ *args,
+ **kwargs
+ )
+ )
+
+ def api_call(self, data, properties=None):
+ default_properties = {
+ 'outputFormat': 'json',
+ 'annotators': 'tokenize,pos,lemma,ssplit,{parser_annotator}'.format(
+ parser_annotator=self.parser_annotator,
+ ),
+ }
+
+ default_properties.update(properties or {})
+
+ response = self.session.post(
+ self.url,
+ params={
+ 'properties': json.dumps(default_properties),
+ },
+ data=data.encode(self.encoding),
+ timeout=60,
+ )
+
+ response.raise_for_status()
+
+ return response.json()
+
+ def raw_parse_sents(
+ self,
+ sentences,
+ verbose=False,
+ properties=None,
+ *args,
+ **kwargs
+ ):
+ """Parse multiple sentences.
+
+ Takes multiple sentences as a list of strings. Each sentence will be
+ automatically tokenized and tagged.
+
+ :param sentences: Input sentences to parse.
+ :type sentences: list(str)
+ :rtype: iter(iter(Tree))
+
+ """
+ default_properties = {
+ 'ssplit.isOneSentence': 'true',
+ }
+
+ default_properties.update(properties or {})
+
+ for sentence in sentences:
+ parsed_data = self.api_call(sentence, properties=default_properties)
+
+ assert len(parsed_data['sentences']) == 1
+
+ for parse in parsed_data['sentences']:
+ tree = self.make_tree(parse)
+ yield iter([tree])
+
+ def parse_text(self, text, *args, **kwargs):
+ """Parse a piece of text.
+
+ The text might contain several sentences which will be split by CoreNLP.
+
+ :param str text: text to be split.
+ :returns: an iterable of syntactic structures. # TODO: should it be an iterable of iterables?
+
+ """
+ parsed_data = self.api_call(text, *args, **kwargs)
+
+ for parse in parsed_data['sentences']:
+ yield self.make_tree(parse)
+
+ def tokenize(self, text, properties=None):
+ """Tokenize a string of text.
+
+ >>> parser = CoreNLPParser(url='http://localhost:9000')
+
+ >>> text = 'Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.'
+ >>> list(parser.tokenize(text))
+ ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
+
+ >>> s = "The colour of the wall is blue."
+ >>> list(
+ ... parser.tokenize(
+ ... 'The colour of the wall is blue.',
+ ... properties={'tokenize.options': 'americanize=true'},
+ ... )
+ ... )
+ ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
+
+ """
+ default_properties = {
+ 'annotators': 'tokenize,ssplit',
+ }
+
+ default_properties.update(properties or {})
+
+ result = self.api_call(text, properties=default_properties)
+
+ for sentence in result['sentences']:
+ for token in sentence['tokens']:
+ yield token['originalText']
+
+
+class CoreNLPParser(GenericCoreNLPParser):
+ """
+ >>> parser = CoreNLPParser(url='http://localhost:9000')
+
+ >>> next(
+ ... parser.raw_parse('The quick brown fox jumps over the lazy dog.')
+ ... ).pretty_print() # doctest: +NORMALIZE_WHITESPACE
+ ROOT
+ |
+ S
+ _______________|__________________________
+ | VP |
+ | _________|___ |
+ | | PP |
+ | | ________|___ |
+ NP | | NP |
+ ____|__________ | | _______|____ |
+ DT JJ JJ NN VBZ IN DT JJ NN .
+ | | | | | | | | | |
+ The quick brown fox jumps over the lazy dog .
+
+ >>> (parse_fox, ), (parse_wolf, ) = parser.raw_parse_sents(
+ ... [
+ ... 'The quick brown fox jumps over the lazy dog.',
+ ... 'The quick grey wolf jumps over the lazy fox.',
+ ... ]
+ ... )
+
+ >>> parse_fox.pretty_print() # doctest: +NORMALIZE_WHITESPACE
+ ROOT
+ |
+ S
+ _______________|__________________________
+ | VP |
+ | _________|___ |
+ | | PP |
+ | | ________|___ |
+ NP | | NP |
+ ____|__________ | | _______|____ |
+ DT JJ JJ NN VBZ IN DT JJ NN .
+ | | | | | | | | | |
+ The quick brown fox jumps over the lazy dog .
+
+ >>> parse_wolf.pretty_print() # doctest: +NORMALIZE_WHITESPACE
+ ROOT
+ |
+ S
+ _______________|__________________________
+ | VP |
+ | _________|___ |
+ | | PP |
+ | | ________|___ |
+ NP | | NP |
+ ____|_________ | | _______|____ |
+ DT JJ JJ NN VBZ IN DT JJ NN .
+ | | | | | | | | | |
+ The quick grey wolf jumps over the lazy fox .
+
+ >>> (parse_dog, ), (parse_friends, ) = parser.parse_sents(
+ ... [
+ ... "I 'm a dog".split(),
+ ... "This is my friends ' cat ( the tabby )".split(),
+ ... ]
+ ... )
+
+ >>> parse_dog.pretty_print() # doctest: +NORMALIZE_WHITESPACE
+ ROOT
+ |
+ S
+ _______|____
+ | VP
+ | ________|___
+ NP | NP
+ | | ___|___
+ PRP VBP DT NN
+ | | | |
+ I 'm a dog
+
+ >>> parse_friends.pretty_print() # doctest: +NORMALIZE_WHITESPACE
+ ROOT
+ |
+ S
+ ____|___________
+ | VP
+ | ___________|_____________
+ | | NP
+ | | _______|_________
+ | | NP PRN
+ | | _____|_______ ____|______________
+ NP | NP | | NP |
+ | | ______|_________ | | ___|____ |
+ DT VBZ PRP$ NNS POS NN -LRB- DT NN -RRB-
+ | | | | | | | | | |
+ This is my friends ' cat -LRB- the tabby -RRB-
+
+ >>> parse_john, parse_mary, = parser.parse_text(
+ ... 'John loves Mary. Mary walks.'
+ ... )
+
+ >>> parse_john.pretty_print() # doctest: +NORMALIZE_WHITESPACE
+ ROOT
+ |
+ S
+ _____|_____________
+ | VP |
+ | ____|___ |
+ NP | NP |
+ | | | |
+ NNP VBZ NNP .
+ | | | |
+ John loves Mary .
+
+ >>> parse_mary.pretty_print() # doctest: +NORMALIZE_WHITESPACE
+ ROOT
+ |
+ S
+ _____|____
+ NP VP |
+ | | |
+ NNP VBZ .
+ | | |
+ Mary walks .
+
+ Special cases
+ -------------
+
+ >>> next(
+ ... parser.raw_parse(
+ ... 'NASIRIYA, Iraq—Iraqi doctors who treated former prisoner of war '
+ ... 'Jessica Lynch have angrily dismissed claims made in her biography '
+ ... 'that she was raped by her Iraqi captors.'
+ ... )
+ ... ).height()
+ 20
+
+ >>> next(
+ ... parser.raw_parse(
+ ... "The broader Standard & Poor's 500 Index <.SPX> was 0.46 points lower, or "
+ ... '0.05 percent, at 997.02.'
+ ... )
+ ... ).height()
+ 9
+
+ """
+
+ _OUTPUT_FORMAT = 'penn'
+ parser_annotator = 'parse'
+
+ def make_tree(self, result):
+ return Tree.fromstring(result['parse'])
+
+
+class CoreNLPDependencyParser(GenericCoreNLPParser):
+ """Dependency parser.
+
+ >>> dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
+
+ >>> parse, = dep_parser.raw_parse(
+ ... 'The quick brown fox jumps over the lazy dog.'
+ ... )
+ >>> print(parse.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
+ The DT 4 det
+ quick JJ 4 amod
+ brown JJ 4 amod
+ fox NN 5 nsubj
+ jumps VBZ 0 ROOT
+ over IN 9 case
+ the DT 9 det
+ lazy JJ 9 amod
+ dog NN 5 nmod
+ . . 5 punct
+
+ >>> print(parse.tree()) # doctest: +NORMALIZE_WHITESPACE
+ (jumps (fox The quick brown) (dog over the lazy) .)
+
+ >>> for governor, dep, dependent in parse.triples():
+ ... print(governor, dep, dependent) # doctest: +NORMALIZE_WHITESPACE
+ ('jumps', 'VBZ') nsubj ('fox', 'NN')
+ ('fox', 'NN') det ('The', 'DT')
+ ('fox', 'NN') amod ('quick', 'JJ')
+ ('fox', 'NN') amod ('brown', 'JJ')
+ ('jumps', 'VBZ') nmod ('dog', 'NN')
+ ('dog', 'NN') case ('over', 'IN')
+ ('dog', 'NN') det ('the', 'DT')
+ ('dog', 'NN') amod ('lazy', 'JJ')
+ ('jumps', 'VBZ') punct ('.', '.')
+
+ >>> (parse_fox, ), (parse_dog, ) = dep_parser.raw_parse_sents(
+ ... [
+ ... 'The quick brown fox jumps over the lazy dog.',
+ ... 'The quick grey wolf jumps over the lazy fox.',
+ ... ]
+ ... )
+ >>> print(parse_fox.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
+ The DT 4 det
+ quick JJ 4 amod
+ brown JJ 4 amod
+ fox NN 5 nsubj
+ jumps VBZ 0 ROOT
+ over IN 9 case
+ the DT 9 det
+ lazy JJ 9 amod
+ dog NN 5 nmod
+ . . 5 punct
+
+ >>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
+ The DT 4 det
+ quick JJ 4 amod
+ grey JJ 4 amod
+ wolf NN 5 nsubj
+ jumps VBZ 0 ROOT
+ over IN 9 case
+ the DT 9 det
+ lazy JJ 9 amod
+ fox NN 5 nmod
+ . . 5 punct
+
+ >>> (parse_dog, ), (parse_friends, ) = dep_parser.parse_sents(
+ ... [
+ ... "I 'm a dog".split(),
+ ... "This is my friends ' cat ( the tabby )".split(),
+ ... ]
+ ... )
+ >>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
+ I PRP 4 nsubj
+ 'm VBP 4 cop
+ a DT 4 det
+ dog NN 0 ROOT
+
+ >>> print(parse_friends.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
+ This DT 6 nsubj
+ is VBZ 6 cop
+ my PRP$ 4 nmod:poss
+ friends NNS 6 nmod:poss
+ ' POS 4 case
+ cat NN 0 ROOT
+ -LRB- -LRB- 9 punct
+ the DT 9 det
+ tabby NN 6 appos
+ -RRB- -RRB- 9 punct
+
+ >>> parse_john, parse_mary, = dep_parser.parse_text(
+ ... 'John loves Mary. Mary walks.'
+ ... )
+
+ >>> print(parse_john.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
+ John NNP 2 nsubj
+ loves VBZ 0 ROOT
+ Mary NNP 2 dobj
+ . . 2 punct
+
+ >>> print(parse_mary.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE
+ Mary NNP 2 nsubj
+ walks VBZ 0 ROOT
+ . . 2 punct
+
+ Special cases
+ -------------
+
+ Non-breaking space inside of a token.
+
+ >>> len(
+ ... next(
+ ... dep_parser.raw_parse(
+ ... 'Anhalt said children typically treat a 20-ounce soda bottle as one '
+ ... 'serving, while it actually contains 2 1/2 servings.'
+ ... )
+ ... ).nodes
+ ... )
+ 21
+
+ Phone numbers.
+
+ >>> len(
+ ... next(
+ ... dep_parser.raw_parse('This is not going to crash: 01 111 555.')
+ ... ).nodes
+ ... )
+ 10
+
+ """
+
+ _OUTPUT_FORMAT = 'conll2007'
+ parser_annotator = 'depparse'
+
+ def make_tree(self, result):
+
+ return DependencyGraph(
+ (
+ ' '.join(n_items[1:]) # NLTK expects an iterable of strings...
+ for n_items in sorted(transform(result))
+ ),
+ cell_separator=' ', # To make sure that a non-breaking space is kept inside of a token.
+ )
+
+
+def transform(sentence):
+ for dependency in sentence['basicDependencies']:
+
+ dependent_index = dependency['dependent']
+ token = sentence['tokens'][dependent_index - 1]
+
+ # Return values that we don't know as '_'. Also, consider tag and ctag
+ # to be equal.
+ yield (
+ dependent_index,
+ '_',
+ token['word'],
+ token['lemma'],
+ token['pos'],
+ token['pos'],
+ '_',
+ str(dependency['governor']),
+ dependency['dep'],
+ '_',
+ '_',
+ )
+
+
+def setup_module(module):
+ from nose import SkipTest
+
+ global server
+ try:
+ server = CoreNLPServer(port=9000)
+ except LookupError as e:
+ raise SkipTest('Could not instantiate CoreNLPServer.')
+
+ try:
+ server.start()
+ except CoreNLPServerError as e:
+ raise SkipTest(
+ 'Skipping CoreNLP tests because the server could not be started. '
+ 'Make sure that port 9000 is free. '
+ '{}'.format(e.strerror)
+ )
+
+
+def teardown_module(module):
+ server.stop()
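
For completeness, a minimal end-to-end sketch of the new module (illustrative only; it assumes the CoreNLP jars can be located via the CORENLP and CORENLP_MODELS environment variables and that the requests package is installed; the doctests above remain the reference usage):

    from nltk.parse.corenlp import (
        CoreNLPServer, CoreNLPParser, CoreNLPDependencyParser,
    )

    # __enter__/__exit__ wrap start() and stop(), so the server is cleaned up.
    with CoreNLPServer() as server:
        parser = CoreNLPParser(url=server.url)
        dep_parser = CoreNLPDependencyParser(url=server.url)

        # Constituency parse of a raw sentence.
        tree = next(parser.raw_parse('The quick brown fox jumps over the lazy dog.'))
        tree.pretty_print()

        # Dependency parse, printed in CoNLL format.
        parse, = dep_parser.raw_parse('The quick brown fox jumps over the lazy dog.')
        print(parse.to_conll(4))
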
diff --git a/nltk/parse/dependencygraph.py b/nltk/parse/dependencygraph.py
index f0dfe55..6fadad9 100755
--- a/nltk/parse/dependencygraph.py
+++ b/nltk/parse/dependencygraph.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Dependency Grammars
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Jason Narad <jason.narad at gmail.com>
# Steven Bird <stevenbird1 at gmail.com> (modifications)
#
@@ -21,8 +21,10 @@ from pprint import pformat
import subprocess
import warnings
+from six import string_types
+
from nltk.tree import Tree
-from nltk.compat import python_2_unicode_compatible, string_types
+from nltk.compat import python_2_unicode_compatible
#################################################################
diff --git a/nltk/parse/earleychart.py b/nltk/parse/earleychart.py
index 9e0e810..5955b50 100644
--- a/nltk/parse/earleychart.py
+++ b/nltk/parse/earleychart.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: An Incremental Earley Chart Parser
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Peter Ljunglöf <peter.ljunglof at heatherleaf.se>
# Rob Speer <rspeer at mit.edu>
# Edward Loper <edloper at gmail.com>
@@ -27,7 +27,8 @@ algorithm, originally formulated by Jay Earley (1970).
"""
from __future__ import print_function, division
-from nltk.compat import xrange
+from six.moves import range
+
from nltk.parse.chart import (Chart, ChartParser, EdgeI, LeafEdge, LeafInitRule,
BottomUpPredictRule, BottomUpPredictCombineRule,
TopDownInitRule, SingleEdgeFundamentalRule,
@@ -108,7 +109,7 @@ class IncrementalChart(Chart):
self._edgelists[edge.end()].append(edge)
def _positions(self):
- return xrange(self.num_leaves() + 1)
+ return range(self.num_leaves() + 1)
class FeatureIncrementalChart(IncrementalChart, FeatureChart):
diff --git a/nltk/parse/evaluate.py b/nltk/parse/evaluate.py
index 43e2ab4..0d101bf 100644
--- a/nltk/parse/evaluate.py
+++ b/nltk/parse/evaluate.py
@@ -2,7 +2,7 @@
#
# Author: Long Duong <longdt219 at gmail.com>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/parse/featurechart.py b/nltk/parse/featurechart.py
index fe098af..eafd0bf 100644
--- a/nltk/parse/featurechart.py
+++ b/nltk/parse/featurechart.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Chart Parser for Feature-Based Grammars
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Rob Speer <rspeer at mit.edu>
# Peter Ljunglöf <peter.ljunglof at heatherleaf.se>
# URL: <http://nltk.org/>
@@ -13,7 +13,9 @@ feature structures as nodes.
"""
from __future__ import print_function, unicode_literals
-from nltk.compat import xrange, python_2_unicode_compatible
+from six.moves import range
+
+from nltk.compat import python_2_unicode_compatible
from nltk.featstruct import FeatStruct, unify, TYPE, find_variables
from nltk.sem import logic
from nltk.tree import Tree
@@ -206,7 +208,7 @@ class FeatureChart(Chart):
for edge in self.select(start=0, end=self._num_leaves):
if ((isinstance(edge, FeatureTreeEdge)) and
(edge.lhs()[TYPE] == start[TYPE]) and
- (unify(edge.lhs(), start, rename_vars=True))
+ (unify(edge.lhs(), start, rename_vars=True))
):
for tree in self.trees(edge, complete=True, tree_class=tree_class):
yield tree
@@ -401,7 +403,7 @@ class FeatureBottomUpPredictCombineRule(BottomUpPredictCombineRule):
class FeatureEmptyPredictRule(EmptyPredictRule):
def apply(self, chart, grammar):
for prod in grammar.productions(empty=True):
- for index in xrange(chart.num_leaves() + 1):
+ for index in range(chart.num_leaves() + 1):
new_edge = FeatureTreeEdge.from_production(prod, index)
if chart.insert(new_edge, ()):
yield new_edge
diff --git a/nltk/parse/generate.py b/nltk/parse/generate.py
index eb3089d..8326f5d 100644
--- a/nltk/parse/generate.py
+++ b/nltk/parse/generate.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Generating from a CFG
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Peter Ljunglöf <peter.ljunglof at heatherleaf.se>
# URL: <http://nltk.org/>
@@ -36,14 +36,23 @@ def generate(grammar, start=None, depth=None, n=None):
return iter
+
def _generate_all(grammar, items, depth):
if items:
- for frag1 in _generate_one(grammar, items[0], depth):
- for frag2 in _generate_all(grammar, items[1:], depth):
- yield frag1 + frag2
+ try:
+ for frag1 in _generate_one(grammar, items[0], depth):
+ for frag2 in _generate_all(grammar, items[1:], depth):
+ yield frag1 + frag2
+ except RuntimeError as _error:
+ if "maximum recursion depth exceeded" in str(_error):
+ # Helpful error message while still showing the recursion stack.
+ raise RuntimeError("The grammar has rule(s) that yield infinite recursion!!")
+ else:
+ raise
else:
yield []
+
def _generate_one(grammar, item, depth):
if depth > 0:
if isinstance(item, Nonterminal):
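
The new except-clause above only fires for grammars whose rules can expand forever. A small sketch of both outcomes (illustrative grammar, assuming the generate() signature shown in the hunk header):

    from nltk import CFG
    from nltk.parse.generate import generate

    # A directly recursive grammar: S can keep rewriting into itself.
    recursive = CFG.fromstring("S -> S 'a' | 'a'")

    # Bounding the derivation depth keeps generation finite.
    print(list(generate(recursive, depth=4)))

    # Without a depth bound the recursion would eventually exceed Python's
    # recursion limit, which the new code reports as
    # "The grammar has rule(s) that yield infinite recursion!!".
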
@@ -63,6 +72,7 @@ demo_grammar = """
P -> 'in' | 'with'
"""
+
def demo(N=23):
from nltk.grammar import CFG
@@ -72,5 +82,6 @@ def demo(N=23):
for n, sent in enumerate(generate(grammar, n=N), 1):
print('%3d. %s' % (n, ' '.join(sent)))
+
if __name__ == '__main__':
demo()
diff --git a/nltk/parse/malt.py b/nltk/parse/malt.py
index 5799559..68bb396 100644
--- a/nltk/parse/malt.py
+++ b/nltk/parse/malt.py
@@ -4,14 +4,14 @@
# Author: Dan Garrette <dhgarrette at gmail.com>
# Contributor: Liling Tan, Mustufain, osamamukhtar11
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import print_function
from __future__ import unicode_literals
-from nltk.six import text_type
+from six import text_type
import os
import sys
import tempfile
diff --git a/nltk/parse/nonprojectivedependencyparser.py b/nltk/parse/nonprojectivedependencyparser.py
index bd40d6a..5adcd2c 100644
--- a/nltk/parse/nonprojectivedependencyparser.py
+++ b/nltk/parse/nonprojectivedependencyparser.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Dependency Grammars
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Jason Narad <jason.narad at gmail.com>
#
# URL: <http://nltk.org/>
@@ -11,7 +11,7 @@ from __future__ import print_function
import math
import logging
-from nltk.compat import xrange
+from six.moves import range
from nltk.parse.dependencygraph import DependencyGraph
@@ -682,7 +682,7 @@ class NonprojectiveDependencyParser(object):
orig_length = len(possible_heads[i])
if index_on_stack and orig_length == 0:
- for j in xrange(len(stack) - 1, -1, -1):
+ for j in range(len(stack) - 1, -1, -1):
stack_item = stack[j]
if stack_item[0] == i:
possible_heads[i].append(stack.pop(j)[1])
diff --git a/nltk/parse/pchart.py b/nltk/parse/pchart.py
index 18c7434..288d8c2 100644
--- a/nltk/parse/pchart.py
+++ b/nltk/parse/pchart.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Probabilistic Chart Parsers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/parse/projectivedependencyparser.py b/nltk/parse/projectivedependencyparser.py
index db3703b..273851d 100644
--- a/nltk/parse/projectivedependencyparser.py
+++ b/nltk/parse/projectivedependencyparser.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Dependency Grammars
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Jason Narad <jason.narad at gmail.com>
#
# URL: <http://nltk.org/>
@@ -9,12 +9,14 @@
from __future__ import print_function, unicode_literals
from collections import defaultdict
+from itertools import chain
+from functools import total_ordering
from nltk.grammar import (DependencyProduction, DependencyGrammar,
ProbabilisticDependencyGrammar)
from nltk.parse.dependencygraph import DependencyGraph
from nltk.internals import raise_unorderable_types
-from nltk.compat import total_ordering, python_2_unicode_compatible
+from nltk.compat import python_2_unicode_compatible
#################################################################
# Dependency Span
@@ -367,8 +369,7 @@ class ProbabilisticProjectiveDependencyParser(object):
for dg in graphs:
for node_index in range(1, len(dg.nodes)):
#children = dg.nodes[node_index]['deps']
- # Put list so that in will work in python 3
- children = sum(list(dg.nodes[node_index]['deps'].values()), [])
+ children = list(chain(*dg.nodes[node_index]['deps'].values()))
nr_left_children = dg.left_children(node_index)
nr_right_children = dg.right_children(node_index)
@@ -428,7 +429,7 @@ class ProbabilisticProjectiveDependencyParser(object):
prob = 1.0
for node_index in range(1, len(dg.nodes)):
#children = dg.nodes[node_index]['deps']
- children = sum(list(dg.nodes[node_index]['deps'].values()), [])
+ children = list(chain(*dg.nodes[node_index]['deps'].values()))
nr_left_children = dg.left_children(node_index)
nr_right_children = dg.right_children(node_index)
diff --git a/nltk/parse/recursivedescent.py b/nltk/parse/recursivedescent.py
index 1f9feba..d59eb3e 100644
--- a/nltk/parse/recursivedescent.py
+++ b/nltk/parse/recursivedescent.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Recursive Descent Parser
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/parse/shiftreduce.py b/nltk/parse/shiftreduce.py
index 68fcd89..4ade68a 100644
--- a/nltk/parse/shiftreduce.py
+++ b/nltk/parse/shiftreduce.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Shift-Reduce Parser
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py
index e7bdfe0..34939a9 100644
--- a/nltk/parse/stanford.py
+++ b/nltk/parse/stanford.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Parser
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Xu <xxu at student.unimelb.edu.au>
#
# URL: <http://nltk.org/>
@@ -16,14 +16,15 @@ import warnings
from subprocess import PIPE
from io import StringIO
-from nltk import compat
+from six import text_type
+
from nltk.internals import find_jar, find_jar_iter, config_java, java, _java_options, find_jars_within_path
from nltk.parse.api import ParserI
from nltk.parse.dependencygraph import DependencyGraph
from nltk.tree import Tree
-_stanford_url = 'http://nlp.stanford.edu/software/lex-parser.shtml'
+_stanford_url = 'https://nlp.stanford.edu/software/lex-parser.shtml'
class GenericStanfordParser(ParserI):
"""Interface to the Stanford Parser"""
@@ -48,7 +49,7 @@ class GenericStanfordParser(ParserI):
searchpath=(), url=_stanford_url,
verbose=verbose, is_regex=True
),
- key=lambda model_name: re.match(self._JAR, model_name)
+ key=lambda model_path: os.path.dirname(model_path)
)
model_jar=max(
@@ -58,12 +59,13 @@ class GenericStanfordParser(ParserI):
searchpath=(), url=_stanford_url,
verbose=verbose, is_regex=True
),
- key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name)
+ key=lambda model_path: os.path.dirname(model_path)
)
+
#self._classpath = (stanford_jar, model_jar)
-
- # Adding logging jar files to classpath
+
+ # Adding logging jar files to classpath
stanford_dir = os.path.split(stanford_jar)[0]
self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir))
@@ -200,7 +202,7 @@ class GenericStanfordParser(ParserI):
# Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
# Write the actual sentences to the temporary input file
- if isinstance(input_, compat.text_type) and encoding:
+ if isinstance(input_, text_type) and encoding:
input_ = input_.encode(encoding)
input_file.write(input_)
input_file.flush()
@@ -214,9 +216,9 @@ class GenericStanfordParser(ParserI):
cmd.append(input_file.name)
stdout, stderr = java(cmd, classpath=self._classpath,
stdout=PIPE, stderr=PIPE)
-
+
stdout = stdout.replace(b'\xc2\xa0',b' ')
- stdout = stdout.replace(b'\xa0',b' ')
+ stdout = stdout.replace(b'\x00\xa0',b' ')
stdout = stdout.decode(encoding)
os.unlink(input_file.name)
@@ -342,29 +344,33 @@ class StanfordDependencyParser(GenericStanfordParser):
class StanfordNeuralDependencyParser(GenericStanfordParser):
'''
>>> from nltk.parse.stanford import StanfordNeuralDependencyParser
- >>> dep_parser=StanfordNeuralDependencyParser()
+ >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx3g')
>>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
- [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]
+ [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]
>>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE
- [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
- ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
- ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
- ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]
+ [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det',
+ (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'),
+ u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')),
+ ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det',
+ (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'),
+ u'punct', (u'.', u'.'))]]
>>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
... "The quick brown fox jumps over the lazy dog.",
... "The quick grey wolf jumps over the lazy fox."
... ))], []) # doctest: +NORMALIZE_WHITESPACE
- [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),
- Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]
+ [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over',
+ 'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']),
+ Tree('fox', ['over', 'the', 'lazy']), '.'])]
>>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
... "I 'm a dog".split(),
... "This is my friends ' cat ( the tabby )".split(),
... ))], []) # doctest: +NORMALIZE_WHITESPACE
- [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]
+ [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',
+ ['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
'''
_OUTPUT_FORMAT = 'conll'
diff --git a/nltk/parse/transitionparser.py b/nltk/parse/transitionparser.py
index 15bac50..cad2261 100644
--- a/nltk/parse/transitionparser.py
+++ b/nltk/parse/transitionparser.py
@@ -2,7 +2,7 @@
#
# Author: Long Duong <longdt219 at gmail.com>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -328,8 +328,8 @@ class TransitionParser(ParserI):
def _is_projective(self, depgraph):
arc_list = []
for key in depgraph.nodes:
- node = depgraph.nodes[key]
-
+ node = depgraph.nodes[key]
+
if 'head' in node:
childIdx = node['address']
parentIdx = node['head']
@@ -490,7 +490,7 @@ class TransitionParser(ParserI):
print(" Number of valid (projective) examples : " + str(countProj))
return training_seq
- def train(self, depgraphs, modelfile):
+ def train(self, depgraphs, modelfile, verbose=True):
"""
:param depgraphs : list of DependencyGraph as the training data
:type depgraphs : DependencyGraph
@@ -522,7 +522,7 @@ class TransitionParser(ParserI):
coef0=0,
gamma=0.2,
C=0.5,
- verbose=True,
+ verbose=verbose,
probability=True)
model.fit(x_train, y_train)
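
For context, the new verbose flag gives callers control over the scikit-learn SVC training output. A minimal usage sketch, assuming depgraphs is already a list of DependencyGraph training examples and 'temp.model' is just an illustrative path:

    from nltk.parse.transitionparser import TransitionParser

    # depgraphs: a list of nltk.parse.DependencyGraph objects (assumed available)
    parser = TransitionParser('arc-standard')
    # verbose=False keeps the underlying SVC from printing optimisation progress
    parser.train(depgraphs, 'temp.model', verbose=False)
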
@@ -730,10 +730,9 @@ def demo():
Number of valid (projective) examples : 1
SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, SHIFT, SHIFT, LEFTARC:ATT, SHIFT, SHIFT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, RIGHTARC:ATT, RIGHTARC:OBJ, SHIFT, RIGHTARC:PU, RIGHTARC:ROOT, SHIFT
- >>> parser_std.train([gold_sent],'temp.arcstd.model')
+ >>> parser_std.train([gold_sent],'temp.arcstd.model', verbose=False)
Number of training examples : 1
Number of valid (projective) examples : 1
- ...
>>> remove(input_file.name)
B. Check the ARC-EAGER training
@@ -745,10 +744,9 @@ def demo():
Number of valid (projective) examples : 1
SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, RIGHTARC:ROOT, SHIFT, LEFTARC:ATT, RIGHTARC:OBJ, RIGHTARC:ATT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, REDUCE, REDUCE, REDUCE, RIGHTARC:PU
- >>> parser_eager.train([gold_sent],'temp.arceager.model')
+ >>> parser_eager.train([gold_sent],'temp.arceager.model', verbose=False)
Number of training examples : 1
Number of valid (projective) examples : 1
- ...
>>> remove(input_file.name)
@@ -767,6 +765,10 @@ def demo():
>>> de.eval() >= (0, 0)
True
+ Remove test temporary files
+ >>> remove('temp.arceager.model')
+ >>> remove('temp.arcstd.model')
+
Note that result is very poor because of only one training example.
"""
diff --git a/nltk/parse/util.py b/nltk/parse/util.py
index a8eec83..e8694b6 100644
--- a/nltk/parse/util.py
+++ b/nltk/parse/util.py
@@ -2,7 +2,7 @@
#
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/parse/viterbi.py b/nltk/parse/viterbi.py
index a75b0c7..dce5979 100644
--- a/nltk/parse/viterbi.py
+++ b/nltk/parse/viterbi.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Viterbi Probabilistic Parser
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/probability.py b/nltk/probability.py
old mode 100644
new mode 100755
index 56f5739..0528c2b
--- a/nltk/probability.py
+++ b/nltk/probability.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Probability and Statistics
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com> (additions)
# Trevor Cohn <tacohn at cs.mu.oz.au> (additions)
@@ -44,11 +44,13 @@ import random
import warnings
import array
from operator import itemgetter
-from collections import defaultdict
+from collections import defaultdict, Counter
from functools import reduce
-from nltk import compat
-from nltk.compat import Counter
+from abc import ABCMeta, abstractmethod
+
+from six import itervalues, text_type, add_metaclass
+from nltk import compat
from nltk.internals import raise_unorderable_types
_NINF = float('-1e300')
@@ -105,6 +107,9 @@ class FreqDist(Counter):
"""
Counter.__init__(self, samples)
+ # Cached number of samples in this FreqDist
+ self._N = None
+
def N(self):
"""
Return the total number of sample outcomes that have been
@@ -114,7 +119,38 @@ class FreqDist(Counter):
:rtype: int
"""
- return sum(self.values())
+ if self._N is None:
+ # Not already cached, or cache has been invalidated
+ self._N = sum(self.values())
+ return self._N
+
+ def __setitem__(self, key, val):
+ """
+ Override ``Counter.__setitem__()`` to invalidate the cached N
+ """
+ self._N = None
+ super(FreqDist, self).__setitem__(key, val)
+
+ def __delitem__(self, key):
+ """
+ Override ``Counter.__delitem__()`` to invalidate the cached N
+ """
+ self._N = None
+ super(FreqDist, self).__delitem__(key)
+
+ def update(self, *args, **kwargs):
+ """
+ Override ``Counter.update()`` to invalidate the cached N
+ """
+ self._N = None
+ super(FreqDist, self).update(*args, **kwargs)
+
+ def setdefault(self, key, val):
+ """
+ Override ``Counter.setdefault()`` to invalidate the cached N
+ """
+ self._N = None
+ super(FreqDist, self).setdefault(key, val)
def B(self):
"""
@@ -192,9 +228,10 @@ class FreqDist(Counter):
:type sample: any
:rtype: float
"""
- if self.N() == 0:
+ n = self.N()
+ if n == 0:
return 0
- return self[sample] / self.N()
+ return self[sample] / n
def max(self):
"""
@@ -251,7 +288,7 @@ class FreqDist(Counter):
pylab.title(kwargs["title"])
del kwargs["title"]
pylab.plot(freqs, **kwargs)
- pylab.xticks(range(len(samples)), [compat.text_type(s) for s in samples], rotation=90)
+ pylab.xticks(range(len(samples)), [text_type(s) for s in samples], rotation=90)
pylab.xlabel("Samples")
pylab.ylabel(ylabel)
pylab.show()
@@ -297,8 +334,8 @@ class FreqDist(Counter):
"""
return self.__class__(self)
- # Mathematical operators
-
+ # Mathematical operators
+
def __add__(self, other):
"""
Add counts from two counters.
@@ -393,6 +430,7 @@ class FreqDist(Counter):
## Probability Distributions
##//////////////////////////////////////////////////////
+ at add_metaclass(ABCMeta)
class ProbDistI(object):
"""
A probability distribution for the outcomes of an experiment. A
@@ -410,10 +448,13 @@ class ProbDistI(object):
"""True if the probabilities of the samples in this probability
distribution will always sum to one."""
+ @abstractmethod
def __init__(self):
- if self.__class__ == ProbDistI:
- raise NotImplementedError("Interfaces can't be instantiated")
+ """
+ Classes inheriting from ProbDistI should implement __init__.
+ """
+ @abstractmethod
def prob(self, sample):
"""
Return the probability for a given sample. Probabilities
@@ -424,7 +465,6 @@ class ProbDistI(object):
:type sample: any
:rtype: float
"""
- raise NotImplementedError()
def logprob(self, sample):
"""
@@ -439,6 +479,7 @@ class ProbDistI(object):
p = self.prob(sample)
return (math.log(p, 2) if p != 0 else _NINF)
+ @abstractmethod
def max(self):
"""
Return the sample with the greatest probability. If two or
@@ -447,8 +488,8 @@ class ProbDistI(object):
:rtype: any
"""
- raise NotImplementedError()
+ @abstractmethod
def samples(self):
"""
Return a list of all samples that have nonzero probabilities.
@@ -456,7 +497,6 @@ class ProbDistI(object):
:rtype: list
"""
- raise NotImplementedError()
# cf self.SUM_TO_ONE
def discount(self):
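
Since ProbDistI is now an abstract base class, a concrete distribution must override __init__, prob, max and samples before it can be instantiated. A minimal sketch (DegenerateProbDist is an invented name, not part of NLTK):

    from nltk.probability import ProbDistI

    class DegenerateProbDist(ProbDistI):
        """Puts all probability mass on a single sample."""
        def __init__(self, sample):
            self._sample = sample

        def prob(self, sample):
            return 1.0 if sample == self._sample else 0.0

        def max(self):
            return self._sample

        def samples(self):
            return [self._sample]

    d = DegenerateProbDist('the')
    print(d.prob('the'))     # 1.0
    print(d.logprob('cat'))  # effectively -inf (log of a zero probability)
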
@@ -549,6 +589,7 @@ class RandomProbDist(ProbDistI):
it can still be passed to MutableProbDist and called with identical
syntax to UniformProbDist
"""
+ samples = set(samples)
randrow = [random.random() for i in range(len(samples))]
total = sum(randrow)
for i, x in enumerate(randrow):
@@ -1748,6 +1789,7 @@ class ConditionalFreqDist(defaultdict):
:type cond_samples: Sequence of (condition, sample) tuples
"""
defaultdict.__init__(self, FreqDist)
+
if cond_samples:
for (cond, sample) in cond_samples:
self[cond][sample] += 1
@@ -1775,7 +1817,7 @@ class ConditionalFreqDist(defaultdict):
:rtype: int
"""
- return sum(fdist.N() for fdist in compat.itervalues(self))
+ return sum(fdist.N() for fdist in itervalues(self))
def plot(self, *args, **kwargs):
"""
@@ -1819,7 +1861,7 @@ class ConditionalFreqDist(defaultdict):
pylab.legend(loc=legend_loc)
pylab.grid(True, color="silver")
- pylab.xticks(range(len(samples)), [compat.text_type(s) for s in samples], rotation=90)
+ pylab.xticks(range(len(samples)), [text_type(s) for s in samples], rotation=90)
if title:
pylab.title(title)
pylab.xlabel("Samples")
@@ -1864,7 +1906,7 @@ class ConditionalFreqDist(defaultdict):
print()
# Mathematical operators
-
+
def __add__(self, other):
"""
Add counts from two ConditionalFreqDists.
@@ -1920,7 +1962,7 @@ class ConditionalFreqDist(defaultdict):
return result
def __and__(self, other):
- """
+ """
Intersection is the minimum of corresponding counts.
"""
if not isinstance(other, ConditionalFreqDist):
@@ -1961,6 +2003,7 @@ class ConditionalFreqDist(defaultdict):
@compat.python_2_unicode_compatible
+ at add_metaclass(ABCMeta)
class ConditionalProbDistI(dict):
"""
A collection of probability distributions for a single experiment
@@ -1974,8 +2017,11 @@ class ConditionalProbDistI(dict):
condition to the ``ProbDist`` for the experiment under that
condition.
"""
+ @abstractmethod
def __init__(self):
- raise NotImplementedError("Interfaces can't be instantiated")
+ """
+ Classes inheriting from ConditionalProbDistI should implement __init__.
+ """
def conditions(self):
"""
diff --git a/nltk/sem/__init__.py b/nltk/sem/__init__.py
index dd0f11b..7bad174 100644
--- a/nltk/sem/__init__.py
+++ b/nltk/sem/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Semantic Interpretation
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/sem/boxer.py b/nltk/sem/boxer.py
index 8bf8c85..a56017f 100644
--- a/nltk/sem/boxer.py
+++ b/nltk/sem/boxer.py
@@ -3,7 +3,7 @@
#
# Author: Dan Garrette <dhgarrette at gmail.com>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -34,7 +34,7 @@ from optparse import OptionParser
import tempfile
from functools import reduce
-from nltk.internals import Counter, find_binary
+from nltk.internals import find_binary
from nltk.sem.logic import (ExpectedMoreTokensException, LogicalExpressionException,
UnexpectedTokenException, Variable)
@@ -89,7 +89,7 @@ class Boxer(object):
discourse_ids = ([discourse_id] if discourse_id is not None else None)
d, = self.interpret_multi_sents([[input]], discourse_ids, question, verbose)
if not d:
- raise Exception('Unable to interpret: "%s"' % input)
+ raise Exception('Unable to interpret: "{0}"'.format(input))
return d
def interpret_multi(self, input, discourse_id=None, question=False, verbose=False):
@@ -104,7 +104,7 @@ class Boxer(object):
discourse_ids = ([discourse_id] if discourse_id is not None else None)
d, = self.interpret_multi_sents([input], discourse_ids, question, verbose)
if not d:
- raise Exception('Unable to interpret: "%s"' % input)
+ raise Exception('Unable to interpret: "{0}"'.format(input))
return d
def interpret_sents(self, inputs, discourse_ids=None, question=False, verbose=False):
@@ -155,7 +155,7 @@ class Boxer(object):
"""
args = ['--models', os.path.join(self._candc_models_path, ['boxer','questions'][question]),
'--candc-printer', 'boxer']
- return self._call('\n'.join(sum((["<META>'%s'" % id] + d for d,id in zip(inputs,discourse_ids)), [])), self._candc_bin, args, verbose)
+ return self._call('\n'.join(sum((["<META>'{0}'".format(id)] + d for d,id in zip(inputs,discourse_ids)), [])), self._candc_bin, args, verbose)
def _call_boxer(self, candc_out, verbose=False):
"""
@@ -212,7 +212,7 @@ class Boxer(object):
cmd = [binary] + args
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
else:
- cmd = 'echo "%s" | %s %s' % (input_str, binary, ' '.join(args))
+ cmd = 'echo "{0}" | {1} {2}'.format(input_str, binary, ' '.join(args))
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = p.communicate()
@@ -221,7 +221,7 @@ class Boxer(object):
if stdout: print('stdout:\n', stdout, '\n')
if stderr: print('stderr:\n', stderr, '\n')
if p.returncode != 0:
- raise Exception('ERROR CALLING: %s %s\nReturncode: %d\n%s' % (binary, ' '.join(args), p.returncode, stderr))
+ raise Exception('ERROR CALLING: {0} {1}\nReturncode: {2}\n{3}'.format(binary, ' '.join(args), p.returncode, stderr))
return stdout
@@ -239,12 +239,12 @@ class Boxer(object):
drs_id = line[comma_idx+1:line.index(')')]
i += 1
line = lines[i]
- assert line.startswith('sem(%s,' % drs_id)
+ assert line.startswith('sem({0},'.format(drs_id))
if line[-4:] == "').'":
line = line[:-4] + ")."
- assert line.endswith(').'), "can't parse line: %s" % line
+ assert line.endswith(').'), "can't parse line: {0}".format(line)
- search_start = len('sem(%s,[' % drs_id)
+ search_start = len('sem({0},['.format(drs_id))
brace_count = 1
drs_start = -1
for j,c in enumerate(line[search_start:]):
@@ -464,26 +464,26 @@ class BoxerOutputDrsParser(DrtParser):
self.assertToken(self.token(), '(')
pol = self.token()
self.assertToken(self.token(), ')')
- conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_pol_%s' % (pol), 'a', 0))
+ conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_pol_{0}'.format(pol), 'a', 0))
self.assertToken(self.token(), ',')
(sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list())
year = self.token()
if year != 'XXXX':
year = year.replace(':', '_')
- conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_year_%s' % (year), 'a', 0))
+ conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_year_{0}'.format(year), 'a', 0))
self.assertToken(self.token(), ',')
(sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list())
month = self.token()
if month != 'XX':
- conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_month_%s' % (month), 'a', 0))
+ conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_month_{0}'.format(month), 'a', 0))
self.assertToken(self.token(), ',')
(sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list())
day = self.token()
if day != 'XX':
- conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_day_%s' % (day), 'a', 0))
+ conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_day_{0}'.format(day), 'a', 0))
return conds
@@ -864,7 +864,7 @@ class AbstractBoxerDrs(object):
return self
def __hash__(self):
- return hash("%s" % self)
+ return hash("{0}".format(self))
@python_2_unicode_compatible
diff --git a/nltk/sem/chat80.py b/nltk/sem/chat80.py
index 25bae0c..180c50a 100644
--- a/nltk/sem/chat80.py
+++ b/nltk/sem/chat80.py
@@ -1,7 +1,7 @@
# Natural Language Toolkit: Chat-80 KB Reader
# See http://www.w3.org/TR/swbp-skos-core-guide/
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan at inf.ed.ac.uk>,
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
@@ -129,8 +129,10 @@ import shelve
import os
import sys
+from six import string_types
+
import nltk.data
-from nltk.compat import string_types, python_2_unicode_compatible
+from nltk.compat import python_2_unicode_compatible
###########################################################################
# Chat-80 relation metadata bundles needed to build the valuation
@@ -505,7 +507,7 @@ def process_bundle(rels):
:param rels: bundle of metadata needed for constructing a concept
:type rels: list(dict)
:return: a dictionary of concepts, indexed by the relation name.
- :rtype: dict(str): Concept
+ :rtype: dict(str): Concept
"""
concepts = {}
for rel in rels:
@@ -641,7 +643,7 @@ def make_lex(symbols):
create a lexical rule for the proper name 'Zloty'.
:param symbols: a list of individual constants in the semantic representation
- :type symbols: sequence -- set(str)
+ :type symbols: sequence -- set(str)
:rtype: list(str)
"""
lex = []
@@ -778,5 +780,3 @@ def sql_demo():
if __name__ == '__main__':
main()
sql_demo()
-
-
diff --git a/nltk/sem/cooper_storage.py b/nltk/sem/cooper_storage.py
index 3a1878e..f1a7aab 100644
--- a/nltk/sem/cooper_storage.py
+++ b/nltk/sem/cooper_storage.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Cooper storage for Quantifier Ambiguity
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/sem/drt.py b/nltk/sem/drt.py
index 0c07762..bd64839 100644
--- a/nltk/sem/drt.py
+++ b/nltk/sem/drt.py
@@ -2,15 +2,18 @@
#
# Author: Dan Garrette <dhgarrette at gmail.com>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import print_function, unicode_literals
import operator
from functools import reduce
+from itertools import chain
-from nltk.compat import string_types, python_2_unicode_compatible
+from six import string_types
+
+from nltk.compat import python_2_unicode_compatible
from nltk.sem.logic import (APP, AbstractVariableExpression, AllExpression,
AndExpression, ApplicationExpression, BinaryExpression,
BooleanExpression, ConstantExpression, EqualityExpression,
@@ -22,10 +25,8 @@ from nltk.sem.logic import (APP, AbstractVariableExpression, AllExpression,
# Import Tkinter-based modules if they are available
try:
- # imports are fixed for Python 2.x by nltk.compat
- from tkinter import Canvas
- from tkinter import Tk
- from tkinter.font import Font
+ from six.moves.tkinter import Canvas, Tk
+ from six.moves.tkinter_font import Font
from nltk.util import in_idle
except ImportError:
@@ -339,7 +340,7 @@ class DRS(DrtExpression, Expression):
def get_refs(self, recursive=False):
""":see: AbstractExpression.get_refs()"""
if recursive:
- conds_refs = self.refs + sum((c.get_refs(True) for c in self.conds), [])
+ conds_refs = self.refs + list(chain(*(c.get_refs(True) for c in self.conds)))
if self.consequent:
conds_refs.extend(self.consequent.get_refs(True))
return conds_refs
@@ -1227,7 +1228,7 @@ def demo():
def test_draw():
try:
- from tkinter import Tk
+ from six.moves.tkinter import Tk
except ImportError:
from nose import SkipTest
raise SkipTest("tkinter is required, but it's not available.")
diff --git a/nltk/sem/drt_glue_demo.py b/nltk/sem/drt_glue_demo.py
index bb55042..4fe4a47 100644
--- a/nltk/sem/drt_glue_demo.py
+++ b/nltk/sem/drt_glue_demo.py
@@ -3,18 +3,14 @@
#
# Author: Dan Garrette <dhgarrette at gmail.com>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-
-from nltk import compat # this fixes tkinter imports for Python 2.x
-
try:
- from tkinter.font import Font
-
- from tkinter import (Button, Frame, IntVar, Label,
- Listbox, Menu, Scrollbar, Tk)
+ from six.moves.tkinter import (Button, Frame, IntVar, Label, Listbox, Menu,
+ Scrollbar, Tk)
+ from six.moves.tkinter_font import Font
from nltk.draw.util import CanvasFrame, ShowText
except ImportError:
@@ -346,7 +342,7 @@ class DrtGlueDemo(object):
"Written by Daniel H. Garrette")
TITLE = 'About: NLTK DRT Glue Demo'
try:
- from tkMessageBox import Message
+ from six.moves.tkinter_messagebox import Message
Message(message=ABOUT, title=TITLE).show()
except:
ShowText(self._top, TITLE, ABOUT)
diff --git a/nltk/sem/evaluate.py b/nltk/sem/evaluate.py
index 0c42cd5..f9cdb90 100644
--- a/nltk/sem/evaluate.py
+++ b/nltk/sem/evaluate.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Models for first-order languages with lambda
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan at inf.ed.ac.uk>,
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
@@ -19,9 +19,12 @@ from pprint import pformat
import inspect
import textwrap
import re
+import sys
+
+from six import string_types
from nltk.decorators import decorator # this used in code that is commented out
-from nltk.compat import string_types, python_2_unicode_compatible
+from nltk.compat import python_2_unicode_compatible
from nltk.sem.logic import (AbstractVariableExpression, AllExpression, Expression,
AndExpression, ApplicationExpression, EqualityExpression,
@@ -36,7 +39,10 @@ class Error(Exception): pass
class Undefined(Error): pass
def trace(f, *args, **kw):
- argspec = inspect.getargspec(f)
+ if sys.version_info[0] >= 3:
+ argspec = inspect.getfullargspec(f)
+ else:
+ argspec = inspect.getargspec(f)
d = dict(zip(argspec[0], args))
if d.pop('trace', None):
print()
diff --git a/nltk/sem/glue.py b/nltk/sem/glue.py
index 8c7541b..765ff3f 100644
--- a/nltk/sem/glue.py
+++ b/nltk/sem/glue.py
@@ -2,16 +2,18 @@
#
# Author: Dan Garrette <dhgarrette at gmail.com>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import print_function, division, unicode_literals
import os
+from itertools import chain
+
+from six import string_types
import nltk
from nltk.internals import Counter
-from nltk.compat import string_types
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, RegexpTagger
from nltk.sem.logic import (Expression, Variable, VariableExpression,
LambdaExpression, AbstractVariableExpression)
@@ -235,13 +237,13 @@ class GlueDict(dict):
if node is None:
# TODO: should it be depgraph.root? Is this code tested?
top = depgraph.nodes[0]
- depList = sum(list(top['deps'].values()), [])
+ depList = list(chain(*top['deps'].values()))
root = depgraph.nodes[depList[0]]
return self.to_glueformula_list(depgraph, root, Counter(), verbose)
glueformulas = self.lookup(node, depgraph, counter)
- for dep_idx in sum(list(node['deps'].values()), []):
+ for dep_idx in chain(*node['deps'].values()):
dep = depgraph.nodes[dep_idx]
glueformulas.extend(self.to_glueformula_list(depgraph, dep, counter, verbose))
return glueformulas
@@ -285,7 +287,7 @@ class GlueDict(dict):
def _lookup_semtype_option(self, semtype, node, depgraph):
relationships = frozenset(
depgraph.nodes[dep]['rel'].lower()
- for dep in sum(list(node['deps'].values()), [])
+ for dep in chain(*node['deps'].values())
if depgraph.nodes[dep]['rel'].lower() not in OPTIONAL_RELATIONSHIPS
)
@@ -418,7 +420,7 @@ class GlueDict(dict):
"""
deps = [
depgraph.nodes[dep]
- for dep in sum(list(node['deps'].values()), [])
+ for dep in chain(*node['deps'].values())
if depgraph.nodes[dep]['rel'].lower() == rel.lower()
]
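
The repeated replacement of sum(list(node['deps'].values()), []) with itertools.chain in this file (and in lfg.py below) flattens the dependency index lists in a single pass instead of concatenating intermediate lists. A small standalone comparison (the deps dict here is a made-up stand-in for a DependencyGraph node's 'deps' field):

    from itertools import chain

    deps = {'nsubj': [2], 'dobj': [5, 7], 'punct': [9]}

    # old style: repeated list concatenation
    flat_sum = sum(list(deps.values()), [])

    # new style: one pass over all the index lists
    flat_chain = list(chain(*deps.values()))

    print(sorted(flat_sum) == sorted(flat_chain))   # True
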
diff --git a/nltk/sem/hole.py b/nltk/sem/hole.py
index 25aaada..fe39369 100644
--- a/nltk/sem/hole.py
+++ b/nltk/sem/hole.py
@@ -3,7 +3,7 @@
# Author: Peter Wang
# Updated by: Dan Garrette <dhgarrette at gmail.com>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
@@ -23,6 +23,8 @@ from __future__ import print_function, unicode_literals
from functools import reduce
+from six import itervalues
+
from nltk import compat
from nltk.parse import load_parser
@@ -129,7 +131,7 @@ class HoleSemantics(object):
def _find_top_nodes(self, node_list):
top_nodes = node_list.copy()
- for f in compat.itervalues(self.fragments):
+ for f in itervalues(self.fragments):
# the label is the first argument of the predicate
args = f[1]
for arg in args:
diff --git a/nltk/sem/lfg.py b/nltk/sem/lfg.py
index 610d641..85b3353 100644
--- a/nltk/sem/lfg.py
+++ b/nltk/sem/lfg.py
@@ -2,11 +2,13 @@
#
# Author: Dan Garrette <dhgarrette at gmail.com>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import print_function, division, unicode_literals
+from itertools import chain
+
from nltk.internals import Counter
from nltk.compat import python_2_unicode_compatible
@@ -117,7 +119,7 @@ class FStructure(dict):
if not fstruct.pred:
fstruct.pred = (word, tag)
- children = [depgraph.nodes[idx] for idx in sum(list(node['deps'].values()), [])]
+ children = [depgraph.nodes[idx] for idx in chain(*node['deps'].values())]
for child in children:
fstruct.safeappend(child['rel'], FStructure._read_depgraph(child, depgraph, label_counter, fstruct))
diff --git a/nltk/sem/linearlogic.py b/nltk/sem/linearlogic.py
index cfbb615..38457a7 100644
--- a/nltk/sem/linearlogic.py
+++ b/nltk/sem/linearlogic.py
@@ -2,13 +2,15 @@
#
# Author: Dan Garrette <dhgarrette at gmail.com>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import print_function, unicode_literals
+from six import string_types
+
from nltk.internals import Counter
-from nltk.compat import string_types, python_2_unicode_compatible
+from nltk.compat import python_2_unicode_compatible
from nltk.sem.logic import LogicParser, APP
_counter = Counter()
diff --git a/nltk/sem/logic.py b/nltk/sem/logic.py
index aa54e8e..dd144d9 100644
--- a/nltk/sem/logic.py
+++ b/nltk/sem/logic.py
@@ -2,7 +2,7 @@
#
# Author: Dan Garrette <dhgarrette at gmail.com>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
@@ -15,12 +15,13 @@ from __future__ import print_function, unicode_literals
import re
import operator
from collections import defaultdict
-from functools import reduce
+from functools import reduce, total_ordering
+
+from six import string_types
from nltk.util import Trie
from nltk.internals import Counter
-from nltk.compat import (total_ordering, string_types,
- python_2_unicode_compatible)
+from nltk.compat import python_2_unicode_compatible
APP = 'APP'
diff --git a/nltk/sem/relextract.py b/nltk/sem/relextract.py
index 26d4fec..a54b5aa 100644
--- a/nltk/sem/relextract.py
+++ b/nltk/sem/relextract.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Relation Extraction
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -25,7 +25,8 @@ from __future__ import print_function
from collections import defaultdict
import re
-from nltk.compat import htmlentitydefs
+
+from six.moves import html_entities
# Dictionary that associates corpora with NE classes
NE_CLASSES = {
@@ -79,7 +80,7 @@ def _join(lst, sep=' ', untag=False):
from nltk.tag import tuple2str
return sep.join(tuple2str(tup) for tup in lst)
-def descape_entity(m, defs=htmlentitydefs.entitydefs):
+def descape_entity(m, defs=html_entities.entitydefs):
"""
Translate one entity to its ISO Latin value.
Inspired by example from effbot.org
@@ -114,7 +115,7 @@ def list2sym(lst):
def tree2semi_rel(tree):
"""
- Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).
+ Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).
In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
identifies pairs whose first member is a list (possibly empty) of terminal
@@ -209,7 +210,7 @@ def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10
objclass = _expand(objclass)
else:
raise ValueError("your value for the object type has not been recognized: %s" % objclass)
-
+
if corpus == 'ace' or corpus == 'conll2002':
pairs = tree2semi_rel(doc)
elif corpus == 'ieer':
@@ -334,7 +335,7 @@ def roles_demo(trace=0):
commissioner|
counsel|
director|
- economist|
+ economist|
editor|
executive|
foreman|
@@ -379,10 +380,10 @@ def ieer_headlines():
from nltk.corpus import ieer
from nltk.tree import Tree
-
+
print("IEER: First 20 Headlines")
- print("=" * 45)
-
+ print("=" * 45)
+
trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
for tree in trees[:20]:
print()
@@ -424,7 +425,7 @@ def conllned(trace=1):
if trace:
lcon = rcon = True
for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN, window=10):
- print(rtuple(rel, lcon=True, rcon=True))
+ print(rtuple(rel, lcon=lcon, rcon=rcon))
#############################################
## Spanish CONLL2002: (PER, ORG)
@@ -473,11 +474,3 @@ if __name__ == '__main__':
conllesp()
ieer_headlines()
ne_chunked()
-
-
-
-
-
-
-
-
diff --git a/nltk/sem/skolemize.py b/nltk/sem/skolemize.py
index f9ced7f..1c5c03f 100644
--- a/nltk/sem/skolemize.py
+++ b/nltk/sem/skolemize.py
@@ -2,7 +2,7 @@
#
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/sem/util.py b/nltk/sem/util.py
index 9e3c710..edfcb0f 100644
--- a/nltk/sem/util.py
+++ b/nltk/sem/util.py
@@ -2,7 +2,7 @@
#
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/sentiment/__init__.py b/nltk/sentiment/__init__.py
index 08a0336..6f879bf 100644
--- a/nltk/sentiment/__init__.py
+++ b/nltk/sentiment/__init__.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Sentiment Analysis
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/sentiment/sentiment_analyzer.py b/nltk/sentiment/sentiment_analyzer.py
index 9429075..4fd18fb 100644
--- a/nltk/sentiment/sentiment_analyzer.py
+++ b/nltk/sentiment/sentiment_analyzer.py
@@ -2,7 +2,7 @@
#
# Natural Language Toolkit: Sentiment Analyzer
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/sentiment/util.py b/nltk/sentiment/util.py
index 5b3dacb..a26a2b5 100644
--- a/nltk/sentiment/util.py
+++ b/nltk/sentiment/util.py
@@ -2,7 +2,7 @@
#
# Natural Language Toolkit: Sentiment Analyzer
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -10,6 +10,7 @@
"""
Utility methods for Sentiment Analysis.
"""
+from __future__ import division
from copy import deepcopy
import codecs
@@ -72,8 +73,8 @@ def timer(method):
result = method(*args, **kw)
end = time.time()
tot_time = end - start
- hours = int(tot_time / 3600)
- mins = int((tot_time / 60) % 60)
+ hours = tot_time // 3600
+ mins = tot_time // 60 % 60
# in Python 2.x round() will return a float, so we convert it to int
secs = int(round(tot_time % 60))
if hours == 0 and mins == 0 and secs < 10:
diff --git a/nltk/sentiment/vader.py b/nltk/sentiment/vader.py
index 9eb5514..72e0ed9 100644
--- a/nltk/sentiment/vader.py
+++ b/nltk/sentiment/vader.py
@@ -1,10 +1,11 @@
# coding: utf-8
# Natural Language Toolkit: vader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: C.J. Hutto <Clayton.Hutto at gtri.gatech.edu>
# Ewan Klein <ewan at inf.ed.ac.uk> (modifications)
# Pierpaolo Pantone <24alsecondo at gmail.com> (modifications)
+# George Berry <geb97 at cornell.edu> (modifications)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
@@ -24,6 +25,7 @@ import codecs
import math
import re
import string
+from itertools import product
import nltk.data
##Constants##
@@ -39,7 +41,7 @@ C_INCR = 0.733
N_SCALAR = -0.74
# for removing punctuation
-REGEX_REMOVE_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))
+REGEX_REMOVE_PUNCTUATION = re.compile('[{0}]'.format(re.escape(string.punctuation)))
PUNC_LIST = [".", "!", "?", ",", ";", ":", "-", "'", "\"",
"!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?"]
@@ -160,37 +162,38 @@ class SentiText(object):
# adjacent punctuation (keeps emoticons & contractions)
self.is_cap_diff = allcap_differential(self.words_and_emoticons)
- def _words_only(self):
- text_mod = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
+ def _words_plus_punc(self):
+ """
+ Returns mapping of form:
+ {
+ 'cat,': 'cat',
+ ',cat': 'cat',
+ }
+ """
+ no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
# removes punctuation (but loses emoticons & contractions)
- words_only = text_mod.split()
- # get rid of empty items or single letter "words" like 'a' and 'I'
- words_only = [word for word in words_only if len(word) > 1]
- return words_only
+ words_only = no_punc_text.split()
+ # remove singletons
+ words_only = set( w for w in words_only if len(w) > 1 )
+ # the product gives ('cat', ',') and (',', 'cat')
+ punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
+ punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}
+ words_punc_dict = punc_before
+ words_punc_dict.update(punc_after)
+ return words_punc_dict
def _words_and_emoticons(self):
+ """
+ Removes leading and trailing punctuation
+ Leaves contractions and most emoticons
+ Does not preserve punc-plus-letter emoticons (e.g. :D)
+ """
wes = self.text.split()
-
- # get rid of residual empty items or single letter words
+ words_punc_dict = self._words_plus_punc()
wes = [we for we in wes if len(we) > 1]
-
- for word in self._words_only():
- for punct in PUNC_LIST:
- pword = punct + word
- x1 = wes.count(pword)
- while x1 > 0:
- i = wes.index(pword)
- wes.remove(pword)
- wes.insert(i, word)
- x1 = wes.count(pword)
-
- wordp = word + punct
- x2 = wes.count(wordp)
- while x2 > 0:
- i = wes.index(wordp)
- wes.remove(wordp)
- wes.insert(i, word)
- x2 = wes.count(wordp)
+ for i, we in enumerate(wes):
+ if we in words_punc_dict:
+ wes[i] = words_punc_dict[we]
return wes
class SentimentIntensityAnalyzer(object):
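
The new _words_plus_punc helper replaces the repeated list scans of the old _words_only approach with a single lookup table, so each token is normalised by one dictionary access. A rough standalone sketch of the same idea (PUNC_LIST here is a shortened stand-in for the module constant):

    from itertools import product

    PUNC_LIST = ['.', ',', '!', '?']
    words = {'cat', 'sat'}

    # build keys like 'cat,' and ',cat' that all map back to the bare word
    punc_map = {''.join(p): p[1] for p in product(PUNC_LIST, words)}
    punc_map.update({''.join(p): p[0] for p in product(words, PUNC_LIST)})

    tokens = ['the', 'cat,', 'sat.', ':)']
    print([punc_map.get(tok, tok) for tok in tokens])   # ['the', 'cat', 'sat', ':)']
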
diff --git a/nltk/stem/__init__.py b/nltk/stem/__init__.py
index a7deafc..6886f7b 100644
--- a/nltk/stem/__init__.py
+++ b/nltk/stem/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Stemmers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn at cs.mu.oz.au>
# Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
@@ -29,5 +29,3 @@ from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.rslp import RSLPStemmer
-
-
diff --git a/nltk/stem/api.py b/nltk/stem/api.py
index 5866a0c..92ab73d 100644
--- a/nltk/stem/api.py
+++ b/nltk/stem/api.py
@@ -1,18 +1,24 @@
# Natural Language Toolkit: Stemmer Interface
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn at cs.mu.oz.au>
# Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
+
+ at add_metaclass(ABCMeta)
class StemmerI(object):
"""
A processing interface for removing morphological affixes from
words. This process is known as stemming.
"""
+ @abstractmethod
def stem(self, token):
"""
Strip affixes from the token and return the stem.
@@ -20,6 +26,3 @@ class StemmerI(object):
:param token: The token that should be stemmed.
:type token: str
"""
- raise NotImplementedError()
-
-
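
With StemmerI declared abstract in the same way, a custom stemmer only has to implement stem(). A minimal sketch (SuffixStripper and its single rule are invented for illustration):

    from nltk.stem.api import StemmerI

    class SuffixStripper(StemmerI):
        """Toy stemmer that drops a final plural -s."""
        def stem(self, token):
            return token[:-1] if token.endswith('s') and len(token) > 3 else token

    print(SuffixStripper().stem('tables'))  # 'table'
    print(SuffixStripper().stem('bus'))     # 'bus' (too short, left unchanged)
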
diff --git a/nltk/stem/isri.py b/nltk/stem/isri.py
index 4e1bcde..44c187a 100644
--- a/nltk/stem/isri.py
+++ b/nltk/stem/isri.py
@@ -2,7 +2,7 @@
#
# Natural Language Toolkit: The ISRI Arabic Stemmer
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Algorithm: Kazem Taghva, Rania Elkhoury, and Jeffrey Coombs (2005)
# Author: Hosam Algasaier <hosam_hme at yahoo.com>
# URL: <http://nltk.org/>
diff --git a/nltk/stem/lancaster.py b/nltk/stem/lancaster.py
index 37f8de5..e7e3b47 100644
--- a/nltk/stem/lancaster.py
+++ b/nltk/stem/lancaster.py
@@ -1,12 +1,12 @@
# Natural Language Toolkit: Stemmers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Tomcavage <stomcava at law.upenn.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
-A word stemmer based on the Lancaster stemming algorithm.
+A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
"""
from __future__ import unicode_literals
@@ -44,10 +44,16 @@ class LancasterStemmer(StemmerI):
'meant'
>>> st.stem('cement') # ditto
'cem'
+ >>> st_pre = LancasterStemmer(strip_prefix_flag=True)
+ >>> st_pre.stem('kilometer') # Test Prefix
+ 'met'
+ >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))
+ >>> st_custom.stem("ness") # Change s to t
+ 'nest'
"""
# The rule list is static since it doesn't change between instances
- rule_tuple = (
+ default_rule_tuple = (
"ai*2.", # -ia > - if intact
"a*1.", # -a > - if intact
"bb1.", # -bb > -b
@@ -165,23 +171,33 @@ class LancasterStemmer(StemmerI):
"zy1s." # -yz > -ys
)
-
- def __init__(self):
+ def __init__(self, rule_tuple=None, strip_prefix_flag=False):
"""Create an instance of the Lancaster stemmer.
"""
# Setup an empty rule dictionary - this will be filled in later
self.rule_dictionary = {}
+ # Check if a user wants to strip prefix
+ self._strip_prefix = strip_prefix_flag
+ # Check if a user wants to use his/her own rule tuples.
+ self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple
- def parseRules(self, rule_tuple):
+ def parseRules(self, rule_tuple=None):
"""Validate the set of rules used in this stemmer.
+
+ If this function is called as an individual method, without using the stem
+ method, the rule_tuple argument will be compiled into self.rule_dictionary.
+ If this function is called within stem, self._rule_tuple will be used.
+
"""
+ # If there is no argument for the function, use class' own rule tuple.
+ rule_tuple = rule_tuple if rule_tuple else self._rule_tuple
valid_rule = re.compile("^[a-z]+\*?\d[a-z]*[>\.]?$")
# Empty any old rules from the rule set before adding new ones
self.rule_dictionary = {}
for rule in rule_tuple:
if not valid_rule.match(rule):
- raise ValueError("The rule %s is invalid" % rule)
+ raise ValueError("The rule {0} is invalid".format(rule))
first_letter = rule[0:1]
if first_letter in self.rule_dictionary:
self.rule_dictionary[first_letter].append(rule)
@@ -193,13 +209,14 @@ class LancasterStemmer(StemmerI):
"""
# Lower-case the word, since all the rules are lower-cased
word = word.lower()
+ word = self.__stripPrefix(word) if self._strip_prefix else word
# Save a copy of the original word
intact_word = word
- # If the user hasn't supplied any rules, setup the default rules
- if len(self.rule_dictionary) == 0:
- self.parseRules(LancasterStemmer.rule_tuple)
+ # If rule dictionary is empty, parse rule tuple.
+ if not self.rule_dictionary:
+ self.parseRules()
return self.__doStemming(word, intact_word)
@@ -304,7 +321,17 @@ class LancasterStemmer(StemmerI):
word += append_string
return word
- def __repr__(self):
- return '<LancasterStemmer>'
+ def __stripPrefix(self, word):
+ """Remove prefix from a word.
+ This function was originally taken from Whoosh.
+ """
+ for prefix in ("kilo", "micro", "milli", "intra", "ultra", "mega",
+ "nano", "pico", "pseudo"):
+ if word.startswith(prefix):
+ return word[len(prefix):]
+ return word
+
+ def __repr__(self):
+ return '<LancasterStemmer>'
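
The doctest above covers the two new constructor options; for reference, a brief sketch of how they are passed (the custom rules are the same two used in the doctest and do not form a complete rule table):

    from nltk.stem.lancaster import LancasterStemmer

    # default: full default_rule_tuple, no prefix stripping
    print(LancasterStemmer().stem('maximum'))                          # 'maxim'

    # strip a recognised prefix such as 'kilo' before applying the rules
    print(LancasterStemmer(strip_prefix_flag=True).stem('kilometer'))  # 'met'

    # supply a custom (partial) rule table instead of default_rule_tuple
    print(LancasterStemmer(rule_tuple=("ssen4>", "s1t.")).stem('ness'))  # 'nest'
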
diff --git a/nltk/stem/porter.py b/nltk/stem/porter.py
index 2db3b18..db68050 100644
--- a/nltk/stem/porter.py
+++ b/nltk/stem/porter.py
@@ -1,93 +1,25 @@
-# Copyright (c) 2002 Vivake Gupta (vivakeATomniscia.org). All rights reserved.
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License as
-# published by the Free Software Foundation; either version 2 of the
-# License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
-# USA
-#
-# This software is maintained by Vivake (vivakeATomniscia.org) and is available at:
-# http://www.omniscia.org/~vivake/python/PorterStemmer.py
-#
-# Additional modifications were made to incorporate this module into
-# NLTK. All such modifications are marked with "--NLTK--". The NLTK
-# version of this module is maintained by NLTK developers,
-# and is available via http://nltk.org/
-#
-# GNU Linking Exception:
-# Using this module statically or dynamically with other modules is
-# making a combined work based on this module. Thus, the terms and
-# conditions of the GNU General Public License cover the whole combination.
-# As a special exception, the copyright holders of this module give
-# you permission to combine this module with independent modules to
-# produce an executable program, regardless of the license terms of these
-# independent modules, and to copy and distribute the resulting
-# program under terms of your choice, provided that you also meet,
-# for each linked independent module, the terms and conditions of
-# the license of that module. An independent module is a module which
-# is not derived from or based on this module. If you modify this module,
-# you may extend this exception to your version of the module, but you
-# are not obliged to do so. If you do not wish to do so, delete this
-# exception statement from your version.
-
"""
Porter Stemmer
-This is the Porter stemming algorithm, ported to Python from the
-version coded up in ANSI C by the author. It follows the algorithm
+This is the Porter stemming algorithm. It follows the algorithm
presented in
Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137.
-only differing from it at the points marked --DEPARTURE-- and --NEW--
-below.
+with some optional deviations that can be turned on or off with the
+`mode` argument to the constructor.
-For a more faithful version of the Porter algorithm, see
+Martin Porter, the algorithm's inventor, maintains a web page about the
+algorithm at
http://www.tartarus.org/~martin/PorterStemmer/
-Later additions:
-
- June 2000
-
- The 'l' of the 'logi' -> 'log' rule is put with the stem, so that
- short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc.
-
- This follows a suggestion of Barry Wilkins, research student at
- Birmingham.
-
-
- February 2000
-
- the cvc test for not dropping final -e now looks after vc at the
- beginning of a word, so are, eve, ice, ore, use keep final -e. In this
- test c is any consonant, including w, x and y. This extension was
- suggested by Chris Emerson.
-
- -fully -> -ful treated like -fulness -> -ful, and
- -tionally -> -tion treated like -tional -> -tion
-
- both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi.
-
- Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh.
-
-Additional modifications were made to incorperate this module into
-nltk. All such modifications are marked with \"--NLTK--\".
+which includes another Python implementation and other implementations
+in many languages.
"""
from __future__ import print_function, unicode_literals
-## --NLTK--
-## Declare this module's documentation format.
__docformat__ = 'plaintext'
import re
@@ -97,563 +29,653 @@ from nltk.compat import python_2_unicode_compatible
@python_2_unicode_compatible
class PorterStemmer(StemmerI):
-
- ## --NLTK--
- ## Add a module docstring
"""
A word stemmer based on the Porter stemming algorithm.
- Porter, M. \"An algorithm for suffix stripping.\"
+ Porter, M. "An algorithm for suffix stripping."
Program 14.3 (1980): 130-137.
-
- A few minor modifications have been made to Porter's basic
- algorithm. See the source code of this module for more
- information.
-
- The Porter Stemmer requires that all tokens have string types.
+
+ See http://www.tartarus.org/~martin/PorterStemmer/ for the homepage
+ of the algorithm.
+
+ Martin Porter has endorsed several modifications to the Porter
+ algorithm since writing his original paper, and those extensions are
+ included in the implementations on his website. Additionally, others
+ have proposed further improvements to the algorithm, including NLTK
+ contributors. There are thus three modes that can be selected by
+ passing the appropriate constant to the class constructor's `mode`
+ attribute:
+
+ PorterStemmer.ORIGINAL_ALGORITHM
+ - Implementation that is faithful to the original paper.
+
+ Note that Martin Porter has deprecated this version of the
+ algorithm. Martin distributes implementations of the Porter
+ Stemmer in many languages, hosted at:
+
+ http://www.tartarus.org/~martin/PorterStemmer/
+
+ and all of these implementations include his extensions. He
+ strongly recommends against using the original, published
+ version of the algorithm; only use this mode if you clearly
+ understand why you are choosing to do so.
+
+ PorterStemmer.MARTIN_EXTENSIONS
+ - Implementation that only uses the modifications to the
+ algorithm that are included in the implementations on Martin
+ Porter's website. He has declared Porter frozen, so the
+ behaviour of those implementations should never change.
+
+ PorterStemmer.NLTK_EXTENSIONS (default)
+ - Implementation that includes further improvements devised by
+ NLTK contributors or taken from other modified implementations
+ found on the web.
+
+ For the best stemming, you should use the default NLTK_EXTENSIONS
+ version. However, if you need to get the same results as either the
+ original algorithm or one of Martin Porter's hosted versions for
+ compatibility with an existing implementation or dataset, you can use
+ one of the other modes instead.
"""
-
- # The main part of the stemming algorithm starts here.
- # Note that only lower case sequences are stemmed. Forcing to lower case
- # should be done before stem(...) is called.
-
- def __init__(self):
-
- ## --NEW--
- ## This is a table of irregular forms. It is quite short, but still
- ## reflects the errors actually drawn to Martin Porter's attention over
- ## a 20 year period!
- ##
- ## Extend it as necessary.
- ##
- ## The form of the table is:
- ## {
- ## "p1" : ["s11","s12","s13", ... ],
- ## "p2" : ["s21","s22","s23", ... ],
- ## ...
- ## "pn" : ["sn1","sn2","sn3", ... ]
- ## }
- ##
- ## String sij is mapped to paradigm form pi, and the main stemming
- ## process is then bypassed.
-
- irregular_forms = {
- "sky" : ["sky", "skies"],
- "die" : ["dying"],
- "lie" : ["lying"],
- "tie" : ["tying"],
- "news" : ["news"],
- "inning" : ["innings", "inning"],
- "outing" : ["outings", "outing"],
- "canning" : ["cannings", "canning"],
- "howe" : ["howe"],
-
- # --NEW--
- "proceed" : ["proceed"],
- "exceed" : ["exceed"],
- "succeed" : ["succeed"], # Hiranmay Ghosh
+
+ # Modes the Stemmer can be instantiated in
+ NLTK_EXTENSIONS = 'NLTK_EXTENSIONS'
+ MARTIN_EXTENSIONS = 'MARTIN_EXTENSIONS'
+ ORIGINAL_ALGORITHM = 'ORIGINAL_ALGORITHM'
+
+ def __init__(self, mode=NLTK_EXTENSIONS):
+ if mode not in (
+ self.NLTK_EXTENSIONS,
+ self.MARTIN_EXTENSIONS,
+ self.ORIGINAL_ALGORITHM
+ ):
+ raise ValueError(
+ "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
+ "PorterStemmer.MARTIN_EXTENSIONS, or "
+ "PorterStemmer.ORIGINAL_ALGORITHM"
+ )
+
+ self.mode = mode
+
+ if self.mode == self.NLTK_EXTENSIONS:
+ # This is a table of irregular forms. It is quite short,
+ # but still reflects the errors actually drawn to Martin
+ # Porter's attention over a 20 year period!
+ irregular_forms = {
+ "sky" : ["sky", "skies"],
+ "die" : ["dying"],
+ "lie" : ["lying"],
+ "tie" : ["tying"],
+ "news" : ["news"],
+ "inning" : ["innings", "inning"],
+ "outing" : ["outings", "outing"],
+ "canning" : ["cannings", "canning"],
+ "howe" : ["howe"],
+ "proceed" : ["proceed"],
+ "exceed" : ["exceed"],
+ "succeed" : ["succeed"],
}
- self.pool = {}
- for key in irregular_forms:
- for val in irregular_forms[key]:
- self.pool[val] = key
+ self.pool = {}
+ for key in irregular_forms:
+ for val in irregular_forms[key]:
+ self.pool[val] = key
self.vowels = frozenset(['a', 'e', 'i', 'o', 'u'])
- def _cons(self, word, i):
- """cons(i) is TRUE <=> b[i] is a consonant."""
+ def _is_consonant(self, word, i):
+ """Returns True if word[i] is a consonant, False otherwise
+
+ A consonant is defined in the paper as follows:
+
+ A consonant in a word is a letter other than A, E, I, O or
+ U, and other than Y preceded by a consonant. (The fact that
+ the term `consonant' is defined to some extent in terms of
+ itself does not make it ambiguous.) So in TOY the consonants
+ are T and Y, and in SYZYGY they are S, Z and G. If a letter
+ is not a consonant it is a vowel.
+ """
if word[i] in self.vowels:
return False
if word[i] == 'y':
if i == 0:
return True
else:
- return (not self._cons(word, i - 1))
+ return (not self._is_consonant(word, i - 1))
return True
-
- def _m(self, word, j):
- """m() measures the number of consonant sequences between k0 and j.
- if c is a consonant sequence and v a vowel sequence, and <..>
- indicates arbitrary presence,
-
- <c><v> gives 0
- <c>vc<v> gives 1
- <c>vcvc<v> gives 2
- <c>vcvcvc<v> gives 3
- ....
+
+ def _measure(self, stem):
+ """Returns the 'measure' of stem, per definition in the paper
+
+ From the paper:
+
+ A consonant will be denoted by c, a vowel by v. A list
+ ccc... of length greater than 0 will be denoted by C, and a
+ list vvv... of length greater than 0 will be denoted by V.
+ Any word, or part of a word, therefore has one of the four
+ forms:
+
+ CVCV ... C
+ CVCV ... V
+ VCVC ... C
+ VCVC ... V
+
+ These may all be represented by the single form
+
+ [C]VCVC ... [V]
+
+ where the square brackets denote arbitrary presence of their
+ contents. Using (VC){m} to denote VC repeated m times, this
+ may again be written as
+
+ [C](VC){m}[V].
+
+ m will be called the "measure" of any word or word part when
+ represented in this form. The case m = 0 covers the null
+ word. Here are some examples:
+
+ m=0 TR, EE, TREE, Y, BY.
+ m=1 TROUBLE, OATS, TREES, IVY.
+ m=2 TROUBLES, PRIVATE, OATEN, ORRERY.
"""
- n = 0
- i = 0
- while True:
- if i > j:
- return n
- if not self._cons(word, i):
- break
- i = i + 1
- i = i + 1
-
- while True:
- while True:
- if i > j:
- return n
- if self._cons(word, i):
- break
- i = i + 1
- i = i + 1
- n = n + 1
-
- while True:
- if i > j:
- return n
- if not self._cons(word, i):
- break
- i = i + 1
- i = i + 1
-
- def _vowelinstem(self, stem):
- """vowelinstem(stem) is TRUE <=> stem contains a vowel"""
+ cv_sequence = ''
+
+ # Construct a string of 'c's and 'v's representing whether each
+ # character in `stem` is a consonant or a vowel.
+ # e.g. 'falafel' becomes 'cvcvcvc',
+ # 'architecture' becomes 'vcccvcvccvcv'
for i in range(len(stem)):
- if not self._cons(stem, i):
+ if self._is_consonant(stem, i):
+ cv_sequence += 'c'
+ else:
+ cv_sequence += 'v'
+
+ # Count the number of 'vc' occurrences, which is equivalent to
+ # the number of 'VC' occurrences in Porter's reduced form in the
+ # docstring above, which is in turn equivalent to `m`
+ return cv_sequence.count('vc')
+
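
Concretely, 'tree' has consonant/vowel shape 'ccvv' and measure 0, while 'trouble' ('ccvvccv') has measure 1, matching the examples in the paper. A tiny standalone check of the counting trick (this simplification treats 'y' as a consonant, unlike _is_consonant, which makes no difference for these sample words):

    VOWELS = set('aeiou')

    def measure(stem):
        # count 'vc' transitions in the consonant/vowel shape of the stem
        shape = ''.join('v' if ch in VOWELS else 'c' for ch in stem)
        return shape.count('vc')

    for w in ('tr', 'tree', 'trouble', 'troubles', 'private'):
        print(w, measure(w))   # 0, 0, 1, 2, 2
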
+ def _has_positive_measure(self, stem):
+ return self._measure(stem) > 0
+
+ def _contains_vowel(self, stem):
+ """Returns True if stem contains a vowel, else False"""
+ for i in range(len(stem)):
+ if not self._is_consonant(stem, i):
return True
return False
-
- def _doublec(self, word):
- """doublec(word) is TRUE <=> word ends with a double consonant"""
- if len(word) < 2:
- return False
- if (word[-1] != word[-2]):
- return False
- return self._cons(word, len(word)-1)
-
- def _cvc(self, word, i):
- """cvc(i) is TRUE <=>
-
- a) ( --NEW--) i == 1, and word[0] word[1] is vowel consonant, or
-
- b) word[i - 2], word[i - 1], word[i] has the form consonant -
- vowel - consonant and also if the second c is not w, x or y. this
- is used when trying to restore an e at the end of a short word.
- e.g.
-
- cav(e), lov(e), hop(e), crim(e), but
- snow, box, tray.
- """
- if i == 0: return False # i == 0 never happens perhaps
- if i == 1: return (not self._cons(word, 0) and self._cons(word, 1))
- if not self._cons(word, i) or self._cons(word, i-1) or not self._cons(word, i-2): return False
-
- ch = word[i]
- if ch == 'w' or ch == 'x' or ch == 'y':
- return False
-
- return True
-
- def _step1ab(self, word):
- """step1ab() gets rid of plurals and -ed or -ing. e.g.
-
- caresses -> caress
- ponies -> poni
- sties -> sti
- tie -> tie (--NEW--: see below)
- caress -> caress
- cats -> cat
-
- feed -> feed
- agreed -> agree
- disabled -> disable
-
- matting -> mat
- mating -> mate
- meeting -> meet
- milling -> mill
- messing -> mess
-
- meetings -> meet
+
+ def _ends_double_consonant(self, word):
+ """Implements condition *d from the paper
+
+ Returns True if word ends with a double consonant
"""
- if word[-1] == 's':
- if word.endswith("sses"):
- word = word[:-2]
- elif word.endswith("ies"):
- if len(word) == 4:
- word = word[:-1]
- # this line extends the original algorithm, so that
- # 'flies'->'fli' but 'dies'->'die' etc
- else:
- word = word[:-2]
- elif word[-2] != 's':
- word = word[:-1]
-
- ed_or_ing_trimmed = False
- if word.endswith("ied"):
- if len(word) == 4:
- word = word[:-1]
- else:
- word = word[:-2]
- # this line extends the original algorithm, so that
- # 'spied'->'spi' but 'died'->'die' etc
-
- elif word.endswith("eed"):
- if self._m(word, len(word)-4) > 0:
- word = word[:-1]
-
-
- elif word.endswith("ed") and self._vowelinstem(word[:-2]):
- word = word[:-2]
- ed_or_ing_trimmed = True
- elif word.endswith("ing") and self._vowelinstem(word[:-3]):
- word = word[:-3]
- ed_or_ing_trimmed = True
-
- if ed_or_ing_trimmed:
- if word.endswith("at") or word.endswith("bl") or word.endswith("iz"):
- word += 'e'
- elif self._doublec(word):
- if word[-1] not in ['l', 's', 'z']:
- word = word[:-1]
- elif (self._m(word, len(word)-1) == 1 and self._cvc(word, len(word)-1)):
- word += 'e'
-
- return word
-
- def _step1c(self, word):
- """step1c() turns terminal y to i when there is another vowel in the stem.
- --NEW--: This has been modified from the original Porter algorithm so that y->i
- is only done when y is preceded by a consonant, but not if the stem
- is only a single consonant, i.e.
-
- (*c and not c) Y -> I
-
- So 'happy' -> 'happi', but
- 'enjoy' -> 'enjoy' etc
-
- This is a much better rule. Formerly 'enjoy'->'enjoi' and 'enjoyment'->
- 'enjoy'. Step 1c is perhaps done too soon; but with this modification that
- no longer really matters.
-
- Also, the removal of the vowelinstem(z) condition means that 'spy', 'fly',
- 'try' ... stem to 'spi', 'fli', 'tri' and conflate with 'spied', 'tried',
- 'flies' ...
+ return (
+ len(word) >= 2 and
+ word[-1] == word[-2] and
+ self._is_consonant(word, len(word)-1)
+ )
+
+ def _ends_cvc(self, word):
+ """Implements condition *o from the paper
+
+ From the paper:
+
+ *o - the stem ends cvc, where the second c is not W, X or Y
+ (e.g. -WIL, -HOP).
"""
- if word[-1] == 'y' and len(word) > 2 and self._cons(word, len(word) - 2):
- return word[:-1] + 'i'
+ return (
+ len(word) >= 3 and
+ self._is_consonant(word, len(word) - 3) and
+ not self._is_consonant(word, len(word) - 2) and
+ self._is_consonant(word, len(word) - 1) and
+ word[-1] not in ('w', 'x', 'y')
+ ) or (
+ self.mode == self.NLTK_EXTENSIONS and
+ len(word) == 2 and
+ not self._is_consonant(word, 0) and
+ self._is_consonant(word, 1)
+ )
+
+ def _replace_suffix(self, word, suffix, replacement):
+ """Replaces `suffix` of `word` with `replacement"""
+ assert word.endswith(suffix), "Given word doesn't end with given suffix"
+ if suffix == '':
+ return word + replacement
else:
- return word
-
- def _step2(self, word):
- """step2() maps double suffices to single ones.
- so -ization ( = -ize plus -ation) maps to -ize etc. note that the
- string before the suffix must give m() > 0.
+ return word[:-len(suffix)] + replacement
+
+ def _apply_rule_list(self, word, rules):
+ """Applies the first applicable suffix-removal rule to the word
+
+ Takes a word and a list of suffix-removal rules represented as
+ 3-tuples, with the first element being the suffix to remove,
+ the second element being the string to replace it with, and the
+ final element being the condition for the rule to be applicable,
+ or None if the rule is unconditional.
"""
- if len(word) <= 1: # Only possible at this stage given unusual inputs to stem_word like 'oed'
- return word
-
- ch = word[-2]
-
- if ch == 'a':
- if word.endswith("ational"):
- return word[:-7] + "ate" if self._m(word, len(word)-8) > 0 else word
- elif word.endswith("tional"):
- return word[:-2] if self._m(word, len(word)-7) > 0 else word
- else:
- return word
- elif ch == 'c':
- if word.endswith("enci"):
- return word[:-4] + "ence" if self._m(word, len(word)-5) > 0 else word
- elif word.endswith("anci"):
- return word[:-4] + "ance" if self._m(word, len(word)-5) > 0 else word
- else:
- return word
- elif ch == 'e':
- if word.endswith("izer"):
- return word[:-1] if self._m(word, len(word)-5) > 0 else word
- else:
- return word
- elif ch == 'l':
- if word.endswith("bli"):
- return word[:-3] + "ble" if self._m(word, len(word)-4) > 0 else word # --DEPARTURE--
- # To match the published algorithm, replace "bli" with "abli" and "ble" with "able"
- elif word.endswith("alli"):
- # --NEW--
- if self._m(word, len(word)-5) > 0:
- word = word[:-2]
- return self._step2(word)
+ for rule in rules:
+ suffix, replacement, condition = rule
+ if suffix == '*d' and self._ends_double_consonant(word):
+ stem = word[:-2]
+ if condition is None or condition(stem):
+ return stem + replacement
else:
+ # Don't try any further rules
return word
- elif word.endswith("fulli"):
- return word[:-2] if self._m(word, len(word)-6) else word # --NEW--
- elif word.endswith("entli"):
- return word[:-2] if self._m(word, len(word)-6) else word
- elif word.endswith("eli"):
- return word[:-2] if self._m(word, len(word)-4) else word
- elif word.endswith("ousli"):
- return word[:-2] if self._m(word, len(word)-6) else word
- else:
- return word
- elif ch == 'o':
- if word.endswith("ization"):
- return word[:-7] + "ize" if self._m(word, len(word)-8) else word
- elif word.endswith("ation"):
- return word[:-5] + "ate" if self._m(word, len(word)-6) else word
- elif word.endswith("ator"):
- return word[:-4] + "ate" if self._m(word, len(word)-5) else word
- else:
- return word
- elif ch == 's':
- if word.endswith("alism"):
- return word[:-3] if self._m(word, len(word)-6) else word
- elif word.endswith("ness"):
- if word.endswith("iveness"):
- return word[:-4] if self._m(word, len(word)-8) else word
- elif word.endswith("fulness"):
- return word[:-4] if self._m(word, len(word)-8) else word
- elif word.endswith("ousness"):
- return word[:-4] if self._m(word, len(word)-8) else word
+ if word.endswith(suffix):
+ stem = self._replace_suffix(word, suffix, '')
+ if condition is None or condition(stem):
+ return stem + replacement
else:
+ # Don't try any further rules
return word
+
+ return word
+
+ def _step1a(self, word):
+ """Implements Step 1a from "An algorithm for suffix stripping"
+
+ From the paper:
+
+ SSES -> SS caresses -> caress
+ IES -> I ponies -> poni
+ ties -> ti
+ SS -> SS caress -> caress
+ S -> cats -> cat
+ """
+ # this NLTK-only rule extends the original algorithm, so
+ # that 'flies'->'fli' but 'dies'->'die' etc
+ if self.mode == self.NLTK_EXTENSIONS:
+ if word.endswith('ies') and len(word) == 4:
+ return self._replace_suffix(word, 'ies', 'ie')
+
+ return self._apply_rule_list(word, [
+ ('sses', 'ss', None), # SSES -> SS
+ ('ies', 'i', None), # IES -> I
+ ('ss', 'ss', None), # SS -> SS
+ ('s', '', None), # S ->
+ ])
+
+ def _step1b(self, word):
+ """Implements Step 1b from "An algorithm for suffix stripping"
+
+ From the paper:
+
+ (m>0) EED -> EE feed -> feed
+ agreed -> agree
+ (*v*) ED -> plastered -> plaster
+ bled -> bled
+ (*v*) ING -> motoring -> motor
+ sing -> sing
+
+ If the second or third of the rules in Step 1b is successful,
+ the following is done:
+
+ AT -> ATE conflat(ed) -> conflate
+ BL -> BLE troubl(ed) -> trouble
+ IZ -> IZE siz(ed) -> size
+ (*d and not (*L or *S or *Z))
+ -> single letter
+ hopp(ing) -> hop
+ tann(ed) -> tan
+ fall(ing) -> fall
+ hiss(ing) -> hiss
+ fizz(ed) -> fizz
+ (m=1 and *o) -> E fail(ing) -> fail
+ fil(ing) -> file
+
+ The rule to map to a single letter causes the removal of one of
+ the double letter pair. The -E is put back on -AT, -BL and -IZ,
+ so that the suffixes -ATE, -BLE and -IZE can be recognised
+ later. This E may be removed in step 4.
+ """
+ # this NLTK-only block extends the original algorithm, so that
+ # 'spied'->'spi' but 'died'->'die' etc
+ if self.mode == self.NLTK_EXTENSIONS:
+ if word.endswith('ied'):
+ if len(word) == 4:
+ return self._replace_suffix(word, 'ied', 'ie')
+ else:
+ return self._replace_suffix(word, 'ied', 'i')
+
+ # (m>0) EED -> EE
+ if word.endswith('eed'):
+ stem = self._replace_suffix(word, 'eed', '')
+ if self._measure(stem) > 0:
+ return stem + 'ee'
else:
return word
- elif ch == 't':
- if word.endswith("aliti"):
- return word[:-3] if self._m(word, len(word)-6) else word
- elif word.endswith("iviti"):
- return word[:-5] + "ive" if self._m(word, len(word)-6) else word
- elif word.endswith("biliti"):
- return word[:-6] + "ble" if self._m(word, len(word)-7) else word
- else:
- return word
- elif ch == 'g': # --DEPARTURE--
- if word.endswith("logi"):
- return word[:-1] if self._m(word, len(word) - 4) else word # --NEW-- (Barry Wilkins)
- # To match the published algorithm, pass len(word)-5 to _m instead of len(word)-4
- else:
- return word
-
- else:
+
+ rule_2_or_3_succeeded = False
+
+ for suffix in ['ed', 'ing']:
+ if word.endswith(suffix):
+ intermediate_stem = self._replace_suffix(word, suffix, '')
+ if self._contains_vowel(intermediate_stem):
+ rule_2_or_3_succeeded = True
+ break
+
+ if not rule_2_or_3_succeeded:
return word
- def _step3(self, word):
- """step3() deals with -ic-, -full, -ness etc. similar strategy to step2."""
+ return self._apply_rule_list(intermediate_stem, [
+ ('at', 'ate', None), # AT -> ATE
+ ('bl', 'ble', None), # BL -> BLE
+ ('iz', 'ize', None), # IZ -> IZE
+ # (*d and not (*L or *S or *Z))
+ # -> single letter
+ (
+ '*d',
+ intermediate_stem[-1],
+ lambda stem: intermediate_stem[-1] not in ('l', 's', 'z')
+ ),
+ # (m=1 and *o) -> E
+ (
+ '',
+ 'e',
+ lambda stem: (self._measure(stem) == 1 and
+ self._ends_cvc(stem))
+ ),
+ ])
+
+ def _step1c(self, word):
+ """Implements Step 1c from "An algorithm for suffix stripping"
+
+ From the paper:
+
+ Step 1c
+
+ (*v*) Y -> I happy -> happi
+ sky -> sky
+ """
+ def nltk_condition(stem):
+ """
+ This has been modified from the original Porter algorithm so
+ that y->i is only done when y is preceded by a consonant,
+ but not if the stem is only a single consonant, i.e.
+
+ (*c and not c) Y -> I
+
+ So 'happy' -> 'happi', but
+ 'enjoy' -> 'enjoy' etc
+
+ This is a much better rule. Formerly 'enjoy'->'enjoi' and
+ 'enjoyment'->'enjoy'. Step 1c is perhaps done too soon; but
+ with this modification that no longer really matters.
+
+ Also, the removal of the contains_vowel(z) condition means
+ that 'spy', 'fly', 'try' ... stem to 'spi', 'fli', 'tri' and
+ conflate with 'spied', 'tried', 'flies' ...
+ """
+ return len(stem) > 1 and self._is_consonant(stem, len(stem) - 1)
+
+ def original_condition(stem):
+ return self._contains_vowel(stem)
+
+ return self._apply_rule_list(word, [
+ (
+ 'y',
+ 'i',
+ nltk_condition if self.mode == self.NLTK_EXTENSIONS
+ else original_condition
+ )
+ ])
- ch = word[-1]
+ def _step2(self, word):
+ """Implements Step 2 from "An algorithm for suffix stripping"
+
+ From the paper:
+
+ Step 2
+
+ (m>0) ATIONAL -> ATE relational -> relate
+ (m>0) TIONAL -> TION conditional -> condition
+ rational -> rational
+ (m>0) ENCI -> ENCE valenci -> valence
+ (m>0) ANCI -> ANCE hesitanci -> hesitance
+ (m>0) IZER -> IZE digitizer -> digitize
+ (m>0) ABLI -> ABLE conformabli -> conformable
+ (m>0) ALLI -> AL radicalli -> radical
+ (m>0) ENTLI -> ENT differentli -> different
+             (m>0) ELI     ->  E           vileli       ->  vile
+ (m>0) OUSLI -> OUS analogousli -> analogous
+ (m>0) IZATION -> IZE vietnamization -> vietnamize
+ (m>0) ATION -> ATE predication -> predicate
+ (m>0) ATOR -> ATE operator -> operate
+ (m>0) ALISM -> AL feudalism -> feudal
+ (m>0) IVENESS -> IVE decisiveness -> decisive
+ (m>0) FULNESS -> FUL hopefulness -> hopeful
+ (m>0) OUSNESS -> OUS callousness -> callous
+ (m>0) ALITI -> AL formaliti -> formal
+ (m>0) IVITI -> IVE sensitiviti -> sensitive
+ (m>0) BILITI -> BLE sensibiliti -> sensible
+ """
- if ch == 'e':
- if word.endswith("icate"):
- return word[:-3] if self._m(word, len(word)-6) else word
- elif word.endswith("ative"):
- return word[:-5] if self._m(word, len(word)-6) else word
- elif word.endswith("alize"):
- return word[:-3] if self._m(word, len(word)-6) else word
- else:
- return word
- elif ch == 'i':
- if word.endswith("iciti"):
- return word[:-3] if self._m(word, len(word)-6) else word
- else:
- return word
- elif ch == 'l':
- if word.endswith("ical"):
- return word[:-2] if self._m(word, len(word)-5) else word
- elif word.endswith("ful"):
- return word[:-3] if self._m(word, len(word)-4) else word
- else:
- return word
- elif ch == 's':
- if word.endswith("ness"):
- return word[:-4] if self._m(word, len(word)-5) else word
- else:
- return word
+ if self.mode == self.NLTK_EXTENSIONS:
+            # Instead of applying the ALLI -> AL rule after '(a)bli' per
+            # the published algorithm, we apply it first, and, if it
+            # succeeds, run the result through step2 again.
+ if (
+ word.endswith('alli') and
+ self._has_positive_measure(
+ self._replace_suffix(word, 'alli', '')
+ )
+ ):
+ return self._step2(
+ self._replace_suffix(word, 'alli', 'al')
+ )
+
+ bli_rule = ('bli', 'ble', self._has_positive_measure)
+ abli_rule = ('abli', 'able', self._has_positive_measure)
+
+ rules = [
+ ('ational', 'ate', self._has_positive_measure),
+ ('tional', 'tion', self._has_positive_measure),
+ ('enci', 'ence', self._has_positive_measure),
+ ('anci', 'ance', self._has_positive_measure),
+ ('izer', 'ize', self._has_positive_measure),
+
+ abli_rule if self.mode == self.ORIGINAL_ALGORITHM else bli_rule,
+
+ ('alli', 'al', self._has_positive_measure),
+ ('entli', 'ent', self._has_positive_measure),
+ ('eli', 'e', self._has_positive_measure),
+ ('ousli', 'ous', self._has_positive_measure),
+ ('ization', 'ize', self._has_positive_measure),
+ ('ation', 'ate', self._has_positive_measure),
+ ('ator', 'ate', self._has_positive_measure),
+ ('alism', 'al', self._has_positive_measure),
+ ('iveness', 'ive', self._has_positive_measure),
+ ('fulness', 'ful', self._has_positive_measure),
+ ('ousness', 'ous', self._has_positive_measure),
+ ('aliti', 'al', self._has_positive_measure),
+ ('iviti', 'ive', self._has_positive_measure),
+ ('biliti', 'ble', self._has_positive_measure),
+ ]
+
+ if self.mode == self.NLTK_EXTENSIONS:
+ rules.append(
+ ('fulli', 'ful', self._has_positive_measure)
+ )
+
+ # The 'l' of the 'logi' -> 'log' rule is put with the stem,
+ # so that short stems like 'geo' 'theo' etc work like
+ # 'archaeo' 'philo' etc.
+ rules.append((
+ "logi",
+ "log",
+ lambda stem: self._has_positive_measure(word[:-3])
+ ))
+
+ if self.mode == self.MARTIN_EXTENSIONS:
+ rules.append(
+ ("logi", "log", self._has_positive_measure)
+ )
+
+ return self._apply_rule_list(word, rules)
- else:
- return word
+ def _step3(self, word):
+ """Implements Step 3 from "An algorithm for suffix stripping"
+
+ From the paper:
+
+ Step 3
+
+ (m>0) ICATE -> IC triplicate -> triplic
+ (m>0) ATIVE -> formative -> form
+ (m>0) ALIZE -> AL formalize -> formal
+ (m>0) ICITI -> IC electriciti -> electric
+ (m>0) ICAL -> IC electrical -> electric
+ (m>0) FUL -> hopeful -> hope
+ (m>0) NESS -> goodness -> good
+ """
+ return self._apply_rule_list(word, [
+ ('icate', 'ic', self._has_positive_measure),
+ ('ative', '', self._has_positive_measure),
+ ('alize', 'al', self._has_positive_measure),
+ ('iciti', 'ic', self._has_positive_measure),
+ ('ical', 'ic', self._has_positive_measure),
+ ('ful', '', self._has_positive_measure),
+ ('ness', '', self._has_positive_measure),
+ ])
def _step4(self, word):
- """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
-
- if len(word) <= 1: # Only possible at this stage given unusual inputs to stem_word like 'oed'
- return word
-
- ch = word[-2]
-
- if ch == 'a':
- if word.endswith("al"):
- return word[:-2] if self._m(word, len(word)-3) > 1 else word
- else:
- return word
- elif ch == 'c':
- if word.endswith("ance"):
- return word[:-4] if self._m(word, len(word)-5) > 1 else word
- elif word.endswith("ence"):
- return word[:-4] if self._m(word, len(word)-5) > 1 else word
- else:
- return word
- elif ch == 'e':
- if word.endswith("er"):
- return word[:-2] if self._m(word, len(word)-3) > 1 else word
- else:
- return word
- elif ch == 'i':
- if word.endswith("ic"):
- return word[:-2] if self._m(word, len(word)-3) > 1 else word
- else:
- return word
- elif ch == 'l':
- if word.endswith("able"):
- return word[:-4] if self._m(word, len(word)-5) > 1 else word
- elif word.endswith("ible"):
- return word[:-4] if self._m(word, len(word)-5) > 1 else word
- else:
- return word
- elif ch == 'n':
- if word.endswith("ant"):
- return word[:-3] if self._m(word, len(word)-4) > 1 else word
- elif word.endswith("ement"):
- return word[:-5] if self._m(word, len(word)-6) > 1 else word
- elif word.endswith("ment"):
- return word[:-4] if self._m(word, len(word)-5) > 1 else word
- elif word.endswith("ent"):
- return word[:-3] if self._m(word, len(word)-4) > 1 else word
- else:
- return word
- elif ch == 'o':
- if word.endswith("sion") or word.endswith("tion"): # slightly different logic to all the other cases
- return word[:-3] if self._m(word, len(word)-4) > 1 else word
- elif word.endswith("ou"):
- return word[:-2] if self._m(word, len(word)-3) > 1 else word
- else:
- return word
- elif ch == 's':
- if word.endswith("ism"):
- return word[:-3] if self._m(word, len(word)-4) > 1 else word
- else:
- return word
- elif ch == 't':
- if word.endswith("ate"):
- return word[:-3] if self._m(word, len(word)-4) > 1 else word
- elif word.endswith("iti"):
- return word[:-3] if self._m(word, len(word)-4) > 1 else word
- else:
- return word
- elif ch == 'u':
- if word.endswith("ous"):
- return word[:-3] if self._m(word, len(word)-4) > 1 else word
- else:
- return word
- elif ch == 'v':
- if word.endswith("ive"):
- return word[:-3] if self._m(word, len(word)-4) > 1 else word
- else:
- return word
- elif ch == 'z':
- if word.endswith("ize"):
- return word[:-3] if self._m(word, len(word)-4) > 1 else word
- else:
- return word
- else:
- return word
-
- def _step5(self, word):
- """step5() removes a final -e if m() > 1, and changes -ll to -l if
- m() > 1.
+ """Implements Step 4 from "An algorithm for suffix stripping"
+
+ Step 4
+
+ (m>1) AL -> revival -> reviv
+ (m>1) ANCE -> allowance -> allow
+ (m>1) ENCE -> inference -> infer
+ (m>1) ER -> airliner -> airlin
+ (m>1) IC -> gyroscopic -> gyroscop
+ (m>1) ABLE -> adjustable -> adjust
+ (m>1) IBLE -> defensible -> defens
+ (m>1) ANT -> irritant -> irrit
+ (m>1) EMENT -> replacement -> replac
+ (m>1) MENT -> adjustment -> adjust
+ (m>1) ENT -> dependent -> depend
+ (m>1 and (*S or *T)) ION -> adoption -> adopt
+ (m>1) OU -> homologou -> homolog
+ (m>1) ISM -> communism -> commun
+ (m>1) ATE -> activate -> activ
+ (m>1) ITI -> angulariti -> angular
+ (m>1) OUS -> homologous -> homolog
+ (m>1) IVE -> effective -> effect
+ (m>1) IZE -> bowdlerize -> bowdler
+
+ The suffixes are now removed. All that remains is a little
+ tidying up.
"""
- if word[-1] == 'e':
- a = self._m(word, len(word)-1)
- if a > 1 or (a == 1 and not self._cvc(word, len(word)-2)):
- word = word[:-1]
- if word.endswith('ll') and self._m(word, len(word)-1) > 1:
- word = word[:-1]
-
+ measure_gt_1 = lambda stem: self._measure(stem) > 1
+
+ return self._apply_rule_list(word, [
+ ('al', '', measure_gt_1),
+ ('ance', '', measure_gt_1),
+ ('ence', '', measure_gt_1),
+ ('er', '', measure_gt_1),
+ ('ic', '', measure_gt_1),
+ ('able', '', measure_gt_1),
+ ('ible', '', measure_gt_1),
+ ('ant', '', measure_gt_1),
+ ('ement', '', measure_gt_1),
+ ('ment', '', measure_gt_1),
+ ('ent', '', measure_gt_1),
+
+ # (m>1 and (*S or *T)) ION ->
+ (
+ 'ion',
+ '',
+ lambda stem: self._measure(stem) > 1 and stem[-1] in ('s', 't')
+ ),
+
+ ('ou', '', measure_gt_1),
+ ('ism', '', measure_gt_1),
+ ('ate', '', measure_gt_1),
+ ('iti', '', measure_gt_1),
+ ('ous', '', measure_gt_1),
+ ('ive', '', measure_gt_1),
+ ('ize', '', measure_gt_1),
+ ])
+
+ def _step5a(self, word):
+ """Implements Step 5a from "An algorithm for suffix stripping"
+
+ From the paper:
+
+ Step 5a
+
+ (m>1) E -> probate -> probat
+ rate -> rate
+ (m=1 and not *o) E -> cease -> ceas
+ """
+ # Note that Martin's test vocabulary and reference
+ # implementations are inconsistent in how they handle the case
+ # where two rules both refer to a suffix that matches the word
+ # to be stemmed, but only the condition of the second one is
+ # true.
+        # Earlier in step 1b we had the rules:
+ # (m>0) EED -> EE
+ # (*v*) ED ->
+ # but the examples in the paper included "feed"->"feed", even
+ # though (*v*) is true for "fe" and therefore the second rule
+ # alone would map "feed"->"fe".
+ # However, in THIS case, we need to handle the consecutive rules
+ # differently and try both conditions (obviously; the second
+ # rule here would be redundant otherwise). Martin's paper makes
+ # no explicit mention of the inconsistency; you have to infer it
+ # from the examples.
+ # For this reason, we can't use _apply_rule_list here.
+ if word.endswith('e'):
+ stem = self._replace_suffix(word, 'e', '')
+ if self._measure(stem) > 1:
+ return stem
+ if self._measure(stem) == 1 and not self._ends_cvc(stem):
+ return stem
return word
- def stem_word(self, p, i=0, j=None):
- """
- Returns the stem of p, or, if i and j are given, the stem of p[i:j+1].
+ def _step5b(self, word):
+ """Implements Step 5a from "An algorithm for suffix stripping"
+
+ From the paper:
+
+ Step 5b
+
+ (m > 1 and *d and *L) -> single letter
+ controll -> control
+ roll -> roll
"""
- ## --NLTK--
- if j is None and i == 0:
- word = p
- else:
- if j is None:
- j = len(p) - 1
- word = p[i:j+1]
+ return self._apply_rule_list(word, [
+ ('ll', 'l', lambda stem: self._measure(word[:-1]) > 1)
+ ])
- if word in self.pool:
+ def stem(self, word):
+ stem = word.lower()
+
+ if self.mode == self.NLTK_EXTENSIONS and word in self.pool:
return self.pool[word]
- if len(word) <= 2:
- return word # --DEPARTURE--
- # With this line, strings of length 1 or 2 don't go through the
- # stemming process, although no mention is made of this in the
- # published algorithm. Remove the line to match the published
- # algorithm.
-
- word = self._step1ab(word)
- word = self._step1c(word)
- word = self._step2(word)
- word = self._step3(word)
- word = self._step4(word)
- word = self._step5(word)
- return word
-
- def _adjust_case(self, word, stem):
- lower = word.lower()
+ if self.mode != self.ORIGINAL_ALGORITHM and len(word) <= 2:
+ # With this line, strings of length 1 or 2 don't go through
+ # the stemming process, although no mention is made of this
+ # in the published algorithm.
+ return word
- ret = ""
- for x in range(len(stem)):
- if lower[x] == stem[x]:
- ret += word[x]
- else:
- ret += stem[x]
-
- return ret
-
- ## --NLTK--
- ## Don't use this procedure; we want to work with individual
- ## tokens, instead. (commented out the following procedure)
- #def stem(self, text):
- # parts = re.split("(\W+)", text)
- # numWords = (len(parts) + 1)/2
- #
- # ret = ""
- # for i in xrange(numWords):
- # word = parts[2 * i]
- # separator = ""
- # if ((2 * i) + 1) < len(parts):
- # separator = parts[(2 * i) + 1]
- #
- # stem = self.stem_word(string.lower(word), 0, len(word) - 1)
- # ret = ret + self.adjust_case(word, stem)
- # ret = ret + separator
- # return ret
-
- ## --NLTK--
- ## Define a stem() method that implements the StemmerI interface.
- def stem(self, word):
- stem = self.stem_word(word.lower(), 0, len(word) - 1)
- return self._adjust_case(word, stem)
+ stem = self._step1a(stem)
+ stem = self._step1b(stem)
+ stem = self._step1c(stem)
+ stem = self._step2(stem)
+ stem = self._step3(stem)
+ stem = self._step4(stem)
+ stem = self._step5a(stem)
+ stem = self._step5b(stem)
+
+ return stem
- ## --NLTK--
- ## Add a string representation function
def __repr__(self):
return '<PorterStemmer>'
-## --NLTK--
-## This test procedure isn't applicable.
-#if __name__ == '__main__':
-# p = PorterStemmer()
-# if len(sys.argv) > 1:
-# for f in sys.argv[1:]:
-# with open(f, 'r') as infile:
-# while 1:
-# w = infile.readline()
-# if w == '':
-# break
-# w = w[:-1]
-# print(p.stem(w))
-
-##--NLTK--
-## Added a demo() function
-
def demo():
"""
A demonstration of the porter stemmer on a sample from
@@ -667,7 +689,7 @@ def demo():
orig = []
stemmed = []
- for item in treebank.files()[:3]:
+ for item in treebank.fileids()[:3]:
for (word, tag) in treebank.tagged_words(item):
orig.append(word)
stemmed.append(stemmer.stem(word))
@@ -686,7 +708,3 @@ def demo():
print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
print(results)
print('*'*70)
-
-##--NLTK--
-
-
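
The rewrite above keeps the three behaviours behind a single ``mode`` switch. A minimal usage sketch (assuming, as in the 3.2.4 release, that the constructor accepts the mode constants referenced in the methods above, with NLTK_EXTENSIONS as the default):

    >>> from nltk.stem.porter import PorterStemmer
    >>> stemmer = PorterStemmer()                 # NLTK_EXTENSIONS mode
    >>> print(stemmer.stem('running'))
    run
    >>> print(stemmer.stem('flies'))              # NLTK-only rule: 'flies' -> 'fli'
    fli
    >>> original = PorterStemmer(mode=PorterStemmer.ORIGINAL_ALGORITHM)
    >>> print(original.stem('caresses'))          # SSES -> SS, as in the paper
    caress
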
diff --git a/nltk/stem/regexp.py b/nltk/stem/regexp.py
index 0a68df0..9053571 100644
--- a/nltk/stem/regexp.py
+++ b/nltk/stem/regexp.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Stemmers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn at cs.mu.oz.au>
# Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
@@ -54,7 +54,7 @@ class RegexpStemmer(StemmerI):
return self._regexp.sub('', word)
def __repr__(self):
- return '<RegexpStemmer: %r>' % self._regexp.pattern
+ return '<RegexpStemmer: {!r}>'.format(self._regexp.pattern)
diff --git a/nltk/stem/rslp.py b/nltk/stem/rslp.py
index eedd09f..ebf190d 100644
--- a/nltk/stem/rslp.py
+++ b/nltk/stem/rslp.py
@@ -2,7 +2,7 @@
# Natural Language Toolkit: RSLP Stemmer
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tiago Tresoldi <tresoldi at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/stem/snowball.py b/nltk/stem/snowball.py
index 10811fb..3ed2dbb 100644
--- a/nltk/stem/snowball.py
+++ b/nltk/stem/snowball.py
@@ -2,7 +2,7 @@
#
# Natural Language Toolkit: Snowball Stemmer
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Peter Michael Stahl <pemistahl at gmail.com>
# Peter Ljunglof <peter.ljunglof at heatherleaf.se> (revisions)
# Algorithms: Dr Martin Porter <martin at tartarus.org>
@@ -20,6 +20,8 @@ There is also a demo function: `snowball.demo()`.
"""
from __future__ import unicode_literals, print_function
+from six.moves import input
+
from nltk import compat
from nltk.corpus import stopwords
from nltk.stem import porter
@@ -85,11 +87,14 @@ class SnowballStemmer(StemmerI):
def __init__(self, language, ignore_stopwords=False):
if language not in self.languages:
- raise ValueError("The language '%s' is not supported." % language)
+ raise ValueError("The language '{0}' is not supported.".format(language))
stemmerclass = globals()[language.capitalize() + "Stemmer"]
self.stemmer = stemmerclass(ignore_stopwords)
self.stem = self.stemmer.stem
self.stopwords = self.stemmer.stopwords
+
+ def stem(self, token):
+        return self.stemmer.stem(token)
@compat.python_2_unicode_compatible
@@ -120,15 +125,15 @@ class _LanguageSpecificStemmer(StemmerI):
for word in stopwords.words(language):
self.stopwords.add(word)
except IOError:
- raise ValueError("%r has no list of stopwords. Please set"
- " 'ignore_stopwords' to 'False'." % self)
+ raise ValueError("{!r} has no list of stopwords. Please set"
+ " 'ignore_stopwords' to 'False'.".format(self))
def __repr__(self):
"""
Print out the string representation of the respective class.
"""
- return "<%s>" % type(self).__name__
+ return "<{0}>".format(type(self).__name__)
class PorterStemmer(_LanguageSpecificStemmer, porter.PorterStemmer):
@@ -3677,7 +3682,7 @@ def demo():
while True:
- language = compat.raw_input("Please enter the name of the language " +
+ language = input("Please enter the name of the language " +
"to be demonstrated\n" +
"/".join(SnowballStemmer.languages) +
"\n" +
@@ -3708,5 +3713,3 @@ def demo():
print(stemmed)
print('-' * 70)
print("\n")
-
-
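
A quick usage sketch of the wrapper (the ``ignore_stopwords`` call assumes the stopwords corpus has been downloaded; note that ``__init__`` still rebinds ``self.stem`` to the language-specific stemmer's method, so the delegating ``stem`` defined above is only reached through the class):

    >>> from nltk.stem.snowball import SnowballStemmer
    >>> print(SnowballStemmer('english').stem('running'))
    run
    >>> print(SnowballStemmer('german').stem('Autobahnen'))
    autobahn
    >>> stemmer = SnowballStemmer('english', ignore_stopwords=True)
    >>> print(stemmer.stem('having'))             # doctest: +SKIP
    having
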
diff --git a/nltk/stem/util.py b/nltk/stem/util.py
index 6bb386a..c3d9b90 100644
--- a/nltk/stem/util.py
+++ b/nltk/stem/util.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Stemmer Utilities
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Helder <he7d3r at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/stem/wordnet.py b/nltk/stem/wordnet.py
index 29428ab..3a217ff 100644
--- a/nltk/stem/wordnet.py
+++ b/nltk/stem/wordnet.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: WordNet stemmer interface
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/tag/__init__.py b/nltk/tag/__init__.py
index 1b52e63..0de452a 100644
--- a/nltk/tag/__init__.py
+++ b/nltk/tag/__init__.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Taggers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com> (minor additions)
# URL: <http://nltk.org/>
@@ -76,7 +76,20 @@ from nltk.tag.mapping import tagset_mapping, map_tag
from nltk.tag.crf import CRFTagger
from nltk.tag.perceptron import PerceptronTagger
-from nltk.data import load
+from nltk.data import load, find
+
+RUS_PICKLE = 'taggers/averaged_perceptron_tagger_ru/averaged_perceptron_tagger_ru.pickle'
+
+
+def _get_tagger(lang=None):
+ if lang == 'rus':
+ tagger = PerceptronTagger(False)
+ ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
+ tagger.load(ap_russian_model_loc)
+ else:
+ tagger = PerceptronTagger()
+ return tagger
+
def _pos_tag(tokens, tagset, tagger):
tagged_tokens = tagger.tag(tokens)
@@ -84,7 +97,8 @@ def _pos_tag(tokens, tagset, tagger):
tagged_tokens = [(token, map_tag('en-ptb', tagset, tag)) for (token, tag) in tagged_tokens]
return tagged_tokens
-def pos_tag(tokens, tagset=None):
+
+def pos_tag(tokens, tagset=None, lang='eng'):
"""
Use NLTK's currently recommended part of speech tagger to
tag the given list of tokens.
@@ -104,14 +118,16 @@ def pos_tag(tokens, tagset=None):
:type tokens: list(str)
:param tagset: the tagset to be used, e.g. universal, wsj, brown
:type tagset: str
+ :param lang: the ISO 639 code of the language, e.g. 'eng' for English, 'rus' for Russian
+ :type lang: str
:return: The tagged tokens
:rtype: list(tuple(str, str))
"""
- tagger = PerceptronTagger()
+ tagger = _get_tagger(lang)
return _pos_tag(tokens, tagset, tagger)
-def pos_tag_sents(sentences, tagset=None):
+def pos_tag_sents(sentences, tagset=None, lang='eng'):
"""
Use NLTK's currently recommended part of speech tagger to tag the
given list of sentences, each consisting of a list of tokens.
@@ -120,8 +136,10 @@ def pos_tag_sents(sentences, tagset=None):
:type tokens: list(list(str))
:param tagset: the tagset to be used, e.g. universal, wsj, brown
:type tagset: str
+ :param lang: the ISO 639 code of the language, e.g. 'eng' for English, 'rus' for Russian
+ :type lang: str
:return: The list of tagged sentences
:rtype: list(list(tuple(str, str)))
"""
- tagger = PerceptronTagger()
+ tagger = _get_tagger(lang)
return [_pos_tag(sent, tagset, tagger) for sent in sentences]
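
A sketch of the new ``lang`` parameter in use. Both calls need the corresponding averaged-perceptron models from ``nltk.download()``, and the outputs below mirror the upstream doctests, so they may change with other model versions:

    >>> from nltk import pos_tag, word_tokenize
    >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))  # doctest: +SKIP
    [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
    ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
    >>> pos_tag(word_tokenize("Илья оторопел и дважды перечитал бумажку."), lang='rus')  # doctest: +SKIP
    [('Илья', 'S'), ('оторопел', 'V'), ('и', 'CONJ'), ('дважды', 'ADV'), ('перечитал', 'V'),
    ('бумажку', 'S'), ('.', 'NONLEX')]
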
diff --git a/nltk/tag/api.py b/nltk/tag/api.py
index f6aa0c1..804c769 100644
--- a/nltk/tag/api.py
+++ b/nltk/tag/api.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Tagger Interface
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com> (minor additions)
# URL: <http://nltk.org/>
@@ -10,11 +10,17 @@
Interface for tagging each token in a sentence with supplementary
information, such as its part of speech.
"""
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+from itertools import chain
+
from nltk.internals import overridden
from nltk.metrics import accuracy
from nltk.tag.util import untag
+
+@add_metaclass(ABCMeta)
class TaggerI(object):
"""
A processing interface for assigning a tag to each token in a list.
@@ -29,6 +35,7 @@ class TaggerI(object):
Subclasses must define:
- either ``tag()`` or ``tag_sents()`` (or both)
"""
+ @abstractmethod
def tag(self, tokens):
"""
Determine the most appropriate tag sequence for the given
@@ -39,8 +46,6 @@ class TaggerI(object):
"""
if overridden(self.tag_sents):
return self.tag_sents([tokens])[0]
- else:
- raise NotImplementedError()
def tag_sents(self, sentences):
"""
@@ -62,13 +67,15 @@ class TaggerI(object):
"""
tagged_sents = self.tag_sents(untag(sent) for sent in gold)
- gold_tokens = sum(gold, [])
- test_tokens = sum(tagged_sents, [])
+ gold_tokens = list(chain(*gold))
+ test_tokens = list(chain(*tagged_sents))
return accuracy(gold_tokens, test_tokens)
def _check_params(self, train, model):
if (train and model) or (not train and not model):
- raise ValueError('Must specify either training data or trained model.')
+ raise ValueError(
+ 'Must specify either training data or trained model.')
+
class FeaturesetTaggerI(TaggerI):
"""
@@ -77,5 +84,3 @@ class FeaturesetTaggerI(TaggerI):
values. See ``nltk.classify`` for more information about features
and featuresets.
"""
-
-
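
With ``tag()`` now an ``@abstractmethod``, a concrete tagger only has to supply that one method and inherits ``tag_sents()`` and ``evaluate()``. A toy, purely illustrative subclass (not part of NLTK):

    from nltk.tag.api import TaggerI

    class UppercaseTagger(TaggerI):
        """Tags capitalised tokens as 'NNP' and everything else as 'NN'."""
        def tag(self, tokens):
            return [(tok, 'NNP' if tok[:1].isupper() else 'NN') for tok in tokens]

    print(UppercaseTagger().tag('Paris is lovely'.split()))
    # [('Paris', 'NNP'), ('is', 'NN'), ('lovely', 'NN')]
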
diff --git a/nltk/tag/brill.py b/nltk/tag/brill.py
index bd74545..24e4df4 100644
--- a/nltk/tag/brill.py
+++ b/nltk/tag/brill.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Marcus Uneson <marcus.uneson at gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
@@ -10,9 +10,8 @@
from __future__ import print_function, division
-from collections import defaultdict
+from collections import defaultdict, Counter
-from nltk.compat import Counter
from nltk.tag import TaggerI
from nltk.tbl import Feature, Template
from nltk import jsontags
@@ -376,8 +375,9 @@ class BrillTagger(TaggerI):
usedtpls = set([int(tid) for tid in tids])
unused = [(tid, tpl) for (tid, tpl) in enumerate(Template.ALLTEMPLATES) if tid not in usedtpls]
print("UNUSED TEMPLATES ({0})".format(len(unused)))
+
for (tid, tpl) in unused:
- print("{0:03d} {1:s}".format(tid, tpl))
+ print("{0:03d} {1:s}".format(tid, str(tpl)))
if test_stats is None:
print_train_stats()
@@ -422,5 +422,3 @@ class BrillTagger(TaggerI):
testing_stats['finalerrors'] = errors[-1]
testing_stats['finalacc'] = 1 - testing_stats['finalerrors']/testing_stats['tokencount']
return (tagged_tokenses, testing_stats)
-
-
diff --git a/nltk/tag/crf.py b/nltk/tag/crf.py
index 453e111..6a33aca 100644
--- a/nltk/tag/crf.py
+++ b/nltk/tag/crf.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the CRFSuite Tagger
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Long Duong <longdt219 at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/tag/hmm.py b/nltk/tag/hmm.py
index 61f2b6a..309f6fe 100644
--- a/nltk/tag/hmm.py
+++ b/nltk/tag/hmm.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Hidden Markov Model
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Trevor Cohn <tacohn at csse.unimelb.edu.au>
# Philip Blunsom <pcbl at csse.unimelb.edu.au>
# Tiago Tresoldi <tiago at tresoldi.pro.br> (fixes)
@@ -73,6 +73,8 @@ from __future__ import print_function, unicode_literals, division
import re
import itertools
+from six.moves import map, zip
+
try:
import numpy as np
except ImportError:
@@ -85,7 +87,7 @@ from nltk.probability import (FreqDist, ConditionalFreqDist,
MLEProbDist, RandomProbDist)
from nltk.metrics import accuracy
from nltk.util import LazyMap, unique_list
-from nltk.compat import python_2_unicode_compatible, izip, imap
+from nltk.compat import python_2_unicode_compatible
from nltk.tag.api import TaggerI
@@ -269,7 +271,7 @@ class HiddenMarkovModelTagger(TaggerI):
def _tag(self, unlabeled_sequence):
path = self._best_path(unlabeled_sequence)
- return list(izip(unlabeled_sequence, path))
+ return list(zip(unlabeled_sequence, path))
def _output_logprob(self, state, symbol):
"""
@@ -778,10 +780,10 @@ class HiddenMarkovModelTagger(TaggerI):
return list(itertools.chain(*seq))
test_sequence = self._transform(test_sequence)
- predicted_sequence = list(imap(self._tag, imap(words, test_sequence)))
+ predicted_sequence = list(map(self._tag, map(words, test_sequence)))
if verbose:
- for test_sent, predicted_sent in izip(test_sequence, predicted_sequence):
+ for test_sent, predicted_sent in zip(test_sequence, predicted_sequence):
print('Test:',
' '.join('%s/%s' % (token, tag)
for (token, tag) in test_sent))
@@ -799,8 +801,8 @@ class HiddenMarkovModelTagger(TaggerI):
print()
print('-' * 60)
- test_tags = flatten(imap(tags, test_sequence))
- predicted_tags = flatten(imap(tags, predicted_sequence))
+ test_tags = flatten(map(tags, test_sequence))
+ predicted_tags = flatten(map(tags, predicted_sequence))
acc = accuracy(test_tags, predicted_tags)
count = sum(len(sent) for sent in test_sequence)
@@ -1271,7 +1273,3 @@ def demo_bw():
trainer = HiddenMarkovModelTrainer(states, symbols)
hmm = trainer.train_unsupervised(training, model=model,
max_iterations=1000)
-
-
-
-
diff --git a/nltk/tag/hunpos.py b/nltk/tag/hunpos.py
index 4113629..e81b180 100644
--- a/nltk/tag/hunpos.py
+++ b/nltk/tag/hunpos.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the HunPos POS-tagger
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Peter Ljunglöf <peter.ljunglof at heatherleaf.se>
# Dávid Márk Nemeskey <nemeskeyd at gmail.com> (modifications)
# Attila Zséder <zseder at gmail.com> (modifications)
@@ -15,9 +15,10 @@ A module for interfacing with the HunPos open-source POS-tagger.
import os
from subprocess import Popen, PIPE
+from six import text_type
+
from nltk.internals import find_binary, find_file
from nltk.tag.api import TaggerI
-from nltk import compat
_hunpos_url = 'http://code.google.com/p/hunpos/'
@@ -105,7 +106,7 @@ class HunposTagger(TaggerI):
"""
for token in tokens:
assert "\n" not in token, "Tokens should not contain newlines"
- if isinstance(token, compat.text_type):
+ if isinstance(token, text_type):
token = token.encode(self._encoding)
self._hunpos.stdin.write(token + b"\n")
# We write a final empty line to tell hunpos that the sentence is finished:
@@ -129,4 +130,3 @@ def setup_module(module):
HunposTagger('en_wsj.model')
except LookupError:
raise SkipTest("HunposTagger is not available")
-
diff --git a/nltk/tag/mapping.py b/nltk/tag/mapping.py
index afa73e2..fc37dbc 100644
--- a/nltk/tag/mapping.py
+++ b/nltk/tag/mapping.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Tagset Mapping
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Nathan Schneider <nathan at cmu.edu>
# Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/tag/perceptron.py b/nltk/tag/perceptron.py
index ff59e55..b194ad0 100644
--- a/nltk/tag/perceptron.py
+++ b/nltk/tag/perceptron.py
@@ -165,18 +165,26 @@ class PerceptronTagger(TaggerI):
'''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
controls the number of Perceptron training iterations.
- :param sentences: A list of (words, tags) tuples.
+ :param sentences: A list or iterator of sentences, where each sentence
+            is a list of (word, tag) tuples.
:param save_loc: If not ``None``, saves a pickled model in this location.
:param nr_iter: Number of training iterations.
'''
+ # We'd like to allow ``sentences`` to be either a list or an iterator,
+ # the latter being especially important for a large training dataset.
+ # Because ``self._make_tagdict(sentences)`` runs regardless, we make
+ # it populate ``self._sentences`` (a list) with all the sentences.
+        # This saves the overhead of a separate pass through ``sentences``
+        # just to build the list via ``sentences = list(sentences)``.
+
+ self._sentences = list() # to be populated by self._make_tagdict...
self._make_tagdict(sentences)
self.model.classes = self.classes
for iter_ in range(nr_iter):
c = 0
n = 0
- for sentence in sentences:
- words = [word for word,tag in sentence]
- tags = [tag for word,tag in sentence]
+ for sentence in self._sentences:
+ words, tags = zip(*sentence)
prev, prev2 = self.START
context = self.START + [self.normalize(w) for w in words] \
@@ -191,13 +199,19 @@ class PerceptronTagger(TaggerI):
prev = guess
c += guess == tags[i]
n += 1
- random.shuffle(sentences)
+ random.shuffle(self._sentences)
logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)))
+
+ # We don't need the training sentences anymore, and we don't want to
+ # waste space on them when we pickle the trained tagger.
+ self._sentences = None
+
self.model.average_weights()
# Pickle as a binary file
if save_loc is not None:
with open(save_loc, 'wb') as fout:
- pickle.dump((self.model.weights, self.tagdict, self.classes), fout, -1)
+ # changed protocol from -1 to 2 to make pickling Python 2 compatible
+ pickle.dump((self.model.weights, self.tagdict, self.classes), fout, 2)
def load(self, loc):
@@ -214,7 +228,7 @@ class PerceptronTagger(TaggerI):
'''
Normalization used in pre-processing.
- All words are lower cased
- - Digits in the range 1800-2100 are represented as !YEAR;
+        - Sequences of exactly 4 digits are represented as !YEAR;
- Other digits are represented as !DIGITS
:rtype: str
@@ -230,7 +244,7 @@ class PerceptronTagger(TaggerI):
def _get_features(self, i, word, context, prev, prev2):
'''Map tokens into a feature representation, implemented as a
- {hashable: float} dict. If the features change, a new model must be
+ {hashable: int} dict. If the features change, a new model must be
trained.
'''
def add(name, *args):
@@ -262,6 +276,7 @@ class PerceptronTagger(TaggerI):
'''
counts = defaultdict(lambda: defaultdict(int))
for sentence in sentences:
+ self._sentences.append(sentence)
for word, tag in sentence:
counts[word][tag] += 1
self.classes.add(tag)
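
Since ``train()`` now buffers the sentences itself, a generator can be passed straight in. A rough sketch (the treebank slice and the save path are made up for illustration, and assume the treebank sample has been downloaded):

    from nltk.corpus import treebank
    from nltk.tag.perceptron import PerceptronTagger

    tagger = PerceptronTagger(load=False)                  # start from an empty model
    sents = (s for s in treebank.tagged_sents()[:500])     # an iterator is fine now
    tagger.train(sents, save_loc='/tmp/ap_demo.pickle', nr_iter=5)
    print(tagger.tag('The cat sat on the mat .'.split()))
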
diff --git a/nltk/tag/senna.py b/nltk/tag/senna.py
index 93030b3..c74ec94 100644
--- a/nltk/tag/senna.py
+++ b/nltk/tag/senna.py
@@ -1,7 +1,7 @@
# encoding: utf-8
# Natural Language Toolkit: Senna POS Tagger
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Rami Al-Rfou' <ralrfou at cs.stonybrook.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -10,29 +10,31 @@
Senna POS tagger, NER Tagger, Chunk Tagger
The input is:
-- path to the directory that contains SENNA executables. If the path is incorrect,
- SennaTagger will automatically search for executable file specified in SENNA environment variable
+- path to the directory that contains SENNA executables. If the path is incorrect,
+  SennaTagger will automatically search for the executable file specified in the SENNA environment variable
- (optionally) the encoding of the input data (default:utf-8)
+Note: Unit tests for this module can be found in test/unit/test_senna.py
+
>>> from nltk.tag import SennaTagger
- >>> tagger = SennaTagger('/usr/share/senna-v2.0')
- >>> tagger.tag('What is the airspeed of an unladen swallow ?'.split())
+ >>> tagger = SennaTagger('/usr/share/senna-v3.0')
+ >>> tagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'),
('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'NN'), ('?', '.')]
>>> from nltk.tag import SennaChunkTagger
- >>> chktagger = SennaChunkTagger('/usr/share/senna-v2.0')
- >>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split())
+ >>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0')
+ >>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
[('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'),
('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'),
('?', 'O')]
>>> from nltk.tag import SennaNERTagger
- >>> nertagger = SennaNERTagger('/usr/share/senna-v2.0')
- >>> nertagger.tag('Shakespeare theatre was in London .'.split())
+ >>> nertagger = SennaNERTagger('/usr/share/senna-v3.0')
+ >>> nertagger.tag('Shakespeare theatre was in London .'.split()) # doctest: +SKIP
[('Shakespeare', 'B-PER'), ('theatre', 'O'), ('was', 'O'), ('in', 'O'),
('London', 'B-LOC'), ('.', 'O')]
- >>> nertagger.tag('UN headquarters are in NY , USA .'.split())
+ >>> nertagger.tag('UN headquarters are in NY , USA .'.split()) # doctest: +SKIP
[('UN', 'B-ORG'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'),
('NY', 'B-LOC'), (',', 'O'), ('USA', 'B-LOC'), ('.', 'O')]
"""
@@ -73,29 +75,29 @@ class SennaChunkTagger(Senna):
annotations = tagged_sents[i][j]
tagged_sents[i][j] = (annotations['word'], annotations['chk'])
return tagged_sents
-
+
def bio_to_chunks(self, tagged_sent, chunk_type):
"""
Extracts the chunks in a BIO chunk-tagged sentence.
-
+
>>> from nltk.tag import SennaChunkTagger
- >>> chktagger = SennaChunkTagger('/usr/share/senna-v2.0')
+ >>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0')
>>> sent = 'What is the airspeed of an unladen swallow ?'.split()
- >>> tagged_sent = chktagger.tag(sent)
- >>> tagged_sent
+ >>> tagged_sent = chktagger.tag(sent) # doctest: +SKIP
+ >>> tagged_sent # doctest: +SKIP
[('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'),
('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'),
('?', 'O')]
- >>> list(chktagger.bio_to_chunks(tagged_sent, chunk_type='NP'))
+ >>> list(chktagger.bio_to_chunks(tagged_sent, chunk_type='NP')) # doctest: +SKIP
[('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')]
-
+
:param tagged_sent: A list of tuples of word and BIO chunk tag.
:type tagged_sent: list(tuple)
:param tagged_sent: The chunk tag that users want to extract, e.g. 'NP' or 'VP'
:type tagged_sent: str
-
+
:return: An iterable of tuples of chunks that users want to extract
- and their corresponding indices.
+ and their corresponding indices.
:rtype: iter(tuple(str))
"""
current_chunk = []
@@ -107,14 +109,14 @@ class SennaChunkTagger(Senna):
current_chunk_position.append((idx))
else:
if current_chunk: # Flush the full chunk when out of an NP.
- _chunk_str = ' '.join(current_chunk)
+ _chunk_str = ' '.join(current_chunk)
_chunk_pos_str = '-'.join(map(str, current_chunk_position))
- yield _chunk_str, _chunk_pos_str
+ yield _chunk_str, _chunk_pos_str
current_chunk = []
current_chunk_position = []
if current_chunk: # Flush the last chunk.
yield ' '.join(current_chunk), '-'.join(map(str, current_chunk_position))
-
+
@python_2_unicode_compatible
class SennaNERTagger(Senna):
@@ -139,7 +141,7 @@ class SennaNERTagger(Senna):
def setup_module(module):
from nose import SkipTest
try:
- tagger = Senna('/usr/share/senna-v2.0', ['pos', 'chk', 'ner'])
+ tagger = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
except OSError:
raise SkipTest("Senna executable not found")
diff --git a/nltk/tag/sequential.py b/nltk/tag/sequential.py
index b98ae52..3cdcd05 100644
--- a/nltk/tag/sequential.py
+++ b/nltk/tag/sequential.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Sequential Backoff Taggers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com> (minor additions)
# Tiago Tresoldi <tresoldi at users.sf.net> (original affix tagger)
@@ -18,6 +18,7 @@ consulted instead. Any SequentialBackoffTagger may serve as a
backoff tagger for any other SequentialBackoffTagger.
"""
from __future__ import print_function, unicode_literals
+from abc import abstractmethod
import re
@@ -29,8 +30,9 @@ from nltk.tag.api import TaggerI, FeaturesetTaggerI
from nltk import jsontags
+
######################################################################
-#{ Abstract Base Classes
+# Abstract Base Classes
######################################################################
class SequentialBackoffTagger(TaggerI):
"""
@@ -79,9 +81,11 @@ class SequentialBackoffTagger(TaggerI):
tag = None
for tagger in self._taggers:
tag = tagger.choose_tag(tokens, index, history)
- if tag is not None: break
+ if tag is not None:
+ break
return tag
+ @abstractmethod
def choose_tag(self, tokens, index, history):
"""
Decide which tag should be used for the specified token, and
@@ -99,7 +103,6 @@ class SequentialBackoffTagger(TaggerI):
:type history: list(str)
:param history: A list of the tags for all words before *index*.
"""
- raise NotImplementedError()
@python_2_unicode_compatible
@@ -125,6 +128,7 @@ class ContextTagger(SequentialBackoffTagger):
SequentialBackoffTagger.__init__(self, backoff)
self._context_to_tag = (context_to_tag if context_to_tag else {})
+ @abstractmethod
def context(self, tokens, index, history):
"""
:return: the context that should be used to look up the tag
@@ -132,7 +136,6 @@ class ContextTagger(SequentialBackoffTagger):
should not be handled by this tagger.
:rtype: (hashable)
"""
- raise NotImplementedError()
def choose_tag(self, tokens, index, history):
context = self.context(tokens, index, history)
@@ -180,11 +183,13 @@ class ContextTagger(SequentialBackoffTagger):
# Record the event.
token_count += 1
context = self.context(tokens, index, tags[:index])
- if context is None: continue
+ if context is None:
+ continue
fd[context][tag] += 1
# If the backoff got it wrong, this context is useful:
if (self.backoff is None or
- tag != self.backoff.tag_one(tokens, index, tags[:index])):
+ tag != self.backoff.tag_one(
+ tokens, index, tags[:index])):
useful_contexts.add(context)
# Build the context_to_tag table -- for each context, figure
@@ -200,16 +205,16 @@ class ContextTagger(SequentialBackoffTagger):
# Display some stats, if requested.
if verbose:
size = len(self._context_to_tag)
- backoff = 100 - (hit_count * 100.0)/ token_count
+ backoff = 100 - (hit_count * 100.0) / token_count
pruning = 100 - (size * 100.0) / len(fd.conditions())
print("[Trained Unigram tagger:", end=' ')
print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
size, backoff, pruning))
+
######################################################################
-#{ Tagger Classes
+# Tagger Classes
######################################################################
-
@python_2_unicode_compatible
@jsontags.register_tag
class DefaultTagger(SequentialBackoffTagger):
@@ -295,7 +300,7 @@ class NgramTagger(ContextTagger):
return cls(_n, model=_context_to_tag, backoff=backoff)
def context(self, tokens, index, history):
- tag_context = tuple(history[max(0,index-self._n+1):index])
+ tag_context = tuple(history[max(0, index-self._n+1):index])
return tag_context, tokens[index]
@@ -651,8 +656,8 @@ class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
untagged_sentence, tags = zip(*sentence)
for index in range(len(sentence)):
featureset = self.feature_detector(untagged_sentence,
- index, history)
- classifier_corpus.append( (featureset, tags[index]) )
+ index, history)
+ classifier_corpus.append((featureset, tags[index]))
history.append(tags[index])
if verbose:
@@ -683,6 +688,7 @@ class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI):
"""
return self._classifier
+
class ClassifierBasedPOSTagger(ClassifierBasedTagger):
"""
A classifier based part of speech tagger.
@@ -732,5 +738,3 @@ class ClassifierBasedPOSTagger(ClassifierBasedTagger):
'shape': shape,
}
return features
-
-
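
For context, these classes are meant to be composed into a backoff chain, with each tagger deferring to its ``backoff`` when ``choose_tag()`` returns None. A brief sketch (assumes the treebank sample is available locally):

    from nltk.corpus import treebank
    from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger

    train_sents = treebank.tagged_sents()[:3000]
    t0 = DefaultTagger('NN')                    # last-resort tag
    t1 = UnigramTagger(train_sents, backoff=t0)
    t2 = BigramTagger(train_sents, backoff=t1)
    print(t2.tag('The old man the boat .'.split()))
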
diff --git a/nltk/tag/stanford.py b/nltk/tag/stanford.py
index 570046e..d055e5d 100644
--- a/nltk/tag/stanford.py
+++ b/nltk/tag/stanford.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Nitin Madnani <nmadnani at ets.org>
# Rami Al-Rfou' <ralrfou at cs.stonybrook.edu>
# URL: <http://nltk.org/>
@@ -10,23 +10,26 @@
"""
A module for interfacing with the Stanford taggers.
-Tagger models need to be downloaded from http://nlp.stanford.edu/software
+Tagger models need to be downloaded from https://nlp.stanford.edu/software
and the STANFORD_MODELS environment variable set (a colon-separated
list of paths).
For more details see the documentation for StanfordPOSTagger and StanfordNERTagger.
"""
+from abc import abstractmethod
import os
import tempfile
from subprocess import PIPE
import warnings
-from nltk.internals import find_file, find_jar, config_java, java, _java_options, find_jars_within_path
+from six import text_type
+
+from nltk.internals import find_file, find_jar, config_java, java, _java_options
from nltk.tag.api import TaggerI
-from nltk import compat
-_stanford_url = 'http://nlp.stanford.edu/software'
+_stanford_url = 'https://nlp.stanford.edu/software'
+
class StanfordTagger(TaggerI):
"""
@@ -42,33 +45,35 @@ class StanfordTagger(TaggerI):
_SEPARATOR = ''
_JAR = ''
- def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
+ def __init__(self, model_filename, path_to_jar=None, encoding='utf8',
+ verbose=False, java_options='-mx1000m'):
if not self._JAR:
warnings.warn('The StanfordTagger class is not meant to be '
- 'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')
+ 'instantiated directly. Did you mean '
+ 'StanfordPOSTagger or StanfordNERTagger?')
self._stanford_jar = find_jar(
self._JAR, path_to_jar,
searchpath=(), url=_stanford_url,
verbose=verbose)
self._stanford_model = find_file(model_filename,
- env_vars=('STANFORD_MODELS',), verbose=verbose)
-
- # Adding logging jar files to classpath
- stanford_dir = os.path.split(self._stanford_jar)[0]
- self._stanford_jar = tuple(find_jars_within_path(stanford_dir))
-
+ env_vars=('STANFORD_MODELS',),
+ verbose=verbose)
+
self._encoding = encoding
self.java_options = java_options
@property
+ @abstractmethod
def _cmd(self):
- raise NotImplementedError
+ """
+ A property that returns the command that will be executed.
+ """
def tag(self, tokens):
- # This function should return list of tuple rather than list of list
- return sum(self.tag_sents([tokens]), [])
+        # This function should return a list of tuples rather than a list of lists
+ return sum(self.tag_sents([tokens]), [])
def tag_sents(self, sentences):
encoding = self._encoding
@@ -80,29 +85,29 @@ class StanfordTagger(TaggerI):
cmd = list(self._cmd)
cmd.extend(['-encoding', encoding])
-
+
# Write the actual sentences to the temporary input file
_input_fh = os.fdopen(_input_fh, 'wb')
_input = '\n'.join((' '.join(x) for x in sentences))
- if isinstance(_input, compat.text_type) and encoding:
+ if isinstance(_input, text_type) and encoding:
_input = _input.encode(encoding)
_input_fh.write(_input)
_input_fh.close()
-
+
# Run the tagger and get the output
stanpos_output, _stderr = java(cmd, classpath=self._stanford_jar,
- stdout=PIPE, stderr=PIPE)
+ stdout=PIPE, stderr=PIPE)
stanpos_output = stanpos_output.decode(encoding)
-
+
# Delete the temporary file
- os.unlink(self._input_file_path)
+ os.unlink(self._input_file_path)
# Return java configurations to their default values
config_java(options=default_options, verbose=False)
-
+
return self.parse_output(stanpos_output, sentences)
- def parse_output(self, text, sentences = None):
+ def parse_output(self, text, sentences=None):
# Output the tagged sentences
tagged_sentences = []
for tagged_sentence in text.strip().split("\n"):
@@ -113,6 +118,7 @@ class StanfordTagger(TaggerI):
tagged_sentences.append(sentence)
return tagged_sentences
+
class StanfordPOSTagger(StanfordTagger):
"""
A class for pos tagging with Stanford Tagger. The input is the paths to:
@@ -124,11 +130,10 @@ class StanfordPOSTagger(StanfordTagger):
Example:
>>> from nltk.tag import StanfordPOSTagger
- >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') # doctest: +SKIP
- >>> st.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
+ >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
+ >>> st.tag('What is the airspeed of an unladen swallow ?'.split())
[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
"""
-
_SEPARATOR = '_'
_JAR = 'stanford-postagger.jar'
@@ -139,7 +144,9 @@ class StanfordPOSTagger(StanfordTagger):
def _cmd(self):
return ['edu.stanford.nlp.tagger.maxent.MaxentTagger',
'-model', self._stanford_model, '-textFile',
- self._input_file_path, '-tokenize', 'false','-outputFormatOptions', 'keepEmptySentences']
+ self._input_file_path, '-tokenize', 'false',
+ '-outputFormatOptions', 'keepEmptySentences']
+
class StanfordNERTagger(StanfordTagger):
"""
@@ -169,28 +176,39 @@ class StanfordNERTagger(StanfordTagger):
@property
def _cmd(self):
- # Adding -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions tokenizeNLs=false for not using stanford Tokenizer
+        # Add -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions tokenizeNLs=false so that the Stanford tokenizer is not used
return ['edu.stanford.nlp.ie.crf.CRFClassifier',
'-loadClassifier', self._stanford_model, '-textFile',
- self._input_file_path, '-outputFormat', self._FORMAT, '-tokenizerFactory', 'edu.stanford.nlp.process.WhitespaceTokenizer', '-tokenizerOptions','\"tokenizeNLs=false\"']
+ self._input_file_path, '-outputFormat', self._FORMAT,
+ '-tokenizerFactory',
+ 'edu.stanford.nlp.process.WhitespaceTokenizer',
+ '-tokenizerOptions', '\"tokenizeNLs=false\"']
def parse_output(self, text, sentences):
if self._FORMAT == 'slashTags':
- # Joint together to a big list
+            # Join everything together into one big list
tagged_sentences = []
for tagged_sentence in text.strip().split("\n"):
for tagged_word in tagged_sentence.strip().split():
word_tags = tagged_word.strip().split(self._SEPARATOR)
- tagged_sentences.append((''.join(word_tags[:-1]), word_tags[-1]))
-
+ tagged_sentences.append((''.join(word_tags[:-1]),
+ word_tags[-1]))
+
# Separate it according to the input
result = []
- start = 0
+ start = 0
for sent in sentences:
result.append(tagged_sentences[start:start + len(sent)])
- start += len(sent);
- return result
+ start += len(sent)
+ return result
raise NotImplementedError
+def setup_module(module):
+ from nose import SkipTest
+ try:
+ StanfordPOSTagger('english-bidirectional-distsim.tagger')
+ except LookupError:
+ raise SkipTest('Doctests from nltk.tag.stanford are skipped because one \
+ of the stanford jars cannot be found.')
diff --git a/nltk/tag/tnt.py b/nltk/tag/tnt.py
index 47759d3..63db23a 100755
--- a/nltk/tag/tnt.py
+++ b/nltk/tag/tnt.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: TnT Tagger
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Sam Huston <sjh900 at gmail.com>
#
# URL: <http://nltk.org/>
@@ -357,30 +357,24 @@ class TnT(TaggerI):
# if word is known
# compute the set of possible tags
# and their associated log probabilities
- if word in self._wd.conditions():
+ if word in self._wd:
self.known += 1
for (history, curr_sent_logprob) in current_states:
logprobs = []
for t in self._wd[word].keys():
- p_uni = self._uni.freq((t,C))
- p_bi = self._bi[history[-1]].freq((t,C))
- p_tri = self._tri[tuple(history[-2:])].freq((t,C))
- p_wd = self._wd[word][t] / self._uni[(t,C)]
+ tC = (t,C)
+ p_uni = self._uni.freq(tC)
+ p_bi = self._bi[history[-1]].freq(tC)
+ p_tri = self._tri[tuple(history[-2:])].freq(tC)
+ p_wd = self._wd[word][t] / self._uni[tC]
p = self._l1 *p_uni + self._l2 *p_bi + self._l3 *p_tri
p2 = log(p, 2) + log(p_wd, 2)
- logprobs.append(((t,C), p2))
-
-
- # compute the result of appending each tag to this history
- for (tag, logprob) in logprobs:
- new_states.append((history + [tag],
- curr_sent_logprob + logprob))
-
-
-
+ # compute the result of appending each tag to this history
+ new_states.append((history + [tC],
+ curr_sent_logprob + p2))
# otherwise a new word, set of possible tags is unknown
else:
@@ -398,7 +392,7 @@ class TnT(TaggerI):
tag = ('Unk',C)
# otherwise apply the unknown word tagger
- else :
+ else:
[(_w, t)] = list(self._unk.tag([word]))
tag = (t,C)
@@ -407,8 +401,6 @@ class TnT(TaggerI):
new_states = current_states
-
-
# now have computed a set of possible new_states
# sort states by log prob
@@ -420,7 +412,6 @@ class TnT(TaggerI):
if len(new_states) > self._N:
new_states = new_states[:self._N]
-
# compute the tags for the rest of the sentence
# return the best list of tags for the sentence
return self._tagword(sent, new_states)
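
The rewritten beam update above folds the per-tag bookkeeping into a single pass: for each candidate tag it interpolates the unigram, bigram and trigram frequencies with the lambda weights, then adds the log of that interpolation plus the log lexical probability to the running sentence score. A small worked sketch with illustrative numbers (not taken from any corpus):

    from math import log

    l1, l2, l3 = 0.1, 0.3, 0.6                 # stand-ins for self._l1/_l2/_l3
    p_uni, p_bi, p_tri, p_wd = 0.2, 0.4, 0.5, 0.05

    p = l1 * p_uni + l2 * p_bi + l3 * p_tri    # 0.44, interpolated tag probability
    p2 = log(p, 2) + log(p_wd, 2)              # about -5.51, added to curr_sent_logprob
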
diff --git a/nltk/tag/util.py b/nltk/tag/util.py
index eee98e7..5d72f01 100644
--- a/nltk/tag/util.py
+++ b/nltk/tag/util.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Tagger Utilities
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/tbl/__init__.py b/nltk/tbl/__init__.py
index cd56392..a71ca8c 100644
--- a/nltk/tbl/__init__.py
+++ b/nltk/tbl/__init__.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Marcus Uneson <marcus.uneson at gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
diff --git a/nltk/tbl/demo.py b/nltk/tbl/demo.py
index 06d4b1e..fa70972 100644
--- a/nltk/tbl/demo.py
+++ b/nltk/tbl/demo.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Marcus Uneson <marcus.uneson at gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
diff --git a/nltk/tbl/erroranalysis.py b/nltk/tbl/erroranalysis.py
index 72253d7..aaceb01 100644
--- a/nltk/tbl/erroranalysis.py
+++ b/nltk/tbl/erroranalysis.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Marcus Uneson <marcus.uneson at gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
diff --git a/nltk/tbl/feature.py b/nltk/tbl/feature.py
index 1f54245..eb3539b 100644
--- a/nltk/tbl/feature.py
+++ b/nltk/tbl/feature.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Marcus Uneson <marcus.uneson at gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
@@ -9,8 +9,11 @@
# For license information, see LICENSE.TXT
from __future__ import division, print_function, unicode_literals
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+@add_metaclass(ABCMeta)
class Feature(object):
"""
An abstract base class for Features. A Feature is a combination of
@@ -29,9 +32,6 @@ class Feature(object):
to the classname.
"""
- # !!FOR_FUTURE: when targeting python3 only, consider @abc.abstractmethod
- # and metaclass=abc.ABCMeta rather than NotImplementedError
- # http://julien.danjou.info/blog/2013/guide-python-static-class-abstract-methods
json_tag = 'nltk.tbl.Feature'
PROPERTY_NAME = None
@@ -245,6 +245,7 @@ class Feature(object):
return self < other or self == other
@staticmethod
+ @abstractmethod
def extract_property(tokens, index):
"""
Any subclass of Feature must define static method extract_property(tokens, index)
@@ -256,6 +257,3 @@ class Feature(object):
:return: feature value
:rtype: any (but usually scalar)
"""
- raise NotImplementedError
-
-
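
The hunks above replace the old raise-NotImplementedError convention with a real abstract base class, using six.add_metaclass so the same declaration works on Python 2 and 3. A self-contained sketch of the pattern (class names are illustrative, not the real Feature/TagRule):

    from abc import ABCMeta, abstractmethod
    from six import add_metaclass

    @add_metaclass(ABCMeta)          # portable spelling of metaclass=ABCMeta
    class BaseRule(object):
        @abstractmethod
        def applies(self, tokens, index):
            """Concrete subclasses must override this."""

    class AlwaysRule(BaseRule):
        def applies(self, tokens, index):
            return True

    # BaseRule() now raises TypeError at instantiation time; AlwaysRule() works as before.
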
diff --git a/nltk/tbl/rule.py b/nltk/tbl/rule.py
index d039718..7c5c3f2 100644
--- a/nltk/tbl/rule.py
+++ b/nltk/tbl/rule.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Marcus Uneson <marcus.uneson at gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
@@ -9,15 +9,17 @@
# For license information, see LICENSE.TXT
from __future__ import print_function
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
from nltk.compat import python_2_unicode_compatible, unicode_repr
from nltk import jsontags
+
######################################################################
# Tag Rules
######################################################################
-
-
+@add_metaclass(ABCMeta)
class TagRule(object):
"""
An interface for tag transformations on a tagged corpus, as
@@ -71,6 +73,7 @@ class TagRule(object):
return change
+ @abstractmethod
def applies(self, tokens, index):
"""
:return: True if the rule would change the tag of
@@ -81,7 +84,6 @@ class TagRule(object):
:param index: The index to check
:type index: int
"""
- raise NotImplementedError
# Rules must be comparable and hashable for the algorithm to work
def __eq__(self, other):
@@ -317,5 +319,3 @@ class Rule(TagRule):
condition_to_str(f, v) for (f, v) in self._conditions
)
return replacement + conditions
-
-
diff --git a/nltk/tbl/template.py b/nltk/tbl/template.py
index cb66129..38db64b 100644
--- a/nltk/tbl/template.py
+++ b/nltk/tbl/template.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Marcus Uneson <marcus.uneson at gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
@@ -9,19 +9,21 @@
# For license information, see LICENSE.TXT
from __future__ import print_function
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
import itertools as it
from nltk.tbl.feature import Feature
+from nltk.tbl.rule import Rule
+@add_metaclass(ABCMeta)
class BrillTemplateI(object):
"""
An interface for generating lists of transformational rules that
apply at given sentence positions. ``BrillTemplateI`` is used by
``Brill`` training algorithms to generate candidate rules.
"""
- #!!FOR_FUTURE: when targeting python3 only, consider @abc.abstractmethod
- # and metaclass=abc.ABCMeta rather than NotImplementedError
- #http://julien.danjou.info/blog/2013/guide-python-static-class-abstract-methods
+ @abstractmethod
def applicable_rules(self, tokens, i, correctTag):
"""
Return a list of the transformational rules that would correct
@@ -41,8 +43,8 @@ class BrillTemplateI(object):
:type correctTag: any
:rtype: list(BrillRule)
"""
- raise NotImplementedError
+ @abstractmethod
def get_neighborhood(self, token, index):
"""
Returns the set of indices *i* such that
@@ -57,10 +59,6 @@ class BrillTemplateI(object):
:type index: int
:rtype: set
"""
- raise NotImplementedError
-
-
-from nltk.tbl.rule import Rule
class Template(BrillTemplateI):
@@ -75,8 +73,8 @@ class Template(BrillTemplateI):
- are applicable to the given token.
"""
ALLTEMPLATES = []
- #record a unique id of form "001", for each template created
-# _ids = it.count(0)
+ # record a unique id of form "001", for each template created
+ # _ids = it.count(0)
def __init__(self, *features):
@@ -129,9 +127,9 @@ class Template(BrillTemplateI):
:type features: list of Features
:param features: the features to build this Template on
"""
- #determine the calling form: either
- #Template(Feature, args1, [args2, ...)]
- #Template(Feature1(args), Feature2(args), ...)
+ # determine the calling form: either
+ # Template(Feature, args1, [args2, ...)]
+ # Template(Feature1(args), Feature2(args), ...)
if all(isinstance(f, Feature) for f in features):
self._features = features
elif issubclass(features[0], Feature) and all(isinstance(a, tuple) for a in features[1:]):
@@ -267,13 +265,13 @@ class Template(BrillTemplateI):
"""
def nonempty_powerset(xs): #xs is a list
- #itertools docnonempty_powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
+ # itertools doc: nonempty_powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
- #find the correct tuple given combinations, one of {None, k, (k1,k2)}
+ # find the correct tuple given combinations, one of {None, k, (k1,k2)}
k = combinations #for brevity
- combrange = ((1, len(xs)+1) if k is None else #n over 1 .. n over n (all non-empty combinations)
- (k, k+1) if isinstance(k, int) else #n over k (only
- (k[0], k[1]+1)) #n over k1, n over k1+1... n over k2
+ combrange = ((1, len(xs)+1) if k is None else # n over 1 .. n over n (all non-empty combinations)
+ (k, k+1) if isinstance(k, int) else # n over k (only
+ (k[0], k[1]+1)) # n over k1, n over k1+1... n over k2
return it.chain.from_iterable(it.combinations(xs, r)
for r in range(*combrange))
seentemplates = set()
@@ -281,7 +279,7 @@ class Template(BrillTemplateI):
for pick in it.product(*picks):
if any(i != j and x.issuperset(y)
for (i, x) in enumerate(pick)
- for (j,y) in enumerate(pick)):
+ for (j, y) in enumerate(pick)):
continue
if skipintersecting and any(i != j and x.intersects(y)
for (i, x) in enumerate(pick)
@@ -303,6 +301,3 @@ class Template(BrillTemplateI):
@classmethod
def _poptemplate(cls):
return cls.ALLTEMPLATES.pop() if cls.ALLTEMPLATES else None
-
-
-
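
The combrange expression tidied above decides how many features each generated template may combine. A standalone sketch, lifted from the logic in the hunk, showing what it yields for the three accepted forms of combinations (None, an int k, or a (k1, k2) pair):

    import itertools as it

    def nonempty_powerset(xs, combinations=None):
        k = combinations
        combrange = ((1, len(xs) + 1) if k is None else      # all non-empty sizes
                     (k, k + 1) if isinstance(k, int) else   # exactly k
                     (k[0], k[1] + 1))                       # sizes k1..k2
        return it.chain.from_iterable(it.combinations(xs, r)
                                      for r in range(*combrange))

    list(nonempty_powerset([1, 2, 3]))
    # [(1,), (2,), (3,), (1, 2), (1, 3), (2, 3), (1, 2, 3)]
    list(nonempty_powerset([1, 2, 3], combinations=2))
    # [(1, 2), (1, 3), (2, 3)]
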
diff --git a/nltk/test/__init__.py b/nltk/test/__init__.py
index 91ed07a..9debad8 100644
--- a/nltk/test/__init__.py
+++ b/nltk/test/__init__.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Unit Tests
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/test/bnc.doctest b/nltk/test/bnc.doctest
index 242a999..b84491e 100644
--- a/nltk/test/bnc.doctest
+++ b/nltk/test/bnc.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
>>> import os.path
diff --git a/nltk/test/ccg.doctest b/nltk/test/ccg.doctest
index 0cc13d4..65116b9 100644
--- a/nltk/test/ccg.doctest
+++ b/nltk/test/ccg.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
==============================
diff --git a/nltk/test/ccg_semantics.doctest b/nltk/test/ccg_semantics.doctest
index 81e4290..649c86d 100644
--- a/nltk/test/ccg_semantics.doctest
+++ b/nltk/test/ccg_semantics.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
==============================================
diff --git a/nltk/test/chat80.doctest b/nltk/test/chat80.doctest
index 64b2f38..13dd70f 100644
--- a/nltk/test/chat80.doctest
+++ b/nltk/test/chat80.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
=======
diff --git a/nltk/test/chunk.doctest b/nltk/test/chunk.doctest
index d877083..3cd4405 100644
--- a/nltk/test/chunk.doctest
+++ b/nltk/test/chunk.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
==========
diff --git a/nltk/test/classify.doctest b/nltk/test/classify.doctest
index 3f00b1d..986c9b8 100644
--- a/nltk/test/classify.doctest
+++ b/nltk/test/classify.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
=============
diff --git a/nltk/test/collocations.doctest b/nltk/test/collocations.doctest
index 01f8c23..19a04d4 100644
--- a/nltk/test/collocations.doctest
+++ b/nltk/test/collocations.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
==============
diff --git a/nltk/test/compat.doctest b/nltk/test/compat.doctest
index ecf5a9a..1d668f3 100644
--- a/nltk/test/compat.doctest
+++ b/nltk/test/compat.doctest
@@ -9,10 +9,6 @@ NLTK comes with a Python 2.x/3.x compatibility layer, nltk.compat
>>> from nltk import compat
>>> compat.PY3
False
- >>> compat.integer_types
- (<type 'int'>, <type 'long'>)
- >>> compat.string_types
- (<type 'basestring'>,)
>>> # and so on
@python_2_unicode_compatible
diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest
index 3976774..b658e53 100644
--- a/nltk/test/corpus.doctest
+++ b/nltk/test/corpus.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
================
@@ -372,50 +372,6 @@ Reading the CoNLL 2007 Dependency Treebanks:
(frente_a , (0,9349_dólares los (de (mañana esta)))))))
.)
-NLTK also provides a corpus reader for the York-Toronto-Helsinki
-Parsed Corpus of Old English Prose (YCOE); but the corpus itself is
-not included in the NLTK data package. If you install it yourself,
-you can use NLTK to access it:
-
- >>> from nltk.corpus import ycoe
- >>> for tree in ycoe.parsed_sents('cocuraC')[:4]:
- ... print(tree) # doctest: +SKIP
- (CP-THT
- (C +D+atte)
- (IP-SUB ...)
- ...
- (. .))
- (IP-MAT
- (IP-MAT-0
- (PP (P On) (NP (ADJ o+dre) (N wisan)))...)
- ...
- (. .))
- (IP-MAT
- (NP-NOM-x-2 *exp*)
- (NP-DAT-1 (D^D +D+am) (ADJ^D unge+dyldegum))
- ...
- (. .))
- (IP-MAT
- (ADVP (ADV Sw+a))
- (NP-NOM-x (PRO^N hit))
- (ADVP-TMP (ADV^T oft))
- ...
- (. .))
-
-If the YCOE corpus is not available, you will get an error message
-when you try to access it:
-
- >>> from nltk.corpus import ycoe
- >>> print(ycoe) # doctest: +SKIP
- Traceback (most recent call last):
- LookupError:
- **********************************************************************
- Resource 'corpora/ycoe' not found. For installation
- instructions, please see <http://nltk.org/index.php/Installation>.
- Searched in:
- - ...
- **********************************************************************
-
Word Lists and Lexicons
=======================
diff --git a/nltk/test/crubadan.doctest b/nltk/test/crubadan.doctest
index d485ffc..5d2c6d4 100644
--- a/nltk/test/crubadan.doctest
+++ b/nltk/test/crubadan.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
Crubadan Corpus Reader
diff --git a/nltk/test/data.doctest b/nltk/test/data.doctest
index 9cc7663..7a8ddc9 100644
--- a/nltk/test/data.doctest
+++ b/nltk/test/data.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
=========================================
diff --git a/nltk/test/dependency.doctest b/nltk/test/dependency.doctest
index 80607ee..97a7300 100755
--- a/nltk/test/dependency.doctest
+++ b/nltk/test/dependency.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
===================
diff --git a/nltk/test/discourse.doctest b/nltk/test/discourse.doctest
index e84dec0..befbd4e 100644
--- a/nltk/test/discourse.doctest
+++ b/nltk/test/discourse.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
==================
diff --git a/nltk/test/drt.doctest b/nltk/test/drt.doctest
index 8f73283..f442427 100644
--- a/nltk/test/drt.doctest
+++ b/nltk/test/drt.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
================================
diff --git a/nltk/test/featgram.doctest b/nltk/test/featgram.doctest
index ded2f8c..c10ad89 100644
--- a/nltk/test/featgram.doctest
+++ b/nltk/test/featgram.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
=========================
diff --git a/nltk/test/featstruct.doctest b/nltk/test/featstruct.doctest
index 55e5eb8..293c3aa 100644
--- a/nltk/test/featstruct.doctest
+++ b/nltk/test/featstruct.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
==================================
@@ -334,7 +334,7 @@ base value types that have custom unification methods. For example,
the following feature value type encodes a range, and defines
unification as taking the intersection on the ranges:
- >>> from nltk.compat import total_ordering
+ >>> from functools import total_ordering
>>> from nltk.featstruct import CustomFeatureValue, UnificationFailure
>>> @total_ordering
... class Range(CustomFeatureValue):
@@ -1227,4 +1227,3 @@ is present.)
..
>>> del fs1, fs2 # clean-up
-
diff --git a/nltk/test/framenet.doctest b/nltk/test/framenet.doctest
index e1b1681..5776ad5 100644
--- a/nltk/test/framenet.doctest
+++ b/nltk/test/framenet.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
========
@@ -70,29 +70,40 @@ that pattern:
>>> from pprint import pprint
>>> from nltk.corpus import framenet as fn
- >>> len(fn.frames())
- 1019
- >>> pprint(fn.frames(r'(?i)medical'))
- [<frame ID=256 name=Medical_specialties>, <frame ID=257 name=Medical_instruments>, ...]
+ >>> x = fn.frames(r'(?i)crim')
+ >>> x.sort(key=lambda f: f.ID)
+ >>> x
+ [<frame ID=200 name=Criminal_process>, <frame ID=500 name=Criminal_investigation>, ...]
To get the details of a particular Frame, you can use the `frame()`
function passing in the frame number:
>>> from pprint import pprint
>>> from nltk.corpus import framenet as fn
- >>> f = fn.frame(256)
+ >>> f = fn.frame(202)
>>> f.ID
- 256
+ 202
>>> f.name
- 'Medical_specialties'
+ 'Arrest'
>>> f.definition # doctest: +ELLIPSIS
- "This frame includes words that name ..."
+ "Authorities charge a Suspect, who is under suspicion of having committed a crime..."
>>> len(f.lexUnit)
- 29
+ 11
>>> pprint(sorted([x for x in f.FE]))
- ['Affliction', 'Body_system', 'Specialty', 'Type']
+ ['Authorities',
+ 'Charges',
+ 'Co-participant',
+ 'Manner',
+ 'Means',
+ 'Offense',
+ 'Place',
+ 'Purpose',
+ 'Source_of_legal_authority',
+ 'Suspect',
+ 'Time',
+ 'Type']
>>> pprint(f.frameRelations)
- [<Parent=Cure -- Using -> Child=Medical_specialties>]
+ [<Parent=Intentionally_affect -- Inheritance -> Child=Arrest>, <Complex=Criminal_process -- Subframe -> Component=Arrest>, ...]
The `frame()` function shown above returns a dict object containing
detailed information about the Frame. See the documentation on the
@@ -106,8 +117,8 @@ expression. Note that LU names are composed of "lemma.POS", where the
multiple lexemes (e.g. 'a little') (see below).
>>> from nltk.corpus import framenet as fn
- >>> fn.frames_by_lemma(r'(?i)a little')
- [<frame ID=189 name=Quantity>, <frame ID=2001 name=Degree>]
+ >>> fn.frames_by_lemma(r'(?i)a little') # doctest: +ELLIPSIS
+ [<frame ID=189 name=Quanti...>, <frame ID=2001 name=Degree>]
-------------
Lexical Units
@@ -176,8 +187,6 @@ pattern that will be matched against the name of the lexical unit:
>>> from pprint import pprint
>>> from nltk.corpus import framenet as fn
- >>> len(fn.lus())
- 11829
>>> pprint(fn.lus(r'(?i)a little'))
[<lu ID=14744 name=a little bit.adv>, <lu ID=14733 name=a little.n>, ...]
@@ -221,19 +230,57 @@ Annotated Documents
-------------------
The FrameNet corpus contains a small set of annotated documents. A list
-of these documents can be obtained by calling the `documents()` function:
+of these documents can be obtained by calling the `docs()` function:
>>> from pprint import pprint
>>> from nltk.corpus import framenet as fn
- >>> docs = fn.documents()
- >>> len(docs)
- 78
- >>> pprint(sorted(docs[0].keys()))
- ['ID', 'corpid', 'corpname', 'description', 'filename']
-
-Detailed information about each sentence contained in each document can
-be obtained by calling the `annotated_document()` function and supplying
-the 'ID' number of the document. For detailed information about the info
-that is for each document, see the documentation on the
-`annotated_document()` function.
-
+ >>> d = fn.docs('BellRinging')[0]
+ >>> d.corpname
+ 'PropBank'
+ >>> d.sentence[49] # doctest: +ELLIPSIS
+ full-text sentence (...) in BellRinging:
+ <BLANKLINE>
+ <BLANKLINE>
+ [POS] 17 tags
+ <BLANKLINE>
+ [POS_tagset] PENN
+ <BLANKLINE>
+ [text] + [annotationSet]
+ <BLANKLINE>
+ `` I live in hopes that the ringers themselves will be drawn into
+ ***** ******* *****
+ Desir Cause_t Cause
+ [1] [3] [2]
+ <BLANKLINE>
+ that fuller life .
+ ******
+ Comple
+ [4]
+ (Desir=Desiring, Cause_t=Cause_to_make_noise, Cause=Cause_motion, Comple=Completeness)
+ <BLANKLINE>
+
+ >>> d.sentence[49].annotationSet[1] # doctest: +ELLIPSIS
+ annotation set (...):
+ <BLANKLINE>
+ [status] MANUAL
+ <BLANKLINE>
+ [LU] (6605) hope.n in Desiring
+ <BLANKLINE>
+ [frame] (366) Desiring
+ <BLANKLINE>
+ [GF] 2 relations
+ <BLANKLINE>
+ [PT] 2 phrases
+ <BLANKLINE>
+ [text] + [Target] + [FE] + [Noun]
+ <BLANKLINE>
+ `` I live in hopes that the ringers themselves will be drawn into
+ - ^^^^ ^^ ***** ----------------------------------------------
+ E supp su Event
+ <BLANKLINE>
+ that fuller life .
+ -----------------
+ <BLANKLINE>
+ (E=Experiencer, su=supp)
+ <BLANKLINE>
+ <BLANKLINE>
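
The doctest above switches from the removed documents()/annotated_document() pair to the new docs() accessor, which returns fully parsed document objects. A short usage sketch consistent with the doctest output (attribute names are taken from that output and may not cover the reader's full API):

    from nltk.corpus import framenet as fn

    doc = fn.docs('BellRinging')[0]       # first document whose name matches
    print(doc.corpname)                   # 'PropBank'
    sent = doc.sentence[49]               # a full-text sentence object
    print(len(sent.annotationSet))        # POS layer plus one set per annotated target
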
diff --git a/nltk/test/generate.doctest b/nltk/test/generate.doctest
index 3536444..d6e006e 100644
--- a/nltk/test/generate.doctest
+++ b/nltk/test/generate.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
===============================================
diff --git a/nltk/test/gensim.doctest b/nltk/test/gensim.doctest
index eae4a6b..79a9af8 100644
--- a/nltk/test/gensim.doctest
+++ b/nltk/test/gensim.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
=======================================
@@ -35,7 +35,7 @@ For example, to compute the cosine similarity between 2 words:
>>> new_model.similarity('university','school') > 0.3
True
-
+
---------------------------
Using the pre-trained model
---------------------------
@@ -45,8 +45,8 @@ The full model is from https://code.google.com/p/word2vec/ (about 3 GB).
>>> from nltk.data import find
>>> word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
- >>> model = gensim.models.Word2Vec.load_word2vec_format(word2vec_sample, binary=False)
-
+ >>> model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)
+
We pruned the model to only include the most common words (~44k words).
>>> len(model.vocab)
@@ -56,12 +56,12 @@ Each word is represented in the space of 300 dimensions:
>>> len(model['university'])
300
-
+
Finding the top n words that are similar to a target word is simple. The result is the list of n words with the score.
>>> model.most_similar(positive=['university'], topn = 3)
[(u'universities', 0.70039...), (u'faculty', 0.67809...), (u'undergraduate', 0.65870...)]
-
+
Finding a word that is not in a list is also supported, although, implementing this by yourself is simple.
>>> model.doesnt_match('breakfast cereal dinner lunch'.split())
@@ -138,4 +138,3 @@ We use this code to get the `word2vec_sample` model.
| f.write('{} {}\n'.format(word, ' '.join(str(value) for value in model[word])))
|
| f.close()
-
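
The loader call above changes because newer gensim releases expose the word2vec text/binary loaders on KeyedVectors rather than on Word2Vec. A hedged reminder sketch (the sample path comes from the doctest; behaviour assumes gensim >= 1.0):

    import gensim
    from nltk.data import find

    word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
    model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)
    print(model.similarity('university', 'school'))   # cosine similarity, > 0.3 per the doctest
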
diff --git a/nltk/test/gluesemantics.doctest b/nltk/test/gluesemantics.doctest
index f8d6b47..e307219 100644
--- a/nltk/test/gluesemantics.doctest
+++ b/nltk/test/gluesemantics.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
==============================================================================
@@ -343,13 +343,12 @@ Dependency Graph to Glue Formulas
... 4 dog _ NN NN _ 2 OBJ _ _
... """)
>>> gfl = GlueDict('nltk:grammars/sample_grammars/glue.semtype').to_glueformula_list(depgraph)
- >>> for gf in sorted(gfl):
- ... print(gf)
- \P Q.exists x.(P(x) & Q(x)) : ((fv -o fr) -o ((f -o F2) -o F2))
- \P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I5) -o I5))
- \x y.sees(x,y) : (f -o (i -o g))
- \x.John(x) : (fv -o fr)
- \x.dog(x) : (iv -o ir)
+ >>> print(gfl) # doctest: +SKIP
+ [\x y.sees(x,y) : (f -o (i -o g)),
+ \x.dog(x) : (iv -o ir),
+ \P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I3) -o I3)),
+ \P Q.exists x.(P(x) & Q(x)) : ((fv -o fr) -o ((f -o F4) -o F4)),
+ \x.John(x) : (fv -o fr)]
>>> glue = Glue()
>>> for r in sorted([r.simplify().normalize() for r in glue.get_readings(glue.gfl_to_compiled(gfl))], key=str):
... print(r)
@@ -375,12 +374,11 @@ Dependency Graph to LFG f-structure
---------------------------------
LFG f-structure to Glue
---------------------------------
- >>> for gf in sorted(fstruct.to_glueformula_list(GlueDict('nltk:grammars/sample_grammars/glue.semtype'))):
- ... print(gf)
- \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G5) -o G5))
- \P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I2) -o I2))
- \x y.sees(x,y) : (i -o (g -o f))
- \x.John(x) : (iv -o ir)
- \x.dog(x) : (gv -o gr)
+ >>> fstruct.to_glueformula_list(GlueDict('nltk:grammars/sample_grammars/glue.semtype')) # doctest: +SKIP
+ [\x y.sees(x,y) : (i -o (g -o f)),
+ \x.dog(x) : (gv -o gr),
+ \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G3) -o G3)),
+ \P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I4) -o I4)),
+ \x.John(x) : (iv -o ir)]
.. see gluesemantics_malt.doctest for more
diff --git a/nltk/test/gluesemantics_malt.doctest b/nltk/test/gluesemantics_malt.doctest
index 308d0ec..fc3df2b 100644
--- a/nltk/test/gluesemantics_malt.doctest
+++ b/nltk/test/gluesemantics_malt.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
.. see also: gluesemantics.doctest
diff --git a/nltk/test/grammar.doctest b/nltk/test/grammar.doctest
index de232f1..f265ff4 100644
--- a/nltk/test/grammar.doctest
+++ b/nltk/test/grammar.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
===============
diff --git a/nltk/test/grammartestsuites.doctest b/nltk/test/grammartestsuites.doctest
index 731ae4a..cbff6b8 100644
--- a/nltk/test/grammartestsuites.doctest
+++ b/nltk/test/grammartestsuites.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
==========================
diff --git a/nltk/test/index.doctest b/nltk/test/index.doctest
index a039849..d328f93 100644
--- a/nltk/test/index.doctest
+++ b/nltk/test/index.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
.. _align howto: align.html
diff --git a/nltk/test/inference.doctest b/nltk/test/inference.doctest
index 962e0d5..6836e9b 100644
--- a/nltk/test/inference.doctest
+++ b/nltk/test/inference.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
====================================
diff --git a/nltk/test/internals.doctest b/nltk/test/internals.doctest
index f6fc2a2..f5f4c5e 100644
--- a/nltk/test/internals.doctest
+++ b/nltk/test/internals.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
==========================================
diff --git a/nltk/test/japanese.doctest b/nltk/test/japanese.doctest
index 08dac17..2ca6caf 100644
--- a/nltk/test/japanese.doctest
+++ b/nltk/test/japanese.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
============================
diff --git a/nltk/test/logic.doctest b/nltk/test/logic.doctest
index 2dffc3a..c340181 100644
--- a/nltk/test/logic.doctest
+++ b/nltk/test/logic.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
=======================
diff --git a/nltk/test/metrics.doctest b/nltk/test/metrics.doctest
index 36ee980..9541b59 100644
--- a/nltk/test/metrics.doctest
+++ b/nltk/test/metrics.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
=======
@@ -62,7 +62,7 @@ Other distance measures:
>>> print(jaccard_distance(s1, s2))
0.6
>>> print(masi_distance(s1, s2))
- 0.868...
+ 0.198
----------------------
Miscellaneous Measures
@@ -239,7 +239,7 @@ For other associations, we ensure the ordering of the measures:
True
>>> bam.dice(20, (42, 20), N) > bam.dice(20, (41, 27), N)
True
- >>> bam.fisher(20, (42, 20), N) > bam.fisher(20, (41, 27), N)
+ >>> bam.fisher(20, (42, 20), N) > bam.fisher(20, (41, 27), N) # doctest: +SKIP
False
For trigrams, we have to provide more count information:
diff --git a/nltk/test/misc.doctest b/nltk/test/misc.doctest
index 6555519..98bcbc1 100644
--- a/nltk/test/misc.doctest
+++ b/nltk/test/misc.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
--------------------------------------------------------------------------------
diff --git a/nltk/test/nonmonotonic.doctest b/nltk/test/nonmonotonic.doctest
index 73ed059..8e9f982 100644
--- a/nltk/test/nonmonotonic.doctest
+++ b/nltk/test/nonmonotonic.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
======================
diff --git a/nltk/test/parse.doctest b/nltk/test/parse.doctest
index d9dac28..11e6c27 100644
--- a/nltk/test/parse.doctest
+++ b/nltk/test/parse.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
=========
diff --git a/nltk/test/portuguese_en.doctest b/nltk/test/portuguese_en.doctest
index 83087af..bc56cf9 100644
--- a/nltk/test/portuguese_en.doctest
+++ b/nltk/test/portuguese_en.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
==================================
diff --git a/nltk/test/probability.doctest b/nltk/test/probability.doctest
index 54d4e93..58bd007 100644
--- a/nltk/test/probability.doctest
+++ b/nltk/test/probability.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
===========
diff --git a/nltk/test/propbank.doctest b/nltk/test/propbank.doctest
index 4d3cca3..90e572c 100644
--- a/nltk/test/propbank.doctest
+++ b/nltk/test/propbank.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
========
diff --git a/nltk/test/relextract.doctest b/nltk/test/relextract.doctest
index 758c42c..23348c5 100644
--- a/nltk/test/relextract.doctest
+++ b/nltk/test/relextract.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
======================
diff --git a/nltk/test/resolution.doctest b/nltk/test/resolution.doctest
index a714677..62027ab 100644
--- a/nltk/test/resolution.doctest
+++ b/nltk/test/resolution.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
=========================
diff --git a/nltk/test/runtests.py b/nltk/test/runtests.py
index 13d8089..ef31865 100755
--- a/nltk/test/runtests.py
+++ b/nltk/test/runtests.py
@@ -16,17 +16,26 @@ NLTK_TEST_DIR = os.path.join(NLTK_ROOT, 'nltk')
if __name__ == '__main__':
# there shouldn't be import from NLTK for coverage to work properly
from doctest_nose_plugin import DoctestFix
+ try:
+ # Import RedNose plugin for colored test output
+ from rednose import RedNose
+ rednose_available = True
+ except ImportError:
+ rednose_available = False
class NltkPluginManager(PluginManager):
"""
Nose plugin manager that replaces standard doctest plugin
- with a patched version.
+ with a patched version and adds RedNose plugin for colored test output.
"""
def loadPlugins(self):
for plug in builtin.plugins:
if plug != Doctest:
self.addPlugin(plug())
self.addPlugin(DoctestFix())
+ if rednose_available:
+ self.addPlugin(RedNose())
+
super(NltkPluginManager, self).loadPlugins()
manager = NltkPluginManager()
@@ -47,6 +56,13 @@ if __name__ == '__main__':
# only extra options were passed
args += [NLTK_TEST_DIR]
+ # Activate RedNose and hide skipped test messages from output
+ if rednose_available:
+ args += [
+ '--rednose',
+ '--hide-skips'
+ ]
+
arguments = [
'--exclude=', # why is this needed?
#'--with-xunit',
diff --git a/nltk/test/semantics.doctest b/nltk/test/semantics.doctest
index 2aa9d6c..1f04cf3 100644
--- a/nltk/test/semantics.doctest
+++ b/nltk/test/semantics.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
=========
diff --git a/nltk/test/sentiment.doctest b/nltk/test/sentiment.doctest
index e6e9430..400feb3 100644
--- a/nltk/test/sentiment.doctest
+++ b/nltk/test/sentiment.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
===================
@@ -128,8 +128,8 @@ Vader
... under orders and in the ''least offensive way possible.''"
... ]
>>> sentences.extend(tricky_sentences)
- >>> sid = SentimentIntensityAnalyzer()
>>> for sentence in sentences:
+ ... sid = SentimentIntensityAnalyzer()
... print(sentence)
... ss = sid.polarity_scores(sentence)
... for k in sorted(ss):
@@ -230,4 +230,4 @@ Vader
the twin towers collapsed today
compound: -0.2732, neg: 0.344, neu: 0.656, pos: 0.0,
However, Mr. Carter solemnly argues, his client carried out the kidnapping under orders and in the ''least offensive way possible.''
- compound: -0.5859, neg: 0.23, neu: 0.697, pos: 0.074,
\ No newline at end of file
+ compound: -0.5859, neg: 0.23, neu: 0.697, pos: 0.074,
diff --git a/nltk/test/sentiwordnet.doctest b/nltk/test/sentiwordnet.doctest
index e032482..c0aed1c 100644
--- a/nltk/test/sentiwordnet.doctest
+++ b/nltk/test/sentiwordnet.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
======================
diff --git a/nltk/test/simple.doctest b/nltk/test/simple.doctest
index 073169a..d067d5f 100644
--- a/nltk/test/simple.doctest
+++ b/nltk/test/simple.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
=================
diff --git a/nltk/test/stem.doctest b/nltk/test/stem.doctest
index 4f99072..d8427af 100644
--- a/nltk/test/stem.doctest
+++ b/nltk/test/stem.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
==========
diff --git a/nltk/test/tag.doctest b/nltk/test/tag.doctest
index 1b888c7..89b3f9a 100644
--- a/nltk/test/tag.doctest
+++ b/nltk/test/tag.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
Regression Tests
diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest
index 8981c6d..9d0d668 100644
--- a/nltk/test/tokenize.doctest
+++ b/nltk/test/tokenize.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
>>> from __future__ import print_function
@@ -40,6 +40,19 @@ Some test strings.
>>> word_tokenize(s10)
['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.']
+
+Testing improvement made to the TreebankWordTokenizer
+
+ >>> sx1 = u'\xabNow that I can do.\xbb'
+ >>> expected = [u'\xab', u'Now', u'that', u'I', u'can', u'do', u'.', u'\xbb']
+ >>> word_tokenize(sx1) == expected
+ True
+ >>> sx2 = u'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.'
+ >>> expected = [u'The', u'unicode', u'201C', u'and', u'201D', u'\u201c', u'LEFT', u'(', u'RIGHT', u')', u'DOUBLE', u'QUOTATION', u'MARK', u'\u201d', u'is', u'also', u'OPEN_PUNCT', u'and', u'CLOSE_PUNCT', u'.']
+ >>> word_tokenize(sx2) == expected
+ True
+
+
Sentence tokenization in word_tokenize:
>>> s11 = "I called Dr. Jones. I called Dr. Jones."
@@ -163,4 +176,53 @@ It should not hang on long sequences of the same punctuation character.
>>> tknzr = TweetTokenizer()
>>> s10 = "Photo: Aujourd'hui sur http://t.co/0gebOFDUzn Projet... http://t.co/bKfIUbydz2.............................. http://fb.me/3b6uXpz0L"
>>> tknzr.tokenize(s10)
- [u'Photo', u':', u"Aujourd'hui", u'sur', u'http://t.co/0gebOFDUzn', u'Projet', u'...', u'http://t.co/bKfIUbydz2', u'...', u'http://fb.me/3b6uXpz0L']
\ No newline at end of file
+ [u'Photo', u':', u"Aujourd'hui", u'sur', u'http://t.co/0gebOFDUzn', u'Projet', u'...', u'http://t.co/bKfIUbydz2', u'...', u'http://fb.me/3b6uXpz0L']
+
+
+Regression Tests: PunktSentenceTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The sentence splitter should remove whitespace following the sentence boundary.
+
+ >>> pst = PunktSentenceTokenizer()
+ >>> pst.tokenize('See Section 3). Or Section 2). ')
+ ['See Section 3).', 'Or Section 2).']
+ >>> pst.tokenize('See Section 3.) Or Section 2.) ')
+ ['See Section 3.)', 'Or Section 2.)']
+ >>> pst.tokenize('See Section 3.) Or Section 2.) ', realign_boundaries=False)
+ ['See Section 3.', ') Or Section 2.', ')']
+
+
+Regression Tests: align_tokens
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Post-hoc alignment of tokens with a source string
+
+ >>> from nltk.tokenize.util import align_tokens
+ >>> list(align_tokens([''], ""))
+ [(0, 0)]
+ >>> list(align_tokens([''], " "))
+ [(0, 0)]
+ >>> list(align_tokens([], ""))
+ []
+ >>> list(align_tokens([], " "))
+ []
+ >>> list(align_tokens(['a'], "a"))
+ [(0, 1)]
+ >>> list(align_tokens(['abc', 'def'], "abcdef"))
+ [(0, 3), (3, 6)]
+ >>> list(align_tokens(['abc', 'def'], "abc def"))
+ [(0, 3), (4, 7)]
+ >>> list(align_tokens(['ab', 'cd'], "ab cd ef"))
+ [(0, 2), (3, 5)]
+ >>> list(align_tokens(['ab', 'cd', 'ef'], "ab cd ef"))
+ [(0, 2), (3, 5), (6, 8)]
+ >>> list(align_tokens(['ab', 'cd', 'efg'], "ab cd ef"))
+ Traceback (most recent call last):
+ ....
+ ValueError: substring "efg" not found in "ab cd ef"
+ >>> list(align_tokens(['ab', 'cd', 'ef', 'gh'], "ab cd ef"))
+ Traceback (most recent call last):
+ ....
+ ValueError: substring "gh" not found in "ab cd ef"
+ >>> list(align_tokens(['The', 'plane', ',', 'bound', 'for', 'St', 'Petersburg', ',', 'crashed', 'in', 'Egypt', "'s", 'Sinai', 'desert', 'just', '23', 'minutes', 'after', 'take-off', 'from', 'Sharm', 'el-Sheikh', 'on', 'Saturday', '.'], "The plane, bound for St Petersburg, crashed in Egypt's Sinai desert just 23 minutes after take-off from Sharm el-Sheikh on Saturday."))
+ [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23), (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54), (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89), (90, 98), (99, 103), (104, 109), (110, 119), (120, 122), (123, 131), (131, 132)]
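
The new align_tokens regression tests above pin down the expected character offsets, including the empty-token and failure cases. One way such an aligner can be written, consistent with the doctest (a sketch, not necessarily the exact implementation shipped in nltk.tokenize.util):

    def align_tokens(tokens, sentence):
        """Return (start, end) character offsets of each token within sentence."""
        point = 0
        offsets = []
        for token in tokens:
            try:
                start = sentence.index(token, point)
            except ValueError:
                raise ValueError('substring "%s" not found in "%s"' % (token, sentence))
            point = start + len(token)
            offsets.append((start, point))
        return offsets

    align_tokens(['abc', 'def'], "abc def")   # [(0, 3), (4, 7)]
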
diff --git a/nltk/test/toolbox.doctest b/nltk/test/toolbox.doctest
index d1b70fe..d9abb20 100644
--- a/nltk/test/toolbox.doctest
+++ b/nltk/test/toolbox.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
===============================
diff --git a/nltk/test/translate.doctest b/nltk/test/translate.doctest
index 8cf9a47..6826ce3 100644
--- a/nltk/test/translate.doctest
+++ b/nltk/test/translate.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
.. -*- coding: utf-8 -*-
diff --git a/nltk/test/tree.doctest b/nltk/test/tree.doctest
index 792dc97..46b29fb 100644
--- a/nltk/test/tree.doctest
+++ b/nltk/test/tree.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
===============================
diff --git a/nltk/test/treeprettyprinter.doctest b/nltk/test/treeprettyprinter.doctest
index 7e8d315..39b9ef4 100644
--- a/nltk/test/treeprettyprinter.doctest
+++ b/nltk/test/treeprettyprinter.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
========================================================
diff --git a/nltk/test/treetransforms.doctest b/nltk/test/treetransforms.doctest
index eb636bc..f5e4a02 100644
--- a/nltk/test/treetransforms.doctest
+++ b/nltk/test/treetransforms.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
-------------------------------------------
diff --git a/nltk/test/unit/test_aline.py b/nltk/test/unit/test_aline.py
new file mode 100644
index 0000000..40bffce
--- /dev/null
+++ b/nltk/test/unit/test_aline.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+"""
+Unit tests for nltk.metrics.aline
+"""
+
+from __future__ import unicode_literals
+
+import unittest
+
+from nltk.metrics import aline
+
+class TestAline(unittest.TestCase):
+ """
+ Test Aline algorithm for aligning phonetic sequences
+ """
+
+ def test_aline(self):
+ result = aline.align('θin', 'tenwis')
+ expected = [[('θ', 't'), ('i', 'e'), ('n', 'n'), ('-', 'w'), ('-', 'i'), ('-', 's')]]
+
+ self.assertEqual(result, expected)
+
+ result = aline.align('jo', 'ʒə')
+ expected = [[('j', 'ʒ'), ('o', 'ə')]]
+
+ self.assertEqual(result, expected)
+
+ result = aline.align('pematesiweni', 'pematesewen')
+ expected = [[('p', 'p'), ('e', 'e'), ('m', 'm'), ('a', 'a'), ('t', 't'), ('e', 'e'),
+ ('s', 's'), ('i', 'e'), ('w', 'w'), ('e', 'e'), ('n', 'n'), ('i', '-')]]
+
+ self.assertEqual(result, expected)
+
+ result = aline.align('tuwθ', 'dentis')
+ expected = [[('t', 'd'), ('u', 'e'), ('w', '-'), ('-', 'n'), ('-', 't'), ('-', 'i'), ('θ', 's')]]
+
+ self.assertEqual(result, expected)
+
+ def test_aline_delta(self):
+ """
+ Test aline for computing the difference between two segments
+ """
+ result = aline.delta('p', 'q')
+ expected = 20.0
+
+ self.assertEqual(result, expected)
+
+ result = aline.delta('a', 'A')
+ expected = 0.0
+
+ self.assertEqual(result, expected)
+
diff --git a/nltk/test/unit/test_chunk.py b/nltk/test/unit/test_chunk.py
new file mode 100644
index 0000000..892ce0c
--- /dev/null
+++ b/nltk/test/unit/test_chunk.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
+import unittest
+
+from nltk import RegexpParser
+
+class TestChunkRule(unittest.TestCase):
+
+ def test_tag_pattern2re_pattern_quantifier(self):
+ """Test for bug https://github.com/nltk/nltk/issues/1597
+
+ Ensures that curly bracket quantifiers can be used inside a chunk rule.
+ This type of quantifier has been used for the supplementary example
+ in http://www.nltk.org/book/ch07.html#exploring-text-corpora.
+ """
+ sent = [('The', 'AT'), ('September-October', 'NP'), ('term', 'NN'), ('jury', 'NN'), ('had', 'HVD'), ('been', 'BEN'), ('charged', 'VBN'), ('by', 'IN'), ('Fulton', 'NP-TL'), ('Superior', 'JJ-TL'), ('Court', 'NN-TL'), ('Judge', 'NN-TL'), ('Durwood', 'NP'), ('Pye', 'NP'), ('to', 'TO'), ('investigate', 'VB'), ('reports', 'NNS'), ('of', 'IN'), ('possible', 'JJ'), ('``', '``'), ('irregularities', 'NNS'), ("''", "''"), ('in', 'IN'), ('the', 'AT'), ('hard-fought', 'JJ'), ('primary', 'NN') [...]
+ cp = RegexpParser('CHUNK: {<N.*>{4,}}')
+ tree = cp.parse(sent)
+ assert tree.pformat() == """(S
+ The/AT
+ September-October/NP
+ term/NN
+ jury/NN
+ had/HVD
+ been/BEN
+ charged/VBN
+ by/IN
+ Fulton/NP-TL
+ Superior/JJ-TL
+ (CHUNK Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
+ to/TO
+ investigate/VB
+ reports/NNS
+ of/IN
+ possible/JJ
+ ``/``
+ irregularities/NNS
+ ''/''
+ in/IN
+ the/AT
+ hard-fought/JJ
+ primary/NN
+ which/WDT
+ was/BEDZ
+ won/VBN
+ by/IN
+ (CHUNK Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
+ ./.)"""
+
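
The new unit test above exercises the issue-1597 fix that lets curly-bracket quantifiers appear inside chunk rules. A smaller hedged example of the same pattern (the printed tree is what the fixed parser should produce):

    from nltk import RegexpParser

    cp = RegexpParser('NOUNS: {<N.*>{2,}}')   # two or more consecutive noun-like tags
    tree = cp.parse([('Court', 'NN-TL'), ('Judge', 'NN-TL'), ('Pye', 'NP'), ('said', 'VBD')])
    print(tree)
    # (S (NOUNS Court/NN-TL Judge/NN-TL Pye/NP) said/VBD)
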
diff --git a/nltk/test/unit/test_corpora.py b/nltk/test/unit/test_corpora.py
index 1f0314b..aa28188 100644
--- a/nltk/test/unit/test_corpora.py
+++ b/nltk/test/unit/test_corpora.py
@@ -1,8 +1,11 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import unittest
+
from nltk.corpus import (sinica_treebank, conll2007, indian, cess_cat, cess_esp,
- floresta, ptb, udhr)
+ floresta, ptb, udhr) # mwa_ppdb
+
+from nltk.compat import python_2_unicode_compatible
from nltk.tree import Tree
from nltk.test.unit.utils import skipIf
@@ -179,5 +182,19 @@ class TestPTB(unittest.TestCase):
['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back']
)
+@unittest.skip("Skipping test for mwa_ppdb.")
+class TestMWAPPDB(unittest.TestCase):
+ def test_fileids(self):
+ self.assertEqual(mwa_ppdb.fileids(),
+ ['ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs'])
+
+ def test_entries(self):
+ self.assertEqual(mwa_ppdb.entries()[:10],
+ [('10/17/01', '17/10/2001'), ('102,70', '102.70'),
+ ('13,53', '13.53'), ('3.2.5.3.2.1', '3.2.5.3.2.1.'),
+ ('53,76', '53.76'), ('6.9.5', '6.9.5.'),
+ ('7.7.6.3', '7.7.6.3.'), ('76,20', '76.20'),
+ ('79,85', '79.85'), ('93,65', '93.65')] )
+
# unload corpora
from nltk.corpus import teardown_module
diff --git a/nltk/test/unit/test_json2csv_corpus.py b/nltk/test/unit/test_json2csv_corpus.py
index aa13f80..2a6d947 100644
--- a/nltk/test/unit/test_json2csv_corpus.py
+++ b/nltk/test/unit/test_json2csv_corpus.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter client
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Lorenzo Rubio <lrnzcig at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -16,9 +16,10 @@ import os
from nltk.compat import TemporaryDirectory
import unittest
+from six.moves import zip
+
from nltk.corpus import twitter_samples
from nltk.twitter.common import json2csv, json2csv_entities
-from nltk.compat import izip
def are_files_identical(filename1, filename2, debug=False):
@@ -28,7 +29,7 @@ def are_files_identical(filename1, filename2, debug=False):
with open(filename1, "rb") as fileA:
with open(filename2, "rb") as fileB:
result = True
- for lineA, lineB in izip(sorted(fileA.readlines()),
+ for lineA, lineB in zip(sorted(fileA.readlines()),
sorted(fileB.readlines())):
if lineA.strip() != lineB.strip():
if debug:
diff --git a/nltk/test/unit/test_senna.py b/nltk/test/unit/test_senna.py
new file mode 100644
index 0000000..bcd61a8
--- /dev/null
+++ b/nltk/test/unit/test_senna.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+"""
+Unit tests for Senna
+"""
+
+from __future__ import unicode_literals
+from os import environ, path, sep
+
+import logging
+import unittest
+
+from nltk.classify import Senna
+from nltk.tag import SennaTagger, SennaChunkTagger, SennaNERTagger
+
+# Set Senna executable path for tests if it is not specified as an environment variable
+if 'SENNA' in environ:
+ SENNA_EXECUTABLE_PATH = path.normpath(environ['SENNA']) + sep
+else:
+ SENNA_EXECUTABLE_PATH = '/usr/share/senna-v3.0'
+
+senna_is_installed = path.exists(SENNA_EXECUTABLE_PATH)
+
+@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
+class TestSennaPipeline(unittest.TestCase):
+ """Unittest for nltk.classify.senna"""
+
+ def test_senna_pipeline(self):
+ """Senna pipeline interface"""
+
+ pipeline = Senna(SENNA_EXECUTABLE_PATH, ['pos', 'chk', 'ner'])
+ sent = 'Dusseldorf is an international business center'.split()
+ result = [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)]
+ expected = [('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP',
+ 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'), ('international', 'I-NP',
+ 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP',
+ 'O', 'NN')]
+ self.assertEqual(result, expected)
+
+@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
+class TestSennaTagger(unittest.TestCase):
+ """Unittest for nltk.tag.senna"""
+
+ def test_senna_tagger(self):
+ tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
+ result = tagger.tag('What is the airspeed of an unladen swallow ?'.split())
+ expected = [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed',
+ 'NN'),('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow',
+ 'NN'), ('?', '.')]
+ self.assertEqual(result, expected)
+
+ def test_senna_chunk_tagger(self):
+ chktagger = SennaChunkTagger(SENNA_EXECUTABLE_PATH)
+ result_1 = chktagger.tag('What is the airspeed of an unladen swallow ?'.split())
+ expected_1 = [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed',
+ 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow',
+ 'I-NP'), ('?', 'O')]
+
+ result_2 = list(chktagger.bio_to_chunks(result_1, chunk_type='NP'))
+ expected_2 = [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow',
+ '5-6-7')]
+ self.assertEqual(result_1, expected_1)
+ self.assertEqual(result_2, expected_2)
+
+ def test_senna_ner_tagger(self):
+ nertagger = SennaNERTagger(SENNA_EXECUTABLE_PATH)
+ result_1 = nertagger.tag('Shakespeare theatre was in London .'.split())
+ expected_1 = [('Shakespeare', 'B-PER'), ('theatre', 'O'), ('was', 'O'),
+ ('in', 'O'), ('London', 'B-LOC'), ('.', 'O')]
+
+ result_2 = nertagger.tag('UN headquarters are in NY , USA .'.split())
+ expected_2 = [('UN', 'B-ORG'), ('headquarters', 'O'), ('are', 'O'),
+ ('in', 'O'), ('NY', 'B-LOC'), (',', 'O'), ('USA', 'B-LOC'), ('.', 'O')]
+ self.assertEqual(result_1, expected_1)
+ self.assertEqual(result_2, expected_2)
diff --git a/nltk/test/unit/test_stem.py b/nltk/test/unit/test_stem.py
index 13ca2fa..6287f42 100644
--- a/nltk/test/unit/test_stem.py
+++ b/nltk/test/unit/test_stem.py
@@ -1,7 +1,11 @@
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
import unittest
+from contextlib import closing
+from nltk import data
from nltk.stem.snowball import SnowballStemmer
+from nltk.stem.porter import PorterStemmer
+import os
class SnowballTest(unittest.TestCase):
@@ -34,3 +38,70 @@ class SnowballTest(unittest.TestCase):
def test_short_strings_bug(self):
stemmer = SnowballStemmer('english')
assert stemmer.stem("y's") == 'y'
+
+class PorterTest(unittest.TestCase):
+
+ def _vocabulary(self):
+ with closing(data.find('stemmers/porter_test/porter_vocabulary.txt').open(encoding='utf-8')) as fp:
+ return fp.read().splitlines()
+
+ def _test_against_expected_output(self, stemmer_mode, expected_stems):
+ stemmer = PorterStemmer(mode=stemmer_mode)
+ for word, true_stem in zip(self._vocabulary(), expected_stems):
+ our_stem = stemmer.stem(word)
+ assert our_stem == true_stem, (
+ "%s should stem to %s in %s mode but got %s" % (
+ word, true_stem, stemmer_mode, our_stem
+ )
+ )
+
+ def test_vocabulary_martin_mode(self):
+ """Tests all words from the test vocabulary provided by M Porter
+
+ The sample vocabulary and output were sourced from:
+ http://tartarus.org/martin/PorterStemmer/voc.txt
+ http://tartarus.org/martin/PorterStemmer/output.txt
+ and are linked to from the Porter Stemmer algorithm's homepage
+ at
+ http://tartarus.org/martin/PorterStemmer/
+ """
+ with closing(data.find('stemmers/porter_test/porter_martin_output.txt').open(encoding='utf-8')) as fp:
+ self._test_against_expected_output(
+ PorterStemmer.MARTIN_EXTENSIONS,
+ fp.read().splitlines()
+ )
+
+ def test_vocabulary_nltk_mode(self):
+ with closing(data.find('stemmers/porter_test/porter_nltk_output.txt').open(encoding='utf-8')) as fp:
+ self._test_against_expected_output(
+ PorterStemmer.NLTK_EXTENSIONS,
+ fp.read().splitlines()
+ )
+
+ def test_vocabulary_original_mode(self):
+ # The list of stems for this test was generated by taking the
+ # Martin-blessed stemmer from
+ # http://tartarus.org/martin/PorterStemmer/c.txt
+ # and removing all the --DEPARTURE-- sections from it and
+ # running it against Martin's test vocabulary.
+
+ with closing(data.find('stemmers/porter_test/porter_original_output.txt').open(encoding='utf-8')) as fp:
+ self._test_against_expected_output(
+ PorterStemmer.ORIGINAL_ALGORITHM,
+ fp.read().splitlines()
+ )
+
+ self._test_against_expected_output(
+ PorterStemmer.ORIGINAL_ALGORITHM,
+ data.find('stemmers/porter_test/porter_original_output.txt')
+ .open(encoding='utf-8')
+ .read()
+ .splitlines()
+ )
+
+ def test_oed_bug(self):
+ """Test for bug https://github.com/nltk/nltk/issues/1581
+
+ Ensures that 'oed' can be stemmed without throwing an error.
+ """
+ assert PorterStemmer().stem('oed') == 'o'
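
The PorterTest cases above compare each stemmer mode against a reference vocabulary and its blessed output. A small sketch of how the three modes are selected (the parameterless PorterStemmer() used in test_oed_bug falls back to the class default, which appears to be NLTK_EXTENSIONS):

    from nltk.stem.porter import PorterStemmer

    for mode in (PorterStemmer.ORIGINAL_ALGORITHM,
                 PorterStemmer.MARTIN_EXTENSIONS,
                 PorterStemmer.NLTK_EXTENSIONS):
        stemmer = PorterStemmer(mode=mode)
        print(mode, stemmer.stem('multiplying'))
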
diff --git a/nltk/test/unit/test_tgrep.py b/nltk/test/unit/test_tgrep.py
index 9add766..57a0d7f 100644
--- a/nltk/test/unit/test_tgrep.py
+++ b/nltk/test/unit/test_tgrep.py
@@ -3,7 +3,7 @@
#
# Natural Language Toolkit: TGrep search
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Will Roberts <wildwilhelm at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -13,7 +13,9 @@ Unit tests for nltk.tgrep.
'''
from __future__ import absolute_import, print_function, unicode_literals
-from nltk.compat import b
+
+from six import b
+
from nltk.tree import ParentedTree
from nltk import tgrep
import unittest
diff --git a/nltk/test/unit/test_tokenize.py b/nltk/test/unit/test_tokenize.py
index af55fc3..45fba66 100644
--- a/nltk/test/unit/test_tokenize.py
+++ b/nltk/test/unit/test_tokenize.py
@@ -5,8 +5,11 @@ See also nltk/test/tokenize.doctest
"""
from __future__ import unicode_literals
-from nltk.tokenize import TweetTokenizer
+from nltk.tokenize import TweetTokenizer, StanfordSegmenter
+from nose import SkipTest
import unittest
+import os
+
class TestTokenize(unittest.TestCase):
@@ -21,3 +24,86 @@ class TestTokenize(unittest.TestCase):
expected = [':', "Let's", 'test', 'these', 'words', ':', 'resumé',
'España', 'München', 'français']
self.assertEqual(tokens, expected)
+
+ def test_stanford_segmenter_arabic(self):
+ """
+ Test the Stanford Word Segmenter for Arabic (default config)
+ """
+ try:
+ seg = StanfordSegmenter()
+ seg.default_config('ar')
+ sent = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
+ segmented_sent = seg.segment(sent.split())
+ assert segmented_sent.split() == ['يبحث', 'علم', 'الحاسوب', 'استخدام',
+ 'الحوسبة', 'ب', 'جميع', 'اشكال',
+ 'ها', 'ل', 'حل', 'المشكلات']
+ except LookupError as e:
+ raise SkipTest(str(e))
+
+ def test_stanford_segmenter_chinese(self):
+ """
+ Test the Stanford Word Segmenter for Chinese (default config)
+ """
+ try:
+ seg = StanfordSegmenter()
+ seg.default_config('zh')
+ sent = u"这是斯坦福中文分词器测试"
+ segmented_sent = seg.segment(sent.split())
+ assert segmented_sent.split() == ['这', '是', '斯坦福',
+ '中文', '分词器', '测试']
+ except LookupError as e:
+ raise SkipTest(str(e))
+
+ def test_remove_handle(self):
+ """
+ Test remove_handle() from casual.py with specially crafted edge cases
+ """
+
+ tokenizer = TweetTokenizer(strip_handles=True)
+
+ # Simple example. Handles with just numbers should be allowed
+ test1 = "@twitter hello @twi_tter_. hi @12345 @123news"
+ expected = ['hello', '.', 'hi']
+ result = tokenizer.tokenize(test1)
+ self.assertEqual(result, expected)
+
+ # Handles are allowed to follow any of the following characters
+ test2 = "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n ñ@n.ü@n.ç@n."
+ expected = ['`', '~', '(', ')', '-', '=', '+', '\\', '|', '[', ']', '{', '}', ';', ':', "'", '"', '/', '?', '.', ',', '<', '>', 'ñ', '.', 'ü', '.', 'ç', '.']
+ result = tokenizer.tokenize(test2)
+ self.assertEqual(result, expected)
+
+
+ # Handles are NOT allowed to follow any of the following characters
+ test3 = "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n"
+ expected = ['a', '@n', 'j', '@n', 'z', '@n', 'A', '@n', 'L', '@n', 'Z', '@n', '1', '@n', '4', '@n', '7', '@n', '9', '@n', '0', '@n', '_', '@n', '!', '@n', '@', '@n', '#', '@n', '$', '@n', '%', '@n', '&', '@n', '*', '@n']
+ result = tokenizer.tokenize(test3)
+ self.assertEqual(result, expected)
+
+
+ # Handles are allowed to precede the following characters
+ test4 = "@n!a @n#a @n$a @n%a @n&a @n*a"
+ expected = ['!', 'a', '#', 'a', '$', 'a', '%', 'a', '&', 'a', '*', 'a']
+ result = tokenizer.tokenize(test4)
+ self.assertEqual(result, expected)
+
+
+ # Tests interactions with special symbols and multiple @
+ test5 = "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n"
+ expected = ['!', '@n', '#', '@n', '$', '@n', '%', '@n', '&', '@n', '*', '@n', '@n', '@n', '@', '@n', '@n', '@', '@n', '@n_', '@n', '@n7', '@n', '@nj', '@n']
+ result = tokenizer.tokenize(test5)
+ self.assertEqual(result, expected)
+
+
+ # Tests that handles can have a max length of 20
+ test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmnopqrst1234 @abcdefghijklmnopqrst_ @abcdefghijklmnopqrstendofhandle"
+ expected = ['uvwxyz', '1234', '_', 'endofhandle']
+ result = tokenizer.tokenize(test6)
+ self.assertEqual(result, expected)
+
+
+ # Edge case where an @ comes directly after a long handle
+ test7 = "@abcdefghijklmnopqrstu@abcde @abcdefghijklmnopqrst@abcde @abcdefghijklmnopqrst_@abcde @abcdefghijklmnopqrst5@abcde"
+ expected = ['u', '@abcde', '@abcdefghijklmnopqrst', '@abcde', '_', '@abcde', '5', '@abcde']
+ result = tokenizer.tokenize(test7)
+ self.assertEqual(result, expected)
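
A minimal usage sketch (not part of the patch) of the StanfordSegmenter default_config() path that the new tests exercise; it assumes the Stanford Word Segmenter jars and model files are installed and discoverable (otherwise the call raises LookupError, which is exactly what the tests skip on).

    from nltk.tokenize import StanfordSegmenter

    seg = StanfordSegmenter()
    seg.default_config('zh')  # or 'ar' for the Arabic models
    # segment() accepts a list of tokens and returns a whitespace-joined string.
    print(seg.segment(u"这是斯坦福中文分词器测试".split()))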
diff --git a/nltk/test/unit/translate/test_bleu.py b/nltk/test/unit/translate/test_bleu.py
index 1af711f..3411dd2 100644
--- a/nltk/test/unit/translate/test_bleu.py
+++ b/nltk/test/unit/translate/test_bleu.py
@@ -3,15 +3,19 @@
Tests for BLEU translation evaluation metric
"""
+import functools
+import io
import unittest
-from nltk.translate.bleu_score import modified_precision, brevity_penalty
-from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
+
+from nltk.data import find
+from nltk.translate.bleu_score import modified_precision, brevity_penalty, closest_ref_length
+from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
class TestBLEU(unittest.TestCase):
def test_modified_precision(self):
"""
- Examples from the original BLEU paper
+ Examples from the original BLEU paper
http://www.aclweb.org/anthology/P02-1040.pdf
"""
# Example 1: the "the*" example.
@@ -20,19 +24,19 @@ class TestBLEU(unittest.TestCase):
ref2 = 'there is a cat on the mat'.split()
# Hypothesis sentence(s).
hyp1 = 'the the the the the the the'.split()
-
- references = [ref1, ref2]
-
+
+ references = [ref1, ref2]
+
# Testing modified unigram precision.
hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
assert (round(hyp1_unigram_precision, 4) == 0.2857)
# With assertAlmostEqual at 4 place precision.
self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4)
-
+
# Testing modified bigram precision.
assert(float(modified_precision(references, hyp1, n=2)) == 0.0)
-
-
+
+
# Example 2: the "of the" example.
# Reference sentences
ref1 = str('It is a guide to action that ensures that the military '
@@ -43,23 +47,23 @@ class TestBLEU(unittest.TestCase):
'the directions of the party').split()
# Hypothesis sentence(s).
hyp1 = 'of the'.split()
-
- references = [ref1, ref2, ref3]
+
+ references = [ref1, ref2, ref3]
# Testing modified unigram precision.
assert (float(modified_precision(references, hyp1, n=1)) == 1.0)
-
+
# Testing modified bigram precision.
assert(float(modified_precision(references, hyp1, n=2)) == 1.0)
-
+
# Example 3: Proper MT outputs.
hyp1 = str('It is a guide to action which ensures that the military '
'always obeys the commands of the party').split()
hyp2 = str('It is to insure the troops forever hearing the activity '
'guidebook that party direct').split()
-
+
references = [ref1, ref2, ref3]
-
+
# Unigram precision.
hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1))
@@ -69,7 +73,7 @@ class TestBLEU(unittest.TestCase):
# Test unigram precision with rounding.
assert (round(hyp1_unigram_precision, 4) == 0.9444)
assert (round(hyp2_unigram_precision, 4) == 0.5714)
-
+
# Bigram precision
hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2))
hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2))
@@ -79,60 +83,158 @@ class TestBLEU(unittest.TestCase):
# Test bigram precision with rounding.
assert (round(hyp1_bigram_precision, 4) == 0.5882)
assert (round(hyp2_bigram_precision, 4) == 0.0769)
-
+
+ def test_brevity_penalty(self):
+ # Test case from brevity_penalty_closest function in mteval-v13a.pl.
+ # Same test cases as in the doctest in nltk.translate.bleu_score.py
+ references = [['a'] * 11, ['a'] * 8]
+ hypothesis = ['a'] * 7
+ hyp_len = len(hypothesis)
+ closest_ref_len = closest_ref_length(references, hyp_len)
+ self.assertAlmostEqual(brevity_penalty(closest_ref_len, hyp_len), 0.8669, places=4)
+
+ references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
+ hypothesis = ['a'] * 7
+ hyp_len = len(hypothesis)
+ closest_ref_len = closest_ref_length(references, hyp_len)
+ assert brevity_penalty(closest_ref_len, hyp_len) == 1.0
+
def test_zero_matches(self):
# Test case where there's 0 matches
references = ['The candidate has no alignment to any of the references'.split()]
hypothesis = 'John loves Mary'.split()
-
- # Test BLEU to nth order of n-grams, where n is len(hypothesis).
+
+ # Test BLEU to nth order of n-grams, where n is len(hypothesis).
for n in range(1,len(hypothesis)):
weights = [1.0/n] * n # Uniform weights.
assert(sentence_bleu(references, hypothesis, weights) == 0)
-
- def test_full_matches(self):
+
+ def test_full_matches(self):
# Test case where there's 100% matches
references = ['John loves Mary'.split()]
hypothesis = 'John loves Mary'.split()
-
- # Test BLEU to nth order of n-grams, where n is len(hypothesis).
+
+ # Test BLEU to nth order of n-grams, where n is len(hypothesis).
for n in range(1,len(hypothesis)):
weights = [1.0/n] * n # Uniform weights.
assert(sentence_bleu(references, hypothesis, weights) == 1.0)
-
+
def test_partial_matches_hypothesis_longer_than_reference(self):
references = ['John loves Mary'.split()]
hypothesis = 'John loves Mary who loves Mike'.split()
self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.4729, places=4)
-
+ # Checks that the warning has been raised because len(reference) < 4.
+ try:
+ self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
+ except AttributeError:
+ pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
-@unittest.skip("Skipping fringe cases for BLEU.")
+#@unittest.skip("Skipping fringe cases for BLEU.")
class TestBLEUFringeCases(unittest.TestCase):
def test_case_where_n_is_bigger_than_hypothesis_length(self):
# Test BLEU to nth order of n-grams, where n > len(hypothesis).
- # TODO: Currently this test breaks the BLEU implementation (13.03.2016)
- references = ['John loves Mary'.split()]
+ references = ['John loves Mary ?'.split()]
hypothesis = 'John loves Mary'.split()
- n = len(hypothesis) + 1 #
+ n = len(hypothesis) + 1 #
weights = [1.0/n] * n # Uniform weights.
+ self.assertAlmostEqual(sentence_bleu(references, hypothesis, weights), 0.7165, places=4)
+ # Checks that the warning has been raised because len(hypothesis) < 4.
+ try:
+ self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
+ except AttributeError:
+ pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
+
+ # Test case where n > len(hypothesis) but so is n > len(reference), and
+ # it's a special case where reference == hypothesis.
+ references = ['John loves Mary'.split()]
+ hypothesis = 'John loves Mary'.split()
assert(sentence_bleu(references, hypothesis, weights) == 1.0)
-
+
def test_empty_hypothesis(self):
# Test case where there's hypothesis is empty.
- # TODO: Currently this test breaks the BLEU implementation (13.03.2016)
references = ['The candidate has no alignment to any of the references'.split()]
hypothesis = []
assert(sentence_bleu(references, hypothesis) == 0)
-
+
def test_empty_references(self):
# Test case where there's reference is empty.
- # TODO: Currently this test breaks the BLEU implementation (13.03.2016)
references = [[]]
hypothesis = 'John loves Mary'.split()
assert(sentence_bleu(references, hypothesis) == 0)
-
-
- def test_brevity_penalty(self):
- pass
-
+
+ def test_empty_references_and_hypothesis(self):
+ # Test case where both the references and the hypothesis are empty.
+ references = [[]]
+ hypothesis = []
+ assert(sentence_bleu(references, hypothesis) == 0)
+
+ def test_reference_or_hypothesis_shorter_than_fourgrams(self):
+ # Test case where the length of the reference or hypothesis
+ # is shorter than 4.
+ references = ['let it go'.split()]
+ hypothesis = 'let go it'.split()
+ # Checks that sentence_bleu returns 1.0 for this hypothesis and reference.
+ assert(sentence_bleu(references, hypothesis) == 1.0)
+ # Checks that the warning has been raised.
+ try:
+ self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
+ except AttributeError:
+ pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
+
+class TestBLEUvsMteval13a(unittest.TestCase):
+
+ def test_corpus_bleu(self):
+ ref_file = find('models/wmt15_eval/ref.ru')
+ hyp_file = find('models/wmt15_eval/google.ru')
+ mteval_output_file = find('models/wmt15_eval/mteval-13a.output')
+
+ # Reads the BLEU scores from the `mteval-13a.output` file.
+ # The order of the list corresponds to the order of the ngrams.
+ with open(mteval_output_file, 'r') as mteval_fin:
+ # The numbers are located in the second-to-last line of the file.
+ # The first and second items in the list are the score and system names.
+ mteval_bleu_scores = map(float, mteval_fin.readlines()[-2].split()[1:-1])
+
+ with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
+ with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
+ # Whitespace tokenize the file.
+ # Note: split() automatically strips whitespace.
+ hypothesis = list(map(lambda x: x.split(), hyp_fin))
+ # Note that the corpus_bleu input is list of list of references.
+ references = list(map(lambda x: [x.split()], ref_fin))
+ # Without smoothing.
+ for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores):
+ nltk_bleu = corpus_bleu(references, hypothesis, weights=(1.0/i,)*i)
+ # Check that the difference in BLEU scores is less than 0.005.
+ # Note: This is an approximate comparison; as much as
+ # +/- 0.01 BLEU might be "statistically significant",
+ # the actual translation quality might not be.
+ assert abs(mteval_bleu - nltk_bleu) < 0.005
+
+ # With the same smoothing method used in mteval-v13a.pl
+ chencherry = SmoothingFunction()
+ for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores):
+ nltk_bleu = corpus_bleu(references, hypothesis,
+ weights=(1.0/i,)*i,
+ smoothing_function=chencherry.method3)
+ assert abs(mteval_bleu - nltk_bleu) < 0.005
+
+class TestEmulateMultiBLEU(unittest.TestCase):
+ def test_corpus_bleu_with_emulate_multibleu(self):
+ hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
+ ref = str("Their tasks include changing a pump on the faulty stokehold ."
+ "Likewise , two species that are very similar in morphology "
+ "were distinguished using genetics .")
+ references = [[ref.split()]]
+ hypothese = [hyp.split()]
+ try: # Check that the warning is raised since no. of 2-grams < 0.
+ with self.assertWarns(UserWarning):
+ # Verify that the BLEU output is undesired since no. of 2-grams < 0.
+ self.assertAlmostEqual(corpus_bleu(references, hypothese), 0.4309, places=4)
+ except AttributeError:
+ pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
+ desired_output = corpus_bleu(references, hypothese,
+ emulate_multibleu=True)
+ #assert
+ assert desired_output == 0.0
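
A minimal sketch (not part of the patch) of the SmoothingFunction usage that the new TestBLEUvsMteval13a test relies on; the reference and hypothesis sentences below are invented examples, not taken from the wmt15_eval data.

    from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

    # corpus_bleu() expects a list of reference lists aligned with the list of hypotheses.
    references = [['there is a cat on the mat'.split(),
                   'the cat is on the mat'.split()]]
    hypotheses = ['the cat sat on the mat'.split()]

    chencherry = SmoothingFunction()
    score = corpus_bleu(references, hypotheses,
                        weights=(0.25, 0.25, 0.25, 0.25),
                        smoothing_function=chencherry.method3)
    print(round(score, 4))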
diff --git a/nltk/test/unit/translate/test_stack_decoder.py b/nltk/test/unit/translate/test_stack_decoder.py
index 5055d31..d4936a4 100644
--- a/nltk/test/unit/translate/test_stack_decoder.py
+++ b/nltk/test/unit/translate/test_stack_decoder.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Stack decoder
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tah Wei Hoon <hoon.tw at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/test/util.doctest b/nltk/test/util.doctest
index 472624c..d8da77f 100644
--- a/nltk/test/util.doctest
+++ b/nltk/test/util.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
=================
diff --git a/nltk/test/wordnet.doctest b/nltk/test/wordnet.doctest
index 178e968..939a95e 100644
--- a/nltk/test/wordnet.doctest
+++ b/nltk/test/wordnet.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
=================
@@ -113,6 +113,12 @@ Note that some relations are defined by WordNet only over Lemmas:
The relations that are currently defined in this way are `antonyms`,
`derivationally_related_forms` and `pertainyms`.
+If you know the byte offset used to identify a synset in the original
+Princeton WordNet data file, you can use that to instantiate the synset
+in NLTK:
+
+ >>> wn.synset_from_pos_and_offset('n', 4543158)
+ Synset('wagon.n.01')
------
Lemmas
@@ -579,3 +585,10 @@ Issue 395: wordnet returns incorrect result for lowest_common_hypernyms of chef
>>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01'))
[Synset('person.n.01')]
+
+Bug https://github.com/nltk/nltk/issues/1641: Non-English lemmas containing capital letters cannot be looked up using wordnet.lemmas() or wordnet.synsets()
+
+ >>> wn.lemmas('Londres', lang='fra')
+ [Lemma('united_kingdom.n.01.Londres'), Lemma('london.n.01.Londres'), Lemma('london.n.02.Londres')]
+ >>> wn.lemmas('londres', lang='fra')
+ [Lemma('united_kingdom.n.01.Londres'), Lemma('london.n.01.Londres'), Lemma('london.n.02.Londres')]
diff --git a/nltk/test/wordnet_lch.doctest b/nltk/test/wordnet_lch.doctest
index 2f65833..37200ca 100644
--- a/nltk/test/wordnet_lch.doctest
+++ b/nltk/test/wordnet_lch.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
===============================
diff --git a/nltk/test/wsd.doctest b/nltk/test/wsd.doctest
index 7fea034..0d1e01c 100644
--- a/nltk/test/wsd.doctest
+++ b/nltk/test/wsd.doctest
@@ -1,4 +1,4 @@
-.. Copyright (C) 2001-2016 NLTK Project
+.. Copyright (C) 2001-2017 NLTK Project
.. For license information, see LICENSE.TXT
.. -*- coding: utf-8 -*-
diff --git a/nltk/text.py b/nltk/text.py
index 2340269..a8c357c 100644
--- a/nltk/text.py
+++ b/nltk/text.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Texts
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# URL: <http://nltk.org/>
@@ -16,17 +16,19 @@ distributional similarity.
from __future__ import print_function, division, unicode_literals
from math import log
-from collections import defaultdict
+from collections import defaultdict, Counter
from functools import reduce
from itertools import islice
import re
+from six import text_type
+
from nltk.probability import FreqDist, LidstoneProbDist
from nltk.probability import ConditionalFreqDist as CFD
from nltk.util import tokenwrap, LazyConcatenation
from nltk.metrics import f_measure, BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
-from nltk.compat import python_2_unicode_compatible, text_type, Counter
+from nltk.compat import python_2_unicode_compatible
class ContextIndex(object):
@@ -444,6 +446,13 @@ class Text(object):
from nltk.draw import dispersion_plot
dispersion_plot(self, words)
+ def generate(self, words):
+ """
+ Issues a reminder to users following the book online
+ """
+ import warnings
+ warnings.warn('The generate() method is no longer available.', DeprecationWarning)
+
def plot(self, *args):
"""
See documentation for FreqDist.plot()
diff --git a/nltk/tgrep.py b/nltk/tgrep.py
index 11fa8cf..81c7e2b 100644
--- a/nltk/tgrep.py
+++ b/nltk/tgrep.py
@@ -3,7 +3,7 @@
#
# Natural Language Toolkit: TGrep search
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Will Roberts <wildwilhelm at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -111,15 +111,19 @@ macro definitions to ``m`` and initialises ``l`` to an empty dictionary.
'''
from __future__ import absolute_import, print_function, unicode_literals
-from nltk.compat import binary_type, text_type
+
import functools
-import nltk.tree
+import re
+
+from six import binary_type, text_type
+
try:
import pyparsing
except ImportError:
print('Warning: nltk.tgrep will not work without the `pyparsing` package')
print('installed.')
-import re
+
+import nltk.tree
class TgrepException(Exception):
'''Tgrep exception type.'''
@@ -932,6 +936,3 @@ def tgrep_nodes(pattern, trees, search_leaves=True):
if pattern(tree[position])]
except AttributeError:
yield []
-
-
-
diff --git a/nltk/tokenize/__init__.py b/nltk/tokenize/__init__.py
index 4d36b9e..b4b6dd7 100644
--- a/nltk/tokenize/__init__.py
+++ b/nltk/tokenize/__init__.py
@@ -1,9 +1,10 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Tokenizers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com> (minor additions)
+# Contributors: matthewmc, clouds56
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -59,23 +60,28 @@ tokenization, see the other methods provided in this package.
For further information, please see Chapter 3 of the NLTK book.
"""
+import re
+
from nltk.data import load
-from nltk.tokenize.simple import (SpaceTokenizer, TabTokenizer, LineTokenizer,
- line_tokenize)
+from nltk.tokenize.casual import (TweetTokenizer, casual_tokenize)
+from nltk.tokenize.mwe import MWETokenizer
+from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.regexp import (RegexpTokenizer, WhitespaceTokenizer,
BlanklineTokenizer, WordPunctTokenizer,
wordpunct_tokenize, regexp_tokenize,
blankline_tokenize)
-from nltk.tokenize.punkt import PunktSentenceTokenizer
+from nltk.tokenize.repp import ReppTokenizer
from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
-from nltk.tokenize.treebank import TreebankWordTokenizer
+from nltk.tokenize.simple import (SpaceTokenizer, TabTokenizer, LineTokenizer,
+ line_tokenize)
from nltk.tokenize.stanford import StanfordTokenizer
from nltk.tokenize.texttiling import TextTilingTokenizer
-from nltk.tokenize.casual import (TweetTokenizer, casual_tokenize)
-from nltk.tokenize.mwe import MWETokenizer
+from nltk.tokenize.toktok import ToktokTokenizer
+from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
+
# Standard sentence tokenizer.
def sent_tokenize(text, language='english'):
"""
@@ -91,18 +97,36 @@ def sent_tokenize(text, language='english'):
return tokenizer.tokenize(text)
# Standard word tokenizer.
-_treebank_word_tokenize = TreebankWordTokenizer().tokenize
-def word_tokenize(text, language='english'):
+_treebank_word_tokenizer = TreebankWordTokenizer()
+
+# See discussion on https://github.com/nltk/nltk/pull/1437
+# Additions to TreebankWordTokenizer: splits on
+# - chevron quotes u'\xab' and u'\xbb'
+# - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
+
+improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
+improved_close_quote_regex = re.compile(u'([»”’])', re.U)
+improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
+_treebank_word_tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
+_treebank_word_tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
+_treebank_word_tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))
+
+
+def word_tokenize(text, language='english', preserve_line=False):
"""
Return a tokenized copy of *text*,
using NLTK's recommended word tokenizer
- (currently :class:`.TreebankWordTokenizer`
+ (currently an improved :class:`.TreebankWordTokenizer`
along with :class:`.PunktSentenceTokenizer`
for the specified language).
- :param text: text to split into sentences
+ :param text: text to split into words
+ :type text: str
:param language: the model name in the Punkt corpus
+ :type language: str
+ :param preserve_line: An option to keep the text as a single sentence and not sentence tokenize it.
+ :type preserve_line: bool
"""
- return [token for sent in sent_tokenize(text, language)
- for token in _treebank_word_tokenize(sent)]
-
+ sentences = [text] if preserve_line else sent_tokenize(text, language)
+ return [token for sent in sentences
+ for token in _treebank_word_tokenizer.tokenize(sent)]
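
A short sketch (not part of the patch) of the new preserve_line flag to word_tokenize(); the example string is invented.

    from nltk import word_tokenize

    text = "Good muffins cost $3.88 in New York. Please buy me two of them."
    # Default behaviour: sentence tokenize first, then word tokenize each sentence.
    tokens = word_tokenize(text)
    # preserve_line=True skips sentence splitting and tokenizes the text as one line.
    tokens_one_line = word_tokenize(text, preserve_line=True)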
diff --git a/nltk/tokenize/api.py b/nltk/tokenize/api.py
index 8a121e8..f38ce86 100644
--- a/nltk/tokenize/api.py
+++ b/nltk/tokenize/api.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Tokenizer Interface
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
@@ -11,11 +11,12 @@ Tokenizer Interface
"""
from abc import ABCMeta, abstractmethod
-from nltk.six import add_metaclass
+from six import add_metaclass
from nltk.internals import overridden
from nltk.tokenize.util import string_span_tokenize
+
@add_metaclass(ABCMeta)
class TokenizerI(object):
"""
@@ -74,5 +75,3 @@ class StringTokenizer(TokenizerI):
def span_tokenize(self, s):
for span in string_span_tokenize(s, self._string):
yield span
-
-
diff --git a/nltk/tokenize/casual.py b/nltk/tokenize/casual.py
index 8d73fcf..4a44233 100644
--- a/nltk/tokenize/casual.py
+++ b/nltk/tokenize/casual.py
@@ -2,7 +2,7 @@
#
# Natural Language Toolkit: Twitter Tokenizer
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Christopher Potts <cgpotts at stanford.edu>
# Ewan Klein <ewan at inf.ed.ac.uk> (modifications)
# Pierpaolo Pantone <> (modifications)
@@ -38,8 +38,9 @@ domains and tasks. The basic logic is this:
from __future__ import unicode_literals
import re
-from nltk.compat import htmlentitydefs, int2byte, unichr
+from six import int2byte, unichr
+from six.moves import html_entities
######################################################################
# The following strings are components in the regular expression
@@ -246,7 +247,7 @@ def _replace_html_entities(text, keep=(), remove_illegal=True, encoding='utf-8')
if entity_body in keep:
return match.group(0)
else:
- number = htmlentitydefs.name2codepoint.get(entity_body)
+ number = html_entities.name2codepoint.get(entity_body)
if number is not None:
try:
return unichr(number)
@@ -324,8 +325,9 @@ def remove_handles(text):
"""
Remove Twitter username handles from text.
"""
- pattern = re.compile(r"(^|(?<=[^\w.-]))@[A-Za-z_]+\w+")
- return pattern.sub('', text)
+ pattern = re.compile(r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)")
+ # Substitute handles with ' ' to ensure that text on either side of removed handles is tokenized correctly
+ return pattern.sub(' ', text)
######################################################################
# Tokenization Function
@@ -339,4 +341,3 @@ def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=Fa
strip_handles=strip_handles).tokenize(text)
###############################################################################
-
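
A small sketch (not part of the patch) of the tightened handle stripping above, via TweetTokenizer; the tweet text is an invented example.

    from nltk.tokenize import TweetTokenizer

    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    tweet = "@remy: This is waaaaayyyy too much for you @abcdefghijklmnopqrstuvwxyz!!!!!!"
    # Handles of up to 20 characters are removed and replaced with a space,
    # so text on either side of a removed handle still tokenizes cleanly.
    print(tokenizer.tokenize(tweet))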
diff --git a/nltk/tokenize/moses.py b/nltk/tokenize/moses.py
new file mode 100644
index 0000000..44fcace
--- /dev/null
+++ b/nltk/tokenize/moses.py
@@ -0,0 +1,613 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit:
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Pidong Wang, Josh Schroeder, Ondrej Bojar, based on code by Philipp Koehn
+# Contributors: Liling Tan, Martijn Pieters, Wiktor Stribizew
+#
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+from __future__ import print_function
+import re
+from six import text_type
+
+from nltk.tokenize.api import TokenizerI
+from nltk.tokenize.util import is_cjk
+from nltk.corpus import perluniprops, nonbreaking_prefixes
+
+
+class MosesTokenizer(TokenizerI):
+ """
+ This is a Python port of the Moses Tokenizer from
+ https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+ >>> tokenizer = MosesTokenizer()
+ >>> text = u'This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
+ >>> expected_tokenized = u'This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
+ >>> tokenized_text = tokenizer.tokenize(text, return_str=True)
+ >>> tokenized_text == expected_tokenized
+ True
+ >>> tokenizer.tokenize(text) == [u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
+ True
+
+ The nonbreaking prefixes should tokenize the final fullstop.
+
+ >>> m = MosesTokenizer()
+ >>> m.tokenize('abc def.')
+ [u'abc', u'def', u'.']
+ """
+
+ # Perl Unicode Properties character sets.
+ IsN = text_type(''.join(perluniprops.chars('IsN')))
+ IsAlnum = text_type(''.join(perluniprops.chars('IsAlnum')))
+ IsSc = text_type(''.join(perluniprops.chars('IsSc')))
+ IsSo = text_type(''.join(perluniprops.chars('IsSo')))
+ IsAlpha = text_type(''.join(perluniprops.chars('IsAlpha')))
+ IsLower = text_type(''.join(perluniprops.chars('IsLower')))
+
+ # Remove ASCII junk.
+ DEDUPLICATE_SPACE = r'\s+', r' '
+ ASCII_JUNK = r'[\000-\037]', r''
+
+ # Neurotic Perl heading space, multi-space and trailing space chomp.
+ # These regexes are kept for reference purposes and shouldn't be used!!
+ MID_STRIP = r" +", r" " # Use DEDUPLICATE_SPACE instead.
+ LEFT_STRIP = r"^ ", r"" # Uses text.lstrip() instead.
+ RIGHT_STRIP = r" $", r"" # Uses text.rstrip() instead.
+
+ # Pad all "other" special characters not in IsAlnum.
+ PAD_NOT_ISALNUM = u'([^{}\s\.\'\`\,\-])'.format(IsAlnum), r' \1 '
+
+ # Splits all hyphens (regardless of circumstances), e.g.
+ # 'foo -- bar' -> 'foo @-@ @-@ bar' , 'foo-bar' -> 'foo @-@ bar'
+ AGGRESSIVE_HYPHEN_SPLIT = u'([{alphanum}])\-(?=[{alphanum}])'.format(alphanum=IsAlnum), r'\1 \@-\@ '
+
+ # Make multi-dots stay together.
+ REPLACE_DOT_WITH_LITERALSTRING_1 = r'\.([\.]+)', ' DOTMULTI\1'
+ REPLACE_DOT_WITH_LITERALSTRING_2 = r'DOTMULTI\.([^\.])', 'DOTDOTMULTI \1'
+ REPLACE_DOT_WITH_LITERALSTRING_3 = r'DOTMULTI\.', 'DOTDOTMULTI'
+
+ # Separate out "," except if within numbers (5,300)
+ # e.g. A,B,C,D,E > A , B,C , D,E
+ # First application uses up B so rule can't see B,C
+ # two-step version here may create extra spaces but these are removed later
+ # will also space digit,letter or letter,digit forms (redundant with next section)
+ COMMA_SEPARATE_1 = u'([^{}])[,]'.format(IsN), r'\1 , '
+ COMMA_SEPARATE_2 = u'[,]([^{}])'.format(IsN), r' , \1'
+
+ # Attempt to get correct directional quotes.
+ DIRECTIONAL_QUOTE_1 = r'^``', r'`` '
+ DIRECTIONAL_QUOTE_2 = r'^"', r'`` '
+ DIRECTIONAL_QUOTE_3 = r'^`([^`])', r'` \1'
+ DIRECTIONAL_QUOTE_4 = r"^'", r'` '
+ DIRECTIONAL_QUOTE_5 = r'([ ([{<])"', r'\1 `` '
+ DIRECTIONAL_QUOTE_6 = r'([ ([{<])``', r'\1 `` '
+ DIRECTIONAL_QUOTE_7 = r'([ ([{<])`([^`])', r'\1 ` \2'
+ DIRECTIONAL_QUOTE_8 = r"([ ([{<])'", r'\1 ` '
+
+ # Replace ... with _ELLIPSIS_
+ REPLACE_ELLIPSIS = r'\.\.\.', r' _ELLIPSIS_ '
+ # Restore _ELLIPSIS_ with ...
+ RESTORE_ELLIPSIS = r'_ELLIPSIS_', r'\.\.\.'
+
+ # Pad , with tailing space except if within numbers, e.g. 5,300
+ # These are used in nltk.tokenize.moses.penn_tokenize()
+ COMMA_1 = u'([^{numbers}])[,]([^{numbers}])'.format(numbers=IsN), r'\1 , \2'
+ COMMA_2 = u'([{numbers}])[,]([^{numbers}])'.format(numbers=IsN), r'\1 , \2'
+ COMMA_3 = u'([^{numbers}])[,]([{numbers}])'.format(numbers=IsN), r'\1 , \2'
+
+ # Pad unicode symbols with spaces.
+ SYMBOLS = u'([;:@#\$%&{}{}])'.format(IsSc, IsSo), r' \1 '
+
+ # Separate out intra-token slashes. PTB tokenization doesn't do this, so
+ # the tokens should be merged prior to parsing with a PTB-trained parser.
+ # e.g. "and/or" -> "and @/@ or"
+ INTRATOKEN_SLASHES = u'([{alphanum}])\/([{alphanum}])'.format(alphanum=IsAlnum), r'$1 \@\/\@ $2'
+
+ # Splits final period at end of string.
+ FINAL_PERIOD = r"""([^.])([.])([\]\)}>"']*) ?$""", r'\1 \2\3'
+ # Pad all question marks and exclamation marks with spaces.
+ PAD_QUESTION_EXCLAMATION_MARK = r'([?!])', r' \1 '
+
+ # Handles parentheses, brackets and converts them to PTB symbols.
+ PAD_PARENTHESIS = r'([\]\[\(\){}<>])', r' \1 '
+ CONVERT_PARENTHESIS_1 = r'\(', '-LRB-'
+ CONVERT_PARENTHESIS_2 = r'\)', '-RRB-'
+ CONVERT_PARENTHESIS_3 = r'\[', '-LSB-'
+ CONVERT_PARENTHESIS_4 = r'\]', '-RSB-'
+ CONVERT_PARENTHESIS_5 = r'\{', '-LCB-'
+ CONVERT_PARENTHESIS_6 = r'\}', '-RCB-'
+
+ # Pads double dashes with spaces.
+ PAD_DOUBLE_DASHES = r'--', ' -- '
+
+ # Adds spaces to start and end of string to simplify further regexps.
+ PAD_START_OF_STR = r'^', ' '
+ PAD_END_OF_STR = r'$', ' '
+
+ # Converts double quotes to two single quotes and pad with spaces.
+ CONVERT_DOUBLE_TO_SINGLE_QUOTES = r'"', " '' "
+ # Handles single quote in possessives or close-single-quote.
+ HANDLES_SINGLE_QUOTES = r"([^'])' ", r"\1 ' "
+
+ # Pad apostrophe in possessive or close-single-quote.
+ APOSTROPHE = r"([^'])'", r"\1 ' "
+
+ # Prepend space on contraction apostrophe.
+ CONTRACTION_1 = r"'([sSmMdD]) ", r" '\1 "
+ CONTRACTION_2 = r"'ll ", r" 'll "
+ CONTRACTION_3 = r"'re ", r" 're "
+ CONTRACTION_4 = r"'ve ", r" 've "
+ CONTRACTION_5 = r"n't ", r" n't "
+ CONTRACTION_6 = r"'LL ", r" 'LL "
+ CONTRACTION_7 = r"'RE ", r" 'RE "
+ CONTRACTION_8 = r"'VE ", r" 'VE "
+ CONTRACTION_9 = r"N'T ", r" N'T "
+
+ # Informal Contractions.
+ CONTRACTION_10 = r" ([Cc])annot ", r" \1an not "
+ CONTRACTION_11 = r" ([Dd])'ye ", r" \1' ye "
+ CONTRACTION_12 = r" ([Gg])imme ", r" \1im me "
+ CONTRACTION_13 = r" ([Gg])onna ", r" \1on na "
+ CONTRACTION_14 = r" ([Gg])otta ", r" \1ot ta "
+ CONTRACTION_15 = r" ([Ll])emme ", r" \1em me "
+ CONTRACTION_16 = r" ([Mm])ore$text =~ s='n ", r" \1ore 'n "
+ CONTRACTION_17 = r" '([Tt])is ", r" '\1 is "
+ CONTRACTION_18 = r" '([Tt])was ", r" '\1 was "
+ CONTRACTION_19 = r" ([Ww])anna ", r" \1an na "
+
+ # Clean out extra spaces
+ CLEAN_EXTRA_SPACE_1 = r' *', r' '
+ CLEAN_EXTRA_SPACE_2 = r'^ *', r''
+ CLEAN_EXTRA_SPACE_3 = r' *$', r''
+
+ # Neurotic Perl regexes to escape special characters.
+ # These XML escaping regexes are kept such that tokens generated from
+ # NLTK's implementation are consistent with Moses' tokenizer's output.
+ # Outside of the MosesTokenizer function, it's strongly encouraged to use
+ # nltk.tokenize.util.xml_escape() function instead.
+ ESCAPE_AMPERSAND = r'&', r'&amp;'
+ ESCAPE_PIPE = r'\|', r'&#124;'
+ ESCAPE_LEFT_ANGLE_BRACKET = r'<', r'&lt;'
+ ESCAPE_RIGHT_ANGLE_BRACKET = r'>', r'&gt;'
+ ESCAPE_SINGLE_QUOTE = r"\'", r'&apos;'
+ ESCAPE_DOUBLE_QUOTE = r'\"', r'&quot;'
+ ESCAPE_LEFT_SQUARE_BRACKET = r"\[", r'&#91;'
+ ESCAPE_RIGHT_SQUARE_BRACKET = r"]", r'&#93;'
+
+ EN_SPECIFIC_1 = u"([^{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
+ EN_SPECIFIC_2 = u"([^{alpha}{isn}])[']([{alpha}])".format(alpha=IsAlpha, isn=IsN), r"\1 ' \2"
+ EN_SPECIFIC_3 = u"([{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
+ EN_SPECIFIC_4 = u"([{alpha}])[']([{alpha}])".format(alpha=IsAlpha), r"\1 '\2"
+ EN_SPECIFIC_5 = u"([{isn}])[']([s])".format(isn=IsN), r"\1 '\2"
+
+ ENGLISH_SPECIFIC_APOSTROPHE = [EN_SPECIFIC_1, EN_SPECIFIC_2, EN_SPECIFIC_3,
+ EN_SPECIFIC_4, EN_SPECIFIC_5]
+
+ FR_IT_SPECIFIC_1 = u"([^{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
+ FR_IT_SPECIFIC_2 = u"([^{alpha}])[']([{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
+ FR_IT_SPECIFIC_3 = u"([{alpha}])[']([^{alpha}])".format(alpha=IsAlpha), r"\1 ' \2"
+ FR_IT_SPECIFIC_4 = u"([{alpha}])[']([{alpha}])".format(alpha=IsAlpha), r"\1' \2"
+
+ FR_IT_SPECIFIC_APOSTROPHE = [FR_IT_SPECIFIC_1, FR_IT_SPECIFIC_2,
+ FR_IT_SPECIFIC_3, FR_IT_SPECIFIC_4]
+
+ NON_SPECIFIC_APOSTROPHE = r"\'", r" \' "
+
+ MOSES_PENN_REGEXES_1 = [DEDUPLICATE_SPACE, ASCII_JUNK, DIRECTIONAL_QUOTE_1,
+ DIRECTIONAL_QUOTE_2, DIRECTIONAL_QUOTE_3,
+ DIRECTIONAL_QUOTE_4, DIRECTIONAL_QUOTE_5,
+ DIRECTIONAL_QUOTE_6, DIRECTIONAL_QUOTE_7,
+ DIRECTIONAL_QUOTE_8, REPLACE_ELLIPSIS, COMMA_1,
+ COMMA_2, COMMA_3, SYMBOLS, INTRATOKEN_SLASHES,
+ FINAL_PERIOD, PAD_QUESTION_EXCLAMATION_MARK,
+ PAD_PARENTHESIS, CONVERT_PARENTHESIS_1,
+ CONVERT_PARENTHESIS_2, CONVERT_PARENTHESIS_3,
+ CONVERT_PARENTHESIS_4, CONVERT_PARENTHESIS_5,
+ CONVERT_PARENTHESIS_6, PAD_DOUBLE_DASHES,
+ PAD_START_OF_STR, PAD_END_OF_STR,
+ CONVERT_DOUBLE_TO_SINGLE_QUOTES,
+ HANDLES_SINGLE_QUOTES, APOSTROPHE, CONTRACTION_1,
+ CONTRACTION_2, CONTRACTION_3, CONTRACTION_4,
+ CONTRACTION_5, CONTRACTION_6, CONTRACTION_7,
+ CONTRACTION_8, CONTRACTION_9, CONTRACTION_10,
+ CONTRACTION_11, CONTRACTION_12, CONTRACTION_13,
+ CONTRACTION_14, CONTRACTION_15, CONTRACTION_16,
+ CONTRACTION_17, CONTRACTION_18, CONTRACTION_19]
+
+ MOSES_PENN_REGEXES_2 = [RESTORE_ELLIPSIS, CLEAN_EXTRA_SPACE_1,
+ CLEAN_EXTRA_SPACE_2, CLEAN_EXTRA_SPACE_3,
+ ESCAPE_AMPERSAND, ESCAPE_PIPE,
+ ESCAPE_LEFT_ANGLE_BRACKET, ESCAPE_RIGHT_ANGLE_BRACKET,
+ ESCAPE_SINGLE_QUOTE, ESCAPE_DOUBLE_QUOTE]
+
+ MOSES_ESCAPE_XML_REGEXES = [ESCAPE_AMPERSAND, ESCAPE_PIPE,
+ ESCAPE_LEFT_ANGLE_BRACKET,
+ ESCAPE_RIGHT_ANGLE_BRACKET,
+ ESCAPE_SINGLE_QUOTE, ESCAPE_DOUBLE_QUOTE,
+ ESCAPE_LEFT_SQUARE_BRACKET,
+ ESCAPE_RIGHT_SQUARE_BRACKET]
+
+ def __init__(self, lang='en'):
+ # Initialize the object.
+ super(MosesTokenizer, self).__init__()
+ self.lang = lang
+ # Initialize the language specific nonbreaking prefixes.
+ self.NONBREAKING_PREFIXES = nonbreaking_prefixes.words(lang)
+ self.NUMERIC_ONLY_PREFIXES = [w.rpartition(' ')[0] for w in
+ self.NONBREAKING_PREFIXES if
+ self.has_numeric_only(w)]
+
+
+
+ def replace_multidots(self, text):
+ text = re.sub(r'\.([\.]+)', r' DOTMULTI\1', text)
+ while re.search(r'DOTMULTI\.', text):
+ text = re.sub(r'DOTMULTI\.([^\.])', r'DOTDOTMULTI \1', text)
+ text = re.sub(r'DOTMULTI\.', 'DOTDOTMULTI', text)
+ return text
+
+ def restore_multidots(self, text):
+ while re.search(r'DOTDOTMULTI', text):
+ text = re.sub(r'DOTDOTMULTI', r'DOTMULTI.', text)
+ return re.sub(r'DOTMULTI', r'.', text)
+
+ def islower(self, text):
+ return not set(text).difference(set(self.IsLower))
+
+ def isalpha(self, text):
+ return not set(text).difference(set(self.IsAlpha))
+
+ def has_numeric_only(self, text):
+ return bool(re.search(r'(.*)[\s]+(\#NUMERIC_ONLY\#)', text))
+
+ def handles_nonbreaking_prefixes(self, text):
+ # Splits the text into tokens to check for nonbreaking prefixes.
+ tokens = text.split()
+ num_tokens = len(tokens)
+ for i, token in enumerate(tokens):
+ # Checks if token ends with a fullstop.
+ token_ends_with_period = re.search(r'^(\S+)\.$', token)
+ if token_ends_with_period:
+ prefix = token_ends_with_period.group(1)
+ # Checks for 3 conditions if
+ # i. the prefix contains a fullstop and
+ # any char in the prefix is within the IsAlpha charset
+ # ii. the prefix is in the list of nonbreaking prefixes and
+ # does not contain #NUMERIC_ONLY#
+ # iii. the token is not the last token and that the
+ # next token contains all lowercase.
+ if ( ('.' in prefix and self.isalpha(prefix)) or
+ (prefix in self.NONBREAKING_PREFIXES and
+ prefix not in self.NUMERIC_ONLY_PREFIXES) or
+ (i != num_tokens-1 and self.islower(tokens[i+1])) ):
+ pass # No change to the token.
+ # Checks if the prefix is in NUMERIC_ONLY_PREFIXES
+ # and ensures that the next word is a digit.
+ elif (prefix in self.NUMERIC_ONLY_PREFIXES and
+ re.search(r'^[0-9]+', tokens[i+1])):
+ pass # No change to the token.
+ else: # Otherwise, adds a space after the tokens before a dot.
+ tokens[i] = prefix + ' .'
+ return " ".join(tokens) # Stitch the tokens back.
+
+ def escape_xml(self, text):
+ for regexp, substitution in self.MOSES_ESCAPE_XML_REGEXES:
+ text = re.sub(regexp, substitution, text)
+ return text
+
+ def penn_tokenize(self, text, return_str=False):
+ """
+ This is a Python port of the Penn treebank tokenizer adapted by the Moses
+ machine translation community. It's a little different from the
+ version in nltk.tokenize.treebank.
+ """
+ # Converts input string into unicode.
+ text = text_type(text)
+ # Perform a chain of regex substitutions using MOSES_PENN_REGEXES_1
+ for regexp, substitution in self.MOSES_PENN_REGEXES_1:
+ text = re.sub(regexp, substitution, text)
+ # Handles nonbreaking prefixes.
+ text = self.handles_nonbreaking_prefixes(text)
+ # Restore ellipsis, clean extra spaces, escape XML symbols.
+ for regexp, substitution in self.MOSES_PENN_REGEXES_2:
+ text = re.sub(regexp, substitution, text)
+ return text if return_str else text.split()
+
+ def tokenize(self, text, agressive_dash_splits=False, return_str=False):
+ """
+ Python port of the Moses tokenizer.
+
+ >>> mtokenizer = MosesTokenizer()
+ >>> text = u'Is 9.5 or 525,600 my favorite number?'
+ >>> print (mtokenizer.tokenize(text, return_str=True))
+ Is 9.5 or 525,600 my favorite number ?
+ >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
+ >>> print (mtokenizer.tokenize(text, return_str=True))
+ The https : / / github.com / jonsafari / tok-tok / blob / master / tok-tok.pl is a website with / and / or slashes and sort of weird : things
+ >>> text = u'This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
+ >>> expected = u'This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
+ >>> assert mtokenizer.tokenize(text, return_str=True) == expected
+
+ :param text: A single string, i.e. sentence text.
+ :type text: str
+ :param agressive_dash_splits: Option to trigger dash split rules.
+ :type agressive_dash_splits: bool
+ """
+ # Converts input string into unicode.
+ text = text_type(text)
+
+ # De-duplicate spaces and clean ASCII junk
+ for regexp, substitution in [self.DEDUPLICATE_SPACE, self.ASCII_JUNK]:
+ text = re.sub(regexp, substitution, text)
+ # Strips heading and trailing spaces.
+ text = text.strip()
+ # Separate special characters outside of IsAlnum character set.
+ regexp, substitution = self.PAD_NOT_ISALNUM
+ text = re.sub(regexp, substitution, text)
+ # Aggressively splits dashes
+ if agressive_dash_splits:
+ regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
+ text = re.sub(regexp, substitution, text)
+ # Replaces multidots with "DOTDOTMULTI" literal strings.
+ text = self.replace_multidots(text)
+ # Separate out "," except if within numbers e.g. 5,300
+ for regexp, substitution in [self.COMMA_SEPARATE_1, self.COMMA_SEPARATE_2]:
+ text = re.sub(regexp, substitution, text)
+
+ # (Language-specific) apostrophe tokenization.
+ if self.lang == 'en':
+ for regexp, substitution in self.ENGLISH_SPECIFIC_APOSTROPHE:
+ text = re.sub(regexp, substitution, text)
+ elif self.lang in ['fr', 'it']:
+ for regexp, substitution in self.FR_IT_SPECIFIC_APOSTROPHE:
+ text = re.sub(regexp, substitution, text)
+ else:
+ regexp, substitution = self.NON_SPECIFIC_APOSTROPHE
+ text = re.sub(regexp, substitution, text)
+
+ # Handles nonbreaking prefixes.
+ text = self.handles_nonbreaking_prefixes(text)
+ # Cleans up extraneous spaces.
+ regexp, substitution = self.DEDUPLICATE_SPACE
+ text = re.sub(regexp,substitution, text).strip()
+ # Restore multidots.
+ text = self.restore_multidots(text)
+ # Escape XML symbols.
+ text = self.escape_xml(text)
+
+ return text if return_str else text.split()
+
+
+class MosesDetokenizer(TokenizerI):
+ """
+ This is a Python port of the Moses Detokenizer from
+ https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl
+
+ >>> tokenizer = MosesTokenizer()
+ >>> text = u'This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
+ >>> expected_tokenized = u'This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
+ >>> tokenized_text = tokenizer.tokenize(text, return_str=True)
+ >>> tokenized_text == expected_tokenized
+ True
+ >>> detokenizer = MosesDetokenizer()
+ >>> expected_detokenized = u'This, is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
+ >>> detokenized_text = detokenizer.detokenize(tokenized_text.split(), return_str=True)
+ >>> detokenized_text == expected_detokenized
+ True
+
+ >>> from nltk.tokenize.moses import MosesTokenizer, MosesDetokenizer
+ >>> t, d = MosesTokenizer(), MosesDetokenizer()
+ >>> sent = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
+ >>> expected_tokens = [u'This', u'ain', u'&apos;t', u'funny', u'.', u'It', u'&apos;s', u'actually', u'hillarious', u',', u'yet', u'double', u'Ls', u'.', u'&#124;', u'&#91;', u'&#93;', u'&lt;', u'&gt;', u'&#91;', u'&#93;', u'&amp;', u'You', u'&apos;re', u'gonna', u'shake', u'it', u'off', u'?', u'Don', u'&apos;t', u'?']
+ >>> expected_detokens = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [] & You're gonna shake it off? Don't?"
+ >>> tokens = t.tokenize(sent)
+ >>> tokens == expected_tokens
+ True
+ >>> detokens = d.detokenize(tokens)
+ >>> " ".join(detokens) == expected_detokens
+ True
+ """
+ # Currency Symbols.
+ IsAlnum = text_type(''.join(perluniprops.chars('IsAlnum')))
+ IsAlpha = text_type(''.join(perluniprops.chars('IsAlpha')))
+ IsSc = text_type(''.join(perluniprops.chars('IsSc')))
+
+ AGGRESSIVE_HYPHEN_SPLIT = r' \@\-\@ ', r'-'
+
+ # Merge multiple spaces.
+ ONE_SPACE = re.compile(r' {2,}'), ' '
+
+ # Unescape special characters.
+ UNESCAPE_FACTOR_SEPARATOR = r'&#124;', r'|'
+ UNESCAPE_LEFT_ANGLE_BRACKET = r'&lt;', r'<'
+ UNESCAPE_RIGHT_ANGLE_BRACKET = r'&gt;', r'>'
+ UNESCAPE_DOUBLE_QUOTE = r'&quot;', r'"'
+ UNESCAPE_SINGLE_QUOTE = r"&apos;", r"'"
+ UNESCAPE_SYNTAX_NONTERMINAL_LEFT = r'&#91;', r'['
+ UNESCAPE_SYNTAX_NONTERMINAL_RIGHT = r'&#93;', r']'
+ UNESCAPE_AMPERSAND = r'&amp;', r'&'
+ # The legacy regexes are used to support outputs from older Moses versions.
+ UNESCAPE_FACTOR_SEPARATOR_LEGACY = r'&bar;', r'|'
+ UNESCAPE_SYNTAX_NONTERMINAL_LEFT_LEGACY = r'&bra;', r'['
+ UNESCAPE_SYNTAX_NONTERMINAL_RIGHT_LEGACY = r'&ket;', r']'
+
+
+ MOSES_UNESCAPE_XML_REGEXES = [UNESCAPE_FACTOR_SEPARATOR_LEGACY,
+ UNESCAPE_FACTOR_SEPARATOR, UNESCAPE_LEFT_ANGLE_BRACKET,
+ UNESCAPE_RIGHT_ANGLE_BRACKET,
+ UNESCAPE_SYNTAX_NONTERMINAL_LEFT_LEGACY,
+ UNESCAPE_SYNTAX_NONTERMINAL_RIGHT_LEGACY,
+ UNESCAPE_DOUBLE_QUOTE, UNESCAPE_SINGLE_QUOTE,
+ UNESCAPE_SYNTAX_NONTERMINAL_LEFT,
+ UNESCAPE_SYNTAX_NONTERMINAL_RIGHT, UNESCAPE_AMPERSAND]
+
+ FINNISH_MORPHSET_1 = [u'N', u'n', u'A', u'a', u'\xc4', u'\xe4', u'ssa',
+ u'Ssa', u'ss\xe4', u'Ss\xe4', u'sta', u'st\xe4',
+ u'Sta', u'St\xe4', u'hun', u'Hun', u'hyn', u'Hyn',
+ u'han', u'Han', u'h\xe4n', u'H\xe4n', u'h\xf6n',
+ u'H\xf6n', u'un', u'Un', u'yn', u'Yn', u'an', u'An',
+ u'\xe4n', u'\xc4n', u'\xf6n', u'\xd6n', u'seen',
+ u'Seen', u'lla', u'Lla', u'll\xe4', u'Ll\xe4', u'lta',
+ u'Lta', u'lt\xe4', u'Lt\xe4', u'lle', u'Lle', u'ksi',
+ u'Ksi', u'kse', u'Kse', u'tta', u'Tta', u'ine', u'Ine']
+
+ FINNISH_MORPHSET_2 = [u'ni', u'si', u'mme', u'nne', u'nsa']
+
+ FINNISH_MORPHSET_3 = [u'ko', u'k\xf6', u'han', u'h\xe4n', u'pa', u'p\xe4',
+ u'kaan', u'k\xe4\xe4n', u'kin']
+
+ FINNISH_REGEX = u'^({})({})?({})$'.format(text_type('|'.join(FINNISH_MORPHSET_1)),
+ text_type('|'.join(FINNISH_MORPHSET_2)),
+ text_type('|'.join(FINNISH_MORPHSET_3)))
+
+
+ def __init__(self, lang='en'):
+ super(MosesDetokenizer, self).__init__()
+ self.lang = lang
+
+
+ def unescape_xml(self, text):
+ for regexp, substitution in self.MOSES_UNESCAPE_XML_REGEXES:
+ text = re.sub(regexp, substitution, text)
+ return text
+
+
+ def tokenize(self, tokens, return_str=False):
+ """
+ Python port of the Moses detokenizer.
+
+ :param tokens: A list of strings, i.e. tokenized text.
+ :type tokens: list(str)
+ :return: str
+ """
+ # Convert the list of tokens into a string and pad it with spaces.
+ text = u" {} ".format(" ".join(tokens))
+ # Converts input string into unicode.
+ text = text_type(text)
+ # Detokenize the aggressive hyphen split.
+ regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
+ text = re.sub(regexp, substitution, text)
+ # Unescape the XML symbols.
+ text = self.unescape_xml(text)
+ # Keep track of no. of quotation marks.
+ quote_counts = {u"'":0 , u'"':0, u"``":0, u"`":0, u"''":0}
+
+ # The *prepend_space* variable is used to control the "effects" of
+ # detokenization as the function loops through the list of tokens and
+ # changes the *prepend_space* accordingly as it sequentially checks
+ # through the language specific and language independent conditions.
+ prepend_space = " "
+ detokenized_text = ""
+ tokens = text.split()
+ # Iterate through every token and apply language specific detokenization rule(s).
+ for i, token in enumerate(iter(tokens)):
+ # Check if the first char is CJK.
+ if is_cjk(token[0]):
+ # Perform left shift if this is a second consecutive CJK word.
+ if i > 0 and is_cjk(token[-1]):
+ detokenized_text += token
+ # But do nothing special if this is a CJK word that doesn't follow a CJK word
+ else:
+ detokenized_text += prepend_space + token
+ prepend_space = " "
+
+ # If it's a currency symbol.
+ elif token in self.IsSc:
+ # Perform right shift on currency and other random punctuation items
+ detokenized_text += prepend_space + token
+ prepend_space = ""
+
+ elif re.search(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$', token):
+ # In French, these punctuations are prefixed with a non-breakable space.
+ if self.lang == 'fr' and re.search(r'^[\?\!\:\;\\\%]$', token):
+ detokenized_text += " "
+ # Perform left shift on punctuation items.
+ detokenized_text += token
+ prepend_space = " "
+
+ elif (self.lang == 'en' and i > 0
+ and re.search(u"^[\'][{}]".format(self.IsAlpha), token)):
+ #and re.search(u'[{}]$'.format(self.IsAlnum), tokens[i-1])):
+ # For English, left-shift the contraction.
+ detokenized_text += token
+ prepend_space = " "
+
+ elif (self.lang == 'cs' and i > 1
+ and re.search(r'^[0-9]+$', tokens[-2]) # If the previous previous token is a number.
+ and re.search(r'^[.,]$', tokens[-1]) # If previous token is a dot.
+ and re.search(r'^[0-9]+$', token)): # If the current token is a number.
+ # In Czech, left-shift floats that are decimal numbers.
+ detokenized_text += token
+ prepend_space = " "
+
+ elif (self.lang in ['fr', 'it'] and i <= len(tokens)-2
+ and re.search(u'[{}][\']$'.format(self.IsAlpha), token)
+ and re.search(u'^[{}]$'.format(self.IsAlpha), tokens[i+1])): # If the next token is alpha.
+ # For French and Italian, right-shift the contraction.
+ detokenized_text += prepend_space + token
+ prepend_space = ""
+
+ elif (self.lang == 'cs' and i <= len(tokens)-3
+ and re.search(u'[{}][\']$'.format(self.IsAlpha), token)
+ and re.search(u'^[-–]$', tokens[i+1])
+ and re.search(u'^li$|^mail.*', tokens[i+2], re.IGNORECASE)): # In Perl, ($words[$i+2] =~ /^li$|^mail.*/i)
+ # In Czech, right-shift "-li" and a few Czech dashed words (e.g. e-mail)
+ detokenized_text += prepend_space + token + tokens[i+1]
+ next(tokens, None) # Advance over the dash
+ prepend_space = ""
+
+ # Combine punctuation smartly.
+ elif re.search(r'''^[\'\"„“`]+$''', token):
+ normalized_quo = token
+ if re.search(r'^[„“”]+$', token):
+ normalized_quo = '"'
+ quote_counts[normalized_quo] = quote_counts.get(normalized_quo, 0)
+
+ if self.lang == 'cs' and token == u"„":
+ quote_counts[normalized_quo] = 0
+ if self.lang == 'cs' and token == u"“":
+ quote_counts[normalized_quo] = 1
+
+
+ if quote_counts[normalized_quo] % 2 == 0:
+ if (self.lang == 'en' and token == u"'" and i > 0
+ and re.search(r'[s]$', tokens[i-1]) ):
+ # Left shift on single quote for possessives ending
+ # in "s", e.g. "The Jones' house"
+ detokenized_text += token
+ prepend_space = " "
+ else:
+ # Right shift.
+ detokenized_text += prepend_space + token
+ prepend_space = ""
+ quote_counts[normalized_quo] += 1
+ else:
+ # Left shift.
+ detokenized_text += token
+ prepend_space = " "
+ quote_counts[normalized_quo] += 1
+
+ elif (self.lang == 'fi' and re.search(r':$', tokens[i-1])
+ and re.search(self.FINNISH_REGEX, token)):
+ # Finnish : without intervening space if followed by case suffix
+ # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
+ detokenized_text += prepend_space + token
+ prepend_space = " "
+
+ else:
+ detokenized_text += prepend_space + token
+ prepend_space = " "
+
+ # Merge multiple spaces.
+ regexp, substitution = self.ONE_SPACE
+ detokenized_text = re.sub(regexp, substitution, detokenized_text)
+ # Removes heading and trailing spaces.
+ detokenized_text = detokenized_text.strip()
+
+ return detokenized_text if return_str else detokenized_text.split()
+
+ def detokenize(self, tokens, return_str=False):
+ """ Duck-typing the abstract *tokenize()*."""
+ return self.tokenize(tokens, return_str)
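
A round-trip sketch (not part of the patch) for the MosesTokenizer/MosesDetokenizer pair added above; it assumes the perluniprops and nonbreaking_prefixes corpora have been downloaded, and the sentence is an invented example.

    from nltk.tokenize.moses import MosesTokenizer, MosesDetokenizer

    tokenizer, detokenizer = MosesTokenizer(lang='en'), MosesDetokenizer(lang='en')
    sentence = "This, is a sentence (with brackets) and numbers like 5,300."
    # tokenize() escapes Moses' special XML characters; detokenize() unescapes
    # them and re-attaches punctuation.
    tokens = tokenizer.tokenize(sentence)
    print(tokens)
    print(detokenizer.detokenize(tokens, return_str=True))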
diff --git a/nltk/tokenize/mwe.py b/nltk/tokenize/mwe.py
index 3c2dda0..40b3705 100644
--- a/nltk/tokenize/mwe.py
+++ b/nltk/tokenize/mwe.py
@@ -1,6 +1,6 @@
# Multi-Word Expression tokenizer
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Rob Malouf <rmalouf at mail.sdsu.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/tokenize/punkt.py b/nltk/tokenize/punkt.py
index 4aa11f1..b5b724c 100644
--- a/nltk/tokenize/punkt.py
+++ b/nltk/tokenize/punkt.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Punkt sentence tokenizer
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Algorithm: Kiss & Strunk (2006)
# Author: Willy <willy at csse.unimelb.edu.au> (original Python port)
# Steven Bird <stevenbird1 at gmail.com> (additions)
@@ -109,7 +109,9 @@ import re
import math
from collections import defaultdict
-from nltk.compat import unicode_repr, python_2_unicode_compatible, string_types
+from six import string_types
+
+from nltk.compat import unicode_repr, python_2_unicode_compatible
from nltk.probability import FreqDist
from nltk.tokenize.api import TokenizerI
@@ -1058,25 +1060,32 @@ class PunktTrainer(PunktBaseClass):
unlike the previous log_l function where it used modified
Dunning log-likelihood values
"""
- import math
-
p = count_b / N
p1 = count_ab / count_a
- p2 = (count_b - count_ab) / (N - count_a)
+ try:
+ p2 = (count_b - count_ab) / (N - count_a)
+ except ZeroDivisionError as e:
+ p2 = 1
- summand1 = (count_ab * math.log(p) +
- (count_a - count_ab) * math.log(1.0 - p))
+ try:
+ summand1 = (count_ab * math.log(p) +
+ (count_a - count_ab) * math.log(1.0 - p))
+ except ValueError as e:
+ summand1 = 0
- summand2 = ((count_b - count_ab) * math.log(p) +
- (N - count_a - count_b + count_ab) * math.log(1.0 - p))
+ try:
+ summand2 = ((count_b - count_ab) * math.log(p) +
+ (N - count_a - count_b + count_ab) * math.log(1.0 - p))
+ except ValueError as e:
+ summand2 = 0
- if count_a == count_ab:
+ if count_a == count_ab or p1 <= 0 or p1 >= 1:
summand3 = 0
else:
summand3 = (count_ab * math.log(p1) +
(count_a - count_ab) * math.log(1.0 - p1))
- if count_b == count_ab:
+ if count_b == count_ab or p2 <= 0 or p2 >= 1:
summand4 = 0
else:
summand4 = ((count_b - count_ab) * math.log(p2) +
@@ -1285,7 +1294,8 @@ class PunktSentenceTokenizer(PunktBaseClass,TokenizerI):
else:
# next sentence starts at following punctuation
last_break = match.end()
- yield slice(last_break, len(text))
+ # The last sentence should not contain trailing whitespace.
+ yield slice(last_break, len(text.rstrip()))
def _realign_boundaries(self, text, slices):
"""
@@ -1597,5 +1607,3 @@ def demo(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
sbd = tok_cls(trainer.get_params())
for l in sbd.sentences_from_text(text):
print(cleanup(l))
-
-
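
A quick sketch (not part of the patch) of the whitespace fix above: with the new slicing, the span of the final sentence should no longer include trailing whitespace. The sample string is invented.

    from nltk.tokenize.punkt import PunktSentenceTokenizer

    tokenizer = PunktSentenceTokenizer()
    text = "Hello there. This is the last sentence.   "
    # The final span now ends at the last non-whitespace character.
    print(list(tokenizer.span_tokenize(text)))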
diff --git a/nltk/tokenize/regexp.py b/nltk/tokenize/regexp.py
index eb5a5e9..cb0b61d 100644
--- a/nltk/tokenize/regexp.py
+++ b/nltk/tokenize/regexp.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Tokenizers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# Trevor Cohn <tacohn at csse.unimelb.edu.au>
diff --git a/nltk/tokenize/repp.py b/nltk/tokenize/repp.py
new file mode 100644
index 0000000..aa2aa6c
--- /dev/null
+++ b/nltk/tokenize/repp.py
@@ -0,0 +1,151 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Interface to the Repp Tokenizer
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Authors: Rebecca Dridan and Stephan Oepen
+# Contributors: Liling Tan
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import unicode_literals, print_function
+from six import text_type
+
+import os
+import re
+import sys
+import subprocess
+import tempfile
+
+
+from nltk.data import ZipFilePathPointer
+from nltk.internals import find_dir
+
+from nltk.tokenize.api import TokenizerI
+
+class ReppTokenizer(TokenizerI):
+ """
+ A class for word tokenization using the REPP parser described in
+ Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to a
+ Long Solved Problem - A Survey, Contrastive Experiment, Recommendations,
+ and Toolkit. In ACL. http://anthology.aclweb.org/P/P12/P12-2.pdf#page=406
+
+ >>> sents = ['Tokenization is widely regarded as a solved problem due to the high accuracy that rulebased tokenizers achieve.' ,
+ ... 'But rule-based tokenizers are hard to maintain and their rules language specific.' ,
+ ... 'We evaluated our method on three languages and obtained error rates of 0.27% (English), 0.35% (Dutch) and 0.76% (Italian) for our best models.'
+ ... ]
+ >>> tokenizer = ReppTokenizer('/home/alvas/repp/') # doctest: +SKIP
+ >>> for sent in sents: # doctest: +SKIP
+ ... tokenizer.tokenize(sent) # doctest: +SKIP
+ ...
+ (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
+ (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
+ (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
+
+ >>> for sent in tokenizer.tokenize_sents(sents): # doctest: +SKIP
+ ... print sent # doctest: +SKIP
+ ...
+ (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
+ (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
+ (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
+ >>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True): # doctest: +SKIP
+ ... print sent # doctest: +SKIP
+ ...
+ [(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)]
+ [(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)]
+ [(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111 [...]
+ """
+ def __init__(self, repp_dir, encoding='utf8'):
+ self.repp_dir = self.find_repptokenizer(repp_dir)
+ # Set a directory to store the temporary files.
+ self.working_dir = tempfile.gettempdir()
+ # Set an encoding for the input strings.
+ self.encoding = encoding
+
+ def tokenize(self, sentence):
+ """
+ Use Repp to tokenize a single sentence.
+
+ :param sentence: A single sentence string.
+ :type sentence: str
+ :return: A tuple of tokens.
+ :rtype: tuple(str)
+ """
+ return next(self.tokenize_sents([sentence]))
+
+ def tokenize_sents(self, sentences, keep_token_positions=False):
+ """
+ Tokenize multiple sentences using Repp.
+
+ :param sentences: A list of sentence strings.
+ :type sentences: list(str)
+ :return: An iterator over tuples of tokens
+ :rtype: iter(tuple(str))
+ """
+ with tempfile.NamedTemporaryFile(prefix='repp_input.',
+ dir=self.working_dir, mode='w', delete=False) as input_file:
+ # Write sentences to temporary input file.
+ for sent in sentences:
+ input_file.write(text_type(sent) + '\n')
+ input_file.close()
+ # Generate command to run REPP.
+ cmd = self.generate_repp_command(input_file.name)
+ # Decode the stdout and strip the trailing newline.
+ repp_output = self._execute(cmd).decode(self.encoding).strip()
+ for tokenized_sent in self.parse_repp_outputs(repp_output):
+ if not keep_token_positions:
+ # Removes token position information.
+ tokenized_sent, starts, ends = zip(*tokenized_sent)
+ yield tokenized_sent
+
+ def generate_repp_command(self, inputfilename):
+ """
+ This method generates the REPP command to run at the terminal.
+
+ :param inputfilename: path to the input file
+ :type inputfilename: str
+ """
+ cmd = [self.repp_dir + '/src/repp']
+ cmd+= ['-c', self.repp_dir + '/erg/repp.set']
+ cmd+= ['--format', 'triple']
+ cmd+= [inputfilename]
+ return cmd
+
+ @staticmethod
+ def _execute(cmd):
+ p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdout, stderr = p.communicate()
+ return stdout
+
+ @staticmethod
+ def parse_repp_outputs(repp_output):
+ """
+ This method parses the tri-tuple format that REPP outputs when the
+ "--format triple" option is used, and returns a generator of tuples of
+ string tokens.
+
+ :param repp_output: The decoded output of the REPP command.
+ :type repp_output: str
+ :return: an iterable of the tokenized sentences as tuples of strings
+ :rtype: iter(tuple)
+ """
+ line_regex = re.compile(r'^\((\d+), (\d+), (.+)\)$', re.MULTILINE)
+ for section in repp_output.split('\n\n'):
+ words_with_positions = [(token, int(start), int(end))
+ for start, end, token in
+ line_regex.findall(section)]
+ words = tuple(t[2] for t in words_with_positions)
+ yield words_with_positions
+
+ def find_repptokenizer(self, repp_dirname):
+ """
+ A method to find the REPP tokenizer binary and its *repp.set* config file.
+ """
+ if os.path.exists(repp_dirname): # If a full path is given.
+ _repp_dir = repp_dirname
+ else: # Try to find path to REPP directory in environment variables.
+ _repp_dir = find_dir(repp_dirname, env_vars=('REPP_TOKENIZER',))
+ # Checks for the REPP binary and erg/repp.set config file.
+ assert os.path.exists(_repp_dir+'/src/repp')
+ assert os.path.exists(_repp_dir+'/erg/repp.set')
+ return _repp_dir
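The parse_repp_outputs() method above hinges on REPP's "--format triple" output: one "(start, end, token)" line per token, with a blank line between sentences. As a minimal standalone sketch (the sample REPP output string below is made up, not taken from a real REPP run), the same regex and comprehension recover the (token, start, end) tuples:

import re

# Hypothetical REPP "--format triple" output.
sample_output = "(0, 12, Tokenization)\n(13, 15, is)\n\n(0, 3, But)"

line_regex = re.compile(r'^\((\d+), (\d+), (.+)\)$', re.MULTILINE)
for section in sample_output.split('\n\n'):
    # Reorder each matched (start, end, token) triple into (token, start, end).
    triples = [(token, int(start), int(end))
               for start, end, token in line_regex.findall(section)]
    print(triples)
# [('Tokenization', 0, 12), ('is', 13, 15)]
# [('But', 0, 3)]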
diff --git a/nltk/tokenize/sexpr.py b/nltk/tokenize/sexpr.py
index 18a1cdf..d1bdb4e 100644
--- a/nltk/tokenize/sexpr.py
+++ b/nltk/tokenize/sexpr.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Tokenizers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Yoav Goldberg <yoavg at cs.bgu.ac.il>
# Steven Bird <stevenbird1 at gmail.com> (minor edits)
# URL: <http://nltk.sourceforge.net>
diff --git a/nltk/tokenize/simple.py b/nltk/tokenize/simple.py
index e519b25..2b7ffe4 100644
--- a/nltk/tokenize/simple.py
+++ b/nltk/tokenize/simple.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Simple Tokenizers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.sourceforge.net>
diff --git a/nltk/tokenize/stanford.py b/nltk/tokenize/stanford.py
index e214d21..ec6b312 100644
--- a/nltk/tokenize/stanford.py
+++ b/nltk/tokenize/stanford.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Tokenizer
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Xu <xxu at student.unimelb.edu.au>
#
# URL: <http://nltk.org/>
@@ -14,12 +14,13 @@ import os
import json
from subprocess import PIPE
-from nltk import compat
-from nltk.internals import find_jar, config_java, java, _java_options, find_jars_within_path
+from six import text_type
+
+from nltk.internals import find_jar, config_java, java, _java_options
from nltk.tokenize.api import TokenizerI
-_stanford_url = 'http://nlp.stanford.edu/software/tokenizer.shtml'
+_stanford_url = 'https://nlp.stanford.edu/software/tokenizer.shtml'
class StanfordTokenizer(TokenizerI):
r"""
@@ -43,11 +44,7 @@ class StanfordTokenizer(TokenizerI):
searchpath=(), url=_stanford_url,
verbose=verbose
)
-
- # Adding logging jar files to classpath
- stanford_dir = os.path.split(self._stanford_jar)[0]
- self._stanford_jar = tuple(find_jars_within_path(stanford_dir))
-
+
self._encoding = encoding
self.java_options = java_options
@@ -82,7 +79,7 @@ class StanfordTokenizer(TokenizerI):
# Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
# Write the actual sentences to the temporary input file
- if isinstance(input_, compat.text_type) and encoding:
+ if isinstance(input_, text_type) and encoding:
input_ = input_.encode(encoding)
input_file.write(input_)
input_file.flush()
@@ -109,5 +106,3 @@ def setup_module(module):
StanfordTokenizer()
except LookupError:
raise SkipTest('doctests from nltk.tokenize.stanford are skipped because the stanford tokenizer jar doesn\'t exist')
-
-
diff --git a/nltk/tokenize/stanford_segmenter.py b/nltk/tokenize/stanford_segmenter.py
index 54a7dbf..40613fc 100644
--- a/nltk/tokenize/stanford_segmenter.py
+++ b/nltk/tokenize/stanford_segmenter.py
@@ -1,10 +1,12 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-# Natural Language Toolkit: Interface to the Stanford Chinese Segmenter
+# Natural Language Toolkit: Interface to the Stanford Segmenter
+# for Chinese and Arabic
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: 52nlp <52nlpcn at gmail.com>
# Casper Lehmann-Strøm <casperlehmann at gmail.com>
+# Alex Constantin <alex at keyworder.ch>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -17,37 +19,46 @@ import json
from subprocess import PIPE
from nltk import compat
-from nltk.internals import find_jar, config_java, java, _java_options
-
+from nltk.internals import find_jar, find_file, find_dir, \
+ config_java, java, _java_options
from nltk.tokenize.api import TokenizerI
-_stanford_url = 'http://nlp.stanford.edu/software'
+from six import text_type
+
+_stanford_url = 'https://nlp.stanford.edu/software'
+
class StanfordSegmenter(TokenizerI):
- r"""
+ """
Interface to the Stanford Segmenter
>>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
- >>> segmenter = StanfordSegmenter(
- ... path_to_jar="stanford-segmenter-3.6.0.jar",
- ... path_to_slf4j = "slf4j-api.jar"
- ... path_to_sihan_corpora_dict="./data",
- ... path_to_model="./data/pku.gz",
- ... path_to_dict="./data/dict-chris6.ser.gz")
- >>> sentence = u"这是斯坦福中文分词器测试"
- >>> segmenter.segment(sentence)
- >>> u'\u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5\n'
- >>> segmenter.segment_file("test.simp.utf8")
- >>> u'\u9762\u5bf9 \u65b0 \u4e16\u7eaa \uff0c \u4e16\u754c \u5404\u56fd ...
+ >>> seg = StanfordSegmenter()
+ >>> seg.default_config('zh')
+ >>> sent = u'这是斯坦福中文分词器测试'
+ >>> print(seg.segment(sent))
+ \u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5
+ <BLANKLINE>
+ >>> seg.default_config('ar')
+ >>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
+ >>> print(seg.segment(sent.split()))
+ \u0647\u0630\u0627 \u0647\u0648 \u062a\u0635\u0646\u064a\u0641 \u0633\u062a\u0627\u0646\u0641\u0648\u0631\u062f \u0627\u0644\u0639\u0631\u0628\u064a \u0644 \u0627\u0644\u0643\u0644\u0645\u0627\u062a
+ <BLANKLINE>
"""
_JAR = 'stanford-segmenter.jar'
_SLF4J = 'slf4j-api.jar'
- def __init__(self, path_to_jar=None, path_to_slf4j=None,
- path_to_sihan_corpora_dict=None,
- path_to_model=None, path_to_dict=None,
- encoding='UTF-8', options=None,
- verbose=False, java_options='-mx2g'):
+ def __init__(self,
+ path_to_jar=None, path_to_slf4j=None,
+ java_class=None,
+ path_to_model=None,
+ path_to_dict=None,
+ path_to_sihan_corpora_dict=None,
+ sihan_post_processing='false',
+ keep_whitespaces='false',
+ encoding='UTF-8', options=None,
+ verbose=False, java_options='-mx2g'):
+
stanford_segmenter = find_jar(
self._JAR, path_to_jar,
env_vars=('STANFORD_SEGMENTER',),
@@ -55,16 +66,19 @@ class StanfordSegmenter(TokenizerI):
verbose=verbose)
slf4j = find_jar(
self._SLF4J, path_to_slf4j,
- env_vars=('SLF4J',),
+ env_vars=('SLF4J', 'STANFORD_SEGMENTER',),
searchpath=(), url=_stanford_url,
verbose=verbose)
# This is passed to java as the -cp option, the segmenter needs slf4j.
- self._stanford_jar = ':'.join(
+ self._stanford_jar = os.pathsep.join(
[_ for _ in [stanford_segmenter, slf4j] if not _ is None])
- self._sihan_corpora_dict = path_to_sihan_corpora_dict
+ self._java_class = java_class
self._model = path_to_model
+ self._sihan_corpora_dict = path_to_sihan_corpora_dict
+ self._sihan_post_processing = sihan_post_processing
+ self._keep_whitespaces = keep_whitespaces
self._dict = path_to_dict
self._encoding = encoding
@@ -72,6 +86,59 @@ class StanfordSegmenter(TokenizerI):
options = {} if options is None else options
self._options_cmd = ','.join('{0}={1}'.format(key, json.dumps(val)) for key, val in options.items())
+ def default_config(self, lang):
+ """
+ Attempt to initialize the Stanford Word Segmenter for the specified language
+ using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables
+ """
+
+ search_path = ()
+ if os.environ.get('STANFORD_SEGMENTER'):
+ search_path = {os.path.join(os.environ.get('STANFORD_SEGMENTER'), 'data')}
+
+ # init for Chinese-specific files
+ self._dict = None
+ self._sihan_corpora_dict = None
+ self._sihan_post_processing = 'false'
+
+ if lang == 'ar':
+ self._java_class = 'edu.stanford.nlp.international.arabic.process.ArabicSegmenter'
+ model = 'arabic-segmenter-atb+bn+arztrain.ser.gz'
+
+ elif lang == 'zh':
+ self._java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
+ model = 'pku.gz'
+ self._sihan_post_processing = 'true'
+
+ path_to_dict = 'dict-chris6.ser.gz'
+ try:
+ self._dict = find_file(path_to_dict, searchpath=search_path,
+ url=_stanford_url, verbose=False,
+ env_vars=('STANFORD_MODELS',))
+ except LookupError:
+ raise LookupError("Could not find '%s' (tried using env. "
+ "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % path_to_dict)
+
+ sihan_dir = './data/'
+ try:
+ path_to_sihan_dir = find_dir(sihan_dir,
+ url=_stanford_url, verbose=False,
+ env_vars=('STANFORD_SEGMENTER',))
+ self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
+ except LookupError:
+ raise LookupError("Could not find '%s' (tried using the "
+ "STANFORD_SEGMENTER environment variable)" % sihan_dir)
+ else:
+ raise LookupError("Unsupported language '%'" % lang)
+
+ try:
+ self._model = find_file(model, searchpath=search_path,
+ url=_stanford_url, verbose=False,
+ env_vars=('STANFORD_MODELS', 'STANFORD_SEGMENTER',))
+ except LookupError:
+ raise LookupError("Could not find '%s' (tried using env. "
+ "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model)
+
def tokenize(self, s):
super().tokenize(s)
@@ -79,14 +146,15 @@ class StanfordSegmenter(TokenizerI):
"""
"""
cmd = [
- 'edu.stanford.nlp.ie.crf.CRFClassifier',
- '-sighanCorporaDict', self._sihan_corpora_dict,
- '-textFile', input_file_path,
- '-sighanPostProcessing', 'true',
- '-keepAllWhitespaces', 'false',
+ self._java_class,
'-loadClassifier', self._model,
- '-serDictionary', self._dict
+ '-keepAllWhitespaces', self._keep_whitespaces,
+ '-textFile', input_file_path
]
+ if self._sihan_corpora_dict is not None:
+ cmd.extend(['-serDictionary', self._dict,
+ '-sighanCorporaDict', self._sihan_corpora_dict,
+ '-sighanPostProcessing', self._sihan_post_processing])
stdout = self._execute(cmd)
@@ -105,20 +173,21 @@ class StanfordSegmenter(TokenizerI):
# Write the actual sentences to the temporary input file
_input_fh = os.fdopen(_input_fh, 'wb')
_input = '\n'.join((' '.join(x) for x in sentences))
- if isinstance(_input, compat.text_type) and encoding:
+ if isinstance(_input, text_type) and encoding:
_input = _input.encode(encoding)
_input_fh.write(_input)
_input_fh.close()
cmd = [
- 'edu.stanford.nlp.ie.crf.CRFClassifier',
- '-sighanCorporaDict', self._sihan_corpora_dict,
- '-textFile', self._input_file_path,
- '-sighanPostProcessing', 'true',
- '-keepAllWhitespaces', 'false',
+ self._java_class,
'-loadClassifier', self._model,
- '-serDictionary', self._dict
+ '-keepAllWhitespaces', self._keep_whitespaces,
+ '-textFile', self._input_file_path
]
+ if self._sihan_corpora_dict is not None:
+ cmd.extend(['-serDictionary', self._dict,
+ '-sighanCorporaDict', self._sihan_corpora_dict,
+ '-sighanPostProcessing', self._sihan_post_processing])
stdout = self._execute(cmd)
@@ -139,8 +208,7 @@ class StanfordSegmenter(TokenizerI):
# Configure java.
config_java(options=self.java_options, verbose=verbose)
- stdout, _stderr = java(
- cmd,classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE)
+ stdout, _stderr = java(cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE)
stdout = stdout.decode(encoding)
# Return java configurations to their default values.
@@ -148,10 +216,13 @@ class StanfordSegmenter(TokenizerI):
return stdout
+
def setup_module(module):
from nose import SkipTest
try:
- StanfordSegmenter()
- except LookupError:
- raise SkipTest('doctests from nltk.tokenize.stanford_segmenter are skipped because the stanford segmenter jar doesn\'t exist')
+ seg = StanfordSegmenter()
+ seg.default_config('ar')
+ seg.default_config('zh')
+ except LookupError as e:
+ raise SkipTest('Tests for nltk.tokenize.stanford_segmenter skipped: %s' % str(e))
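A usage sketch of the new default_config() path, assuming a local Stanford Segmenter install; the paths below are hypothetical, and the jars and models must actually be present for the calls to succeed:

import os
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

# Hypothetical install locations; point these at a real Stanford Segmenter
# distribution and its data/ directory.
os.environ['STANFORD_SEGMENTER'] = '/opt/stanford-segmenter'
os.environ['STANFORD_MODELS'] = '/opt/stanford-segmenter/data'

seg = StanfordSegmenter()
seg.default_config('zh')            # or 'ar' for the Arabic segmenter
print(seg.segment(u'这是斯坦福中文分词器测试'))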
diff --git a/nltk/tokenize/texttiling.py b/nltk/tokenize/texttiling.py
index 9fa359e..65ab241 100644
--- a/nltk/tokenize/texttiling.py
+++ b/nltk/tokenize/texttiling.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: TextTiling
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: George Boutsioukis
#
# URL: <http://nltk.org/>
diff --git a/nltk/tokenize/toktok.py b/nltk/tokenize/toktok.py
new file mode 100644
index 0000000..3c46373
--- /dev/null
+++ b/nltk/tokenize/toktok.py
@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Python port of the tok-tok.pl tokenizer.
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Jon Dehdari
+# Contributors: Liling Tan, Selcuk Ayguney, ikegami, Martijn Pieters
+#
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+"""
+The tok-tok tokenizer is a simple, general tokenizer, where the input has one
+sentence per line; thus only the final period is tokenized.
+
+Tok-tok has been tested on, and gives reasonably good results for English,
+Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
+The input should be in UTF-8 encoding.
+
+Reference:
+Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
+Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
+"""
+
+import re
+from six import text_type
+
+from nltk.tokenize.api import TokenizerI
+
+class ToktokTokenizer(TokenizerI):
+ """
+ This is a Python port of the tok-tok.pl from
+ https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl
+
+ >>> toktok = ToktokTokenizer()
+ >>> text = u'Is 9.5 or 525,600 my favorite number?'
+ >>> print (toktok.tokenize(text, return_str=True))
+ Is 9.5 or 525,600 my favorite number ?
+ >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
+ >>> print (toktok.tokenize(text, return_str=True))
+ The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
+ >>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
+ >>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
+ >>> assert toktok.tokenize(text, return_str=True) == expected
+ >>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
+ True
+ """
+ # Replace non-breaking spaces with normal spaces.
+ NON_BREAKING = re.compile(u"\u00A0"), " "
+
+ # Pad some funky punctuation.
+ FUNKY_PUNCT_1 = re.compile(u'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 "
+ # Pad more funky punctuation.
+ FUNKY_PUNCT_2 = re.compile(u'([({\[“‘„‚«‹「『])'), r" \1 "
+ # Pad En dash and em dash
+ EN_EM_DASHES = re.compile(u'([–—])'), r" \1 "
+
+ # Replace problematic character with numeric character reference.
+ AMPERCENT = re.compile('& '), '&amp; '
+ TAB = re.compile('\t'), ' &#9; '
+ PIPE = re.compile('\|'), ' &#124; '
+
+ # Pad numbers with commas to keep them from further tokenization.
+ COMMA_IN_NUM = re.compile(r'(?<!,)([,،])(?![,\d])'), r' \1 '
+
+ # Just pad problematic (often neurotic) hyphen/single quote, etc.
+ PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r' \1 '
+ # Group ` ` stupid quotes ' ' into a single token.
+ STUPID_QUOTES_1 = re.compile(r" ` ` "), r" `` "
+ STUPID_QUOTES_2 = re.compile(r" ' ' "), r" '' "
+
+ # Don't tokenize a period unless it ends the line and it isn't
+ # preceded by another period, e.g.
+ # "something ..." -> "something ..."
+ # "something." -> "something ."
+ FINAL_PERIOD_1 = re.compile(r"(?<!\.)\.$"), r" ."
+ # Don't tokenize a period unless it ends the line, e.g.
+ # " ... stuff." -> "... stuff ."
+ FINAL_PERIOD_2 = re.compile(r"""(?<!\.)\.\s*(["'’»›”]) *$"""), r" . \1"
+
+ # Treat continuous commas as fake German, Czech, etc.: „
+ MULTI_COMMAS = re.compile(r'(,{2,})'), r' \1 '
+ # Treat continuous dashes as fake en-dash, etc.
+ MULTI_DASHES = re.compile(r'(-{2,})'), r' \1 '
+ # Treat multiple periods as a thing (eg. ellipsis)
+ MULTI_DOTS = re.compile(r'(\.{2,})'), r' \1 '
+
+ # This is the \p{Open_Punctuation} from Perl's perluniprops
+ # see http://perldoc.perl.org/perluniprops.html
+ OPEN_PUNCT = text_type(u'([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d'
+ u'\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772'
+ u'\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983'
+ u'\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993'
+ u'\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26'
+ u'\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016'
+ u'\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39'
+ u'\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b'
+ u'\ufe5d\uff08\uff3b\uff5b\uff5f\uff62')
+ # This is the \p{Close_Punctuation} from Perl's perluniprops
+ CLOSE_PUNCT = text_type(u')]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a'
+ u'\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6'
+ u'\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988'
+ u'\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998'
+ u'\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009'
+ u'\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b'
+ u'\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c'
+ u'\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e'
+ u'\uff09\uff3d\uff5d\uff60\uff63')
+ # This is the \p{Currency_Symbol} from Perl's perluniprops
+ CURRENCY_SYM = text_type(u'$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb'
+ u'\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3'
+ u'\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab'
+ u'\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3'
+ u'\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838'
+ u'\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6')
+
+ # Pad spaces after opening punctuations.
+ OPEN_PUNCT_RE = re.compile(u'([{}])'.format(OPEN_PUNCT)), r'\1 '
+ # Pad spaces before closing punctuations.
+ CLOSE_PUNCT_RE = re.compile(u'([{}])'.format(CLOSE_PUNCT)), r'\1 '
+ # Pad spaces after currency symbols.
+ CURRENCY_SYM_RE = re.compile(u'([{}])'.format(CURRENCY_SYM)), r'\1 '
+
+ # Use for tokenizing URL-unfriendly characters: [:/?#]
+ URL_FOE_1 = re.compile(r':(?!//)'), r' : ' # in perl s{:(?!//)}{ : }g;
+ URL_FOE_2 = re.compile(r'\?(?!\S)'), r' ? ' # in perl s{\?(?!\S)}{ ? }g;
+ # in perl: m{://} or m{\S+\.\S+/\S+} or s{/}{ / }g;
+ URL_FOE_3 = re.compile(r'(:\/\/)[\S+\.\S+\/\S+][\/]'), ' / '
+ URL_FOE_4 = re.compile(r' /'), r' / ' # s{ /}{ / }g;
+
+ # Left/Right strip, i.e. remove heading/trailing spaces.
+ # These strip regexes should NOT be used,
+ # instead use str.lstrip(), str.rstrip() or str.strip()
+ # (They are kept for reference purposes to the original toktok.pl code)
+ LSTRIP = re.compile(r'^ +'), ''
+ RSTRIP = re.compile(r'\s+$'),'\n'
+ # Merge multiple spaces.
+ ONE_SPACE = re.compile(r' {2,}'), ' '
+
+ TOKTOK_REGEXES = [NON_BREAKING, FUNKY_PUNCT_1,
+ URL_FOE_1, URL_FOE_2, URL_FOE_3, URL_FOE_4,
+ AMPERCENT, TAB, PIPE,
+ OPEN_PUNCT_RE, CLOSE_PUNCT_RE,
+ MULTI_COMMAS, COMMA_IN_NUM, FINAL_PERIOD_2,
+ PROB_SINGLE_QUOTES, STUPID_QUOTES_1, STUPID_QUOTES_2,
+ CURRENCY_SYM_RE, EN_EM_DASHES, MULTI_DASHES, MULTI_DOTS,
+ FINAL_PERIOD_1, FINAL_PERIOD_2, ONE_SPACE]
+
+ def tokenize(self, text, return_str=False):
+ text = text_type(text) # Converts input string into unicode.
+ for regexp, substitution in self.TOKTOK_REGEXES:
+ text = regexp.sub(substitution, text)
+ # Finally, strip leading and trailing spaces
+ # and ensure the output string is unicode.
+ text = text_type(text.strip())
+ return text if return_str else text.split()
\ No newline at end of file
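ToktokTokenizer above is essentially an ordered list of (compiled regex, replacement) pairs applied one after another. A self-contained sketch of that pipeline pattern, using two toy rules that are illustrative only (not taken from tok-tok itself):

import re

# Two illustrative rules; the real TOKTOK_REGEXES list is much longer.
RULES = [
    (re.compile(r'([?!])'), r' \1 '),   # pad terminal punctuation
    (re.compile(r' {2,}'), ' '),        # collapse repeated spaces
]

def toy_tokenize(text, return_str=False):
    for regexp, substitution in RULES:
        text = regexp.sub(substitution, text)
    text = text.strip()
    return text if return_str else text.split()

print(toy_tokenize(u'Is this a test?'))   # ['Is', 'this', 'a', 'test', '?']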
diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py
index 73216d9..2d7b162 100644
--- a/nltk/tokenize/treebank.py
+++ b/nltk/tokenize/treebank.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Tokenizers
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Michael Heilman <mheilman at cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
#
@@ -18,6 +18,24 @@ and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
import re
from nltk.tokenize.api import TokenizerI
+from nltk.tokenize.util import align_tokens
+
+
+class MacIntyreContractions:
+ """
+ List of contractions adapted from Robert MacIntyre's tokenizer.
+ """
+ CONTRACTIONS2 = [r"(?i)\b(can)(?#X)(not)\b",
+ r"(?i)\b(d)(?#X)('ye)\b",
+ r"(?i)\b(gim)(?#X)(me)\b",
+ r"(?i)\b(gon)(?#X)(na)\b",
+ r"(?i)\b(got)(?#X)(ta)\b",
+ r"(?i)\b(lem)(?#X)(me)\b",
+ r"(?i)\b(mor)(?#X)('n)\b",
+ r"(?i)\b(wan)(?#X)(na)\s"]
+ CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
+ CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b",
+ r"(?i)\b(wha)(t)(cha)\b"]
class TreebankWordTokenizer(TokenizerI):
@@ -58,50 +76,55 @@ class TreebankWordTokenizer(TokenizerI):
(re.compile(r'([:,])$'), r' \1 '),
(re.compile(r'\.\.\.'), r' ... '),
(re.compile(r'[;@#$%&]'), r' \g<0> '),
- (re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), r'\1 \2\3 '),
+ (re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), r'\1 \2\3 '), # Handles the final period.
(re.compile(r'[?!]'), r' \g<0> '),
(re.compile(r"([^'])' "), r"\1 ' "),
]
- #parens, brackets, etc.
- PARENS_BRACKETS = [
- (re.compile(r'[\]\[\(\)\{\}\<\>]'), r' \g<0> '),
- (re.compile(r'--'), r' -- '),
+ # Pads parentheses
+ PARENS_BRACKETS = (re.compile(r'[\]\[\(\)\{\}\<\>]'), r' \g<0> ')
+
+ # Optionally: convert parentheses and brackets to PTB symbols.
+ CONVERT_PARENTHESES = [
+ (re.compile(r'\('), '-LRB-'), (re.compile(r'\)'), '-RRB-'),
+ (re.compile(r'\['), '-LSB-'), (re.compile(r'\]'), '-RSB-'),
+ (re.compile(r'\{'), '-LCB-'), (re.compile(r'\}'), '-RCB-')
]
+ DOUBLE_DASHES = (re.compile(r'--'), r' -- ')
+
#ending quotes
ENDING_QUOTES = [
(re.compile(r'"'), " '' "),
(re.compile(r'(\S)(\'\')'), r'\1 \2 '),
-
(re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
(re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
]
# List of contractions adapted from Robert MacIntyre's tokenizer.
- CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
- re.compile(r"(?i)\b(d)('ye)\b"),
- re.compile(r"(?i)\b(gim)(me)\b"),
- re.compile(r"(?i)\b(gon)(na)\b"),
- re.compile(r"(?i)\b(got)(ta)\b"),
- re.compile(r"(?i)\b(lem)(me)\b"),
- re.compile(r"(?i)\b(mor)('n)\b"),
- re.compile(r"(?i)\b(wan)(na) ")]
- CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
- re.compile(r"(?i) ('t)(was)\b")]
- CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
- re.compile(r"(?i)\b(wha)(t)(cha)\b")]
-
- def tokenize(self, text):
+ _contractions = MacIntyreContractions()
+ CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
+ CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
+
+ def tokenize(self, text, convert_parentheses=False, return_str=False):
for regexp, substitution in self.STARTING_QUOTES:
text = regexp.sub(substitution, text)
for regexp, substitution in self.PUNCTUATION:
text = regexp.sub(substitution, text)
- for regexp, substitution in self.PARENS_BRACKETS:
- text = regexp.sub(substitution, text)
+ # Handles parentheses.
+ regexp, substitution = self.PARENS_BRACKETS
+ text = regexp.sub(substitution, text)
+ # Optionally convert parentheses
+ if convert_parentheses:
+ for regexp, substitution in self.CONVERT_PARENTHESES:
+ text = regexp.sub(substitution, text)
+
+ # Handles double dash.
+ regexp, substitution = self.DOUBLE_DASHES
+ text = regexp.sub(substitution, text)
#add extra space to make things easier
text = " " + text + " "
@@ -116,9 +139,168 @@ class TreebankWordTokenizer(TokenizerI):
# We are not using CONTRACTIONS4 since
# they are also commented out in the SED scripts
- # for regexp in self.CONTRACTIONS4:
+ # for regexp in self._contractions.CONTRACTIONS4:
# text = regexp.sub(r' \1 \2 \3 ', text)
- return text.split()
+ return text if return_str else text.split()
+
+ def span_tokenize(self, text):
+ """
+ Uses the post-hoc nltk.tokenize.util.align_tokens to return the offset spans.
+
+ >>> from nltk.tokenize import TreebankWordTokenizer
+ >>> s = '''Good muffins cost $3.88\\nin New (York). Please (buy) me\\ntwo of them.\\n(Thanks).'''
+ >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
+ ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
+ ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
+ ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
+ >>> TreebankWordTokenizer().span_tokenize(s) == expected
+ True
+ >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
+ ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
+ ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
+ >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
+ True
+
+ """
+ tokens = self.tokenize(text)
+ return align_tokens(tokens, text)
+
+
+class TreebankWordDetokenizer(TokenizerI):
+ """
+ The Treebank detokenizer uses the reverse regex operations corresponding to
+ the Treebank tokenizer's regexes.
+
+ Note:
+ - There are additional assumptions made when undoing the padding of [;@#$%&]
+ punctuation symbols that are not presupposed in the TreebankTokenizer.
+ - There are additional regexes added in reversing the parentheses tokenization,
+ e.g. the r'([\]\)\}\>])\s([:;,.])' removes the extra right padding added
+ to closing parentheses preceding [:;,.].
+ - It is not possible to return the original whitespaces as they were, because
+ there is no explicit record of where '\n', '\t' or '\s' were removed during
+ the text.split() operation.
+
+ >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
+ >>> s = '''Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.'''
+ >>> d = TreebankWordDetokenizer()
+ >>> t = TreebankWordTokenizer()
+ >>> toks = t.tokenize(s)
+ >>> d.detokenize(toks)
+ 'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'
+
+ The MXPOST parentheses substitution can be undone using the `convert_parentheses`
+ parameter:
+
+ >>> s = '''Good muffins cost $3.88\\nin New (York). Please (buy) me\\ntwo of them.\\n(Thanks).'''
+ >>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
+ ... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy',
+ ... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.']
+ >>> expected_tokens == t.tokenize(s, convert_parentheses=True)
+ True
+ >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
+ >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
+ True
+ """
+ _contractions = MacIntyreContractions()
+ CONTRACTIONS2 = [re.compile(pattern.replace('(?#X)', '\s'))
+ for pattern in _contractions.CONTRACTIONS2]
+ CONTRACTIONS3 = [re.compile(pattern.replace('(?#X)', '\s'))
+ for pattern in _contractions.CONTRACTIONS3]
+
+ #ending quotes
+ ENDING_QUOTES = [
+ (re.compile(r"([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1\2 "),
+ (re.compile(r"([^' ])\s('[sS]|'[mM]|'[dD]|') "), r"\1\2 "),
+ (re.compile(r'(\S)(\'\')'), r'\1\2 '),
+ (re.compile(r" '' "), '"')
+ ]
+
+ # Handles double dashes
+ DOUBLE_DASHES = (re.compile(r' -- '), r'--')
+
+ # Optionally: convert parentheses and brackets back from PTB symbols.
+ CONVERT_PARENTHESES = [
+ (re.compile('-LRB-'), '('), (re.compile('-RRB-'), ')'),
+ (re.compile('-LSB-'), '['), (re.compile('-RSB-'), ']'),
+ (re.compile('-LCB-'), '{'), (re.compile('-RCB-'), '}')
+ ]
+
+ # Undo padding on parentheses.
+ PARENS_BRACKETS = [(re.compile(r'\s([\[\(\{\<])\s'), r' \g<1>'),
+ (re.compile(r'\s([\]\)\}\>])\s'), r'\g<1> '),
+ (re.compile(r'([\]\)\}\>])\s([:;,.])'), r'\1\2')]
+
+ #punctuation
+ PUNCTUATION = [
+ (re.compile(r"([^'])\s'\s"), r"\1' "),
+ (re.compile(r'\s([?!])\s'), r'\g<1>'),
+ (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r'\1\2\3'),
+ # When tokenizing, [;@#$%&] are padded with whitespace regardless of
+ # whether there are spaces before or after them.
+ # But during detokenization, we need to distinguish between left/right
+ # pad, so we split this up.
+ (re.compile(r'\s([#$])\s'), r' \g<1>'), # Left pad.
+ (re.compile(r'\s([;%])\s'), r'\g<1> '), # Right pad.
+ (re.compile(r'\s([&])\s'), r' \g<1> '), # Unknown pad.
+ (re.compile(r'\s\.\.\.\s'), r'...'),
+ (re.compile(r'\s([:,])\s$'), r'\1'),
+ (re.compile(r'\s([:,])\s([^\d])'), r'\1\2')
+ ]
+
+ #starting quotes
+ STARTING_QUOTES = [
+ (re.compile(r'([ (\[{<])\s``'), r'\1"'),
+ (re.compile(r'\s(``)\s'), r'\1'),
+ (re.compile(r'^``'), r'\"'),
+ ]
+
+ def tokenize(self, tokens, convert_parentheses=False):
+ """
+ Detokenize a list of tokens by reversing the Treebank tokenizer's regexes.
+
+ :param tokens: A list of strings, i.e. tokenized text.
+ :type tokens: list(str)
+ :return: str
+ """
+ text = ' '.join(tokens)
+ # Reverse the contractions regexes.
+ # Note: CONTRACTIONS4 are not used in tokenization.
+ for regexp in self.CONTRACTIONS3:
+ text = regexp.sub(r'\1\2', text)
+ for regexp in self.CONTRACTIONS2:
+ text = regexp.sub(r'\1\2', text)
+
+ # Reverse the regexes applied for ending quotes.
+ for regexp, substitution in self.ENDING_QUOTES:
+ text = regexp.sub(substitution, text)
+
+ # Undo the space padding.
+ text = text.strip()
+
+ # Reverse the padding on double dashes.
+ regexp, substitution = self.DOUBLE_DASHES
+ text = regexp.sub(substitution, text)
+
+ if convert_parentheses:
+ for regexp, substitution in self.CONVERT_PARENTHESES:
+ text = regexp.sub(substitution, text)
+
+ # Reverse the padding regexes applied for parenthesis/brackets.
+ for regexp, substitution in self.PARENS_BRACKETS:
+ text = regexp.sub(substitution, text)
+
+ # Reverse the regexes applied for punctuations.
+ for regexp, substitution in self.PUNCTUATION:
+ text = regexp.sub(substitution, text)
+
+ # Reverse the regexes applied for starting quotes.
+ for regexp, substitution in self.STARTING_QUOTES:
+ text = regexp.sub(substitution, text)
+ return text.strip()
+ def detokenize(self, tokens, convert_parentheses=False):
+ """ Duck-typing the abstract *tokenize()*."""
+ return self.tokenize(tokens, convert_parentheses)
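The `(?#X)` marker inside MacIntyreContractions above is a regex comment, so the tokenizer's compiled patterns match the joined form (e.g. "cannot") directly, while TreebankWordDetokenizer swaps the comment for `\s` so the very same pattern list matches the split form. A small sketch of that trick; the replacement strings here are simplified relative to the ones used in the classes above:

import re

pattern = r"(?i)\b(can)(?#X)(not)\b"          # taken from CONTRACTIONS2

tokenize_re = re.compile(pattern)                        # (?#X) matches nothing
detokenize_re = re.compile(pattern.replace('(?#X)', r'\s'))

print(tokenize_re.sub(r'\1 \2', "I cannot go"))          # I can not go
print(detokenize_re.sub(r'\1\2', "I can not go"))        # I cannot go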
diff --git a/nltk/tokenize/util.py b/nltk/tokenize/util.py
index a49900b..7229e21 100644
--- a/nltk/tokenize/util.py
+++ b/nltk/tokenize/util.py
@@ -1,11 +1,13 @@
+# -*- coding: utf-8 -*-
# Natural Language Toolkit: Tokenizer Utilities
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
from re import finditer
+from xml.sax.saxutils import escape
def string_span_tokenize(s, sep):
r"""
@@ -88,3 +90,146 @@ def spans_to_relative(spans):
prev = right
+class CJKChars(object):
+ """
+ An object that enumerates the code points of the CJK characters as listed on
+ http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane
+
+ This is a Python port of the CJK code point enumerations of Moses tokenizer:
+ https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309
+ """
+ # Hangul Jamo (1100–11FF)
+ Hangul_Jamo = (4352, 4607) # (ord(u"\u1100"), ord(u"\u11ff"))
+
+ # CJK Radicals Supplement (2E80–2EFF)
+ # Kangxi Radicals (2F00–2FDF)
+ # Ideographic Description Characters (2FF0–2FFF)
+ # CJK Symbols and Punctuation (3000–303F)
+ # Hiragana (3040–309F)
+ # Katakana (30A0–30FF)
+ # Bopomofo (3100–312F)
+ # Hangul Compatibility Jamo (3130–318F)
+ # Kanbun (3190–319F)
+ # Bopomofo Extended (31A0–31BF)
+ # CJK Strokes (31C0–31EF)
+ # Katakana Phonetic Extensions (31F0–31FF)
+ # Enclosed CJK Letters and Months (3200–32FF)
+ # CJK Compatibility (3300–33FF)
+ # CJK Unified Ideographs Extension A (3400–4DBF)
+ # Yijing Hexagram Symbols (4DC0–4DFF)
+ # CJK Unified Ideographs (4E00–9FFF)
+ # Yi Syllables (A000–A48F)
+ # Yi Radicals (A490–A4CF)
+ CJK_Radicals = (11904, 42191) # (ord(u"\u2e80"), ord(u"\ua4cf"))
+
+ # Phags-pa (A840–A87F)
+ Phags_Pa = (43072, 43135) # (ord(u"\ua840"), ord(u"\ua87f"))
+
+ # Hangul Syllables (AC00–D7AF)
+ Hangul_Syllables = (44032, 55215) # (ord(u"\uAC00"), ord(u"\uD7AF"))
+
+ # CJK Compatibility Ideographs (F900–FAFF)
+ CJK_Compatibility_Ideographs = (63744, 64255) # (ord(u"\uF900"), ord(u"\uFAFF"))
+
+ # CJK Compatibility Forms (FE30–FE4F)
+ CJK_Compatibility_Forms = (65072, 65103) # (ord(u"\uFE30"), ord(u"\uFE4F"))
+
+ # Range U+FF65–FFDC encodes halfwidth forms of Katakana and Hangul characters
+ Katakana_Hangul_Halfwidth = (65381, 65500) # (ord(u"\uFF65"), ord(u"\uFFDC"))
+
+ # Supplementary Ideographic Plane 20000–2FFFF
+ Supplementary_Ideographic_Plane = (131072, 196607) # (ord(u"\U00020000"), ord(u"\U0002FFFF"))
+
+ ranges = [Hangul_Jamo, CJK_Radicals, Phags_Pa, Hangul_Syllables,
+ CJK_Compatibility_Ideographs, CJK_Compatibility_Forms,
+ Katakana_Hangul_Halfwidth, Supplementary_Ideographic_Plane]
+
+
+
+def is_cjk(character):
+ """
+ Python port of Moses' code to check whether a character is CJK.
+
+ >>> CJKChars().ranges
+ [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607)]
+ >>> is_cjk(u'\u33fe')
+ True
+ >>> is_cjk(u'\uFE5F')
+ False
+
+ :param character: The character that needs to be checked.
+ :type character: char
+ :return: bool
+ """
+ return any([start <= ord(character) <= end for start, end in
+ [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215),
+ (63744, 64255), (65072, 65103), (65381, 65500),
+ (131072, 196607)]
+ ])
+
+
+def xml_escape(text):
+ """
+ This function transforms the input text into an "escaped" version suitable
+ for well-formed XML formatting.
+
+ Note that the default xml.sax.saxutils.escape() function doesn't escape
+ some characters that Moses does, so we have to add them manually to the
+ entities dictionary.
+
+ >>> input_str = ''')| & < > ' " ] ['''
+ >>> expected_output = ''')| &amp; &lt; &gt; ' " ] ['''
+ >>> escape(input_str) == expected_output
+ True
+ >>> xml_escape(input_str)
+ ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
+
+ :param text: The text that needs to be escaped.
+ :type text: str
+ :rtype: str
+ """
+ return escape(text, entities={ r"'": r"&apos;", r'"': r"&quot;",
+ r"|": r"&#124;",
+ r"[": r"&#91;", r"]": r"&#93;", })
+
+
+def align_tokens(tokens, sentence):
+ """
+ This function attempts to find the offsets of the tokens in *sentence*, as a
+ sequence of ``(start, end)`` tuples, given the tokens and the source string.
+
+ >>> from nltk.tokenize import TreebankWordTokenizer
+ >>> from nltk.tokenize.util import align_tokens
+ >>> s = str("The plane, bound for St Petersburg, crashed in Egypt's "
+ ... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh "
+ ... "on Saturday.")
+ >>> tokens = TreebankWordTokenizer().tokenize(s)
+ >>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23),
+ ... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54),
+ ... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89),
+ ... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122),
+ ... (123, 131), (131, 132)]
+ >>> output = list(align_tokens(tokens, s))
+ >>> len(tokens) == len(expected) == len(output) # Check that length of tokens and tuples are the same.
+ True
+ >>> expected == list(align_tokens(tokens, s)) # Check that the output is as expected.
+ True
+ >>> tokens == [s[start:end] for start, end in output] # Check that the slices of the string correspond to the tokens.
+ True
+
+ :param tokens: The list of strings that are the result of tokenization
+ :type tokens: list(str)
+ :param sentence: The original string
+ :type sentence: str
+ :rtype: list(tuple(int,int))
+ """
+ point = 0
+ offsets = []
+ for token in tokens:
+ try:
+ start = sentence.index(token, point)
+ except ValueError:
+ raise ValueError('substring "{}" not found in "{}"'.format(token, sentence))
+ point = start + len(token)
+ offsets.append((start, point))
+ return offsets
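align_tokens() above works with a single moving pointer: each token is searched for starting at the end of the previous match, so repeated tokens map to successive offsets. A stripped-down sketch of the same loop, without the ValueError wrapping:

def align_tokens_sketch(tokens, sentence):
    point, offsets = 0, []
    for token in tokens:
        start = sentence.index(token, point)   # search from the previous end
        point = start + len(token)
        offsets.append((start, point))
    return offsets

print(align_tokens_sketch(['a', 'b', 'a'], 'a b a'))   # [(0, 1), (2, 3), (4, 5)]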
diff --git a/nltk/toolbox.py b/nltk/toolbox.py
index cecb282..a02ecbd 100644
--- a/nltk/toolbox.py
+++ b/nltk/toolbox.py
@@ -1,7 +1,7 @@
# coding: utf-8
# Natural Language Toolkit: Toolbox Reader
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Greg Aumann <greg_aumann at sil.org>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
@@ -15,7 +15,9 @@ from __future__ import print_function
import os, re, codecs
from xml.etree.ElementTree import ElementTree, TreeBuilder, Element, SubElement
-from nltk.compat import StringIO, u, PY3
+from six import u
+
+from nltk.compat import StringIO, PY3
from nltk.data import PathPointer, ZipFilePathPointer, find
diff --git a/nltk/translate/__init__.py b/nltk/translate/__init__.py
index 6ec96a0..9e243e4 100644
--- a/nltk/translate/__init__.py
+++ b/nltk/translate/__init__.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Machine Translation
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>, Tah Wei Hoon <hoon.tw at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/translate/api.py b/nltk/translate/api.py
index d90bd50..c05db53 100644
--- a/nltk/translate/api.py
+++ b/nltk/translate/api.py
@@ -1,6 +1,6 @@
-# Natural Language Toolkit: API for alignment and translation objects
+# Natural Language Toolkit: API for alignment and translation objects
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Will Zhang <wilzzha at gmail.com>
# Guan Gui <ggui at student.unimelb.edu.au>
# Steven Bird <stevenbird1 at gmail.com>
@@ -12,7 +12,7 @@ from __future__ import print_function, unicode_literals
import subprocess
from collections import namedtuple
-from nltk.compat import python_2_unicode_compatible, string_types
+from nltk.compat import python_2_unicode_compatible
@python_2_unicode_compatible
class AlignedSent(object):
@@ -63,7 +63,7 @@ class AlignedSent(object):
def _get_alignment(self):
return self._alignment
-
+
def _set_alignment(self, alignment):
_check_alignment(len(self.words), len(self.mots), alignment)
self._alignment = alignment
@@ -82,41 +82,41 @@ class AlignedSent(object):
def _to_dot(self):
"""
- Dot representation of the aligned sentence
- """
+ Dot representation of the aligned sentence
+ """
s = 'graph align {\n'
s += 'node[shape=plaintext]\n'
-
- # Declare node
+
+ # Declare node
for w in self._words:
s += '"%s_source" [label="%s"] \n' % (w, w)
-
+
for w in self._mots:
s += '"%s_target" [label="%s"] \n' % (w, w)
-
- # Alignment
- for u,v in self._alignment:
+
+ # Alignment
+ for u,v in self._alignment:
s += '"%s_source" -- "%s_target" \n' % (self._words[u] , self._mots[v] )
-
- # Connect the source words
+
+ # Connect the source words
for i in range(len(self._words)-1) :
s += '"%s_source" -- "%s_source" [style=invis]\n' % (self._words[i] , self._words[i+1])
-
- # Connect the target words
+
+ # Connect the target words
for i in range(len(self._mots)-1) :
s += '"%s_target" -- "%s_target" [style=invis]\n' % (self._mots[i] , self._mots[i+1])
-
- # Put it in the same rank
+
+ # Put it in the same rank
s += '{rank = same; %s}\n' % (' '.join('"%s_source"' % w for w in self._words))
s += '{rank = same; %s}\n' % (' '.join('"%s_target"' % w for w in self._mots))
s += '}'
-
- return s
-
+
+ return s
+
def _repr_svg_(self):
"""
- Ipython magic : show SVG representation of this ``AlignedSent``.
+ IPython magic: show the SVG representation of this ``AlignedSent``.
"""
dot_string = self._to_dot().encode('utf8')
output_format = 'svg'
@@ -126,10 +126,10 @@ class AlignedSent(object):
except OSError:
raise Exception('Cannot find the dot binary from Graphviz package')
out, err = process.communicate(dot_string)
-
+
return out.decode('utf8')
-
-
+
+
def __str__(self):
"""
Return a human-readable string representation for this ``AlignedSent``.
diff --git a/nltk/translate/bleu_score.py b/nltk/translate/bleu_score.py
index 668149b..e30d112 100644
--- a/nltk/translate/bleu_score.py
+++ b/nltk/translate/bleu_score.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: BLEU Score
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# Contributors: Dmitrijs Milajevs, Liling Tan
# URL: <http://nltk.org/>
@@ -11,7 +11,9 @@
from __future__ import division
import math
+import sys
import fractions
+import warnings
from collections import Counter
from nltk.util import ngrams
@@ -21,14 +23,15 @@ try:
from fractions import Fraction
except TypeError:
from nltk.compat import Fraction
-
+
def sentence_bleu(references, hypothesis, weights=(0.25, 0.25, 0.25, 0.25),
- smoothing_function=None):
+ smoothing_function=None, auto_reweigh=False,
+ emulate_multibleu=False):
"""
Calculate BLEU score (Bilingual Evaluation Understudy) from
Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
- "BLEU: a method for automatic evaluation of machine translation."
+ "BLEU: a method for automatic evaluation of machine translation."
In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf
>>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
@@ -59,37 +62,45 @@ def sentence_bleu(references, hypothesis, weights=(0.25, 0.25, 0.25, 0.25),
0.3969...
The default BLEU calculates a score for up to 4grams using uniform
- weights. To evaluate your translations with higher/lower order ngrams,
+ weights. To evaluate your translations with higher/lower order ngrams,
use customized weights. E.g. when accounting for up to 5-grams with uniform
weights:
>>> weights = (0.1666, 0.1666, 0.1666, 0.1666, 0.1666)
- >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights)
- 0.45838627164939455
-
+ >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
+ 0.4583...
+
:param references: reference sentences
:type references: list(list(str))
:param hypothesis: a hypothesis sentence
:type hypothesis: list(str)
:param weights: weights for unigrams, bigrams, trigrams and so on
:type weights: list(float)
+ :param smoothing_function: A SmoothingFunction method used to smooth 0-count precisions.
+ :type smoothing_function: SmoothingFunction
+ :param auto_reweigh: Option to re-normalize the weights uniformly.
+ :type auto_reweigh: bool
+ :param emulate_multibleu: Round the BLEU score to 4 decimal places to emulate multi-bleu output.
+ :type emulate_multibleu: bool
:return: The sentence-level BLEU score.
:rtype: float
"""
- return corpus_bleu([references], [hypothesis], weights, smoothing_function)
+ return corpus_bleu([references], [hypothesis],
+ weights, smoothing_function, auto_reweigh,
+ emulate_multibleu)
def corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25),
- smoothing_function=None):
+ smoothing_function=None, auto_reweigh=False,
+ emulate_multibleu=False):
"""
- Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
- the hypotheses and their respective references.
+ Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
+ the hypotheses and their respective references.
- Instead of averaging the sentence level BLEU scores (i.e. marco-average
- precision), the original BLEU metric (Papineni et al. 2002) accounts for
+ Instead of averaging the sentence level BLEU scores (i.e. macro-average
+ precision), the original BLEU metric (Papineni et al. 2002) accounts for
the micro-average precision (i.e. summing the numerators and denominators
for each hypothesis-reference(s) pairs before the division).
-
+
>>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
... 'ensures', 'that', 'the', 'military', 'always',
... 'obeys', 'the', 'commands', 'of', 'the', 'party']
@@ -102,31 +113,36 @@ def corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25)
>>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
... 'army', 'always', 'to', 'heed', 'the', 'directions',
... 'of', 'the', 'party']
-
- >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
+
+ >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
... 'interested', 'in', 'world', 'history']
- >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
+ >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
... 'because', 'he', 'read', 'the', 'book']
-
+
>>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
>>> hypotheses = [hyp1, hyp2]
>>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
0.5920...
-
- The example below show that corpus_bleu() is different from averaging
- sentence_bleu() for hypotheses
-
+
+ The example below shows that corpus_bleu() is different from averaging
+ sentence_bleu() for hypotheses.
+
>>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
>>> score2 = sentence_bleu([ref2a], hyp2)
>>> (score1 + score2) / 2 # doctest: +ELLIPSIS
0.6223...
-
+
:param references: a corpus of lists of reference sentences, w.r.t. hypotheses
:type references: list(list(list(str)))
:param hypotheses: a list of hypothesis sentences
:type hypotheses: list(list(str))
:param weights: weights for unigrams, bigrams, trigrams and so on
:type weights: list(float)
+ :param smoothing_function: A SmoothingFunction method used to smooth 0-count precisions.
+ :type smoothing_function: SmoothingFunction
+ :param auto_reweigh: Option to re-normalize the weights uniformly.
+ :type auto_reweigh: bool
+ :param emulate_multibleu: Round the BLEU score to 4 decimal places to emulate multi-bleu output.
+ :type emulate_multibleu: bool
:return: The corpus-level BLEU score.
:rtype: float
"""
@@ -135,49 +151,55 @@ def corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25)
p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
hyp_lengths, ref_lengths = 0, 0
-
+
assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their reference(s) should be the same"
-
+
# Iterate through each hypothesis and their corresponding references.
for references, hypothesis in zip(list_of_references, hypotheses):
# For each order of ngram, calculate the numerator and
# denominator for the corpus-level modified precision.
- for i, _ in enumerate(weights, start=1):
+ for i, _ in enumerate(weights, start=1):
p_i = modified_precision(references, hypothesis, i)
p_numerators[i] += p_i.numerator
p_denominators[i] += p_i.denominator
-
+
# Calculate the hypothesis length and the closest reference length.
# Adds them to the corpus-level hypothesis and reference counts.
hyp_len = len(hypothesis)
hyp_lengths += hyp_len
ref_lengths += closest_ref_length(references, hyp_len)
-
+
# Calculate corpus-level brevity penalty.
bp = brevity_penalty(ref_lengths, hyp_lengths)
-
+
+ # Re-weight uniformly based on the hypothesis length if the largest
+ # order of n-grams is < 4 and the weights are left at the default.
+ if auto_reweigh:
+ if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
+ weights = (1 / hyp_lengths,) * hyp_lengths
+
# Collects the various precision values for the different ngram orders.
- p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
+ p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
for i, _ in enumerate(weights, start=1)]
-
- # Returns 0 if there's no matching n-grams
+
+ # Returns 0 if there's no matching n-grams
# We only need to check for p_numerators[1] == 0, since if there's
# no unigrams, there won't be any higher order ngrams.
if p_numerators[1] == 0:
return 0
-
+
+ # If there's no smoothing, use method0 from the SmoothingFunction class.
+ if not smoothing_function:
+ smoothing_function = SmoothingFunction().method0
# Smoothen the modified precision.
- # Note: smooth_precision() converts values into float.
- if smoothing_function:
- p_n = smoothing_function(p_n, references=references,
- hypothesis=hypothesis, hyp_len=hyp_len)
-
- # Calculates the overall modified precision for all ngrams.
- # By sum of the product of the weights and the respective *p_n*
- s = (w * math.log(p_i) for w, p_i in zip(weights, p_n)
- if p_i.numerator != 0)
-
- return bp * math.exp(math.fsum(s))
+ # Note: smoothing_function() may convert values into floats;
+ # it tries to retain the Fraction object as much as the
+ # smoothing method allows.
+ p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis,
+ hyp_len=hyp_len, emulate_multibleu=emulate_multibleu)
+ s = (w * math.log(p_i) for i, (w, p_i) in enumerate(zip(weights, p_n)))
+ s = bp * math.exp(math.fsum(s))
+ return round(s, 4) if emulate_multibleu else s
def modified_precision(references, hypothesis, n):
@@ -186,28 +208,28 @@ def modified_precision(references, hypothesis, n):
The normal precision method may lead to some wrong translations with
high-precision, e.g., the translation, in which a word of reference
- repeats several times, has very high precision.
+ repeats several times, has very high precision.
+
+ This function only returns the Fraction object that contains the numerator
+ and denominator necessary to calculate the corpus-level precision.
+ To calculate the modified precision for a single pair of hypothesis and
+ references, cast the Fraction object into a float.
- This function only returns the Fraction object that contains the numerator
- and denominator necessary to calculate the corpus-level precision.
- To calculate the modified precision for a single pair of hypothesis and
- references, cast the Fraction object into a float.
-
The famous "the the the ... " example shows that you can get BLEU precision
by duplicating high frequency words.
-
+
>>> reference1 = 'the cat is on the mat'.split()
>>> reference2 = 'there is a cat on the mat'.split()
>>> hypothesis1 = 'the the the the the the the'.split()
>>> references = [reference1, reference2]
>>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS
0.2857...
-
- In the modified n-gram precision, a reference word will be considered
+
+ In the modified n-gram precision, a reference word will be considered
exhausted after a matching hypothesis word is identified, e.g.
-
+
>>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
- ... 'ensures', 'that', 'the', 'military', 'will',
+ ... 'ensures', 'that', 'the', 'military', 'will',
... 'forever', 'heed', 'Party', 'commands']
>>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
... 'guarantees', 'the', 'military', 'forces', 'always',
@@ -222,26 +244,26 @@ def modified_precision(references, hypothesis, n):
1.0
>>> float(modified_precision(references, hypothesis, n=2))
1.0
-
+
An example of a normal machine translation hypothesis:
-
+
>>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
... 'ensures', 'that', 'the', 'military', 'always',
... 'obeys', 'the', 'commands', 'of', 'the', 'party']
-
+
>>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
... 'forever', 'hearing', 'the', 'activity', 'guidebook',
... 'that', 'party', 'direct']
-
+
>>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
- ... 'ensures', 'that', 'the', 'military', 'will',
+ ... 'ensures', 'that', 'the', 'military', 'will',
... 'forever', 'heed', 'Party', 'commands']
-
+
>>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
... 'guarantees', 'the', 'military', 'forces', 'always',
... 'being', 'under', 'the', 'command', 'of', 'the',
... 'Party']
-
+
>>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
... 'army', 'always', 'to', 'heed', 'the', 'directions',
... 'of', 'the', 'party']
@@ -254,8 +276,8 @@ def modified_precision(references, hypothesis, n):
0.5882352941176471
>>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS
0.07692...
-
-
+
+
:param references: A list of reference translations.
:type references: list(list(str))
:param hypothesis: A hypothesis translation.
@@ -265,43 +287,45 @@ def modified_precision(references, hypothesis, n):
:return: BLEU's modified precision for the nth order ngram.
:rtype: Fraction
"""
- # Extracts all ngrams in hypothesis.
- counts = Counter(ngrams(hypothesis, n))
-
+ # Extracts all ngrams in the hypothesis.
+ # Set an empty Counter if the hypothesis is shorter than the ngram order n.
+ counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
# Extract a union of references' counts.
## max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
max_counts = {}
for reference in references:
- reference_counts = Counter(ngrams(reference, n))
+ reference_counts = Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
for ngram in counts:
- max_counts[ngram] = max(max_counts.get(ngram, 0),
+ max_counts[ngram] = max(max_counts.get(ngram, 0),
reference_counts[ngram])
-
+
# Assigns the intersection between hypothesis and references' counts.
- clipped_counts = {ngram: min(count, max_counts[ngram])
+ clipped_counts = {ngram: min(count, max_counts[ngram])
for ngram, count in counts.items()}
numerator = sum(clipped_counts.values())
- denominator = sum(counts.values())
-
- return Fraction(numerator, denominator, _normalize=False)
-
+ # Ensures that the denominator is at least 1 to avoid ZeroDivisionError.
+ # Usually this happens when the ngram order n is > len(hypothesis).
+ denominator = max(1, sum(counts.values()))
+
+ return Fraction(numerator, denominator, _normalize=False)
+
def closest_ref_length(references, hyp_len):
"""
- This function finds the reference that is the closest length to the
- hypothesis. The closest reference length is referred to as *r* variable
+ This function finds the reference that is the closest length to the
+ hypothesis. The closest reference length is referred to as *r* variable
from the brevity penalty formula in Papineni et. al. (2002)
-
+
:param references: A list of reference translations.
:type references: list(list(str))
:param hyp_len: The length of the hypothesis.
:type hyp_len: int
:return: The length of the reference that's closest to the hypothesis.
- :rtype: int
+ :rtype: int
"""
ref_lens = (len(reference) for reference in references)
- closest_ref_len = min(ref_lens, key=lambda ref_len:
+ closest_ref_len = min(ref_lens, key=lambda ref_len:
(abs(ref_len - hyp_len), ref_len))
return closest_ref_len
@@ -379,28 +403,31 @@ def brevity_penalty(closest_ref_len, hyp_len):
>>> closest_ref_len = closest_ref_length(references, hyp_len)
>>> brevity_penalty(closest_ref_len, hyp_len)
1.0
-
- :param hyp_len: The length of the hypothesis for a single sentence OR the
+
+ :param hyp_len: The length of the hypothesis for a single sentence OR the
sum of all the hypotheses' lengths for a corpus
:type hyp_len: int
- :param closest_ref_len: The length of the closest reference for a single
+ :param closest_ref_len: The length of the closest reference for a single
hypothesis OR the sum of all the closest references for every hypotheses.
- :type closest_reference_len: int
+ :type closest_ref_len: int
:return: BLEU's brevity penalty.
:rtype: float
"""
if hyp_len > closest_ref_len:
return 1
+ # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0
+ elif hyp_len == 0:
+ return 0
else:
return math.exp(1 - closest_ref_len / hyp_len)
class SmoothingFunction:
"""
- This is an implementation of the smoothing techniques
- for segment-level BLEU scores that was presented in
- Boxing Chen and Collin Cherry (2014) A Systematic Comparison of
- Smoothing Techniques for Sentence-Level BLEU. In WMT14.
+ This is an implementation of the smoothing techniques
+ for segment-level BLEU scores that was presented in
+ Boxing Chen and Collin Cherry (2014) A Systematic Comparison of
+ Smoothing Techniques for Sentence-Level BLEU. In WMT14.
http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
"""
def __init__(self, epsilon=0.1, alpha=5, k=5):
@@ -409,13 +436,13 @@ class SmoothingFunction:
techniques, the default values are set to the numbers used in the
experiments from Chen and Cherry (2014).
- >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
- ... 'that', 'the', 'military', 'always', 'obeys', 'the',
+ >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
+ ... 'that', 'the', 'military', 'always', 'obeys', 'the',
... 'commands', 'of', 'the', 'party']
- >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
- ... 'that', 'the', 'military', 'will', 'forever', 'heed',
+ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
+ ... 'that', 'the', 'military', 'will', 'forever', 'heed',
... 'Party', 'commands']
-
+
>>> chencherry = SmoothingFunction()
>>> print (sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
0.4118...
@@ -432,7 +459,7 @@ class SmoothingFunction:
>>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
0.4905...
>>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
- 0.1801...
+ 0.4135...
>>> print (sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
0.4905...
@@ -446,31 +473,46 @@ class SmoothingFunction:
self.epsilon = epsilon
self.alpha = alpha
self.k = k
-
+
def method0(self, p_n, *args, **kwargs):
""" No smoothing. """
- return p_n
-
+ p_n_new = []
+ _emulate_multibleu = kwargs.get('emulate_multibleu', False)
+ for i, p_i in enumerate(p_n):
+ if p_i.numerator != 0:
+ p_n_new.append(p_i)
+ elif _emulate_multibleu and i < 5:
+ return [sys.float_info.min]
+ else:
+ _msg = str("\nCorpus/Sentence contains 0 counts of {}-gram overlaps.\n"
+ "BLEU scores might be undesirable; "
+ "use SmoothingFunction().").format(i+1)
+ warnings.warn(_msg)
+ # If this order of n-gram returns 0 counts, the higher order
+ # n-gram would also return 0, thus breaking the loop here.
+ break
+ return p_n_new
+
def method1(self, p_n, *args, **kwargs):
- """
+ """
Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
- """
- return [(p_i.numerator + self.epsilon)/ p_i.denominator
+ """
+ return [(p_i.numerator + self.epsilon)/ p_i.denominator
if p_i.numerator == 0 else p_i for p_i in p_n]
-
+
def method2(self, p_n, *args, **kwargs):
"""
- Smoothing method 2: Add 1 to both numerator and denominator from
- Chin-Yew Lin and Franz Josef Och (2004) Automatic evaluation of
- machine translation quality using longest common subsequence and
+ Smoothing method 2: Add 1 to both numerator and denominator from
+ Chin-Yew Lin and Franz Josef Och (2004) Automatic evaluation of
+ machine translation quality using longest common subsequence and
skip-bigram statistics. In ACL04.
"""
return [Fraction(p_i.numerator + 1, p_i.denominator + 1, _normalize=False) for p_i in p_n]
-
+
def method3(self, p_n, *args, **kwargs):
"""
- Smoothing method 3: NIST geometric sequence smoothing
- The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
+ Smoothing method 3: NIST geometric sequence smoothing
+ The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
precision score whose matching n-gram count is null.
k is 1 for the first 'n' value for which the n-gram match count is null.
For example, if the text contains:
@@ -488,28 +530,27 @@ class SmoothingFunction:
p_n[i] = 1 / (2**incvnt * p_i.denominator)
incvnt+=1
return p_n
-
- def method4(self, p_n, references, hypothesis, hyp_len):
+
+ def method4(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
"""
- Smoothing method 4:
- Shorter translations may have inflated precision values due to having
+ Smoothing method 4:
+ Shorter translations may have inflated precision values due to having
smaller denominators; therefore, we give them proportionally
- smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
+ smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
suggest dividing by 1/ln(len(T)), where T is the length of the translation.
"""
- incvnt = 1
for i, p_i in enumerate(p_n):
if p_i.numerator == 0 and hyp_len != 0:
- p_n[i] = incvnt * self.k / math.log(hyp_len) # Note that this K is different from the K from NIST.
- incvnt+=1
+ incvnt = i+1 * self.k / math.log(hyp_len) # Note that this K is different from the K from NIST.
+ p_n[i] = 1 / incvnt
return p_n
- def method5(self, p_n, references, hypothesis, hyp_len):
+ def method5(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
"""
Smoothing method 5:
- The matched counts for similar values of n should be similar. To a
- calculate the n-gram matched count, it averages the n−1, n and n+1 gram
+ The matched counts for similar values of n should be similar. To
+ calculate the n-gram matched count, it averages the n−1, n and n+1 gram
matched counts.
"""
m = {}
@@ -518,31 +559,40 @@ class SmoothingFunction:
m[-1] = p_n[0] + 1
for i, p_i in enumerate(p_n):
p_n[i] = (m[i-1] + p_i + p_n_plus1[i+1]) / 3
- m[i] = p_n[i]
+ m[i] = p_n[i]
return p_n
-
- def method6(self, p_n, references, hypothesis, hyp_len):
+
+ def method6(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
"""
Smoothing method 6:
- Interpolates the maximum likelihood estimate of the precision *p_n* with
- a prior estimate *pi0*. The prior is estimated by assuming that the ratio
- between pn and pn−1 will be the same as that between pn−1 and pn−2.
+ Interpolates the maximum likelihood estimate of the precision *p_n* with
+ a prior estimate *pi0*. The prior is estimated by assuming that the ratio
+ between pn and pn−1 will be the same as that between pn−1 and pn−2; from
+ Gao and He (2013) Training MRF-Based Phrase Translation Models using
+ Gradient Ascent. In NAACL.
"""
+ # This smoothing only works when p_1 and p_2 are non-zero.
+ # Raise an error with an appropriate message when the input is too short
+ # to use this smoothing technique.
+ assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
for i, p_i in enumerate(p_n):
- if i in [1,2]: # Skips the first 2 orders of ngrams.
+ if i in [0,1]: # Skips the first 2 orders of ngrams.
continue
else:
- pi0 = 0 if p_n[i-2] == 0 else p_n[i-1]**2 / p_n[i-2]
+ pi0 = 0 if p_n[i-2] == 0 else p_n[i-1]**2 / p_n[i-2]
+ # No. of ngrams in translation that matches the reference.
+ m = p_i.numerator
# No. of ngrams in translation.
l = sum(1 for _ in ngrams(hypothesis, i+1))
- p_n[i] = (p_i + self.alpha * pi0) / (l + self.alpha)
+ # Calculates the interpolated precision.
+ p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
return p_n
-
- def method7(self, p_n, references, hypothesis, hyp_len):
+
+ def method7(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
"""
Smoothing method 7:
- Interpolates the maximum likelihood estimate of the precision *p_n* with
- a prior estimate *pi0*. The prior is estimated by assuming that the ratio
+ Interpolates the maximum likelihood estimate of the precision *p_n* with
+ a prior estimate *pi0*. The prior is estimated by assuming that the ratio
between pn and pn−1 will be the same as that between pn−1 and pn−2.
"""
p_n = self.method4(p_n, references, hypothesis, hyp_len)
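
For the revised method6 above, the prior pi0 assumes the ratio p_n/p_{n-1} stays roughly constant across orders, and the matched count m is interpolated with that prior. A worked example with hypothetical numbers:

    alpha = 5                      # default from Chen and Cherry (2014)
    p_1, p_2 = 0.5, 0.4            # assumed unigram and bigram precisions
    m = 2                          # matched trigrams in the hypothesis
    l = 10                         # total trigrams in the hypothesis
    pi0 = p_2 ** 2 / p_1           # prior estimate for p_3: 0.32
    p_3 = (m + alpha * pi0) / (l + alpha)
    print(round(p_3, 3))           # 0.24
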
diff --git a/nltk/translate/chrf_score.py b/nltk/translate/chrf_score.py
new file mode 100644
index 0000000..1748633
--- /dev/null
+++ b/nltk/translate/chrf_score.py
@@ -0,0 +1,137 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: ChrF score
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors: Maja Popovic
+# Contributors: Liling Tan
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+""" ChrF score implementation """
+from __future__ import division
+from collections import Counter
+
+from nltk.util import ngrams, everygrams
+
+def sentence_chrf(reference, hypothesis, min_len=1, max_len=6, beta=3.0):
+ """
+ Calculates the sentence level CHRF (Character n-gram F-score) described in
+ - Maja Popovic. 2015. CHRF: Character n-gram F-score for Automatic MT Evaluation.
+ In Proceedings of the 10th Workshop on Machine Translation.
+ http://www.statmt.org/wmt15/pdf/WMT49.pdf
+ - Maja Popovic. 2016. CHRF Deconstructed: β Parameters and n-gram Weights.
+ In Proceedings of the 1st Conference on Machine Translation.
+ http://www.statmt.org/wmt16/pdf/W16-2341.pdf
+
+ Unlike multi-reference BLEU, CHRF only supports a single reference.
+
+ An example from the original BLEU paper
+ http://www.aclweb.org/anthology/P02-1040.pdf
+
+ >>> ref1 = str('It is a guide to action that ensures that the military '
+ ... 'will forever heed Party commands').split()
+ >>> hyp1 = str('It is a guide to action which ensures that the military '
+ ... 'always obeys the commands of the party').split()
+ >>> hyp2 = str('It is to insure the troops forever hearing the activity '
+ ... 'guidebook that party direct').split()
+ >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
+ 0.6768...
+ >>> sentence_chrf(ref1, hyp2) # doctest: +ELLIPSIS
+ 0.4201...
+
+ The infamous "the the the ... " example
+
+ >>> ref = 'the cat is on the mat'.split()
+ >>> hyp = 'the the the the the the the'.split()
+ >>> sentence_chrf(ref, hyp) # doctest: +ELLIPSIS
+ 0.2530...
+
+ An example to show that this function allows users to use strings instead of
+ tokens, i.e. list(str) as inputs.
+
+ >>> ref1 = str('It is a guide to action that ensures that the military '
+ ... 'will forever heed Party commands')
+ >>> hyp1 = str('It is a guide to action which ensures that the military '
+ ... 'always obeys the commands of the party')
+ >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
+ 0.6768...
+ >>> type(ref1) == type(hyp1) == str
+ True
+ >>> sentence_chrf(ref1.split(), hyp1.split()) # doctest: +ELLIPSIS
+ 0.6768...
+
+ To skip the unigrams and only use 2- to 3-grams:
+
+ >>> sentence_chrf(ref1, hyp1, min_len=2, max_len=3) # doctest: +ELLIPSIS
+ 0.7018...
+
+ :param reference: the reference sentence
+ :type reference: list(str) / str
+ :param hypothesis: a hypothesis sentence
+ :type hypothesis: list(str) / str
+ :param min_len: The minimum order of n-gram this function should extract.
+ :type min_len: int
+ :param max_len: The maximum order of n-gram this function should extract.
+ :type max_len: int
+ :param beta: the parameter to assign more importance to recall over precision
+ :type beta: float
+ :return: the sentence level CHRF score.
+ :rtype: float
+ """
+ return corpus_chrf([reference], [hypothesis], min_len, max_len, beta=beta)
+
+
+def corpus_chrf(list_of_references, hypotheses, min_len=1, max_len=6, beta=3.0):
+ """
+ Calculates the corpus level CHRF (Character n-gram F-score); it is the
+ micro-averaged value of the sentence/segment level CHRF score.
+
+ CHRF only supports a single reference.
+
+ >>> ref1 = str('It is a guide to action that ensures that the military '
+ ... 'will forever heed Party commands').split()
+ >>> ref2 = str('It is the guiding principle which guarantees the military '
+ ... 'forces always being under the command of the Party').split()
+ >>>
+ >>> hyp1 = str('It is a guide to action which ensures that the military '
+ ... 'always obeys the commands of the party').split()
+ >>> hyp2 = str('It is to insure the troops forever hearing the activity '
+ ... 'guidebook that party direct')
+ >>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS
+ 0.4915...
+
+ :param list_of_references: a list of reference sentences, w.r.t. the hypotheses
+ :type list_of_references: list(list(str)) / list(str)
+ :param hypotheses: a list of hypothesis sentences
+ :type hypotheses: list(list(str)) / list(str)
+ :param min_len: The minimum order of n-gram this function should extract.
+ :type min_len: int
+ :param max_len: The maximum order of n-gram this function should extract.
+ :type max_len: int
+ :param beta: the parameter to assign more importance to recall over precision
+ :type beta: float
+ :return: the corpus level CHRF score.
+ :rtype: float
+ """
+
+ assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their references should be the same"
+
+ # Iterate through each hypothesis and their corresponding references.
+ for reference, hypothesis in zip(list_of_references, hypotheses):
+ # Cheating condition to allow users to input strings instead of tokens.
+ if not isinstance(reference, str) and not isinstance(hypothesis, str):
+ reference, hypothesis = ' '.join(reference), ' '.join(hypothesis)
+ # For each order of ngram, calculate the no. of ngram matches and
+ # keep track of no. of ngram in references.
+ ref_ngrams = Counter(everygrams(reference, min_len, max_len))
+ hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
+ overlap_ngrams = ref_ngrams & hyp_ngrams
+ tp = sum(overlap_ngrams.values()) # True positives.
+ tpfp = sum(hyp_ngrams.values()) # True positives + False positives.
+ tffn = sum(ref_ngrams.values()) # True positives + False negatives.
+
+ precision = tp / tpfp
+ recall = tp / tffn
+ factor = beta**2
+ score = (1 + factor) * (precision * recall) / (factor * precision + recall)
+ return score
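
The per-segment score above is a standard F_beta over character n-gram counts; with the default beta=3.0 recall is weighted much more heavily than precision. A quick numeric illustration (precision/recall values hypothetical):

    beta = 3.0
    precision, recall = 0.8, 0.5
    factor = beta ** 2
    score = (1 + factor) * (precision * recall) / (factor * precision + recall)
    print(round(score, 4))   # 0.5195 -- closer to the recall than to the precision
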
diff --git a/nltk/translate/gale_church.py b/nltk/translate/gale_church.py
index 0ac3e45..a543b4c 100644
--- a/nltk/translate/gale_church.py
+++ b/nltk/translate/gale_church.py
@@ -2,8 +2,9 @@
# Natural Language Toolkit: Gale-Church Aligner
#
-# Copyright (C) 2001-2013 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Torsten Marek <marek at ifi.uzh.ch>
+# Contributor: Cassidy Laidlaw, Liling Tan
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -76,16 +77,30 @@ class LanguageIndependent(object):
VARIANCE_CHARACTERS = 6.8
-def trace(backlinks, source, target):
+def trace(backlinks, source_sents_lens, target_sents_lens):
+ """
+ Traverse the alignment cost from the tracebacks and retrieves
+ appropriate sentence pairs.
+
+ :param backlinks: A dictionary where the keys are alignment points and the values are the costs (referencing LanguageIndependent.PRIORS)
+ :type backlinks: dict
+ :param source_sents_lens: A list of source sentences' lengths
+ :type source_sents_lens: list(int)
+ :param target_sents_lens: A list of target sentences' lengths
+ :type target_sents_lens: list(int)
+ """
links = []
- pos = (len(source), len(target))
-
- while pos != (0, 0):
- s, t = backlinks[pos]
+ position = (len(source_sents_lens), len(target_sents_lens))
+ while position != (0, 0) and all(p >=0 for p in position):
+ try:
+ s, t = backlinks[position]
+ except TypeError:
+ position = (position[0]-1 , position[1]-1)
+ continue
for i in range(s):
for j in range(t):
- links.append((pos[0] - i - 1, pos[1] - j - 1))
- pos = (pos[0] - s, pos[1] - t)
+ links.append((position[0] - i - 1, position[1] - j - 1))
+ position = (position[0] - s, position[1] - t)
return links[::-1]
@@ -116,7 +131,7 @@ def align_log_prob(i, j, source_sents, target_sents, alignment, params):
return - (LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment]))
-def align_blocks(source_sents, target_sents, params = LanguageIndependent):
+def align_blocks(source_sents_lens, target_sents_lens, params = LanguageIndependent):
"""Return the sentence alignment of two text blocks (usually paragraphs).
>>> align_blocks([5,5,5], [7,7,7])
@@ -128,8 +143,8 @@ def align_blocks(source_sents, target_sents, params = LanguageIndependent):
>>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])
[(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]
- @param source_sents: The list of source sentence lengths.
- @param target_sents: The list of target sentence lengths.
+ @param source_sents_lens: The list of source sentence lengths.
+ @param target_sents_lens: The list of target sentence lengths.
@param params: the sentence alignment parameters.
@return: The sentence alignments, a list of index pairs.
"""
@@ -141,8 +156,8 @@ def align_blocks(source_sents, target_sents, params = LanguageIndependent):
backlinks = {}
- for i in range(len(source_sents) + 1):
- for j in range(len(target_sents) + 1):
+ for i in range(len(source_sents_lens) + 1):
+ for j in range(len(target_sents_lens) + 1):
min_dist = float('inf')
min_align = None
for a in alignment_types:
@@ -150,7 +165,8 @@ def align_blocks(source_sents, target_sents, params = LanguageIndependent):
prev_j = j - a[1]
if prev_i < -len(D) or prev_j < 0:
continue
- p = D[prev_i][prev_j] + align_log_prob(i, j, source_sents, target_sents, a, params)
+ p = D[prev_i][prev_j] + align_log_prob(i, j, source_sents_lens,
+ target_sents_lens, a, params)
if p < min_dist:
min_dist = p
min_align = a
@@ -164,8 +180,8 @@ def align_blocks(source_sents, target_sents, params = LanguageIndependent):
if len(D) > 2:
D.pop(0)
D.append([])
-
- return trace(backlinks, source_sents, target_sents)
+
+ return trace(backlinks, source_sents_lens, target_sents_lens)
def align_texts(source_blocks, target_blocks, params = LanguageIndependent):
@@ -230,6 +246,3 @@ def parse_token_stream(stream, soft_delimiter, hard_delimiter):
# print align_texts(source, target)
-
-
-
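
The try/except in the trace() hunk above catches TypeError because an unreachable alignment point presumably stores None in backlinks, and unpacking None fails with TypeError, after which the traversal backs off diagonally. A minimal sketch of just that mechanism (toy data, not part of the patch):

    backlinks = {(1, 1): None}          # hypothetical: an unreachable point stores None
    position = (1, 1)
    try:
        s, t = backlinks[position]      # unpacking None raises TypeError
    except TypeError:
        position = (position[0] - 1, position[1] - 1)   # back off diagonally
    assert position == (0, 0)
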
diff --git a/nltk/translate/gdfa.py b/nltk/translate/gdfa.py
index 464ee15..e0e7f04 100644
--- a/nltk/translate/gdfa.py
+++ b/nltk/translate/gdfa.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: GDFA word alignment symmetrization
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Liling Tan
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/translate/gleu_score.py b/nltk/translate/gleu_score.py
new file mode 100644
index 0000000..e73be4e
--- /dev/null
+++ b/nltk/translate/gleu_score.py
@@ -0,0 +1,193 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: GLEU Score
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors:
+# Contributors: Mike Schuster, Michael Wayne Goodman, Liling Tan
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+""" GLEU score implementation. """
+from __future__ import division
+from collections import Counter
+
+from nltk.util import ngrams, everygrams
+
+
+def sentence_gleu(references, hypothesis, min_len=1, max_len=4):
+ """
+ Calculates the sentence level GLEU (Google-BLEU) score described in
+
+ Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi,
+ Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey,
+ Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser,
+ Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens,
+ George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith,
+ Jason Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes,
+ Jeffrey Dean. (2016) Google’s Neural Machine Translation System:
+ Bridging the Gap between Human and Machine Translation.
+ eprint arXiv:1609.08144. https://arxiv.org/pdf/1609.08144v2.pdf
+ Retrieved on 27 Oct 2016.
+
+ From Wu et al. (2016):
+ "The BLEU score has some undesirable properties when used for single
+ sentences, as it was designed to be a corpus measure. We therefore
+ use a slightly different score for our RL experiments which we call
+ the 'GLEU score'. For the GLEU score, we record all sub-sequences of
+ 1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then
+ compute a recall, which is the ratio of the number of matching n-grams
+ to the number of total n-grams in the target (ground truth) sequence,
+ and a precision, which is the ratio of the number of matching n-grams
+ to the number of total n-grams in the generated output sequence. Then
+ GLEU score is simply the minimum of recall and precision. This GLEU
+ score's range is always between 0 (no matches) and 1 (all match) and
+ it is symmetrical when switching output and target. According to
+ our experiments, GLEU score correlates quite well with the BLEU
+ metric on a corpus level but does not have its drawbacks for our per
+ sentence reward objective."
+
+ Note: The initial implementation only allowed a single reference, but now
+ a list of references is required (which is consistent with
+ bleu_score.sentence_bleu()).
+
+ The infamous "the the the ... " example
+
+ >>> ref = 'the cat is on the mat'.split()
+ >>> hyp = 'the the the the the the the'.split()
+ >>> sentence_gleu([ref], hyp) # doctest: +ELLIPSIS
+ 0.0909...
+
+ An example to evaluate normal machine translation outputs
+
+ >>> ref1 = str('It is a guide to action that ensures that the military '
+ ... 'will forever heed Party commands').split()
+ >>> hyp1 = str('It is a guide to action which ensures that the military '
+ ... 'always obeys the commands of the party').split()
+ >>> hyp2 = str('It is to insure the troops forever hearing the activity '
+ ... 'guidebook that party direct').split()
+ >>> sentence_gleu([ref1], hyp1) # doctest: +ELLIPSIS
+ 0.4393...
+ >>> sentence_gleu([ref1], hyp2) # doctest: +ELLIPSIS
+ 0.1206...
+
+ :param references: a list of reference sentences
+ :type references: list(list(str))
+ :param hypothesis: a hypothesis sentence
+ :type hypothesis: list(str)
+ :param min_len: The minimum order of n-gram this function should extract.
+ :type min_len: int
+ :param max_len: The maximum order of n-gram this function should extract.
+ :type max_len: int
+ :return: the sentence level GLEU score.
+ :rtype: float
+ """
+ return corpus_gleu(
+ [references],
+ [hypothesis],
+ min_len=min_len,
+ max_len=max_len
+ )
+
+def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4):
+ """
+ Calculate a single corpus-level GLEU score (aka. system-level GLEU) for all
+ the hypotheses and their respective references.
+
+ Instead of averaging the sentence level GLEU scores (i.e. macro-average
+ precision), Wu et al. (2016) sum up the matching tokens and the max of
+ hypothesis and reference tokens for each sentence, then compute using the
+ aggregate values.
+
+ From Mike Schuster (via email):
+ "For the corpus, we just add up the two statistics n_match and
+ n_all = max(n_all_output, n_all_target) for all sentences, then
+ calculate gleu_score = n_match / n_all, so it is not just a mean of
+ the sentence gleu scores (in our case, longer sentences count more,
+ which I think makes sense as they are more difficult to translate)."
+
+ >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+ ... 'ensures', 'that', 'the', 'military', 'always',
+ ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
+ >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+ ... 'ensures', 'that', 'the', 'military', 'will', 'forever',
+ ... 'heed', 'Party', 'commands']
+ >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+ ... 'guarantees', 'the', 'military', 'forces', 'always',
+ ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
+ >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+ ... 'army', 'always', 'to', 'heed', 'the', 'directions',
+ ... 'of', 'the', 'party']
+
+ >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
+ ... 'interested', 'in', 'world', 'history']
+ >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
+ ... 'because', 'he', 'read', 'the', 'book']
+
+ >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
+ >>> hypotheses = [hyp1, hyp2]
+ >>> corpus_gleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
+ 0.5673...
+
+ The example below shows that corpus_gleu() is different from averaging
+ sentence_gleu() for hypotheses.
+
+ >>> score1 = sentence_gleu([ref1a], hyp1)
+ >>> score2 = sentence_gleu([ref2a], hyp2)
+ >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
+ 0.6144...
+
+ :param list_of_references: a list of reference sentences, w.r.t. hypotheses
+ :type list_of_references: list(list(list(str)))
+ :param hypotheses: a list of hypothesis sentences
+ :type hypotheses: list(list(str))
+ :param min_len: The minimum order of n-gram this function should extract.
+ :type min_len: int
+ :param max_len: The maximum order of n-gram this function should extract.
+ :type max_len: int
+ :return: The corpus-level GLEU score.
+ :rtype: float
+ """
+ # sanity check
+ assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their reference(s) should be the same"
+
+ # sum matches and max-token-lengths over all sentences
+ corpus_n_match = 0
+ corpus_n_all = 0
+
+ for references, hypothesis in zip(list_of_references, hypotheses):
+ hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len))
+ tpfp = sum(hyp_ngrams.values()) # True positives + False positives.
+
+ hyp_counts = []
+ for reference in references:
+ ref_ngrams = Counter(everygrams(reference, min_len, max_len))
+ tpfn = sum(ref_ngrams.values()) # True positives + False negatives.
+
+ overlap_ngrams = ref_ngrams & hyp_ngrams
+ tp = sum(overlap_ngrams.values()) # True positives.
+
+ # While GLEU is defined as the minimum of precision and
+ # recall, we can reduce the number of division operations by one by
+ # instead finding the maximum of the denominators for the precision
+ # and recall formulae, since the numerators are the same:
+ # precision = tp / tpfp
+ # recall = tp / tpfn
+ # gleu_score = min(precision, recall) == tp / max(tpfp, tpfn)
+ n_all = max(tpfp, tpfn)
+
+ if n_all > 0:
+ hyp_counts.append((tp, n_all))
+
+ # use the reference yielding the highest score
+ if hyp_counts:
+ n_match, n_all = max(hyp_counts, key=lambda hc: hc[0]/hc[1])
+ corpus_n_match += n_match
+ corpus_n_all += n_all
+
+ # corner case: empty corpus or empty references---don't divide by zero!
+ if corpus_n_all == 0:
+ gleu_score = 0.0
+ else:
+ gleu_score = corpus_n_match / corpus_n_all
+
+ return gleu_score
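
The identity relied on in the comment above (one division instead of two) is easy to sanity-check with hypothetical counts:

    from __future__ import division

    tp, tpfp, tpfn = 7, 12, 10              # assumed overlap and n-gram totals
    precision, recall = tp / tpfp, tp / tpfn
    assert min(precision, recall) == tp / max(tpfp, tpfn)
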
diff --git a/nltk/translate/ibm4.py b/nltk/translate/ibm4.py
index 6a93c7e..bb8d913 100644
--- a/nltk/translate/ibm4.py
+++ b/nltk/translate/ibm4.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model 4
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tah Wei Hoon <hoon.tw at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/translate/ibm5.py b/nltk/translate/ibm5.py
index ca49fc6..df34afc 100644
--- a/nltk/translate/ibm5.py
+++ b/nltk/translate/ibm5.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model 5
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tah Wei Hoon <hoon.tw at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/translate/ibm_model.py b/nltk/translate/ibm_model.py
index 1d70b56..fa5312f 100644
--- a/nltk/translate/ibm_model.py
+++ b/nltk/translate/ibm_model.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model Core
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tah Wei Hoon <hoon.tw at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/translate/metrics.py b/nltk/translate/metrics.py
index 5d54b97..e9fef3e 100644
--- a/nltk/translate/metrics.py
+++ b/nltk/translate/metrics.py
@@ -1,6 +1,6 @@
# Natural Language Toolkit: Translation metrics
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Will Zhang <wilzzha at gmail.com>
# Guan Gui <ggui at student.unimelb.edu.au>
# Steven Bird <stevenbird1 at gmail.com>
diff --git a/nltk/translate/phrase_based.py b/nltk/translate/phrase_based.py
index 71b1008..5bbc094 100644
--- a/nltk/translate/phrase_based.py
+++ b/nltk/translate/phrase_based.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Phrase Extraction Algorithm
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Authors: Liling Tan, Fredrik Hedman, Petra Barancikova
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/translate/ribes_score.py b/nltk/translate/ribes_score.py
index 5c8e531..553e68f 100644
--- a/nltk/translate/ribes_score.py
+++ b/nltk/translate/ribes_score.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: RIBES Score
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Contributors: Katsuhito Sudoh, Liling Tan, Kasramvd, J.F.Sebastian
# Mark Byers, ekhumoro, P. Ortiz
# URL: <http://nltk.org/>
diff --git a/nltk/translate/stack_decoder.py b/nltk/translate/stack_decoder.py
index eda5f41..e9442d7 100644
--- a/nltk/translate/stack_decoder.py
+++ b/nltk/translate/stack_decoder.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Stack decoder
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Tah Wei Hoon <hoon.tw at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -493,3 +493,7 @@ class _Stack(object):
def __contains__(self, hypothesis):
return hypothesis in self.items
+
+ def __bool__(self):
+ return len(self.items) != 0
+ __nonzero__=__bool__
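
The same pattern in isolation (illustrative only, class name hypothetical): defining __bool__ and aliasing __nonzero__ makes truth-testing of the stack work on both Python 3 and Python 2.

    class _Box(object):
        def __init__(self, items):
            self.items = items
        def __bool__(self):            # Python 3 truth hook
            return len(self.items) != 0
        __nonzero__ = __bool__         # Python 2 uses __nonzero__

    assert not _Box([])
    assert _Box([1])
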
diff --git a/nltk/tree.py b/nltk/tree.py
index c38eacb..193a003 100644
--- a/nltk/tree.py
+++ b/nltk/tree.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Text Trees
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Edward Loper <edloper at gmail.com>
# Steven Bird <stevenbird1 at gmail.com>
# Peter Ljunglöf <peter.ljunglof at gu.se>
@@ -14,15 +14,19 @@ Class for representing hierarchical language structures, such as
syntax trees and morphological trees.
"""
from __future__ import print_function, unicode_literals
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
# TODO: add LabelledTree (can be used for dependency trees)
import re
+from six import string_types
+
from nltk.grammar import Production, Nonterminal
from nltk.probability import ProbabilisticMixIn
from nltk.util import slice_bounds
-from nltk.compat import string_types, python_2_unicode_compatible, unicode_repr
+from nltk.compat import python_2_unicode_compatible, unicode_repr
from nltk.internals import raise_unorderable_types
######################################################################
@@ -694,7 +698,7 @@ class Tree(list):
from nltk.treeprettyprinter import TreePrettyPrinter
print(TreePrettyPrinter(self, sentence, highlight).text(**kwargs),
file=stream)
-
+
def __repr__(self):
childstr = ", ".join(unicode_repr(c) for c in self)
return '%s(%s, [%s])' % (type(self).__name__, unicode_repr(self._label), childstr)
@@ -877,7 +881,7 @@ class ImmutableTree(Tree):
######################################################################
## Parented trees
######################################################################
-
+ at add_metaclass(ABCMeta)
class AbstractParentedTree(Tree):
"""
An abstract base class for a ``Tree`` that automatically maintains
@@ -921,7 +925,7 @@ class AbstractParentedTree(Tree):
#////////////////////////////////////////////////////////////
# Parent management
#////////////////////////////////////////////////////////////
-
+ @abstractmethod
def _setparent(self, child, index, dry_run=False):
"""
Update the parent pointer of ``child`` to point to ``self``. This
@@ -942,8 +946,8 @@ class AbstractParentedTree(Tree):
parent pointer; just check for any error conditions, and
raise an exception if one is found.
"""
- raise NotImplementedError()
+ @abstractmethod
def _delparent(self, child, index):
"""
Update the parent pointer of ``child`` to not point to self. This
@@ -956,7 +960,6 @@ class AbstractParentedTree(Tree):
:type index: int
:param index: The index of ``child`` in ``self``.
"""
- raise NotImplementedError()
#////////////////////////////////////////////////////////////
# Methods that add/remove children
@@ -1600,4 +1603,3 @@ __all__ = ['ImmutableProbabilisticTree', 'ImmutableTree', 'ProbabilisticMixIn',
'ProbabilisticTree', 'Tree', 'bracket_parse',
'sinica_parse', 'ParentedTree', 'MultiParentedTree',
'ImmutableParentedTree', 'ImmutableMultiParentedTree']
-
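
The tree.py hunks above replace raise NotImplementedError stubs with abstract methods. A minimal sketch of that pattern (class names Base/Child are hypothetical; _setparent and the six dependency come from the patch itself): the abstract base cannot be instantiated until a subclass overrides the method.

    from abc import ABCMeta, abstractmethod
    from six import add_metaclass

    @add_metaclass(ABCMeta)
    class Base(object):
        @abstractmethod
        def _setparent(self, child, index, dry_run=False):
            """Subclasses must override this."""

    class Child(Base):
        def _setparent(self, child, index, dry_run=False):
            return None

    try:
        Base()            # abstract classes cannot be instantiated
    except TypeError:
        pass
    Child()               # a concrete subclass works as before
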
diff --git a/nltk/treeprettyprinter.py b/nltk/treeprettyprinter.py
index 02d83e1..9e82d5b 100644
--- a/nltk/treeprettyprinter.py
+++ b/nltk/treeprettyprinter.py
@@ -1,14 +1,14 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: ASCII visualization of NLTK trees
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Andreas van Cranenburgh <A.W.vanCranenburgh at uva.nl>
# Peter Ljunglöf <peter.ljunglof at gu.se>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
-Pretty-printing of discontinuous trees.
+Pretty-printing of discontinuous trees.
Adapted from the disco-dop project, by Andreas van Cranenburgh.
https://github.com/andreasvc/disco-dop
@@ -21,7 +21,7 @@ http://jgaa.info/accepted/2006/EschbachGuentherBecker2006.10.2.pdf
from __future__ import division, print_function, unicode_literals
from nltk.util import slice_bounds, OrderedDict
-from nltk.compat import string_types, python_2_unicode_compatible, unicode_repr
+from nltk.compat import python_2_unicode_compatible, unicode_repr
from nltk.internals import raise_unorderable_types
from nltk.tree import Tree
@@ -56,8 +56,8 @@ class TreePrettyPrinter(object):
creates an object from which different visualizations can be created.
:param tree: a Tree object.
- :param sentence: a list of words (strings). If `sentence` is given,
- `tree` must contain integers as leaves, which are taken as indices
+ :param sentence: a list of words (strings). If `sentence` is given,
+ `tree` must contain integers as leaves, which are taken as indices
in `sentence`. Using this you can display a discontinuous tree.
:param highlight: Optionally, a sequence of Tree objects in `tree` which
should be highlighted. Has the effect of only applying colors to nodes
@@ -521,7 +521,7 @@ class TreePrettyPrinter(object):
result += ['\t<text style="text-anchor: middle; fill: %s; '
'font-size: %dpx;" x="%g" y="%g">%s</text>' % (
color, fontsize, x, y,
- escape(node.label() if isinstance(node, Tree)
+ escape(node.label() if isinstance(node, Tree)
else node))]
result += ['</svg>']
diff --git a/nltk/twitter/__init__.py b/nltk/twitter/__init__.py
index edf5204..655d7a9 100644
--- a/nltk/twitter/__init__.py
+++ b/nltk/twitter/__init__.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/nltk/twitter/api.py b/nltk/twitter/api.py
index 9b4c2c6..05c71f8 100644
--- a/nltk/twitter/api.py
+++ b/nltk/twitter/api.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter API
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig at gmail.com>
# URL: <http://nltk.org/>
@@ -12,6 +12,8 @@ This module provides an interface for TweetHandlers, and support for timezone
handling.
"""
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
from datetime import tzinfo, timedelta, datetime
from nltk.compat import UTC
import time as _time
@@ -42,8 +44,11 @@ class LocalTimezoneOffsetWithUTC(tzinfo):
"""
return self.DSTOFFSET
+
LOCAL = LocalTimezoneOffsetWithUTC()
+
+ at add_metaclass(ABCMeta)
class BasicTweetHandler(object):
"""
Minimal implementation of `TweetHandler`.
@@ -101,17 +106,17 @@ class TweetHandlerI(BasicTweetHandler):
self.startingup = True
+ @abstractmethod
def handle(self, data):
"""
Deal appropriately with data returned by the Twitter API
"""
- raise NotImplementedError
+ @abstractmethod
def on_finish(self):
"""
Actions when the tweet limit has been reached
"""
- raise NotImplementedError
def check_date_limit(self, data, verbose=False):
"""
diff --git a/nltk/twitter/common.py b/nltk/twitter/common.py
index bae7be1..811f56f 100644
--- a/nltk/twitter/common.py
+++ b/nltk/twitter/common.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter client
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/twitter/twitter_demo.py b/nltk/twitter/twitter_demo.py
index dad0c55..3338587 100644
--- a/nltk/twitter/twitter_demo.py
+++ b/nltk/twitter/twitter_demo.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter client
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/twitter/twitterclient.py b/nltk/twitter/twitterclient.py
index d78d270..bd6197f 100644
--- a/nltk/twitter/twitterclient.py
+++ b/nltk/twitter/twitterclient.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter client
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/twitter/util.py b/nltk/twitter/util.py
index 8861132..16b1507 100644
--- a/nltk/twitter/util.py
+++ b/nltk/twitter/util.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter client
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Ewan Klein <ewan at inf.ed.ac.uk>
# Lorenzo Rubio <lrnzcig at gmail.com>
# URL: <http://nltk.org/>
diff --git a/nltk/util.py b/nltk/util.py
index d16883e..2dcb782 100644
--- a/nltk/util.py
+++ b/nltk/util.py
@@ -1,11 +1,13 @@
# Natural Language Toolkit: Utility functions
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import print_function
+import sys
+import inspect
import locale
import re
import types
@@ -19,19 +21,23 @@ from pprint import pprint
from collections import defaultdict, deque
from sys import version_info
+from six import class_types, string_types, text_type
+from six.moves.urllib.request import (build_opener, install_opener, getproxies,
+ ProxyHandler, ProxyBasicAuthHandler,
+ ProxyDigestAuthHandler,
+ HTTPPasswordMgrWithDefaultRealm)
+
from nltk.internals import slice_bounds, raise_unorderable_types
-from nltk.compat import (class_types, text_type, string_types, total_ordering,
- python_2_unicode_compatible, getproxies,
- ProxyHandler, build_opener, install_opener,
- HTTPPasswordMgrWithDefaultRealm,
- ProxyBasicAuthHandler, ProxyDigestAuthHandler)
+from nltk.collections import *
+from nltk.compat import python_2_unicode_compatible
+
+
######################################################################
# Short usage message
######################################################################
def usage(obj, selfname='self'):
- import inspect
str(obj) # In case it's lazy, this will load it.
if not isinstance(obj, class_types):
@@ -42,7 +48,11 @@ def usage(obj, selfname='self'):
if name.startswith('_'): continue
if getattr(method, '__deprecated__', False): continue
- args, varargs, varkw, defaults = inspect.getargspec(method)
+ if sys.version_info[0] >= 3:
+ getargspec = inspect.getfullargspec
+ else:
+ getargspec = inspect.getargspec
+ args, varargs, varkw, defaults = getargspec(method)[:4]
if (args and args[0]=='self' and
(defaults is None or len(args)>len(defaults))):
args = args[1:]
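
The same version switch in isolation (illustrative, function f is hypothetical): getargspec() is deprecated and later removed in newer Python 3, so the patch picks getfullargspec() there and slices the first four fields to keep the old (args, varargs, varkw, defaults) shape.

    import sys
    import inspect

    def f(self, x, y=1, *rest, **kw):
        return x

    getargspec = inspect.getfullargspec if sys.version_info[0] >= 3 else inspect.getargspec
    args, varargs, varkw, defaults = getargspec(f)[:4]
    print(args, varargs, varkw, defaults)   # ['self', 'x', 'y'] rest kw (1,)
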
@@ -378,18 +388,18 @@ def flatten(*args):
# Ngram iteration
##########################################################################
-def pad_sequence(sequence, n, pad_left=False, pad_right=False,
+def pad_sequence(sequence, n, pad_left=False, pad_right=False,
left_pad_symbol=None, right_pad_symbol=None):
"""
Returns a padded sequence of items before ngram extraction.
-
+
>>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
['<s>', 1, 2, 3, 4, 5, '</s>']
>>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
['<s>', 1, 2, 3, 4, 5]
>>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
[1, 2, 3, 4, 5, '</s>']
-
+
:param sequence: the source data to be padded
:type sequence: sequence or iter
:param n: the degree of the ngrams
@@ -413,7 +423,7 @@ def pad_sequence(sequence, n, pad_left=False, pad_right=False,
# add a flag to pad the sequence so we get peripheral ngrams?
-def ngrams(sequence, n, pad_left=False, pad_right=False,
+def ngrams(sequence, n, pad_left=False, pad_right=False,
left_pad_symbol=None, right_pad_symbol=None):
"""
Return the ngrams generated from a sequence of items, as an iterator.
@@ -423,7 +433,7 @@ def ngrams(sequence, n, pad_left=False, pad_right=False,
>>> list(ngrams([1,2,3,4,5], 3))
[(1, 2, 3), (2, 3, 4), (3, 4, 5)]
- Use ngrams for a list version of this function. Set pad_left
+ Wrap with list for a list version of this function. Set pad_left
or pad_right to true in order to get additional ngrams:
>>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
@@ -452,7 +462,7 @@ def ngrams(sequence, n, pad_left=False, pad_right=False,
"""
sequence = pad_sequence(sequence, n, pad_left, pad_right,
left_pad_symbol, right_pad_symbol)
-
+
history = []
while n > 1:
history.append(next(sequence))
@@ -503,13 +513,13 @@ def trigrams(sequence, **kwargs):
def everygrams(sequence, min_len=1, max_len=-1, **kwargs):
"""
Returns all possible ngrams generated from a sequence of items, as an iterator.
-
+
>>> sent = 'a b c'.split()
>>> list(everygrams(sent))
[('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c'), ('a', 'b', 'c')]
>>> list(everygrams(sent, max_len=2))
[('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c')]
-
+
:param sequence: the source data to be converted into trigrams
:type sequence: sequence or iter
:param min_len: minimum length of the ngrams, aka. n-gram order/degree of ngram
@@ -518,7 +528,7 @@ def everygrams(sequence, min_len=1, max_len=-1, **kwargs):
:type max_len: int
:rtype: iter(tuple)
"""
-
+
if max_len == -1:
max_len = len(sequence)
for n in range(min_len, max_len+1):
@@ -530,13 +540,13 @@ def skipgrams(sequence, n, k, **kwargs):
Returns all possible skipgrams generated from a sequence of items, as an iterator.
Skipgrams are ngrams that allows tokens to be skipped.
Refer to http://homepages.inf.ed.ac.uk/ballison/pdf/lrec_skipgrams.pdf
-
+
>>> sent = "Insurgents killed in ongoing fighting".split()
>>> list(skipgrams(sent, 2, 2))
[('Insurgents', 'killed'), ('Insurgents', 'in'), ('Insurgents', 'ongoing'), ('killed', 'in'), ('killed', 'ongoing'), ('killed', 'fighting'), ('in', 'ongoing'), ('in', 'fighting'), ('ongoing', 'fighting')]
>>> list(skipgrams(sent, 3, 2))
[('Insurgents', 'killed', 'in'), ('Insurgents', 'killed', 'ongoing'), ('Insurgents', 'killed', 'fighting'), ('Insurgents', 'in', 'ongoing'), ('Insurgents', 'in', 'fighting'), ('Insurgents', 'ongoing', 'fighting'), ('killed', 'in', 'ongoing'), ('killed', 'in', 'fighting'), ('killed', 'ongoing', 'fighting'), ('in', 'ongoing', 'fighting')]
-
+
:param sequence: the source data to be converted into trigrams
:type sequence: sequence or iter
:param n: the degree of the ngrams
@@ -545,13 +555,13 @@ def skipgrams(sequence, n, k, **kwargs):
:type k: int
:rtype: iter(tuple)
"""
-
+
# Pads the sequence as desired by **kwargs.
if 'pad_left' in kwargs or 'pad_right' in kwargs:
sequence = pad_sequence(sequence, n, **kwargs)
-
+
# Note when iterating through the ngrams, the pad_right here is not
- # the **kwargs padding, it's for the algorithm to detect the SENTINEL
+ # the **kwargs padding, it's for the algorithm to detect the SENTINEL
# object on the right pad to stop inner loop.
SENTINEL = object()
for ngram in ngrams(sequence, n + k, pad_right=True, right_pad_symbol=SENTINEL):
@@ -562,531 +572,6 @@ def skipgrams(sequence, n, k, **kwargs):
continue
yield head + skip_tail
-##########################################################################
-# Ordered Dictionary
-##########################################################################
-
-class OrderedDict(dict):
- def __init__(self, data=None, **kwargs):
- self._keys = self.keys(data, kwargs.get('keys'))
- self._default_factory = kwargs.get('default_factory')
- if data is None:
- dict.__init__(self)
- else:
- dict.__init__(self, data)
-
- def __delitem__(self, key):
- dict.__delitem__(self, key)
- self._keys.remove(key)
-
- def __getitem__(self, key):
- try:
- return dict.__getitem__(self, key)
- except KeyError:
- return self.__missing__(key)
-
- def __iter__(self):
- return (key for key in self.keys())
-
- def __missing__(self, key):
- if not self._default_factory and key not in self._keys:
- raise KeyError()
- return self._default_factory()
-
- def __setitem__(self, key, item):
- dict.__setitem__(self, key, item)
- if key not in self._keys:
- self._keys.append(key)
-
- def clear(self):
- dict.clear(self)
- self._keys.clear()
-
- def copy(self):
- d = dict.copy(self)
- d._keys = self._keys
- return d
-
- def items(self):
- # returns iterator under python 3 and list under python 2
- return zip(self.keys(), self.values())
-
- def keys(self, data=None, keys=None):
- if data:
- if keys:
- assert isinstance(keys, list)
- assert len(data) == len(keys)
- return keys
- else:
- assert isinstance(data, dict) or \
- isinstance(data, OrderedDict) or \
- isinstance(data, list)
- if isinstance(data, dict) or isinstance(data, OrderedDict):
- return data.keys()
- elif isinstance(data, list):
- return [key for (key, value) in data]
- elif '_keys' in self.__dict__:
- return self._keys
- else:
- return []
-
- def popitem(self):
- if not self._keys:
- raise KeyError()
-
- key = self._keys.pop()
- value = self[key]
- del self[key]
- return (key, value)
-
- def setdefault(self, key, failobj=None):
- dict.setdefault(self, key, failobj)
- if key not in self._keys:
- self._keys.append(key)
-
- def update(self, data):
- dict.update(self, data)
- for key in self.keys(data):
- if key not in self._keys:
- self._keys.append(key)
-
- def values(self):
- # returns iterator under python 3
- return map(self.get, self._keys)
-
-######################################################################
-# Lazy Sequences
-######################################################################
-
- at total_ordering
- at python_2_unicode_compatible
-class AbstractLazySequence(object):
- """
- An abstract base class for read-only sequences whose values are
- computed as needed. Lazy sequences act like tuples -- they can be
- indexed, sliced, and iterated over; but they may not be modified.
-
- The most common application of lazy sequences in NLTK is for
- corpus view objects, which provide access to the contents of a
- corpus without loading the entire corpus into memory, by loading
- pieces of the corpus from disk as needed.
-
- The result of modifying a mutable element of a lazy sequence is
- undefined. In particular, the modifications made to the element
- may or may not persist, depending on whether and when the lazy
- sequence caches that element's value or reconstructs it from
- scratch.
-
- Subclasses are required to define two methods: ``__len__()``
- and ``iterate_from()``.
- """
- def __len__(self):
- """
- Return the number of tokens in the corpus file underlying this
- corpus view.
- """
- raise NotImplementedError('should be implemented by subclass')
-
- def iterate_from(self, start):
- """
- Return an iterator that generates the tokens in the corpus
- file underlying this corpus view, starting at the token number
- ``start``. If ``start>=len(self)``, then this iterator will
- generate no tokens.
- """
- raise NotImplementedError('should be implemented by subclass')
-
- def __getitem__(self, i):
- """
- Return the *i* th token in the corpus file underlying this
- corpus view. Negative indices and spans are both supported.
- """
- if isinstance(i, slice):
- start, stop = slice_bounds(self, i)
- return LazySubsequence(self, start, stop)
- else:
- # Handle negative indices
- if i < 0: i += len(self)
- if i < 0: raise IndexError('index out of range')
- # Use iterate_from to extract it.
- try:
- return next(self.iterate_from(i))
- except StopIteration:
- raise IndexError('index out of range')
-
- def __iter__(self):
- """Return an iterator that generates the tokens in the corpus
- file underlying this corpus view."""
- return self.iterate_from(0)
-
- def count(self, value):
- """Return the number of times this list contains ``value``."""
- return sum(1 for elt in self if elt==value)
-
- def index(self, value, start=None, stop=None):
- """Return the index of the first occurrence of ``value`` in this
- list that is greater than or equal to ``start`` and less than
- ``stop``. Negative start and stop values are treated like negative
- slice bounds -- i.e., they count from the end of the list."""
- start, stop = slice_bounds(self, slice(start, stop))
- for i, elt in enumerate(islice(self, start, stop)):
- if elt == value: return i+start
- raise ValueError('index(x): x not in list')
-
- def __contains__(self, value):
- """Return true if this list contains ``value``."""
- return bool(self.count(value))
-
- def __add__(self, other):
- """Return a list concatenating self with other."""
- return LazyConcatenation([self, other])
-
- def __radd__(self, other):
- """Return a list concatenating other with self."""
- return LazyConcatenation([other, self])
-
- def __mul__(self, count):
- """Return a list concatenating self with itself ``count`` times."""
- return LazyConcatenation([self] * count)
-
- def __rmul__(self, count):
- """Return a list concatenating self with itself ``count`` times."""
- return LazyConcatenation([self] * count)
-
- _MAX_REPR_SIZE = 60
- def __repr__(self):
- """
- Return a string representation for this corpus view that is
- similar to a list's representation; but if it would be more
- than 60 characters long, it is truncated.
- """
- pieces = []
- length = 5
- for elt in self:
- pieces.append(repr(elt))
- length += len(pieces[-1]) + 2
- if length > self._MAX_REPR_SIZE and len(pieces) > 2:
- return '[%s, ...]' % text_type(', ').join(pieces[:-1])
- else:
- return '[%s]' % text_type(', ').join(pieces)
-
- def __eq__(self, other):
- return (type(self) == type(other) and list(self) == list(other))
-
- def __ne__(self, other):
- return not self == other
-
- def __lt__(self, other):
- if type(other) != type(self):
- raise_unorderable_types("<", self, other)
- return list(self) < list(other)
-
- def __hash__(self):
- """
- :raise ValueError: Corpus view objects are unhashable.
- """
- raise ValueError('%s objects are unhashable' %
- self.__class__.__name__)
-
-
-class LazySubsequence(AbstractLazySequence):
- """
- A subsequence produced by slicing a lazy sequence. This slice
- keeps a reference to its source sequence, and generates its values
- by looking them up in the source sequence.
- """
-
- MIN_SIZE = 100
- """
- The minimum size for which lazy slices should be created. If
- ``LazySubsequence()`` is called with a subsequence that is
- shorter than ``MIN_SIZE``, then a tuple will be returned instead.
- """
-
- def __new__(cls, source, start, stop):
- """
- Construct a new slice from a given underlying sequence. The
- ``start`` and ``stop`` indices should be absolute indices --
- i.e., they should not be negative (for indexing from the back
- of a list) or greater than the length of ``source``.
- """
- # If the slice is small enough, just use a tuple.
- if stop-start < cls.MIN_SIZE:
- return list(islice(source.iterate_from(start), stop-start))
- else:
- return object.__new__(cls)
-
- def __init__(self, source, start, stop):
- self._source = source
- self._start = start
- self._stop = stop
-
- def __len__(self):
- return self._stop - self._start
-
- def iterate_from(self, start):
- return islice(self._source.iterate_from(start+self._start),
- max(0, len(self)-start))
-
-
-class LazyConcatenation(AbstractLazySequence):
- """
- A lazy sequence formed by concatenating a list of lists. This
- underlying list of lists may itself be lazy. ``LazyConcatenation``
- maintains an index that it uses to keep track of the relationship
- between offsets in the concatenated lists and offsets in the
- sublists.
- """
- def __init__(self, list_of_lists):
- self._list = list_of_lists
- self._offsets = [0]
-
- def __len__(self):
- if len(self._offsets) <= len(self._list):
- for tok in self.iterate_from(self._offsets[-1]): pass
- return self._offsets[-1]
-
- def iterate_from(self, start_index):
- if start_index < self._offsets[-1]:
- sublist_index = bisect.bisect_right(self._offsets, start_index)-1
- else:
- sublist_index = len(self._offsets)-1
-
- index = self._offsets[sublist_index]
-
- # Construct an iterator over the sublists.
- if isinstance(self._list, AbstractLazySequence):
- sublist_iter = self._list.iterate_from(sublist_index)
- else:
- sublist_iter = islice(self._list, sublist_index, None)
-
- for sublist in sublist_iter:
- if sublist_index == (len(self._offsets)-1):
- assert index+len(sublist) >= self._offsets[-1], (
- 'offests not monotonic increasing!')
- self._offsets.append(index+len(sublist))
- else:
- assert self._offsets[sublist_index+1] == index+len(sublist), (
- 'inconsistent list value (num elts)')
-
- for value in sublist[max(0, start_index-index):]:
- yield value
-
- index += len(sublist)
- sublist_index += 1
-
-
-class LazyMap(AbstractLazySequence):
- """
- A lazy sequence whose elements are formed by applying a given
- function to each element in one or more underlying lists. The
- function is applied lazily -- i.e., when you read a value from the
- list, ``LazyMap`` will calculate that value by applying its
- function to the underlying lists' value(s). ``LazyMap`` is
- essentially a lazy version of the Python primitive function
- ``map``. In particular, the following two expressions are
- equivalent:
-
- >>> from nltk.util import LazyMap
- >>> function = str
- >>> sequence = [1,2,3]
- >>> map(function, sequence) # doctest: +SKIP
- ['1', '2', '3']
- >>> list(LazyMap(function, sequence))
- ['1', '2', '3']
-
- Like the Python ``map`` primitive, if the source lists do not have
- equal size, then the value None will be supplied for the
- 'missing' elements.
-
- Lazy maps can be useful for conserving memory, in cases where
- individual values take up a lot of space. This is especially true
- if the underlying list's values are constructed lazily, as is the
- case with many corpus readers.
-
- A typical example of a use case for this class is performing
- feature detection on the tokens in a corpus. Since featuresets
- are encoded as dictionaries, which can take up a lot of memory,
- using a ``LazyMap`` can significantly reduce memory usage when
- training and running classifiers.
- """
- def __init__(self, function, *lists, **config):
- """
- :param function: The function that should be applied to
- elements of ``lists``. It should take as many arguments
- as there are ``lists``.
- :param lists: The underlying lists.
- :param cache_size: Determines the size of the cache used
- by this lazy map. (default=5)
- """
- if not lists:
- raise TypeError('LazyMap requires at least two args')
-
- self._lists = lists
- self._func = function
- self._cache_size = config.get('cache_size', 5)
- self._cache = ({} if self._cache_size > 0 else None)
-
- # If you just take bool() of sum() here _all_lazy will be true just
- # in case n >= 1 list is an AbstractLazySequence. Presumably this
- # isn't what's intended.
- self._all_lazy = sum(isinstance(lst, AbstractLazySequence)
- for lst in lists) == len(lists)
-
- def iterate_from(self, index):
- # Special case: one lazy sublist
- if len(self._lists) == 1 and self._all_lazy:
- for value in self._lists[0].iterate_from(index):
- yield self._func(value)
- return
-
- # Special case: one non-lazy sublist
- elif len(self._lists) == 1:
- while True:
- try: yield self._func(self._lists[0][index])
- except IndexError: return
- index += 1
-
- # Special case: n lazy sublists
- elif self._all_lazy:
- iterators = [lst.iterate_from(index) for lst in self._lists]
- while True:
- elements = []
- for iterator in iterators:
- try: elements.append(next(iterator))
- except: elements.append(None)
- if elements == [None] * len(self._lists):
- return
- yield self._func(*elements)
- index += 1
-
- # general case
- else:
- while True:
- try: elements = [lst[index] for lst in self._lists]
- except IndexError:
- elements = [None] * len(self._lists)
- for i, lst in enumerate(self._lists):
- try: elements[i] = lst[index]
- except IndexError: pass
- if elements == [None] * len(self._lists):
- return
- yield self._func(*elements)
- index += 1
-
- def __getitem__(self, index):
- if isinstance(index, slice):
- sliced_lists = [lst[index] for lst in self._lists]
- return LazyMap(self._func, *sliced_lists)
- else:
- # Handle negative indices
- if index < 0: index += len(self)
- if index < 0: raise IndexError('index out of range')
- # Check the cache
- if self._cache is not None and index in self._cache:
- return self._cache[index]
- # Calculate the value
- try: val = next(self.iterate_from(index))
- except StopIteration:
- raise IndexError('index out of range')
- # Update the cache
- if self._cache is not None:
- if len(self._cache) > self._cache_size:
- self._cache.popitem() # discard random entry
- self._cache[index] = val
- # Return the value
- return val
-
- def __len__(self):
- return max(len(lst) for lst in self._lists)
-
-
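The feature-detection use case described in the LazyMap docstring above
can be sketched as follows; the corpus choice and the toy feature
extractor are illustrative assumptions, only LazyMap itself comes from
NLTK:

    from nltk.corpus import brown
    from nltk.util import LazyMap

    def word_features(word):
        # Toy feature extractor; real featuresets are usually richer.
        return {'lower': word.lower(), 'is_title': word.istitle()}

    # brown.words() is itself a lazy corpus view, so no full list of
    # featuresets is ever materialised in memory.
    lazy_featuresets = LazyMap(word_features, brown.words())
    print(lazy_featuresets[0])    # computed on demand
    print(len(lazy_featuresets))  # same length as the underlying corpus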
-class LazyZip(LazyMap):
- """
- A lazy sequence whose elements are tuples, each containing the i-th
- element from each of the argument sequences. The returned list is
- truncated to the length of the shortest argument sequence. The
- tuples are constructed lazily -- i.e., when you read a value from the
- list, ``LazyZip`` will calculate that value by forming a tuple from
- the i-th element of each of the argument sequences.
-
- ``LazyZip`` is essentially a lazy version of the Python primitive function
- ``zip``. In particular, an evaluated LazyZip is equivalent to a zip:
-
- >>> from nltk.util import LazyZip
- >>> sequence1, sequence2 = [1, 2, 3], ['a', 'b', 'c']
- >>> zip(sequence1, sequence2) # doctest: +SKIP
- [(1, 'a'), (2, 'b'), (3, 'c')]
- >>> list(LazyZip(sequence1, sequence2))
- [(1, 'a'), (2, 'b'), (3, 'c')]
- >>> sequences = [sequence1, sequence2, [6,7,8,9]]
- >>> list(zip(*sequences)) == list(LazyZip(*sequences))
- True
-
- Lazy zips can be useful for conserving memory in cases where the argument
- sequences are particularly long.
-
- A typical example of a use case for this class is combining long sequences
- of gold standard and predicted values in a classification or tagging task
- in order to calculate accuracy. By constructing tuples lazily and
- avoiding the creation of an additional long sequence, memory usage can be
- significantly reduced.
- """
- def __init__(self, *lists):
- """
- :param lists: the underlying lists
- :type lists: list(list)
- """
- LazyMap.__init__(self, lambda *elts: elts, *lists)
-
- def iterate_from(self, index):
- iterator = LazyMap.iterate_from(self, index)
- while index < len(self):
- yield next(iterator)
- index += 1
- return
-
- def __len__(self):
- return min(len(lst) for lst in self._lists)
-
-
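The accuracy use case mentioned in the LazyZip docstring can be sketched
like this; gold_tags and predicted_tags stand in for two long (possibly
lazy) tag sequences and are not NLTK names:

    from nltk.util import LazyZip

    gold_tags = ['DT', 'NN', 'VBZ', 'JJ']       # placeholders for long sequences
    predicted_tags = ['DT', 'NN', 'VB', 'JJ']

    pairs = LazyZip(gold_tags, predicted_tags)  # no third long list is built
    correct = sum(1 for gold, pred in pairs if gold == pred)
    print(correct / float(len(pairs)))          # 0.75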
-class LazyEnumerate(LazyZip):
- """
- A lazy sequence whose elements are tuples, each containing a count (from
- zero) and a value yielded by the underlying sequence. ``LazyEnumerate`` is
- useful for obtaining an indexed list. The tuples are constructed lazily
- -- i.e., when you read a value from the list, ``LazyEnumerate`` will
- calculate that value by forming a tuple from the index i and the
- i-th element of the underlying sequence.
-
- ``LazyEnumerate`` is essentially a lazy version of the Python primitive
- function ``enumerate``. In particular, the following two expressions are
- equivalent:
-
- >>> from nltk.util import LazyEnumerate
- >>> sequence = ['first', 'second', 'third']
- >>> list(enumerate(sequence))
- [(0, 'first'), (1, 'second'), (2, 'third')]
- >>> list(LazyEnumerate(sequence))
- [(0, 'first'), (1, 'second'), (2, 'third')]
-
- Lazy enumerations can be useful for conserving memory in cases where the
- argument sequences are particularly long.
-
- A typical example of a use case for this class is obtaining an indexed
- list for a long sequence of values. By constructing tuples lazily and
- avoiding the creation of an additional long sequence, memory usage can be
- significantly reduced.
- """
-
- def __init__(self, lst):
- """
- :param lst: the underlying list
- :type lst: list
- """
- LazyZip.__init__(self, range(len(lst)), lst)
-
-
######################################################################
# Binary Search in a File
######################################################################
@@ -1186,7 +671,7 @@ def set_proxy(proxy, user=None, password=''):
raise ValueError('Could not detect default proxy settings')
# Set up the proxy handler
- proxy_handler = ProxyHandler({'http': proxy})
+ proxy_handler = ProxyHandler({'https': proxy, 'http': proxy})
opener = build_opener(proxy_handler)
if user is not None:
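With this change the configured proxy is used for both http:// and
https:// URLs. A minimal usage sketch, assuming a placeholder proxy
address:

    import nltk

    nltk.set_proxy('http://proxy.example.com:3128')
    nltk.download('punkt')   # downloads are now routed through the proxy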
@@ -1210,9 +695,9 @@ def elementtree_indent(elem, level=0):
"""
Recursive function to indent an ElementTree._ElementInterface
used for pretty printing. Run indent on elem and then output
- in the normal way.
-
- :param elem: element to be indented. will be modified.
+ in the normal way.
+
+ :param elem: element to be indented. will be modified.
:type elem: ElementTree._ElementInterface
:param level: level of indentation for this element
:type level: nonnegative integer
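A short usage sketch for this helper; the XML content is made up for
illustration:

    from xml.etree import ElementTree
    from nltk.util import elementtree_indent

    root = ElementTree.Element('corpus')
    sent = ElementTree.SubElement(root, 'sentence')
    ElementTree.SubElement(sent, 'word').text = 'hello'

    elementtree_indent(root)   # mutates the tree in place
    ElementTree.dump(root)     # now printed with newlines and indentation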
@@ -1239,17 +724,17 @@ def elementtree_indent(elem, level=0):
def choose(n, k):
"""
This function is a fast way to calculate binomial coefficients, commonly
- known as nCk, i.e. the number of combinations of n things taken k at a time.
+ known as nCk, i.e. the number of combinations of n things taken k at a time.
(https://en.wikipedia.org/wiki/Binomial_coefficient).
-
- This is the *scipy.special.comb()* with long integer computation but this
+
+ This is the *scipy.special.comb()* with long integer computation but this
approximation is faster, see https://github.com/nltk/nltk/issues/1181
-
+
>>> choose(4, 2)
6
>>> choose(6, 2)
15
-
+
:param n: The number of things.
:type n: int
:param k: The number of things taken at a time.
@@ -1264,101 +749,3 @@ def choose(n, k):
return ntok // ktok
else:
return 0
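Only the tail of choose() is visible in this hunk; as a sanity check it
can be compared against the textbook definition nCk = n! / (k! * (n-k)!)
using the standard library, for the values shown in the doctests:

    from math import factorial
    from nltk.util import choose

    def binomial(n, k):
        # Textbook definition: n! / (k! * (n - k)!)
        return factorial(n) // (factorial(k) * factorial(n - k))

    assert choose(4, 2) == binomial(4, 2) == 6
    assert choose(6, 2) == binomial(6, 2) == 15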
-
-######################################################################
-# Trie Implementation
-######################################################################
-class Trie(defaultdict):
- """A Trie implementation for strings"""
- LEAF = True
-
- def __init__(self, strings=None):
- """Builds a Trie object, which is built around a ``defaultdict``
-
- If ``strings`` is provided, it will add the ``strings``, which
- consist of a ``list`` of ``strings``, to the Trie.
- Otherwise, it'll construct an empty Trie.
-
- :param strings: List of strings to insert into the trie
- (Default is ``None``)
- :type strings: list(str)
-
- """
- defaultdict.__init__(self, Trie)
- if strings:
- for string in strings:
- self.insert(string)
-
- def insert(self, string):
- """Inserts ``string`` into the Trie
-
- :param string: String to insert into the trie
- :type string: str
-
- :Example:
-
- >>> from nltk.util import Trie
- >>> trie = Trie(["ab"])
- >>> trie
- defaultdict(<class 'nltk.util.Trie'>, {'a': defaultdict(<class 'nltk.util.Trie'>, {'b': defaultdict(<class 'nltk.util.Trie'>, {True: None})})})
-
- """
- if len(string):
- self[string[0]].insert(string[1:])
- else:
- # mark that the string is complete
- self[Trie.LEAF] = None
-
- def __str__(self):
- return str(self.as_dict())
-
- def as_dict(self, d=None):
- """Convert ``defaultdict`` to common ``dict`` representation.
-
- :param d: A defaultdict containing strings mapped to nested defaultdicts.
- This is the structure of the trie. (Default is None)
- :type d: defaultdict(str -> defaultdict)
- :return: The ``dict`` representation of the ``Trie``. ``defaultdict`` is a
- subclass of ``dict``, but a plain ``dict()`` call only converts the top
- level; this method converts every nested level as well, so the result
- prints without the ``defaultdict(<class 'nltk.util.Trie'>, ...`` wrapper.
- :rtype: dict(str -> dict(bool -> None))
- Note: there can be an arbitrarily deeply nested
- ``dict(str -> dict(str -> dict(..))``, but the last
- level will have ``dict(str -> dict(bool -> None))``
-
- :Example:
-
- >>> from nltk.util import Trie
- >>> trie = Trie(["abc", "def"])
- >>> expected = {'a': {'b': {'c': {True: None}}}, 'd': {'e': {'f': {True: None}}}}
- >>> trie.as_dict() == expected
- True
-
- """
- def _default_to_regular(d):
- """
- Source: http://stackoverflow.com/a/26496899/4760801
-
- :param d: Nested ``defaultdict`` to convert to regular ``dict``
- :type d: defaultdict(str -> defaultdict(...))
- :return: A dict representation of the defaultdict
- :rtype: dict(str -> dict(str -> ...))
-
- :Example:
-
- >>> from collections import defaultdict
- >>> d = defaultdict(defaultdict)
- >>> d["one"]["two"] = "three"
- >>> d
- defaultdict(<type 'collections.defaultdict'>, {'one': defaultdict(None, {'two': 'three'})})
- >>> _default_to_regular(d)
- {'one': {'two': 'three'}}
-
- """
- if isinstance(d, defaultdict):
- d = {k: _default_to_regular(v) for k, v in d.items()}
- return d
-
- return _default_to_regular(self)
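The class above only exposes insert() and as_dict(); it has no lookup
method. For illustration, a membership check can be written externally
as in the following sketch (contains_word is a hypothetical helper, not
part of NLTK):

    from nltk.util import Trie

    def contains_word(trie, word):
        # Walk the nested defaultdicts one character at a time. Using the
        # 'in' test avoids auto-creating missing branches.
        node = trie
        for char in word:
            if char not in node:
                return False
            node = node[char]
        return Trie.LEAF in node

    trie = Trie(['abc', 'abd'])
    print(contains_word(trie, 'abc'))  # True
    print(contains_word(trie, 'ab'))   # False -- a prefix, not a stored word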
diff --git a/nltk/wsd.py b/nltk/wsd.py
index 7524244..f77b0cb 100644
--- a/nltk/wsd.py
+++ b/nltk/wsd.py
@@ -3,7 +3,7 @@
# Authors: Liling Tan <alvations at gmail.com>,
# Dmitrijs Milajevs <dimazest at gmail.com>
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
diff --git a/setup.cfg b/setup.cfg
index 00bb0ae..8bfd5a1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,4 @@
[egg_info]
-tag_date = 0
-tag_svn_revision = 0
tag_build =
+tag_date = 0
diff --git a/setup.py b/setup.py
index 3d1adc7..a29a1c3 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
#
# Setup script for the Natural Language Toolkit
#
-# Copyright (C) 2001-2016 NLTK Project
+# Copyright (C) 2001-2017 NLTK Project
# Author: Steven Bird <stevenbird1 at gmail.com>
# Edward Loper <edloper at gmail.com>
# Ewan Klein <ewan at inf.ed.ac.uk>
@@ -29,6 +29,32 @@ with open(version_file) as fh:
# setuptools
from setuptools import setup, find_packages
+# Specify groups of optional dependencies
+extras_require = {
+ 'machine_learning': [
+ 'gensim',
+ 'numpy',
+ 'python-crfsuite',
+ 'scikit-learn',
+ 'scipy'
+ ],
+ 'plot': [
+ 'matplotlib',
+ ],
+ 'tgrep': [
+ 'pyparsing',
+ ],
+ 'twitter': [
+ 'twython',
+ ],
+ 'corenlp': [
+ 'requests',
+ ],
+}
+
+# Add a group made up of all optional dependencies
+extras_require['all'] = set(package for group in extras_require.values() for package in group)
+
setup(
name = "nltk",
description = "Natural Language Toolkit",
@@ -36,7 +62,7 @@ setup(
url = "http://nltk.org/",
long_description = """\
The Natural Language Toolkit (NLTK) is a Python package for
-natural language processing. NLTK requires Python 2.7, or 3.2+.""",
+natural language processing. NLTK requires Python 2.7, 3.4, or 3.5.""",
license = "Apache License, Version 2.0",
keywords = ['NLP', 'CL', 'natural language processing',
'computational linguistics', 'parsing', 'tagging',
@@ -55,9 +81,8 @@ natural language processing. NLTK requires Python 2.7, or 3.2+.""",
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 2.7',
- 'Programming Language :: Python :: 3.2',
- 'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
+ 'Programming Language :: Python :: 3.5',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Scientific/Engineering :: Human Machine Interfaces',
@@ -69,7 +94,8 @@ natural language processing. NLTK requires Python 2.7, or 3.2+.""",
'Topic :: Text Processing :: Linguistic',
],
package_data = {'nltk': ['test/*.doctest', 'VERSION']},
-# install_requires = ['six>=1.9.0'],
+ install_requires = ['six'],
+ extras_require = extras_require,
packages = find_packages(),
zip_safe=False, # since normal files will be present too?
)
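With the optional dependency groups above, users can pull in only what
they need, e.g. pip install nltk[plot] for matplotlib support or
pip install nltk[all] for every optional group; the group names are
exactly the keys of extras_require. The 'all' entry is computed rather
than listed by hand, as this trimmed, self-contained sketch of the same
aggregation shows:

    extras_require = {
        'plot': ['matplotlib'],
        'twitter': ['twython'],
    }
    # Union of every optional group -- the same aggregation as in setup.py.
    extras_require['all'] = set(pkg for group in extras_require.values()
                                for pkg in group)
    print(sorted(extras_require['all']))  # ['matplotlib', 'twython']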
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/nltk.git
More information about the debian-science-commits
mailing list