[nltk] 02/08: Imported Upstream version 3.0.5
Daniel Stender
danstender-guest at moszumanska.debian.org
Thu Sep 10 12:23:13 UTC 2015
This is an automated email from the git hooks/post-receive script.
danstender-guest pushed a commit to branch master
in repository nltk.
commit 664c4cd9771597f3d916680f2ed41739999c88af
Author: Daniel Stender <debian at danielstender.com>
Date: Tue Sep 8 17:58:19 2015 +0200
Imported Upstream version 3.0.5
---
PKG-INFO | 2 +-
README.txt | 55 ---
nltk.egg-info/PKG-INFO | 2 +-
nltk.egg-info/SOURCES.txt | 23 +-
nltk.egg-info/requires.txt | 1 +
nltk/VERSION | 2 +-
nltk/align/__init__.py | 3 +
nltk/align/api.py | 5 +-
nltk/align/bleu_score.py | 4 -
nltk/align/gale_church.py | 3 -
nltk/align/gdfa.py | 4 -
nltk/align/ibm1.py | 268 ++++++++-----
nltk/align/ibm2.py | 395 ++++++++++---------
nltk/align/ibm3.py | 670 +++++++++++++++------------------
nltk/align/ibm4.py | 473 +++++++++++++++++++++++
nltk/align/ibm5.py | 642 +++++++++++++++++++++++++++++++
nltk/align/ibm_model.py | 526 ++++++++++++++++++++++++++
nltk/align/phrase_based.py | 4 -
nltk/classify/positivenaivebayes.py | 7 -
nltk/classify/senna.py | 3 -
nltk/cluster/util.py | 1 -
nltk/collocations.py | 31 +-
nltk/compat.py | 159 ++++++++
nltk/corpus/__init__.py | 2 +
nltk/corpus/reader/__init__.py | 3 +-
nltk/corpus/reader/bnc.py | 2 +-
nltk/corpus/reader/childes.py | 26 +-
nltk/corpus/reader/sentiwordnet.py | 3 -
nltk/corpus/reader/twitter.py | 157 ++++++++
nltk/corpus/reader/verbnet.py | 12 +
nltk/corpus/reader/wordnet.py | 5 +-
nltk/decorators.py | 2 -
nltk/internals.py | 27 +-
nltk/jsontags.py | 10 +-
nltk/metrics/segmentation.py | 3 -
nltk/parse/bllip.py | 3 -
nltk/parse/dependencygraph.py | 61 ++-
nltk/parse/evaluate.py | 3 -
nltk/parse/malt.py | 505 ++++++++++++++++---------
nltk/parse/stanford.py | 3 -
nltk/parse/transitionparser.py | 3 -
nltk/parse/util.py | 59 +++
nltk/sem/glue.py | 13 +-
nltk/stem/__init__.py | 3 -
nltk/stem/api.py | 3 -
nltk/stem/isri.py | 3 -
nltk/stem/lancaster.py | 3 -
nltk/stem/porter.py | 4 -
nltk/stem/regexp.py | 3 -
nltk/stem/rslp.py | 3 -
nltk/stem/snowball.py | 3 -
nltk/stem/wordnet.py | 3 -
nltk/tag/__init__.py | 3 -
nltk/tag/api.py | 3 -
nltk/tag/brill.py | 3 -
nltk/tag/brill_trainer.py | 9 +-
nltk/tag/brill_trainer_orig.py | 414 --------------------
nltk/tag/crf.py | 3 -
nltk/tag/hmm.py | 3 -
nltk/tag/hunpos.py | 3 -
nltk/tag/mapping.py | 3 -
nltk/tag/senna.py | 45 ++-
nltk/tag/sequential.py | 11 +-
nltk/tag/stanford.py | 12 +-
nltk/tag/tnt.py | 3 -
nltk/tag/util.py | 3 -
nltk/tbl/feature.py | 3 -
nltk/tbl/rule.py | 3 -
nltk/tbl/template.py | 3 -
nltk/test/align.doctest | 67 ++--
nltk/test/bnc.doctest | 11 +-
nltk/test/corpus.doctest | 29 +-
nltk/test/gensim.doctest | 251 ++++++------
nltk/test/gluesemantics_malt_fixt.py | 2 +-
nltk/test/stem.doctest | 11 +-
nltk/test/unit/align/__init__.py | 1 +
nltk/test/unit/align/test_ibm1.py | 42 +++
nltk/test/unit/align/test_ibm2.py | 54 +++
nltk/test/unit/align/test_ibm4.py | 126 +++++++
nltk/test/unit/align/test_ibm5.py | 166 ++++++++
nltk/test/unit/align/test_ibm_model.py | 270 +++++++++++++
nltk/test/unit/test_json2csv_corpus.py | 187 +++++++++
nltk/test/unit/test_twitter_auth.py | 187 +++++++++
nltk/tgrep.py | 4 -
nltk/tokenize/__init__.py | 5 +-
nltk/tokenize/api.py | 3 -
nltk/tokenize/casual.py | 357 ++++++++++++++++++
nltk/tokenize/punkt.py | 3 -
nltk/tokenize/regexp.py | 3 -
nltk/tokenize/sexpr.py | 3 -
nltk/tokenize/simple.py | 3 -
nltk/tokenize/stanford.py | 7 +-
nltk/tokenize/texttiling.py | 3 -
nltk/tokenize/treebank.py | 3 -
nltk/tokenize/util.py | 3 -
nltk/tree.py | 3 -
nltk/treeprettyprinter.py | 2 -
nltk/twitter/__init__.py | 25 ++
nltk/twitter/api.py | 101 +++++
nltk/twitter/twitter_demo.py | 268 +++++++++++++
nltk/twitter/twitterclient.py | 506 +++++++++++++++++++++++++
nltk/twitter/util.py | 389 +++++++++++++++++++
nltk/wsd.py | 3 -
setup.cfg | 2 +-
setup.py | 1 +
105 files changed, 6159 insertions(+), 1682 deletions(-)
diff --git a/PKG-INFO b/PKG-INFO
index 4a7ba3b..91d3468 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: nltk
-Version: 3.0.4
+Version: 3.0.5
Summary: Natural Language Toolkit
Home-page: http://nltk.org/
Author: Steven Bird
diff --git a/README.txt b/README.txt
deleted file mode 100644
index 9da482d..0000000
--- a/README.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-Natural Language Toolkit (NLTK) nltk.org
-
-Authors: Steven Bird <stevenbird1 at gmail.com>
- Edward Loper <edloper at gmail.com>
- Ewan Klein <ewan at inf.ed.ac.uk>
-
-Copyright (C) 2001-2015 NLTK Project
-
-For license information, see LICENSE.txt
-
-NLTK -- the Natural Language Toolkit -- is a suite of open source
-Python modules, data sets and tutorials supporting research and
-development in Natural Language Processing.
-
-Documentation: A substantial amount of documentation about how
-to use NLTK, including a textbook and API documentation, is
-available from the NLTK website: http://nltk.org/
-
- - The book covers a wide range of introductory topics in NLP, and
- shows how to do all the processing tasks using the toolkit.
-
- - The toolkit's reference documentation describes every module,
- interface, class, method, function, and variable in the toolkit.
- This documentation should be useful to both users and developers.
-
-Mailing Lists: There are several mailing lists associated with NLTK:
-
- - nltk: Public information and announcements about NLTK (very low volume)
- http://groups.google.com/group/nltk
- - nltk-users: Discussions amongst NLTK users
- http://groups.google.com/group/nltk-users
- - nltk-dev: Discussions amongst NLTK developers
- http://groups.google.com/group/nltk-dev
- - nltk-translation: Discussions about translating the NLTK book
- http://groups.google.com/group/nltk-translation
- - nltk-commits: Subversion commit logs for NLTK
- http://groups.google.com/group/nltk-commits
-
-Contributing: If you would like to contribute to NLTK,
- please see http://nltk.org/contribute
-
-Donating: Have you found the toolkit helpful? Please support NLTK development
- by donating to the project via PayPal, using the link on the NLTK homepage.
-
-Redistributing: NLTK source code is distributed under the Apache 2.0 License.
- NLTK documentation is distributed under the Creative Commons
- Attribution-Noncommercial-No Derivative Works 3.0 United States license.
- NLTK corpora are provided under the terms given in the README file
- for each corpus; all are redistributable, and available for non-commercial use.
- NLTK may be freely redistributed, subject to the provisions of these licenses.
-
-Citing: If you publish work that uses NLTK, please cite the NLTK book, as follows:
-
- Bird, Steven, Edward Loper and Ewan Klein (2009).
- Natural Language Processing with Python. O'Reilly Media Inc.
diff --git a/nltk.egg-info/PKG-INFO b/nltk.egg-info/PKG-INFO
index 4a7ba3b..91d3468 100644
--- a/nltk.egg-info/PKG-INFO
+++ b/nltk.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: nltk
-Version: 3.0.4
+Version: 3.0.5
Summary: Natural Language Toolkit
Home-page: http://nltk.org/
Author: Steven Bird
diff --git a/nltk.egg-info/SOURCES.txt b/nltk.egg-info/SOURCES.txt
index 79c94ce..9aa27ea 100644
--- a/nltk.egg-info/SOURCES.txt
+++ b/nltk.egg-info/SOURCES.txt
@@ -1,7 +1,6 @@
INSTALL.txt
LICENSE.txt
MANIFEST.in
-README.txt
setup.cfg
setup.py
nltk/VERSION
@@ -31,6 +30,7 @@ nltk.egg-info/PKG-INFO
nltk.egg-info/SOURCES.txt
nltk.egg-info/dependency_links.txt
nltk.egg-info/not-zip-safe
+nltk.egg-info/requires.txt
nltk.egg-info/top_level.txt
nltk/align/__init__.py
nltk/align/api.py
@@ -40,6 +40,9 @@ nltk/align/gdfa.py
nltk/align/ibm1.py
nltk/align/ibm2.py
nltk/align/ibm3.py
+nltk/align/ibm4.py
+nltk/align/ibm5.py
+nltk/align/ibm_model.py
nltk/align/phrase_based.py
nltk/align/util.py
nltk/app/__init__.py
@@ -128,6 +131,7 @@ nltk/corpus/reader/switchboard.py
nltk/corpus/reader/tagged.py
nltk/corpus/reader/timit.py
nltk/corpus/reader/toolbox.py
+nltk/corpus/reader/twitter.py
nltk/corpus/reader/udhr.py
nltk/corpus/reader/util.py
nltk/corpus/reader/verbnet.py
@@ -212,7 +216,6 @@ nltk/tag/__init__.py
nltk/tag/api.py
nltk/tag/brill.py
nltk/tag/brill_trainer.py
-nltk/tag/brill_trainer_orig.py
nltk/tag/crf.py
nltk/tag/hmm.py
nltk/tag/hunpos.py
@@ -308,14 +311,23 @@ nltk/test/unit/test_collocations.py
nltk/test/unit/test_corpora.py
nltk/test/unit/test_corpus_views.py
nltk/test/unit/test_hmm.py
+nltk/test/unit/test_json2csv_corpus.py
nltk/test/unit/test_naivebayes.py
nltk/test/unit/test_seekable_unicode_stream_reader.py
nltk/test/unit/test_stem.py
nltk/test/unit/test_tag.py
nltk/test/unit/test_tgrep.py
+nltk/test/unit/test_twitter_auth.py
nltk/test/unit/utils.py
+nltk/test/unit/align/__init__.py
+nltk/test/unit/align/test_ibm1.py
+nltk/test/unit/align/test_ibm2.py
+nltk/test/unit/align/test_ibm4.py
+nltk/test/unit/align/test_ibm5.py
+nltk/test/unit/align/test_ibm_model.py
nltk/tokenize/__init__.py
nltk/tokenize/api.py
+nltk/tokenize/casual.py
nltk/tokenize/punkt.py
nltk/tokenize/regexp.py
nltk/tokenize/sexpr.py
@@ -323,4 +335,9 @@ nltk/tokenize/simple.py
nltk/tokenize/stanford.py
nltk/tokenize/texttiling.py
nltk/tokenize/treebank.py
-nltk/tokenize/util.py
\ No newline at end of file
+nltk/tokenize/util.py
+nltk/twitter/__init__.py
+nltk/twitter/api.py
+nltk/twitter/twitter_demo.py
+nltk/twitter/twitterclient.py
+nltk/twitter/util.py
\ No newline at end of file
diff --git a/nltk.egg-info/requires.txt b/nltk.egg-info/requires.txt
new file mode 100644
index 0000000..dde8185
--- /dev/null
+++ b/nltk.egg-info/requires.txt
@@ -0,0 +1 @@
+six>=1.9.0
diff --git a/nltk/VERSION b/nltk/VERSION
index b0f2dcb..eca690e 100644
--- a/nltk/VERSION
+++ b/nltk/VERSION
@@ -1 +1 @@
-3.0.4
+3.0.5
diff --git a/nltk/align/__init__.py b/nltk/align/__init__.py
index 3f53c1c..82de78e 100644
--- a/nltk/align/__init__.py
+++ b/nltk/align/__init__.py
@@ -12,9 +12,12 @@ These interfaces are prone to change.
"""
from nltk.align.api import AlignedSent, Alignment
+from nltk.align.ibm_model import IBMModel
from nltk.align.ibm1 import IBMModel1
from nltk.align.ibm2 import IBMModel2
from nltk.align.ibm3 import IBMModel3
+from nltk.align.ibm4 import IBMModel4
+from nltk.align.ibm5 import IBMModel5
from nltk.align.bleu_score import bleu
diff --git a/nltk/align/api.py b/nltk/align/api.py
index ddb1470..9d9f443 100644
--- a/nltk/align/api.py
+++ b/nltk/align/api.py
@@ -79,7 +79,7 @@ class AlignedSent(object):
"""
if not all(0 <= p[0] < len(self._words) for p in a):
raise IndexError("Alignment is outside boundary of words")
- if not all(0 <= p[1] < len(self._mots) for p in a):
+ if not all(p[1] is None or 0 <= p[1] < len(self._mots) for p in a):
raise IndexError("Alignment is outside boundary of mots")
return True
@@ -346,6 +346,3 @@ def _naacl2pair(pair_string):
i, j, p = pair_string.split("-")
return int(i), int(j)
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/align/bleu_score.py b/nltk/align/bleu_score.py
index 8878250..404428f 100644
--- a/nltk/align/bleu_score.py
+++ b/nltk/align/bleu_score.py
@@ -261,7 +261,3 @@ def _brevity_penalty(candidate, references):
return math.exp(1 - r / c)
-# run doctests
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.ELLIPSIS)
diff --git a/nltk/align/gale_church.py b/nltk/align/gale_church.py
index 9d1c3a7..0ac3e45 100644
--- a/nltk/align/gale_church.py
+++ b/nltk/align/gale_church.py
@@ -219,9 +219,6 @@ def parse_token_stream(stream, soft_delimiter, hard_delimiter):
for block_it in split_at(stream, hard_delimiter)]
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
# Code for test files in nltk_contrib/align/data/*.tok
diff --git a/nltk/align/gdfa.py b/nltk/align/gdfa.py
index f2e9743..23b7868 100644
--- a/nltk/align/gdfa.py
+++ b/nltk/align/gdfa.py
@@ -129,7 +129,3 @@ def grow_diag_final_and(srclen, trglen, e2f, f2e):
final_and(f2e)
return alignment
-# run doctests
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
diff --git a/nltk/align/ibm1.py b/nltk/align/ibm1.py
index 43e618f..43e14f8 100644
--- a/nltk/align/ibm1.py
+++ b/nltk/align/ibm1.py
@@ -12,124 +12,196 @@
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-from __future__ import division
-from collections import defaultdict
-from nltk.align import AlignedSent
+"""
+Lexical translation model that ignores word order.
-class IBMModel1(object):
- """
- This class implements the algorithm of Expectation Maximization for
- the IBM Model 1.
-
- Step 1 - Collect the evidence of a English word being translated by a
- foreign language word.
-
- Step 2 - Estimate the probability of translation according to the
- evidence from Step 1.
-
- >>> from nltk.corpus import comtrans
- >>> bitexts = comtrans.aligned_sents()[:100]
- >>> ibm = IBMModel1(bitexts, 20)
-
- >>> aligned_sent = ibm.align(bitexts[6])
- >>> aligned_sent.alignment
- Alignment([(0, 0), (1, 1), (2, 2), (3, 7), (4, 7), (5, 8)])
- >>> print('{0:.3f}'.format(bitexts[6].precision(aligned_sent)))
- 0.556
- >>> print('{0:.3f}'.format(bitexts[6].recall(aligned_sent)))
- 0.833
- >>> print('{0:.3f}'.format(bitexts[6].alignment_error_rate(aligned_sent)))
- 0.333
-
- """
- def __init__(self, align_sents, num_iter):
- self.probabilities = self.train(align_sents, num_iter)
+In IBM Model 1, word order is ignored for simplicity. Thus, the
+following two alignments are equally likely.
- def train(self, align_sents, num_iter):
- """
- Return the translation probability model trained by IBM model 1.
+Source: je mange du jambon
+Target: i eat some ham
+Alignment: (1,1) (2,2) (3,3) (4,4)
- Arguments:
- align_sents -- A list of instances of AlignedSent class, which
- contains sentence pairs.
- num_iter -- The number of iterations.
+Source: je mange du jambon
+Target: some ham eat i
+Alignment: (1,4) (2,3) (3,2) (4,1)
- Returns:
- t_ef -- A dictionary of translation probabilities.
- """
+The EM algorithm used in Model 1 is:
+E step - In the training data, count how many times a source language
+ word is translated into a target language word, weighted by
+ the prior probability of the translation.
- # Vocabulary of each language
- fr_vocab = set()
- en_vocab = set()
- for alignSent in align_sents:
- en_vocab.update(alignSent.words)
- fr_vocab.update(alignSent.mots)
- # Add the Null token
- fr_vocab.add(None)
+M step - Estimate the new probability of translation based on the
+ counts from the Expectation step.
- # Initial probability
- init_prob = 1 / len(en_vocab)
- # Create the translation model with initial probability
- t_ef = defaultdict(lambda: defaultdict(lambda: init_prob))
+Notations:
+i: Position in the source sentence
+ Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
+j: Position in the target sentence
+ Valid values are 1, 2, ..., length of target sentence
+s: A word in the source language
+t: A word in the target language
- total_e = defaultdict(lambda: 0.0)
- for i in range(0, num_iter):
- count_ef = defaultdict(lambda: defaultdict(lambda: 0.0))
- total_f = defaultdict(lambda: 0.0)
+References:
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
- for alignSent in align_sents:
- en_set = alignSent.words
- fr_set = [None] + alignSent.mots
+Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
+Robert L. Mercer. 1993. The Mathematics of Statistical Machine
+Translation: Parameter Estimation. Computational Linguistics, 19 (2),
+263-311.
+"""
- # Compute normalization
- for e in en_set:
- total_e[e] = 0.0
- for f in fr_set:
- total_e[e] += t_ef[e][f]
+from __future__ import division
+from collections import defaultdict
+from nltk.align import AlignedSent
+from nltk.align import Alignment
+from nltk.align import IBMModel
+import warnings
- # Collect counts
- for e in en_set:
- for f in fr_set:
- c = t_ef[e][f] / total_e[e]
- count_ef[e][f] += c
- total_f[f] += c
- # Compute the estimate probabilities
- for f in fr_vocab:
- for e in en_vocab:
- t_ef[e][f] = count_ef[e][f] / total_f[f]
+class IBMModel1(IBMModel):
+ """
+ Lexical translation model that ignores word order
+
+ >>> bitext = []
+ >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
+ >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
+ >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
+ >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
+ >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
+ >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
+
+ >>> ibm1 = IBMModel1(bitext, 5)
+
+ >>> print('{0:.3f}'.format(ibm1.translation_table['buch']['book']))
+ 0.889
+ >>> print('{0:.3f}'.format(ibm1.translation_table['das']['book']))
+ 0.062
+ >>> print('{0:.3f}'.format(ibm1.translation_table['buch'][None]))
+ 0.113
+ >>> print('{0:.3f}'.format(ibm1.translation_table['ja'][None]))
+ 0.073
+
+ >>> test_sentence = bitext[2]
+ >>> test_sentence.words
+ ['das', 'buch', 'ist', 'ja', 'klein']
+ >>> test_sentence.mots
+ ['the', 'book', 'is', 'small']
+ >>> test_sentence.alignment
+ Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
- return t_ef
+ """
- def align(self, align_sent):
+ def __init__(self, sentence_aligned_corpus, iterations):
"""
- Returns the alignment result for one sentence pair.
+ Train on ``sentence_aligned_corpus`` and create a lexical
+ translation model.
+
+ Translation direction is from ``AlignedSent.mots`` to
+ ``AlignedSent.words``.
+
+ :param sentence_aligned_corpus: Sentence-aligned parallel corpus
+ :type sentence_aligned_corpus: list(AlignedSent)
+
+ :param iterations: Number of iterations to run training algorithm
+ :type iterations: int
+ """
+ super(IBMModel1, self).__init__(sentence_aligned_corpus)
+
+ # seed with a uniform distribution
+ initial_prob = 1 / len(self.trg_vocab)
+ if initial_prob > IBMModel.MIN_PROB:
+ for t in self.trg_vocab:
+ for s in self.src_vocab:
+ self.translation_table[t][s] = initial_prob
+ else:
+ warnings.warn("Target language vocabulary is too large. "
+ "Results may be less accurate.")
+
+ self.train(sentence_aligned_corpus, iterations)
+ self.__align_all(sentence_aligned_corpus)
+
+ def train(self, parallel_corpus, iterations):
+ for i in range(0, iterations):
+ count_t_given_s = defaultdict(lambda: defaultdict(lambda: 0.0))
+ count_any_t_given_s = defaultdict(lambda: 0.0)
+
+ for aligned_sentence in parallel_corpus:
+ trg_sentence = aligned_sentence.words
+ src_sentence = [None] + aligned_sentence.mots
+ total_count = defaultdict(lambda: 0.0)
+
+ # E step (a): Compute normalization factors to weigh counts
+ for t in trg_sentence:
+ if total_count[t] == 0.0:
+ for s in src_sentence:
+ total_count[t] += self.translation_table[t][s]
+
+ # E step (b): Collect counts
+ for t in trg_sentence:
+ for s in src_sentence:
+ count = self.translation_table[t][s]
+ normalized_count = count / total_count[t]
+ count_t_given_s[t][s] += normalized_count
+ count_any_t_given_s[s] += normalized_count
+
+ # M step: Update probabilities with maximum likelihood estimate
+ for s in self.src_vocab:
+ for t in self.trg_vocab:
+ estimate = count_t_given_s[t][s] / count_any_t_given_s[s]
+ self.translation_table[t][s] = max(estimate,
+ IBMModel.MIN_PROB)
+
+ def prob_t_a_given_s(self, alignment_info):
"""
+ Probability of target sentence and an alignment given the
+ source sentence
+ """
+ prob = 1.0
- if self.probabilities is None:
- raise ValueError("The model does not train.")
+ for j, i in enumerate(alignment_info.alignment):
+ if j == 0:
+ continue # skip the dummy zeroeth element
+ trg_word = alignment_info.trg_sentence[j]
+ src_word = alignment_info.src_sentence[i]
+ prob *= self.translation_table[trg_word][src_word]
- alignment = []
+ return max(prob, IBMModel.MIN_PROB)
- for j, en_word in enumerate(align_sent.words):
-
- # Initialize the maximum probability with Null token
- max_align_prob = (self.probabilities[en_word][None], None)
- for i, fr_word in enumerate(align_sent.mots):
- # Find out the maximum probability
- max_align_prob = max(max_align_prob,
- (self.probabilities[en_word][fr_word], i))
+ def __align_all(self, parallel_corpus):
+ for sentence_pair in parallel_corpus:
+ self.__align(sentence_pair)
- # If the maximum probability is not Null token,
- # then append it to the alignment.
- if max_align_prob[1] is not None:
- alignment.append((j, max_align_prob[1]))
+ def __align(self, sentence_pair):
+ """
+ Determines the best word alignment for one sentence pair from
+ the corpus that the model was trained on.
- return AlignedSent(align_sent.words, align_sent.mots, alignment)
+ The best alignment will be set in ``sentence_pair`` when the
+ method returns. In contrast with the internal implementation of
+ IBM models, the word indices in the ``Alignment`` are zero-
+ indexed, not one-indexed.
-# run doctests
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
+ :param sentence_pair: A sentence in the source language and its
+ counterpart sentence in the target language
+ :type sentence_pair: AlignedSent
+ """
+ best_alignment = []
+
+ for j, trg_word in enumerate(sentence_pair.words):
+ # Initialize trg_word to align with the NULL token
+ best_prob = max(self.translation_table[trg_word][None],
+ IBMModel.MIN_PROB)
+ best_alignment_point = None
+ for i, src_word in enumerate(sentence_pair.mots):
+ align_prob = self.translation_table[trg_word][src_word]
+ if align_prob >= best_prob: # prefer newer word in case of tie
+ best_prob = align_prob
+ best_alignment_point = i
+
+ best_alignment.append((j, best_alignment_point))
+
+ sentence_pair.alignment = Alignment(best_alignment)
diff --git a/nltk/align/ibm2.py b/nltk/align/ibm2.py
index cfa70d4..45e768a 100644
--- a/nltk/align/ibm2.py
+++ b/nltk/align/ibm2.py
@@ -6,195 +6,252 @@
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-from __future__ import division
+"""
+Lexical translation model that considers word order.
+
+IBM Model 2 improves on Model 1 by accounting for word order.
+An alignment probability is introduced, a(i | j,l,m), which predicts
+a source word position, given its aligned target word's position.
+
+The EM algorithm used in Model 2 is:
+E step - In the training data, collect counts, weighted by prior
+ probabilities.
+ (a) count how many times a source language word is translated
+ into a target language word
+ (b) count how many times a particular position in the source
+ sentence is aligned to a particular position in the target
+ sentence
+
+M step - Estimate new probabilities based on the counts from the E step
+
+
+Notations:
+i: Position in the source sentence
+ Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
+j: Position in the target sentence
+ Valid values are 1, 2, ..., length of target sentence
+l: Number of words in the source sentence, excluding NULL
+m: Number of words in the target sentence
+s: A word in the source language
+t: A word in the target language
+
+
+References:
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
+
+Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
+Robert L. Mercer. 1993. The Mathematics of Statistical Machine
+Translation: Parameter Estimation. Computational Linguistics, 19 (2),
+263-311.
+"""
+
+from __future__ import division
from collections import defaultdict
-from nltk.align import AlignedSent
-from nltk.align.ibm1 import IBMModel1
+from nltk.align import AlignedSent
+from nltk.align import Alignment
+from nltk.align import IBMModel
+from nltk.align import IBMModel1
+import warnings
-class IBMModel2(object):
+
+class IBMModel2(IBMModel):
"""
- This class implements the algorithm of Expectation Maximization for
- the IBM Model 2.
-
- Step 1 - Run a number of iterations of IBM Model 1 and get the initial
- distribution of translation probability.
-
- Step 2 - Collect the evidence of an English word being translated by a
- foreign language word.
-
- Step 3 - Estimate the probability of translation and alignment according
- to the evidence from Step 2.
-
- >>> from nltk.corpus import comtrans
- >>> bitexts = comtrans.aligned_sents()[:100]
- >>> ibm = IBMModel2(bitexts, 5)
- >>> aligned_sent = ibm.align(bitexts[0])
- >>> aligned_sent.words
- ['Wiederaufnahme', 'der', 'Sitzungsperiode']
- >>> aligned_sent.mots
- ['Resumption', 'of', 'the', 'session']
- >>> aligned_sent.alignment
- Alignment([(0, 0), (1, 2), (2, 3)])
- >>> bitexts[0].precision(aligned_sent)
- 0.75
- >>> bitexts[0].recall(aligned_sent)
- 1.0
- >>> bitexts[0].alignment_error_rate(aligned_sent)
- 0.1428571428571429
+ Lexical translation model that considers word order
+
+ >>> bitext = []
+ >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
+ >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
+ >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
+ >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
+ >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
+ >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
+
+ >>> ibm2 = IBMModel2(bitext, 5)
+
+ >>> print('{0:.3f}'.format(ibm2.translation_table['buch']['book']))
+ 1.000
+ >>> print('{0:.3f}'.format(ibm2.translation_table['das']['book']))
+ 0.000
+ >>> print('{0:.3f}'.format(ibm2.translation_table['buch'][None]))
+ 0.000
+ >>> print('{0:.3f}'.format(ibm2.translation_table['ja'][None]))
+ 0.000
+
+ >>> print('{0:.3f}'.format(ibm2.alignment_table[1][1][2][2]))
+ 0.939
+ >>> print('{0:.3f}'.format(ibm2.alignment_table[1][2][2][2]))
+ 0.000
+ >>> print('{0:.3f}'.format(ibm2.alignment_table[2][2][4][5]))
+ 1.000
+
+ >>> test_sentence = bitext[2]
+ >>> test_sentence.words
+ ['das', 'buch', 'ist', 'ja', 'klein']
+ >>> test_sentence.mots
+ ['the', 'book', 'is', 'small']
+ >>> test_sentence.alignment
+ Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
"""
- def __init__(self, align_sents, num_iter):
- self.probabilities, self.alignments = self.train(align_sents, num_iter)
- def train(self, align_sents, num_iter):
+ def __init__(self, sentence_aligned_corpus, iterations):
"""
- Return the translation and alignment probability distributions
- trained by the Expectation Maximization algorithm for IBM Model 2.
+ Train on ``sentence_aligned_corpus`` and create a lexical
+ translation model and an alignment model.
+
+ Translation direction is from ``AlignedSent.mots`` to
+ ``AlignedSent.words``.
- Arguments:
- align_sents -- A list contains some sentence pairs.
- num_iter -- The number of iterations.
+ Runs a few iterations of Model 1 training to initialize
+ model parameters.
- Returns:
- t_ef -- A distribution of translation probabilities.
- align -- A distribution of alignment probabilities.
+ :param sentence_aligned_corpus: Sentence-aligned parallel corpus
+ :type sentence_aligned_corpus: list(AlignedSent)
+
+ :param iterations: Number of iterations to run training algorithm
+ :type iterations: int
"""
+ super(IBMModel2, self).__init__(sentence_aligned_corpus)
# Get initial translation probability distribution
# from a few iterations of Model 1 training.
- ibm1 = IBMModel1(align_sents, 10)
- t_ef = ibm1.probabilities
-
- # Vocabulary of each language
- fr_vocab = set()
- en_vocab = set()
- for alignSent in align_sents:
- en_vocab.update(alignSent.words)
- fr_vocab.update(alignSent.mots)
- fr_vocab.add(None)
-
- align = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: float))))
+ ibm1 = IBMModel1(sentence_aligned_corpus, 10)
+ self.translation_table = ibm1.translation_table
# Initialize the distribution of alignment probability,
- # a(i|j,l_e, l_f) = 1/(l_f + 1)
- for alignSent in align_sents:
- en_set = alignSent.words
- fr_set = [None] + alignSent.mots
- l_f = len(fr_set) - 1
- l_e = len(en_set)
- initial_value = 1 / (l_f + 1)
- for i in range(0, l_f+1):
- for j in range(1, l_e+1):
- align[i][j][l_e][l_f] = initial_value
-
-
- for i in range(0, num_iter):
- count_ef = defaultdict(lambda: defaultdict(float))
- total_f = defaultdict(float)
-
- count_align = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))))
- total_align = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
-
- total_e = defaultdict(float)
-
- for alignSent in align_sents:
- en_set = alignSent.words
- fr_set = [None] + alignSent.mots
- l_f = len(fr_set) - 1
- l_e = len(en_set)
-
- # compute normalization
- for j in range(1, l_e+1):
- en_word = en_set[j-1]
- total_e[en_word] = 0
- for i in range(0, l_f+1):
- total_e[en_word] += t_ef[en_word][fr_set[i]] * align[i][j][l_e][l_f]
-
- # collect counts
- for j in range(1, l_e+1):
- en_word = en_set[j-1]
- for i in range(0, l_f+1):
- fr_word = fr_set[i]
- c = t_ef[en_word][fr_word] * align[i][j][l_e][l_f] / total_e[en_word]
- count_ef[en_word][fr_word] += c
- total_f[fr_word] += c
- count_align[i][j][l_e][l_f] += c
- total_align[j][l_e][l_f] += c
-
- # estimate probabilities
- t_ef = defaultdict(lambda: defaultdict(lambda: 0.0))
- align = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))))
-
- # Smoothing the counts for alignments
- for alignSent in align_sents:
- en_set = alignSent.words
- fr_set = [None] + alignSent.mots
- l_f = len(fr_set) - 1
- l_e = len(en_set)
-
- laplace = 1.0
- for i in range(0, l_f+1):
- for j in range(1, l_e+1):
- value = count_align[i][j][l_e][l_f]
- if 0 < value < laplace:
- laplace = value
-
- laplace *= 0.5
- for i in range(0, l_f+1):
- for j in range(1, l_e+1):
- count_align[i][j][l_e][l_f] += laplace
-
- initial_value = laplace * l_e
- for j in range(1, l_e+1):
- total_align[j][l_e][l_f] += initial_value
-
- # Estimate the new lexical translation probabilities
- for f in fr_vocab:
- for e in en_vocab:
- t_ef[e][f] = count_ef[e][f] / total_f[f]
-
- # Estimate the new alignment probabilities
- for alignSent in align_sents:
- en_set = alignSent.words
- fr_set = [None] + alignSent.mots
- l_f = len(fr_set) - 1
- l_e = len(en_set)
- for i in range(0, l_f+1):
- for j in range(1, l_e+1):
- align[i][j][l_e][l_f] = count_align[i][j][l_e][l_f] / total_align[j][l_e][l_f]
-
- return t_ef, align
-
- def align(self, align_sent):
+ # a(i | j,l,m) = 1 / (l+1) for all i, j, l, m
+ for aligned_sentence in sentence_aligned_corpus:
+ l = len(aligned_sentence.mots)
+ m = len(aligned_sentence.words)
+ initial_value = 1 / (l + 1)
+ if initial_value > IBMModel.MIN_PROB:
+ for i in range(0, l + 1):
+ for j in range(1, m + 1):
+ self.alignment_table[i][j][l][m] = initial_value
+ else:
+ warnings.warn("Source sentence is too long (" + str(l) +
+ " words). Results may be less accurate.")
+
+ self.train(sentence_aligned_corpus, iterations)
+ self.__align_all(sentence_aligned_corpus)
+
+ def train(self, parallel_corpus, iterations):
+ for i in range(0, iterations):
+ count_t_given_s = defaultdict(lambda: defaultdict(float))
+ count_any_t_given_s = defaultdict(float)
+
+ # count of i given j, l, m
+ alignment_count = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+ lambda: 0.0))))
+ alignment_count_for_any_i = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(
+ lambda: 0.0)))
+
+ for aligned_sentence in parallel_corpus:
+ src_sentence = [None] + aligned_sentence.mots
+ trg_sentence = ['UNUSED'] + aligned_sentence.words # 1-indexed
+ l = len(aligned_sentence.mots)
+ m = len(aligned_sentence.words)
+ total_count = defaultdict(float)
+
+ # E step (a): Compute normalization factors to weigh counts
+ for j in range(1, m + 1):
+ t = trg_sentence[j]
+ total_count[t] = 0
+ for i in range(0, l + 1):
+ s = src_sentence[i]
+ count = (self.translation_table[t][s] *
+ self.alignment_table[i][j][l][m])
+ total_count[t] += count
+
+ # E step (b): Collect counts
+ for j in range(1, m + 1):
+ t = trg_sentence[j]
+ for i in range(0, l + 1):
+ s = src_sentence[i]
+ count = (self.translation_table[t][s] *
+ self.alignment_table[i][j][l][m])
+ normalized_count = count / total_count[t]
+
+ count_t_given_s[t][s] += normalized_count
+ count_any_t_given_s[s] += normalized_count
+ alignment_count[i][j][l][m] += normalized_count
+ alignment_count_for_any_i[j][l][m] += normalized_count
+
+ # M step: Update probabilities with maximum likelihood estimates
+ for s in self.src_vocab:
+ for t in self.trg_vocab:
+ estimate = count_t_given_s[t][s] / count_any_t_given_s[s]
+ self.translation_table[t][s] = max(estimate,
+ IBMModel.MIN_PROB)
+
+ for aligned_sentence in parallel_corpus:
+ l = len(aligned_sentence.mots)
+ m = len(aligned_sentence.words)
+ for i in range(0, l + 1):
+ for j in range(1, m + 1):
+ estimate = (alignment_count[i][j][l][m] /
+ alignment_count_for_any_i[j][l][m])
+ self.alignment_table[i][j][l][m] = max(estimate,
+ IBMModel.MIN_PROB)
+
+ def prob_t_a_given_s(self, alignment_info):
"""
- Returns the alignment result for one sentence pair.
+ Probability of target sentence and an alignment given the
+ source sentence
"""
+ prob = 1.0
+ l = len(alignment_info.src_sentence) - 1
+ m = len(alignment_info.trg_sentence) - 1
- if self.probabilities is None or self.alignments is None:
- raise ValueError("The model does not train.")
-
- alignment = []
+ for j, i in enumerate(alignment_info.alignment):
+ if j == 0:
+                continue  # skip the dummy zeroth element
+ trg_word = alignment_info.trg_sentence[j]
+ src_word = alignment_info.src_sentence[i]
+ prob *= (self.translation_table[trg_word][src_word] *
+ self.alignment_table[i][j][l][m])
- l_e = len(align_sent.words)
- l_f = len(align_sent.mots)
+ return max(prob, IBMModel.MIN_PROB)
- for j, en_word in enumerate(align_sent.words):
-
- # Initialize the maximum probability with Null token
- max_align_prob = (self.probabilities[en_word][None]*self.alignments[0][j+1][l_e][l_f], None)
- for i, fr_word in enumerate(align_sent.mots):
- # Find out the maximum probability
- max_align_prob = max(max_align_prob,
- (self.probabilities[en_word][fr_word]*self.alignments[i+1][j+1][l_e][l_f], i))
+ def __align_all(self, parallel_corpus):
+ for sentence_pair in parallel_corpus:
+ self.__align(sentence_pair)
- # If the maximum probability is not Null token,
- # then append it to the alignment.
- if max_align_prob[1] is not None:
- alignment.append((j, max_align_prob[1]))
+ def __align(self, sentence_pair):
+ """
+ Determines the best word alignment for one sentence pair from
+ the corpus that the model was trained on.
- return AlignedSent(align_sent.words, align_sent.mots, alignment)
+ The best alignment will be set in ``sentence_pair`` when the
+ method returns. In contrast with the internal implementation of
+ IBM models, the word indices in the ``Alignment`` are zero-
+ indexed, not one-indexed.
-# run doctests
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
+ :param sentence_pair: A sentence in the source language and its
+ counterpart sentence in the target language
+ :type sentence_pair: AlignedSent
+ """
+ best_alignment = []
+
+ l = len(sentence_pair.mots)
+ m = len(sentence_pair.words)
+
+ for j, trg_word in enumerate(sentence_pair.words):
+ # Initialize trg_word to align with the NULL token
+ best_prob = (self.translation_table[trg_word][None] *
+ self.alignment_table[0][j + 1][l][m])
+ best_prob = max(best_prob, IBMModel.MIN_PROB)
+ best_alignment_point = None
+ for i, src_word in enumerate(sentence_pair.mots):
+ align_prob = (self.translation_table[trg_word][src_word] *
+ self.alignment_table[i + 1][j + 1][l][m])
+ if align_prob >= best_prob:
+ best_prob = align_prob
+ best_alignment_point = i
+
+ best_alignment.append((j, best_alignment_point))
+
+ sentence_pair.alignment = Alignment(best_alignment)
diff --git a/nltk/align/ibm3.py b/nltk/align/ibm3.py
index ec54fb4..542c024 100644
--- a/nltk/align/ibm3.py
+++ b/nltk/align/ibm3.py
@@ -6,398 +6,348 @@
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
-from __future__ import division
+"""
+Translation model that considers how a word can be aligned to
+multiple words in another language.
+
+IBM Model 3 improves on Model 2 by directly modeling the phenomenon
+where a word in one language may be translated into zero or more words
+in another. This is expressed by the fertility probability,
+n(phi | source word).
+
+If a source word translates into more than one word, it is possible to
+generate sentences that have the same alignment in multiple ways. This
+is modeled by a distortion step. The distortion probability, d(j|i,l,m),
+predicts a target word position, given its aligned source word's
+position. The distortion probability replaces the alignment probability
+of Model 2.
+
+The fertility probability is not applicable for NULL. Target words that
+align to NULL are assumed to be distributed uniformly in the target
+sentence. The existence of these words is modeled by p1, the probability
+that a target word produced by a real source word requires another
+target word that is produced by NULL.
+
+The EM algorithm used in Model 3 is:
+E step - In the training data, collect counts, weighted by prior
+ probabilities.
+ (a) count how many times a source language word is translated
+ into a target language word
+ (b) count how many times a particular position in the target
+ sentence is aligned to a particular position in the source
+ sentence
+ (c) count how many times a source word is aligned to phi number
+ of target words
+ (d) count how many times NULL is aligned to a target word
+
+M step - Estimate new probabilities based on the counts from the E step
+
+Because there are too many possible alignments, only the most probable
+ones are considered. First, the best alignment is determined using prior
+probabilities. Then, a hill climbing approach is used to find other good
+candidates.
+
+
+Notations:
+i: Position in the source sentence
+ Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
+j: Position in the target sentence
+ Valid values are 1, 2, ..., length of target sentence
+l: Number of words in the source sentence, excluding NULL
+m: Number of words in the target sentence
+s: A word in the source language
+t: A word in the target language
+phi: Fertility, the number of target words produced by a source word
+p1: Probability that a target word produced by a source word is
+ accompanied by another target word that is aligned to NULL
+p0: 1 - p1
+
+
+References:
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
+
+Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
+Robert L. Mercer. 1993. The Mathematics of Statistical Machine
+Translation: Parameter Estimation. Computational Linguistics, 19 (2),
+263-311.
+"""
+
+from __future__ import division
from collections import defaultdict
-from nltk.align import AlignedSent
-from nltk.align.ibm2 import IBMModel2
from math import factorial
+from nltk.align import AlignedSent
+from nltk.align import Alignment
+from nltk.align import IBMModel
+from nltk.align import IBMModel2
+import warnings
-class HashableDict(dict):
- """
- This class implements a hashable dict, which can be
- put into a set.
- """
- def __key(self):
- return tuple((k,self[k]) for k in sorted(self))
-
- def __hash__(self):
- return hash(self.__key())
-
- def __eq__(self, other):
- return self.__key() == other.__key()
-class IBMModel3(object):
+class IBMModel3(IBMModel):
"""
- This class implements the algorithm of Expectation Maximization for
- the IBM Model 3.
-
- Step 1 - Run a number of iterations of IBM Model 2 and get the initial
- distribution of translation probability.
-
- Step 2 - Sample the alignment spaces by using the hillclimb approach.
-
- Step 3 - Collect the evidence of translation probabilities, distortion,
- the probability of null insertion, and fertility.
-
- Step 4 - Estimate the new probabilities according to the evidence from
- Step 3.
+ Translation model that considers how a word can be aligned to
+ multiple words in another language
+
+ >>> bitext = []
+ >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
+ >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
+ >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
+ >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
+ >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
+ >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
+ >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
+ >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
+ >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
+
+ >>> ibm3 = IBMModel3(bitext, 5)
+
+ >>> print('{0:.3f}'.format(ibm3.translation_table['buch']['book']))
+ 1.000
+ >>> print('{0:.3f}'.format(ibm3.translation_table['das']['book']))
+ 0.000
+ >>> print('{0:.3f}'.format(ibm3.translation_table['ja'][None]))
+ 1.000
+
+ >>> print('{0:.3f}'.format(ibm3.distortion_table[1][1][2][2]))
+ 1.000
+ >>> print('{0:.3f}'.format(ibm3.distortion_table[1][2][2][2]))
+ 0.000
+ >>> print('{0:.3f}'.format(ibm3.distortion_table[2][2][4][5]))
+ 0.750
+
+ >>> print('{0:.3f}'.format(ibm3.fertility_table[2]['summarize']))
+ 1.000
+ >>> print('{0:.3f}'.format(ibm3.fertility_table[1]['book']))
+ 1.000
+
+ >>> print('{0:.3f}'.format(ibm3.p1))
+ 0.026
+
+ >>> test_sentence = bitext[2]
+ >>> test_sentence.words
+ ['das', 'buch', 'ist', 'ja', 'klein']
+ >>> test_sentence.mots
+ ['the', 'book', 'is', 'small']
+ >>> test_sentence.alignment
+ Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])
- >>> align_sents = []
- >>> align_sents.append(AlignedSent(['klein', 'ist', 'das', 'Haus'], ['the', 'house', 'is', 'small']))
- >>> align_sents.append(AlignedSent(['das', 'Haus'], ['the', 'house']))
- >>> align_sents.append(AlignedSent(['das', 'Buch'], ['the', 'book']))
- >>> align_sents.append(AlignedSent(['ein', 'Buch'], ['a', 'book']))
-
- >>> ibm3 = IBMModel3(align_sents, 5)
-
- >>> print('{0:.1f}'.format(ibm3.probabilities['Buch']['book']))
- 1.0
- >>> print('{0:.1f}'.format(ibm3.probabilities['das']['book']))
- 0.0
- >>> print('{0:.1f}'.format(ibm3.probabilities[None]['book']))
- 0.0
+ """
- >>> aligned_sent = ibm3.align(align_sents[0])
- >>> aligned_sent.words
- ['klein', 'ist', 'das', 'Haus']
- >>> aligned_sent.mots
- ['the', 'house', 'is', 'small']
- >>> aligned_sent.alignment
- Alignment([(0, 2), (1, 3), (2, 0), (3, 1)])
+ def __init__(self, sentence_aligned_corpus, iterations):
+ """
+ Train on ``sentence_aligned_corpus`` and create a lexical
+ translation model, a distortion model, a fertility model, and a
+ model for generating NULL-aligned words.
- """
+ Translation direction is from ``AlignedSent.mots`` to
+ ``AlignedSent.words``.
- def __init__(self, align_sents, num_iter):
- # If there is not an initial value, it throws an exception of
- # the number divided by zero. And the value of computing
- # probability will be always zero.
- self.PROB_SMOOTH = 0.1
+ Runs a few iterations of Model 2 training to initialize
+ model parameters.
- self.train(align_sents, num_iter)
+ :param sentence_aligned_corpus: Sentence-aligned parallel corpus
+ :type sentence_aligned_corpus: list(AlignedSent)
+ :param iterations: Number of iterations to run training algorithm
+ :type iterations: int
+ """
+ super(IBMModel3, self).__init__(sentence_aligned_corpus)
- def train(self, align_sents, num_iter):
+ self.distortion_table = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+ lambda: self.MIN_PROB))))
"""
- This function is the main process of training model, which
- initialize all the probability distributions and executes
- a specific number of iterations.
+ dict[int][int][int][int]: float. Probability(j | i,l,m).
+ Values accessed as ``distortion_table[j][i][l][m]``.
"""
- # Get the translation and alignment probabilities from IBM model 2
- ibm2 = IBMModel2(align_sents, num_iter)
- self.probabilities, self.align_table = ibm2.probabilities, ibm2.alignments
-
- fr_vocab = set()
- en_vocab = set()
- for alignSent in align_sents:
- en_vocab.update(alignSent.words)
- fr_vocab.update(alignSent.mots)
- fr_vocab.add(None)
- # Initial probability of null insertion.
- self.null_insertion = 0.5
-
- self.fertility = defaultdict(lambda: defaultdict(lambda: self.PROB_SMOOTH))
- self.distortion = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: self.PROB_SMOOTH))))
-
- for k in range(0, num_iter):
- max_fert = 0
- # Set all count* and total* to 0
- count_t = defaultdict(lambda: defaultdict(lambda: 0.0))
- total_t = defaultdict(lambda: 0.0)
-
- count_d = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))))
- total_d = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+ # Get the translation and alignment probabilities from IBM model 2
+ ibm2 = IBMModel2(sentence_aligned_corpus, iterations)
+ self.translation_table = ibm2.translation_table
+
+ # Alignment table is only used for hill climbing and is not part
+ # of the output of Model 3 training
+ self.alignment_table = ibm2.alignment_table
+
+ # Initialize the distribution of distortion probability,
+ # d(j | i,l,m) = 1 / m for all i, j, l, m
+ for aligned_sentence in sentence_aligned_corpus:
+ l = len(aligned_sentence.mots)
+ m = len(aligned_sentence.words)
+ initial_value = 1 / m
+ if initial_value > IBMModel.MIN_PROB:
+ for i in range(0, l + 1):
+ for j in range(1, m + 1):
+ self.distortion_table[j][i][l][m] = initial_value
+ else:
+ warnings.warn("Target sentence is too long (" + str(m) +
+ " words). Results may be less accurate.")
+
+ self.train(sentence_aligned_corpus, iterations)
+
+ def train(self, parallel_corpus, iterations):
+ for k in range(0, iterations):
+ max_fertility = 0
+
+ # Reset all counts
+ count_t_given_s = defaultdict(lambda: defaultdict(lambda: 0.0))
+ count_any_t_given_s = defaultdict(lambda: 0.0)
+
+ distortion_count = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+ lambda: 0.0))))
+ distortion_count_for_any_j = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
count_p0 = 0.0
count_p1 = 0.0
- count_f = defaultdict(lambda: defaultdict(lambda: 0.0))
- total_f = defaultdict(lambda: 0.0)
+ fertility_count = defaultdict(lambda: defaultdict(lambda: 0.0))
+ fertility_count_for_any_phi = defaultdict(lambda: 0.0)
- for alignSent in align_sents:
-
- en_set = alignSent.words
- fr_set = [None] + alignSent.mots
- l_f = len(fr_set) - 1
- l_e = len(en_set)
+ for aligned_sentence in parallel_corpus:
+ src_sentence = [None] + aligned_sentence.mots
+ trg_sentence = ['UNUSED'] + aligned_sentence.words # 1-indexed
+ l = len(aligned_sentence.mots)
+ m = len(aligned_sentence.words)
# Sample the alignment space
- A = self.sample( en_set, fr_set)
-
- # Collect counts
- c_total = 0.0
-
- for (a, fert) in A:
- c_total += self.probability(a, en_set, fr_set, fert)
-
- for (a, fert) in A:
- c = self.probability(a, en_set, fr_set, fert)/c_total
- null = 0
-
- for j in range(1, l_e+1):
- en_word = en_set[j-1]
- fr_word = fr_set[a[j]]
+ sampled_alignments, best_alignment = self.sample(
+ aligned_sentence)
+ # Record the most probable alignment
+ aligned_sentence.alignment = Alignment(
+ best_alignment.zero_indexed_alignment())
+
+ total_count = 0.0
+
+ # E step (a): Compute normalization factors to weigh counts
+ for alignment_info in sampled_alignments:
+ count = self.prob_t_a_given_s(alignment_info)
+ total_count += count
+
+ # E step (b): Collect counts
+ for alignment_info in sampled_alignments:
+ count = self.prob_t_a_given_s(alignment_info)
+ normalized_count = count / total_count
+ null_count = 0
+
+ for j in range(1, m + 1):
+ t = trg_sentence[j]
+ i = alignment_info.alignment[j]
+ s = src_sentence[i]
# Lexical translation
- count_t[en_word][fr_word] += c
- total_t[fr_word] += c
+ count_t_given_s[t][s] += normalized_count
+ count_any_t_given_s[s] += normalized_count
# Distortion
- count_d[j][a[j]][l_e][l_f] += c
- total_d[a[j]][l_e][l_f] += c
+ distortion_count[j][i][l][m] += normalized_count
+ distortion_count_for_any_j[i][l][m] += normalized_count
- if a[j] == 0:
- null += 1
+ if i == 0:
+ null_count += 1
- # Collect the counts of null insetion
- count_p1 += null * c
- count_p0 += (l_e - 2 * null) * c
+ # NULL-aligned words generation
+ count_p1 += null_count * normalized_count
+ count_p0 += (m - 2 * null_count) * normalized_count
- # Collect the counts of fertility
- for i in range(0, l_f+1):
+ # Fertility
+ for i in range(0, l + 1):
fertility = 0
- for j in range(1, l_e+1):
- if i == a[j]:
+ for j in range(1, m + 1):
+ if i == alignment_info.alignment[j]:
fertility += 1
- fr_word = fr_set[i]
- count_f[fertility][fr_word] += c
- total_f[fr_word] += c
-
- if fertility > max_fert:
- max_fert = fertility
-
-
- self.probabilities = defaultdict(lambda: defaultdict(lambda: 0.0))
- self.distortion = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))))
- self.fertility = defaultdict(lambda: defaultdict(lambda: 0.0))
-
- # Estimate translation probability distribution
- for f in fr_vocab:
- for e in en_vocab:
- self.probabilities[e][f] = count_t[e][f] / total_t[f]
-
- # Estimate distortion
- for alignSent in align_sents:
- en_set = alignSent.words
- fr_set = [None] + alignSent.mots
- l_f = len(fr_set) - 1
- l_e = len(en_set)
-
- for i in range(0, l_f+1):
- for j in range(1, l_e+1):
- self.distortion[j][i][l_e][l_f] = count_d[j][i][l_e][l_f] / total_d[i][l_e][l_f]
-
- # Estimate the fertility, n(Fertility | input word)
- for ferti in range(0, max_fert+1):
- for fr_word in fr_vocab:
- self.fertility[ferti][fr_word] = count_f[ferti][fr_word] / total_f[fr_word]
-
- # Estimate the probability of null insertion
- p1 = count_p1 / (count_p1+count_p0)
- self.null_insertion = 1 - p1
-
- def sample(self, e, f):
- """
- This function returns a sample from the entire alignment space.
- First, it pegs one alignment point and finds out the best alignment
- through the IBM model 2. Then, using the hillclimb approach, it
- finds out the best alignment on local and returns all its neighborings,
- which are swapped or moved one distance from the best alignment.
- """
- A = set()
-
- le = len(e)
- lf = len(f) - 1
-
- # Compute Normalization
- for i in range(0, lf+1):
- for j in range(1, le+1):
- a = HashableDict()
- fert = HashableDict()
- # Initialize all fertility to zero
- for ii in range(0, lf+1):
- fert[ii] = 0
-
- # Pegging one alignment point
- a[j] = i
- fert[i] = 1
-
- for jj in range(1, le+1):
- if jj != j:
- # Find the best alignment according to model 2
- maxalignment = 0
- besti = 1
-
- for ii in range(0, lf+1):
- # Notice that the probabilities returned by IBM model 2,
- # which is not distortion, is alignment.
- #
- # The alignment probability predicts foreign input word
- # positions conditioned on English output word positions.
- # However, the distortion probability in a reverse direction
- # predicts the output word position based on input word
- # position.
- #
- # Actually, you cannot just change the index to get a
- # distortion from alignment table, because its process of
- # collecting evidence is different from each other.
- alignment = self.probabilities[e[jj-1]][f[ii]] * self.align_table[ii][jj][le][lf]
- if alignment > maxalignment:
- maxalignment = alignment
- besti = ii
-
- a[jj] = besti
- fert[besti] += 1
-
- a = self.hillclimb(a, j, e, f, fert)
- neighbor = self.neighboring(a, j, e, f, fert)
- A.update(neighbor)
-
- return A
-
- def hillclimb(self, a, j_pegged, es, fs, fert):
- """
- This function returns the best alignment on local. It gets
- some neighboring alignments and finds out the alignment with
- highest probability in those alignment spaces. If the current
- alignment recorded has the highest probability, then stop the
- search loop. If not, then continue the search loop until it
- finds out the highest probability of alignment in local.
- """
- so_far_fert = fert
-
- while True:
- a_old = a
-
- for (a_nerghbor, neighbor_Fert) in self.neighboring(a, j_pegged, es, fs, so_far_fert):
- if self.probability(a_nerghbor, es, fs, neighbor_Fert) > self.probability(a, es, fs, so_far_fert):
- # If the probability of an alignment is higher than
- # the current alignment recorded, then replace the
- # current one.
- a = a_nerghbor
- so_far_fert = neighbor_Fert
-
- if a == a_old:
- # Until this alignment is the highest one in local
- break
-
- return a
-
- def probability(self, a, es, fs, Fert):
- """
- This function returns the probability given an alignment.
- The Fert variable is math syntax 'Phi' in the fomula, which
- represents the fertility according to the current alignment,
- which records how many output words are generated by each
- input word.
- """
- l_e = len(es)
- l_f = len(fs) - 1
- p1 = 1 - self.null_insertion
-
- total = 1.0
-
- # Compute the NULL insertation
- total *= pow(p1, Fert[0]) * pow(self.null_insertion, l_e - 2 * Fert[0])
- if total == 0:
- return total
-
- # Compute the combination (l_e - Fert[0]) choose Fert[0]
- for i in range(1, Fert[0]+1):
- total *= (l_e - Fert[0] - i + 1) / i
- if total == 0:
- return total
-
- # Compute fertilities term
- for i in range(1, l_f+1):
- total *= factorial(Fert[i]) * self.fertility[Fert[i]][fs[i]]
- if total == 0:
- return total
-
- # Multiply the lexical and distortion probabilities
- for j in range(1, l_e+1):
- en_word = es[j-1]
- fr_word = fs[a[j]]
-
- total *= self.probabilities[en_word][fr_word]
- total *= self.distortion[j][a[j]][l_e][l_f]
- if total == 0:
- return total
-
- return total
-
- def neighboring(self, a, j_pegged, es, fs, fert):
- """
- This function returns the neighboring alignments from
- the given alignment by moving or swapping one distance.
- """
- N = set()
-
- l_e = len(es)
- l_f = len(fs) - 1
-
- for j in range(1, l_e+1):
- if j != j_pegged:
- # Moves
- for i in range(0, l_f+1):
- new_align = HashableDict(a)
- new_align[j] = i
-
- new_fert = fert
- if new_fert[a[j]] > 0:
- new_fert = HashableDict(fert)
- new_fert[a[j]] -= 1
- new_fert[i] += 1
-
- N.update([(new_align, new_fert)])
-
-
- for j_one in range(1, l_e+1):
- if j_one != j_pegged:
- # Swaps
- for j_two in range(1, l_e+1):
- if j_two != j_pegged and j_two != j_one:
- new_align = HashableDict(a)
- new_fert = fert
- new_align[j_one] = a[j_two]
- new_align[j_two] = a[j_one]
-
- N.update([(new_align, new_fert)])
-
- return N
-
- def align(self, align_sent):
+ s = src_sentence[i]
+ fertility_count[fertility][s] += normalized_count
+ fertility_count_for_any_phi[s] += normalized_count
+
+ if fertility > max_fertility:
+ max_fertility = fertility
+
+ # M step: Update probabilities with maximum likelihood estimates
+ # If any probability is less than MIN_PROB, clamp it to MIN_PROB
+ MIN_PROB = IBMModel.MIN_PROB
+
+ # Lexical translation
+ for s in self.src_vocab:
+ for t in self.trg_vocab:
+ estimate = count_t_given_s[t][s] / count_any_t_given_s[s]
+ self.translation_table[t][s] = max(estimate, MIN_PROB)
+
+ # Distortion
+ for aligned_sentence in parallel_corpus:
+ l = len(aligned_sentence.mots)
+ m = len(aligned_sentence.words)
+
+ for i in range(0, l + 1):
+ for j in range(1, m + 1):
+ estimate = (distortion_count[j][i][l][m] /
+ distortion_count_for_any_j[i][l][m])
+ self.distortion_table[j][i][l][m] = max(estimate,
+ MIN_PROB)
+
+ # Fertility
+ for fertility in range(0, max_fertility + 1):
+ for s in self.src_vocab:
+ estimate = (fertility_count[fertility][s] /
+ fertility_count_for_any_phi[s])
+ self.fertility_table[fertility][s] = max(estimate, MIN_PROB)
+
+ # NULL-aligned words generation
+ p1_estimate = count_p1 / (count_p1 + count_p0)
+ p1_estimate = max(p1_estimate, MIN_PROB)
+
+ # Clip p1 if it is too large, because p0 = 1 - p1 should
+ # not be smaller than MIN_PROB
+ self.p1 = min(p1_estimate, 1 - MIN_PROB)
+
+ def prob_t_a_given_s(self, alignment_info):
"""
- Returns the alignment result for one sentence pair.
+ Probability of target sentence and an alignment given the
+ source sentence
"""
-
- if self.probabilities is None or self.distortion is None:
- raise ValueError("The model does not train.")
-
- alignment = []
-
- l_e = len(align_sent.words)
- l_f = len(align_sent.mots)
-
- for j, en_word in enumerate(align_sent.words):
-
- # Initialize the maximum probability with Null token
- max_align_prob = (self.probabilities[en_word][None]*self.distortion[j+1][0][l_e][l_f], 0)
- for i, fr_word in enumerate(align_sent.mots):
- # Find out the maximum probability
- max_align_prob = max(max_align_prob,
- (self.probabilities[en_word][fr_word]*self.distortion[j+1][i+1][l_e][l_f], i))
-
- # If the maximum probability is not Null token,
- # then append it to the alignment.
- if max_align_prob[1] is not None:
- alignment.append((j, max_align_prob[1]))
-
- return AlignedSent(align_sent.words, align_sent.mots, alignment)
-
-# run doctests
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
+ src_sentence = alignment_info.src_sentence
+ trg_sentence = alignment_info.trg_sentence
+ l = len(src_sentence) - 1 # exclude NULL
+ m = len(trg_sentence) - 1
+ p1 = self.p1
+ p0 = 1 - p1
+
+ probability = 1.0
+ MIN_PROB = IBMModel.MIN_PROB
+
+ # Combine NULL insertion probability
+ null_fertility = alignment_info.fertility_of_i(0)
+ probability *= (pow(p1, null_fertility) *
+ pow(p0, m - 2 * null_fertility))
+ if probability < MIN_PROB:
+ return MIN_PROB
+
+ # Compute combination (m - null_fertility) choose null_fertility
+ for i in range(1, null_fertility + 1):
+ probability *= (m - null_fertility - i + 1) / i
+ if probability < MIN_PROB:
+ return MIN_PROB
+
+ # Combine fertility probabilities
+ for i in range(1, l + 1):
+ fertility = alignment_info.fertility_of_i(i)
+ probability *= (factorial(fertility) *
+ self.fertility_table[fertility][src_sentence[i]])
+ if probability < MIN_PROB:
+ return MIN_PROB
+
+ # Combine lexical and distortion probabilities
+ for j in range(1, m + 1):
+ t = trg_sentence[j]
+ i = alignment_info.alignment[j]
+ s = src_sentence[i]
+
+ probability *= (self.translation_table[t][s] *
+ self.distortion_table[j][i][l][m])
+ if probability < MIN_PROB:
+ return MIN_PROB
+
+ return probability
diff --git a/nltk/align/ibm4.py b/nltk/align/ibm4.py
new file mode 100644
index 0000000..6726e0b
--- /dev/null
+++ b/nltk/align/ibm4.py
@@ -0,0 +1,473 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: IBM Model 4
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Tah Wei Hoon <hoon.tw at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Translation model that reorders output words based on their type and
+distance from other related words in the output sentence.
+
+IBM Model 4 improves the distortion model of Model 3, motivated by the
+observation that certain words tend to be re-ordered in a predictable
+way relative to one another. For example, <adjective><noun> in English
+usually has its order flipped as <noun><adjective> in French.
+
+Model 4 requires words in the source and target vocabularies to be
+categorized into classes. This can be linguistically driven, like parts
+of speech (adjective, nouns, prepositions, etc). Word classes can also
+be obtained by statistical methods. The original IBM Model 4 uses an
+information theoretic approach to group words into 50 classes for each
+vocabulary.
+
+Terminology:
+Cept:
+ A source word with non-zero fertility i.e. aligned to one or more
+ target words.
+Tablet:
+ The set of target word(s) aligned to a cept.
+Head of cept:
+ The first word of the tablet of that cept.
+Center of cept:
+ The average position of the words in that cept's tablet. If the
+ value is not an integer, the ceiling is taken.
+ For example, for a tablet with words in positions 2, 5, 6 in the
+ target sentence, the center of the corresponding cept is
+ ceil((2 + 5 + 6) / 3) = 5
+Displacement:
+ For a head word, defined as (position of head word - position of
+ previous cept's center). Can be positive or negative.
+ For a non-head word, defined as (position of non-head word -
+ position of previous word in the same tablet). Always positive,
+ because successive words in a tablet are assumed to appear to the
+ right of the previous word.
+
+In contrast to Model 3 which reorders words in a tablet independently of
+other words, Model 4 distinguishes between three cases.
+(1) Words generated by NULL are distributed uniformly.
+(2) For a head word t, its position is modeled by the probability
+ d_head(displacement | word_class_s(s),word_class_t(t)),
+ where s is the previous cept, and word_class_s and word_class_t maps
+ s and t to a source and target language word class respectively.
+(3) For a non-head word t, its position is modeled by the probability
+ d_non_head(displacement | word_class_t(t))
+
+The EM algorithm used in Model 4 is:
+E step - In the training data, collect counts, weighted by prior
+ probabilities.
+ (a) count how many times a source language word is translated
+ into a target language word
+ (b) for a particular word class, count how many times a head
+ word is located at a particular displacement from the
+ previous cept's center
+ (c) for a particular word class, count how many times a
+ non-head word is located at a particular displacement from
+ the previous target word
+ (d) count how many times a source word is aligned to phi number
+ of target words
+ (e) count how many times NULL is aligned to a target word
+
+M step - Estimate new probabilities based on the counts from the E step
+
+Like Model 3, there are too many possible alignments to consider. Thus,
+a hill climbing approach is used to sample good candidates.
+
+
+Notations:
+i: Position in the source sentence
+ Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
+j: Position in the target sentence
+ Valid values are 1, 2, ..., length of target sentence
+l: Number of words in the source sentence, excluding NULL
+m: Number of words in the target sentence
+s: A word in the source language
+t: A word in the target language
+phi: Fertility, the number of target words produced by a source word
+p1: Probability that a target word produced by a source word is
+ accompanied by another target word that is aligned to NULL
+p0: 1 - p1
+dj: Displacement, Δj
+
+
+References:
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
+
+Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
+Robert L. Mercer. 1993. The Mathematics of Statistical Machine
+Translation: Parameter Estimation. Computational Linguistics, 19 (2),
+263-311.
+"""
+
+from __future__ import division
+from collections import defaultdict
+from math import factorial
+from nltk.align import AlignedSent
+from nltk.align import Alignment
+from nltk.align import IBMModel
+from nltk.align import IBMModel3
+from nltk.align.ibm_model import Counts
+from nltk.align.ibm_model import longest_target_sentence_length
+import warnings
+
+
+class IBMModel4(IBMModel):
+ """
+ Translation model that reorders output words based on their type and
+ their distance from other related words in the output sentence
+
+ >>> bitext = []
+ >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
+ >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
+ >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
+ >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
+ >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
+ >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
+ >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
+ >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
+ >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
+ >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'i': 4, 'summarize': 5 }
+ >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 }
+
+ >>> ibm4 = IBMModel4(bitext, 5, src_classes, trg_classes)
+
+ >>> print('{0:.3f}'.format(ibm4.translation_table['buch']['book']))
+ 1.000
+ >>> print('{0:.3f}'.format(ibm4.translation_table['das']['book']))
+ 0.000
+ >>> print('{0:.3f}'.format(ibm4.translation_table['ja'][None]))
+ 1.000
+
+ >>> print('{0:.3f}'.format(ibm4.head_distortion_table[1][0][1]))
+ 1.000
+ >>> print('{0:.3f}'.format(ibm4.head_distortion_table[2][0][1]))
+ 0.000
+ >>> print('{0:.3f}'.format(ibm4.non_head_distortion_table[3][6]))
+ 0.500
+
+ >>> print('{0:.3f}'.format(ibm4.fertility_table[2]['summarize']))
+ 1.000
+ >>> print('{0:.3f}'.format(ibm4.fertility_table[1]['book']))
+ 1.000
+
+ >>> print('{0:.3f}'.format(ibm4.p1))
+ 0.033
+
+ >>> test_sentence = bitext[2]
+ >>> test_sentence.words
+ ['das', 'buch', 'ist', 'ja', 'klein']
+ >>> test_sentence.mots
+ ['the', 'book', 'is', 'small']
+ >>> test_sentence.alignment
+ Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])
+
+ """
+
+ def __init__(self, sentence_aligned_corpus, iterations,
+ source_word_classes, target_word_classes,
+ probability_tables=None):
+ """
+ Train on ``sentence_aligned_corpus`` and create a lexical
+ translation model, distortion models, a fertility model, and a
+ model for generating NULL-aligned words.
+
+ Translation direction is from ``AlignedSent.mots`` to
+ ``AlignedSent.words``.
+
+ Runs a few iterations of Model 3 training to initialize
+ model parameters.
+
+ :param sentence_aligned_corpus: Sentence-aligned parallel corpus
+ :type sentence_aligned_corpus: list(AlignedSent)
+
+ :param iterations: Number of iterations to run training algorithm
+ :type iterations: int
+
+ :param source_word_classes: Lookup table that maps a source word
+ to its word class, the latter represented by an integer id
+ :type source_word_classes: dict[str]: int
+
+ :param target_word_classes: Lookup table that maps a target word
+ to its word class, the latter represented by an integer id
+ :type target_word_classes: dict[str]: int
+
+ :param probability_tables: Optional. Use this to pass in custom
+ probability values. If not specified, probabilities will be
+ set to a uniform distribution, or some other sensible value.
+ If specified, all the following entries must be present:
+ ``translation_table``, ``alignment_table``,
+ ``fertility_table``, ``p1``, ``head_distortion_table``,
+ ``non_head_distortion_table``. See ``IBMModel`` and
+ ``IBMModel4`` for the type and purpose of these tables.
+ :type probability_tables: dict[str]: object
+ """
+ super(IBMModel4, self).__init__(sentence_aligned_corpus)
+ self.reset_probabilities()
+ self.src_classes = source_word_classes
+ self.trg_classes = target_word_classes
+
+ if probability_tables is None:
+ # Get probabilities from IBM model 3
+ ibm3 = IBMModel3(sentence_aligned_corpus, iterations)
+ self.translation_table = ibm3.translation_table
+ self.alignment_table = ibm3.alignment_table
+ self.fertility_table = ibm3.fertility_table
+ self.p1 = ibm3.p1
+ self.set_uniform_distortion_probabilities(sentence_aligned_corpus)
+ else:
+ # Set user-defined probabilities
+ self.translation_table = probability_tables['translation_table']
+ self.alignment_table = probability_tables['alignment_table']
+ self.fertility_table = probability_tables['fertility_table']
+ self.p1 = probability_tables['p1']
+ self.head_distortion_table = probability_tables[
+ 'head_distortion_table']
+ self.non_head_distortion_table = probability_tables[
+ 'non_head_distortion_table']
+
+ for k in range(0, iterations):
+ self.train(sentence_aligned_corpus)
+
+ def reset_probabilities(self):
+ super(IBMModel4, self).reset_probabilities()
+ self.head_distortion_table = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)))
+ """
+ dict[int][int][int]: float. Probability(displacement of head
+ word | word class of previous cept,target word class).
+    Values accessed as ``head_distortion_table[dj][src_class][trg_class]``.
+ """
+
+ self.non_head_distortion_table = defaultdict(
+ lambda: defaultdict(lambda: self.MIN_PROB))
+ """
+ dict[int][int]: float. Probability(displacement of non-head
+ word | target word class).
+    Values accessed as ``non_head_distortion_table[dj][trg_class]``.
+ """
+
+ def set_uniform_distortion_probabilities(self, sentence_aligned_corpus):
+ """
+ Set distortion probabilities uniformly to
+ 1 / cardinality of displacement values
+ """
+ max_m = longest_target_sentence_length(sentence_aligned_corpus)
+
+ # The maximum displacement is m-1, when a word is in the last
+ # position m of the target sentence and the previously placed
+ # word is in the first position.
+ # Conversely, the minimum displacement is -(m-1).
+ # Thus, the displacement range is (m-1) - (-(m-1)). Note that
+ # displacement cannot be zero and is not included in the range.
+ if max_m <= 1:
+ initial_prob = IBMModel.MIN_PROB
+ else:
+ initial_prob = float(1) / (2 * (max_m - 1))
+ if initial_prob < IBMModel.MIN_PROB:
+ warnings.warn("A target sentence is too long (" + str(max_m) +
+ " words). Results may be less accurate.")
+
+ for dj in range(1, max_m):
+ self.head_distortion_table[dj] = defaultdict(
+ lambda: defaultdict(lambda: initial_prob))
+ self.head_distortion_table[-dj] = defaultdict(
+ lambda: defaultdict(lambda: initial_prob))
+ self.non_head_distortion_table[dj] = defaultdict(
+ lambda: initial_prob)
+ self.non_head_distortion_table[-dj] = defaultdict(
+ lambda: initial_prob)
+
+ def train(self, parallel_corpus):
+ # Reset all counts
+ counts = Model4Counts()
+
+ for aligned_sentence in parallel_corpus:
+ m = len(aligned_sentence.words)
+
+ # Sample the alignment space
+ sampled_alignments, best_alignment = self.sample(aligned_sentence)
+ # Record the most probable alignment
+ aligned_sentence.alignment = Alignment(
+ best_alignment.zero_indexed_alignment())
+
+ # E step (a): Compute normalization factors to weigh counts
+ total_count = self.prob_of_alignments(sampled_alignments)
+
+ # E step (b): Collect counts
+ for alignment_info in sampled_alignments:
+ count = self.prob_t_a_given_s(alignment_info)
+ normalized_count = count / total_count
+
+ for j in range(1, m + 1):
+ counts.update_lexical_translation(
+ normalized_count, alignment_info, j)
+ counts.update_distortion(
+ normalized_count, alignment_info, j,
+ self.src_classes, self.trg_classes)
+
+ counts.update_null_generation(normalized_count, alignment_info)
+ counts.update_fertility(normalized_count, alignment_info)
+
+ # M step: Update probabilities with maximum likelihood estimates
+ # If any probability is less than MIN_PROB, clamp it to MIN_PROB
+ existing_alignment_table = self.alignment_table
+ self.reset_probabilities()
+ # don't retrain alignment table
+ self.alignment_table = existing_alignment_table
+
+ self.maximize_lexical_translation_probabilities(counts)
+ self.maximize_distortion_probabilities(counts)
+ self.maximize_fertility_probabilities(counts)
+ self.maximize_null_generation_probabilities(counts)
+
+ def maximize_distortion_probabilities(self, counts):
+ head_d_table = self.head_distortion_table
+ for dj, src_classes in counts.head_distortion.items():
+ for s_cls, trg_classes in src_classes.items():
+ for t_cls in trg_classes:
+ estimate = (counts.head_distortion[dj][s_cls][t_cls] /
+ counts.head_distortion_for_any_dj[s_cls][t_cls])
+ head_d_table[dj][s_cls][t_cls] = max(estimate,
+ IBMModel.MIN_PROB)
+
+ non_head_d_table = self.non_head_distortion_table
+ for dj, trg_classes in counts.non_head_distortion.items():
+ for t_cls in trg_classes:
+ estimate = (counts.non_head_distortion[dj][t_cls] /
+ counts.non_head_distortion_for_any_dj[t_cls])
+ non_head_d_table[dj][t_cls] = max(estimate, IBMModel.MIN_PROB)
+
+ def prob_t_a_given_s(self, alignment_info):
+ """
+ Probability of target sentence and an alignment given the
+ source sentence
+ """
+ return IBMModel4.model4_prob_t_a_given_s(alignment_info, self)
+
+ @staticmethod # exposed for Model 5 to use
+ def model4_prob_t_a_given_s(alignment_info, ibm_model):
+ probability = 1.0
+ MIN_PROB = IBMModel.MIN_PROB
+
+ def null_generation_term():
+ # Binomial distribution: B(m - null_fertility, p1)
+ value = 1.0
+ p1 = ibm_model.p1
+ p0 = 1 - p1
+ null_fertility = alignment_info.fertility_of_i(0)
+ m = len(alignment_info.trg_sentence) - 1
+ value *= (pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility))
+ if value < MIN_PROB:
+ return MIN_PROB
+
+ # Combination: (m - null_fertility) choose null_fertility
+ for i in range(1, null_fertility + 1):
+ value *= (m - null_fertility - i + 1) / i
+ return value
+
+ def fertility_term():
+ value = 1.0
+ src_sentence = alignment_info.src_sentence
+ for i in range(1, len(src_sentence)):
+ fertility = alignment_info.fertility_of_i(i)
+ value *= (factorial(fertility) *
+ ibm_model.fertility_table[fertility][src_sentence[i]])
+ if value < MIN_PROB:
+ return MIN_PROB
+ return value
+
+ def lexical_translation_term(j):
+ t = alignment_info.trg_sentence[j]
+ i = alignment_info.alignment[j]
+ s = alignment_info.src_sentence[i]
+ return ibm_model.translation_table[t][s]
+
+ def distortion_term(j):
+ t = alignment_info.trg_sentence[j]
+ i = alignment_info.alignment[j]
+ if i == 0:
+ # case 1: t is aligned to NULL
+ return 1.0
+ if alignment_info.is_head_word(j):
+ # case 2: t is the first word of a tablet
+ previous_cept = alignment_info.previous_cept(j)
+ src_class = None
+ if previous_cept is not None:
+ previous_s = alignment_info.src_sentence[previous_cept]
+ src_class = ibm_model.src_classes[previous_s]
+ trg_class = ibm_model.trg_classes[t]
+ dj = j - alignment_info.center_of_cept(previous_cept)
+ return ibm_model.head_distortion_table[dj][src_class][trg_class]
+
+ # case 3: t is a subsequent word of a tablet
+ previous_position = alignment_info.previous_in_tablet(j)
+ trg_class = ibm_model.trg_classes[t]
+ dj = j - previous_position
+ return ibm_model.non_head_distortion_table[dj][trg_class]
+ # end nested functions
+
+ # Abort computation whenever probability falls below MIN_PROB at
+ # any point, since MIN_PROB can be considered as zero
+ probability *= null_generation_term()
+ if probability < MIN_PROB:
+ return MIN_PROB
+
+ probability *= fertility_term()
+ if probability < MIN_PROB:
+ return MIN_PROB
+
+ for j in range(1, len(alignment_info.trg_sentence)):
+ probability *= lexical_translation_term(j)
+ if probability < MIN_PROB:
+ return MIN_PROB
+
+ probability *= distortion_term(j)
+ if probability < MIN_PROB:
+ return MIN_PROB
+
+ return probability
+
+
+class Model4Counts(Counts):
+ """
+ Data object to store counts of various parameters during training.
+ Include counts for distortion.
+ """
+ def __init__(self):
+ super(Model4Counts, self).__init__()
+ self.head_distortion = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
+ self.head_distortion_for_any_dj = defaultdict(
+ lambda: defaultdict(lambda: 0.0))
+ self.non_head_distortion = defaultdict(
+ lambda: defaultdict(lambda: 0.0))
+ self.non_head_distortion_for_any_dj = defaultdict(lambda: 0.0)
+
+ def update_distortion(self, count, alignment_info, j,
+ src_classes, trg_classes):
+ i = alignment_info.alignment[j]
+ t = alignment_info.trg_sentence[j]
+ if i == 0:
+ # case 1: t is aligned to NULL
+ pass
+ elif alignment_info.is_head_word(j):
+ # case 2: t is the first word of a tablet
+ previous_cept = alignment_info.previous_cept(j)
+ if previous_cept is not None:
+ previous_src_word = alignment_info.src_sentence[previous_cept]
+ src_class = src_classes[previous_src_word]
+ else:
+ src_class = None
+ trg_class = trg_classes[t]
+ dj = j - alignment_info.center_of_cept(previous_cept)
+ self.head_distortion[dj][src_class][trg_class] += count
+ self.head_distortion_for_any_dj[src_class][trg_class] += count
+ else:
+ # case 3: t is a subsequent word of a tablet
+ previous_j = alignment_info.previous_in_tablet(j)
+ trg_class = trg_classes[t]
+ dj = j - previous_j
+ self.non_head_distortion[dj][trg_class] += count
+ self.non_head_distortion_for_any_dj[trg_class] += count
diff --git a/nltk/align/ibm5.py b/nltk/align/ibm5.py
new file mode 100644
index 0000000..e50129e
--- /dev/null
+++ b/nltk/align/ibm5.py
@@ -0,0 +1,642 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: IBM Model 5
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Tah Wei Hoon <hoon.tw at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Translation model that keeps track of vacant positions in the target
+sentence to decide where to place translated words.
+
+Translation can be viewed as a process where each word in the source
+sentence is stepped through sequentially, generating translated words
+for each source word. The target sentence can be viewed as being made
+up of ``m`` empty slots initially, which gradually fill up as generated
+words are placed in them.
+
+Models 3 and 4 use distortion probabilities to decide how to place
+translated words. For simplicity, these models ignore the history of
+which slots have already been occupied with translated words.
+Consider the placement of the last translated word: there is only one
+empty slot left in the target sentence, so the distortion probability
+should be 1.0 for that position and 0.0 everywhere else. However, the
+distortion probabilities for Models 3 and 4 are set up such that all
+positions are under consideration.
+
+IBM Model 5 fixes this deficiency by accounting for occupied slots
+during translation. It introduces the vacancy function v(j), the number
+of vacancies up to, and including, position j in the target sentence.
+
+Terminology:
+Maximum vacancy:
+ The number of valid slots that a word can be placed in.
+ This is not necessarily the same as the number of vacant slots.
+ For example, if a tablet contains more than one word, the head word
+ cannot be placed at the last vacant slot because there will be no
+ space for the other words in the tablet. The number of valid slots
+ has to take into account the length of the tablet.
+ Non-head words cannot be placed before the head word, so vacancies
+ to the left of the head word are ignored.
+Vacancy difference:
+ For a head word: (v(j) - v(center of previous cept))
+ Can be positive or negative.
+ For a non-head word: (v(j) - v(position of previously placed word))
+ Always positive, because successive words in a tablet are assumed to
+ appear to the right of the previous word.
+
+Positioning of target words falls under three cases:
+(1) Words generated by NULL are distributed uniformly
+(2) For a head word t, its position is modeled by the probability
+ v_head(dv | max_v,word_class_t(t))
+(3) For a non-head word t, its position is modeled by the probability
+ v_non_head(dv | max_v,word_class_t(t))
+dv and max_v are defined differently for head and non-head words.
+
+The EM algorithm used in Model 5 is:
+E step - In the training data, collect counts, weighted by prior
+ probabilities.
+ (a) count how many times a source language word is translated
+ into a target language word
+ (b) for a particular word class and maximum vacancy, count how
+ many times a head word and the previous cept's center have
+ a particular difference in number of vacancies
+        (c) for a particular word class and maximum vacancy, count how
+ many times a non-head word and the previous target word
+ have a particular difference in number of vacancies
+ (d) count how many times a source word is aligned to phi number
+ of target words
+ (e) count how many times NULL is aligned to a target word
+
+M step - Estimate new probabilities based on the counts from the E step
+
+Like Model 4, there are too many possible alignments to consider. Thus,
+a hill climbing approach is used to sample good candidates. In addition,
+pruning is used to weed out unlikely alignments based on Model 4 scores.
+
+
+Notations:
+i: Position in the source sentence
+ Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
+j: Position in the target sentence
+ Valid values are 1, 2, ..., length of target sentence
+l: Number of words in the source sentence, excluding NULL
+m: Number of words in the target sentence
+s: A word in the source language
+t: A word in the target language
+phi: Fertility, the number of target words produced by a source word
+p1: Probability that a target word produced by a source word is
+ accompanied by another target word that is aligned to NULL
+p0: 1 - p1
+max_v: Maximum vacancy
+dv: Vacancy difference, Δv
+
+The definition of v_head here differs from GIZA++, section 4.7 of
+[Brown et al., 1993], and [Koehn, 2010]. In the latter cases, v_head is
+v_head(v(j) | v(center of previous cept),max_v,word_class(t)).
+
+Here, we follow appendix B of [Brown et al., 1993] and combine v(j) with
+v(center of previous cept) to obtain dv:
+v_head(v(j) - v(center of previous cept) | max_v,word_class(t)).
+
+
+References:
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
+
+Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
+Robert L. Mercer. 1993. The Mathematics of Statistical Machine
+Translation: Parameter Estimation. Computational Linguistics, 19 (2),
+263-311.
+"""
+
+from __future__ import division
+from collections import defaultdict
+from math import factorial
+from nltk.align import AlignedSent
+from nltk.align import Alignment
+from nltk.align import IBMModel
+from nltk.align import IBMModel4
+from nltk.align.ibm_model import Counts
+from nltk.align.ibm_model import longest_target_sentence_length
+import warnings
+
+
+class IBMModel5(IBMModel):
+ """
+ Translation model that keeps track of vacant positions in the target
+ sentence to decide where to place translated words
+
+ >>> bitext = []
+ >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
+ >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
+ >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
+ >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
+ >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
+ >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
+ >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
+ >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
+ >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))
+ >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'i': 4, 'summarize': 5 }
+ >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 }
+
+ >>> ibm5 = IBMModel5(bitext, 5, src_classes, trg_classes)
+
+ >>> print('{0:.3f}'.format(ibm5.head_vacancy_table[1][1][1]))
+ 1.000
+ >>> print('{0:.3f}'.format(ibm5.head_vacancy_table[2][1][1]))
+ 0.000
+ >>> print('{0:.3f}'.format(ibm5.non_head_vacancy_table[3][3][6]))
+ 1.000
+
+ >>> print('{0:.3f}'.format(ibm5.fertility_table[2]['summarize']))
+ 1.000
+ >>> print('{0:.3f}'.format(ibm5.fertility_table[1]['book']))
+ 1.000
+
+ >>> print('{0:.3f}'.format(ibm5.p1))
+ 0.033
+
+ >>> test_sentence = bitext[2]
+ >>> test_sentence.words
+ ['das', 'buch', 'ist', 'ja', 'klein']
+ >>> test_sentence.mots
+ ['the', 'book', 'is', 'small']
+ >>> test_sentence.alignment
+ Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])
+
+ """
+ MIN_SCORE_FACTOR = 0.2
+ """
+ Alignments with scores below this factor are pruned during sampling
+ """
+
+ def __init__(self, sentence_aligned_corpus, iterations,
+ source_word_classes, target_word_classes,
+ probability_tables=None):
+ """
+ Train on ``sentence_aligned_corpus`` and create a lexical
+ translation model, vacancy models, a fertility model, and a
+ model for generating NULL-aligned words.
+
+ Translation direction is from ``AlignedSent.mots`` to
+ ``AlignedSent.words``.
+
+ :param sentence_aligned_corpus: Sentence-aligned parallel corpus
+ :type sentence_aligned_corpus: list(AlignedSent)
+
+ :param iterations: Number of iterations to run training algorithm
+ :type iterations: int
+
+ :param source_word_classes: Lookup table that maps a source word
+ to its word class, the latter represented by an integer id
+ :type source_word_classes: dict[str]: int
+
+ :param target_word_classes: Lookup table that maps a target word
+ to its word class, the latter represented by an integer id
+ :type target_word_classes: dict[str]: int
+
+ :param probability_tables: Optional. Use this to pass in custom
+ probability values. If not specified, probabilities will be
+ set to a uniform distribution, or some other sensible value.
+ If specified, all the following entries must be present:
+ ``translation_table``, ``alignment_table``,
+ ``fertility_table``, ``p1``, ``head_distortion_table``,
+ ``non_head_distortion_table``, ``head_vacancy_table``,
+ ``non_head_vacancy_table``. See ``IBMModel``, ``IBMModel4``,
+ and ``IBMModel5`` for the type and purpose of these tables.
+ :type probability_tables: dict[str]: object
+ """
+ super(IBMModel5, self).__init__(sentence_aligned_corpus)
+ self.reset_probabilities()
+ self.src_classes = source_word_classes
+ self.trg_classes = target_word_classes
+
+ if probability_tables is None:
+ # Get probabilities from IBM model 4
+ ibm4 = IBMModel4(sentence_aligned_corpus, iterations,
+ source_word_classes, target_word_classes)
+ self.translation_table = ibm4.translation_table
+ self.alignment_table = ibm4.alignment_table
+ self.fertility_table = ibm4.fertility_table
+ self.p1 = ibm4.p1
+ self.head_distortion_table = ibm4.head_distortion_table
+ self.non_head_distortion_table = ibm4.non_head_distortion_table
+ self.set_uniform_distortion_probabilities(sentence_aligned_corpus)
+ else:
+ # Set user-defined probabilities
+ self.translation_table = probability_tables['translation_table']
+ self.alignment_table = probability_tables['alignment_table']
+ self.fertility_table = probability_tables['fertility_table']
+ self.p1 = probability_tables['p1']
+ self.head_distortion_table = probability_tables[
+ 'head_distortion_table']
+ self.non_head_distortion_table = probability_tables[
+ 'non_head_distortion_table']
+ self.head_vacancy_table = probability_tables[
+ 'head_vacancy_table']
+ self.non_head_vacancy_table = probability_tables[
+ 'non_head_vacancy_table']
+
+ for k in range(0, iterations):
+ self.train(sentence_aligned_corpus)
+
+ def reset_probabilities(self):
+ super(IBMModel5, self).reset_probabilities()
+ self.head_vacancy_table = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)))
+ """
+ dict[int][int][int]: float. Probability(vacancy difference |
+ number of remaining valid positions,target word class).
+ Values accessed as ``head_vacancy_table[dv][v_max][trg_class]``.
+ """
+
+ self.non_head_vacancy_table = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)))
+ """
+ dict[int][int][int]: float. Probability(vacancy difference |
+ number of remaining valid positions,target word class).
+ Values accessed as ``non_head_vacancy_table[dv][v_max][trg_class]``.
+ """
+
+ def set_uniform_distortion_probabilities(self, sentence_aligned_corpus):
+ """
+ Set vacancy probabilities uniformly to
+ 1 / cardinality of vacancy difference values
+ """
+ max_m = longest_target_sentence_length(sentence_aligned_corpus)
+
+ # The maximum vacancy difference occurs when a word is placed in
+ # the last available position m of the target sentence and the
+ # previous word position has no vacancies.
+ # The minimum is 1-max_v, when a word is placed in the first
+ # available position and the previous word is placed beyond the
+ # last available position.
+ # Thus, the number of possible vacancy difference values is
+ # (max_v) - (1-max_v) + 1 = 2 * max_v.
+ if max_m > 0 and (float(1) / (2 * max_m)) < IBMModel.MIN_PROB:
+ warnings.warn("A target sentence is too long (" + str(max_m) +
+ " words). Results may be less accurate.")
+
+ for max_v in range(1, max_m + 1):
+ for dv in range(1, max_m + 1):
+ initial_prob = 1 / (2 * max_v)
+ self.head_vacancy_table[dv][max_v] = defaultdict(
+ lambda: initial_prob)
+ self.head_vacancy_table[-(dv-1)][max_v] = defaultdict(
+ lambda: initial_prob)
+ self.non_head_vacancy_table[dv][max_v] = defaultdict(
+ lambda: initial_prob)
+ self.non_head_vacancy_table[-(dv-1)][max_v] = defaultdict(
+ lambda: initial_prob)
+
+ def train(self, parallel_corpus):
+ # Reset all counts
+ counts = Model5Counts()
+
+ for aligned_sentence in parallel_corpus:
+ l = len(aligned_sentence.mots)
+ m = len(aligned_sentence.words)
+
+ # Sample the alignment space
+ sampled_alignments, best_alignment = self.sample(aligned_sentence)
+ # Record the most probable alignment
+ aligned_sentence.alignment = Alignment(
+ best_alignment.zero_indexed_alignment())
+
+ # E step (a): Compute normalization factors to weigh counts
+ total_count = self.prob_of_alignments(sampled_alignments)
+
+ # E step (b): Collect counts
+ for alignment_info in sampled_alignments:
+ count = self.prob_t_a_given_s(alignment_info)
+ normalized_count = count / total_count
+
+ for j in range(1, m + 1):
+ counts.update_lexical_translation(
+ normalized_count, alignment_info, j)
+
+ slots = Slots(m)
+ for i in range(1, l + 1):
+ counts.update_vacancy(
+ normalized_count, alignment_info, i,
+ self.trg_classes, slots)
+
+ counts.update_null_generation(normalized_count, alignment_info)
+ counts.update_fertility(normalized_count, alignment_info)
+
+ # M step: Update probabilities with maximum likelihood estimates
+ # If any probability is less than MIN_PROB, clamp it to MIN_PROB
+ existing_alignment_table = self.alignment_table
+ self.reset_probabilities()
+ # don't retrain alignment table
+ self.alignment_table = existing_alignment_table
+
+ self.maximize_lexical_translation_probabilities(counts)
+ self.maximize_vacancy_probabilities(counts)
+ self.maximize_fertility_probabilities(counts)
+ self.maximize_null_generation_probabilities(counts)
+
def sample(self, sentence_pair):
    """
    Sample the most probable alignments from the entire alignment
    space according to Model 4

    Note that Model 4 scoring is used instead of Model 5 because the
    latter is too expensive to compute.

    First, determine the best alignment according to IBM Model 2.
    With this initial alignment, use hill climbing to determine the
    best alignment according to an IBM Model 4. Add this alignment and
    its neighbors to the sample set. Repeat this process with other
    initial alignments obtained by pegging an alignment point.
    Finally, prune alignments that have substantially lower Model 4
    scores than the best alignment.

    :param sentence_pair: Source and target language sentence pair
        to generate a sample of alignments from
    :type sentence_pair: AlignedSent

    :return: A set of best alignments represented by their ``AlignmentInfo``
        and the best alignment of the set for convenience
    :rtype: set(AlignmentInfo), AlignmentInfo
    """
    # Delegate sampling to the generic IBMModel implementation, then
    # discard low-scoring candidates.
    candidates, best_alignment = super(IBMModel5, self).sample(sentence_pair)
    pruned_candidates = self.prune(candidates)
    return pruned_candidates, best_alignment
+
def prune(self, alignment_infos):
    """
    Removes alignments from ``alignment_infos`` that have
    substantially lower Model 4 scores than the best alignment

    :return: Pruned alignments
    :rtype: set(AlignmentInfo)
    """
    # Score every candidate once with Model 4, then keep only those
    # strictly above a fixed fraction of the best score.
    scored = [(candidate, IBMModel4.model4_prob_t_a_given_s(candidate, self))
              for candidate in alignment_infos]
    best_score = max([score for (_, score) in scored]) if scored else 0
    threshold = IBMModel5.MIN_SCORE_FACTOR * best_score
    return set(candidate for (candidate, score) in scored
               if score > threshold)
+
def hillclimb(self, alignment_info, j_pegged=None):
    """
    Starting from the alignment in ``alignment_info``, look at
    neighboring alignments iteratively for the best one, according
    to Model 4

    Note that Model 4 scoring is used instead of Model 5 because the
    latter is too expensive to compute.

    There is no guarantee that the best alignment in the alignment
    space will be found, because the algorithm might be stuck in a
    local maximum.

    :param j_pegged: If specified, the search will be constrained to
        alignments where ``j_pegged`` remains unchanged
    :type j_pegged: int

    :return: The best alignment found from hill climbing
    :rtype: AlignmentInfo
    """
    best = alignment_info
    best_probability = IBMModel4.model4_prob_t_a_given_s(best, self)

    # Keep moving to a strictly better neighbor until a full pass over
    # the neighborhood yields no improvement.
    improved = True
    while improved:
        improved = False
        for candidate in self.neighboring(best, j_pegged):
            candidate_probability = IBMModel4.model4_prob_t_a_given_s(
                candidate, self)
            if candidate_probability > best_probability:
                best = candidate
                best_probability = candidate_probability
                improved = True

    best.score = best_probability
    return best
+
def prob_t_a_given_s(self, alignment_info):
    """
    Probability of target sentence and an alignment given the
    source sentence

    :param alignment_info: Alignment under consideration; sentences
        are one-indexed with NULL at source index 0
    :type alignment_info: AlignmentInfo
    :return: Product of the null-generation, fertility, lexical
        translation and vacancy terms, clamped below at MIN_PROB
    :rtype: float
    """
    probability = 1.0
    MIN_PROB = IBMModel.MIN_PROB
    # Fresh vacancy state for this alignment; vacancy_term() occupies
    # slots as it is called for i = 1..l, so call order matters.
    slots = Slots(len(alignment_info.trg_sentence) - 1)

    def null_generation_term():
        # Binomial distribution: B(m - null_fertility, p1)
        value = 1.0
        p1 = self.p1
        p0 = 1 - p1
        null_fertility = alignment_info.fertility_of_i(0)
        m = len(alignment_info.trg_sentence) - 1
        value *= (pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility))
        if value < MIN_PROB:
            return MIN_PROB

        # Combination: (m - null_fertility) choose null_fertility
        # NOTE(review): on Python 2 without a __future__ division
        # import, (m - null_fertility - i + 1) / i floor-divides ints;
        # confirm the module enables true division.
        for i in range(1, null_fertility + 1):
            value *= (m - null_fertility - i + 1) / i
        return value

    def fertility_term():
        # Product over source words of phi! * p(phi | source word)
        value = 1.0
        src_sentence = alignment_info.src_sentence
        for i in range(1, len(src_sentence)):
            fertility = alignment_info.fertility_of_i(i)
            value *= (factorial(fertility) *
                      self.fertility_table[fertility][src_sentence[i]])
            if value < MIN_PROB:
                return MIN_PROB
        return value

    def lexical_translation_term(j):
        # p(target word at j | source word it is aligned to)
        t = alignment_info.trg_sentence[j]
        i = alignment_info.alignment[j]
        s = alignment_info.src_sentence[i]
        return self.translation_table[t][s]

    def vacancy_term(i):
        # Placement probability of cept i's tablet, conditioned on the
        # number of vacancies; mutates the shared `slots` state.
        value = 1.0
        tablet = alignment_info.cepts[i]
        tablet_length = len(tablet)
        total_vacancies = slots.vacancies_at(len(slots))

        # case 1: NULL-aligned words
        if tablet_length == 0:
            return value

        # case 2: head word
        j = tablet[0]
        previous_cept = alignment_info.previous_cept(j)
        previous_center = alignment_info.center_of_cept(previous_cept)
        dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center)
        max_v = total_vacancies - tablet_length + 1
        trg_class = self.trg_classes[alignment_info.trg_sentence[j]]
        value *= self.head_vacancy_table[dv][max_v][trg_class]
        slots.occupy(j)  # mark position as occupied
        total_vacancies -= 1
        if value < MIN_PROB:
            return MIN_PROB

        # case 3: non-head words
        for k in range(1, tablet_length):
            previous_position = tablet[k - 1]
            previous_vacancies = slots.vacancies_at(previous_position)
            j = tablet[k]
            dv = slots.vacancies_at(j) - previous_vacancies
            max_v = (total_vacancies - tablet_length + k + 1 -
                     previous_vacancies)
            trg_class = self.trg_classes[alignment_info.trg_sentence[j]]
            value *= self.non_head_vacancy_table[dv][max_v][trg_class]
            slots.occupy(j)  # mark position as occupied
            total_vacancies -= 1
            if value < MIN_PROB:
                return MIN_PROB

        return value
    # end nested functions

    # Abort computation whenever probability falls below MIN_PROB at
    # any point, since MIN_PROB can be considered as zero
    probability *= null_generation_term()
    if probability < MIN_PROB:
        return MIN_PROB

    probability *= fertility_term()
    if probability < MIN_PROB:
        return MIN_PROB

    for j in range(1, len(alignment_info.trg_sentence)):
        probability *= lexical_translation_term(j)
        if probability < MIN_PROB:
            return MIN_PROB

    for i in range(1, len(alignment_info.src_sentence)):
        probability *= vacancy_term(i)
        if probability < MIN_PROB:
            return MIN_PROB

    return probability
+
def maximize_vacancy_probabilities(self, counts):
    """
    M step: re-estimate head and non-head vacancy probabilities as
    relative frequencies from ``counts``, clamping each estimate to
    at least ``MIN_PROB``.
    """
    MIN_PROB = IBMModel.MIN_PROB

    for dv, max_vs in counts.head_vacancy.items():
        for max_v, trg_classes in max_vs.items():
            for t_cls, numerator in trg_classes.items():
                denominator = counts.head_vacancy_for_any_dv[max_v][t_cls]
                self.head_vacancy_table[dv][max_v][t_cls] = max(
                    numerator / denominator, MIN_PROB)

    for dv, max_vs in counts.non_head_vacancy.items():
        for max_v, trg_classes in max_vs.items():
            for t_cls, numerator in trg_classes.items():
                denominator = counts.non_head_vacancy_for_any_dv[max_v][t_cls]
                self.non_head_vacancy_table[dv][max_v][t_cls] = max(
                    numerator / denominator, MIN_PROB)
+
+
class Model5Counts(Counts):
    """
    Data object to store counts of various parameters during training.
    Includes counts for vacancies.
    """
    def __init__(self):
        super(Model5Counts, self).__init__()
        # head_vacancy[dv][max_v][trg_class]: count of head words placed
        # with vacancy difference dv given max_v available vacancies
        self.head_vacancy = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
        # Marginal of head_vacancy over dv (M-step denominator)
        self.head_vacancy_for_any_dv = defaultdict(
            lambda: defaultdict(lambda: 0.0))
        # Same pair of tables for the non-head words of a tablet
        self.non_head_vacancy = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
        self.non_head_vacancy_for_any_dv = defaultdict(
            lambda: defaultdict(lambda: 0.0))

    def update_vacancy(self, count, alignment_info, i, trg_classes, slots):
        """
        Accumulate vacancy counts for the tablet of source position ``i``.

        :param count: Value to add to the vacancy counts
        :param alignment_info: Alignment under consideration
        :param i: Source word position under consideration
        :param trg_classes: Target word classes
        :param slots: Vacancy states of the slots in the target sentence.
            Output parameter that will be modified as new words are placed
            in the target sentence.
        """
        tablet = alignment_info.cepts[i]
        tablet_length = len(tablet)
        total_vacancies = slots.vacancies_at(len(slots))

        # case 1: NULL aligned words
        if tablet_length == 0:
            return  # ignore zero fertility words

        # case 2: head word
        j = tablet[0]
        previous_cept = alignment_info.previous_cept(j)
        previous_center = alignment_info.center_of_cept(previous_cept)
        dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center)
        max_v = total_vacancies - tablet_length + 1
        trg_class = trg_classes[alignment_info.trg_sentence[j]]
        self.head_vacancy[dv][max_v][trg_class] += count
        self.head_vacancy_for_any_dv[max_v][trg_class] += count
        slots.occupy(j)  # mark position as occupied
        total_vacancies -= 1

        # case 3: non-head words
        for k in range(1, tablet_length):
            previous_position = tablet[k - 1]
            previous_vacancies = slots.vacancies_at(previous_position)
            j = tablet[k]
            dv = slots.vacancies_at(j) - previous_vacancies
            max_v = (total_vacancies - tablet_length + k + 1 -
                     previous_vacancies)
            trg_class = trg_classes[alignment_info.trg_sentence[j]]
            self.non_head_vacancy[dv][max_v][trg_class] += count
            self.non_head_vacancy_for_any_dv[max_v][trg_class] += count
            slots.occupy(j)  # mark position as occupied
            total_vacancies -= 1
+
+
class Slots(object):
    """
    Tracks which positions (slots) in a target sentence are occupied.

    Positions are one-indexed; index 0 is an unused dummy so that
    sentence positions can index the internal list directly.
    """
    def __init__(self, target_sentence_length):
        # One flag per position, False == vacant; element 0 is a dummy
        self._slots = [False] * (target_sentence_length + 1)

    def occupy(self, position):
        """
        :return: Mark slot at ``position`` as occupied
        """
        self._slots[position] = True

    def vacancies_at(self, position):
        """
        :return: Number of vacant slots up to, and including, ``position``
        """
        return sum(1 for occupied in self._slots[1:position + 1]
                   if not occupied)

    def __len__(self):
        return len(self._slots) - 1  # exclude dummy zeroth element
diff --git a/nltk/align/ibm_model.py b/nltk/align/ibm_model.py
new file mode 100644
index 0000000..cec7246
--- /dev/null
+++ b/nltk/align/ibm_model.py
@@ -0,0 +1,526 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: IBM Model Core
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Tah Wei Hoon <hoon.tw at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Common methods and classes for all IBM models. See ``IBMModel1``,
+``IBMModel2``, ``IBMModel3``, ``IBMModel4``, and ``IBMModel5``
+for specific implementations.
+
+The IBM models are a series of generative models that learn lexical
+translation probabilities, p(target language word|source language word),
+given a sentence-aligned parallel corpus.
+
+The models increase in sophistication from model 1 to 5. Typically, the
+output of lower models is used to seed the higher models. All models
+use the Expectation-Maximization (EM) algorithm to learn various
+probability tables.
+
+Words in a sentence are one-indexed. The first word of a sentence has
+position 1, not 0. Index 0 is reserved in the source sentence for the
+NULL token. The concept of position does not apply to NULL, but it is
+indexed at 0 by convention.
+
+Each target word is aligned to exactly one source word or the NULL
+token.
+
+References:
+Philipp Koehn. 2010. Statistical Machine Translation.
+Cambridge University Press, New York.
+
+Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
+Robert L. Mercer. 1993. The Mathematics of Statistical Machine
+Translation: Parameter Estimation. Computational Linguistics, 19 (2),
+263-311.
+"""
+
+from bisect import insort_left
+from collections import defaultdict
+from copy import deepcopy
+from math import ceil
+
+
def longest_target_sentence_length(sentence_aligned_corpus):
    """
    :param sentence_aligned_corpus: Parallel corpus under consideration
    :type sentence_aligned_corpus: list(AlignedSent)
    :return: Number of words in the longest target language sentence
        of ``sentence_aligned_corpus``, or 0 for an empty corpus
    """
    target_lengths = [len(aligned_sentence.words)
                      for aligned_sentence in sentence_aligned_corpus]
    return max(target_lengths) if target_lengths else 0
+
+
class IBMModel(object):
    """
    Abstract base class for all IBM models
    """
    # Avoid division by zero and precision errors by imposing a minimum
    # value for probabilities. Note that this approach is theoretically
    # incorrect, since it may create probabilities that sum to more
    # than 1. In practice, the contribution of probabilities with MIN_PROB
    # is tiny enough that the value of MIN_PROB can be treated as zero.
    MIN_PROB = 1.0e-12  # GIZA++ is more liberal and uses 1.0e-7

    def __init__(self, sentence_aligned_corpus):
        # Build vocabularies from the corpus, then initialize all
        # probability tables to the uniform MIN_PROB floor.
        self.init_vocab(sentence_aligned_corpus)
        self.reset_probabilities()

    def reset_probabilities(self):
        # All tables are defaultdicts so unseen events get MIN_PROB
        # instead of raising KeyError.
        self.translation_table = defaultdict(
            lambda: defaultdict(lambda: IBMModel.MIN_PROB))
        """
        dict[str][str]: float. Probability(target word | source word).
        Values accessed as ``translation_table[target_word][source_word]``.
        """

        self.alignment_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
                lambda: IBMModel.MIN_PROB))))
        """
        dict[int][int][int][int]: float. Probability(i | j,l,m).
        Values accessed as ``alignment_table[i][j][l][m]``.
        Used in model 2 and hill climbing in models 3 and above
        """

        self.fertility_table = defaultdict(
            lambda: defaultdict(lambda: self.MIN_PROB))
        """
        dict[int][str]: float. Probability(fertility | source word).
        Values accessed as ``fertility_table[fertility][source_word]``.
        Used in model 3 and higher.
        """

        self.p1 = 0.5
        """
        Probability that a generated word requires another target word
        that is aligned to NULL.
        Used in model 3 and higher.
        """

    def init_vocab(self, sentence_aligned_corpus):
        # Collect the distinct words of both languages in one pass
        src_vocab = set()
        trg_vocab = set()
        for aligned_sentence in sentence_aligned_corpus:
            trg_vocab.update(aligned_sentence.words)
            src_vocab.update(aligned_sentence.mots)
        # Add the NULL token
        src_vocab.add(None)

        self.src_vocab = src_vocab
        """
        set(str): All source language words used in training
        """

        self.trg_vocab = trg_vocab
        """
        set(str): All target language words used in training
        """

    def sample(self, sentence_pair):
        """
        Sample the most probable alignments from the entire alignment
        space

        First, determine the best alignment according to IBM Model 2.
        With this initial alignment, use hill climbing to determine the
        best alignment according to a higher IBM Model. Add this
        alignment and its neighbors to the sample set. Repeat this
        process with other initial alignments obtained by pegging an
        alignment point.

        Hill climbing may be stuck in a local maximum, hence the pegging
        and trying out of different alignments.

        :param sentence_pair: Source and target language sentence pair
            to generate a sample of alignments from
        :type sentence_pair: AlignedSent

        :return: A set of best alignments represented by their ``AlignmentInfo``
            and the best alignment of the set for convenience
        :rtype: set(AlignmentInfo), AlignmentInfo
        """
        sampled_alignments = set()
        l = len(sentence_pair.mots)
        m = len(sentence_pair.words)

        # Start from the best model 2 alignment
        initial_alignment = self.best_model2_alignment(sentence_pair)
        potential_alignment = self.hillclimb(initial_alignment)
        sampled_alignments.update(self.neighboring(potential_alignment))
        best_alignment = potential_alignment

        # Start from other model 2 alignments,
        # with the constraint that j is aligned (pegged) to i
        for j in range(1, m + 1):
            for i in range(0, l + 1):
                initial_alignment = self.best_model2_alignment(
                    sentence_pair, j, i)
                potential_alignment = self.hillclimb(initial_alignment, j)
                neighbors = self.neighboring(potential_alignment, j)
                sampled_alignments.update(neighbors)
                # hillclimb sets .score, so the comparison is safe here
                if potential_alignment.score > best_alignment.score:
                    best_alignment = potential_alignment

        return sampled_alignments, best_alignment

    def best_model2_alignment(self, sentence_pair, j_pegged=None, i_pegged=0):
        """
        Finds the best alignment according to IBM Model 2

        Used as a starting point for hill climbing in Models 3 and
        above, because it is easier to compute than the best alignments
        in higher models

        :param sentence_pair: Source and target language sentence pair
            to be word-aligned
        :type sentence_pair: AlignedSent

        :param j_pegged: If specified, the alignment point of j_pegged
            will be fixed to i_pegged
        :type j_pegged: int

        :param i_pegged: Alignment point to j_pegged
        :type i_pegged: int

        :return: Best Model 2 alignment, one-indexed with NULL at
            source position 0
        :rtype: AlignmentInfo
        """
        src_sentence = [None] + sentence_pair.mots
        trg_sentence = ['UNUSED'] + sentence_pair.words  # 1-indexed

        l = len(src_sentence) - 1  # exclude NULL
        m = len(trg_sentence) - 1

        alignment = [0] * (m + 1)  # init all alignments to NULL
        cepts = [[] for i in range((l + 1))]  # init all cepts to empty list

        for j in range(1, m + 1):
            if j == j_pegged:
                # use the pegged alignment instead of searching for best one
                best_i = i_pegged
            else:
                best_i = 0
                max_alignment_prob = IBMModel.MIN_PROB
                t = trg_sentence[j]

                # >= means that on ties the largest i wins
                for i in range(0, l + 1):
                    s = src_sentence[i]
                    alignment_prob = (self.translation_table[t][s] *
                                      self.alignment_table[i][j][l][m])

                    if alignment_prob >= max_alignment_prob:
                        max_alignment_prob = alignment_prob
                        best_i = i

            alignment[j] = best_i
            cepts[best_i].append(j)

        return AlignmentInfo(tuple(alignment), tuple(src_sentence),
                             tuple(trg_sentence), cepts)

    def hillclimb(self, alignment_info, j_pegged=None):
        """
        Starting from the alignment in ``alignment_info``, look at
        neighboring alignments iteratively for the best one

        There is no guarantee that the best alignment in the alignment
        space will be found, because the algorithm might be stuck in a
        local maximum.

        :param j_pegged: If specified, the search will be constrained to
            alignments where ``j_pegged`` remains unchanged
        :type j_pegged: int

        :return: The best alignment found from hill climbing; its
            ``score`` attribute is set to the winning probability
        :rtype: AlignmentInfo
        """
        alignment = alignment_info  # alias with shorter name
        max_probability = self.prob_t_a_given_s(alignment)

        while True:
            old_alignment = alignment
            # Neighborhood is computed once per pass, from the alignment
            # at the start of the pass
            for neighbor_alignment in self.neighboring(alignment, j_pegged):
                neighbor_probability = self.prob_t_a_given_s(neighbor_alignment)

                if neighbor_probability > max_probability:
                    alignment = neighbor_alignment
                    max_probability = neighbor_probability

            if alignment == old_alignment:
                # Until there are no better alignments
                break

        alignment.score = max_probability
        return alignment

    def neighboring(self, alignment_info, j_pegged=None):
        """
        Determine the neighbors of ``alignment_info``, obtained by
        moving or swapping one alignment point

        :param j_pegged: If specified, neighbors that have a different
            alignment point from j_pegged will not be considered
        :type j_pegged: int

        :return: A set of neighboring alignments represented by their
            ``AlignmentInfo``
        :rtype: set(AlignmentInfo)
        """
        neighbors = set()

        l = len(alignment_info.src_sentence) - 1  # exclude NULL
        m = len(alignment_info.trg_sentence) - 1
        original_alignment = alignment_info.alignment
        original_cepts = alignment_info.cepts

        for j in range(1, m + 1):
            if j != j_pegged:
                # Add alignments that differ by one alignment point
                for i in range(0, l + 1):
                    new_alignment = list(original_alignment)
                    new_cepts = deepcopy(original_cepts)
                    old_i = original_alignment[j]

                    # update alignment
                    new_alignment[j] = i

                    # update cepts
                    insort_left(new_cepts[i], j)
                    new_cepts[old_i].remove(j)

                    new_alignment_info = AlignmentInfo(
                        tuple(new_alignment), alignment_info.src_sentence,
                        alignment_info.trg_sentence, new_cepts)
                    neighbors.add(new_alignment_info)

        for j in range(1, m + 1):
            if j != j_pegged:
                # Add alignments that have two alignment points swapped
                for other_j in range(1, m + 1):
                    if other_j != j_pegged and other_j != j:
                        new_alignment = list(original_alignment)
                        new_cepts = deepcopy(original_cepts)
                        other_i = original_alignment[other_j]
                        i = original_alignment[j]

                        # update alignments
                        new_alignment[j] = other_i
                        new_alignment[other_j] = i

                        # update cepts
                        new_cepts[other_i].remove(other_j)
                        insort_left(new_cepts[other_i], j)
                        new_cepts[i].remove(j)
                        insort_left(new_cepts[i], other_j)

                        new_alignment_info = AlignmentInfo(
                            tuple(new_alignment), alignment_info.src_sentence,
                            alignment_info.trg_sentence, new_cepts)
                        neighbors.add(new_alignment_info)

        return neighbors

    def maximize_lexical_translation_probabilities(self, counts):
        # M step: relative frequency estimate, clamped at MIN_PROB
        for t, src_words in counts.t_given_s.items():
            for s in src_words:
                estimate = counts.t_given_s[t][s] / counts.any_t_given_s[s]
                self.translation_table[t][s] = max(estimate, IBMModel.MIN_PROB)

    def maximize_fertility_probabilities(self, counts):
        # M step: relative frequency estimate, clamped at MIN_PROB
        for phi, src_words in counts.fertility.items():
            for s in src_words:
                estimate = (counts.fertility[phi][s] /
                            counts.fertility_for_any_phi[s])
                self.fertility_table[phi][s] = max(estimate, IBMModel.MIN_PROB)

    def maximize_null_generation_probabilities(self, counts):
        p1_estimate = counts.p1 / (counts.p1 + counts.p0)
        p1_estimate = max(p1_estimate, IBMModel.MIN_PROB)
        # Clip p1 if it is too large, because p0 = 1 - p1 should not be
        # smaller than MIN_PROB
        self.p1 = min(p1_estimate, 1 - IBMModel.MIN_PROB)

    def prob_of_alignments(self, alignments):
        # Sum of alignment probabilities; used as an E-step normalizer
        probability = 0
        for alignment_info in alignments:
            probability += self.prob_t_a_given_s(alignment_info)
        return probability

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence

        All required information is assumed to be in ``alignment_info``
        and self.

        Derived classes should override this method
        """
        return 0.0
+
+
class AlignmentInfo(object):
    """
    Helper data object for training IBM Models 3 and up

    Read-only. For a source sentence and its counterpart in the target
    language, this class holds information about the sentence pair's
    alignment, cepts, and fertility.

    Warning: Alignments are one-indexed here, in contrast to
    nltk.align.Alignment and nltk.align.AlignedSent, which are zero-
    indexed. This class is not meant to be used outside of IBM models.
    """

    def __init__(self, alignment, src_sentence, trg_sentence, cepts):
        # A tuple is required so instances are hashable and can live in
        # sets keyed by the alignment function alone.
        if not isinstance(alignment, tuple):
            raise TypeError("The alignment must be a tuple because it is used "
                            "to uniquely identify AlignmentInfo objects.")

        # alignment[j]: source position aligned to target position j
        self.alignment = alignment
        # Source sentence tuple, with NULL (None) at index 0
        self.src_sentence = src_sentence
        # Target sentence tuple, with a dummy element at index 0 so the
        # first real word is at index 1
        self.trg_sentence = trg_sentence
        # cepts[i]: ascending target positions aligned to source i
        self.cepts = cepts
        # Optional probability of this alignment, set by the model
        self.score = None

    def fertility_of_i(self, i):
        """
        Fertility of word in position ``i`` of the source sentence
        """
        return len(self.cepts[i])

    def is_head_word(self, j):
        """
        :return: Whether the word in position ``j`` of the target
            sentence is a head word
        """
        aligned_to = self.alignment[j]
        return self.cepts[aligned_to][0] == j

    def center_of_cept(self, i):
        """
        :return: The ceiling of the average positions of the words in
            the tablet of cept ``i``, or 0 if ``i`` is None
        """
        if i is None:
            return 0
        tablet = self.cepts[i]
        return int(ceil(float(sum(tablet)) / len(tablet)))

    def previous_cept(self, j):
        """
        :return: The previous cept of ``j``, or None if ``j`` belongs to
            the first cept
        """
        i = self.alignment[j]
        if i == 0:
            raise ValueError("Words aligned to NULL cannot have a previous "
                             "cept because NULL has no position")
        # Scan leftwards for the nearest source position with nonzero
        # fertility
        candidate = i - 1
        while candidate > 0 and self.fertility_of_i(candidate) == 0:
            candidate -= 1
        return candidate if candidate > 0 else None

    def previous_in_tablet(self, j):
        """
        :return: The position of the previous word that is in the same
            tablet as ``j``, or None if ``j`` is the first word of the
            tablet
        """
        tablet = self.cepts[self.alignment[j]]
        position_in_tablet = tablet.index(j)
        if position_in_tablet == 0:
            return None
        return tablet[position_in_tablet - 1]

    def zero_indexed_alignment(self):
        """
        :return: Zero-indexed alignment, suitable for use in external
            ``nltk.align`` modules like ``nltk.align.Alignment``
        :rtype: list(tuple)
        """
        pairs = []
        for j in range(1, len(self.trg_sentence)):
            src_index = self.alignment[j] - 1
            # NULL (source index -1 after shifting) maps to None
            pairs.append((j - 1, src_index if src_index >= 0 else None))
        return pairs

    def __eq__(self, other):
        # Identity is determined solely by the alignment function
        return self.alignment == other.alignment

    def __hash__(self):
        return hash(self.alignment)
+
+
class Counts(object):
    """
    Data object that accumulates fractional counts of the model
    parameters during an EM training iteration.
    """
    def __init__(self):
        # t_given_s[t][s]: expected count of target word t aligned to
        # source word s; any_t_given_s[s] is its marginal over t
        self.t_given_s = defaultdict(lambda: defaultdict(lambda: 0.0))
        self.any_t_given_s = defaultdict(lambda: 0.0)
        # Expected counts for the NULL-insertion model
        self.p0 = 0.0
        self.p1 = 0.0
        # fertility[phi][s]: expected count of source word s having
        # fertility phi; fertility_for_any_phi[s] is its marginal
        self.fertility = defaultdict(lambda: defaultdict(lambda: 0.0))
        self.fertility_for_any_phi = defaultdict(lambda: 0.0)

    def update_lexical_translation(self, count, alignment_info, j):
        """Add ``count`` to the lexical translation counts for target
        position ``j`` and the source word aligned to it."""
        src_pos = alignment_info.alignment[j]
        trg_word = alignment_info.trg_sentence[j]
        src_word = alignment_info.src_sentence[src_pos]
        self.t_given_s[trg_word][src_word] += count
        self.any_t_given_s[src_word] += count

    def update_null_generation(self, count, alignment_info):
        """Add ``count`` to the NULL-generation counts ``p1``/``p0``."""
        target_length = len(alignment_info.trg_sentence) - 1
        fertility_of_null = alignment_info.fertility_of_i(0)
        self.p1 += fertility_of_null * count
        self.p0 += (target_length - 2 * fertility_of_null) * count

    def update_fertility(self, count, alignment_info):
        """Add ``count`` to the fertility counts of every source word."""
        for src_pos, src_word in enumerate(alignment_info.src_sentence):
            fertility = len(alignment_info.cepts[src_pos])
            self.fertility[fertility][src_word] += count
            self.fertility_for_any_phi[src_word] += count
diff --git a/nltk/align/phrase_based.py b/nltk/align/phrase_based.py
index 87de8f3..7c16043 100644
--- a/nltk/align/phrase_based.py
+++ b/nltk/align/phrase_based.py
@@ -181,7 +181,3 @@ def phrase_extraction(srctext, trgtext, alignment):
bp.update(phrases)
return bp
-# run doctests
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
diff --git a/nltk/classify/positivenaivebayes.py b/nltk/classify/positivenaivebayes.py
index 26f727e..c8f5511 100644
--- a/nltk/classify/positivenaivebayes.py
+++ b/nltk/classify/positivenaivebayes.py
@@ -168,10 +168,3 @@ def demo():
classifier = partial_names_demo(PositiveNaiveBayesClassifier.train)
classifier.show_most_informative_features()
-##//////////////////////////////////////////////////////
-## Test
-##//////////////////////////////////////////////////////
-
-if __name__ == '__main__':
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/classify/senna.py b/nltk/classify/senna.py
index f5b9c5d..b8b0aad 100644
--- a/nltk/classify/senna.py
+++ b/nltk/classify/senna.py
@@ -178,7 +178,4 @@ def setup_module(module):
except OSError:
raise SkipTest("Senna executable not found")
-if __name__ == '__main__':
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS)
diff --git a/nltk/cluster/util.py b/nltk/cluster/util.py
index fe81e68..422aab4 100644
--- a/nltk/cluster/util.py
+++ b/nltk/cluster/util.py
@@ -58,7 +58,6 @@ class VectorSpaceClusterer(ClusterI):
# assign the vectors to clusters
if assign_clusters:
- print(self._Tt, vectors)
return [self.classify(vector) for vector in vectors]
def cluster_vectorspace(self, vectors, trace):
diff --git a/nltk/collocations.py b/nltk/collocations.py
index 9d02b9c..fdb4b22 100644
--- a/nltk/collocations.py
+++ b/nltk/collocations.py
@@ -54,13 +54,25 @@ class AbstractCollocationFinder(object):
def __init__(self, word_fd, ngram_fd):
self.word_fd = word_fd
self.ngram_fd = ngram_fd
-
+
@classmethod
def _build_new_documents(cls, documents, window_size, pad_left=False,
                         pad_right=False, pad_symbol=None):
    '''
    Concatenate ``documents`` into a single token stream, padding each
    document with ``window_size - 1`` copies of ``pad_symbol`` so that
    n-gram windows never span a document boundary.

    :param pad_left: insert the padding before each document
    :param pad_right: insert the padding after each document
    :return: Iterator over the padded, concatenated tokens
    '''
    padding = (pad_symbol,) * (window_size - 1)
    if pad_right:
        return _itertools.chain.from_iterable(
            _itertools.chain(doc, padding) for doc in documents)
    if pad_left:
        return _itertools.chain.from_iterable(
            _itertools.chain(padding, doc) for doc in documents)
    # Fix: previously fell through and returned None when neither flag
    # was set; fall back to plain concatenation instead.
    return _itertools.chain.from_iterable(documents)
+
@classmethod
def from_documents(cls, documents):
    """Constructs a collocation finder given a collection of documents,
    each of which is a list (or iterable) of tokens.
    """
    # Pad between documents (window size depends on the concrete finder
    # class) so that n-grams never cross a document boundary.
    return cls.from_words(
        cls._build_new_documents(documents, cls.default_ws, pad_right=True))
@staticmethod
def _ngram_freqdist(words, n):
@@ -127,7 +139,8 @@ class BigramCollocationFinder(AbstractCollocationFinder):
association measures. It is often useful to use from_words() rather than
constructing an instance directly.
"""
-
+ default_ws = 2
+
def __init__(self, word_fd, bigram_fd, window_size=2):
"""Construct a BigramCollocationFinder, given FreqDists for
appearances of words and (possibly non-contiguous) bigrams.
@@ -149,6 +162,8 @@ class BigramCollocationFinder(AbstractCollocationFinder):
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
+ if w1 is None:
+ continue
wfd[w1] += 1
for w2 in window[1:]:
if w2 is not None:
@@ -174,7 +189,8 @@ class TrigramCollocationFinder(AbstractCollocationFinder):
association measures. It is often useful to use from_words() rather than
constructing an instance directly.
"""
-
+ default_ws = 3
+
def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd):
"""Construct a TrigramCollocationFinder, given FreqDists for
appearances of words, bigrams, two words with any word between them,
@@ -198,6 +214,8 @@ class TrigramCollocationFinder(AbstractCollocationFinder):
tfd = FreqDist()
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
+ if w1 is None:
+ continue
for w2, w3 in _itertools.combinations(window[1:], 2):
wfd[w1] += 1
if w2 is None:
@@ -240,7 +258,8 @@ class QuadgramCollocationFinder(AbstractCollocationFinder):
"""A tool for the finding and ranking of quadgram collocations or other association measures.
It is often useful to use from_words() rather than constructing an instance directly.
"""
-
+ default_ws = 4
+
def __init__(self, word_fd, quadgram_fd, ii, iii, ixi, ixxi, iixi, ixii):
"""Construct a QuadgramCollocationFinder, given FreqDists for appearances of words,
bigrams, trigrams, two words with one word and two words between them, three words
@@ -269,6 +288,8 @@ class QuadgramCollocationFinder(AbstractCollocationFinder):
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
+ if w1 is None:
+ continue
for w2, w3, w4 in _itertools.combinations(window[1:], 3):
ixxx[w1] += 1
if w2 is None:
diff --git a/nltk/compat.py b/nltk/compat.py
index 6e4cdce..1534c55 100755
--- a/nltk/compat.py
+++ b/nltk/compat.py
@@ -54,6 +54,20 @@ if PY3:
from collections import Counter
+ from datetime import timezone
+ UTC = timezone.utc
+
+ from tempfile import TemporaryDirectory
+
+ unichr = chr
+ if sys.version_info[1] <= 1:
+ def int2byte(i):
+ return bytes((i,))
+ else:
+ # This is about 2x faster than the implementation above on 3.2+
+ import operator
+ int2byte = operator.methodcaller("to_bytes", 1, "big")
+
else:
def b(s):
return s
@@ -121,6 +135,151 @@ else:
sys.meta_path.insert(0, TkinterLoader())
+ from datetime import tzinfo, timedelta
+
+ ZERO = timedelta(0)
+ HOUR = timedelta(hours=1)
+
+ # A UTC class for python 2.7
+ class UTC(tzinfo):
+ """UTC"""
+
+ def utcoffset(self, dt):
+ return ZERO
+
+ def tzname(self, dt):
+ return "UTC"
+
+ def dst(self, dt):
+ return ZERO
+
+ UTC = UTC()
+
+ unichr = unichr
+ int2byte = chr
+
+ import csv, codecs, cStringIO
+ class UnicodeWriter:
+ """
+ A CSV writer which will write rows to CSV file "f",
+ which is encoded in the given encoding.
+ see https://docs.python.org/2/library/csv.html
+ """
+
+ def __init__(self, f, dialect=csv.excel, encoding="utf-8", errors='replace', **kwds):
+ # Redirect output to a queue
+ self.queue = cStringIO.StringIO()
+ self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
+ self.stream = f
+ self.encoder = codecs.getincrementalencoder(encoding)(errors=errors)
+
+ def encode(self, data):
+ if isinstance(data, basestring):
+ return data.encode("utf-8")
+ else:
+ return data
+
+ def writerow(self, row):
+ self.writer.writerow([self.encode(s) for s in row])
+ # Fetch UTF-8 output from the queue ...
+ data = self.queue.getvalue()
+ data = data.decode("utf-8")
+ # ... and reencode it into the target encoding
+ data = self.encoder.encode(data, 'replace')
+ # write to the target stream
+ self.stream.write(data)
+ # empty queue
+ self.queue.truncate(0)
+
+
+ import warnings as _warnings
+ import os as _os
+ from tempfile import mkdtemp
+
+ class TemporaryDirectory(object):
+ """Create and return a temporary directory. This has the same
+ behavior as mkdtemp but can be used as a context manager. For
+ example:
+
+ with TemporaryDirectory() as tmpdir:
+ ...
+
+ Upon exiting the context, the directory and everything contained
+ in it are removed.
+
+ http://stackoverflow.com/questions/19296146/tempfile-temporarydirectory-context-manager-in-python-2-7
+ """
+
+ def __init__(self, suffix="", prefix="tmp", dir=None):
+ self._closed = False
+ self.name = None # Handle mkdtemp raising an exception
+ self.name = mkdtemp(suffix, prefix, dir)
+
+ def __repr__(self):
+ return "<{} {!r}>".format(self.__class__.__name__, self.name)
+
+ def __enter__(self):
+ return self.name
+
+ def cleanup(self, _warn=False):
+ if self.name and not self._closed:
+ try:
+ self._rmtree(self.name)
+ except (TypeError, AttributeError) as ex:
+ # Issue #10188: Emit a warning on stderr
+ # if the directory could not be cleaned
+ # up due to missing globals
+ if "None" not in str(ex):
+ raise
+ print("ERROR: {!r} while cleaning up {!r}".format(ex, self,),
+ file=sys.stderr)
+ return
+ self._closed = True
+ if _warn:
+ self._warn("Implicitly cleaning up {!r}".format(self),
+ ResourceWarning)
+
+ def __exit__(self, exc, value, tb):
+ self.cleanup()
+
+ def __del__(self):
+ # Issue a ResourceWarning if implicit cleanup needed
+ self.cleanup(_warn=True)
+
+ # XXX (ncoghlan): The following code attempts to make
+ # this class tolerant of the module nulling out process
+ # that happens during CPython interpreter shutdown
+ # Alas, it doesn't actually manage it. See issue #10188
+ _listdir = staticmethod(_os.listdir)
+ _path_join = staticmethod(_os.path.join)
+ _isdir = staticmethod(_os.path.isdir)
+ _islink = staticmethod(_os.path.islink)
+ _remove = staticmethod(_os.remove)
+ _rmdir = staticmethod(_os.rmdir)
+ _warn = _warnings.warn
+
+ def _rmtree(self, path):
+ # Essentially a stripped down version of shutil.rmtree. We can't
+ # use globals because they may be None'ed out at shutdown.
+ for name in self._listdir(path):
+ fullname = self._path_join(path, name)
+ try:
+ isdir = self._isdir(fullname) and not self._islink(fullname)
+ except OSError:
+ isdir = False
+ if isdir:
+ self._rmtree(fullname)
+ else:
+ try:
+ self._remove(fullname)
+ except OSError:
+ pass
+ try:
+ self._rmdir(path)
+ except OSError:
+ pass
+
+
if PY26:
from operator import itemgetter
from heapq import nlargest
diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py
index 33f1a8a..daaf3ad 100644
--- a/nltk/corpus/__init__.py
+++ b/nltk/corpus/__init__.py
@@ -206,6 +206,8 @@ treebank_chunk = LazyCorpusLoader(
para_block_reader=tagged_treebank_para_block_reader, encoding='ascii')
treebank_raw = LazyCorpusLoader(
'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2')
+twitter_samples = LazyCorpusLoader(
+ 'twitter_samples', TwitterCorpusReader, '.*\.json')
udhr = LazyCorpusLoader(
'udhr', UdhrCorpusReader)
udhr2 = LazyCorpusLoader(
diff --git a/nltk/corpus/reader/__init__.py b/nltk/corpus/reader/__init__.py
index 047f358..de78ba1 100644
--- a/nltk/corpus/reader/__init__.py
+++ b/nltk/corpus/reader/__init__.py
@@ -93,6 +93,7 @@ from nltk.corpus.reader.framenet import *
from nltk.corpus.reader.udhr import *
from nltk.corpus.reader.bnc import *
from nltk.corpus.reader.sentiwordnet import *
+from nltk.corpus.reader.twitter import *
from nltk.corpus.reader.nkjp import *
from nltk.corpus.reader.crubadan import *
@@ -130,5 +131,5 @@ __all__ = [
'TimitTaggedCorpusReader', 'LinThesaurusCorpusReader',
'SemcorCorpusReader', 'FramenetCorpusReader', 'UdhrCorpusReader',
'BNCCorpusReader', 'SentiWordNetCorpusReader', 'SentiSynset',
- 'NKJPCorpusReader', 'CrubadanCorpusReader'
+ 'TwitterCorpusReader', 'NKJPCorpusReader', 'CrubadanCorpusReader'
]
diff --git a/nltk/corpus/reader/bnc.py b/nltk/corpus/reader/bnc.py
index d0e692d..a36934e 100644
--- a/nltk/corpus/reader/bnc.py
+++ b/nltk/corpus/reader/bnc.py
@@ -22,7 +22,7 @@ class BNCCorpusReader(XMLCorpusReader):
http://www.ota.ox.ac.uk/desc/2554
If you extracted the archive to a directory called `BNC`, then you can
- instantiate the reder as::
+ instantiate the reader as::
BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')
diff --git a/nltk/corpus/reader/childes.py b/nltk/corpus/reader/childes.py
index eaf373a..5ab4b3a 100644
--- a/nltk/corpus/reader/childes.py
+++ b/nltk/corpus/reader/childes.py
@@ -270,7 +270,7 @@ class CHILDESCorpusReader(XMLCorpusReader):
# select speakers
if speaker == 'ALL' or xmlsent.get('who') in speaker:
for xmlword in xmlsent.findall('.//{%s}w' % NS):
- infl = None ; suffixStem = None
+ infl = None ; suffixStem = None; suffixTag = None
# getting replaced words
if replace and xmlsent.find('.//{%s}w/{%s}replacement'
% (NS,NS)):
@@ -307,6 +307,8 @@ class CHILDESCorpusReader(XMLCorpusReader):
suffixStem = xmlsuffix.text
except AttributeError:
suffixStem = ""
+ if suffixStem:
+ word += "~"+suffixStem
# pos
if relation or pos:
try:
@@ -316,11 +318,22 @@ class CHILDESCorpusReader(XMLCorpusReader):
tag = xmlpos[0].text+":"+xmlpos2[0].text
else:
tag = xmlpos[0].text
- word = (word,tag)
except (AttributeError,IndexError) as e:
- word = (word,None)
- if suffixStem:
- suffixStem = (suffixStem,None)
+ tag = ""
+ try:
+ xmlsuffixpos = xmlword.findall('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c'
+ % (NS,NS,NS,NS,NS))
+ xmlsuffixpos2 = xmlword.findall('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s'
+ % (NS,NS,NS,NS,NS))
+ if xmlsuffixpos2:
+ suffixTag = xmlsuffixpos[0].text+":"+xmlsuffixpos2[0].text
+ else:
+ suffixTag = xmlsuffixpos[0].text
+ except:
+ pass
+ if suffixTag:
+ tag += "~"+suffixTag
+ word = (word, tag)
# relational
# the gold standard is stored in
# <mor></mor><mor type="trn"><gra type="grt">
@@ -357,8 +370,6 @@ class CHILDESCorpusReader(XMLCorpusReader):
except:
pass
sents.append(word)
- if suffixStem:
- sents.append(suffixStem)
if sent or relation:
results.append(sents)
else:
@@ -480,3 +491,4 @@ def demo(corpus_root=None):
if __name__ == "__main__":
demo()
+
diff --git a/nltk/corpus/reader/sentiwordnet.py b/nltk/corpus/reader/sentiwordnet.py
index b53ec56..ec678ce 100644
--- a/nltk/corpus/reader/sentiwordnet.py
+++ b/nltk/corpus/reader/sentiwordnet.py
@@ -129,6 +129,3 @@ class SentiSynset(object):
def __repr__(self):
return "Senti" + repr(self.synset)
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/corpus/reader/twitter.py b/nltk/corpus/reader/twitter.py
new file mode 100644
index 0000000..cd957a4
--- /dev/null
+++ b/nltk/corpus/reader/twitter.py
@@ -0,0 +1,157 @@
+# Natural Language Toolkit: Twitter Corpus Reader
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Ewan Klein <ewan at inf.ed.ac.uk>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A reader for corpora that consist of Tweets. It is assumed that the Tweets
+have been serialised into line-delimited JSON.
+"""
+
+import json
+import os
+
+from nltk import compat
+from nltk.tokenize import TweetTokenizer
+
+from nltk.corpus.reader.util import StreamBackedCorpusView, concat, ZipFilePathPointer
+from nltk.corpus.reader.api import CorpusReader
+
+
+class TwitterCorpusReader(CorpusReader):
+ """
+ Reader for corpora that consist of Tweets represented as a list of line-delimited JSON.
+
+ Individual Tweets can be tokenized using the default tokenizer, or by a
+ custom tokenizer specified as a parameter to the constructor.
+
+ Construct a new Tweet corpus reader for a set of documents
+ located at the given root directory.
+
+ If you made your own tweet collection in a directory called
+ `twitter-files`, then you can initialise the reader as::
+
+ from nltk.corpus import TwitterCorpusReader
+ reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json')
+
+ However, the recommended approach is to set the relevant directory as the
+ value of the environmental variable `TWITTER`, and then invoke the reader
+ as follows::
+
+ root = os.environ['TWITTER']
+ reader = TwitterCorpusReader(root, '.*\.json')
+
+ If you want to work directly with the raw Tweets, the `json` library can
+ be used::
+
+ import json
+ for tweet in reader.docs():
+ print(json.dumps(tweet, indent=1, sort_keys=True))
+
+ """
+
+ CorpusView = StreamBackedCorpusView
+ """
+ The corpus view class used by this reader.
+ """
+
+ def __init__(self, root, fileids=None,
+ word_tokenizer=TweetTokenizer(),
+ encoding='utf8'):
+ """
+
+ :param root: The root directory for this corpus.
+
+ :param fileids: A list or regexp specifying the fileids in this corpus.
+
+ :param word_tokenizer: Tokenizer for breaking the text of Tweets into
+ smaller units, including but not limited to words.
+
+ """
+ CorpusReader.__init__(self, root, fileids, encoding)
+
+ for path in self.abspaths(self._fileids):
+ if isinstance(path, ZipFilePathPointer):
+ pass
+ elif os.path.getsize(path) == 0:
+ raise ValueError("File {} is empty".format(path))
+ """Check that all user-created corpus files are non-empty."""
+
+ self._word_tokenizer = word_tokenizer
+
+
+
+ def docs(self, fileids=None):
+ """
+ Returns the full Tweet objects, as specified by `Twitter
+ documentation on Tweets
+ <https://dev.twitter.com/docs/platform-objects/tweets>`_
+
+ :return: the given file(s) as a list of dictionaries deserialised
+ from JSON.
+ :rtype: list(dict)
+ """
+ return concat([self.CorpusView(path, self._read_tweets, encoding=enc)
+ for (path, enc, fileid) in self.abspaths(fileids, True, True)])
+
+
+ def strings(self, fileids=None):
+ """
+ Returns only the text content of Tweets in the file(s)
+
+ :return: the given file(s) as a list of Tweets.
+ :rtype: list(str)
+ """
+ fulltweets = self.docs(fileids)
+ tweets = []
+ for jsono in fulltweets:
+ try:
+ text = jsono['text']
+ if isinstance(text, bytes):
+ text = text.decode(self.encoding)
+ tweets.append(text)
+ except KeyError:
+ pass
+ return tweets
+
+
+ def tokenized(self, fileids=None):
+ """
+ :return: the given file(s) as a list of the text content of Tweets as
+ a list of words, screennames, hashtags, URLs and punctuation symbols.
+
+ :rtype: list(list(str))
+ """
+ tweets = self.strings(fileids)
+ tokenizer = self._word_tokenizer
+ return [tokenizer.tokenize(t) for t in tweets]
+
+
+ def raw(self, fileids=None):
+ """
+ Return the corpora in their raw form.
+ """
+ if fileids is None:
+ fileids = self._fileids
+ elif isinstance(fileids, compat.string_types):
+ fileids = [fileids]
+ return concat([self.open(f).read() for f in fileids])
+
+
+ def _read_tweets(self, stream):
+ """
+ Assumes that each line in ``stream`` is a JSON-serialised object.
+ """
+ tweets = []
+ for i in range(10):
+ line = stream.readline()
+ if not line:
+ return tweets
+ tweet = json.loads(line)
+ tweets.append(tweet)
+ return tweets
+
+
+
diff --git a/nltk/corpus/reader/verbnet.py b/nltk/corpus/reader/verbnet.py
index 35a3851..78a0401 100644
--- a/nltk/corpus/reader/verbnet.py
+++ b/nltk/corpus/reader/verbnet.py
@@ -21,6 +21,18 @@ from nltk import compat
from nltk.corpus.reader.xmldocs import XMLCorpusReader
class VerbnetCorpusReader(XMLCorpusReader):
+ """
+ An NLTK interface to the VerbNet verb lexicon.
+
+ From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest
+ on-line verb lexicon currently available for English. It is a hierarchical
+ domain-independent, broad-coverage verb lexicon with mappings to other
+ lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), Xtag
+ (XTAG Research Group, 2001), and FrameNet (Baker et al., 1998)."
+
+ For details about VerbNet see:
+ http://verbs.colorado.edu/~mpalmer/projects/verbnet.html
+ """
# No unicode encoding param, since the data files are all XML.
def __init__(self, root, fileids, wrap_etree=False):
diff --git a/nltk/corpus/reader/wordnet.py b/nltk/corpus/reader/wordnet.py
index 477dea7..b4ee146 100644
--- a/nltk/corpus/reader/wordnet.py
+++ b/nltk/corpus/reader/wordnet.py
@@ -1512,9 +1512,8 @@ class WordNetCorpusReader(CorpusReader):
# adjective satellites are in the same file as
# adjectives so only yield the synset if it's actually
# a satellite
- if pos_tag == ADJ_SAT:
- if synset._pos == pos_tag:
- yield synset
+ if synset._pos == ADJ_SAT:
+ yield synset
# for all other POS tags, yield all synsets (this means
# that adjectives also include adjective satellites)
diff --git a/nltk/decorators.py b/nltk/decorators.py
index 3e01304..0c36943 100644
--- a/nltk/decorators.py
+++ b/nltk/decorators.py
@@ -191,8 +191,6 @@ def memoize(func, *args):
dic[args] = result
return result
-if __name__ == "__main__":
- import doctest; doctest.testmod()
########################## LEGALESE ###############################
diff --git a/nltk/internals.py b/nltk/internals.py
index cfe938e..4524405 100644
--- a/nltk/internals.py
+++ b/nltk/internals.py
@@ -10,6 +10,7 @@ from __future__ import print_function
import subprocess
import os
+import fnmatch
import re, sre_constants, sre_parse, sre_compile
import warnings
import textwrap
@@ -411,7 +412,7 @@ class Counter:
##########################################################################
def find_file_iter(filename, env_vars=(), searchpath=(),
- file_names=None, url=None, verbose=True):
+ file_names=None, url=None, verbose=True, finding_dir=False):
"""
Search for a file to be used by nltk.
@@ -455,6 +456,10 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
# Check environment variables
for env_var in env_vars:
if env_var in os.environ:
+ if finding_dir: # This is to find a directory instead of file
+ yielded = True
+ yield os.environ[env_var]
+
for env_dir in os.environ[env_var].split(os.pathsep):
# Check if the environment variable contains a direct path to the bin
if os.path.isfile(env_dir):
@@ -471,7 +476,11 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
yielded = True
yield path_to_file
# Check if the alternative is inside a 'file' directory
- path_to_file = os.path.join(env_dir, 'file', alternative)
+ # path_to_file = os.path.join(env_dir, 'file', alternative)
+
+ # Check if the alternative is inside a 'bin' directory
+ path_to_file = os.path.join(env_dir, 'bin', alternative)
+
if os.path.isfile(path_to_file):
if verbose:
print('[Found %s: %s]' % (filename, path_to_file))
@@ -518,11 +527,19 @@ def find_file_iter(filename, env_vars=(), searchpath=(),
div = '='*75
raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
+
def find_file(filename, env_vars=(), searchpath=(),
file_names=None, url=None, verbose=True):
return next(find_file_iter(filename, env_vars, searchpath,
file_names, url, verbose))
+
+def find_dir(filename, env_vars=(), searchpath=(),
+ file_names=None, url=None, verbose=True):
+ return next(find_file_iter(filename, env_vars, searchpath,
+ file_names, url, verbose, finding_dir=True))
+
+
def find_binary_iter(name, path_to_bin=None, env_vars=(), searchpath=(),
binary_names=None, url=None, verbose=True):
"""
@@ -662,6 +679,12 @@ def find_jar(name_pattern, path_to_jar=None, env_vars=(),
return next(find_jar_iter(name_pattern, path_to_jar, env_vars,
searchpath, url, verbose, is_regex))
+
+def find_jars_within_path(path_to_jars):
+ return [os.path.join(root, filename)
+ for root, dirnames, filenames in os.walk(path_to_jars)
+ for filename in fnmatch.filter(filenames, '*.jar')]
+
def _decode_stdoutdata(stdoutdata):
""" Convert data read from stdout/stderr to unicode """
if not isinstance(stdoutdata, bytes):
diff --git a/nltk/jsontags.py b/nltk/jsontags.py
index 3053568..4059501 100644
--- a/nltk/jsontags.py
+++ b/nltk/jsontags.py
@@ -43,18 +43,18 @@ class JSONTaggedDecoder(json.JSONDecoder):
@classmethod
def decode_obj(cls, obj):
- #Decode nested objects first.
+ # Decode nested objects first.
if isinstance(obj, dict):
- obj=dict((key, cls.decode_obj(val)) for (key, val) in obj.items())
+ obj = dict((key, cls.decode_obj(val)) for (key, val) in obj.items())
elif isinstance(obj, list):
- obj=list(cls.decode_obj(val) for val in obj)
- #Check if we have a tagged object.
+ obj = list(cls.decode_obj(val) for val in obj)
+ # Check if we have a tagged object.
if not isinstance(obj, dict) or len(obj) != 1:
return obj
obj_tag = next(iter(obj.keys()))
if not obj_tag.startswith('!'):
return obj
- if not obj_tag in json_tags:
+ if obj_tag not in json_tags:
raise ValueError('Unknown tag', obj_tag)
obj_cls = json_tags[obj_tag]
return obj_cls.decode_json_obj(obj[obj_tag])
diff --git a/nltk/metrics/segmentation.py b/nltk/metrics/segmentation.py
index 8d9f745..12d4d87 100644
--- a/nltk/metrics/segmentation.py
+++ b/nltk/metrics/segmentation.py
@@ -230,6 +230,3 @@ def setup_module(module):
raise SkipTest("numpy is required for nltk.metrics.segmentation")
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/parse/bllip.py b/nltk/parse/bllip.py
index 739ef1b..ee0f8cd 100644
--- a/nltk/parse/bllip.py
+++ b/nltk/parse/bllip.py
@@ -279,7 +279,4 @@ def setup_module(module):
raise SkipTest('doctests from nltk.parse.bllip are skipped because '
'the bllipparser module is not installed')
-if __name__ == '__main__':
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS)
diff --git a/nltk/parse/dependencygraph.py b/nltk/parse/dependencygraph.py
index f5f6ef8..72016f4 100755
--- a/nltk/parse/dependencygraph.py
+++ b/nltk/parse/dependencygraph.py
@@ -138,38 +138,75 @@ class DependencyGraph(object):
return node_address in self.nodes
def to_dot(self):
- """
- Returns a dot representation suitable for using with Graphviz
- @rtype C{String}
+ """Return a dot representation suitable for using with Graphviz.
+
+ >>> dg = DependencyGraph(
+ ... 'John N 2\\n'
+ ... 'loves V 0\\n'
+ ... 'Mary N 2'
+ ... )
+ >>> print(dg.to_dot())
+ digraph G{
+ edge [dir=forward]
+ node [shape=plaintext]
+ <BLANKLINE>
+ 0 [label="0 (None)"]
+ 0 -> 2 [label="ROOT"]
+ 1 [label="1 (John)"]
+ 2 [label="2 (loves)"]
+ 2 -> 1 [label=""]
+ 2 -> 3 [label=""]
+ 3 [label="3 (Mary)"]
+ }
+
"""
# Start the digraph specification
s = 'digraph G{\n'
s += 'edge [dir=forward]\n'
s += 'node [shape=plaintext]\n'
+
# Draw the remaining nodes
- for node in sorted(self.nodes.values()):
+ for node in sorted(self.nodes.values(), key=lambda v: v['address']):
s += '\n%s [label="%s (%s)"]' % (node['address'], node['address'], node['word'])
- for rel, deps in node['deps'].iteritems():
+ for rel, deps in node['deps'].items():
for dep in deps:
- if rel != None:
+ if rel is not None:
s += '\n%s -> %s [label="%s"]' % (node['address'], dep, rel)
else:
s += '\n%s -> %s ' % (node['address'], dep)
s += "\n}"
+
return s
def _repr_svg_(self):
- """Ipython magic: show SVG representation of the transducer"""
- dot_string = self.draw_dot()
- format = 'svg'
+ """Show SVG representation of the transducer (IPython magic).
+
+ >>> dg = DependencyGraph(
+ ... 'John N 2\\n'
+ ... 'loves V 0\\n'
+ ... 'Mary N 2'
+ ... )
+ >>> dg._repr_svg_().split('\\n')[0]
+ '<?xml version="1.0" encoding="UTF-8" standalone="no"?>'
+
+ """
+ dot_string = self.to_dot()
+
try:
- process = subprocess.Popen(['dot', '-T%s' % format], stdin=subprocess.PIPE,
- stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ process = subprocess.Popen(
+ ['dot', '-Tsvg'],
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ universal_newlines=True,
+ )
except OSError:
raise Exception('Cannot find the dot binary from Graphviz package')
out, err = process.communicate(dot_string)
if err:
- raise Exception('Cannot create %s representation by running dot from string\n:%s' % (format, dot_string))
+ raise Exception(
+ 'Cannot create svg representation by running dot from string: {}'
+ ''.format(dot_string))
return out
def __str__(self):
diff --git a/nltk/parse/evaluate.py b/nltk/parse/evaluate.py
index 88a47f5..bd20686 100644
--- a/nltk/parse/evaluate.py
+++ b/nltk/parse/evaluate.py
@@ -127,6 +127,3 @@ class DependencyEvaluator(object):
return corr / total, corrL / total
-if __name__ == '__main__':
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS)
diff --git a/nltk/parse/malt.py b/nltk/parse/malt.py
index b9455ed..49f6b9e 100644
--- a/nltk/parse/malt.py
+++ b/nltk/parse/malt.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to MaltParser
#
# Author: Dan Garrette <dhgarrette at gmail.com>
@@ -5,245 +6,375 @@
# Copyright (C) 2001-2015 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
+
from __future__ import print_function
+from __future__ import unicode_literals
+from six import text_type
import os
import tempfile
-import glob
-from operator import add
-from functools import reduce
import subprocess
+import inspect
-from nltk.data import ZipFilePathPointer
from nltk.tokenize import word_tokenize
-from nltk.internals import find_binary
+from nltk.tag import pos_tag
+from nltk.data import ZipFilePathPointer
+from nltk.internals import find_dir, find_file, find_jars_within_path
from nltk.parse.api import ParserI
from nltk.parse.dependencygraph import DependencyGraph
+from nltk.parse.util import taggedsents_to_conll
+
+
+def malt_regex_tagger():
+ from nltk.tag import RegexpTagger
+ _tagger = RegexpTagger(
+ [(r'\.$','.'), (r'\,$',','), (r'\?$','?'), # fullstop, comma, Qmark
+ (r'\($','('), (r'\)$',')'), # round brackets
+ (r'\[$','['), (r'\]$',']'), # square brackets
+ (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
+ (r'(The|the|A|a|An|an)$', 'DT'), # articles
+ (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'), # pronouns
+ (r'(His|his|Her|her|Its|its)$', 'PRP$'), # possessive
+ (r'(my|Your|your|Yours|yours)$', 'PRP$'), # possessive
+ (r'(on|On|in|In|at|At|since|Since)$', 'IN'),# time prepositions
+ (r'(for|For|ago|Ago|before|Before)$', 'IN'),# time prepositions
+ (r'(till|Till|until|Until)$', 'IN'), # time prepositions
+ (r'(by|By|beside|Beside)$', 'IN'), # space prepositions
+ (r'(under|Under|below|Below)$', 'IN'), # space prepositions
+ (r'(over|Over|above|Above)$', 'IN'), # space prepositions
+ (r'(across|Across|through|Through)$', 'IN'),# space prepositions
+ (r'(into|Into|towards|Towards)$', 'IN'), # space prepositions
+ (r'(onto|Onto|from|From)$', 'IN'), # space prepositions
+ (r'.*able$', 'JJ'), # adjectives
+ (r'.*ness$', 'NN'), # nouns formed from adjectives
+ (r'.*ly$', 'RB'), # adverbs
+ (r'.*s$', 'NNS'), # plural nouns
+ (r'.*ing$', 'VBG'), # gerunds
+ (r'.*ed$', 'VBD'), # past tense verbs
+ (r'.*', 'NN'), # nouns (default)
+ ])
+ return _tagger.tag
+
+
+def find_maltparser(parser_dirname):
+ """
+ A function to find the MaltParser .jar file and its dependencies.
+ """
+ if os.path.exists(parser_dirname): # If a full path is given.
+ _malt_dir = parser_dirname
+ else: # Try to find path to maltparser directory in environment variables.
+ _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))
+ # Checks that that the found directory contains all the necessary .jar
+ malt_dependencies = ['','','']
+ _malt_jars = set(find_jars_within_path(_malt_dir))
+ _jars = set(jar.rpartition('/')[2] for jar in _malt_jars)
+ malt_dependencies = set(['log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'])
+
+ assert malt_dependencies.issubset(_jars)
+ assert any(filter(lambda i: i.startswith('maltparser-') and i.endswith('.jar'), _jars))
+ return list(_malt_jars)
-class MaltParser(ParserI):
- def __init__(self, tagger=None, mco=None, working_dir=None, additional_java_args=None):
+def find_malt_model(model_filename):
+ """
+ A function to find a pre-trained MaltParser model.
+ """
+ if model_filename == None:
+ return 'malt_temp.mco'
+ elif os.path.exists(model_filename): # If a full path is given.
+ return model_filename
+ else: # Try to find path to malt model in environment variables.
+ return find_file(model_filename, env_vars=('MALT_MODEL',), verbose=False)
+
+
+class MaltParser(ParserI):
+ """
+ A class for dependency parsing with MaltParser. The input is the paths to:
+ - a maltparser directory
+ - (optionally) the path to a pre-trained MaltParser .mco model file
+ - (optionally) the tagger to use for POS tagging before parsing
+ - (optionally) additional Java arguments
+
+ Example:
+ >>> from nltk.parse import malt
+ >>> # With MALT_PARSER and MALT_MODEL environment set.
+ >>> mp = malt.MaltParser('maltparser-1.7.2', 'engmalt.linear-1.7.mco') # doctest: +SKIP
+ >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
+ (shot I (elephant an) (in (pajamas my)) .)
+ >>> # Without MALT_PARSER and MALT_MODEL environment.
+ >>> mp = malt.MaltParser('/home/user/maltparser-1.7.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP
+ >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
+ (shot I (elephant an) (in (pajamas my)) .)
+ """
+ def __init__(self, parser_dirname, model_filename=None, tagger=None, additional_java_args=None):
"""
An interface for parsing with the Malt Parser.
- :param mco: The name of the pre-trained model. If provided, training
- will not be required, and MaltParser will use the model file in
- ${working_dir}/${mco}.mco.
- :type mco: str
+ :param parser_dirname: The path to the maltparser directory that
+ contains the maltparser-1.x.jar
+ :type parser_dirname: str
+ :param model_filename: The name of the pre-trained model with .mco file
+ extension. If provided, training will not be required.
+ (see http://www.maltparser.org/mco/mco.html and
+ see http://www.patful.com/chalk/node/185)
+ :type model_filename: str
+ :param tagger: The tagger used to POS tag the raw string before
+ formatting to CONLL format. It should behave like `nltk.pos_tag`
+ :type tagger: function
+ :param additional_java_args: This is the additional Java arguments that
+ one can use when calling Maltparser, usually this is the heapsize
+ limits, e.g. `additional_java_args=['-Xmx1024m']`
+ (see http://goo.gl/mpDBvQ)
+ :type additional_java_args: list
+ """
+
+ # Find all the necessary jar files for MaltParser.
+ self.malt_jars = find_maltparser(parser_dirname)
+ # Initialize additional java arguments.
+ self.additional_java_args = additional_java_args if \
+ additional_java_args is not None else []
+ # Initialize model.
+ self.model = find_malt_model(model_filename)
+ self._trained = self.model != 'malt_temp.mco'
+ # Set the working_dir parameters i.e. `-w` from MaltParser's option.
+ self.working_dir = tempfile.gettempdir()
+ # Initialize POS tagger.
+ self.tagger = tagger if tagger is not None else malt_regex_tagger()
+
+ def pretrained_model_sanity_checks(self, tree_str):
"""
- self.config_malt()
- self.mco = 'malt_temp' if mco is None else mco
- self.working_dir = tempfile.gettempdir() if working_dir is None\
- else working_dir
- self.additional_java_args = [] if additional_java_args is None else additional_java_args
- self._trained = mco is not None
-
- if tagger is not None:
- self.tagger = tagger
- else:
- from nltk.tag import RegexpTagger
- self.tagger = RegexpTagger(
- [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
- (r'(The|the|A|a|An|an)$', 'AT'), # articles
- (r'.*able$', 'JJ'), # adjectives
- (r'.*ness$', 'NN'), # nouns formed from adjectives
- (r'.*ly$', 'RB'), # adverbs
- (r'.*s$', 'NNS'), # plural nouns
- (r'.*ing$', 'VBG'), # gerunds
- (r'.*ed$', 'VBD'), # past tense verbs
- (r'.*', 'NN') # nouns (default)
- ])
-
- def config_malt(self, bin=None, verbose=False):
+ Performs sanity checks and replace oddities in pre-trained model
+ outputs from http://www.maltparser.org/mco/english_parser/engmalt.html
+ Note: This hack function should go away once nltk.parse.DependencyGraph
+ handles optional TOP label!!!
+ :param tree_str: The CONLL output file for a single parse
+ :type tree_str: str
+ :return: str
"""
- Configure NLTK's interface to the ``malt`` package. This
- searches for a directory containing the malt jar
-
- :param bin: The full path to the ``malt`` binary. If not
- specified, then nltk will search the system for a ``malt``
- binary; and if one is not found, it will raise a
- ``LookupError`` exception.
- :type bin: str
+ # Checks for oddities in English pre-trained model.
+ if (
+ '\t0\tnull\t' in tree_str and
+ self.model.endswith(('engmalt.linear-1.7.mco', 'engmalt.poly-1.7.mco'))
+ ):
+ tree_str = tree_str.replace('\t0\tnull\t','\t0\tROOT\t')
+ # Checks for oddities in French pre-trained model.
+ if '\t0\troot\t' in tree_str and \
+ self.model.endswith('fremalt-1.7.mco'):
+ tree_str = tree_str.replace('\t0\troot\t','\t0\tROOT\t')
+ return tree_str
+
+ def parse_tagged_sents(self, sentences, verbose=False):
"""
- #: A list of directories that should be searched for the malt
- #: executables. This list is used by ``config_malt`` when searching
- #: for the malt executables.
- _malt_path = ['.',
- '/usr/lib/malt-1*',
- '/usr/share/malt-1*',
- '/usr/local/bin',
- '/usr/local/malt-1*',
- '/usr/local/bin/malt-1*',
- '/usr/local/malt-1*',
- '/usr/local/share/malt-1*']
-
- # Expand wildcards in _malt_path:
- malt_path = reduce(add, map(glob.glob, _malt_path))
-
- # Find the malt binary.
- self._malt_bin = find_binary('malt.jar', bin,
- searchpath=malt_path, env_vars=['MALT_PARSER'],
- url='http://www.maltparser.org/',
- verbose=verbose)
+ Use MaltParser to parse multiple POS tagged sentences. Takes multiple
+ sentences where each sentence is a list of (word, tag) tuples.
+ The sentences must have already been tokenized and tagged.
+ :param sentences: Input sentences to parse
+ :type sentences: list(list(tuple(str, str)))
+ :return: iter(iter(``DependencyGraph``)) the dependency graph
+ representation of each sentence
+ """
+ if not self._trained:
+ raise Exception("Parser has not been trained. Call train() first.")
+
+
+ with tempfile.NamedTemporaryFile(prefix='malt_input.conll.',
+ dir=self.working_dir, mode='w', delete=False) as input_file:
+ with tempfile.NamedTemporaryFile(prefix='malt_output.conll.',
+ dir=self.working_dir, mode='w', delete=False) as output_file:
+ # Convert list of sentences to CONLL format.
+ for line in taggedsents_to_conll(sentences):
+ input_file.write(text_type(line))
+ input_file.close()
+
+ # Generate command to run maltparser.
+ cmd =self.generate_malt_command(input_file.name,
+ output_file.name, mode="parse")
+
+ # This is a maltparser quirk, it needs to be run
+ # where the model file is. otherwise it goes into an awkward
+ # missing .jars or strange -w working_dir problem.
+ _current_path = os.getcwd() # Remembers the current path.
+ try: # Change to modelfile path
+ os.chdir(os.path.split(self.model)[0])
+ except:
+ pass
+ ret = self._execute(cmd, verbose) # Run command.
+ os.chdir(_current_path) # Change back to current path.
+
+ if ret is not 0:
+ raise Exception("MaltParser parsing (%s) failed with exit "
+ "code %d" % (' '.join(cmd), ret))
+
+ # Must return iter(iter(Tree))
+ with open(output_file.name) as infile:
+ for tree_str in infile.read().split('\n\n'):
+ tree_str = self.pretrained_model_sanity_checks(tree_str)
+ yield(iter([DependencyGraph(tree_str)]))
+
+ os.remove(input_file.name)
+ os.remove(output_file.name)
+
+
def parse_sents(self, sentences, verbose=False):
"""
- Use MaltParser to parse multiple sentences. Takes multiple sentences as a
- list where each sentence is a list of words.
- Each sentence will be automatically tagged with this MaltParser instance's
- tagger.
+ Use MaltParser to parse multiple sentences.
+ Takes a list of sentences, where each sentence is a list of words.
+ Each sentence will be automatically tagged with this
+ MaltParser instance's tagger.
:param sentences: Input sentences to parse
:type sentence: list(list(str))
:return: iter(DependencyGraph)
"""
- tagged_sentences = [self.tagger.tag(sentence) for sentence in sentences]
- return iter(self.tagged_parse_sents(tagged_sentences, verbose))
-
- def tagged_parse(self, sentence, verbose=False):
+ tagged_sentences = (self.tagger(sentence) for sentence in sentences)
+ return self.parse_tagged_sents(tagged_sentences, verbose)
+
+
+ def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
"""
- Use MaltParser to parse a sentence. Takes a sentence as a list of
- (word, tag) tuples; the sentence must have already been tokenized and
- tagged.
+ This function generates the maltparser command used at the terminal.
- :param sentence: Input sentence to parse
- :type sentence: list(tuple(str, str))
- :return: iter(DependencyGraph) the possible dependency graph representations of the sentence
+ :param inputfilename: path to the input file
+ :type inputfilename: str
+ :param outputfilename: path to the output file
+ :type outputfilename: str
"""
- return next(self.tagged_parse_sents([sentence], verbose))
- def tagged_parse_sents(self, sentences, verbose=False):
- """
- Use MaltParser to parse multiple sentences. Takes multiple sentences
- where each sentence is a list of (word, tag) tuples.
- The sentences must have already been tokenized and tagged.
+ cmd = ['java']
+ cmd += self.additional_java_args # Adds additional java arguments.
+ cmd += ['-cp', os.pathsep.join(self.malt_jars)] # jar classpath; os.pathsep is portable (';' on Windows)
+ cmd += ['org.maltparser.Malt'] # Adds the main function.
- :param sentences: Input sentences to parse
- :type sentence: list(list(tuple(str, str)))
- :return: iter(iter(``DependencyGraph``)) the dependency graph representation
- of each sentence
- """
+ # Adds the model file.
+ if os.path.exists(self.model): # when parsing
+ cmd += ['-c', os.path.split(self.model)[-1]]
+ else: # when learning
+ cmd += ['-c', self.model]
- if not self._malt_bin:
- raise Exception("MaltParser location is not configured. Call config_malt() first.")
- if not self._trained:
- raise Exception("Parser has not been trained. Call train() first.")
-
- input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll',
- dir=self.working_dir,
- delete=False)
- output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll',
- dir=self.working_dir,
- delete=False)
-
- try:
- for sentence in sentences:
- for (i, (word, tag)) in enumerate(sentence, start=1):
- input_str = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
- (i, word, '_', tag, tag, '_', '0', 'a', '_', '_')
- input_file.write(input_str.encode("utf8"))
- input_file.write(b'\n\n')
- input_file.close()
-
- cmd = ['java'] + self.additional_java_args + ['-jar', self._malt_bin,
- '-w', self.working_dir,
- '-c', self.mco, '-i', input_file.name,
- '-o', output_file.name, '-m', 'parse']
-
- ret = self._execute(cmd, verbose)
- if ret != 0:
- raise Exception("MaltParser parsing (%s) failed with exit "
- "code %d" % (' '.join(cmd), ret))
-
- # Must return iter(iter(Tree))
- return (iter([dep_graph]) for dep_graph in DependencyGraph.load(output_file.name))
- finally:
- input_file.close()
- os.remove(input_file.name)
- output_file.close()
- os.remove(output_file.name)
+ cmd += ['-i', inputfilename]
+ if mode == 'parse':
+ cmd += ['-o', outputfilename]
+ cmd += ['-m', mode] # mode used to generate parses.
+ return cmd
+
+ @staticmethod
+ def _execute(cmd, verbose=False):
+ output = None if verbose else subprocess.PIPE
+ p = subprocess.Popen(cmd, stdout=output, stderr=output)
+ return p.wait()
def train(self, depgraphs, verbose=False):
"""
Train MaltParser from a list of ``DependencyGraph`` objects
:param depgraphs: list of ``DependencyGraph`` objects for training input data
+ :type depgraphs: DependencyGraph
"""
- input_file = tempfile.NamedTemporaryFile(prefix='malt_train.conll',
- dir=self.working_dir,
- delete=False)
- try:
+
+ # Write the conll_str to malt_train.conll file in /tmp/
+ with tempfile.NamedTemporaryFile(prefix='malt_train.conll.',
+ dir=self.working_dir, mode='w', delete=False) as input_file:
input_str = ('\n'.join(dg.to_conll(10) for dg in depgraphs))
- input_file.write(input_str.encode("utf8"))
- input_file.close()
- self.train_from_file(input_file.name, verbose=verbose)
- finally:
- input_file.close()
- os.remove(input_file.name)
-
+ input_file.write(text_type(input_str))
+ # Trains the model with the malt_train.conll
+ self.train_from_file(input_file.name, verbose=verbose)
+ # Removes the malt_train.conll once training finishes.
+ os.remove(input_file.name)
+
def train_from_file(self, conll_file, verbose=False):
"""
Train MaltParser from a file
-
:param conll_file: str for the filename of the training input data
+ :type conll_file: str
"""
- if not self._malt_bin:
- raise Exception("MaltParser location is not configured. Call config_malt() first.")
- # If conll_file is a ZipFilePathPointer, then we need to do some extra
- # massaging
+ # If conll_file is a ZipFilePathPointer,
+ # then we need to do some extra massaging
if isinstance(conll_file, ZipFilePathPointer):
- input_file = tempfile.NamedTemporaryFile(prefix='malt_train.conll',
- dir=self.working_dir,
- delete=False)
- try:
- conll_str = conll_file.open().read()
- conll_file.close()
- input_file.write(conll_str)
- input_file.close()
+ with tempfile.NamedTemporaryFile(prefix='malt_train.conll.',
+ dir=self.working_dir, mode='w', delete=False) as input_file:
+ with conll_file.open() as conll_input_file:
+ conll_str = conll_input_file.read()
+ input_file.write(text_type(conll_str))
return self.train_from_file(input_file.name, verbose=verbose)
- finally:
- input_file.close()
- os.remove(input_file.name)
-
- cmd = ['java', '-jar', self._malt_bin, '-w', self.working_dir,
- '-c', self.mco, '-i', conll_file, '-m', 'learn']
+ # Generate command to run maltparser.
+ cmd = self.generate_malt_command(conll_file, mode="learn")
ret = self._execute(cmd, verbose)
if ret != 0:
- raise Exception("MaltParser training (%s) "
- "failed with exit code %d" %
- (' '.join(cmd), ret))
-
+ raise Exception("MaltParser training (%s) failed with exit "
+ "code %d" % (' '.join(cmd), ret))
self._trained = True
- @staticmethod
- def _execute(cmd, verbose=False):
- output = None if verbose else subprocess.PIPE
- p = subprocess.Popen(cmd, stdout=output, stderr=output)
- return p.wait()
-
-
-def demo():
- dg1 = DependencyGraph("""1 John _ NNP _ _ 2 SUBJ _ _
- 2 sees _ VB _ _ 0 ROOT _ _
- 3 a _ DT _ _ 4 SPEC _ _
- 4 dog _ NN _ _ 2 OBJ _ _
- """)
- dg2 = DependencyGraph("""1 John _ NNP _ _ 2 SUBJ _ _
- 2 walks _ VB _ _ 0 ROOT _ _
- """)
-
- verbose = False
-
- maltParser = MaltParser()
- maltParser.train([dg1,dg2], verbose=verbose)
-
- maltParser.parse_one(['John','sees','Mary'], verbose=verbose).tree().pprint()
- maltParser.parse_one(['a','man','runs'], verbose=verbose).tree().pprint()
-
- next(maltParser.tagged_parse([('John','NNP'),('sees','VB'),('Mary','NNP')], verbose)).tree().pprint()
if __name__ == '__main__':
- demo()
+ '''
+ A demonstration function to show how NLTK users can use the malt parser API.
+
+ >>> assert 'MALT_PARSER' in os.environ, str(
+ ... "Please set MALT_PARSER in your global environment, e.g.:\n"
+ ... "$ export MALT_PARSER='/home/user/maltparser-1.7.2/'")
+ >>>
+ >>> assert 'MALT_MODEL' in os.environ, str(
+ ... "Please set MALT_MODEL in your global environment, e.g.:\n"
+ ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'")
+ >>>
+ >>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
+ ... "2 sees _ VB _ _ 0 ROOT _ _\n"
+ ... "3 a _ DT _ _ 4 SPEC _ _\n"
+ ... "4 dog _ NN _ _ 2 OBJ _ _\n"
+ ... "5 . _ . _ _ 2 PUNCT _ _\n")
+ >>>
+ >>>
+ >>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
+ ... "2 walks _ VB _ _ 0 ROOT _ _\n"
+ ... "3 . _ . _ _ 2 PUNCT _ _\n")
+ >>> dg1 = DependencyGraph(_dg1_str)
+ >>> dg2 = DependencyGraph(_dg2_str)
+ >>> # Initialize a MaltParser object
+ >>> parser_dirname = 'maltparser-1.7.2'
+ >>> mp = MaltParser(parser_dirname=parser_dirname)
+ >>>
+ >>> # Trains a model.
+ >>> mp.train([dg1,dg2], verbose=False)
+ >>> sent1 = ['John','sees','Mary', '.']
+ >>> sent2 = ['John', 'walks', 'a', 'dog', '.']
+ >>>
+ >>> # Parse a single sentence.
+ >>> parsed_sent1 = mp.parse_one(sent1)
+ >>> parsed_sent2 = mp.parse_one(sent2)
+ >>> print (parsed_sent1.tree())
+ (sees John Mary .)
+ >>> print (parsed_sent2.tree())
+ (walks John (dog a) .)
+ >>>
+ >>> # Parsing multiple sentences.
+ >>> sentences = [sent1,sent2]
+ >>> parsed_sents = mp.parse_sents(sentences)
+ >>> print(next(next(parsed_sents)).tree())
+ (sees John Mary .)
+ >>> print(next(next(parsed_sents)).tree())
+ (walks John (dog a) .)
+ >>>
+ >>> # Initialize a MaltParser object with an English pre-trained model.
+ >>> parser_dirname = 'maltparser-1.7.2'
+ >>> model_name = 'engmalt.linear-1.7.mco'
+ >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)
+ >>> sent1 = 'I shot an elephant in my pajamas .'.split()
+ >>> sent2 = 'Time flies like banana .'.split()
+ >>> # Parse a single sentence.
+ >>> print(mp.parse_one(sent1).tree())
+ (shot I (elephant an) (in (pajamas my)) .)
>>> # Parsing multiple sentences
+ >>> sentences = [sent1,sent2]
+ >>> parsed_sents = mp.parse_sents(sentences)
+ >>> print(next(next(parsed_sents)).tree())
+ (shot I (elephant an) (in (pajamas my)) .)
+ >>> print(next(next(parsed_sents)).tree())
+ (flies Time (like banana) .)
+ '''
+ import doctest
+ doctest.testmod()
+
diff --git a/nltk/parse/stanford.py b/nltk/parse/stanford.py
index d385700..11379f9 100644
--- a/nltk/parse/stanford.py
+++ b/nltk/parse/stanford.py
@@ -248,6 +248,3 @@ def setup_module(module):
except LookupError:
raise SkipTest('doctests from nltk.parse.stanford are skipped because the stanford parser jar doesn\'t exist')
-if __name__ == '__main__':
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS)
diff --git a/nltk/parse/transitionparser.py b/nltk/parse/transitionparser.py
index ae39701..e9ffc37 100644
--- a/nltk/parse/transitionparser.py
+++ b/nltk/parse/transitionparser.py
@@ -770,6 +770,3 @@ def demo():
Note that result is very poor because of only one training example.
"""
-if __name__ == '__main__':
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS)
diff --git a/nltk/parse/util.py b/nltk/parse/util.py
index 6120760..18c5824 100644
--- a/nltk/parse/util.py
+++ b/nltk/parse/util.py
@@ -76,6 +76,65 @@ def load_parser(grammar_url, trace=0,
chart_class = Chart
return parser(grammar, trace=trace, chart_class=chart_class)
+def taggedsent_to_conll(sentence):
+ """
+ Convert a single POS tagged sentence into CONLL format.
+
+ >>> from nltk import word_tokenize, pos_tag
+ >>> text = "This is a foobar sentence."
+ >>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))):
+ ... print(line, end="")
+ 1 This _ DT DT _ 0 a _ _
+ 2 is _ VBZ VBZ _ 0 a _ _
+ 3 a _ DT DT _ 0 a _ _
+ 4 foobar _ NN NN _ 0 a _ _
+ 5 sentence _ NN NN _ 0 a _ _
+ 6 . _ . . _ 0 a _ _
+
+ :param sentence: A single input sentence to parse
+ :type sentence: list(tuple(str, str))
+ :rtype: iter(str)
+ :return: a generator yielding a single sentence in CONLL format.
+ """
+ for (i, (word, tag)) in enumerate(sentence, start=1):
+ input_str = [str(i), word, '_', tag, tag, '_', '0', 'a', '_', '_']
+ input_str = "\t".join(input_str) + "\n"
+ yield input_str
+
+
+def taggedsents_to_conll(sentences):
+ """
+ Convert a POS tagged document stream
+ (i.e. list of list of tuples, a list of sentences) and yield lines
+ in CONLL format. This module yields one line per word and two newlines
+ for end of sentence.
+
+ >>> from nltk import word_tokenize, sent_tokenize, pos_tag
+ >>> text = "This is a foobar sentence. Is that right?"
+ >>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)]
+ >>> for line in taggedsents_to_conll(sentences):
+ ... if line:
+ ... print(line, end="")
+ 1 This _ DT DT _ 0 a _ _
+ 2 is _ VBZ VBZ _ 0 a _ _
+ 3 a _ DT DT _ 0 a _ _
+ 4 foobar _ NN NN _ 0 a _ _
+ 5 sentence _ NN NN _ 0 a _ _
+ 6 . _ . . _ 0 a _ _
+ 1 Is _ VBZ VBZ _ 0 a _ _
+ 2 that _ IN IN _ 0 a _ _
+ 3 right _ JJ JJ _ 0 a _ _
+ 4 ? _ . . _ 0 a _ _
+
+ :param sentences: Input sentences to parse
+ :type sentences: list(list(tuple(str, str)))
+ :rtype: iter(str)
+ :return: a generator yielding sentences in CONLL format.
+ """
+ for sentence in sentences:
+ for input_str in taggedsent_to_conll(sentence):
+ yield input_str
+ yield '\n\n'
######################################################################
#{ Test Suites
diff --git a/nltk/sem/glue.py b/nltk/sem/glue.py
index 69166b3..7b36fef 100644
--- a/nltk/sem/glue.py
+++ b/nltk/sem/glue.py
@@ -230,10 +230,11 @@ class GlueDict(dict):
def to_glueformula_list(self, depgraph, node=None, counter=None, verbose=False):
if node is None:
+ # TODO: should it be depgraph.root? Is this code tested?
top = depgraph.nodes[0]
depList = sum(list(top['deps'].values()), [])
root = depgraph.nodes[depList[0]]
- #print (root)
+
return self.to_glueformula_list(depgraph, root, Counter(), verbose)
glueformulas = self.lookup(node, depgraph, counter)
@@ -540,13 +541,21 @@ class Glue(object):
return [self.gfl_to_compiled(gfl) for gfl in gfls]
def dep_parse(self, sentence):
+ """
+ Return a dependency graph for the sentence.
+
+ :param sentence: the sentence to be parsed
+ :type sentence: list(str)
+ :rtype: DependencyGraph
+ """
+
#Lazy-initialize the depparser
if self.depparser is None:
from nltk.parse import MaltParser
self.depparser = MaltParser(tagger=self.get_pos_tagger())
if not self.depparser._trained:
self.train_depparser()
- return [self.depparser.parse(sentence, verbose=self.verbose)]
+ return self.depparser.parse(sentence, verbose=self.verbose)
def depgraph_to_glue(self, depgraph):
return self.get_glue_dict().to_glueformula_list(depgraph)
diff --git a/nltk/stem/__init__.py b/nltk/stem/__init__.py
index f495c0a..72fa1f3 100644
--- a/nltk/stem/__init__.py
+++ b/nltk/stem/__init__.py
@@ -31,6 +31,3 @@ from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.rslp import RSLPStemmer
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/stem/api.py b/nltk/stem/api.py
index 09a82e2..1bbea64 100644
--- a/nltk/stem/api.py
+++ b/nltk/stem/api.py
@@ -23,6 +23,3 @@ class StemmerI(object):
raise NotImplementedError()
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/stem/isri.py b/nltk/stem/isri.py
index 2ddc6ca..bb854a1 100644
--- a/nltk/stem/isri.py
+++ b/nltk/stem/isri.py
@@ -343,6 +343,3 @@ class ISRIStemmer(StemmerI):
return word
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/stem/lancaster.py b/nltk/stem/lancaster.py
index 185d4ff..dbabcfd 100644
--- a/nltk/stem/lancaster.py
+++ b/nltk/stem/lancaster.py
@@ -308,6 +308,3 @@ class LancasterStemmer(StemmerI):
return '<LancasterStemmer>'
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/stem/porter.py b/nltk/stem/porter.py
index 721c90b..ac03067 100644
--- a/nltk/stem/porter.py
+++ b/nltk/stem/porter.py
@@ -690,7 +690,3 @@ def demo():
##--NLTK--
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
-
diff --git a/nltk/stem/regexp.py b/nltk/stem/regexp.py
index ee51cd8..e738378 100644
--- a/nltk/stem/regexp.py
+++ b/nltk/stem/regexp.py
@@ -58,7 +58,4 @@ class RegexpStemmer(StemmerI):
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/stem/rslp.py b/nltk/stem/rslp.py
index 9e429a2..6487a2c 100644
--- a/nltk/stem/rslp.py
+++ b/nltk/stem/rslp.py
@@ -140,6 +140,3 @@ class RSLPStemmer(StemmerI):
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/stem/snowball.py b/nltk/stem/snowball.py
index 1f4b751..76d62e0 100644
--- a/nltk/stem/snowball.py
+++ b/nltk/stem/snowball.py
@@ -3710,6 +3710,3 @@ def demo():
print("\n")
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/stem/wordnet.py b/nltk/stem/wordnet.py
index 092a449..1c8b71a 100644
--- a/nltk/stem/wordnet.py
+++ b/nltk/stem/wordnet.py
@@ -49,6 +49,3 @@ def teardown_module(module=None):
from nltk.corpus import wordnet
wordnet._unload()
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tag/__init__.py b/nltk/tag/__init__.py
index 1dce867..fccf506 100644
--- a/nltk/tag/__init__.py
+++ b/nltk/tag/__init__.py
@@ -114,6 +114,3 @@ def pos_tag_sents(sentences):
return tagger.tag_sents(sentences)
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tag/api.py b/nltk/tag/api.py
index fab972a..236a488 100644
--- a/nltk/tag/api.py
+++ b/nltk/tag/api.py
@@ -79,6 +79,3 @@ class FeaturesetTaggerI(TaggerI):
"""
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tag/brill.py b/nltk/tag/brill.py
index 3aa46f8..5d2f646 100644
--- a/nltk/tag/brill.py
+++ b/nltk/tag/brill.py
@@ -424,6 +424,3 @@ class BrillTagger(TaggerI):
return (tagged_tokenses, testing_stats)
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tag/brill_trainer.py b/nltk/tag/brill_trainer.py
index 5fa43e5..7ed4583 100644
--- a/nltk/tag/brill_trainer.py
+++ b/nltk/tag/brill_trainer.py
@@ -11,12 +11,10 @@
from __future__ import print_function, division
import bisect
-from collections import defaultdict
-
import textwrap
+from collections import defaultdict
-from nltk.tag.util import untag
-from nltk.tag.brill import BrillTagger
+from nltk.tag import untag, BrillTagger
######################################################################
# Brill Tagger Trainer
@@ -608,6 +606,3 @@ class BrillTaggerTrainer(object):
print(prefix)
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tag/brill_trainer_orig.py b/nltk/tag/brill_trainer_orig.py
deleted file mode 100644
index b8a4423..0000000
--- a/nltk/tag/brill_trainer_orig.py
+++ /dev/null
@@ -1,414 +0,0 @@
-# -*- coding: utf-8 -*-
-# Natural Language Toolkit: Transformation-based learning
-#
-# Copyright (C) 2001-2013 NLTK Project
-# Author: Marcus Uneson <marcus.uneson at gmail.com>
-# based on previous (nltk2) version by
-# Christopher Maloof, Edward Loper, Steven Bird
-# URL: <http://nltk.org/>
-# For license information, see LICENSE.TXT
-
-from __future__ import print_function, division
-
-from collections import defaultdict
-import textwrap
-
-from nltk.tag.util import untag
-from nltk.tag.brill import BrillTagger
-
-######################################################################
-## Original Brill Tagger Trainer
-######################################################################
-
-class BrillTaggerTrainer(object):
- """
- A trainer for tbl taggers, superseded by nltk.tag.brill_trainer.BrillTaggerTrainer
-
- :param deterministic: If true, then choose between rules that
- have the same score by picking the one whose __repr__
- is lexicographically smaller. If false, then just pick the
- first rule we find with a given score -- this will depend
- on the order in which keys are returned from dictionaries,
- and so may not be the same from one run to the next. If
- not specified, treat as true iff trace > 0.
- """
-
- def __init__(self, initial_tagger, templates, trace=0,
- deterministic=None, ruleformat="str"):
- if deterministic is None:
- deterministic = (trace > 0)
- self._initial_tagger = initial_tagger
- self._templates = templates
- self._trace = trace
- self._deterministic = deterministic
- self._ruleformat = ruleformat
-
- #////////////////////////////////////////////////////////////
- # Training
- #////////////////////////////////////////////////////////////
-
- def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
- """
- Trains the Brill tagger on the corpus *train_sents*,
- producing at most *max_rules* transformations, each of which
- reduces the net number of errors in the corpus by at least
- *min_score*, and each of which has accuracy not lower than
- *min_acc*.
-
- #imports
- >>> from nltk.tbl.template import Template
- >>> from nltk.tag.brill import Pos, Word
- >>> from nltk.tag import RegexpTagger
- >>> from nltk.tag.brill_trainer_orig import BrillTaggerTrainer
-
- #some data
- >>> from nltk.corpus import treebank
- >>> training_data = treebank.tagged_sents()[:100]
- >>> baseline_data = treebank.tagged_sents()[100:200]
- >>> gold_data = treebank.tagged_sents()[200:300]
- >>> testing_data = [untag(s) for s in gold_data]
-
- >>> backoff = RegexpTagger([
- ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
- ... (r'(The|the|A|a|An|an)$', 'AT'), # articles
- ... (r'.*able$', 'JJ'), # adjectives
- ... (r'.*ness$', 'NN'), # nouns formed from adjectives
- ... (r'.*ly$', 'RB'), # adverbs
- ... (r'.*s$', 'NNS'), # plural nouns
- ... (r'.*ing$', 'VBG'), # gerunds
- ... (r'.*ed$', 'VBD'), # past tense verbs
- ... (r'.*', 'NN') # nouns (default)
- ... ])
-
- >>> baseline = backoff #see NOTE1
-
- >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
- 0.2450142...
-
- #templates
- >>> Template._cleartemplates() #clear any templates created in earlier tests
- >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]
-
- #construct a BrillTaggerTrainer
- >>> tt = BrillTaggerTrainer(baseline, templates, trace=3)
- >>> tagger1 = tt.train(training_data, max_rules=10)
- TBL train (orig) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
- <BLANKLINE>
- B |
- S F r O | Score = Fixed - Broken
- c i o t | R Fixed = num tags changed incorrect -> correct
- o x k h | u Broken = num tags changed correct -> incorrect
- r e e e | l Other = num tags changed incorrect -> incorrect
- e d n r | e
- ------------------+-------------------------------------------------------
- 132 132 0 0 | AT->DT if Pos:NN@[-1]
- 85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0]
- 69 69 0 0 | NN->. if Pos:NN@[-1] & Word:.@[0]
- 51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0]
- 47 63 16 161 | NN->IN if Pos:NNS@[-1]
- 33 33 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0]
- 26 26 0 0 | IN->. if Pos:NNS@[-1] & Word:.@[0]
- 24 24 0 0 | IN->, if Pos:NNS@[-1] & Word:,@[0]
- 22 27 5 24 | NN->-NONE- if Pos:VBD@[-1]
- 17 17 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0]
-
-
-
- >>> tagger1.rules()[1:3]
- (Rule('001', 'NN', ',', [(Pos([-1]),'NN'), (Word([0]),',')]), Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]))
-
-
- >>> train_stats = tagger1.train_stats()
- >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
- [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]
-
-
- ##FIXME: the following test fails -- why?
- #
- #>>> tagger1.print_template_statistics(printunused=False)
- #TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules)
- #TRAIN ( 3163 tokens) initial 2358 0.2545 final: 1719 0.4565
- ##ID | Score (train) | #Rules | Template
- #--------------------------------------------
- #001 | 404 0.632 | 7 0.700 | Template(Pos([-1]),Word([0]))
- #000 | 235 0.368 | 3 0.300 | Template(Pos([-1]))
- #<BLANKLINE>
- #<BLANKLINE>
-
- >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
- 0.43996...
-
- >>> (tagged, test_stats) = tagger1.batch_tag_incremental(testing_data, gold_data)
-
-
- >>> tagged[33][12:] == [('foreign', 'IN'), ('debt', 'NN'), ('of', 'IN'), ('$', 'NN'), ('64', 'CD'),
- ... ('billion', 'NN'), ('*U*', 'NN'), ('--', 'NN'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'NN'),
- ... ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')]
- True
-
-
- >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
- [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]
-
- ##a high-accuracy tagger
- >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
- TBL train (orig) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
- <BLANKLINE>
- B |
- S F r O | Score = Fixed - Broken
- c i o t | R Fixed = num tags changed incorrect -> correct
- o x k h | u Broken = num tags changed correct -> incorrect
- r e e e | l Other = num tags changed incorrect -> incorrect
- e d n r | e
- ------------------+-------------------------------------------------------
- 132 132 0 0 | AT->DT if Pos:NN@[-1]
- 85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0]
- 69 69 0 0 | NN->. if Pos:NN@[-1] & Word:.@[0]
- 51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0]
- 36 36 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0]
- 26 26 0 0 | NN->. if Pos:NNS@[-1] & Word:.@[0]
- 24 24 0 0 | NN->, if Pos:NNS@[-1] & Word:,@[0]
- 19 19 0 6 | NN->VB if Pos:TO@[-1]
- 18 18 0 0 | CD->-NONE- if Pos:NN@[-1] & Word:0@[0]
- 18 18 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0]
-
-
- >>> tagger2.evaluate(gold_data) # doctest: +ELLIPSIS
- 0.44159544...
-
- >>> tagger2.rules()[2:4]
- (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))
-
- #NOTE1: (!!FIXME) A far better baseline uses nltk.tag.UnigramTagger,
- #with a RegexpTagger only as backoff. For instance,
- #>>> baseline = UnigramTagger(baseline_data, backoff=backoff)
- #However, as of Nov 2013, nltk.tag.UnigramTagger does not yield consistent results
- #between python versions. The simplistic backoff above is a workaround to make doctests
- #get consistent input.
-
- :param train_sents: training data
- :type train_sents: list(list(tuple))
- :param max_rules: output at most max_rules rules
- :type max_rules: int
- :param min_score: stop training when no rules better than min_score can be found
- :type min_score: int
- :param min_acc: discard any rule with lower accuracy than min_acc
- :type min_acc: float or None
- :return: the learned tagger
- :rtype: BrillTagger
-
-
- :param train_sents: training data
- :type train_sents: list(list(tuple))
- :param max_rules: output at most max_rules rules
- :type max_rules: int
- :param min_score: stop training when no rules better than min_score can be found
- :type min_score: int
- :param min_acc: discard any rule with lower accuracy than min_acc
- :type min_acc: float or None
- :return: the learned tagger
- :rtype: BrillTagger
-
- """
-
- # Create a new copy of the training corpus, and run the
- # initial tagger on it. We will progressively update this
- # test corpus to look more like the training corpus.
- test_sents = [self._initial_tagger.tag(untag(sent))
- for sent in train_sents]
- trainstats = {}
- trainstats['min_acc'] = min_acc
- trainstats['min_score'] = min_score
- trainstats['tokencount'] = sum(len(t) for t in test_sents)
- trainstats['sequencecount'] = len(test_sents)
- trainstats['templatecount'] = len(self._templates)
- trainstats['rulescores'] = []
- trainstats['initialerrors'] = sum(tag[1] != truth[1]
- for paired in zip(test_sents, train_sents)
- for (tag, truth) in zip(*paired))
- trainstats['initialacc'] = 1 - trainstats['initialerrors']/trainstats['tokencount']
- if self._trace > 0:
- print("TBL train (orig) (seqs: {sequencecount}; tokens: {tokencount}; "
- "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})".format(**trainstats))
-
- if self._trace > 2:
- self._trace_header()
-
- # Look for useful rules.
- rules = []
- try:
- while len(rules) < max_rules:
- (rule, score, fixscore) = self._best_rule(test_sents,
- train_sents, min_acc=min_acc)
- if rule is None or score < min_score:
- if self._trace > 1:
- print('Insufficient improvement; stopping')
- break
- else:
- # Add the rule to our list of rules.
- rules.append(rule)
- trainstats['rulescores'].append(score)
- # Use the rules to update the test corpus. Keep
- # track of how many times the rule applied (k).
- k = 0
- for sent in test_sents:
- k += len(rule.apply(sent))
- # Display trace output.
- if self._trace > 1:
- self._trace_rule(rule, score, fixscore, k)
- # The user can also cancel training manually:
- except KeyboardInterrupt:
- print("Training stopped manually -- %d rules found" % len(rules))
-
- trainstats['finalerrors'] = trainstats['initialerrors'] - sum(trainstats['rulescores'])
- trainstats['finalacc'] = 1 - trainstats['finalerrors']/trainstats['tokencount']
- # Create and return a tagger from the rules we found.
- return BrillTagger(self._initial_tagger, rules, trainstats)
-
- #////////////////////////////////////////////////////////////
- # Finding the best rule
- #////////////////////////////////////////////////////////////
-
- # Finds the rule that makes the biggest net improvement in the corpus.
- # Returns a (rule, score) pair.
- def _best_rule(self, test_sents, train_sents, min_acc):
- # Create a dictionary mapping from each tag to a list of the
- # indices that have that tag in both test_sents and
- # train_sents (i.e., where it is correctly tagged).
- correct_indices = defaultdict(list)
- for sentnum, sent in enumerate(test_sents):
- for wordnum, tagged_word in enumerate(sent):
- if tagged_word[1] == train_sents[sentnum][wordnum][1]:
- tag = tagged_word[1]
- correct_indices[tag].append( (sentnum, wordnum) )
-
- # Find all the rules that correct at least one token's tag,
- # and the number of tags that each rule corrects (in
- # descending order of number of tags corrected).
- rules = self._find_rules(test_sents, train_sents)
-
- # Keep track of the current best rule, and its score.
- best_rule, best_score, best_fixscore = None, 0, 0
-
- # Consider each rule, in descending order of fixscore (the
- # number of tags that the rule corrects, not including the
- # number that it breaks).
- for (rule, fixscore) in rules:
- # The actual score must be <= fixscore; so if best_score
- # is bigger than fixscore, then we already have the best
- # rule.
- if best_score > fixscore or (best_score == fixscore and
- not self._deterministic):
- return best_rule, best_score, best_fixscore
-
- # Calculate the actual score, by decrementing score (initialized
- # to fixscore once for each tag that the rule changes to an incorrect
- # value.
- score = fixscore
- if rule.original_tag in correct_indices:
- for (sentnum, wordnum) in correct_indices[rule.original_tag]:
- if rule.applies(test_sents[sentnum], wordnum):
- score -= 1
- # If the rule accuracy goes below min_acc,
- # this rule is not eligible; so move on
-
- if min_acc is not None and fixscore/(2*fixscore-score) < min_acc:
- break
- # If the score goes below best_score, then we know
- # that this isn't the best rule; so move on
- if score < best_score or (score == best_score and
- not self._deterministic):
- break
-
- # If the actual score is better than the best score, then
- # update best_score and best_rule.
- if (( min_acc is None or #IF: either no threshold for accuracy,
- fixscore/(2*fixscore-score) >= min_acc) and #or accuracy good enough AND
- ( score > best_score or #(score is higher than current leader OR
- (score == best_score and #score is same as leader, but this
- self._deterministic and #rule sorts before it when determinism
- repr(rule) < repr(best_rule)))): # is asked for): THEN update...
- best_rule, best_score, best_fixscore = rule, score, fixscore
-
- # Return the best rule, and its score.
- return best_rule, best_score, best_fixscore
-
- def _find_rules(self, test_sents, train_sents):
- """
- Find all rules that correct at least one token's tag in *test_sents*.
-
- :return: A list of tuples ``(rule, fixscore)``, where rule
- is a tbl rule and ``fixscore`` is the number of tokens
- whose tag the rule corrects. Note that ``fixscore`` does
- *not* include the number of tokens whose tags are changed
- to incorrect values.
- """
-
- # Create a list of all indices that are incorrectly tagged.
- error_indices = []
- for sentnum, sent in enumerate(test_sents):
- for wordnum, tagged_word in enumerate(sent):
- if tagged_word[1] != train_sents[sentnum][wordnum][1]:
- error_indices.append( (sentnum, wordnum) )
-
- # Create a dictionary mapping from rules to their positive-only
- # scores.
- rule_score_dict = defaultdict(int)
- for (sentnum, wordnum) in error_indices:
- test_sent = test_sents[sentnum]
- train_sent = train_sents[sentnum]
- for rule in self._find_rules_at(test_sent, train_sent, wordnum):
- rule_score_dict[rule] += 1
-
- # Convert the dictionary into a list of (rule, score) tuples,
- # sorted in descending order of score.
- return sorted(rule_score_dict.items(),
- key=lambda rule_score: -rule_score[1])
-
- def _find_rules_at(self, test_sent, train_sent, i):
- """
- :rtype: set
- :return: the set of all rules (based on the templates) that
- correct token *i*'s tag in *test_sent*.
- """
- applicable_rules = set()
- if test_sent[i][1] != train_sent[i][1]:
- correct_tag = train_sent[i][1]
- for template in self._templates:
- new_rules = template.applicable_rules(test_sent, i,
- correct_tag)
- applicable_rules.update(new_rules)
-
- return applicable_rules
-
- #////////////////////////////////////////////////////////////
- # Tracing
- #////////////////////////////////////////////////////////////
-
- def _trace_header(self):
- print("""
- B |
- S F r O | Score = Fixed - Broken
- c i o t | R Fixed = num tags changed incorrect -> correct
- o x k h | u Broken = num tags changed correct -> incorrect
- r e e e | l Other = num tags changed incorrect -> incorrect
- e d n r | e
-------------------+-------------------------------------------------------
- """.rstrip())
-
- def _trace_rule(self, rule, score, fixscore, numchanges):
- rulestr = rule.format(self._ruleformat)
-
- if self._trace > 2:
- print(('%4d%4d%4d%4d ' % (score, fixscore, fixscore-score,
- numchanges-fixscore*2+score)), '|', end=' ')
- print(textwrap.fill(rulestr, initial_indent=' '*20, width=79,
- subsequent_indent=' '*18+'| ').strip())
- else:
- print(rulestr)
-
-
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
-
diff --git a/nltk/tag/crf.py b/nltk/tag/crf.py
index 0825944..096426a 100644
--- a/nltk/tag/crf.py
+++ b/nltk/tag/crf.py
@@ -198,6 +198,3 @@ class CRFTagger(TaggerI):
return self.tag_sents([tokens])[0]
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tag/hmm.py b/nltk/tag/hmm.py
index 58127f2..f9b01e0 100644
--- a/nltk/tag/hmm.py
+++ b/nltk/tag/hmm.py
@@ -1273,8 +1273,5 @@ def demo_bw():
max_iterations=1000)
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tag/hunpos.py b/nltk/tag/hunpos.py
index f3ba445..f40ebc7 100644
--- a/nltk/tag/hunpos.py
+++ b/nltk/tag/hunpos.py
@@ -130,6 +130,3 @@ def setup_module(module):
except LookupError:
raise SkipTest("HunposTagger is not available")
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tag/mapping.py b/nltk/tag/mapping.py
index d9a7fe3..fea50c3 100644
--- a/nltk/tag/mapping.py
+++ b/nltk/tag/mapping.py
@@ -99,6 +99,3 @@ def map_tag(source, target, source_tag):
return tagset_mapping(source, target)[source_tag]
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tag/senna.py b/nltk/tag/senna.py
index 85fd7e9..84c474e 100644
--- a/nltk/tag/senna.py
+++ b/nltk/tag/senna.py
@@ -73,6 +73,48 @@ class SennaChunkTagger(Senna):
annotations = tagged_sents[i][j]
tagged_sents[i][j] = (annotations['word'], annotations['chk'])
return tagged_sents
+
+ def bio_to_chunks(self, tagged_sent, chunk_type):
+ """
+ Extracts the chunks in a BIO chunk-tagged sentence.
+
+ >>> from nltk.tag import SennaChunkTagger
+ >>> chktagger = SennaChunkTagger('/usr/share/senna-v2.0')
+ >>> sent = 'What is the airspeed of an unladen swallow ?'.split()
+ >>> tagged_sent = chktagger.tag(sent)
+ >>> tagged_sent
+ [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'),
+ ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'),
+ ('?', 'O')]
+ >>> list(chktagger.bio_to_chunks(tagged_sent, chunk_type='NP'))
+ [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')]
+
+ :param tagged_sent: A list of tuples of word and BIO chunk tag.
+ :type tagged_sent: list(tuple)
+ :param chunk_type: The chunk tag that users want to extract, e.g. 'NP' or 'VP'
+ :type chunk_type: str
+
+ :return: An iterable of tuples of chunks that users want to extract
+ and their corresponding indices.
+ :rtype: iter(tuple(str))
+ """
+ current_chunk = []
+ current_chunk_position = []
+ for idx, word_pos in enumerate(tagged_sent):
+ word, pos = word_pos
+ if '-'+chunk_type in pos: # Append the word to the current_chunk.
+ current_chunk.append((word))
+ current_chunk_position.append((idx))
+ else:
+ if current_chunk: # Flush the full chunk when out of an NP.
+ _chunk_str = ' '.join(current_chunk)
+ _chunk_pos_str = '-'.join(map(str, current_chunk_position))
+ yield _chunk_str, _chunk_pos_str
+ current_chunk = []
+ current_chunk_position = []
+ if current_chunk: # Flush the last chunk.
+ yield ' '.join(current_chunk), '-'.join(map(str, current_chunk_position))
+
@python_2_unicode_compatible
class SennaNERTagger(Senna):
@@ -101,6 +143,3 @@ def setup_module(module):
except OSError:
raise SkipTest("Senna executable not found")
-if __name__ == '__main__':
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS)
diff --git a/nltk/tag/sequential.py b/nltk/tag/sequential.py
index df2283f..e153775 100644
--- a/nltk/tag/sequential.py
+++ b/nltk/tag/sequential.py
@@ -22,7 +22,7 @@ from __future__ import print_function, unicode_literals
import re
from nltk.probability import ConditionalFreqDist
-from nltk.classify.naivebayes import NaiveBayesClassifier
+from nltk.classify import NaiveBayesClassifier
from nltk.compat import python_2_unicode_compatible
from nltk.tag.api import TaggerI, FeaturesetTaggerI
@@ -216,7 +216,7 @@ class DefaultTagger(SequentialBackoffTagger):
"""
A tagger that assigns the same tag to every token.
- >>> from nltk.tag.sequential import DefaultTagger
+ >>> from nltk.tag import DefaultTagger
>>> default_tagger = DefaultTagger('NN')
>>> list(default_tagger.tag('This is a test'.split()))
[('This', 'NN'), ('is', 'NN'), ('a', 'NN'), ('test', 'NN')]
@@ -308,7 +308,7 @@ class UnigramTagger(NgramTagger):
corpus, and then uses that information to assign tags to new tokens.
>>> from nltk.corpus import brown
- >>> from nltk.tag.sequential import UnigramTagger
+ >>> from nltk.tag import UnigramTagger
>>> test_sent = brown.sents(categories='news')[0]
>>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
>>> for tok, tag in unigram_tagger.tag(test_sent):
@@ -491,7 +491,7 @@ class RegexpTagger(SequentialBackoffTagger):
of speech tag:
>>> from nltk.corpus import brown
- >>> from nltk.tag.sequential import RegexpTagger
+ >>> from nltk.tag import RegexpTagger
>>> test_sent = brown.sents(categories='news')[0]
>>> regexp_tagger = RegexpTagger(
... [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
@@ -741,6 +741,3 @@ class ClassifierBasedPOSTagger(ClassifierBasedTagger):
return features
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tag/stanford.py b/nltk/tag/stanford.py
index c88aff9..5484126 100644
--- a/nltk/tag/stanford.py
+++ b/nltk/tag/stanford.py
@@ -73,7 +73,8 @@ class StanfordTagger(TaggerI):
# Create a temporary input file
_input_fh, self._input_file_path = tempfile.mkstemp(text=True)
- self._cmd.extend(['-encoding', encoding])
+ cmd = list(self._cmd)
+ cmd.extend(['-encoding', encoding])
# Write the actual sentences to the temporary input file
_input_fh = os.fdopen(_input_fh, 'wb')
@@ -84,7 +85,7 @@ class StanfordTagger(TaggerI):
_input_fh.close()
# Run the tagger and get the output
- stanpos_output, _stderr = java(self._cmd,classpath=self._stanford_jar,
+ stanpos_output, _stderr = java(cmd, classpath=self._stanford_jar,
stdout=PIPE, stderr=PIPE)
stanpos_output = stanpos_output.decode(encoding)
@@ -113,7 +114,7 @@ class StanfordPOSTagger(StanfordTagger):
- a model trained on training data
- (optionally) the path to the stanford tagger jar file. If not specified here,
then this jar file must be specified in the CLASSPATH envinroment variable.
- - (optionally) the encoding of the training data (default: ASCII)
+ - (optionally) the encoding of the training data (default: UTF-8)
Example:
@@ -142,7 +143,7 @@ class StanfordNERTagger(StanfordTagger):
- a model trained on training data
- (optionally) the path to the stanford tagger jar file. If not specified here,
then this jar file must be specified in the CLASSPATH envinroment variable.
- - (optionally) the encoding of the training data (default: ASCII)
+ - (optionally) the encoding of the training data (default: UTF-8)
Example:
@@ -188,6 +189,3 @@ class StanfordNERTagger(StanfordTagger):
raise NotImplementedError
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tag/tnt.py b/nltk/tag/tnt.py
index 0dd7103..e9d693f 100755
--- a/nltk/tag/tnt.py
+++ b/nltk/tag/tnt.py
@@ -601,7 +601,4 @@ def demo3():
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tag/util.py b/nltk/tag/util.py
index a791cc4..b60f5f8 100644
--- a/nltk/tag/util.py
+++ b/nltk/tag/util.py
@@ -70,6 +70,3 @@ def untag(tagged_sentence):
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tbl/feature.py b/nltk/tbl/feature.py
index 5988f07..8dd704e 100644
--- a/nltk/tbl/feature.py
+++ b/nltk/tbl/feature.py
@@ -259,6 +259,3 @@ class Feature(object):
raise NotImplementedError
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tbl/rule.py b/nltk/tbl/rule.py
index 63dee4f..8919fb7 100644
--- a/nltk/tbl/rule.py
+++ b/nltk/tbl/rule.py
@@ -319,6 +319,3 @@ class Rule(TagRule):
return replacement + conditions
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tbl/template.py b/nltk/tbl/template.py
index 7142733..237807a 100644
--- a/nltk/tbl/template.py
+++ b/nltk/tbl/template.py
@@ -306,6 +306,3 @@ class Template(BrillTemplateI):
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/test/align.doctest b/nltk/test/align.doctest
index 38b5caf..9dbcfea 100644
--- a/nltk/test/align.doctest
+++ b/nltk/test/align.doctest
@@ -12,14 +12,14 @@ Corpus Reader
>>> from nltk.corpus import comtrans
>>> words = comtrans.words('alignment-en-fr.txt')
- >>> for word in words:
+ >>> for word in words[:6]:
... print(word)
Resumption
of
the
session
I
- declare...
+ declare
>>> als = comtrans.aligned_sents('alignment-en-fr.txt')[0]
>>> als # doctest: +NORMALIZE_WHITESPACE
AlignedSent(['Resumption', 'of', 'the', 'session'],
@@ -40,8 +40,8 @@ Aligned sentences are simply a mapping between words in a sentence:
Alignment([(0, 0), (1, 1), (2, 2), (3, 3)])
-Usually we look at them from the perpective of a source to a target languge,
-but they are easilly inverted:
+Usually we look at them from the perspective of a source to a target language,
+but they are easily inverted:
>>> als.invert() # doctest: +NORMALIZE_WHITESPACE
AlignedSent(['Reprise', 'de', 'la', 'session'],
@@ -83,36 +83,36 @@ Here is an example from Koehn, 2010:
... AlignedSent(['the', 'book'], ['das', 'Buch']),
... AlignedSent(['a', 'book'], ['ein', 'Buch'])]
>>> em_ibm1 = IBMModel1(corpus, 20)
- >>> print(round(em_ibm1.probabilities['the']['das'], 1))
+ >>> print(round(em_ibm1.translation_table['the']['das'], 1))
1.0
- >>> print(round(em_ibm1.probabilities['book']['das'], 1))
+ >>> print(round(em_ibm1.translation_table['book']['das'], 1))
0.0
- >>> print(round(em_ibm1.probabilities['house']['das'], 1))
+ >>> print(round(em_ibm1.translation_table['house']['das'], 1))
0.0
- >>> print(round(em_ibm1.probabilities['the']['Buch'], 1))
+ >>> print(round(em_ibm1.translation_table['the']['Buch'], 1))
0.0
- >>> print(round(em_ibm1.probabilities['book']['Buch'], 1))
+ >>> print(round(em_ibm1.translation_table['book']['Buch'], 1))
1.0
- >>> print(round(em_ibm1.probabilities['a']['Buch'], 1))
+ >>> print(round(em_ibm1.translation_table['a']['Buch'], 1))
0.0
- >>> print(round(em_ibm1.probabilities['book']['ein'], 1))
+ >>> print(round(em_ibm1.translation_table['book']['ein'], 1))
0.0
- >>> print(round(em_ibm1.probabilities['a']['ein'], 1))
+ >>> print(round(em_ibm1.translation_table['a']['ein'], 1))
1.0
- >>> print(round(em_ibm1.probabilities['the']['Haus'], 1))
+ >>> print(round(em_ibm1.translation_table['the']['Haus'], 1))
0.0
- >>> print(round(em_ibm1.probabilities['house']['Haus'], 1))
+ >>> print(round(em_ibm1.translation_table['house']['Haus'], 1))
1.0
- >>> print(round(em_ibm1.probabilities['book'][None], 1))
+ >>> print(round(em_ibm1.translation_table['book'][None], 1))
0.5
And using an NLTK corpus. We train on only 10 sentences, since it is so slow:
>>> from nltk.corpus import comtrans
>>> com_ibm1 = IBMModel1(comtrans.aligned_sents()[:10], 20)
- >>> print(round(com_ibm1.probabilities['bitte']['Please'], 1))
+ >>> print(round(com_ibm1.translation_table['bitte']['Please'], 1))
0.2
- >>> print(round(com_ibm1.probabilities['Sitzungsperiode']['session'], 1))
+ >>> print(round(com_ibm1.translation_table['Sitzungsperiode']['session'], 1))
1.0
@@ -142,10 +142,10 @@ Precision
already a set based implementation in NLTK as
`nltk.metrics.scores.precision`_. Since precision is simply interested in the
proportion of correct alignments, we calculate the ratio of the number of our
-test alignments (*A*) that match a possible alignment (*P*) over the number of
-test alignments provided. We compare to the possible alignment set don't
-penalise for coming up with an alignment that a humans would have possibly
-considered to be correct [OCH2000]_.
+test alignments (*A*) that match a possible alignment (*P*), over the number of
+test alignments provided. There is no penalty for missing a possible alignment
+in our test alignments. An easy way to game this metric is to provide just one
+test alignment that is in *P* [OCH2000]_.
Here are some examples:
@@ -173,8 +173,10 @@ Recall
implementation in NLTK as `nltk.metrics.scores.recall`_. Since recall is
simply interested in the proportion of found alignments, we calculate the
ratio of the number of our test alignments (*A*) that match a sure alignment
-(*S*) over the number of sure alignments. Since we are not sure about some of
-our possible alignments we don't penalise for not finding these [OCH2000]_.
+(*S*) over the number of sure alignments. There is no penalty for producing
+a lot of test alignments. An easy way to game this metric is to include every
+possible alignment in our test alignments, regardless of whether they are correct or
+not [OCH2000]_.
Here are some examples:
@@ -184,8 +186,8 @@ Here are some examples:
1.0
>>> print(als.recall([(0,0), (3,3)]))
1.0
- >>> print(als.recall([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]))
- 0.66666666666...
+ >>> als.recall([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)])
+ 0.66666...
>>> print(my_als.recall(als))
0.75
@@ -205,9 +207,8 @@ possible alignments [MIHALCEA2003]_ [KOEHN2010]_.
.. Note::
[KOEHN2010]_ defines the AER as ``AER = (|A∩S| + |A∩P|) / (|A| + |S|)``
- meaning that the best alignment would when the ``AER = 1.0``. Thus we
- follow [MIHALCEA2003]_ more intuitive definition where we are minimising
- the error rate.
+ in his book, but corrects it to the above in his online errata. This is
+ in line with [MIHALCEA2003]_.
Here are some examples:
@@ -215,11 +216,11 @@ Here are some examples:
1.0
>>> print(als.alignment_error_rate([(0,0), (1,1), (2,2), (3,3)]))
0.0
- >>> print(my_als.alignment_error_rate(als))
- 0.33333333333...
- >>> print(my_als.alignment_error_rate(als,
- ... als.alignment | set([(1,2), (2,1)])))
- 0.22222222222...
+ >>> my_als.alignment_error_rate(als)
+ 0.333333...
+ >>> my_als.alignment_error_rate(als,
+ ... als.alignment | set([(1,2), (2,1)]))
+ 0.222222...
.. [OCH2000] Och, F. and Ney, H. (2000)
diff --git a/nltk/test/bnc.doctest b/nltk/test/bnc.doctest
index acfc44e..0c96b06 100644
--- a/nltk/test/bnc.doctest
+++ b/nltk/test/bnc.doctest
@@ -1,8 +1,13 @@
.. Copyright (C) 2001-2015 NLTK Project
.. For license information, see LICENSE.TXT
+ >>> import os.path
+
>>> from nltk.corpus.reader import BNCCorpusReader
- >>> bnc = BNCCorpusReader(root='.', fileids=r'FX8.xml')
+ >>> import nltk.test
+
+ >>> root = os.path.dirname(nltk.test.__file__)
+ >>> bnc = BNCCorpusReader(root=root, fileids='FX8.xml')
Checking the word access.
-------------------------
@@ -38,9 +43,9 @@ Testing access to the sentences.
[('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]
A not lazy loader.
------------------
+------------------
- >>> eager = BNCCorpusReader(root='.', fileids=r'FX8.xml', lazy=False)
+ >>> eager = BNCCorpusReader(root=root, fileids=r'FX8.xml', lazy=False)
>>> len(eager.words())
151
diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest
index 6048961..67a1a5b 100644
--- a/nltk/test/corpus.doctest
+++ b/nltk/test/corpus.doctest
@@ -238,7 +238,7 @@ to the entire chunk).
>>> [[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]]
[['(DT The)', "(Lemma('group.n.01.group') (NE (NNP Fulton County Grand Jury)))", ...
'(None .)'], ['(DT The)', ... '(None .)']]
-
+
The IEER corpus is another chunked corpus. This corpus is unusual in
that each corpus item contains multiple documents. (This reflects the
@@ -817,6 +817,33 @@ speaker and sentence identifier for a given speech sample:
>>> timit.utteranceids(spkrid=timit.spkrid(item)) # doctest: +ELLIPSIS
['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', ...]
+twitter_samples
+---------------
+
+Twitter is a well-known microblog service that allows public data to be
+collected via APIs. NLTK's twitter corpus currently contains a sample of 20k Tweets
+retrieved from the Twitter Streaming API.
+
+ >>> from nltk.corpus import twitter_samples
+ >>> twitter_samples.fileids()
+ ['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']
+
+We follow standard practice in storing full Tweets as line-separated
+JSON. These data structures can be accessed via `tweets.docs()`. However, in general it
+is more practical to focus just on the text field of the Tweets, which
+are accessed via the `strings()` method.
+
+ >>> twitter_samples.strings('tweets.20150430-223406.json')
+ ['RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain \xa3170 billion per year! #BetterOffOut #UKIP', ...]
+
+The default tokenizer for Tweets is specialised for 'casual' text, and
+the `tokenized()` method returns a list of lists of tokens.
+
+ >>> twitter_samples.tokenized('tweets.20150430-223406.json')
+ [['RT', '@KirkKus', ':', 'Indirect', 'cost', 'of', 'the', 'UK', 'being', 'in', ...],
+ ['VIDEO', ':', 'Sturgeon', 'on', 'post-election', 'deals', 'http://t.co/BTJwrpbmOY'], ...]
+
+
rte
---
The RTE (Recognizing Textual Entailment) corpus was derived from the
diff --git a/nltk/test/gensim.doctest b/nltk/test/gensim.doctest
index 4e7e176..5df4508 100644
--- a/nltk/test/gensim.doctest
+++ b/nltk/test/gensim.doctest
@@ -1,144 +1,141 @@
.. Copyright (C) 2001-2015 NLTK Project
.. For license information, see LICENSE.TXT
-=========================================
- Test the word embedding function through Gensim package
-=========================================
-
- >>> import gensim
-
-Overview
-~~~~~~~~
-Use Gensim package, we demo 3 functions.
-- Train the word embeddings using brown corpus.
-- Load the pre-trained model and perform simple tasks.
-- Pruning the pre-trained binary model.
-
-Train the model
-~~~~~~~~~~~~~~~~~~
-The word embedding is trained on Brown corpus
+=======================================
+Demonstrate word embedding using Gensim
+=======================================
+
+We demonstrate three functions:
+- Train the word embeddings using brown corpus;
+- Load the pre-trained model and perform simple tasks; and
+- Pruning the pre-trained binary model.
+
+ >>> import gensim
+
+---------------
+Train the model
+---------------
+
+Here we train a word embedding using the Brown Corpus:
>>> from nltk.corpus import brown
>>> model = gensim.models.Word2Vec(brown.sents())
-It might take sometime to train the model, after the model is trained, probably you want to save and then use it latter
- >>> model.save('brown.embedding')
- >>> new_model = gensim.models.Word2Vec.load('brown.embedding')
+It might take some time to train the model. So, after it is trained, it can be saved as follows:
-The model will be the list of words with their embedding. We can easily get the vector representation of a word.
- >>> len(new_model['university'])
- 100
-
-There are some supporting functions already implemented in Gensim to manipulate with word embeddings.
-For example, to compute the cosine similarity between 2 words
- >>> new_model.similarity('university','school') > 0.3
- True
+ >>> model.save('brown.embedding')
+ >>> new_model = gensim.models.Word2Vec.load('brown.embedding')
+
+The model will be the list of words with their embedding. We can easily get the vector representation of a word.
+ >>> len(new_model['university'])
+ 100
+
+There are some supporting functions already implemented in Gensim to manipulate word embeddings.
+For example, to compute the cosine similarity between 2 words:
+
+ >>> new_model.similarity('university','school') > 0.3
+ True
+---------------------------
Using the pre-trained model
-~~~~~~~~~~~~~~~~~~~
-NLTK also include a pre-trained model which is part of a model that is trained on 100 billion words from Google News Dataset.
-The full model is from https://code.google.com/p/word2vec/ which is about 3 Gb.
- >>> from nltk.data import find
- >>> word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.bin'))
- >>> model = gensim.models.Word2Vec.load(word2vec_sample)
+---------------------------
+
+NLTK includes a pre-trained model which is part of a model that is trained on 100 billion words from the Google News Dataset.
+The full model is from https://code.google.com/p/word2vec/ (about 3 GB).
+
+ >>> from nltk.data import find
+ >>> word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
+ >>> model = gensim.models.Word2Vec.load_word2vec_format(word2vec_sample, binary=False)
-We pruned the model to only include the most common words (~44k words).
- >>> len(model.vocab)
- 43981
-
-Each of the word is represented in the space of 300 dimensions.
- >>> len(model['university'])
- 300
+We pruned the model to only include the most common words (~44k words).
+
+ >>> len(model.vocab)
+ 43981
+
+Each word is represented in the space of 300 dimensions:
+
+ >>> len(model['university'])
+ 300
+
+Finding the top n words that are similar to a target word is simple. The result is the list of n words with the score.
+
+ >>> model.most_similar(positive=['university'], topn = 3)
+ [(u'universities', 0.70039...), (u'faculty', 0.67809...), (u'undergraduate', 0.65870...)]
-Finding the top n word that similar to a target word is simple. The result is the list of n words with the score.
- >>> model.most_similar(positive=['university'], topn = 3)
- [(u'universities', 0.7003918886184692), (u'faculty', 0.6780908703804016), (u'undergraduate', 0.6587098240852356)]
-
-Find a word that is not in a list is also supported, although, implementing this by yourself is simple.
- >>> model.doesnt_match('breakfast cereal dinner lunch'.split())
- 'cereal'
-
-Mikolov et al. (2013) figured out that word embedding captures much of syntactic and semantic regularities. For example,
-Vector 'King - Man + Woman' results close to 'Queen' or 'Germany - Berlin + Paris' closes to vector 'France'.
+Finding a word that is not in a list is also supported, although implementing this by yourself is simple.
+
+ >>> model.doesnt_match('breakfast cereal dinner lunch'.split())
+ 'cereal'
+
+Mikolov et al. (2013) figured out that word embedding captures much of syntactic and semantic regularities. For example,
+the vector 'King - Man + Woman' is close to 'Queen' and 'Germany - Berlin + Paris' is close to 'France'.
+
>>> model.most_similar(positive=['woman','king'], negative=['man'], topn = 1)
- [(u'queen', 0.7118192911148071)]
-
+ [(u'queen', 0.71181...)]
+
>>> model.most_similar(positive=['Paris','Germany'], negative=['Berlin'], topn = 1)
- [(u'France', 0.7884092926979065)]
-
-We can visualize the word embeddings using t-SNE (http://lvdmaaten.github.io/tsne/). For demo, we just visualize the first 1000 words.
-You can just change it to a bigger value.
-
- import numpy as np
- labels = []
- count = 0
- max_count = 1000
- X = np.zeros(shape=(max_count,len(model['university'])))
-
- for term in model.vocab:
- X[count] = model[term]
- labels.append(term)
- count+= 1
- if count >= max_count: break
-
- # It is recommended to use PCA first to reduce to ~50 dimensions
- from sklearn.decomposition import PCA
- pca = PCA(n_components=50)
- X_50 = pca.fit_transform(X)
-
- # Using TSNE to further reduce to 2 dimensions
- from sklearn.manifold import TSNE
- model_tsne = TSNE(n_components=2, random_state=0)
- Y = model_tsne.fit_transform(X_50)
-
- # Show the scatter plot
- import matplotlib.pyplot as plt
- plt.scatter(Y[:,0], Y[:,1], 20)
-
- # Add labels
- for label, x, y in zip(labels, Y[:, 0], Y[:, 1]):
- plt.annotate(label, xy = (x,y), xytext = (0, 0), textcoords = 'offset points', size = 10)
-
- plt.show()
+ [(u'France', 0.78840...)]
+
+We can visualize the word embeddings using t-SNE (http://lvdmaaten.github.io/tsne/). For this demonstration, we visualize the first 1000 words.
+| import numpy as np
+| labels = []
+| count = 0
+| max_count = 1000
+| X = np.zeros(shape=(max_count,len(model['university'])))
+|
+| for term in model.vocab:
+| X[count] = model[term]
+| labels.append(term)
+| count+= 1
+| if count >= max_count: break
+|
+| # It is recommended to use PCA first to reduce to ~50 dimensions
+| from sklearn.decomposition import PCA
+| pca = PCA(n_components=50)
+| X_50 = pca.fit_transform(X)
+|
+| # Using TSNE to further reduce to 2 dimensions
+| from sklearn.manifold import TSNE
+| model_tsne = TSNE(n_components=2, random_state=0)
+| Y = model_tsne.fit_transform(X_50)
+|
+| # Show the scatter plot
+| import matplotlib.pyplot as plt
+| plt.scatter(Y[:,0], Y[:,1], 20)
+|
+| # Add labels
+| for label, x, y in zip(labels, Y[:, 0], Y[:, 1]):
+| plt.annotate(label, xy = (x,y), xytext = (0, 0), textcoords = 'offset points', size = 10)
+|
+| plt.show()
+------------------------------
Prune the trained binary model
-~~~~~~~~~~~~~~~~~
-Here is the supporting code to extract part of the binary model (GoogleNews-vectors-negative300.bin.gz) from https://code.google.com/p/word2vec/
-We use this code to get the `word2vec_sample` model.
-
- import gensim
- from gensim.models.word2vec import Word2Vec
- # Load the binary model
- model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary = True);
-
- # Only output word that appear in the Brown corpus
- from nltk.corpus import brown
- words = set(brown.words())
- print (len(words))
-
- # Output presented word to a temporary file
- out_file = 'pruned.word2vec.txt'
- f = open(out_file,'wb')
-
- word_presented = words.intersection(model.vocab.keys())
- f.write('{} {}\n'.format(len(word_presented),len(model['word'])))
-
- for word in word_presented:
- f.write('{} {}\n'.format(word, ' '.join(str(value) for value in model[word])))
-
- f.close()
-
- # Reload the model from text file
- new_model = Word2Vec.load_word2vec_format(out_file, binary=False);
-
- # Save it as the Gensim model
- gensim_model = "pruned.word2vec.bin"
- new_model.save(gensim_model)
-
- # Load the model
- very_new_model = gensim.models.Word2Vec.load(gensim_model)
-
- # Test it
- very_new_model.most_similar(positive=['king','woman'], negative=['man'], topn=1)
-
\ No newline at end of file
+------------------------------
+
+Here is the supporting code to extract part of the binary model (GoogleNews-vectors-negative300.bin.gz) from https://code.google.com/p/word2vec/
+We use this code to get the `word2vec_sample` model.
+
+| import gensim
+| from gensim.models.word2vec import Word2Vec
+| # Load the binary model
+| model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary = True);
+|
+| # Only output word that appear in the Brown corpus
+| from nltk.corpus import brown
+| words = set(brown.words())
+| print (len(words))
+|
+| # Output presented word to a temporary file
+| out_file = 'pruned.word2vec.txt'
+| f = open(out_file,'wb')
+|
+| word_presented = words.intersection(model.vocab.keys())
+| f.write('{} {}\n'.format(len(word_presented),len(model['word'])))
+|
+| for word in word_presented:
+| f.write('{} {}\n'.format(word, ' '.join(str(value) for value in model[word])))
+|
+| f.close()
+
diff --git a/nltk/test/gluesemantics_malt_fixt.py b/nltk/test/gluesemantics_malt_fixt.py
index 8e28d5b..ff660aa 100644
--- a/nltk/test/gluesemantics_malt_fixt.py
+++ b/nltk/test/gluesemantics_malt_fixt.py
@@ -6,6 +6,6 @@ def setup_module(module):
from nltk.parse.malt import MaltParser
try:
- depparser = MaltParser()
+ depparser = MaltParser('maltparser-1.7.2')
except LookupError:
raise SkipTest("MaltParser is not available")
diff --git a/nltk/test/stem.doctest b/nltk/test/stem.doctest
index c95378b..2150b64 100644
--- a/nltk/test/stem.doctest
+++ b/nltk/test/stem.doctest
@@ -30,15 +30,12 @@ Test the stemmer on various pluralised words.
... 'meeting', 'stating', 'siezing', 'itemization',
... 'sensational', 'traditional', 'reference', 'colonizer',
... 'plotted']
- >>> singles = []
- >>> for plural in plurals:
- ... singles.append(stemmer.stem(plural))
+ >>> singles = [stemmer.stem(plural) for plural in plurals]
- >>> singles # doctest: +NORMALIZE_WHITESPACE
- [u'caress', u'fli', u'die', u'mule', u'deni', u'die', u'agre', u'own',
- u'humbl', u'size', u'meet', u'state', u'siez', u'item', u'sensat',
- u'tradit', u'refer', u'colon', u'plot']
+ >>> print(' '.join(singles)) # doctest: +NORMALIZE_WHITESPACE
+ caress fli die mule deni die agre own humbl size meet
+ state siez item sensat tradit refer colon plot
Unit tests for Snowball stemmer
diff --git a/nltk/test/unit/align/__init__.py b/nltk/test/unit/align/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/nltk/test/unit/align/__init__.py
@@ -0,0 +1 @@
+
diff --git a/nltk/test/unit/align/test_ibm1.py b/nltk/test/unit/align/test_ibm1.py
new file mode 100644
index 0000000..cd76915
--- /dev/null
+++ b/nltk/test/unit/align/test_ibm1.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for IBM Model 1 training methods
+"""
+
+import unittest
+
+from collections import defaultdict
+from nltk.align import AlignedSent
+from nltk.align.ibm_model import AlignmentInfo
+from nltk.align.ibm1 import IBMModel1
+
+
+class TestIBMModel1(unittest.TestCase):
+ def test_prob_t_a_given_s(self):
+ # arrange
+ src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
+ trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
+ corpus = [AlignedSent(trg_sentence, src_sentence)]
+ alignment_info = AlignmentInfo((0, 1, 4, 0, 2, 5, 5),
+ [None] + src_sentence,
+ ['UNUSED'] + trg_sentence,
+ None)
+
+ translation_table = defaultdict(lambda: defaultdict(float))
+ translation_table['i']['ich'] = 0.98
+ translation_table['love']['gern'] = 0.98
+ translation_table['to'][None] = 0.98
+ translation_table['eat']['esse'] = 0.98
+ translation_table['smoked']['räucherschinken'] = 0.98
+ translation_table['ham']['räucherschinken'] = 0.98
+
+ model1 = IBMModel1(corpus, 0)
+ model1.translation_table = translation_table
+
+ # act
+ probability = model1.prob_t_a_given_s(alignment_info)
+
+ # assert
+ lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
+ expected_probability = lexical_translation
+ self.assertEqual(round(probability, 4), round(expected_probability, 4))
diff --git a/nltk/test/unit/align/test_ibm2.py b/nltk/test/unit/align/test_ibm2.py
new file mode 100644
index 0000000..6f63bac
--- /dev/null
+++ b/nltk/test/unit/align/test_ibm2.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for IBM Model 2 training methods
+"""
+
+import unittest
+
+from collections import defaultdict
+from nltk.align import AlignedSent
+from nltk.align.ibm_model import AlignmentInfo
+from nltk.align.ibm2 import IBMModel2
+
+
+class TestIBMModel2(unittest.TestCase):
+ def test_prob_t_a_given_s(self):
+ # arrange
+ src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
+ trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
+ corpus = [AlignedSent(trg_sentence, src_sentence)]
+ alignment_info = AlignmentInfo((0, 1, 4, 0, 2, 5, 5),
+ [None] + src_sentence,
+ ['UNUSED'] + trg_sentence,
+ None)
+
+ translation_table = defaultdict(lambda: defaultdict(float))
+ translation_table['i']['ich'] = 0.98
+ translation_table['love']['gern'] = 0.98
+ translation_table['to'][None] = 0.98
+ translation_table['eat']['esse'] = 0.98
+ translation_table['smoked']['räucherschinken'] = 0.98
+ translation_table['ham']['räucherschinken'] = 0.98
+
+ alignment_table = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(
+ lambda: defaultdict(float))))
+ alignment_table[0][3][5][6] = 0.97 # None -> to
+ alignment_table[1][1][5][6] = 0.97 # ich -> i
+ alignment_table[2][4][5][6] = 0.97 # esse -> eat
+ alignment_table[4][2][5][6] = 0.97 # gern -> love
+ alignment_table[5][5][5][6] = 0.96 # räucherschinken -> smoked
+ alignment_table[5][6][5][6] = 0.96 # räucherschinken -> ham
+
+ model2 = IBMModel2(corpus, 0)
+ model2.translation_table = translation_table
+ model2.alignment_table = alignment_table
+
+ # act
+ probability = model2.prob_t_a_given_s(alignment_info)
+
+ # assert
+ lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
+ alignment = 0.97 * 0.97 * 0.97 * 0.97 * 0.96 * 0.96
+ expected_probability = lexical_translation * alignment
+ self.assertEqual(round(probability, 4), round(expected_probability, 4))
diff --git a/nltk/test/unit/align/test_ibm4.py b/nltk/test/unit/align/test_ibm4.py
new file mode 100644
index 0000000..60f974d
--- /dev/null
+++ b/nltk/test/unit/align/test_ibm4.py
@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for IBM Model 4 training methods
+"""
+
+import unittest
+
+from collections import defaultdict
+from nltk.align import AlignedSent
+from nltk.align.ibm_model import AlignmentInfo
+from nltk.align.ibm_model import IBMModel
+from nltk.align.ibm4 import IBMModel4
+
+
+class TestIBMModel4(unittest.TestCase):
+ def test_set_uniform_distortion_probabilities_of_max_displacements(self):
+ # arrange
+ src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
+ trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
+ corpus = [
+ AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
+ AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
+ ]
+ model4 = IBMModel4(corpus, 0, src_classes, trg_classes)
+
+ # act
+ model4.set_uniform_distortion_probabilities(corpus)
+
+ # assert
+ # number of displacement values =
+ # 2 * (number of words in longest target sentence - 1)
+ expected_prob = 1.0 / (2 * (4 - 1))
+
+ # examine the boundary values for (displacement, src_class, trg_class)
+ self.assertEqual(model4.head_distortion_table[3][0][0], expected_prob)
+ self.assertEqual(model4.head_distortion_table[-3][1][2], expected_prob)
+ self.assertEqual(model4.non_head_distortion_table[3][0], expected_prob)
+ self.assertEqual(model4.non_head_distortion_table[-3][2], expected_prob)
+
+ def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
+ # arrange
+ src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
+ trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
+ corpus = [
+ AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
+ AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
+ ]
+ model4 = IBMModel4(corpus, 0, src_classes, trg_classes)
+
+ # act
+ model4.set_uniform_distortion_probabilities(corpus)
+
+ # assert
+ # examine displacement values that are not in the training data domain
+ self.assertEqual(model4.head_distortion_table[4][0][0],
+ IBMModel.MIN_PROB)
+ self.assertEqual(model4.head_distortion_table[100][1][2],
+ IBMModel.MIN_PROB)
+ self.assertEqual(model4.non_head_distortion_table[4][0],
+ IBMModel.MIN_PROB)
+ self.assertEqual(model4.non_head_distortion_table[100][2],
+ IBMModel.MIN_PROB)
+
+ def test_prob_t_a_given_s(self):
+ # arrange
+ src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
+ trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
+ src_classes = {'räucherschinken': 0, 'ja': 1, 'ich': 2, 'esse': 3,
+ 'gern': 4}
+ trg_classes = {'ham': 0, 'smoked': 1, 'i': 3, 'love': 4, 'to': 2,
+ 'eat': 4}
+ corpus = [AlignedSent(trg_sentence, src_sentence)]
+ alignment_info = AlignmentInfo((0, 1, 4, 0, 2, 5, 5),
+ [None] + src_sentence,
+ ['UNUSED'] + trg_sentence,
+ [[3], [1], [4], [], [2], [5, 6]])
+
+ head_distortion_table = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(float)))
+ head_distortion_table[1][None][3] = 0.97 # None, i
+ head_distortion_table[3][2][4] = 0.97 # ich, eat
+ head_distortion_table[-2][3][4] = 0.97 # esse, love
+ head_distortion_table[3][4][1] = 0.97 # gern, smoked
+
+ non_head_distortion_table = defaultdict(lambda: defaultdict(float))
+ non_head_distortion_table[1][0] = 0.96 # ham
+
+ translation_table = defaultdict(lambda: defaultdict(float))
+ translation_table['i']['ich'] = 0.98
+ translation_table['love']['gern'] = 0.98
+ translation_table['to'][None] = 0.98
+ translation_table['eat']['esse'] = 0.98
+ translation_table['smoked']['räucherschinken'] = 0.98
+ translation_table['ham']['räucherschinken'] = 0.98
+
+ fertility_table = defaultdict(lambda: defaultdict(float))
+ fertility_table[1]['ich'] = 0.99
+ fertility_table[1]['esse'] = 0.99
+ fertility_table[0]['ja'] = 0.99
+ fertility_table[1]['gern'] = 0.99
+ fertility_table[2]['räucherschinken'] = 0.999
+ fertility_table[1][None] = 0.99
+
+ probabilities = {
+ 'p1': 0.167,
+ 'translation_table': translation_table,
+ 'head_distortion_table': head_distortion_table,
+ 'non_head_distortion_table': non_head_distortion_table,
+ 'fertility_table': fertility_table,
+ 'alignment_table': None
+ }
+
+ model4 = IBMModel4(corpus, 0, src_classes, trg_classes,
+ probabilities)
+
+ # act
+ probability = model4.prob_t_a_given_s(alignment_info)
+
+ # assert
+ null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
+ fertility = 1*0.99 * 1*0.99 * 1*0.99 * 1*0.99 * 2*0.999
+ lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
+ distortion = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96
+ expected_probability = (null_generation * fertility *
+ lexical_translation * distortion)
+ self.assertEqual(round(probability, 4), round(expected_probability, 4))
diff --git a/nltk/test/unit/align/test_ibm5.py b/nltk/test/unit/align/test_ibm5.py
new file mode 100644
index 0000000..be8e694
--- /dev/null
+++ b/nltk/test/unit/align/test_ibm5.py
@@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for IBM Model 5 training methods
+"""
+
+import unittest
+
+from collections import defaultdict
+from nltk.align import AlignedSent
+from nltk.align.ibm_model import AlignmentInfo
+from nltk.align.ibm_model import IBMModel
+from nltk.align.ibm4 import IBMModel4
+from nltk.align.ibm5 import IBMModel5
+
+
+class TestIBMModel5(unittest.TestCase):
+ def test_set_uniform_distortion_probabilities_of_max_displacements(self):
+ # arrange
+ src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
+ trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
+ corpus = [
+ AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
+ AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
+ ]
+ model5 = IBMModel5(corpus, 0, src_classes, trg_classes)
+
+ # act
+ model5.set_uniform_distortion_probabilities(corpus)
+
+ # assert
+ # number of vacancy difference values =
+ # 2 * number of words in longest target sentence
+ expected_prob = 1.0 / (2 * 4)
+
+ # examine the boundary values for (dv, max_v, trg_class)
+ self.assertEqual(model5.head_vacancy_table[4][4][0], expected_prob)
+ self.assertEqual(model5.head_vacancy_table[-3][1][2], expected_prob)
+ self.assertEqual(model5.non_head_vacancy_table[4][4][0], expected_prob)
+ self.assertEqual(model5.non_head_vacancy_table[-3][1][2], expected_prob)
+
+ def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
+ # arrange
+ src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
+ trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
+ corpus = [
+ AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
+ AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
+ ]
+ model5 = IBMModel5(corpus, 0, src_classes, trg_classes)
+
+ # act
+ model5.set_uniform_distortion_probabilities(corpus)
+
+ # assert
+ # examine dv and max_v values that are not in the training data domain
+ self.assertEqual(model5.head_vacancy_table[5][4][0],
+ IBMModel.MIN_PROB)
+ self.assertEqual(model5.head_vacancy_table[-4][1][2],
+ IBMModel.MIN_PROB)
+ self.assertEqual(model5.head_vacancy_table[4][0][0],
+ IBMModel.MIN_PROB)
+ self.assertEqual(model5.non_head_vacancy_table[5][4][0],
+ IBMModel.MIN_PROB)
+ self.assertEqual(model5.non_head_vacancy_table[-4][1][2],
+ IBMModel.MIN_PROB)
+
+ def test_prob_t_a_given_s(self):
+ # arrange
+ src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
+ trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
+ src_classes = {'räucherschinken': 0, 'ja': 1, 'ich': 2, 'esse': 3,
+ 'gern': 4}
+ trg_classes = {'ham': 0, 'smoked': 1, 'i': 3, 'love': 4, 'to': 2,
+ 'eat': 4}
+ corpus = [AlignedSent(trg_sentence, src_sentence)]
+ alignment_info = AlignmentInfo((0, 1, 4, 0, 2, 5, 5),
+ [None] + src_sentence,
+ ['UNUSED'] + trg_sentence,
+ [[3], [1], [4], [], [2], [5, 6]])
+
+ head_vacancy_table = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(float)))
+ head_vacancy_table[1 - 0][6][3] = 0.97 # ich -> i
+ head_vacancy_table[3 - 0][5][4] = 0.97 # esse -> eat
+ head_vacancy_table[1 - 2][4][4] = 0.97 # gern -> love
+ head_vacancy_table[2 - 0][2][1] = 0.97 # räucherschinken -> smoked
+
+ non_head_vacancy_table = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(float)))
+ non_head_vacancy_table[1 - 0][1][0] = 0.96 # räucherschinken -> ham
+
+ translation_table = defaultdict(lambda: defaultdict(float))
+ translation_table['i']['ich'] = 0.98
+ translation_table['love']['gern'] = 0.98
+ translation_table['to'][None] = 0.98
+ translation_table['eat']['esse'] = 0.98
+ translation_table['smoked']['räucherschinken'] = 0.98
+ translation_table['ham']['räucherschinken'] = 0.98
+
+ fertility_table = defaultdict(lambda: defaultdict(float))
+ fertility_table[1]['ich'] = 0.99
+ fertility_table[1]['esse'] = 0.99
+ fertility_table[0]['ja'] = 0.99
+ fertility_table[1]['gern'] = 0.99
+ fertility_table[2]['räucherschinken'] = 0.999
+ fertility_table[1][None] = 0.99
+
+ probabilities = {
+ 'p1': 0.167,
+ 'translation_table': translation_table,
+ 'fertility_table': fertility_table,
+ 'head_vacancy_table': head_vacancy_table,
+ 'non_head_vacancy_table': non_head_vacancy_table,
+ 'head_distortion_table': None,
+ 'non_head_distortion_table': None,
+ 'alignment_table': None
+ }
+
+ model5 = IBMModel5(corpus, 0, src_classes, trg_classes,
+ probabilities)
+
+ # act
+ probability = model5.prob_t_a_given_s(alignment_info)
+
+ # assert
+ null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
+ fertility = 1*0.99 * 1*0.99 * 1*0.99 * 1*0.99 * 2*0.999
+ lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
+ vacancy = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96
+ expected_probability = (null_generation * fertility *
+ lexical_translation * vacancy)
+ self.assertEqual(round(probability, 4), round(expected_probability, 4))
+
+ def test_prune(self):
+ # arrange
+ alignment_infos = [
+ AlignmentInfo((1, 1), None, None, None),
+ AlignmentInfo((1, 2), None, None, None),
+ AlignmentInfo((2, 1), None, None, None),
+ AlignmentInfo((2, 2), None, None, None),
+ AlignmentInfo((0, 0), None, None, None)
+ ]
+ min_factor = IBMModel5.MIN_SCORE_FACTOR
+ best_score = 0.9
+ scores = {
+ (1, 1): min(min_factor * 1.5, 1) * best_score, # above threshold
+ (1, 2): best_score,
+ (2, 1): min_factor * best_score, # at threshold
+ (2, 2): min_factor * best_score * 0.5, # low score
+ (0, 0): min(min_factor * 1.1, 1) * 1.2 # above threshold
+ }
+ corpus = [AlignedSent(['a'], ['b'])]
+ original_prob_function = IBMModel4.model4_prob_t_a_given_s
+ # mock static method
+ IBMModel4.model4_prob_t_a_given_s = staticmethod(
+ lambda a, model: scores[a.alignment])
+ model5 = IBMModel5(corpus, 0, None, None)
+
+ # act
+ pruned_alignments = model5.prune(alignment_infos)
+
+ # assert
+ self.assertEqual(len(pruned_alignments), 3)
+
+ # restore static method
+ IBMModel4.model4_prob_t_a_given_s = original_prob_function
diff --git a/nltk/test/unit/align/test_ibm_model.py b/nltk/test/unit/align/test_ibm_model.py
new file mode 100644
index 0000000..e13b3ec
--- /dev/null
+++ b/nltk/test/unit/align/test_ibm_model.py
@@ -0,0 +1,270 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for common methods of IBM translation models
+"""
+
+import unittest
+
+from collections import defaultdict
+from nltk.align import AlignedSent
+from nltk.align.ibm_model import AlignmentInfo
+from nltk.align.ibm_model import IBMModel
+
+
+class TestIBMModel(unittest.TestCase):
+ __TEST_SRC_SENTENCE = ["j'", 'aime', 'bien', 'jambon']
+ __TEST_TRG_SENTENCE = ['i', 'love', 'ham']
+
+ def test_vocabularies_are_initialized(self):
+ parallel_corpora = [
+ AlignedSent(['one', 'two', 'three', 'four'],
+ ['un', 'deux', 'trois']),
+ AlignedSent(['five', 'one', 'six'], ['quatre', 'cinq', 'six']),
+ AlignedSent([], ['sept'])
+ ]
+
+ ibm_model = IBMModel(parallel_corpora)
+ self.assertEqual(len(ibm_model.src_vocab), 8)
+ self.assertEqual(len(ibm_model.trg_vocab), 6)
+
+ def test_vocabularies_are_initialized_even_with_empty_corpora(self):
+ parallel_corpora = []
+
+ ibm_model = IBMModel(parallel_corpora)
+ self.assertEqual(len(ibm_model.src_vocab), 1) # addition of NULL token
+ self.assertEqual(len(ibm_model.trg_vocab), 0)
+
+ def test_best_model2_alignment(self):
+ # arrange
+ sentence_pair = AlignedSent(
+ TestIBMModel.__TEST_TRG_SENTENCE,
+ TestIBMModel.__TEST_SRC_SENTENCE)
+ # None and 'bien' have zero fertility
+ translation_table = {
+ 'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03,
+ None: 0},
+ 'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01,
+ None: 0.03},
+ 'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99,
+ None: 0}
+ }
+ alignment_table = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+ lambda: 0.2))))
+
+ ibm_model = IBMModel([])
+ ibm_model.translation_table = translation_table
+ ibm_model.alignment_table = alignment_table
+
+ # act
+ a_info = ibm_model.best_model2_alignment(sentence_pair)
+
+ # assert
+ self.assertEqual(a_info.alignment[1:], (1, 2, 4)) # 0th element unused
+ self.assertEqual(a_info.cepts, [[], [1], [2], [], [3]])
+
+ def test_best_model2_alignment_does_not_change_pegged_alignment(self):
+ # arrange
+ sentence_pair = AlignedSent(
+ TestIBMModel.__TEST_TRG_SENTENCE,
+ TestIBMModel.__TEST_SRC_SENTENCE)
+ translation_table = {
+ 'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03,
+ None: 0},
+ 'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01,
+ None: 0.03},
+ 'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0}
+ }
+ alignment_table = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+ lambda: 0.2))))
+
+ ibm_model = IBMModel([])
+ ibm_model.translation_table = translation_table
+ ibm_model.alignment_table = alignment_table
+
+ # act: force 'love' to be pegged to 'jambon'
+ a_info = ibm_model.best_model2_alignment(sentence_pair, 2, 4)
+ # assert
+ self.assertEqual(a_info.alignment[1:], (1, 4, 4))
+ self.assertEqual(a_info.cepts, [[], [1], [], [], [2, 3]])
+
+ def test_best_model2_alignment_handles_fertile_words(self):
+ # arrange
+ sentence_pair = AlignedSent(
+ ['i', 'really', ',', 'really', 'love', 'ham'],
+ TestIBMModel.__TEST_SRC_SENTENCE)
+ # 'bien' produces 2 target words: 'really' and another 'really'
+ translation_table = {
+ 'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0},
+ 'really': {"j'": 0, 'aime': 0, 'bien': 0.9, 'jambon': 0.01, None: 0.09},
+ ',': {"j'": 0, 'aime': 0, 'bien': 0.3, 'jambon': 0, None: 0.7},
+ 'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03},
+ 'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0}
+ }
+ alignment_table = defaultdict(
+ lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
+ lambda: 0.2))))
+
+ ibm_model = IBMModel([])
+ ibm_model.translation_table = translation_table
+ ibm_model.alignment_table = alignment_table
+
+ # act
+ a_info = ibm_model.best_model2_alignment(sentence_pair)
+
+ # assert
+ self.assertEqual(a_info.alignment[1:], (1, 3, 0, 3, 2, 4))
+ self.assertEqual(a_info.cepts, [[3], [1], [5], [2, 4], [6]])
+
+ def test_best_model2_alignment_handles_empty_src_sentence(self):
+ # arrange
+ sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE, [])
+ ibm_model = IBMModel([])
+
+ # act
+ a_info = ibm_model.best_model2_alignment(sentence_pair)
+
+ # assert
+ self.assertEqual(a_info.alignment[1:], (0, 0, 0))
+ self.assertEqual(a_info.cepts, [[1, 2, 3]])
+
+ def test_best_model2_alignment_handles_empty_trg_sentence(self):
+ # arrange
+ sentence_pair = AlignedSent([], TestIBMModel.__TEST_SRC_SENTENCE)
+ ibm_model = IBMModel([])
+
+ # act
+ a_info = ibm_model.best_model2_alignment(sentence_pair)
+
+ # assert
+ self.assertEqual(a_info.alignment[1:], ())
+ self.assertEqual(a_info.cepts, [[], [], [], [], []])
+
+ def test_neighboring_finds_neighbor_alignments(self):
+ # arrange
+ a_info = AlignmentInfo(
+ (0, 3, 2),
+ (None, 'des', 'œufs', 'verts'),
+ ('UNUSED', 'green', 'eggs'),
+ [[], [], [2], [1]]
+ )
+ ibm_model = IBMModel([])
+
+ # act
+ neighbors = ibm_model.neighboring(a_info)
+
+ # assert
+ neighbor_alignments = set()
+ for neighbor in neighbors:
+ neighbor_alignments.add(neighbor.alignment)
+ expected_alignments = set([
+ # moves
+ (0, 0, 2), (0, 1, 2), (0, 2, 2),
+ (0, 3, 0), (0, 3, 1), (0, 3, 3),
+ # swaps
+ (0, 2, 3),
+ # original alignment
+ (0, 3, 2)
+ ])
+ self.assertEqual(neighbor_alignments, expected_alignments)
+
+ def test_neighboring_sets_neighbor_alignment_info(self):
+ # arrange
+ a_info = AlignmentInfo(
+ (0, 3, 2),
+ (None, 'des', 'œufs', 'verts'),
+ ('UNUSED', 'green', 'eggs'),
+ [[], [], [2], [1]]
+ )
+ ibm_model = IBMModel([])
+
+ # act
+ neighbors = ibm_model.neighboring(a_info)
+
+ # assert: select a few particular alignments
+ for neighbor in neighbors:
+ if neighbor.alignment == (0, 2, 2):
+ moved_alignment = neighbor
+ elif neighbor.alignment == (0, 3, 2):
+ swapped_alignment = neighbor
+
+ self.assertEqual(moved_alignment.cepts, [[], [], [1, 2], []])
+ self.assertEqual(swapped_alignment.cepts, [[], [], [2], [1]])
+
+ def test_neighboring_returns_neighbors_with_pegged_alignment(self):
+ # arrange
+ a_info = AlignmentInfo(
+ (0, 3, 2),
+ (None, 'des', 'œufs', 'verts'),
+ ('UNUSED', 'green', 'eggs'),
+ [[], [], [2], [1]]
+ )
+ ibm_model = IBMModel([])
+
+ # act: peg 'eggs' to align with 'œufs'
+ neighbors = ibm_model.neighboring(a_info, 2)
+
+ # assert
+ neighbor_alignments = set()
+ for neighbor in neighbors:
+ neighbor_alignments.add(neighbor.alignment)
+ expected_alignments = set([
+ # moves
+ (0, 0, 2), (0, 1, 2), (0, 2, 2),
+ # no swaps
+ # original alignment
+ (0, 3, 2)
+ ])
+ self.assertEqual(neighbor_alignments, expected_alignments)
+
+ def test_hillclimb(self):
+ # arrange
+ initial_alignment = AlignmentInfo((0, 3, 2), None, None, None)
+
+ def neighboring_mock(a, j):
+ if a.alignment == (0, 3, 2):
+ return set([
+ AlignmentInfo((0, 2, 2), None, None, None),
+ AlignmentInfo((0, 1, 1), None, None, None)
+ ])
+ elif a.alignment == (0, 2, 2):
+ return set([
+ AlignmentInfo((0, 3, 3), None, None, None),
+ AlignmentInfo((0, 4, 4), None, None, None)
+ ])
+ return set()
+
+ def prob_t_a_given_s_mock(a):
+ prob_values = {
+ (0, 3, 2): 0.5,
+ (0, 2, 2): 0.6,
+ (0, 1, 1): 0.4,
+ (0, 3, 3): 0.6,
+ (0, 4, 4): 0.7
+ }
+ return prob_values.get(a.alignment, 0.01)
+
+ ibm_model = IBMModel([])
+ ibm_model.neighboring = neighboring_mock
+ ibm_model.prob_t_a_given_s = prob_t_a_given_s_mock
+
+ # act
+ best_alignment = ibm_model.hillclimb(initial_alignment)
+
+ # assert: hill climbing goes from (0, 3, 2) -> (0, 2, 2) -> (0, 4, 4)
+ self.assertEqual(best_alignment.alignment, (0, 4, 4))
+
+ def test_sample(self):
+ # arrange
+ sentence_pair = AlignedSent(
+ TestIBMModel.__TEST_TRG_SENTENCE,
+ TestIBMModel.__TEST_SRC_SENTENCE)
+ ibm_model = IBMModel([])
+ ibm_model.prob_t_a_given_s = lambda x: 0.001
+
+ # act
+ samples, best_alignment = ibm_model.sample(sentence_pair)
+
+ # assert
+ self.assertEqual(len(samples), 61)
diff --git a/nltk/test/unit/test_json2csv_corpus.py b/nltk/test/unit/test_json2csv_corpus.py
new file mode 100644
index 0000000..4194821
--- /dev/null
+++ b/nltk/test/unit/test_json2csv_corpus.py
@@ -0,0 +1,187 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Twitter client
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Lorenzo Rubio <lrnzcig at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Regression tests for `json2csv()` and `json2csv_entities()` in Twitter
+package.
+
+"""
+
+import os
+from nltk.compat import TemporaryDirectory
+import unittest
+
+from nltk.corpus import twitter_samples
+from nltk.twitter.util import json2csv, json2csv_entities
+from nltk.compat import izip
+
+
+def are_files_identical(filename1, filename2, debug=False):
+ """
+ Compare two files, ignoring carriage returns.
+ """
+ with open(filename1, "rb") as fileA:
+ with open(filename2, "rb") as fileB:
+ result = True
+ for lineA, lineB in izip(sorted(fileA.readlines()),
+ sorted(fileB.readlines())):
+ if lineA.strip() != lineB.strip():
+ if debug:
+ print("Error while comparing files. " +
+ "First difference at line below.")
+ print("=> Output file line: {}".format(lineA))
+ print("=> Refer. file line: {}".format(lineB))
+ result = False
+ break
+ return result
+
+
+class TestJSON2CSV(unittest.TestCase):
+
+ def setUp(self):
+ with open(twitter_samples.abspath("tweets.20150430-223406.json")) as infile:
+ self.infile = [next(infile) for x in range(100)]
+ infile.close()
+ self.msg = "Test and reference files are not the same"
+ self.subdir = os.path.join(os.path.dirname(__file__), 'files')
+
+ def tearDown(self):
+ return
+
+ def test_textoutput(self):
+ ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.text.csv.ref')
+ with TemporaryDirectory() as tempdir:
+ outfn = os.path.join(tempdir, 'tweets.20150430-223406.text.csv')
+ json2csv(self.infile, outfn, ['text'], gzip_compress=False)
+ self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
+
+
+ def test_tweet_metadata(self):
+ ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.tweet.csv.ref')
+ fields = ['created_at', 'favorite_count', 'id',
+ 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_count',
+ 'retweeted', 'text', 'truncated', 'user.id']
+
+ with TemporaryDirectory() as tempdir:
+ outfn = os.path.join(tempdir, 'tweets.20150430-223406.tweet.csv')
+ json2csv(self.infile, outfn, fields, gzip_compress=False)
+ self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
+
+
+ def test_user_metadata(self):
+ ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.user.csv.ref')
+ fields = ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count']
+
+ with TemporaryDirectory() as tempdir:
+ outfn = os.path.join(tempdir, 'tweets.20150430-223406.user.csv')
+ json2csv(self.infile, outfn, fields, gzip_compress=False)
+ self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
+
+
+ def test_tweet_hashtag(self):
+ ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.hashtag.csv.ref')
+ with TemporaryDirectory() as tempdir:
+ outfn = os.path.join(tempdir, 'tweets.20150430-223406.hashtag.csv')
+ json2csv_entities(self.infile, outfn,
+ ['id', 'text'], 'hashtags', ['text'],
+ gzip_compress=False)
+ self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
+
+
+ def test_tweet_usermention(self):
+ ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.usermention.csv.ref')
+ with TemporaryDirectory() as tempdir:
+ outfn = os.path.join(tempdir, 'tweets.20150430-223406.usermention.csv')
+ json2csv_entities(self.infile, outfn,
+ ['id', 'text'], 'user_mentions', ['id', 'screen_name'],
+ gzip_compress=False)
+ self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
+
+
+ def test_tweet_media(self):
+ ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.media.csv.ref')
+ with TemporaryDirectory() as tempdir:
+ outfn = os.path.join(tempdir, 'tweets.20150430-223406.media.csv')
+ json2csv_entities(self.infile, outfn,
+ ['id'], 'media', ['media_url', 'url'],
+ gzip_compress=False)
+
+ self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
+
+
+ def test_tweet_url(self):
+ ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.url.csv.ref')
+ with TemporaryDirectory() as tempdir:
+ outfn = os.path.join(tempdir, 'tweets.20150430-223406.url.csv')
+ json2csv_entities(self.infile, outfn,
+ ['id'], 'urls', ['url', 'expanded_url'],
+ gzip_compress=False)
+
+ self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
+
+
+ def test_userurl(self):
+ ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.userurl.csv.ref')
+ with TemporaryDirectory() as tempdir:
+ outfn = os.path.join(tempdir, 'tweets.20150430-223406.userurl.csv')
+ json2csv_entities(self.infile, outfn, ['id', 'screen_name'],
+ 'user.urls', ['url', 'expanded_url'],
+ gzip_compress=False)
+
+ self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
+
+
+ def test_tweet_place(self):
+ ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.place.csv.ref')
+ with TemporaryDirectory() as tempdir:
+ outfn = os.path.join(tempdir, 'tweets.20150430-223406.place.csv')
+ json2csv_entities(self.infile, outfn,
+ ['id', 'text'], 'place', ['name', 'country'],
+ gzip_compress=False)
+
+ self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
+
+
+ def test_tweet_place_boundingbox(self):
+ ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.placeboundingbox.csv.ref')
+ with TemporaryDirectory() as tempdir:
+ outfn = os.path.join(tempdir, 'tweets.20150430-223406.placeboundingbox.csv')
+ json2csv_entities(self.infile, outfn,
+ ['id', 'name'], 'place.bounding_box', ['coordinates'],
+ gzip_compress=False)
+
+ self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
+
+
+ def test_retweet_original_tweet(self):
+ ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.retweet.csv.ref')
+ with TemporaryDirectory() as tempdir:
+ outfn = os.path.join(tempdir, 'tweets.20150430-223406.retweet.csv')
+ json2csv_entities(self.infile, outfn, ['id'], 'retweeted_status',
+ ['created_at', 'favorite_count', 'id', 'in_reply_to_status_id',
+ 'in_reply_to_user_id', 'retweet_count', 'text', 'truncated',
+ 'user.id'],
+ gzip_compress=False)
+
+ self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)
+
+
+ def test_file_is_wrong(self):
+ """
+ Sanity check that file comparison is not giving false positives.
+ """
+ ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.retweet.csv.ref')
+ with TemporaryDirectory() as tempdir:
+ outfn = os.path.join(tempdir, 'tweets.20150430-223406.text.csv')
+ json2csv(self.infile, outfn, ['text'], gzip_compress=False)
+ self.assertFalse(are_files_identical(outfn, ref_fn), msg=self.msg)
+
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/nltk/test/unit/test_twitter_auth.py b/nltk/test/unit/test_twitter_auth.py
new file mode 100644
index 0000000..e266256
--- /dev/null
+++ b/nltk/test/unit/test_twitter_auth.py
@@ -0,0 +1,187 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for static parts of Twitter package
+"""
+
+import os
+import unittest
+
+from nltk.twitter import Authenticate
+
+
+class TestCredentials(unittest.TestCase):
+ """
+ Tests that Twitter credentials information from file is handled correctly.
+ """
+
+ def setUp(self):
+ self.subdir = os.path.join(os.path.dirname(__file__), 'files')
+ self.auth = Authenticate()
+ os.environ['TWITTER'] = 'twitter-files'
+
+ def test_environment(self):
+ """
+ Test that environment variable has been read correctly.
+ """
+ fn = os.path.basename(self.auth.creds_subdir)
+ self.assertEqual(fn, os.environ['TWITTER'])
+
+ def test_empty_subdir1(self):
+ """
+ Setting subdir to empty path should raise an error.
+ """
+ try:
+ self.auth.load_creds(subdir='')
+ # raises ValueError (zero length field name in format) for python 2.6
+ # OSError for the rest
+ except OSError:
+ pass
+ except ValueError:
+ pass
+ except Exception as e:
+ self.fail('Unexpected exception thrown: %s' % e)
+ else:
+ self.fail('OSError exception not thrown.')
+
+
+ def test_empty_subdir2(self):
+ """
+ Setting subdir to `None` should raise an error.
+ """
+ self.auth.creds_subdir = None
+ try:
+ self.auth.load_creds()
+ except ValueError:
+ pass
+ except Exception as e:
+ self.fail('Unexpected exception thrown: %s' % e)
+ else:
+ self.fail('ValueError exception not thrown.')
+
+ def test_missingdir(self):
+ """
+ Setting subdir to nonexistent directory should raise an error.
+ """
+ try:
+ self.auth.load_creds(subdir='/nosuchdir')
+ # raises ValueError (zero length field name in format) for python 2.6
+ # OSError for the rest
+ except OSError:
+ pass
+ except ValueError:
+ pass
+ except Exception as e:
+ self.fail('Unexpected exception thrown: %s' % e)
+ else:
+ self.fail('OSError exception not thrown.')
+
+
+ def test_missingfile1(self):
+ """
+ Defaults for authentication will fail since 'credentials.txt' not
+ present in default subdir, as read from `os.environ['TWITTER']`.
+ """
+ try:
+ self.auth.load_creds()
+ # raises ValueError (zero length field name in format) for python 2.6
+ # OSError for the rest
+ except OSError:
+ pass
+ except ValueError:
+ pass
+ except Exception as e:
+ self.fail('Unexpected exception thrown: %s' % e)
+ else:
+ self.fail('OSError exception not thrown.')
+
+
+ def test_missingfile2(self):
+ """
+ Credentials file 'foobar' cannot be found in default subdir.
+ """
+ try:
+ self.auth.load_creds(creds_file='foobar')
+ # raises ValueError (zero length field name in format) for python 2.6
+ # OSError for the rest
+ except OSError:
+ pass
+ except ValueError:
+ pass
+ except Exception as e:
+ self.fail('Unexpected exception thrown: %s' % e)
+ else:
+ self.fail('OSError exception not thrown.')
+
+
+
+ def test_incomplete_file(self):
+ """
+ Credentials file 'bad_oauth1-1.txt' is incomplete
+ """
+ try:
+ self.auth.load_creds(creds_file='bad_oauth1-1.txt',
+ subdir=self.subdir)
+ except ValueError:
+ pass
+ except Exception as e:
+ self.fail('Unexpected exception thrown: %s' % e)
+ else:
+ self.fail('ValueError exception not thrown.')
+
+
+ def test_malformed_file1(self):
+ """
+ First key in credentials file 'bad_oauth1-2.txt' is ill-formed
+ """
+ try:
+ self.auth.load_creds(creds_file='bad_oauth1-2.txt',
+ subdir=self.subdir)
+ except ValueError:
+ pass
+ except Exception as e:
+ self.fail('Unexpected exception thrown: %s' % e)
+ else:
+ self.fail('ValueError exception not thrown.')
+
+ def test_malformed_file2(self):
+ """
+ First key in credentials file 'bad_oauth1-3.txt' is ill-formed
+ """
+ try:
+ self.auth.load_creds(creds_file='bad_oauth1-3.txt',
+ subdir=self.subdir)
+ except ValueError:
+ pass
+ except Exception as e:
+ self.fail('Unexpected exception thrown: %s' % e)
+ else:
+ self.fail('ValueError exception not thrown.')
+
+ def test_correct_path(self):
+ """
+ Path to default credentials file is well-formed, given specified
+ subdir.
+ """
+ self.auth.load_creds(subdir=self.subdir)
+ self.auth.creds_fullpath = os.path.join(self.subdir, self.auth.creds_file)
+
+
+ def test_correct_file1(self):
+ """
+ Default credentials file is identified
+ """
+ self.auth.load_creds(subdir=self.subdir)
+ self.assertEqual(self.auth.creds_file, 'credentials.txt')
+
+
+ def test_correct_file2(self):
+ """
+ Default credentials file has been read correctly
+ """
+ oauth = self.auth.load_creds(subdir=self.subdir)
+ self.assertEqual(oauth['app_key'], 'a')
+
+
+if __name__ == '__main__':
+ unittest.main()
+
diff --git a/nltk/tgrep.py b/nltk/tgrep.py
index fd4dfa2..eeea37f 100644
--- a/nltk/tgrep.py
+++ b/nltk/tgrep.py
@@ -934,8 +934,4 @@ def tgrep_nodes(pattern, trees, search_leaves=True):
yield []
-# run module doctests
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tokenize/__init__.py b/nltk/tokenize/__init__.py
index 9200848..934c3fc 100644
--- a/nltk/tokenize/__init__.py
+++ b/nltk/tokenize/__init__.py
@@ -69,7 +69,9 @@ from nltk.tokenize.regexp import (RegexpTokenizer, WhitespaceTokenizer,
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer
+from nltk.tokenize.stanford import StanfordTokenizer
from nltk.tokenize.texttiling import TextTilingTokenizer
+from nltk.tokenize.casual import (TweetTokenizer, casual_tokenize)
# Standard sentence tokenizer.
def sent_tokenize(text, language='english'):
@@ -101,6 +103,3 @@ def word_tokenize(text, language='english'):
return [token for sent in sent_tokenize(text, language)
for token in _treebank_word_tokenize(sent)]
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tokenize/api.py b/nltk/tokenize/api.py
index 9eee06d..c2d8743 100644
--- a/nltk/tokenize/api.py
+++ b/nltk/tokenize/api.py
@@ -73,6 +73,3 @@ class StringTokenizer(TokenizerI):
yield span
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tokenize/casual.py b/nltk/tokenize/casual.py
new file mode 100644
index 0000000..f45926a
--- /dev/null
+++ b/nltk/tokenize/casual.py
@@ -0,0 +1,357 @@
+# coding: utf-8
+#
+# Natural Language Toolkit: Twitter Tokenizer
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Christopher Potts <cgpotts at stanford.edu>
+# Ewan Klein <ewan at inf.ed.ac.uk> (modifications)
+# Pierpaolo Pantone <> (modifications)
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+#
+
+
+"""
+Twitter-aware tokenizer, designed to be flexible and easy to adapt to new
+domains and tasks. The basic logic is this:
+
+1. The tuple regex_strings defines a list of regular expression
+ strings.
+
+2. The regex_strings strings are put, in order, into a compiled
+ regular expression object called word_re.
+
+3. The tokenization is done by word_re.findall(s), where s is the
+ user-supplied string, inside the tokenize() method of the class
+ Tokenizer.
+
+4. When instantiating Tokenizer objects, there is a single option:
+ preserve_case. By default, it is set to True. If it is set to
+ False, then the tokenizer will downcase everything except for
+ emoticons.
+
+"""
+
+
+
+######################################################################
+
+from __future__ import unicode_literals
+import re
+from nltk.compat import htmlentitydefs, int2byte, unichr
+
+
+######################################################################
+# The following strings are components in the regular expression
+# that is used for tokenizing. It's important that phone_number
+# appears first in the final regex (since it can contain whitespace).
+# It also could matter that tags comes after emoticons, due to the
+# possibility of having text like
+#
+# <:| and some text >:)
+#
+# Most importantly, the final element should always be last, since it
+# does a last ditch whitespace-based tokenization of whatever is left.
+
+# ToDo: Update with http://en.wikipedia.org/wiki/List_of_emoticons ?
+
+# This particular element is used in a couple ways, so we define it
+# with a name:
+EMOTICONS = r"""
+ (?:
+ [<>]?
+ [:;=8] # eyes
+ [\-o\*\']? # optional nose
+ [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
+ |
+ [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
+ [\-o\*\']? # optional nose
+ [:;=8] # eyes
+ [<>]?
+ |
+ <3 # heart
+ )"""
+
+# URL pattern due to John Gruber, modified by Tom Winzig. See
+# https://gist.github.com/winzig/8894715
+
+URLS = r""" # Capture 1: entire matched URL
+ (?:
+ https?: # URL protocol and colon
+ (?:
+ /{1,3} # 1-3 slashes
+ | # or
+ [a-z0-9%] # Single letter or digit or '%'
+ # (Trying not to match e.g. "URI::Escape")
+ )
+ | # or
+ # looks like domain name followed by a slash:
+ [a-z0-9.\-]+[.]
+ (?:[a-z]{2,13})
+ /
+ )
+ (?: # One or more:
+ [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
+ | # or
+ \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
+ |
+ \([^\s]+?\) # balanced parens, non-recursive: (...)
+ )+
+ (?: # End with:
+ \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
+ |
+ \([^\s]+?\) # balanced parens, non-recursive: (...)
+ | # or
+ [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
+ )
+ | # OR, the following to match naked domains:
+ (?:
+ (?<!@) # not preceded by a @, avoid matching foo@_gmail.com_
+ [a-z0-9]+
+ (?:[.\-][a-z0-9]+)*
+ [.]
+ (?:[a-z]{2,13})
+ \b
+ /?
+ (?!@) # not succeeded by a @,
+ # avoid matching "foo.na" in "foo.na@example.com"
+ )
+"""
+
+# The components of the tokenizer:
+REGEXPS = (
+ URLS,
+ # Phone numbers:
+ r"""
+ (?:
+ (?: # (international)
+ \+?[01]
+ [\-\s.]*
+ )?
+ (?: # (area code)
+ [\(]?
+ \d{3}
+ [\-\s.\)]*
+ )?
+ \d{3} # exchange
+ [\-\s.]*
+ \d{4} # base
+ )"""
+ ,
+ # ASCII Emoticons
+ EMOTICONS
+ ,
+ # HTML tags:
+ r"""<[^>\s]+>"""
+ ,
+ # ASCII Arrows
+ r"""[\-]+>|<[\-]+"""
+ ,
+ # Twitter username:
+ r"""(?:@[\w_]+)"""
+ ,
+ # Twitter hashtags:
+ r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
+ ,
+
+ # Remaining word types:
+ r"""
+ (?:[a-z][a-z'\-_]+[a-z]) # Words with apostrophes or dashes.
+ |
+ (?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals.
+ |
+ (?:[\w_]+) # Words without apostrophes or dashes.
+ |
+ (?:\.(?:\s*\.){1,}) # Ellipsis dots.
+ |
+ (?:\S) # Everything else that isn't whitespace.
+ """
+ )
+
+######################################################################
+# This is the core tokenizing regex:
+
+WORD_RE = re.compile(r"""(%s)""" % "|".join(REGEXPS), re.VERBOSE | re.I
+ | re.UNICODE)
+
+# The emoticon string gets its own regex so that we can preserve case for
+# them as needed:
+EMOTICON_RE = re.compile(EMOTICONS, re.VERBOSE | re.I | re.UNICODE)
+
+# These are for regularizing HTML entities to Unicode:
+ENT_RE = re.compile(r'&(#?(x?))([^&;\s]+);')
+
+
+######################################################################
+# Functions for converting html entities
+######################################################################
+
+def _str_to_unicode(text, encoding=None, errors='strict'):
+ if encoding is None:
+ encoding = 'utf-8'
+ if isinstance(text, bytes):
+ return text.decode(encoding, errors)
+ return text
+
+def _replace_html_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
+ """
+ Remove entities from text by converting them to their
+ corresponding unicode character.
+
+ :param text: a unicode string or a byte string encoded in the given
+ `encoding` (which defaults to 'utf-8').
+
+ :param list keep: list of entity names which should not be replaced.\
+ This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
+ and named entities (such as ``&nbsp;`` or ``&gt;``).
+
+ :param bool remove_illegal: If `True`, entities that can't be converted are\
+ removed. Otherwise, entities that can't be converted are kept "as
+ is".
+
+ :returns: A unicode string with the entities removed.
+
+ See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
+
+ >>> from nltk.tokenize.casual import _replace_html_entities
+ >>> _replace_html_entities(b'Price: &pound;100')
+ 'Price: \\xa3100'
+ >>> print(_replace_html_entities(b'Price: &pound;100'))
+ Price: £100
+ >>>
+ """
+
+ def _convert_entity(match):
+ entity_body = match.group(3)
+ if match.group(1):
+ try:
+ if match.group(2):
+ number = int(entity_body, 16)
+ else:
+ number = int(entity_body, 10)
+ # Numeric character references in the 80-9F range are typically
+ # interpreted by browsers as representing the characters mapped
+ # to bytes 80-9F in the Windows-1252 encoding. For more info
+ # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
+ if 0x80 <= number <= 0x9f:
+ return int2byte(number).decode('cp1252')
+ except ValueError:
+ number = None
+ else:
+ if entity_body in keep:
+ return match.group(0)
+ else:
+ number = htmlentitydefs.name2codepoint.get(entity_body)
+ if number is not None:
+ try:
+ return unichr(number)
+ except ValueError:
+ pass
+
+ return "" if remove_illegal else match.group(0)
+
+ return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))
+
+
+######################################################################
+
+class TweetTokenizer:
+ r"""
+ Tokenizer for tweets.
+
+ >>> from nltk.tokenize import TweetTokenizer
+ >>> tknzr = TweetTokenizer()
+ >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
+ >>> tknzr.tokenize(s0)
+ ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
+ >>> s1 = "@Joyster2012 @CathStaincliffe Good for you, girl!! Best wishes :-)"
+ >>> tknzr.tokenize(s1)
+ ['@Joyster2012', '@CathStaincliffe', 'Good', 'for', 'you', ',', 'girl', '!', '!', 'Best', 'wishes', ':-)']
+ >>> s2 = "3Points for #DreamTeam Gooo BAILEY! :) #PBB737Gold @PBBabscbn"
+ >>> tknzr.tokenize(s2)
+ ['3Points', 'for', '#DreamTeam', 'Gooo', 'BAILEY', '!', ':)', '#PBB737Gold', '@PBBabscbn']
+ >>> s3 = "@Insanomania They do... Their mentality doesn't :("
+ >>> tknzr.tokenize(s3)
+ ['@Insanomania', 'They', 'do', '...', 'Their', 'mentality', "doesn't", ':(']
+ >>> s4 = "RT @facugambande: Ya por arrancar a grabar !!! #TirenTirenTiren vamoo !!"
+ >>> tknzr.tokenize(s4)
+ ['RT', '@facugambande', ':', 'Ya', 'por', 'arrancar', 'a', 'grabar', '!', '!', '!', '#TirenTirenTiren', 'vamoo', '!', '!']
+ >>> tknzr = TweetTokenizer(reduce_len=True)
+ >>> s5 = "@crushinghes the summer holidays are great but I'm so bored already :("
+ >>> tknzr.tokenize(s5)
+ ['@crushinghes', 'the', 'summer', 'holidays', 'are', 'great', 'but', "I'm", 'so', 'bored', 'already', ':(']
+
+ Examples using `strip_handles` and `reduce_len` parameters:
+
+ >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
+ >>> s6 = '@remy: This is waaaaayyyy too much for you!!!!!!'
+ >>> tknzr.tokenize(s6)
+ [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
+ >>> s7 = '@_willy65: No place for @chuck tonight. Sorry.'
+ >>> tknzr.tokenize(s7)
+ [':', 'No', 'place', 'for', 'tonight', '.', 'Sorry', '.']
+ >>> s8 = '@mar_tin is a great developer. Contact him at mar_tin@email.com'
+ >>> tknzr.tokenize(s8)
+ ['is', 'a', 'great', 'developer', '.', 'Contact', 'him', 'at', 'mar_tin', '@email', '.', 'com']
+ """
+
+ def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
+ self.preserve_case = preserve_case
+ self.reduce_len = reduce_len
+ self.strip_handles = strip_handles
+
+ def tokenize(self, text):
+ """
+ :param text: str
+ :rtype: list(str)
+ :return: a tokenized list of strings; concatenating this list returns\
+ the original string if `preserve_case=False`
+ """
+ # Fix HTML character entities:
+ text = _replace_html_entities(text)
+ # Remove username handles
+ if self.strip_handles:
+ text = remove_handles(text)
+ # Normalize word lengthening
+ if self.reduce_len:
+ text = reduce_lengthening(text)
+ # Tokenize:
+ words = WORD_RE.findall(text)
+ # Possibly alter the case, but avoid changing emoticons like :D into :d:
+ if not self.preserve_case:
+ words = list(map((lambda x : x if EMOTICON_RE.search(x) else
+ x.lower()), words))
+ return words
+
+######################################################################
+# Normalization Functions
+######################################################################
+
+def reduce_lengthening(text):
+ """
+ Replace repeated character sequences of length 3 or greater with sequences
+ of length 3.
+ """
+ pattern = re.compile(r"(.)\1{2,}")
+ return pattern.sub(r"\1\1\1", text)
+
+def remove_handles(text):
+ """
+ Remove Twitter username handles from text.
+ """
+ pattern = re.compile(r"(^|(?<=[^\w.-]))@[A-Za-z_]+\w+")
+ return pattern.sub('', text)
+
+######################################################################
+# Tokenization Function
+######################################################################
+
+def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False):
+ """
+ Convenience function for wrapping the tokenizer.
+ """
+ return TweetTokenizer(preserve_case=preserve_case, reduce_len=reduce_len,
+ strip_handles=strip_handles).tokenize(text)
+
+###############################################################################
+
diff --git a/nltk/tokenize/punkt.py b/nltk/tokenize/punkt.py
index 2700661..31db861 100644
--- a/nltk/tokenize/punkt.py
+++ b/nltk/tokenize/punkt.py
@@ -1599,6 +1599,3 @@ def demo(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
print(cleanup(l))
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tokenize/regexp.py b/nltk/tokenize/regexp.py
index 743c725..bfa4976 100644
--- a/nltk/tokenize/regexp.py
+++ b/nltk/tokenize/regexp.py
@@ -206,7 +206,4 @@ blankline_tokenize = BlanklineTokenizer().tokenize
wordpunct_tokenize = WordPunctTokenizer().tokenize
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tokenize/sexpr.py b/nltk/tokenize/sexpr.py
index 6a2713e..2e818f0 100644
--- a/nltk/tokenize/sexpr.py
+++ b/nltk/tokenize/sexpr.py
@@ -138,8 +138,5 @@ class SExprTokenizer(TokenizerI):
sexpr_tokenize = SExprTokenizer().tokenize
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tokenize/simple.py b/nltk/tokenize/simple.py
index 032d2f0..fbf6002 100644
--- a/nltk/tokenize/simple.py
+++ b/nltk/tokenize/simple.py
@@ -133,7 +133,4 @@ def line_tokenize(text, blanklines='discard'):
return LineTokenizer(blanklines).tokenize(text)
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tokenize/stanford.py b/nltk/tokenize/stanford.py
index 581698e..74a18e5 100644
--- a/nltk/tokenize/stanford.py
+++ b/nltk/tokenize/stanford.py
@@ -19,13 +19,13 @@ from nltk.internals import find_jar, config_java, java, _java_options
from nltk.tokenize.api import TokenizerI
-_stanford_url = 'http://nlp.stanford.edu/software/lex-parser.shtml'
+_stanford_url = 'http://nlp.stanford.edu/software/tokenizer.shtml'
class StanfordTokenizer(TokenizerI):
r"""
Interface to the Stanford Tokenizer
- >>> from nltk.tokenize.stanford import StanfordTokenizer
+ >>> from nltk.tokenize import StanfordTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
>>> StanfordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
@@ -107,6 +107,3 @@ def setup_module(module):
raise SkipTest('doctests from nltk.tokenize.stanford are skipped because the stanford postagger jar doesn\'t exist')
-if __name__ == '__main__':
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS)
diff --git a/nltk/tokenize/texttiling.py b/nltk/tokenize/texttiling.py
index 9316fdf..8e7bcf2 100644
--- a/nltk/tokenize/texttiling.py
+++ b/nltk/tokenize/texttiling.py
@@ -458,6 +458,3 @@ def demo(text=None):
pylab.show()
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py
index 7453ef5..55f4e62 100644
--- a/nltk/tokenize/treebank.py
+++ b/nltk/tokenize/treebank.py
@@ -103,6 +103,3 @@ class TreebankWordTokenizer(TokenizerI):
return text.split()
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tokenize/util.py b/nltk/tokenize/util.py
index 7fe7a6f..08b73fe 100644
--- a/nltk/tokenize/util.py
+++ b/nltk/tokenize/util.py
@@ -88,6 +88,3 @@ def spans_to_relative(spans):
prev = right
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/tree.py b/nltk/tree.py
index 2de9cb9..e697d6f 100644
--- a/nltk/tree.py
+++ b/nltk/tree.py
@@ -1601,6 +1601,3 @@ __all__ = ['ImmutableProbabilisticTree', 'ImmutableTree', 'ProbabilisticMixIn',
'sinica_parse', 'ParentedTree', 'MultiParentedTree',
'ImmutableParentedTree', 'ImmutableMultiParentedTree']
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/nltk/treeprettyprinter.py b/nltk/treeprettyprinter.py
index 06c1a86..d1f3ac1 100644
--- a/nltk/treeprettyprinter.py
+++ b/nltk/treeprettyprinter.py
@@ -561,6 +561,4 @@ def test():
__all__ = ['TreePrettyPrinter']
if __name__ == '__main__':
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
test()
diff --git a/nltk/twitter/__init__.py b/nltk/twitter/__init__.py
new file mode 100644
index 0000000..ef83dfc
--- /dev/null
+++ b/nltk/twitter/__init__.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Twitter
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Ewan Klein <ewan at inf.ed.ac.uk>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+NLTK Twitter Package
+
+This package contains classes for retrieving Tweet documents using the
+Twitter API.
+
+"""
+try:
+ import twython
+except ImportError:
+ import warnings
+ warnings.warn("nltk.twitter package not loaded "
+ "(please install twython library).")
+
+from nltk.twitter.util import Authenticate, credsfromfile, json2csv
+from nltk.twitter.twitterclient import Streamer, Query, Twitter,\
+ TweetViewer, TweetWriter
diff --git a/nltk/twitter/api.py b/nltk/twitter/api.py
new file mode 100644
index 0000000..cda926d
--- /dev/null
+++ b/nltk/twitter/api.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Twitter API
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Ewan Klein <ewan at inf.ed.ac.uk>
+# Lorenzo Rubio <lrnzcig at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+This module provides an interface for TweetHandlers, and support for timezone
+handling.
+"""
+
+from datetime import tzinfo, timedelta, datetime
+import time as _time
+
+
+class LocalTimezoneOffsetWithUTC(tzinfo):
+ """
+ This is not intended to be a general purpose class for dealing with the
+ local timezone. In particular:
+
+ * it assumes that the date passed has been created using
+ `datetime(..., tzinfo=Local)`, where `Local` is an instance of
+ the object `LocalTimezoneOffsetWithUTC`;
+ * for such an object, it returns the offset with UTC, used for date comparisons.
+
+ Reference: https://docs.python.org/3/library/datetime.html
+ """
+ STDOFFSET = timedelta(seconds=-_time.timezone)
+
+ if _time.daylight:
+ DSTOFFSET = timedelta(seconds=-_time.altzone)
+ else:
+ DSTOFFSET = STDOFFSET
+
+ def utcoffset(self, dt):
+ """
+ Access the relevant time offset.
+ """
+ return self.DSTOFFSET
+
+LOCAL = LocalTimezoneOffsetWithUTC()
+
+class BasicTweetHandler(object):
+ """
+ Minimum implementation of TweetHandler
+ Counts the number of tweets and decides when the client should stop
+ fetching tweets
+ """
+ def __init__(self, limit=20):
+ self.limit = limit
+ self.counter = 0
+
+ """A flag to indicate to the client that it should stop for
+ a functional clause (e.g. date limit)"""
+ self.do_stop = False
+
+ def do_continue(self):
+ """
+ Returns false if the client should stop fetching tweets
+ """
+ return self.counter < self.limit and not self.do_stop
+
+class TweetHandlerI(BasicTweetHandler):
+ """
+ Interface class whose subclasses should implement a handle method that
+ Twitter clients can delegate to.
+ """
+ def __init__(self, limit=20, date_limit=None):
+ """
+ :param int limit: The number of data items to process in the current round of\
+ processing.
+
+ :param tuple date_limit: The date at which to stop collecting new\
+ data. This should be entered as a tuple which can serve as the\
+ argument to `datetime.datetime`. E.g. `data_limit=(2015, 4, 1, 12,\
+ 40)` for 12:30 pm on April 1 2015.
+
+ """
+ BasicTweetHandler.__init__(self, limit)
+
+ self.date_limit = date_limit
+ if date_limit is not None:
+ self.date_limit = datetime(*date_limit, tzinfo=LOCAL)
+
+ self.startingup = True
+
+ def handle(self, data):
+ """
+ Deal appropriately with data returned by the Twitter API
+ """
+ raise NotImplementedError
+
+ def on_finish(self):
+ """
+ Actions when the tweet limit has been reached
+ """
+ raise NotImplementedError
+
\ No newline at end of file
diff --git a/nltk/twitter/twitter_demo.py b/nltk/twitter/twitter_demo.py
new file mode 100644
index 0000000..40ca855
--- /dev/null
+++ b/nltk/twitter/twitter_demo.py
@@ -0,0 +1,268 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Twitter client
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Ewan Klein <ewan at inf.ed.ac.uk>
+# Lorenzo Rubio <lrnzcig at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Examples to demo the :py:mod:`twitterclient` code.
+
+These demo functions should all run, with the following caveats:
+
+* You must have obtained API keys from Twitter, and installed them according to
+ the instructions in `nltk/test/twitter.ipynb`.
+
+* If you are on a slow network, some of the calls to the Twitter API may
+ timeout.
+
+* If you are being rate limited while searching, you will receive a 420
+ error response.
+
+* Your terminal window / console must be able to display UTF-8 encoded characters.
+
+For documentation about the Twitter APIs, see `The Streaming APIs Overview
+<https://dev.twitter.com/streaming/overview>`_ and `The REST APIs Overview
+<https://dev.twitter.com/rest/public>`_.
+
+For error codes see Twitter's
+`Error Codes and Responses <https://dev.twitter.com/overview/api/response-codes>`
+"""
+
+from functools import wraps
+import json
+
+from nltk.compat import StringIO
+
+from nltk.twitter import Query, Streamer, Twitter, TweetViewer, TweetWriter,\
+ credsfromfile
+
+
+SPACER = '###################################'
+
+def verbose(func):
+ """Decorator for demo functions"""
+ @wraps(func)
+ def with_formatting(*args, **kwargs):
+ print()
+ print(SPACER)
+ print("Using %s" % (func.__name__))
+ print(SPACER)
+ return func(*args, **kwargs)
+ return with_formatting
+
+
+def setup():
+ """
+ Initialize global variables for the demos.
+ """
+ global DATE, USERIDS, FIELDS
+
+ DATE = (2015, 4, 20, 16, 40)
+ USERIDS = ['759251', '612473', '15108702', '6017542', '2673523800']
+ # UserIDs corresponding to\
+ # @CNN, @BBCNews, @ReutersLive, @BreakingNews, @AJELive
+ FIELDS = ['id_str']
+
+
+@verbose
+def twitterclass_demo():
+ """
+ Use the simplified :class:`Twitter` class to write some tweets to a file.
+ """
+ tw = Twitter()
+ print("Track from the public stream\n")
+ tw.tweets(keywords='love, hate', limit=10) #public stream
+ print(SPACER)
+ print("Search past Tweets\n")
+ tw = Twitter()
+ tw.tweets(keywords='love, hate', stream=False, limit=10) # search past tweets
+ print(SPACER)
+ print("Follow two accounts in the public stream" +
+ " -- be prepared to wait a few minutes\n")
+ tw = Twitter()
+ tw.tweets(follow=['759251', '6017542'], stream=True, limit=10) #public stream
+
+
+@verbose
+def sampletoscreen_demo(limit=20):
+ """
+ Sample from the Streaming API and send output to terminal.
+ """
+ oauth = credsfromfile()
+ client = Streamer(**oauth)
+ client.register(TweetViewer(limit=limit))
+ client.sample()
+
+
+@verbose
+def tracktoscreen_demo(track="taylor swift", limit=10):
+ """
+ Track keywords from the public Streaming API and send output to terminal.
+ """
+ oauth = credsfromfile()
+ client = Streamer(**oauth)
+ client.register(TweetViewer(limit=limit))
+ client.filter(track=track)
+
+
+@verbose
+def search_demo(keywords='nltk'):
+ """
+ Use the REST API to search for past tweets containing a given keyword.
+ """
+ oauth = credsfromfile()
+ client = Query(**oauth)
+ for tweet in client.search_tweets(keywords=keywords, limit=10):
+ print(tweet['text'])
+
+
+@verbose
+def tweets_by_user_demo(user='NLTK_org', count=200):
+ """
+ Use the REST API to search for past tweets by a given user.
+ """
+ oauth = credsfromfile()
+ client = Query(**oauth)
+ client.register(TweetWriter())
+ client.user_tweets(user, count)
+
+
+@verbose
+def lookup_by_userid_demo():
+ """
+ Use the REST API to convert a userID to a screen name.
+ """
+ oauth = credsfromfile()
+ client = Query(**oauth)
+ user_info = client.user_info_from_id(USERIDS)
+ for info in user_info:
+ name = info['screen_name']
+ followers = info['followers_count']
+ following = info['friends_count']
+ print("{0}, followers: {1}, following: {2}".format(name, followers, following))
+
+
+@verbose
+def followtoscreen_demo(limit=10):
+ """
+ Using the Streaming API, select just the tweets from a specified list of
+ userIDs.
+
+ This will only give results in a reasonable time if the users in
+ question produce a high volume of tweets, and may even so show some delay.
+ """
+ oauth = credsfromfile()
+ client = Streamer(**oauth)
+ client.register(TweetViewer(limit=limit))
+ client.statuses.filter(follow=USERIDS)
+
+
+@verbose
+def streamtofile_demo(limit=20):
+ """
+ Write 20 tweets sampled from the public Streaming API to a file.
+ """
+ oauth = credsfromfile()
+ client = Streamer(**oauth)
+ client.register(TweetWriter(limit=limit, repeat=False))
+ client.statuses.sample()
+
+
+@verbose
+def limit_by_time_demo(limit=20):
+ """
+ Sample from the Streaming API and send output to terminal.
+ """
+ oauth = credsfromfile()
+ client = Streamer(**oauth)
+ client.register(TweetWriter(limit=limit, date_limit=DATE))
+ client.sample()
+
+
+@verbose
+def corpusreader_demo():
+ """
+ Use :mod:`TwitterCorpusReader` to read a file of tweets, and print out
+
+ * some full tweets in JSON format;
+ * some raw strings from the tweets (i.e., the value of the `text` field); and
+ * the result of tokenising the raw strings.
+
+ """
+ from nltk.corpus import twitter_samples as tweets
+
+ print()
+ print("Complete tweet documents")
+ print(SPACER)
+ for tweet in tweets.docs("tweets.20150430-223406.json")[:1]:
+ print(json.dumps(tweet, indent=1, sort_keys=True))
+
+ print()
+ print("Raw tweet strings:")
+ print(SPACER)
+ for text in tweets.strings("tweets.20150430-223406.json")[:15]:
+ print(text)
+
+ print()
+ print("Tokenized tweet strings:")
+ print(SPACER)
+ for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]:
+ print(toks)
+
+
+@verbose
+def expand_tweetids_demo():
+ """
+ Given a file object containing a list of Tweet IDs, fetch the
+ corresponding full Tweets.
+
+ """
+ ids_f =\
+ StringIO("""\
+ 588665495492124672
+ 588665495487909888
+ 588665495508766721
+ 588665495513006080
+ 588665495517200384
+ 588665495487811584
+ 588665495525588992
+ 588665495487844352
+ 588665495492014081
+ 588665495512948737""")
+ oauth = credsfromfile()
+ client = Query(**oauth)
+ hydrated = client.expand_tweetids(ids_f)
+
+ for tweet in hydrated:
+ try:
+ id_str = tweet['id_str']
+ print('id: {}\ntext: {}\n'.format(id_str, tweet['text']))
+ except IndexError:
+ pass
+
+
+
+ALL = [twitterclass_demo, sampletoscreen_demo, tracktoscreen_demo,
+ search_demo, tweets_by_user_demo, lookup_by_userid_demo, followtoscreen_demo,
+ streamtofile_demo, limit_by_time_demo, corpusreader_demo, expand_tweetids_demo]
+
+"""
+Select demo functions to run. E.g. replace the following line with "DEMOS =
+ALL[8:]" to execute only the final two demos.
+"""
+DEMOS = ALL[:]
+
+if __name__ == "__main__":
+ setup()
+
+ for demo in DEMOS:
+ demo()
+
+ print("\n" + SPACER)
+ print("All demos completed")
+ print(SPACER)
+
+
diff --git a/nltk/twitter/twitterclient.py b/nltk/twitter/twitterclient.py
new file mode 100644
index 0000000..7b69ea9
--- /dev/null
+++ b/nltk/twitter/twitterclient.py
@@ -0,0 +1,506 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Twitter client
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Ewan Klein <ewan at inf.ed.ac.uk>
+# Lorenzo Rubio <lrnzcig at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+
+"""
+NLTK Twitter client
+
+This module offers methods for collecting and processing Tweets. Most of the
+functionality depends on access to the Twitter APIs, and this is handled via
+the third party Twython library.
+
+If one of the methods below returns an integer, it is probably a `Twitter
+error code <https://dev.twitter.com/overview/api/response-codes>`_. For
+example, the response of '420' means that you have reached the limit of the
+requests you can currently make to the Twitter API. Currently, `rate limits
+for the search API <https://dev.twitter.com/rest/public/rate-limiting>`_ are
+divided into 15 minute windows.
+"""
+
+import datetime
+import itertools
+import json
+import os
+import requests
+import time
+import gzip
+from nltk.compat import UTC
+
+
+from twython import Twython, TwythonStreamer
+from twython.exceptions import TwythonRateLimitError, TwythonError
+
+from nltk.twitter.util import credsfromfile, guess_path
+from nltk.twitter.api import TweetHandlerI, BasicTweetHandler
+
+
+
+class Streamer(TwythonStreamer):
+ """
+ Retrieve data from the Twitter Streaming API.
+
+ The streaming API requires
+ `OAuth 1.0 <http://en.wikipedia.org/wiki/OAuth>`_ authentication.
+ """
+ def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret):
+
+ self.handler = None
+ self.do_continue = True
+ TwythonStreamer.__init__(self, app_key, app_secret, oauth_token,
+ oauth_token_secret)
+
+ def register(self, handler):
+ """
+ Register a method for handling Tweets.
+
+ :param TweetHandlerI handler: method for viewing
+ """
+ self.handler = handler
+
+ def on_success(self, data):
+ """
+ :param data: response from Twitter API
+ """
+ if self.do_continue:
+ if self.handler is not None:
+ if 'text' in data:
+ self.handler.counter += 1
+ self.handler.handle(data)
+ self.do_continue = self.handler.do_continue()
+ else:
+ raise ValueError("No data handler has been registered.")
+ else:
+ self.disconnect()
+ self.handler.on_finish()
+
+
+ def on_error(self, status_code, data):
+ """
+ :param status_code: The status code returned by the Twitter API
+ :param data: The response from Twitter API
+
+ """
+ print(status_code)
+
+ def sample(self):
+ """
+ Wrapper for 'statuses / sample' API call
+ """
+ while self.do_continue:
+
+ # Stream in an endless loop until limit is reached. See twython
+ # issue 288: https://github.com/ryanmcgrath/twython/issues/288
+ # colditzjb commented on 9 Dec 2014
+
+ try:
+ self.statuses.sample()
+ except requests.exceptions.ChunkedEncodingError as e:
+ if e is not None:
+ print("Error (stream will continue): {0}".format(e))
+ continue
+
+ def filter(self, track='', follow='', lang='en'):
+ """
+ Wrapper for 'statuses / filter' API call
+ """
+ while self.do_continue:
+ #Stream in an endless loop until limit is reached
+
+ try:
+ if track == '' and follow == '':
+ msg = "Please supply a value for 'track', 'follow'"
+ raise ValueError(msg)
+ self.statuses.filter(track=track, follow=follow, lang=lang)
+ except requests.exceptions.ChunkedEncodingError as e:
+ if e is not None:
+ print("Error (stream will continue): {0}".format(e))
+ continue
+
+
+class Query(Twython):
+ """
+ Retrieve data from the Twitter REST API.
+ """
+ def __init__(self, app_key, app_secret, oauth_token,
+ oauth_token_secret):
+ self.handler = None
+ self.do_continue = True
+ Twython.__init__(self, app_key, app_secret, oauth_token, oauth_token_secret)
+
+ def register(self, handler):
+ """
+ Register a method for handling Tweets.
+
+ :param TweetHandlerI handler: method for viewing or writing Tweets to a file.
+ """
+ self.handler = handler
+
+ def expand_tweetids(self, ids_f, verbose=True):
+ """
+ Given a file object containing a list of Tweet IDs, fetch the
+ corresponding full Tweets from the Twitter API.
+
+ The API call `statuses/lookup` will fail to retrieve a Tweet if the
+ user has deleted it.
+
+ This call to the Twitter API is rate-limited. See
+ <https://dev.twitter.com/rest/reference/get/statuses/lookup> for details.
+
+ :param ids_f: input file object consisting of Tweet IDs, one to a line
+ :return: iterable of Tweet objects in JSON format
+ """
+ ids = [line.strip() for line in ids_f if line]
+
+ if verbose:
+ print("Counted {0} Tweet IDs in {1}.".format(len(ids), ids_f))
+
+ # The Twitter endpoint takes lists of up to 100 ids, so we chunk the
+ # ids.
+ id_chunks = [ids[i:i+100] for i in range(0, len(ids), 100)]
+
+ chunked_tweets = (self.lookup_status(id=chunk) for chunk in
+ id_chunks)
+
+ return itertools.chain.from_iterable(chunked_tweets)
+
+
+
+ def _search_tweets(self, keywords, limit=100, lang='en'):
+ """
+ Assumes that the handler has been informed. Fetches Tweets from
+ search_tweets generator output and passes them to handler
+
+ :param str keywords: A list of query terms to search for, written as\
+ a comma-separated string.
+ :param int limit: Number of Tweets to process
+ :param str lang: language
+ """
+ while True:
+ if isinstance(self.handler, TweetWriter):
+ max_id = self.handler.max_id
+ else:
+ max_id = None
+ tweets = self.search_tweets(keywords=keywords, limit=limit, lang=lang,
+ max_id=max_id)
+ for tweet in tweets:
+ self.handler.handle(tweet)
+ if not (self.handler.do_continue() and self.handler.repeat):
+ break
+ self.handler.on_finish()
+
def search_tweets(self, keywords, limit=100, lang='en', max_id=None,
                  retries_after_twython_exception=0):
    """
    Call the REST API ``'search/tweets'`` endpoint with some plausible
    defaults. See `the Twitter search documentation
    <https://dev.twitter.com/rest/public/search>`_ for more information
    about admissible search parameters.

    :param str keywords: A list of query terms to search for, written as\
    a comma-separated string
    :param int limit: Number of Tweets to process
    :param str lang: language
    :param int max_id: id of the last tweet fetched
    :param int retries_after_twython_exception: number of retries when\
    searching Tweets before raising an exception
    :rtype: python generator
    """
    if not self.handler:
        # if no handler is provided, `BasicTweetHandler` provides minimum
        # functionality for limiting the number of Tweets retrieved
        self.handler = BasicTweetHandler(limit=limit)

    count_from_query = 0
    if not max_id:
        results = self.search(q=keywords, count=min(100, limit), lang=lang,
                              result_type='recent')
        count = len(results['statuses'])
        if count == 0:
            print("No Tweets available through REST API for those keywords")
            return
        count_from_query = count
        # Resume pagination from just before the oldest Tweet in this batch.
        max_id = results['statuses'][count - 1]['id'] - 1

        for result in results['statuses']:
            yield result
            self.handler.counter += 1
            if not self.handler.do_continue():
                return

    # Pagination loop: keep fetching Tweets until the desired count is
    # reached while dealing with Twitter rate limits.
    retries = 0
    while count_from_query < limit:
        try:
            mcount = min(100, limit - count_from_query)
            results = self.search(q=keywords, count=mcount, lang=lang,
                                  max_id=max_id, result_type='recent')
        except TwythonRateLimitError as e:
            print("Waiting for 15 minutes -{0}".format(e))
            time.sleep(15 * 60)  # wait 15 minutes
            continue
        except TwythonError as e:
            print("Fatal error in Twython request -{0}".format(e))
            if retries_after_twython_exception == retries:
                raise e
            retries += 1
            # BUGFIX: retry the request instead of falling through and
            # reprocessing the stale `results` from the previous page.
            continue

        count = len(results['statuses'])
        if count == 0:
            print("No more Tweets available through rest api")
            return
        count_from_query += count
        # the max_id is also present in the Tweet metadata
        # results['search_metadata']['next_results'], but as part of a
        # query and difficult to fetch. This is doing the equivalent
        # (last tweet id minus one)
        max_id = results['statuses'][count - 1]['id'] - 1
        self.handler.max_id = max_id

        for result in results['statuses']:
            yield result
            self.handler.counter += 1
            if not self.handler.do_continue():
                return
+
def user_info_from_id(self, userids):
    """
    Convert a list of userIDs into a variety of information about the users.

    See <https://dev.twitter.com/rest/reference/get/users/show>.

    :param list userids: A list of integer strings corresponding to Twitter userIDs
    :rtype: list(json)
    """
    profiles = []
    for userid in userids:
        profiles.append(self.show_user(user_id=userid))
    return profiles
+
def user_tweets(self, screen_name, limit, include_rts='false'):
    """
    Fetch a collection of the most recent Tweets posted by the user and
    pass them to the registered handler.

    :param str screen_name: The user's screen name; the initial '@' symbol\
    should be omitted
    :param int limit: The number of Tweets to recover; 200 is the maximum allowed
    :param str include_rts: Whether to include statuses which have been\
    retweeted by the user; possible values are 'true' and 'false'
    """
    timeline = self.get_user_timeline(screen_name=screen_name,
                                      count=limit,
                                      include_rts=include_rts)
    self.handler.handle(timeline)
+
+
+
+
class Twitter(object):
    """
    Wrapper class with restricted functionality and fewer options.
    """
    def __init__(self):
        # Credentials are read via `credsfromfile` (which looks at the
        # TWITTER environment variable by default).
        self._oauth = credsfromfile()
        self.streamer = Streamer(**self._oauth)
        self.query = Query(**self._oauth)


    def tweets(self, keywords='', follow='', to_screen=True, stream=True,
               limit=100, date_limit=None, lang='en', repeat=False,
               gzip_compress=False):
        """
        Process some Tweets in a simple manner.

        :param str keywords: Keywords to use for searching or filtering
        :param list follow: UserIDs to use for filtering Tweets from the public stream
        :param bool to_screen: If `True`, display the tweet texts on the screen,\
        otherwise print to a file

        :param bool stream: If `True`, use the live public stream,\
        otherwise search past public Tweets

        :param int limit: Number of Tweets to process
        :param tuple date_limit: The date at which to stop collecting new\
        data. This should be entered as a tuple which can serve as the\
        argument to `datetime.datetime`. E.g. `date_limit=(2015, 4, 1, 12,\
        40)` for 12:40 pm on April 1 2015.\
        Note that, in the case of streaming, it is the maximum date, i.e.\
        a date in the future; if not, it is the minimum date, i.e. a date\
        in the past

        :param str lang: language

        :param bool repeat: flag to determine whether multiple files should be\
        written. If `True`, the length of each file will be set by the value\
        of `limit`. Use only if `to_screen` is `False`. See also :py:func:`handle`.

        :param gzip_compress: if `True`, output files are compressed with gzip
        """
        if to_screen:
            handler = TweetViewer(limit=limit, date_limit=date_limit)
        else:
            handler = TweetWriter(limit=limit, date_limit=date_limit,
                                  stream=stream, repeat=repeat,
                                  gzip_compress=gzip_compress)

        if stream:
            self.streamer.register(handler)
            if keywords == '' and follow == '':
                # No filter terms: sample the unfiltered public stream.
                self.streamer.sample()
            else:
                self.streamer.filter(track=keywords, follow=follow, lang=lang)
        else:
            self.query.register(handler)
            if keywords == '':
                raise ValueError("Please supply at least one keyword to search for.")
            else:
                self.query._search_tweets(keywords, limit=limit, lang=lang)
+
+
+
class TweetViewer(TweetHandlerI):
    """
    Handle data by sending it to the terminal.
    """

    def handle(self, data):
        """
        Print the Tweet's text to `sys.stdout` and count it.

        :param data: Tweet object returned by Twitter API
        """
        print(data['text'])
        self.counter += 1

    def on_finish(self):
        print('Written {0} Tweets'.format(self.counter))
+
+
class TweetWriter(TweetHandlerI):
    """
    Handle data by writing it to a file.
    """
    def __init__(self, limit=2000, date_limit=None, stream=True,
                 fprefix='tweets', subdir='twitter-files', repeat=False,
                 gzip_compress=False):
        """
        :param int limit: number of data items to process in the current\
        round of processing

        :param tuple date_limit: datetime arguments at which collection\
        should stop; see :py:func:`handle`

        :param bool stream: If `True`, use the live public stream,\
        otherwise search past public Tweets

        :param str fprefix: The prefix to use in creating file names for Tweet\
        collections

        :param str subdir: The name of the directory where Tweet collection\
        files should be stored

        :param bool repeat: flag to determine whether multiple files should be\
        written. If `True`, the length of each file will be set by the value\
        of `limit`. See also :py:func:`handle`.

        :param gzip_compress: if `True`, output files are compressed with gzip
        """
        self.fprefix = fprefix
        self.subdir = guess_path(subdir)
        self.gzip_compress = gzip_compress
        self.fname = self.timestamped_file()
        self.stream = stream
        self.repeat = repeat
        # max_id stores the id of the oldest tweet fetched so far; it is
        # used by `Query.search_tweets` to paginate backwards
        self.max_id = None
        self.output = None
        TweetHandlerI.__init__(self, limit, date_limit)


    def timestamped_file(self):
        """
        :return: timestamped file name
        :rtype: str
        """
        subdir = self.subdir
        fprefix = self.fprefix
        if subdir:
            if not os.path.exists(subdir):
                os.mkdir(subdir)

        fname = os.path.join(subdir, fprefix)
        fmt = '%Y%m%d-%H%M%S'
        timestamp = datetime.datetime.now().strftime(fmt)
        if self.gzip_compress:
            suffix = '.gz'
        else:
            suffix = ''
        outfile = '{0}.{1}.json{2}'.format(fname, timestamp, suffix)
        return outfile


    def handle(self, data):
        """
        Write Twitter data as line-delimited JSON into one or more files.

        :param data: tweet object returned by Twitter API
        """
        if self.startingup:
            # Lazily open the output file on the first Tweet of each round.
            if self.gzip_compress:
                self.output = gzip.open(self.fname, 'w')
            else:
                self.output = open(self.fname, 'w')
            print('Writing to {0}'.format(self.fname))

        json_data = json.dumps(data)
        if self.gzip_compress:
            self.output.write((json_data + "\n").encode('utf-8'))
        else:
            self.output.write(json_data + "\n")

        if self.date_limit:
            tweet_date = datetime.datetime.strptime(
                data['created_at'], '%a %b %d %H:%M:%S +0000 %Y'
            ).replace(tzinfo=UTC)
            # When streaming, stop once the Tweets become *newer* than the
            # limit; when searching backwards, stop once they become *older*.
            if (tweet_date > self.date_limit and self.stream == True) or \
               (tweet_date < self.date_limit and self.stream == False):
                if self.stream:
                    message = "earlier"
                else:
                    message = "later"
                print("Date limit {0} is {1} than date of current tweet {2}".
                      format(self.date_limit, message, tweet_date))
                self.do_stop = True
                return

        self.startingup = False

    def on_finish(self):
        print('Written {0} Tweets'.format(self.counter))
        if self.output:
            self.output.close()

    def do_continue(self):
        if self.repeat == False:
            return TweetHandlerI.do_continue(self)

        if self.do_stop:
            # stop for a functional cause (e.g. date limit)
            return False

        if self.counter == self.limit:
            # repeat is True, thus close the output file and create a new one
            self._restart_file()
        # BUGFIX: previously this method fell off the end (returning None,
        # which is falsy) whenever `repeat` was True and neither stop
        # condition held, silently ending collection after the first batch.
        return True


    def _restart_file(self):
        # Close the current file and start a fresh timestamped one.
        self.on_finish()
        self.fname = self.timestamped_file()
        self.startingup = True
        self.counter = 0
+
diff --git a/nltk/twitter/util.py b/nltk/twitter/util.py
new file mode 100644
index 0000000..d26d32c
--- /dev/null
+++ b/nltk/twitter/util.py
@@ -0,0 +1,389 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Twitter client
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Ewan Klein <ewan at inf.ed.ac.uk>
+# Lorenzo Rubio <lrnzcig at gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Utility functions to accompany :module:`twitterclient`.
+"""
+from __future__ import print_function
+import csv
+import json
+import os
+import pprint
+import nltk.compat as compat
+import gzip
+
+from twython import Twython
+
HIER_SEPARATOR = "."

def extract_fields(tweet, fields):
    """
    Extract field values from a full tweet and return them as a list

    :param json tweet: The tweet in JSON format
    :param list fields: The fields to be extracted from the tweet
    :rtype: list(str)
    :raises RuntimeError: if a requested field cannot be found in the tweet
    """
    out = []
    for field in fields:
        try:
            _add_field_to_out(tweet, field, out)
        except (KeyError, TypeError):
            # BUGFIX: a missing key raises KeyError, which the original
            # `except TypeError` did not catch, so callers saw a bare
            # KeyError instead of the intended RuntimeError.
            raise RuntimeError('Fatal error when extracting fields. Cannot find field ', field)
    return out


def _add_field_to_out(json, field, out):
    # Recurse through dotted ("composed") field names, e.g. 'user.id',
    # until the leaf key is reached, then append its value.
    if _is_composed_key(field):
        key, value = _get_key_value_composed(field)
        _add_field_to_out(json[key], value, out)
    else:
        out += [json[field]]

def _is_composed_key(field):
    # A composed key uses the hierarchy separator, e.g. 'user.id'.
    if HIER_SEPARATOR in field:
        return True
    return False

def _get_key_value_composed(field):
    out = field.split(HIER_SEPARATOR)
    # there could be up to 3 levels
    key = out[0]
    value = HIER_SEPARATOR.join(out[1:])
    return key, value
+
def _get_entity_recursive(json, entity):
    """
    Depth-first search for `entity` in a Tweet's JSON structure,
    descending only through the 'entities'/'extended_entities' wrappers
    and through lists. Returns the entity's value, or None if absent.
    """
    if not json:
        return None
    if isinstance(json, dict):
        for key, value in json.items():
            if key == entity:
                return value
            # 'entities' and 'extended_entities' are wrappers in Twitter json
            # structure that contain other Twitter objects. See:
            # https://dev.twitter.com/overview/api/entities-in-twitter-objects
            if key in ('entities', 'extended_entities'):
                found = _get_entity_recursive(value, entity)
                if found is not None:
                    return found
        return None
    if isinstance(json, list):
        for element in json:
            found = _get_entity_recursive(element, entity)
            if found is not None:
                return found
        return None
    return None
+
def json2csv(fp, outfile, fields, encoding='utf8', errors='replace',
             gzip_compress=False):
    """
    Extract selected fields from a file of line-separated JSON tweets and
    write to a file in CSV format.

    This utility function allows a file of full tweets to be easily converted
    to a CSV file for easier processing. For example, just TweetIDs or
    just the text content of the Tweets can be extracted.

    Additionally, the function allows combinations of fields of other Twitter
    objects (mainly the users, see below).

    For Twitter entities (e.g. hashtags of a Tweet), and for geolocation, see
    `json2csv_entities`

    :param fp: a file-like object containing full tweets, one JSON-encoded\
    tweet per line

    :param str outfile: The name of the text file where results should be\
    written

    :param list fields: The list of fields to be extracted. Useful examples\
    are 'id_str' for the tweetID and 'text' for the text of the tweet. See\
    <https://dev.twitter.com/overview/api/tweets> for a full list of fields.\
    e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']\
    Additionally, it allows IDs from other Twitter objects, e. g.,\
    ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count']

    :param str encoding: text encoding of the output file

    :param str errors: Behaviour for encoding errors, see\
    https://docs.python.org/3/library/codecs.html#codec-base-classes

    :param gzip_compress: if `True`, output files are compressed with gzip
    """
    (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
    # write the list of fields as header
    writer.writerow(fields)
    # process the file: one CSV row per input tweet
    for line in fp:
        tweet = json.loads(line)
        row = extract_fields(tweet, fields)
        writer.writerow(row)
    outf.close()
+
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
    """
    Identify appropriate CSV writer given the Python version

    :param str outfile: path of the file to open for writing
    :param str encoding: text encoding to use
    :param str errors: encoding error handling scheme (e.g. 'replace')
    :param gzip_compress: if `True`, the output file is gzip-compressed
    :return: a ``(writer, outf)`` pair; the caller is responsible for\
    closing ``outf``
    """
    if compat.PY3:
        # Python 3: the csv module expects text-mode file objects.
        if gzip_compress:
            outf = gzip.open(outfile, 'wt', encoding=encoding, errors=errors)
        else:
            outf = open(outfile, 'w', encoding=encoding, errors=errors)
        writer = csv.writer(outf)
    else:
        # Python 2: csv works on byte streams; encoding is delegated to
        # the project's UnicodeWriter wrapper.
        if gzip_compress:
            outf = gzip.open(outfile, 'wb')
        else:
            outf = open(outfile, 'wb')
        writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
    return (writer, outf)
+
+
+
+
def json2csv_entities(tweets_file, outfile, main_fields, entity_type, entity_fields,
                      encoding='utf8', errors='replace', gzip_compress=False):
    """
    Extract selected fields from a file of line-separated JSON tweets and
    write to a file in CSV format.

    This utility function allows a file of full Tweets to be easily converted
    to a CSV file for easier processing of Twitter entities. For example, the
    hashtags or media elements of a tweet can be extracted.

    It returns one line per entity of a Tweet, e.g. if a tweet has two hashtags
    there will be two lines in the output file, one per hashtag

    :param tweets_file: the file-like object containing full Tweets

    :param str outfile: The path of the text file where results should be\
    written

    :param list main_fields: The list of fields to be extracted from the main\
    object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\
    <https://dev.twitter.com/overview/api/tweets> for a full list of fields.
    e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']
    If `entity_type` is expressed with hierarchy, then it is the list of\
    fields of the object that corresponds to the key of the entity_type,\
    (e.g., for entity_type='user.urls', the fields in the main_fields list\
    belong to the user object; for entity_type='place.bounding_box', the\
    fields in the main_fields list belong to the place object of the tweet).

    :param list entity_type: The name of the entity: 'hashtags', 'media',\
    'urls' and 'user_mentions' for the tweet object. For a user object,\
    this needs to be expressed with a hierarchy: `'user.urls'`. For the\
    bounding box of the Tweet location, use `'place.bounding_box'`.

    :param list entity_fields: The list of fields to be extracted from the\
    entity. E.g. `['text']` (of the Tweet)

    :param str encoding: text encoding of the output file

    :param str errors: Behaviour for encoding errors, see\
    https://docs.python.org/3/library/codecs.html#codec-base-classes

    :param gzip_compress: if `True`, output files are compressed with gzip
    """

    (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
    header = get_header_field_list(main_fields, entity_type, entity_fields)
    writer.writerow(header)
    for line in tweets_file:
        tweet = json.loads(line)
        if _is_composed_key(entity_type):
            # e.g. 'user.urls': first locate the sub-object (user), then
            # pull the entity (urls) out of that sub-object.
            key, value = _get_key_value_composed(entity_type)
            object_json = _get_entity_recursive(tweet, key)
            if not object_json:
                # this can happen in the case of "place"
                continue
            object_fields = extract_fields(object_json, main_fields)
            items = _get_entity_recursive(object_json, value)
            _write_to_file(object_fields, items, entity_fields, writer)
        else:
            tweet_fields = extract_fields(tweet, main_fields)
            items = _get_entity_recursive(tweet, entity_type)
            _write_to_file(tweet_fields, items, entity_fields, writer)
    outf.close()
+
def get_header_field_list(main_fields, entity_type, entity_fields):
    """
    Build the CSV header row for `json2csv_entities`: the main-object
    field names (prefixed with the parent object's name when `entity_type`
    is hierarchical), followed by the entity field names prefixed with the
    entity name.
    """
    if _is_composed_key(entity_type):
        main_entity, sub_entity = _get_key_value_composed(entity_type)
    else:
        main_entity, sub_entity = None, entity_type

    if main_entity:
        prefixed = [HIER_SEPARATOR.join([main_entity, f]) for f in main_fields]
    else:
        prefixed = main_fields
    entity_columns = [HIER_SEPARATOR.join([sub_entity, f]) for f in entity_fields]
    return prefixed + entity_columns
+
def _write_to_file(object_fields, items, entity_fields, writer):
    """
    Write one CSV row per entity item, prefixing each row with the field
    values already extracted from the enclosing object.
    """
    if not items:
        # it could be that the entity is just not present for the tweet
        # e.g. tweet hashtag is always present, even as [], however
        # tweet media may not be present
        return
    if isinstance(items, dict):
        # this happens e.g. for "place" of a tweet
        row = object_fields
        # there might be composed keys in the list of required fields
        entity_field_values = [x for x in entity_fields if not _is_composed_key(x)]
        entity_field_composed = [x for x in entity_fields if _is_composed_key(x)]
        for field in entity_field_values:
            value = items[field]
            if isinstance(value, list):
                # a list value (e.g. bounding-box coordinates) expands
                # into several columns
                row += value
            else:
                row += [value]
        # now check required dictionaries
        for d in entity_field_composed:
            kd, vd = _get_key_value_composed(d)
            json_dict = items[kd]
            if not isinstance(json_dict, dict):
                raise RuntimeError("""Key {0} does not contain a dictionary
                in the json file""".format(kd))
            row += [json_dict[vd]]
        writer.writerow(row)
        return
    # in general it is a list
    for item in items:
        row = object_fields + extract_fields(item, entity_fields)
        writer.writerow(row)
+
+
def credsfromfile(creds_file=None, subdir=None, verbose=False):
    """
    Convenience function for authentication.

    Equivalent to constructing an `Authenticate` instance and calling its
    ``load_creds`` method with the same arguments.

    :param str creds_file: file containing the credentials; `None` selects\
    the default 'credentials.txt'
    :param str subdir: directory containing the credentials file
    :param bool verbose: if `True`, print progress messages
    :return: a dictionary of OAuth credentials
    """
    auth = Authenticate()
    return auth.load_creds(creds_file=creds_file, subdir=subdir,
                           verbose=verbose)
+
+
class Authenticate(object):
    """
    Methods for authenticating with Twitter.
    """
    def __init__(self):
        # Default credentials file name; can be overridden in `load_creds`.
        self.creds_file = 'credentials.txt'
        self.creds_fullpath = None

        # Mapping of OAuth key names to values, populated by `load_creds`.
        self.oauth = {}
        try:
            self.twitter_dir = os.environ['TWITTER']
            self.creds_subdir = self.twitter_dir
        except KeyError:
            # The TWITTER environment variable is not set; a subdir must
            # then be supplied explicitly to `load_creds`.
            self.twitter_dir = None
            self.creds_subdir = None


    def load_creds(self, creds_file=None, subdir=None, verbose=False):
        """
        Read OAuth credentials from a text file.

        ::
           File format for OAuth 1
           =======================
           app_key=YOUR_APP_KEY
           app_secret=YOUR_APP_SECRET
           oauth_token=OAUTH_TOKEN
           oauth_token_secret=OAUTH_TOKEN_SECRET


        ::
           File format for OAuth 2
           =======================

           app_key=YOUR_APP_KEY
           app_secret=YOUR_APP_SECRET
           access_token=ACCESS_TOKEN

        :param str creds_file: File containing credentials. ``None`` (default) reads\
        data from `TWITTER/'credentials.txt'`
        :param str subdir: Directory containing the credentials file;\
        defaults to the TWITTER environment variable
        :param bool verbose: if `True`, print progress messages
        :return: a dictionary of OAuth credentials
        :raises ValueError: if no subdir can be determined, or if the file\
        has missing/incorrect entries
        :raises OSError: if the credentials file does not exist
        """
        if creds_file is not None:
            self.creds_file = creds_file

        if subdir is None:
            if self.creds_subdir is None:
                msg = "Supply a value to the 'subdir' parameter or" +\
                      " set the TWITTER environment variable."
                raise ValueError(msg)
        else:
            self.creds_subdir = subdir

        self.creds_fullpath =\
            os.path.normpath(os.path.join(self.creds_subdir, self.creds_file))

        if not os.path.isfile(self.creds_fullpath):
            raise OSError('Cannot find file {}'.format(self.creds_fullpath))

        with open(self.creds_fullpath) as infile:
            if verbose:
                print('Reading credentials file {}'.format(self.creds_fullpath))

            # Each credential is stored as a 'name=value' line; any other
            # lines are ignored.
            for line in infile:
                if '=' in line:
                    name, value = line.split('=', 1)
                    self.oauth[name.strip()] = value.strip()

        self._validate_creds_file(verbose=verbose)

        return self.oauth

    def _validate_creds_file(self, verbose=False):
        """Check validity of a credentials file."""
        # Accept either a complete OAuth 1 key set or a complete OAuth 2 set.
        oauth1 = False
        oauth1_keys = ['app_key', 'app_secret', 'oauth_token', 'oauth_token_secret']
        oauth2 = False
        oauth2_keys = ['app_key', 'app_secret', 'access_token']
        if all(k in self.oauth for k in oauth1_keys):
            oauth1 = True
        elif all(k in self.oauth for k in oauth2_keys):
            oauth2 = True

        if not (oauth1 or oauth2):
            msg = 'Missing or incorrect entries in {}\n'.format(self.creds_file)
            msg += pprint.pformat(self.oauth)
            raise ValueError(msg)
        elif verbose:
            print('Credentials file "{}" looks good'.format(self.creds_file))
+
+
def add_access_token(creds_file=None):
    """
    For OAuth 2, retrieve an access token for an app and append it to a
    credentials file.

    :param str creds_file: file containing the app key and secret; `None`\
    (default) uses 'credentials2.txt' next to this module
    """
    if creds_file is None:
        here = os.path.dirname(__file__)
        creds_file = os.path.join(here, 'credentials2.txt')
    oauth2 = credsfromfile(creds_file=creds_file)
    twitter = Twython(oauth2['app_key'], oauth2['app_secret'], oauth_version=2)
    access_token = twitter.obtain_access_token()
    tok = 'access_token={}\n'.format(access_token)
    with open(creds_file, 'a') as outfile:
        print(tok, file=outfile)
+
+
def guess_path(pth):
    """
    If the path is not absolute, guess that it is a subdirectory of the
    user's home directory.

    :param str pth: The pathname of the directory where files of tweets should be written
    :return: `pth` unchanged if absolute, otherwise `~/pth` expanded
    """
    return pth if os.path.isabs(pth) else os.path.expanduser(os.path.join("~", pth))
+
diff --git a/nltk/wsd.py b/nltk/wsd.py
index 7648c91..86ad773 100644
--- a/nltk/wsd.py
+++ b/nltk/wsd.py
@@ -51,6 +51,3 @@ def lesk(context_sentence, ambiguous_word, pos=None, synsets=None):
return sense
-if __name__ == "__main__":
- import doctest
- doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
diff --git a/setup.cfg b/setup.cfg
index 861a9f5..ebbec92 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[egg_info]
tag_build =
-tag_date = 0
tag_svn_revision = 0
+tag_date = 0
diff --git a/setup.py b/setup.py
index a90eca6..d465f39 100644
--- a/setup.py
+++ b/setup.py
@@ -70,6 +70,7 @@ natural language processing. NLTK requires Python 2.6, 2.7, or 3.2+.""",
'Topic :: Text Processing :: Linguistic',
],
package_data = {'nltk': ['test/*.doctest', 'VERSION']},
+ install_requires = ['six>=1.9.0'],
packages = find_packages(),
zip_safe=False, # since normal files will be present too?
)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/nltk.git
More information about the debian-science-commits
mailing list