[nltk] 01/05: New upstream version 3.2.5
Gianfranco Costamagna
locutusofborg at moszumanska.debian.org
Thu Oct 26 06:57:11 UTC 2017
This is an automated email from the git hooks/post-receive script.
locutusofborg pushed a commit to branch master
in repository nltk.
commit 2a6229cc9ad9fbf8a705d9b815e8d0505b073984
Author: Gianfranco Costamagna <costamagnagianfranco at yahoo.it>
Date: Thu Oct 26 08:53:50 2017 +0200
New upstream version 3.2.5
---
PKG-INFO | 2 +-
nltk.egg-info/PKG-INFO | 2 +-
nltk.egg-info/SOURCES.txt | 5 +
nltk.egg-info/requires.txt | 12 +-
nltk/VERSION | 2 +-
nltk/book.py | 1 +
nltk/classify/decisiontree.py | 10 +-
nltk/classify/naivebayes.py | 2 +-
nltk/classify/rte_classify.py | 4 +-
nltk/cluster/api.py | 1 +
nltk/collections.py | 3 +-
nltk/corpus/reader/framenet.py | 15 +-
nltk/corpus/reader/nombank.py | 4 +-
nltk/corpus/reader/propbank.py | 4 +-
nltk/corpus/reader/verbnet.py | 402 +++++++++++++++++++--------
nltk/corpus/reader/wordlist.py | 6 +-
nltk/corpus/reader/wordnet.py | 111 +-------
nltk/data.py | 38 ++-
nltk/downloader.py | 3 +-
nltk/parse/corenlp.py | 49 +++-
nltk/parse/recursivedescent.py | 3 +-
nltk/parse/shiftreduce.py | 3 +-
nltk/sem/logic.py | 19 +-
nltk/sentiment/util.py | 11 +-
nltk/sentiment/vader.py | 24 +-
nltk/stem/arlstem.py | 355 ++++++++++++++++++++++++
nltk/stem/snowball.py | 531 +++++++++++++++++++++++++++++++++++-
nltk/stem/util.py | 10 +
nltk/tag/__init__.py | 9 +-
nltk/tag/perceptron.py | 4 +-
nltk/tag/stanford.py | 78 +++++-
nltk/test/corpus.doctest | 30 ++
nltk/test/stem.doctest | 2 +-
nltk/test/tokenize.doctest | 9 +-
nltk/test/unit/test_corenlp.py | 412 ++++++++++++++++++++++++++++
nltk/test/unit/test_stem.py | 14 +
nltk/test/unit/test_tokenize.py | 42 ++-
nltk/test/unit/test_wordnet.py | 134 +++++++++
nltk/tokenize/__init__.py | 2 -
nltk/tokenize/moses.py | 39 ++-
nltk/tokenize/nist.py | 167 ++++++++++++
nltk/tokenize/punkt.py | 4 +-
nltk/tokenize/stanford.py | 40 ++-
nltk/tokenize/stanford_segmenter.py | 34 ++-
nltk/tokenize/treebank.py | 43 ++-
nltk/tokenize/util.py | 26 +-
nltk/translate/ibm1.py | 10 +-
nltk/translate/ibm_model.py | 3 +
nltk/translate/nist_score.py | 155 +++++++++++
49 files changed, 2566 insertions(+), 323 deletions(-)
diff --git a/PKG-INFO b/PKG-INFO
index 8f2b835..69e88e6 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: nltk
-Version: 3.2.4
+Version: 3.2.5
Summary: Natural Language Toolkit
Home-page: http://nltk.org/
Author: Steven Bird
diff --git a/nltk.egg-info/PKG-INFO b/nltk.egg-info/PKG-INFO
index 8f2b835..69e88e6 100644
--- a/nltk.egg-info/PKG-INFO
+++ b/nltk.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: nltk
-Version: 3.2.4
+Version: 3.2.5
Summary: Natural Language Toolkit
Home-page: http://nltk.org/
Author: Steven Bird
diff --git a/nltk.egg-info/SOURCES.txt b/nltk.egg-info/SOURCES.txt
index 0ed3d0b..10e352a 100644
--- a/nltk.egg-info/SOURCES.txt
+++ b/nltk.egg-info/SOURCES.txt
@@ -206,6 +206,7 @@ nltk/sentiment/util.py
nltk/sentiment/vader.py
nltk/stem/__init__.py
nltk/stem/api.py
+nltk/stem/arlstem.py
nltk/stem/isri.py
nltk/stem/lancaster.py
nltk/stem/porter.py
@@ -316,6 +317,7 @@ nltk/test/unit/test_aline.py
nltk/test/unit/test_chunk.py
nltk/test/unit/test_classify.py
nltk/test/unit/test_collocations.py
+nltk/test/unit/test_corenlp.py
nltk/test/unit/test_corpora.py
nltk/test/unit/test_corpus_views.py
nltk/test/unit/test_hmm.py
@@ -328,6 +330,7 @@ nltk/test/unit/test_tag.py
nltk/test/unit/test_tgrep.py
nltk/test/unit/test_tokenize.py
nltk/test/unit/test_twitter_auth.py
+nltk/test/unit/test_wordnet.py
nltk/test/unit/utils.py
nltk/test/unit/translate/__init__.py
nltk/test/unit/translate/test_bleu.py
@@ -343,6 +346,7 @@ nltk/tokenize/api.py
nltk/tokenize/casual.py
nltk/tokenize/moses.py
nltk/tokenize/mwe.py
+nltk/tokenize/nist.py
nltk/tokenize/punkt.py
nltk/tokenize/regexp.py
nltk/tokenize/repp.py
@@ -368,6 +372,7 @@ nltk/translate/ibm4.py
nltk/translate/ibm5.py
nltk/translate/ibm_model.py
nltk/translate/metrics.py
+nltk/translate/nist_score.py
nltk/translate/phrase_based.py
nltk/translate/ribes_score.py
nltk/translate/stack_decoder.py
diff --git a/nltk.egg-info/requires.txt b/nltk.egg-info/requires.txt
index 12c2271..72836ce 100644
--- a/nltk.egg-info/requires.txt
+++ b/nltk.egg-info/requires.txt
@@ -1,15 +1,15 @@
six
[all]
-requests
-twython
-gensim
-scipy
pyparsing
matplotlib
-python-crfsuite
-numpy
+gensim
+twython
scikit-learn
+scipy
+numpy
+requests
+python-crfsuite
[corenlp]
requests
diff --git a/nltk/VERSION b/nltk/VERSION
index 351227f..5ae69bd 100644
--- a/nltk/VERSION
+++ b/nltk/VERSION
@@ -1 +1 @@
-3.2.4
+3.2.5
diff --git a/nltk/book.py b/nltk/book.py
index 7e006d2..5394736 100644
--- a/nltk/book.py
+++ b/nltk/book.py
@@ -11,6 +11,7 @@ from nltk.corpus import (gutenberg, genesis, inaugural,
nps_chat, webtext, treebank, wordnet)
from nltk.text import Text
from nltk.probability import FreqDist
+from nltk.util import bigrams
print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
diff --git a/nltk/classify/decisiontree.py b/nltk/classify/decisiontree.py
index 27897ab..2bf5742 100644
--- a/nltk/classify/decisiontree.py
+++ b/nltk/classify/decisiontree.py
@@ -266,12 +266,12 @@ class DecisionTreeClassifier(ClassifierI):
if stump_error < best_error:
best_error = stump_error
best_stump = stump
- if best_stump._decisions:
- descr = '{0}={1}'.format(best_stump._fname,
- list(best_stump._decisions.keys())[0])
- else:
- descr = '(default)'
if verbose:
+ if best_stump._decisions:
+ descr = '{0}={1}'.format(best_stump._fname,
+ list(best_stump._decisions.keys())[0])
+ else:
+ descr = '(default)'
print(('best stump for {:6d} toks uses {:20} err={:6.4f}'.format \
(len(labeled_featuresets), descr, best_error)))
return best_stump
diff --git a/nltk/classify/naivebayes.py b/nltk/classify/naivebayes.py
index 22f0861..b547a7a 100644
--- a/nltk/classify/naivebayes.py
+++ b/nltk/classify/naivebayes.py
@@ -21,7 +21,7 @@ independent, given the label:
| P(label|features) = --------------------------------------------
| P(features)
-Rather than computing P(featues) explicitly, the algorithm just
+Rather than computing P(features) explicitly, the algorithm just
calculates the numerator for each label, and normalizes them so they
sum to one:
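The normalization step this docstring describes amounts to rescaling the per-label numerators so they sum to one; a minimal sketch with hypothetical numerator values (not the classifier's internals):

    # Hypothetical P(label) * P(features|label) numerators for two labels.
    unnormalized = {'pos': 0.12, 'neg': 0.03}
    total = sum(unnormalized.values())
    # Rescale so the posterior probabilities sum to one.
    posterior = {label: score / total for label, score in unnormalized.items()}
    print(posterior)  # {'pos': 0.8, 'neg': 0.2}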
diff --git a/nltk/classify/rte_classify.py b/nltk/classify/rte_classify.py
index 1693560..968a223 100644
--- a/nltk/classify/rte_classify.py
+++ b/nltk/classify/rte_classify.py
@@ -46,7 +46,7 @@ class RTEFeatureExtractor(object):
This builds a bag of words for both the text and the hypothesis after
throwing away some stopwords, then calculates overlap and difference.
"""
- def __init__(self, rtepair, stop=True, lemmatize=False):
+ def __init__(self, rtepair, stop=True, use_lemmatize=False):
"""
:param rtepair: a ``RTEPair`` from which features should be extracted
:param stop: if ``True``, stopwords are thrown away.
@@ -69,7 +69,7 @@ class RTEFeatureExtractor(object):
self.text_words = set(self.text_tokens)
self.hyp_words = set(self.hyp_tokens)
- if lemmatize:
+ if use_lemmatize:
self.text_words = set(lemmatize(token) for token in self.text_tokens)
self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)
diff --git a/nltk/cluster/api.py b/nltk/cluster/api.py
index bf2f4ad..8679324 100644
--- a/nltk/cluster/api.py
+++ b/nltk/cluster/api.py
@@ -63,6 +63,7 @@ class ClusterI(object):
def cluster_names(self):
"""
Returns the names of the clusters.
+ :rtype: list
"""
return list(range(self.num_clusters()))
diff --git a/nltk/collections.py b/nltk/collections.py
index d915c1f..1107f7d 100644
--- a/nltk/collections.py
+++ b/nltk/collections.py
@@ -228,8 +228,7 @@ class AbstractLazySequence(object):
length += len(pieces[-1]) + 2
if length > self._MAX_REPR_SIZE and len(pieces) > 2:
return '[%s, ...]' % text_type(', ').join(pieces[:-1])
- else:
- return '[%s]' % text_type(', ').join(pieces)
+ return '[%s]' % text_type(', ').join(pieces)
def __eq__(self, other):
return (type(self) == type(other) and list(self) == list(other))
diff --git a/nltk/corpus/reader/framenet.py b/nltk/corpus/reader/framenet.py
index 26fa96e..344efb4 100644
--- a/nltk/corpus/reader/framenet.py
+++ b/nltk/corpus/reader/framenet.py
@@ -877,8 +877,7 @@ class PrettyLazyMap(LazyMap):
length += len(pieces[-1]) + 2
if length > self._MAX_REPR_SIZE and len(pieces) > 2:
return "[%s, ...]" % text_type(', ').join(pieces[:-1])
- else:
- return "[%s]" % text_type(', ').join(pieces)
+ return "[%s]" % text_type(', ').join(pieces)
@python_2_unicode_compatible
class PrettyLazyIteratorList(LazyIteratorList):
@@ -900,8 +899,7 @@ class PrettyLazyIteratorList(LazyIteratorList):
length += len(pieces[-1]) + 2
if length > self._MAX_REPR_SIZE and len(pieces) > 2:
return "[%s, ...]" % text_type(', ').join(pieces[:-1])
- else:
- return "[%s]" % text_type(', ').join(pieces)
+ return "[%s]" % text_type(', ').join(pieces)
@python_2_unicode_compatible
class PrettyLazyConcatenation(LazyConcatenation):
@@ -923,8 +921,7 @@ class PrettyLazyConcatenation(LazyConcatenation):
length += len(pieces[-1]) + 2
if length > self._MAX_REPR_SIZE and len(pieces) > 2:
return "[%s, ...]" % text_type(', ').join(pieces[:-1])
- else:
- return "[%s]" % text_type(', ').join(pieces)
+ return "[%s]" % text_type(', ').join(pieces)
def __add__(self, other):
"""Return a list concatenating self with other."""
@@ -1003,6 +1000,10 @@ class FramenetCorpusReader(XMLCorpusReader):
msg = """
+Citation: Nathan Schneider and Chuck Wooters (2017),
+"The NLTK FrameNet API: Designing for Discoverability with a Rich Linguistic Resource".
+Proceedings of EMNLP: System Demonstrations. https://arxiv.org/abs/1703.07438
+
Use the following methods to access data in FrameNet.
Provide a method name to `help()` for more information.
@@ -1023,7 +1024,7 @@ fes() to get frame elements (a.k.a. roles) matching a name pattern, optionally c
LEXICAL UNITS
=============
-lu() to look up a frame by its ID
+lu() to look up an LU by its ID
lus() to get lexical units matching a name pattern, optionally constrained by frame
lu_ids_and_names() to get a mapping from LU IDs to names
diff --git a/nltk/corpus/reader/nombank.py b/nltk/corpus/reader/nombank.py
index e1427ac..c6d7d16 100644
--- a/nltk/corpus/reader/nombank.py
+++ b/nltk/corpus/reader/nombank.py
@@ -111,9 +111,7 @@ class NombankCorpusReader(CorpusReader):
for roleset in etree.findall('predicate/roleset'):
if roleset.attrib['id'] == roleset_id:
return roleset
- else:
- raise ValueError('Roleset %s not found in %s' %
- (roleset_id, framefile))
+ raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))
def rolesets(self, baseform=None):
"""
diff --git a/nltk/corpus/reader/propbank.py b/nltk/corpus/reader/propbank.py
index 320c75a..343858a 100644
--- a/nltk/corpus/reader/propbank.py
+++ b/nltk/corpus/reader/propbank.py
@@ -108,9 +108,7 @@ class PropbankCorpusReader(CorpusReader):
for roleset in etree.findall('predicate/roleset'):
if roleset.attrib['id'] == roleset_id:
return roleset
- else:
- raise ValueError('Roleset %s not found in %s' %
- (roleset_id, framefile))
+ raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))
def rolesets(self, baseform=None):
"""
diff --git a/nltk/corpus/reader/verbnet.py b/nltk/corpus/reader/verbnet.py
index 6a34113..641cff9 100644
--- a/nltk/corpus/reader/verbnet.py
+++ b/nltk/corpus/reader/verbnet.py
@@ -9,7 +9,7 @@
An NLTK interface to the VerbNet verb lexicon
For details about VerbNet see:
-http://verbs.colorado.edu/~mpalmer/projects/verbnet.html
+https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
"""
from __future__ import unicode_literals
@@ -21,6 +21,7 @@ from six import string_types
from nltk.corpus.reader.xmldocs import XMLCorpusReader
+
class VerbnetCorpusReader(XMLCorpusReader):
"""
An NLTK interface to the VerbNet verb lexicon.
@@ -28,11 +29,11 @@ class VerbnetCorpusReader(XMLCorpusReader):
From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest
on-line verb lexicon currently available for English. It is a hierarchical
domain-independent, broad-coverage verb lexicon with mappings to other
- lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), Xtag
+ lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), XTAG
(XTAG Research Group, 2001), and FrameNet (Baker et al., 1998)."
For details about VerbNet see:
- http://verbs.colorado.edu/~mpalmer/projects/verbnet.html
+ https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
"""
# No unicode encoding param, since the data files are all XML.
@@ -41,11 +42,11 @@ class VerbnetCorpusReader(XMLCorpusReader):
self._lemma_to_class = defaultdict(list)
"""A dictionary mapping from verb lemma strings to lists of
- verbnet class identifiers."""
+ VerbNet class identifiers."""
self._wordnet_to_class = defaultdict(list)
"""A dictionary mapping from wordnet identifier strings to
- lists of verbnet class identifiers."""
+ lists of VerbNet class identifiers."""
self._class_to_fileid = {}
"""A dictionary mapping from class identifiers to
@@ -70,50 +71,49 @@ class VerbnetCorpusReader(XMLCorpusReader):
"""Regular expression used by ``_index()`` to quickly scan the corpus
for basic information."""
- def lemmas(self, classid=None):
+ def lemmas(self, vnclass=None):
"""
Return a list of all verb lemmas that appear in any class, or
in the ``classid`` if specified.
"""
- if classid is None:
+ if vnclass is None:
return sorted(self._lemma_to_class.keys())
else:
# [xx] should this include subclass members?
- vnclass = self.vnclass(classid)
+ if isinstance(vnclass, string_types):
+ vnclass = self.vnclass(vnclass)
return [member.get('name') for member in
vnclass.findall('MEMBERS/MEMBER')]
- def wordnetids(self, classid=None):
+ def wordnetids(self, vnclass=None):
"""
Return a list of all wordnet identifiers that appear in any
class, or in ``classid`` if specified.
"""
- if classid is None:
+ if vnclass is None:
return sorted(self._wordnet_to_class.keys())
else:
# [xx] should this include subclass members?
- vnclass = self.vnclass(classid)
- return sum([member.get('wn','').split() for member in
+ if isinstance(vnclass, string_types):
+ vnclass = self.vnclass(vnclass)
+ return sum([member.get('wn', '').split() for member in
vnclass.findall('MEMBERS/MEMBER')], [])
def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None):
"""
- Return a list of the verbnet class identifiers. If a file
- identifier is specified, then return only the verbnet class
+ Return a list of the VerbNet class identifiers. If a file
+ identifier is specified, then return only the VerbNet class
identifiers for classes (and subclasses) defined by that file.
- If a lemma is specified, then return only verbnet class
+ If a lemma is specified, then return only VerbNet class
identifiers for classes that contain that lemma as a member.
If a wordnetid is specified, then return only identifiers for
classes that contain that wordnetid as a member. If a classid
is specified, then return only identifiers for subclasses of
- the specified verbnet class.
+ the specified VerbNet class.
+ If nothing is specified, return all classids within VerbNet
"""
- if len([x for x in [lemma, wordnetid, fileid, classid]
- if x is not None]) > 1:
- raise ValueError('Specify at most one of: fileid, wordnetid, '
- 'fileid, classid')
if fileid is not None:
- return [c for (c,f) in self._class_to_fileid.items()
+ return [c for (c, f) in self._class_to_fileid.items()
if f == fileid]
elif lemma is not None:
return self._lemma_to_class[lemma]
@@ -127,14 +127,15 @@ class VerbnetCorpusReader(XMLCorpusReader):
return sorted(self._class_to_fileid.keys())
def vnclass(self, fileid_or_classid):
- """
+ """Returns VerbNet class ElementTree
+
Return an ElementTree containing the xml for the specified
- verbnet class.
+ VerbNet class.
:param fileid_or_classid: An identifier specifying which class
should be returned. Can be a file identifier (such as
- ``'put-9.1.xml'``), or a verbnet class identifier (such as
- ``'put-9.1'``) or a short verbnet class identifier (such as
+ ``'put-9.1.xml'``), or a VerbNet class identifier (such as
+ ``'put-9.1'``) or a short VerbNet class identifier (such as
``'9.1'``).
"""
# File identifier: just return the xml.
@@ -153,16 +154,16 @@ class VerbnetCorpusReader(XMLCorpusReader):
if classid == subclass.get('ID'):
return subclass
else:
- assert False # we saw it during _index()!
+ assert False # we saw it during _index()!
else:
- raise ValueError('Unknown identifier %s' % fileid_or_classid)
+ raise ValueError('Unknown identifier {}'.format(fileid_or_classid))
def fileids(self, vnclass_ids=None):
"""
Return a list of fileids that make up this corpus. If
``vnclass_ids`` is specified, then return the fileids that make
- up the specified verbnet class(es).
+ up the specified VerbNet class(es).
"""
if vnclass_ids is None:
return self._fileids
@@ -172,9 +173,74 @@ class VerbnetCorpusReader(XMLCorpusReader):
return [self._class_to_fileid[self.longid(vnclass_id)]
for vnclass_id in vnclass_ids]
+ def frames(self, vnclass):
+ """Given a VerbNet class, this method returns VerbNet frames
+
+ The members returned are:
+ 1) Example
+ 2) Description
+ 3) Syntax
+ 4) Semantics
+
+ :param vnclass: A VerbNet class identifier; or an ElementTree
+ containing the xml contents of a VerbNet class.
+ :return: frames - a list of frame dictionaries
+ """
+ if isinstance(vnclass, string_types):
+ vnclass = self.vnclass(vnclass)
+ frames = []
+ vnframes = vnclass.findall('FRAMES/FRAME')
+ for vnframe in vnframes:
+ frames.append({
+ 'example': self._get_example_within_frame(vnframe),
+ 'description': self._get_description_within_frame(vnframe),
+ 'syntax': self._get_syntactic_list_within_frame(vnframe),
+ 'semantics': self._get_semantics_within_frame(vnframe)
+ })
+ return frames
+
+ def subclasses(self, vnclass):
+ """Returns subclass ids, if any exist
+
+ Given a VerbNet class, this method returns subclass ids (if they exist)
+ in a list of strings.
+
+ :param vnclass: A VerbNet class identifier; or an ElementTree
+ containing the xml contents of a VerbNet class.
+ :return: list of subclasses
+ """
+ if isinstance(vnclass, string_types):
+ vnclass = self.vnclass(vnclass)
+
+ subclasses = [subclass.get('ID') for subclass in
+ vnclass.findall('SUBCLASSES/VNSUBCLASS')]
+ return subclasses
+
+ def themroles(self, vnclass):
+ """Returns thematic roles participating in a VerbNet class
+
+ Members returned as part of roles are-
+ 1) Type
+ 2) Modifiers
+
+ :param vnclass: A VerbNet class identifier; or an ElementTree
+ containing the xml contents of a VerbNet class.
+ :return: themroles: A list of thematic roles in the VerbNet class
+ """
+ if isinstance(vnclass, string_types):
+ vnclass = self.vnclass(vnclass)
+
+ themroles = []
+ for trole in vnclass.findall('THEMROLES/THEMROLE'):
+ themroles.append({
+ 'type': trole.get('type'),
+ 'modifiers': [{'value': restr.get('Value'), 'type': restr.get('type')}
+ for restr in trole.findall('SELRESTRS/SELRESTR')]
+ })
+ return themroles
######################################################################
- #{ Index Initialization
+ # { Index Initialization
######################################################################
def _index(self):
@@ -205,7 +271,7 @@ class VerbnetCorpusReader(XMLCorpusReader):
Initialize the indexes ``_lemma_to_class``,
``_wordnet_to_class``, and ``_class_to_fileid`` by scanning
through the corpus fileids. This doesn't do proper xml parsing,
- but is good enough to find everything in the standard verbnet
+ but is good enough to find everything in the standard VerbNet
corpus -- and it runs about 30 times faster than xml parsing
(with the python ElementTree; only 2-3 times faster with
cElementTree).
@@ -213,7 +279,7 @@ class VerbnetCorpusReader(XMLCorpusReader):
# nb: if we got rid of wordnet_to_class, this would run 2-3
# times faster.
for fileid in self._fileids:
- vnclass = fileid[:-4] # strip the '.xml'
+ vnclass = fileid[:-4] # strip the '.xml'
self._class_to_fileid[vnclass] = fileid
self._shortid_to_longid[self.shortid(vnclass)] = vnclass
for m in self._INDEX_RE.finditer(self.open(fileid).read()):
@@ -224,21 +290,23 @@ class VerbnetCorpusReader(XMLCorpusReader):
self._wordnet_to_class[wn].append(vnclass)
elif groups[2] is not None:
self._class_to_fileid[groups[2]] = fileid
- vnclass = groups[2] # for <MEMBER> elts.
+ vnclass = groups[2] # for <MEMBER> elts.
self._shortid_to_longid[self.shortid(vnclass)] = vnclass
else:
assert False, 'unexpected match condition'
######################################################################
- #{ Identifier conversion
+ # { Identifier conversion
######################################################################
def longid(self, shortid):
- """Given a short verbnet class identifier (eg '37.10'), map it
+ """Returns longid of a VerbNet class
+
+ Given a short VerbNet class identifier (eg '37.10'), map it
to a long id (eg 'confess-37.10'). If ``shortid`` is already a
long id, then return it as-is"""
if self._LONGID_RE.match(shortid):
- return shortid # it's already a longid.
+ return shortid # it's already a longid.
elif not self._SHORTID_RE.match(shortid):
raise ValueError('vnclass identifier %r not found' % shortid)
try:
@@ -247,11 +315,13 @@ class VerbnetCorpusReader(XMLCorpusReader):
raise ValueError('vnclass identifier %r not found' % shortid)
def shortid(self, longid):
- """Given a long verbnet class identifier (eg 'confess-37.10'),
+ """Returns shortid of a VerbNet class
+
+ Given a long VerbNet class identifier (eg 'confess-37.10'),
map it to a short id (eg '37.10'). If ``longid`` is already a
short id, then return it as-is."""
if self._SHORTID_RE.match(longid):
- return longid # it's already a shortid.
+ return longid # it's already a shortid.
m = self._LONGID_RE.match(longid)
if m:
return m.group(2)
@@ -259,16 +329,102 @@ class VerbnetCorpusReader(XMLCorpusReader):
raise ValueError('vnclass identifier %r not found' % longid)
######################################################################
- #{ Pretty Printing
+ # { Frame access utility functions
######################################################################
- def pprint(self, vnclass):
+ def _get_semantics_within_frame(self, vnframe):
+ """Returns semantics within a single frame
+
+ A utility function to retrieve semantics within a frame in VerbNet
+ Members of the semantics dictionary:
+ 1) Predicate value
+ 2) Arguments
+
+ :param vnframe: An ElementTree containing the xml contents of
+ a VerbNet frame.
+ :return: semantics: semantics dictionary
"""
+ semantics_within_single_frame = []
+ for pred in vnframe.findall('SEMANTICS/PRED'):
+ arguments = [{'type': arg.get('type'), 'value': arg.get('value')}
+ for arg in pred.findall('ARGS/ARG')]
+ semantics_within_single_frame.append({
+ 'predicate_value': pred.get('value'),
+ 'arguments': arguments
+ })
+ return semantics_within_single_frame
+
+ def _get_example_within_frame(self, vnframe):
+ """Returns example within a frame
+
+ A utility function to retrieve an example within a frame in VerbNet.
+
+ :param vnframe: An ElementTree containing the xml contents of
+ a VerbNet frame.
+ :return: example_text: The example sentence for this particular frame
+ """
+ example_element = vnframe.find('EXAMPLES/EXAMPLE')
+ if example_element is not None:
+ example_text = example_element.text
+ else:
+ example_text = ""
+ return example_text
+
+ def _get_description_within_frame(self, vnframe):
+ """Returns member description within frame
+
+ A utility function to retrieve a description of participating members
+ within a frame in VerbNet.
+
+ :param vnframe: An ElementTree containing the xml contents of
+ a VerbNet frame.
+ :return: description: a description dictionary with members - primary and secondary
+ """
+ description_element = vnframe.find('DESCRIPTION')
+ return {
+ 'primary': description_element.attrib['primary'],
+ 'secondary': description_element.get('secondary', '')
+ }
+
+ def _get_syntactic_list_within_frame(self, vnframe):
+ """Returns semantics within a frame
+
+ A utility function to retrieve semantics within a frame in VerbNet.
+ Members of the syntactic dictionary:
+ 1) POS Tag
+ 2) Modifiers
+
+ :param vnframe: An ElementTree containing the xml contents of
+ a VerbNet frame.
+ :return: syntax_within_single_frame
+ """
+ syntax_within_single_frame = []
+ for elt in vnframe.find('SYNTAX'):
+ pos_tag = elt.tag
+ modifiers = dict()
+ modifiers['value'] = elt.get('value') if 'value' in elt.attrib else ""
+ modifiers['selrestrs'] = [{'value': restr.get('Value'), 'type': restr.get('type')}
+ for restr in elt.findall('SELRESTRS/SELRESTR')]
+ modifiers['synrestrs'] = [{'value': restr.get('Value'), 'type': restr.get('type')}
+ for restr in elt.findall('SYNRESTRS/SYNRESTR')]
+ syntax_within_single_frame.append({
+ 'pos_tag': pos_tag,
+ 'modifiers': modifiers
+ })
+ return syntax_within_single_frame
+
+ ######################################################################
+ # { Pretty Printing
+ ######################################################################
+
+ def pprint(self, vnclass):
+ """Returns pretty printed version of a VerbNet class
+
Return a string containing a pretty-printed representation of
- the given verbnet class.
+ the given VerbNet class.
- :param vnclass: A verbnet class identifier; or an ElementTree
- containing the xml contents of a verbnet class.
+ :param vnclass: A VerbNet class identifier; or an ElementTree
+ containing the xml contents of a VerbNet class.
"""
if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
@@ -279,129 +435,161 @@ class VerbnetCorpusReader(XMLCorpusReader):
s += ' Thematic roles:\n'
s += self.pprint_themroles(vnclass, indent=' ') + '\n'
s += ' Frames:\n'
- s += '\n'.join(self.pprint_frame(vnframe, indent=' ')
- for vnframe in vnclass.findall('FRAMES/FRAME'))
+ s += self.pprint_frames(vnclass, indent=' ')
return s
def pprint_subclasses(self, vnclass, indent=''):
- """
+ """Returns pretty printed version of subclasses of VerbNet class
+
Return a string containing a pretty-printed representation of
- the given verbnet class's subclasses.
+ the given VerbNet class's subclasses.
- :param vnclass: A verbnet class identifier; or an ElementTree
- containing the xml contents of a verbnet class.
+ :param vnclass: A VerbNet class identifier; or an ElementTree
+ containing the xml contents of a VerbNet class.
"""
if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
- subclasses = [subclass.get('ID') for subclass in
- vnclass.findall('SUBCLASSES/VNSUBCLASS')]
+ subclasses = self.subclasses(vnclass)
if not subclasses: subclasses = ['(none)']
s = 'Subclasses: ' + ' '.join(subclasses)
return textwrap.fill(s, 70, initial_indent=indent,
- subsequent_indent=indent+' ')
+ subsequent_indent=indent + ' ')
def pprint_members(self, vnclass, indent=''):
- """
+ """Returns pretty printed version of members in a VerbNet class
+
Return a string containing a pretty-printed representation of
- the given verbnet class's member verbs.
+ the given VerbNet class's member verbs.
- :param vnclass: A verbnet class identifier; or an ElementTree
- containing the xml contents of a verbnet class.
+ :param vnclass: A VerbNet class identifier; or an ElementTree
+ containing the xml contents of a VerbNet class.
"""
if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
- members = [member.get('name') for member in
- vnclass.findall('MEMBERS/MEMBER')]
- if not members: members = ['(none)']
+ members = self.lemmas(vnclass)
+ if not members:
+ members = ['(none)']
s = 'Members: ' + ' '.join(members)
return textwrap.fill(s, 70, initial_indent=indent,
- subsequent_indent=indent+' ')
+ subsequent_indent=indent + ' ')
def pprint_themroles(self, vnclass, indent=''):
- """
+ """Returns pretty printed version of thematic roles in a VerbNet class
+
Return a string containing a pretty-printed representation of
- the given verbnet class's thematic roles.
+ the given VerbNet class's thematic roles.
- :param vnclass: A verbnet class identifier; or an ElementTree
- containing the xml contents of a verbnet class.
+ :param vnclass: A VerbNet class identifier; or an ElementTree
+ containing the xml contents of a VerbNet class.
"""
if isinstance(vnclass, string_types):
vnclass = self.vnclass(vnclass)
pieces = []
- for themrole in vnclass.findall('THEMROLES/THEMROLE'):
+ for themrole in self.themroles(vnclass):
piece = indent + '* ' + themrole.get('type')
- modifiers = ['%(Value)s%(type)s' % restr.attrib
- for restr in themrole.findall('SELRESTRS/SELRESTR')]
+ modifiers = [modifier['value'] + modifier['type']
+ for modifier in themrole['modifiers']]
if modifiers:
- piece += '[%s]' % ' '.join(modifiers)
+ piece += '[{}]'.format(' '.join(modifiers))
pieces.append(piece)
-
return '\n'.join(pieces)
- def pprint_frame(self, vnframe, indent=''):
- """
+ def pprint_frames(self, vnclass, indent=''):
+ """Returns pretty version of all frames in a VerbNet class
+
Return a string containing a pretty-printed representation of
- the given verbnet frame.
+ the list of frames within the VerbNet class.
- :param vnframe: An ElementTree containing the xml contents of
- a verbnet frame.
+ :param vnclass: A VerbNet class identifier; or an ElementTree
+ containing the xml contents of a VerbNet class.
"""
- s = self.pprint_description(vnframe, indent) + '\n'
- s += self.pprint_syntax(vnframe, indent+' Syntax: ') + '\n'
- s += indent + ' Semantics:\n'
- s += self.pprint_semantics(vnframe, indent+' ')
- return s
+ if isinstance(vnclass, string_types):
+ vnclass = self.vnclass(vnclass)
+ pieces = []
+ for vnframe in self.frames(vnclass):
+ pieces.append(self._pprint_single_frame(vnframe, indent))
+ return '\n'.join(pieces)
- def pprint_description(self, vnframe, indent=''):
+ def _pprint_single_frame(self, vnframe, indent=''):
+ """Returns pretty printed version of a single frame in a VerbNet class
+
+ Returns a string containing a pretty-printed representation of
+ the given frame.
+
+ :param vnframe: An ElementTree containing the xml contents of
+ a VerbNet frame.
"""
+ frame_string = self._pprint_description_within_frame(vnframe, indent) + '\n'
+ frame_string += self._pprint_example_within_frame(vnframe, indent + ' ') + '\n'
+ frame_string += self._pprint_syntax_within_frame(vnframe, indent + ' Syntax: ') + '\n'
+ frame_string += indent + ' Semantics:\n'
+ frame_string += self._pprint_semantics_within_frame(vnframe, indent + ' ')
+ return frame_string
+
+ def _pprint_example_within_frame(self, vnframe, indent=''):
+ """Returns pretty printed version of example within frame in a VerbNet class
+
Return a string containing a pretty-printed representation of
- the given verbnet frame description.
+ the given VerbNet frame example.
:param vnframe: An ElementTree containing the xml contents of
- a verbnet frame.
+ a Verbnet frame.
"""
- descr = vnframe.find('DESCRIPTION')
- s = indent + descr.attrib['primary']
- if descr.get('secondary', ''):
- s += ' (%s)' % descr.get('secondary')
- return s
+ if vnframe['example']:
+ return indent + ' Example: ' + vnframe['example']
- def pprint_syntax(self, vnframe, indent=''):
+ def _pprint_description_within_frame(self, vnframe, indent=''):
+ """Returns pretty printed version of a VerbNet frame description
+
+ Return a string containing a pretty-printed representation of
+ the given VerbNet frame description.
+
+ :param vnframe: An ElementTree containing the xml contents of
+ a VerbNet frame.
"""
+ description = indent + vnframe['description']['primary']
+ if vnframe['description']['secondary']:
+ description += ' ({})'.format(vnframe['description']['secondary'])
+ return description
+
+ def _pprint_syntax_within_frame(self, vnframe, indent=''):
+ """Returns pretty printed version of syntax within a frame in a VerbNet class
+
Return a string containing a pretty-printed representation of
- the given verbnet frame syntax.
+ the given VerbNet frame syntax.
:param vnframe: An ElementTree containing the xml contents of
- a verbnet frame.
+ a VerbNet frame.
"""
pieces = []
- for elt in vnframe.find('SYNTAX'):
- piece = elt.tag
- modifiers = []
- if 'value' in elt.attrib:
- modifiers.append(elt.get('value'))
- modifiers += ['%(Value)s%(type)s' % restr.attrib
- for restr in (elt.findall('SELRESTRS/SELRESTR') +
- elt.findall('SYNRESTRS/SYNRESTR'))]
- if modifiers:
- piece += '[%s]' % ' '.join(modifiers)
+ for element in vnframe['syntax']:
+ piece = element['pos_tag']
+ modifier_list = []
+ if 'value' in element['modifiers'] and element['modifiers']['value']:
+ modifier_list.append(element['modifiers']['value'])
+ modifier_list += ['{}{}'.format(restr['value'], restr['type'])
+ for restr in (element['modifiers']['selrestrs'] +
+ element['modifiers']['synrestrs'])]
+ if modifier_list:
+ piece += '[{}]'.format(' '.join(modifier_list))
pieces.append(piece)
return indent + ' '.join(pieces)
- def pprint_semantics(self, vnframe, indent=''):
- """
+ def _pprint_semantics_within_frame(self, vnframe, indent=''):
+ """Returns a pretty printed version of semantics within frame in a VerbNet class
+
Return a string containing a pretty-printed representation of
- the given verbnet frame semantics.
+ the given VerbNet frame semantics.
:param vnframe: An ElementTree containing the xml contents of
- a verbnet frame.
+ a VerbNet frame.
"""
pieces = []
- for pred in vnframe.findall('SEMANTICS/PRED'):
- args = [arg.get('value') for arg in pred.findall('ARGS/ARG')]
- pieces.append('%s(%s)' % (pred.get('value'), ', '.join(args)))
- return '\n'.join('%s* %s' % (indent, piece) for piece in pieces)
+ for predicate in vnframe['semantics']:
+ arguments = [argument['value'] for argument in predicate['arguments']]
+ pieces.append('{}({})'.format(predicate['predicate_value'], ', '.join(arguments)))
+ return '\n'.join('{}* {}'.format(indent, piece) for piece in pieces)
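The frame and role accessors added above (frames(), subclasses(), themroles()) and the reworked pretty printers can be exercised roughly as follows; a minimal sketch assuming the verbnet corpus data is installed and using 'put-9.1' as an example class identifier:

    from nltk.corpus import verbnet

    vn_class = 'put-9.1'                        # example VerbNet class identifier
    print(verbnet.subclasses(vn_class))         # subclass ids of the class, if any
    print(verbnet.themroles(vn_class))          # thematic roles with their modifiers
    for frame in verbnet.frames(vn_class):      # each frame is a dict with 'example',
        print(frame['description']['primary'])  # 'description', 'syntax', 'semantics'
    print(verbnet.pprint(vn_class))             # pretty-printed summary of the class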
diff --git a/nltk/corpus/reader/wordlist.py b/nltk/corpus/reader/wordlist.py
index 85f529e..24e06ae 100644
--- a/nltk/corpus/reader/wordlist.py
+++ b/nltk/corpus/reader/wordlist.py
@@ -88,7 +88,9 @@ class UnicharsCorpusReader(WordListCorpusReader):
# These are categories similar to the Perl Unicode Properties
available_categories = ['Close_Punctuation', 'Currency_Symbol',
'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc',
- 'IsSo', 'Open_Punctuation']
+ 'IsSo', 'IsUpper', 'Line_Separator', 'Number',
+ 'Open_Punctuation', 'Punctuation', 'Separator',
+ 'Symbol']
def chars(self, category=None, fileids=None):
"""
@@ -101,7 +103,7 @@ class UnicharsCorpusReader(WordListCorpusReader):
>>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5']
True
>>> pup.available_categories
- ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'Open_Punctuation']
+ ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol']
:return: a list of characters given the specific unicode character category
"""
diff --git a/nltk/corpus/reader/wordnet.py b/nltk/corpus/reader/wordnet.py
index 3069a83..7063aed 100644
--- a/nltk/corpus/reader/wordnet.py
+++ b/nltk/corpus/reader/wordnet.py
@@ -112,7 +112,7 @@ VERB_FRAME_STRINGS = (
"It %s that CLAUSE",
"Something %s INFINITIVE")
-SENSENUM_RE = re.compile(r'\.\d\d\.')
+SENSENUM_RE = re.compile(r'\.[\d]+\.')
######################################################################
@@ -132,13 +132,13 @@ class _WordNetObject(object):
return self._related('@')
def _hypernyms(self):
- return self._related('@', sort=False)
+ return self._related('@')
def instance_hypernyms(self):
return self._related('@i')
def _instance_hypernyms(self):
- return self._related('@i', sort=False)
+ return self._related('@i')
def hyponyms(self):
return self._related('~')
@@ -905,7 +905,7 @@ class Synset(_WordNetObject):
if len(subsumers) == 0:
return None
- subsumer = subsumers[0]
+ subsumer = self if self in subsumers else subsumers[0]
# Get the longest path from the LCS to the root,
# including a correction:
@@ -1244,7 +1244,13 @@ class WordNetCorpusReader(CorpusReader):
# cannot simply split on first '.',
# e.g.: '.45_caliber.a.01..45_caliber'
separator = SENSENUM_RE.search(name).start()
- synset_name, lemma_name = name[:separator+3], name[separator+4:]
+
+ leadingZero = int(name[separator+1]) == 0
+ if (leadingZero):
+ synset_name, lemma_name = name[:separator+3], name[separator+4:]
+ else:
+ synset_name, lemma_name = name[:separator+2], name[separator+3:]
+
synset = self.synset(synset_name)
for lemma in synset.lemmas(lang):
if lemma._name == lemma_name:
@@ -2056,98 +2062,3 @@ def teardown_module(module=None):
from nltk.corpus import wordnet
wordnet._unload()
-
-######################################################################
-# Demo
-######################################################################
-
-def demo():
- import nltk
- print('loading wordnet')
- wn = WordNetCorpusReader(nltk.data.find('corpora/wordnet'), None)
- print('done loading')
- S = wn.synset
- L = wn.lemma
-
- print('getting a synset for go')
- move_synset = S('go.v.21')
- print(move_synset.name(), move_synset.pos(), move_synset.lexname())
- print(move_synset.lemma_names())
- print(move_synset.definition())
- print(move_synset.examples())
-
- zap_n = ['zap.n.01']
- zap_v = ['zap.v.01', 'zap.v.02', 'nuke.v.01', 'microwave.v.01']
-
- def _get_synsets(synset_strings):
- return [S(synset) for synset in synset_strings]
-
- zap_n_synsets = _get_synsets(zap_n)
- zap_v_synsets = _get_synsets(zap_v)
-
- print(zap_n_synsets)
- print(zap_v_synsets)
-
- print("Navigations:")
- print(S('travel.v.01').hypernyms())
- print(S('travel.v.02').hypernyms())
- print(S('travel.v.03').hypernyms())
-
- print(L('zap.v.03.nuke').derivationally_related_forms())
- print(L('zap.v.03.atomize').derivationally_related_forms())
- print(L('zap.v.03.atomise').derivationally_related_forms())
- print(L('zap.v.03.zap').derivationally_related_forms())
-
- print(S('dog.n.01').member_holonyms())
- print(S('dog.n.01').part_meronyms())
-
- print(S('breakfast.n.1').hypernyms())
- print(S('meal.n.1').hyponyms())
- print(S('Austen.n.1').instance_hypernyms())
- print(S('composer.n.1').instance_hyponyms())
-
- print(S('faculty.n.2').member_meronyms())
- print(S('copilot.n.1').member_holonyms())
-
- print(S('table.n.2').part_meronyms())
- print(S('course.n.7').part_holonyms())
-
- print(S('water.n.1').substance_meronyms())
- print(S('gin.n.1').substance_holonyms())
-
- print(L('leader.n.1.leader').antonyms())
- print(L('increase.v.1.increase').antonyms())
-
- print(S('snore.v.1').entailments())
- print(S('heavy.a.1').similar_tos())
- print(S('light.a.1').attributes())
- print(S('heavy.a.1').attributes())
-
- print(L('English.a.1.English').pertainyms())
-
- print(S('person.n.01').root_hypernyms())
- print(S('sail.v.01').root_hypernyms())
- print(S('fall.v.12').root_hypernyms())
-
- print(S('person.n.01').lowest_common_hypernyms(S('dog.n.01')))
- print(S('woman.n.01').lowest_common_hypernyms(S('girlfriend.n.02')))
-
- print(S('dog.n.01').path_similarity(S('cat.n.01')))
- print(S('dog.n.01').lch_similarity(S('cat.n.01')))
- print(S('dog.n.01').wup_similarity(S('cat.n.01')))
-
- wnic = WordNetICCorpusReader(nltk.data.find('corpora/wordnet_ic'),
- '.*\.dat')
- ic = wnic.ic('ic-brown.dat')
- print(S('dog.n.01').jcn_similarity(S('cat.n.01'), ic))
-
- ic = wnic.ic('ic-semcor.dat')
- print(S('dog.n.01').lin_similarity(S('cat.n.01'), ic))
-
- print(S('code.n.03').topic_domains())
- print(S('pukka.a.01').region_domains())
- print(S('freaky.a.01').usage_domains())
-
-
-if __name__ == '__main__':
- demo()
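The relaxed SENSENUM_RE and the leading-zero branch above let lemma() accept sense numbers written with or without a leading zero; a short sketch assuming the wordnet corpus data is installed:

    from nltk.corpus import wordnet as wn

    # Both spellings of the sense number now resolve to the same lemma.
    print(wn.lemma('dog.n.01.dog'))  # Lemma('dog.n.01.dog')
    print(wn.lemma('dog.n.1.dog'))   # accepted as well after this change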
diff --git a/nltk/data.py b/nltk/data.py
index 4f4e375..3295bb8 100644
--- a/nltk/data.py
+++ b/nltk/data.py
@@ -35,16 +35,27 @@ from __future__ import division
from abc import ABCMeta, abstractmethod
from six import add_metaclass
-import sys
+import functools
+import textwrap
import io
import os
-import textwrap
import re
+import sys
import zipfile
import codecs
from gzip import GzipFile, READ as GZ_READ, WRITE as GZ_WRITE
+try: # Python 3.
+ textwrap_indent = functools.partial(textwrap.indent, prefix=' ')
+except AttributeError: # Python 2; indent() not available for Python2.
+ textwrap_fill = functools.partial(textwrap.fill,
+ initial_indent=' ',
+ subsequent_indent=' ',
+ replace_whitespace=False)
+ def textwrap_indent(text):
+ return '\n'.join(textwrap_fill(line) for line in text.splitlines())
+
try:
from zlib import Z_SYNC_FLUSH as FLUSH
except ImportError:
@@ -94,7 +105,9 @@ else:
str('/usr/share/nltk_data'),
str('/usr/local/share/nltk_data'),
str('/usr/lib/nltk_data'),
- str('/usr/local/lib/nltk_data')
+ str('/usr/local/lib/nltk_data'),
+ os.path.join(sys.prefix, str('nltk_data')),
+ os.path.join(sys.prefix, str('lib'), str('nltk_data'))
]
@@ -641,15 +654,22 @@ def find(resource_name, paths=None):
except LookupError:
pass
+ # Identify the package (i.e. the .zip file) to download.
+ resource_zipname = resource_name.split('/')[1]
+ if resource_zipname.endswith('.zip'):
+ resource_zipname = resource_zipname.rpartition('.')[0]
# Display a friendly error message if the resource wasn't found:
- msg = textwrap.fill(
- 'Resource %r not found. Please use the NLTK Downloader to '
- 'obtain the resource: >>> nltk.download()' %
- (resource_name,), initial_indent=' ', subsequent_indent=' ',
- width=66)
+ msg = str("Resource \33[93m{resource}\033[0m not found.\n"
+ "Please use the NLTK Downloader to obtain the resource:\n\n"
+ "\33[31m" # To display red text in terminal.
+ ">>> import nltk\n"
+ ">>> nltk.download(\'{resource}\')\n"
+ "\033[0m").format(resource=resource_zipname)
+ msg = textwrap_indent(msg)
+
msg += '\n Searched in:' + ''.join('\n - %r' % d for d in paths)
sep = '*' * 70
- resource_not_found = '\n%s\n%s\n%s' % (sep, msg, sep)
+ resource_not_found = '\n%s\n%s\n%s\n' % (sep, msg, sep)
raise LookupError(resource_not_found)
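The net effect of the find() changes above is a more actionable LookupError: the message names the zip package to pass to nltk.download() and lists the searched directories, which now include the sys.prefix locations. A small sketch using a placeholder resource name:

    import nltk

    try:
        # 'corpora/some_missing_package' is a placeholder; any absent resource works.
        nltk.data.find('corpora/some_missing_package')
    except LookupError as err:
        # Prints the colourised hint, e.g. ">>> nltk.download('some_missing_package')",
        # followed by the list of directories that were searched.
        print(err)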
diff --git a/nltk/downloader.py b/nltk/downloader.py
index 7beb2c4..452fade 100644
--- a/nltk/downloader.py
+++ b/nltk/downloader.py
@@ -2258,7 +2258,8 @@ if __name__ == '__main__':
parser.add_option("-e", "--exit-on-error", dest="halt_on_error", action="store_true",
default=False, help="exit if an error occurs")
parser.add_option("-u", "--url", dest="server_index_url",
- default=None, help="download server index url")
+ default=os.environ.get('NLTK_DOWNLOAD_URL'),
+ help="download server index url")
(options, args) = parser.parse_args()
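With the new option default, the downloader CLI falls back to the NLTK_DOWNLOAD_URL environment variable when no -u/--url flag is given; a sketch driving it from Python with a placeholder mirror URL:

    import os
    import subprocess
    import sys

    env = dict(os.environ,
               NLTK_DOWNLOAD_URL='http://mirror.example.org/nltk_data/index.xml')
    # Equivalent to: python -m nltk.downloader punkt, with the server index URL
    # taken from the environment instead of the built-in default.
    subprocess.check_call([sys.executable, '-m', 'nltk.downloader', 'punkt'], env=env)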
diff --git a/nltk/parse/corenlp.py b/nltk/parse/corenlp.py
index 49c428b..f6043ef 100644
--- a/nltk/parse/corenlp.py
+++ b/nltk/parse/corenlp.py
@@ -201,7 +201,7 @@ class GenericCoreNLPParser(ParserI, TokenizerI):
:type sentences: list(list(str))
:rtype: iter(iter(Tree))
"""
-
+ # Converting list(list(str)) -> list(str)
sentences = (' '.join(words) for words in sentences)
return self.raw_parse_sents(sentences, *args, **kwargs)
@@ -271,11 +271,13 @@ class GenericCoreNLPParser(ParserI, TokenizerI):
"""
default_properties = {
- 'ssplit.isOneSentence': 'true',
+ # Only splits on '\n', never inside the sentence.
+ 'ssplit.ssplit.eolonly': 'true',
}
default_properties.update(properties or {})
+ """
for sentence in sentences:
parsed_data = self.api_call(sentence, properties=default_properties)
@@ -284,6 +286,12 @@ class GenericCoreNLPParser(ParserI, TokenizerI):
for parse in parsed_data['sentences']:
tree = self.make_tree(parse)
yield iter([tree])
+ """
+ parsed_data = self.api_call('\n'.join(sentences), properties=default_properties)
+ for parsed_sent in parsed_data['sentences']:
+ tree = self.make_tree(parsed_sent)
+ yield iter([tree])
+
def parse_text(self, text, *args, **kwargs):
"""Parse a piece of text.
@@ -320,6 +328,7 @@ class GenericCoreNLPParser(ParserI, TokenizerI):
"""
default_properties = {
'annotators': 'tokenize,ssplit',
+
}
default_properties.update(properties or {})
@@ -328,7 +337,7 @@ class GenericCoreNLPParser(ParserI, TokenizerI):
for sentence in result['sentences']:
for token in sentence['tokens']:
- yield token['originalText']
+ yield token['originalText'] or token['word']
class CoreNLPParser(GenericCoreNLPParser):
@@ -611,6 +620,40 @@ class CoreNLPDependencyParser(GenericCoreNLPParser):
... )
10
+ >>> print(
+ ... next(
+ ... dep_parser.raw_parse('The underscore _ should not simply disappear.')
+ ... ).to_conll(4)
+ ... ) # doctest: +NORMALIZE_WHITESPACE
+ The DT 3 det
+ underscore VBP 3 amod
+ _ NN 7 nsubj
+ should MD 7 aux
+ not RB 7 neg
+ simply RB 7 advmod
+ disappear VB 0 ROOT
+ . . 7 punct
+
+ >>> print(
+ ... '\\n'.join(
+ ... next(
+ ... dep_parser.raw_parse(
+ ... 'for all of its insights into the dream world of teen life , and its electronic expression through '
+ ... 'cyber culture , the film gives no quarter to anyone seeking to pull a cohesive story out of its 2 '
+ ... '1/2-hour running time .'
+ ... )
+ ... ).to_conll(4).split('\\n')[-8:]
+ ... )
+ ... )
+ its PRP$ 40 nmod:poss
+ 2 1/2 CD 40 nummod
+ - : 40 punct
+ hour NN 31 nmod
+ running VBG 42 amod
+ time NN 40 dep
+ . . 24 punct
+ <BLANKLINE>
+
"""
_OUTPUT_FORMAT = 'conll2007'
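After the change above, parse_sents() joins the pre-tokenized sentences with newlines and makes a single API call, relying on newline-only sentence splitting on the server side. A usage sketch, assuming a CoreNLP server is already running on localhost:9000:

    from nltk.parse.corenlp import CoreNLPParser

    # e.g. started with:
    #   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
    parser = CoreNLPParser(url='http://localhost:9000')

    sentences = [['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.'],
                 ['I', 'saw', 'the', 'man', 'with', 'the', 'telescope', '.']]
    # One HTTP request for the whole batch; one tree iterator per input sentence.
    for tree_iter in parser.parse_sents(sentences):
        for tree in tree_iter:
            tree.pprint()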
diff --git a/nltk/parse/recursivedescent.py b/nltk/parse/recursivedescent.py
index d59eb3e..a84a12f 100644
--- a/nltk/parse/recursivedescent.py
+++ b/nltk/parse/recursivedescent.py
@@ -351,8 +351,7 @@ class SteppingRecursiveDescentParser(RecursiveDescentParser):
:see: ``nltk.grammar``
"""
def __init__(self, grammar, trace=0):
- self._grammar = grammar
- self._trace = trace
+ super(SteppingRecursiveDescentParser, self).__init__(grammar, trace)
self._rtext = None
self._tree = None
self._frontier = [()]
diff --git a/nltk/parse/shiftreduce.py b/nltk/parse/shiftreduce.py
index 4ade68a..7fc8289 100644
--- a/nltk/parse/shiftreduce.py
+++ b/nltk/parse/shiftreduce.py
@@ -290,8 +290,7 @@ class SteppingShiftReduceParser(ShiftReduceParser):
:see: ``nltk.grammar``
"""
def __init__(self, grammar, trace=0):
- self._grammar = grammar
- self._trace = trace
+ super(SteppingShiftReduceParser, self).__init__(grammar, trace)
self._stack = None
self._remaining_text = None
self._history = []
diff --git a/nltk/sem/logic.py b/nltk/sem/logic.py
index dd144d9..1053802 100644
--- a/nltk/sem/logic.py
+++ b/nltk/sem/logic.py
@@ -803,7 +803,7 @@ def read_type(type_string):
class TypeException(Exception):
def __init__(self, msg):
- Exception.__init__(self, msg)
+ super(TypeException, self).__init__(msg)
class InconsistentTypeHierarchyException(TypeException):
def __init__(self, variable, expression=None):
@@ -813,21 +813,20 @@ class InconsistentTypeHierarchyException(TypeException):
else:
msg = "The variable '%s' was found in multiple places with different"\
" types." % (variable)
- Exception.__init__(self, msg)
+ super(InconsistentTypeHierarchyException, self).__init__(msg)
class TypeResolutionException(TypeException):
def __init__(self, expression, other_type):
- Exception.__init__(self, "The type of '%s', '%s', cannot be "
- "resolved with type '%s'" % \
- (expression, expression.type, other_type))
+ super(TypeResolutionException, self).__init__(
+ "The type of '%s', '%s', cannot be resolved with type '%s'" %
+ (expression, expression.type, other_type))
class IllegalTypeException(TypeException):
def __init__(self, expression, other_type, allowed_type):
- Exception.__init__(self, "Cannot set type of %s '%s' to '%s'; "
- "must match type '%s'." %
- (expression.__class__.__name__, expression,
- other_type, allowed_type))
-
+ super(IllegalTypeException, self).__init__(
+ "Cannot set type of %s '%s' to '%s'; must match type '%s'." %
+ (expression.__class__.__name__, expression, other_type,
+ allowed_type))
def typecheck(expressions, signature=None):
"""
diff --git a/nltk/sentiment/util.py b/nltk/sentiment/util.py
index a26a2b5..b8e3fbe 100644
--- a/nltk/sentiment/util.py
+++ b/nltk/sentiment/util.py
@@ -12,7 +12,6 @@ Utility methods for Sentiment Analysis.
"""
from __future__ import division
-from copy import deepcopy
import codecs
import csv
import json
@@ -21,6 +20,8 @@ import random
import re
import sys
import time
+from copy import deepcopy
+from itertools import tee
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader
@@ -64,6 +65,7 @@ SAD = set([
':c', ':{', '>:\\', ';('
])
+
def timer(method):
"""
A timer decorator to measure execution performance of methods.
@@ -84,6 +86,13 @@ def timer(method):
return result
return timed
+
+def pairwise(iterable):
+ """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
+ a, b = tee(iterable)
+ next(b, None)
+ return zip(a, b)
+
#////////////////////////////////////////////////////////////
#{ Feature extractor functions
#////////////////////////////////////////////////////////////
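The new pairwise() helper simply walks a sequence in overlapping (previous, current) pairs, which vader's negated() uses below to spot "least" not preceded by "at":

    from nltk.sentiment.util import pairwise

    print(list(pairwise(['at', 'least', 'it', 'works'])))
    # [('at', 'least'), ('least', 'it'), ('it', 'works')]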
diff --git a/nltk/sentiment/vader.py b/nltk/sentiment/vader.py
index 72e0ed9..2d232ba 100644
--- a/nltk/sentiment/vader.py
+++ b/nltk/sentiment/vader.py
@@ -27,6 +27,7 @@ import re
import string
from itertools import product
import nltk.data
+from .util import pairwise
##Constants##
@@ -45,15 +46,14 @@ REGEX_REMOVE_PUNCTUATION = re.compile('[{0}]'.format(re.escape(string.punctuatio
PUNC_LIST = [".", "!", "?", ",", ";", ":", "-", "'", "\"",
"!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?"]
-NEGATE = \
-["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
+NEGATE = {"aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
"ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
"dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
"don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
"neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
"oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
"oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
- "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]
+ "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"}
# booster/dampener 'intensifiers' or 'degree adverbs'
# http://en.wiktionary.org/wiki/Category:English_degree_adverbs
@@ -88,18 +88,14 @@ def negated(input_words, include_nt=True):
"""
Determine if input contains negation words
"""
- neg_words = []
- neg_words.extend(NEGATE)
- for word in neg_words:
- if word in input_words:
- return True
+ neg_words = NEGATE
+ if any(word.lower() in neg_words for word in input_words):
+ return True
if include_nt:
- for word in input_words:
- if "n't" in word:
- return True
- if "least" in input_words:
- i = input_words.index("least")
- if i > 0 and input_words[i-1] != "at":
+ if any("n't" in word.lower() for word in input_words):
+ return True
+ for first, second in pairwise(input_words):
+ if second.lower() == "least" and first.lower() != 'at':
return True
return False
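A few illustrative calls to the rewritten negated() (the tokens are arbitrary examples):

    from nltk.sentiment.vader import negated

    print(negated(['this', 'is', 'not', 'good']))   # True: 'not' is in the NEGATE set
    print(negated(["doesn't", 'work']))             # True: matched via the "n't" check
    print(negated(['at', 'least', 'it', 'works']))  # False: 'least' is preceded by 'at'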
diff --git a/nltk/stem/arlstem.py b/nltk/stem/arlstem.py
new file mode 100644
index 0000000..81de360
--- /dev/null
+++ b/nltk/stem/arlstem.py
@@ -0,0 +1,355 @@
+# -*- coding: utf-8 -*-
+#
+# Natural Language Toolkit: ARLSTem Stemmer
+#
+# Copyright (C) 2001-2017 NLTK Project
+#
+# Author: Kheireddine Abainia (x-programer) <k.abainia at gmail.com>
+# Algorithms: Kheireddine Abainia <k.abainia at gmail.com>
+# Siham Ouamour
+# Halim Sayoud
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+
+"""
+ARLSTem Arabic Stemmer
+The details about the implementation of this algorithm are described in:
+K. Abainia, S. Ouamour and H. Sayoud, A Novel Robust Arabic Light Stemmer ,
+Journal of Experimental & Theoretical Artificial Intelligence (JETAI'17),
+Vol. 29, No. 3, 2017, pp. 557-573.
+The ARLSTem is a light Arabic stemmer that is based on removing the affixes
+from the word (i.e. prefixes, suffixes and infixes). It was evaluated and
+compared to several other stemmers using Paice's parameters (under-stemming
+index, over-stemming index and stemming weight), and the results showed that
+ARLSTem is promising and producing high performances. This stemmer is not
+based on any dictionary and can be used on-line effectively.
+"""
+from __future__ import unicode_literals
+import re
+
+from nltk.stem.api import StemmerI
+
+
+class ARLSTem(StemmerI):
+ '''
+ ARLSTem stemmer : a light Arabic Stemming algorithm without any dictionary.
+ Department of Telecommunication & Information Processing. USTHB University,
+ Algiers, Algeria.
+ ARLSTem.stem(token) returns the Arabic stem for the input token.
+ The ARLSTem Stemmer requires that all tokens are encoded using Unicode
+ encoding.
+ '''
+
+ def __init__(self):
+ # different Alif with hamza
+ self.re_hamzated_alif = re.compile(r'[\u0622\u0623\u0625]')
+ self.re_alifMaqsura = re.compile(r'[\u0649]')
+ self.re_diacritics = re.compile(r'[\u064B-\u065F]')
+
+ # Alif Laam, Laam Laam, Fa Laam, Fa Ba
+ self.pr2 = [
+ '\u0627\u0644', '\u0644\u0644',
+ '\u0641\u0644', '\u0641\u0628'
+ ]
+ # Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam
+ self.pr3 = [
+ '\u0628\u0627\u0644',
+ '\u0643\u0627\u0644',
+ '\u0648\u0627\u0644'
+ ]
+ # Fa Laam Laam, Waaw Laam Laam
+ self.pr32 = ['\u0641\u0644\u0644', '\u0648\u0644\u0644']
+ # Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam
+ self.pr4 = [
+ '\u0641\u0628\u0627\u0644',
+ '\u0648\u0628\u0627\u0644',
+ '\u0641\u0643\u0627\u0644'
+ ]
+
+ # Kaf Yaa, Kaf Miim
+ self.su2 = [
+ '\u0643\u064A',
+ '\u0643\u0645'
+ ]
+ # Ha Alif, Ha Miim
+ self.su22 = ['\u0647\u0627', '\u0647\u0645']
+ # Kaf Miim Alif, Kaf Noon Shadda
+ self.su3 = ['\u0643\u0645\u0627', '\u0643\u0646\u0651']
+ # Ha Miim Alif, Ha Noon Shadda
+ self.su32 = ['\u0647\u0645\u0627', '\u0647\u0646\u0651']
+
+ # Alif Noon, Ya Noon, Waaw Noon
+ self.pl_si2 = ['\u0627\u0646', '\u064A\u0646', '\u0648\u0646']
+ # Taa Alif Noon, Taa Ya Noon
+ self.pl_si3 = ['\u062A\u0627\u0646', '\u062A\u064A\u0646']
+
+ # Alif Noon, Waaw Noon
+ self.verb_su2 = ['\u0627\u0646', '\u0648\u0646']
+ # Siin Taa, Siin Yaa
+ self.verb_pr2 = ['\u0633\u062A', '\u0633\u064A']
+ # Siin Alif, Siin Noon
+ self.verb_pr22 = ['\u0633\u0627', '\u0633\u0646']
+
+ # Taa Miim Alif, Taa Noon Shadda
+ self.verb_suf3 = ['\u062A\u0645\u0627', '\u062A\u0646\u0651']
+ # Noon Alif, Taa Miim, Taa Alif, Waaw Alif
+ self.verb_suf2 = [
+ '\u0646\u0627', '\u062A\u0645',
+ '\u062A\u0627', '\u0648\u0627'
+ ]
+ # Taa, Alif, Noon
+ self.verb_suf1 = ['\u062A', '\u0627', '\u0646']
+
+ def stem(self, token):
+ """
+ call this function to get the word's stem based on ARLSTem .
+ """
+ try:
+ if token is None:
+ raise ValueError("The word could not be stemmed, because \
+ it is empty !")
+ # remove Arabic diacritics and replace some letters with others
+ token = self.norm(token)
+ # strip common prefixes of the nouns
+ pre = self.pref(token)
+ if pre is not None:
+ token = pre
+ # strip the suffixes which are common to nouns and verbs
+ token = self.suff(token)
+ # transform a plural noun to a singular noun
+ ps = self.plur2sing(token)
+ if ps is None:
+ # transform from the feminine form to the masculine form
+ fm = self.fem2masc(token)
+ if fm is not None:
+ return fm
+ else:
+ if pre is None: # if the prefixes are not stripped
+ # strip the verb prefixes and suffixes
+ return self.verb(token)
+ else:
+ return ps
+ return token
+ except ValueError as e:
+ print(e)
+
+ def norm(self, token):
+ """
+ normalize the word by removing diacritics, replacing hamzated Alif
+ with Alif replacing AlifMaqsura with Yaa and removing Waaw at the
+ beginning.
+ """
+ # strip Arabic diacritics
+ token = self.re_diacritics.sub('', token)
+ # replace Hamzated Alif with Alif bare
+ token = self.re_hamzated_alif.sub('\u0627', token)
+ # replace alifMaqsura with Yaa
+ token = self.re_alifMaqsura.sub('\u064A', token)
+ # strip the Waaw from the word beginning if the remaining is 3 letters
+ # at least
+ if token.startswith('\u0648') and len(token) > 3:
+ token = token[1:]
+ return token
+
+ def pref(self, token):
+ """
+ remove prefixes from the words' beginning.
+ """
+ if len(token) > 5:
+ for p3 in self.pr3:
+ if token.startswith(p3):
+ return token[3:]
+ if len(token) > 6:
+ for p4 in self.pr4:
+ if token.startswith(p4):
+ return token[4:]
+ if len(token) > 5:
+ for p3 in self.pr32:
+ if token.startswith(p3):
+ return token[3:]
+ if len(token) > 4:
+ for p2 in self.pr2:
+ if token.startswith(p2):
+ return token[2:]
+
+ def suff(self, token):
+ """
+ remove suffixes from the word's end.
+ """
+ if token.endswith('\u0643') and len(token) > 3:
+ return token[:-1]
+ if len(token) > 4:
+ for s2 in self.su2:
+ if token.endswith(s2):
+ return token[:-2]
+ if len(token) > 5:
+ for s3 in self.su3:
+ if token.endswith(s3):
+ return token[:-3]
+ if token.endswith('\u0647') and len(token) > 3:
+ token = token[:-1]
+ return token
+ if len(token) > 4:
+ for s2 in self.su22:
+ if token.endswith(s2):
+ return token[:-2]
+ if len(token) > 5:
+ for s3 in self.su32:
+ if token.endswith(s3):
+ return token[:-3]
+ if token.endswith('\u0646\u0627') and len(token) > 4:
+ return token[:-2]
+ return token
+
+ def fem2masc(self, token):
+ """
+ transform the word from the feminine form to the masculine form.
+ """
+ if token.endswith('\u0629') and len(token) > 3:
+ return token[:-1]
+
+ def plur2sing(self, token):
+ """
+ transform the word from the plural form to the singular form.
+ """
+ if len(token) > 4:
+ for ps2 in self.pl_si2:
+ if token.endswith(ps2):
+ return token[:-2]
+ if len(token) > 5:
+ for ps3 in self.pl_si3:
+ if token.endswith(ps3):
+ return token[:-3]
+ if len(token) > 3 and token.endswith('\u0627\u062A'):
+ return token[:-2]
+ if (len(token) > 3 and token.startswith('\u0627')
+ and token[2] == '\u0627'):
+ return token[:2] + token[3:]
+ if (len(token) > 4 and token.startswith('\u0627')
+ and token[-2] == '\u0627'):
+ return token[1:-2] + token[-1]
+
+ def verb(self, token):
+ """
+ strip the verb prefixes and/or suffixes.
+ """
+ vb = self.verb_t1(token)
+ if vb is not None:
+ return vb
+ vb = self.verb_t2(token)
+ if vb is not None:
+ return vb
+ vb = self.verb_t3(token)
+ if vb is not None:
+ return vb
+ vb = self.verb_t4(token)
+ if vb is not None:
+ return vb
+ return self.verb_t5(token)
+
+ def verb_t1(self, token):
+ """
+ stem the present prefixes and suffixes
+ """
+ if len(token) > 5 and token.startswith('\u062A'): # Taa
+ for s2 in self.pl_si2:
+ if token.endswith(s2):
+ return token[1:-2]
+ if len(token) > 5 and token.startswith('\u064A'): # Yaa
+ for s2 in self.verb_su2:
+ if token.endswith(s2):
+ return token[1:-2]
+ if len(token) > 4 and token.startswith('\u0627'): # Alif
+ # Waaw Alif
+ if len(token) > 5 and token.endswith('\u0648\u0627'):
+ return token[1:-2]
+ # Yaa
+ if token.endswith('\u064A'):
+ return token[1:-1]
+ # Alif
+ if token.endswith('\u0627'):
+ return token[1:-1]
+ # Noon
+ if token.endswith('\u0646'):
+ return token[1:-1]
+ # ^Yaa, Noon$
+ if (len(token) > 4
+ and token.startswith('\u064A')
+ and token.endswith('\u0646')):
+ return token[1:-1]
+ # ^Taa, Noon$
+ if (len(token) > 4
+ and token.startswith('\u062A')
+ and token.endswith('\u0646')):
+ return token[1:-1]
+
+ def verb_t2(self, token):
+ """
+ stem the future prefixes and suffixes
+ """
+ if len(token) > 6:
+ for s2 in self.pl_si2:
+ # ^Siin Taa
+ if (token.startswith(self.verb_pr2[0])
+ and token.endswith(s2)):
+ return token[2:-2]
+ # ^Siin Yaa, Alif Noon$
+ if (token.startswith(self.verb_pr2[1])
+ and token.endswith(self.pl_si2[0])):
+ return token[2:-2]
+ # ^Siin Yaa, Waaw Noon$
+ if (token.startswith(self.verb_pr2[1])
+ and token.endswith(self.pl_si2[2])):
+ return token[2:-2]
+ # ^Siin Taa, Noon$
+ if (len(token) > 5
+ and token.startswith(self.verb_pr2[0])
+ and token.endswith('\u0646')):
+ return token[2:-1]
+ # ^Siin Yaa, Noon$
+ if (len(token) > 5
+ and token.startswith(self.verb_pr2[1])
+ and token.endswith('\u0646')):
+ return token[2:-1]
+
+ def verb_t3(self, token):
+ """
+ stem the present suffixes
+ """
+ if len(token) > 5:
+ for su3 in self.verb_suf3:
+ if(token.endswith(su3)):
+ return token[:-3]
+ if len(token) > 4:
+ for su2 in self.verb_suf2:
+ if token.endswith(su2):
+ return token[:-2]
+ if len(token) > 3:
+ for su1 in self.verb_suf1:
+ if token.endswith(su1):
+ return token[:-1]
+
+ def verb_t4(self, token):
+ """
+ stem the present prefixes
+ """
+ if len(token) > 3:
+ for pr1 in self.verb_suf1:
+ if token.startswith(pr1):
+ return token[1:]
+ if token.startswith('\u064A'):
+ return token[1:]
+
+ def verb_t5(self, token):
+ """
+ stem the future prefixes
+ """
+ if len(token) > 4:
+ for pr2 in self.verb_pr22:
+ if token.startswith(pr2):
+ return token[2:]
+ for pr2 in self.verb_pr2:
+ if token.startswith(pr2):
+ return token[2:]
+ return token
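
For context, the stemmer added above is driven entirely by the
norm/pref/suff/plur2sing/fem2masc/verb pipeline shown in stem(). A minimal
usage sketch, assuming the class is exposed as nltk.stem.arlstem.ARLSTem in
this release; given the prefix and plural rules above, the example word is
expected to reduce to its singular stem:

    >>> from nltk.stem.arlstem import ARLSTem
    >>> stemmer = ARLSTem()
    >>> # the definite prefix is stripped and the plural suffix removed
    >>> stemmer.stem('الطالبات')
    'طالب'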
diff --git a/nltk/stem/snowball.py b/nltk/stem/snowball.py
index 3ed2dbb..00b511c 100644
--- a/nltk/stem/snowball.py
+++ b/nltk/stem/snowball.py
@@ -5,7 +5,12 @@
# Copyright (C) 2001-2017 NLTK Project
# Author: Peter Michael Stahl <pemistahl at gmail.com>
# Peter Ljunglof <peter.ljunglof at heatherleaf.se> (revisions)
+# Lakhdar Benzahia <lakhdar.benzahia at gmail.com> (co-writer)
+# Assem Chelli <assem.ch at gmail.com> (reviewer arabicstemmer)
+# Abdelkrim Aries <ab_aries at esi.dz> (reviewer arabicstemmer)
# Algorithms: Dr Martin Porter <martin at tartarus.org>
+# Assem Chelli <assem.ch at gmail.com> arabic stemming algorithm
+# Benzahia Lakhdar <lakhdar.benzahia at gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
@@ -21,11 +26,12 @@ There is also a demo function: `snowball.demo()`.
from __future__ import unicode_literals, print_function
from six.moves import input
+import re
from nltk import compat
from nltk.corpus import stopwords
from nltk.stem import porter
-from nltk.stem.util import suffix_replace
+from nltk.stem.util import suffix_replace, prefix_replace
from nltk.stem.api import StemmerI
@@ -36,7 +42,7 @@ class SnowballStemmer(StemmerI):
Snowball Stemmer
The following languages are supported:
- Danish, Dutch, English, Finnish, French, German,
+ Arabic, Danish, Dutch, English, Finnish, French, German,
Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian,
Spanish and Swedish.
@@ -55,7 +61,7 @@ class SnowballStemmer(StemmerI):
>>> from nltk.stem import SnowballStemmer
>>> print(" ".join(SnowballStemmer.languages)) # See which languages are supported
- danish dutch english finnish french german hungarian
+ arabic danish dutch english finnish french german hungarian
italian norwegian porter portuguese romanian russian
spanish swedish
>>> stemmer = SnowballStemmer("german") # Choose a language
@@ -81,7 +87,7 @@ class SnowballStemmer(StemmerI):
language, a ValueError is raised.
"""
- languages = ("danish", "dutch", "english", "finnish", "french", "german",
+ languages = ("arabic", "danish", "dutch", "english", "finnish", "french", "german",
"hungarian", "italian", "norwegian", "porter", "portuguese",
"romanian", "russian", "spanish", "swedish")
@@ -288,6 +294,520 @@ class _StandardStemmer(_LanguageSpecificStemmer):
return rv
+class ArabicStemmer(_LanguageSpecificStemmer):
+ """
+ https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm)
+ The Snowball Arabic light Stemmer
+ Algorithm : Assem Chelli
+ Abdelkrim Aries
+ Lakhdar Benzahia
+ Nltk Version Author : Lakhdar Benzahia
+ """
+ # Normalize_pre steps
+ __vocalization = re.compile(r'[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]') # ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ
+
+ __kasheeda = re.compile(r'[\u0640]') # ـ tatweel/kasheeda
+
+ __arabic_punctuation_marks = re.compile(r'[\u060C-\u061B-\u061F]') # ؛ ، ؟
+
+ # Normalize_post
+ __last_hamzat = ('\u0623', '\u0625', '\u0622', '\u0624', '\u0626') # أ، إ، آ، ؤ، ئ
+
+ # normalize other hamza's
+ __initial_hamzat = re.compile(r'^[\u0622\u0623\u0625]') # أ، إ، آ
+
+ __waw_hamza = re.compile(r'[\u0624]') # ؤ
+
+ __yeh_hamza = re.compile(r'[\u0626]') # ئ
+
+ __alefat = re.compile(r'[\u0623\u0622\u0625]') # أ، إ، آ
+
+ # Checks
+ __checks1 = ('\u0643\u0627\u0644', '\u0628\u0627\u0644', # بال، كال
+ '\u0627\u0644', '\u0644\u0644' # لل، ال
+ )
+
+ __checks2 = ('\u0629', # ة
+ '\u0627\u062a' # female plural ات
+ )
+
+ # Suffixes
+ __suffix_noun_step1a = ('\u064a', '\u0643', '\u0647', # ي، ك، ه
+ '\u0646\u0627', '\u0643\u0645', '\u0647\u0627', '\u0647\u0646', '\u0647\u0645', # نا، كم، ها، هن، هم
+ '\u0643\u0645\u0627', '\u0647\u0645\u0627' # كما، هما
+ )
+
+ __suffix_noun_step1b = ('\u0646',) # ن
+
+ __suffix_noun_step2a = ('\u0627', '\u064a', '\u0648') # ا، ي، و
+
+ __suffix_noun_step2b = ('\u0627\u062a',) # ات
+
+ __suffix_noun_step2c1 = ('\u062a',) # ت
+
+ __suffix_noun_step2c2 = ('\u0629',) # ة
+
+ __suffix_noun_step3 = ('\u064a',) # ي
+
+ __suffix_verb_step1 = ('\u0647', '\u0643', # ه، ك
+ '\u0646\u064a', '\u0646\u0627', '\u0647\u0627', '\u0647\u0645', # ني، نا، ها، هم
+ '\u0647\u0646', '\u0643\u0645', '\u0643\u0646', # هن، كم، كن
+ '\u0647\u0645\u0627', '\u0643\u0645\u0627', '\u0643\u0645\u0648' # هما، كما، كمو
+ )
+
+ __suffix_verb_step2a = ( '\u062a', '\u0627', '\u0646' , '\u064a', # ت، ا، ن، ي
+ '\u0646\u0627', '\u062a\u0627', '\u062a\u0646', # نا، تا، تن Past
+ '\u0627\u0646', '\u0648\u0646', '\u064a\u0646', # ان، هن، ين Present
+ '\u062a\u0645\u0627' # تما
+ )
+
+ __suffix_verb_step2b = ('\u0648\u0627','\u062a\u0645') # وا، تم
+
+ __suffix_verb_step2c = ('\u0648', # و
+ '\u062a\u0645\u0648' # تمو
+ )
+
+ __suffix_all_alef_maqsura = ('\u0649',) # ى
+
+ # Prefixes
+ __prefix_step1 = ('\u0623', # أ
+ '\u0623\u0623', '\u0623\u0622', '\u0623\u0624', '\u0623\u0627', '\u0623\u0625', # أأ، أآ، أؤ، أا، أإ
+ )
+
+ __prefix_step2a = ('\u0641\u0627\u0644', '\u0648\u0627\u0644') # فال، وال
+
+ __prefix_step2b = ('\u0641', '\u0648') # ف، و
+
+ __prefix_step3a_noun = ('\u0627\u0644', '\u0644\u0644', # لل، ال
+ '\u0643\u0627\u0644', '\u0628\u0627\u0644', # بال، كال
+ )
+
+ __prefix_step3b_noun = ('\u0628', '\u0643', '\u0644', # ب، ك، ل
+ '\u0628\u0628', '\u0643\u0643' # بب، كك
+ )
+
+ __prefix_step3_verb = ('\u0633\u064a', '\u0633\u062a', '\u0633\u0646', '\u0633\u0623') # سي، ست، سن، سأ
+
+ __prefix_step4_verb = ('\u064a\u0633\u062a', '\u0646\u0633\u062a', '\u062a\u0633\u062a') # يست، نست، تست
+
+ # Suffixes added due to Conjugation Verbs
+ __conjugation_suffix_verb_1 = ('\u0647', '\u0643') # ه، ك
+
+ __conjugation_suffix_verb_2 = ('\u0646\u064a', '\u0646\u0627','\u0647\u0627', # ني، نا، ها
+ '\u0647\u0645', '\u0647\u0646', '\u0643\u0645', # هم، هن، كم
+ '\u0643\u0646' # كن
+ )
+ __conjugation_suffix_verb_3 = ('\u0647\u0645\u0627', '\u0643\u0645\u0627', '\u0643\u0645\u0648') # هما، كما، كمو
+
+ __conjugation_suffix_verb_4 = ('\u0627', '\u0646', '\u064a') # ا، ن، ي
+
+ __conjugation_suffix_verb_past = ('\u0646\u0627', '\u062a\u0627', '\u062a\u0646') # نا، تا، تن
+
+ __conjugation_suffix_verb_present = ('\u0627\u0646', '\u0648\u0646', '\u064a\u0646') # ان، ون، ين
+
+ # Suffixes added due to derivation Names
+ __conjugation_suffix_noun_1 = ('\u064a', '\u0643', '\u0647') # ي، ك، ه
+
+ __conjugation_suffix_noun_2 = ('\u0646\u0627', '\u0643\u0645', # نا، كم
+ '\u0647\u0627', '\u0647\u0646', '\u0647\u0645' # ها، هن، هم
+ )
+
+ __conjugation_suffix_noun_3 = ('\u0643\u0645\u0627', '\u0647\u0645\u0627') # كما، هما
+
+ # Prefixes added due to derivation Names
+ __prefixes1 = ('\u0648\u0627', '\u0641\u0627') # فا، وا
+
+ __articles_3len = ('\u0643\u0627\u0644', '\u0628\u0627\u0644') # بال كال
+
+ __articles_2len = ('\u0627\u0644', '\u0644\u0644') # ال لل
+
+ # Prepositions letters
+ __prepositions1 = ('\u0643', '\u0644') # ك، ل
+ __prepositions2 = ('\u0628\u0628', '\u0643\u0643') # بب، كك
+
+ is_verb = True
+ is_noun = True
+ is_defined = False
+
+ suffixes_verb_step1_success = False
+ suffix_verb_step2a_success = False
+ suffix_verb_step2b_success = False
+ suffix_noun_step2c2_success = False
+ suffix_noun_step1a_success = False
+ suffix_noun_step2a_success = False
+ suffix_noun_step2b_success = False
+ suffixe_noun_step1b_success = False
+ prefix_step2a_success = False
+ prefix_step3a_noun_success = False
+ prefix_step3b_noun_success = False
+
+ def __normalize_pre(self, token):
+ """
+ :param token: string
+ :return: normalized token type string
+ """
+ # strip diacritics
+ token = self.__vocalization.sub('', token)
+ #strip kasheeda
+ token = self.__kasheeda.sub('', token)
+ # strip punctuation marks
+ token = self.__arabic_punctuation_marks.sub('', token)
+ return token
+
+ def __normalize_post(self, token):
+ # normalize last hamza
+ for hamza in self.__last_hamzat:
+ if token.endswith(hamza):
+ token = suffix_replace(token, hamza, '\u0621')
+ break
+ # normalize other hamzat
+ token = self.__initial_hamzat.sub('\u0627', token)
+ token = self.__waw_hamza.sub('\u0648', token)
+ token = self.__yeh_hamza.sub('\u064a', token)
+ token = self.__alefat.sub('\u0627', token)
+ return token
+
+ def __checks_1(self, token):
+ for prefix in self.__checks1 :
+ if token.startswith(prefix):
+ if prefix in self.__articles_3len and len(token) > 4 :
+ self.is_noun = True
+ self.is_verb = False
+ self.is_defined = True
+ break
+
+ if prefix in self.__articles_2len and len(token) > 3 :
+ self.is_noun = True
+ self.is_verb = False
+ self.is_defined = True
+ break
+
+ def __checks_2(self, token):
+ for suffix in self.__checks2:
+ if token.endswith(suffix):
+ if suffix == '\u0629' and len(token) > 2:
+ self.is_noun = True
+ self.is_verb = False
+ break
+
+ if suffix == '\u0627\u062a' and len(token) > 3:
+ self.is_noun = True
+ self.is_verb = False
+ break
+
+ def __Suffix_Verb_Step1(self, token):
+ for suffix in self.__suffix_verb_step1:
+ if token.endswith(suffix):
+ if suffix in self.__conjugation_suffix_verb_1 and len(token) >= 4:
+ token = token[:-1]
+ self.suffixes_verb_step1_success = True
+ break
+
+ if suffix in self.__conjugation_suffix_verb_2 and len(token) >= 5:
+ token = token[:-2]
+ self.suffixes_verb_step1_success = True
+ break
+
+ if suffix in self.__conjugation_suffix_verb_3 and len(token) >= 6:
+ token = token[:-3]
+ self.suffixes_verb_step1_success = True
+ break
+ return token
+
+ def __Suffix_Verb_Step2a(self, token):
+ for suffix in self.__suffix_verb_step2a:
+ if token.endswith(suffix):
+ if suffix == '\u062a' and len(token) >= 4:
+ token = token[:-1]
+ self.suffix_verb_step2a_success = True
+ break
+
+ if suffix in self.__conjugation_suffix_verb_4 and len(token) >= 4:
+ token = token[:-1]
+ self.suffix_verb_step2a_success = True
+ break
+
+ if suffix in self.__conjugation_suffix_verb_past and len(token) >= 5:
+ token = token[:-2] # past
+ self.suffix_verb_step2a_success = True
+ break
+
+ if suffix in self.__conjugation_suffix_verb_present and len(token) > 5:
+ token = token[:-2] # present
+ self.suffix_verb_step2a_success = True
+ break
+
+ if suffix == '\u062a\u0645\u0627' and len(token) >= 6:
+ token = token[:-3]
+ self.suffix_verb_step2a_success = True
+ break
+ return token
+
+ def __Suffix_Verb_Step2c(self, token):
+ for suffix in self.__suffix_verb_step2c:
+ if token.endswith(suffix):
+ if suffix == '\u062a\u0645\u0648' and len(token) >= 6:
+ token = token[:-3]
+ break
+
+ if suffix == '\u0648' and len(token) >= 4:
+ token = token[:-1]
+ break
+ return token
+
+ def __Suffix_Verb_Step2b(self, token):
+ for suffix in self.__suffix_verb_step2b:
+ if token.endswith(suffix) and len(token) >= 5:
+ token = token[:-2]
+ self.suffix_verb_step2b_success = True
+ break
+ return token
+
+ def __Suffix_Noun_Step2c2(self, token):
+ for suffix in self.__suffix_noun_step2c2:
+ if token.endswith(suffix) and len(token) >= 3:
+ token = token[:-1]
+ self.suffix_noun_step2c2_success = True
+ break
+ return token
+
+ def __Suffix_Noun_Step1a(self, token):
+ for suffix in self.__suffix_noun_step1a:
+ if token.endswith(suffix):
+ if suffix in self.__conjugation_suffix_noun_1 and len(token) >= 4:
+ token = token[:-1]
+ self.suffix_noun_step1a_success = True
+ break
+
+ if suffix in self.__conjugation_suffix_noun_2 and len(token) >= 5:
+ token = token[:-2]
+ self.suffix_noun_step1a_success = True
+ break
+
+ if suffix in self.__conjugation_suffix_noun_3 and len(token) >= 6:
+ token = token[:-3]
+ self.suffix_noun_step1a_success = True
+ break
+ return token
+
+ def __Suffix_Noun_Step2a(self, token):
+ for suffix in self.__suffix_noun_step2a:
+ if token.endswith(suffix) and len(token) > 4:
+ token = token[:-1]
+ self.suffix_noun_step2a_success = True
+ break
+ return token
+
+ def __Suffix_Noun_Step2b(self, token):
+ for suffix in self.__suffix_noun_step2b:
+ if token.endswith(suffix) and len(token) >= 5:
+ token = token[:-2]
+ self.suffix_noun_step2b_success = True
+ break
+ return token
+
+ def __Suffix_Noun_Step2c1(self, token):
+ for suffix in self.__suffix_noun_step2c1:
+ if token.endswith(suffix) and len(token) >= 4:
+ token = token[:-1]
+ break
+ return token
+
+ def __Suffix_Noun_Step1b(self, token):
+ for suffix in self.__suffix_noun_step1b:
+ if token.endswith(suffix) and len(token) > 5:
+ token = token[:-1]
+ self.suffixe_noun_step1b_success = True
+ break
+ return token
+
+ def __Suffix_Noun_Step3(self, token):
+ for suffix in self.__suffix_noun_step3:
+ if token.endswith(suffix) and len(token) >= 3:
+ token = token[:-1] # ya' nisbiya
+ break
+ return token
+
+ def __Suffix_All_alef_maqsura(self, token):
+ for suffix in self.__suffix_all_alef_maqsura:
+ if token.endswith(suffix):
+ token = suffix_replace(token, suffix, '\u064a')
+ return token
+
+ def __Prefix_Step1(self, token):
+ for prefix in self.__prefix_step1:
+ if token.startswith(prefix) and len(token) > 3:
+ if prefix == '\u0623\u0623':
+ token = prefix_replace(token, prefix, '\u0623')
+ break
+
+ elif prefix == '\u0623\u0622':
+ token = prefix_replace(token, prefix, '\u0622')
+ break
+
+ elif prefix == '\u0623\u0624':
+ token = prefix_replace(token, prefix, '\u0624')
+ break
+
+ elif prefix == '\u0623\u0627' :
+ token = prefix_replace(token, prefix, '\u0627')
+ break
+
+ elif prefix == '\u0623\u0625' :
+ token = prefix_replace(token, prefix, '\u0625')
+ break
+ return token
+
+ def __Prefix_Step2a(self, token):
+ for prefix in self.__prefix_step2a:
+ if token.startswith(prefix) and len(token) > 5:
+ token = token[len(prefix):]
+ self.prefix_step2a_success = True
+ break
+ return token
+
+ def __Prefix_Step2b(self, token):
+ for prefix in self.__prefix_step2b:
+ if token.startswith(prefix) and len(token) > 3 :
+ if token[:2] not in self.__prefixes1:
+ token = token[len(prefix):]
+ break
+ return token
+
+ def __Prefix_Step3a_Noun(self, token):
+ for prefix in self.__prefix_step3a_noun:
+ if token.startswith(prefix):
+ if prefix in self.__articles_2len and len(token) > 4:
+ token = token[len(prefix):]
+ self.prefix_step3a_noun_success = True
+ break
+ if prefix in self.__articles_3len and len(token) > 5:
+ token = token[len(prefix):]
+ break
+ return token
+
+ def __Prefix_Step3b_Noun(self, token):
+ for prefix in self.__prefix_step3b_noun:
+ if token.startswith(prefix):
+ if len(token) > 3:
+ if prefix == '\u0628':
+ token = token[len(prefix):]
+ self.prefix_step3b_noun_success = True
+ break
+
+ if prefix in self.__prepositions2:
+ token = prefix_replace(token, prefix, prefix[1])
+ self.prefix_step3b_noun_success = True
+ break
+
+ if prefix in self.__prepositions1 and len(token) > 4:
+ token = token[len(prefix):] # BUG: cause confusion
+ self.prefix_step3b_noun_success = True
+ break
+ return token
+
+ def __Prefix_Step3_Verb(self, token):
+ for prefix in self.__prefix_step3_verb:
+ if token.startswith(prefix) and len(token) > 4:
+ token = prefix_replace(token, prefix, prefix[1])
+ break
+ return token
+
+ def __Prefix_Step4_Verb(self, token):
+ for prefix in self.__prefix_step4_verb:
+ if token.startswith(prefix) and len(token) > 4:
+ token = prefix_replace(token, prefix, '\u0627\u0633\u062a')
+ self.is_verb = True
+ self.is_noun = False
+ break
+ return token
+
+ def stem(self, word):
+ """
+ Stem an Arabic word and return the stemmed form.
+ :param word: string
+ :return: string
+ """
+ # set initial values
+ self.is_verb = True
+ self.is_noun = True
+ self.is_defined = False
+
+ self.suffixes_verb_step1_success = False
+ self.suffix_verb_step2a_success = False
+ self.suffix_verb_step2b_success = False
+ self.suffix_noun_step2c2_success = False
+ self.suffix_noun_step1a_success = False
+ self.suffix_noun_step2a_success = False
+ self.suffix_noun_step2b_success = False
+ self.suffixe_noun_step1b_success = False
+ self.prefix_step2a_success = False
+ self.prefix_step3a_noun_success = False
+ self.prefix_step3b_noun_success = False
+
+ modified_word = word
+ # guess type and properties
+ # checks1
+ self.__checks_1(modified_word)
+ # checks2
+ self.__checks_2(modified_word)
+ modified_word = self.__normalize_pre(modified_word)
+ if self.is_verb:
+ modified_word = self.__Suffix_Verb_Step1(modified_word)
+ if self.suffixes_verb_step1_success:
+ modified_word = self.__Suffix_Verb_Step2a(modified_word)
+ if not self.suffix_verb_step2a_success :
+ modified_word = self.__Suffix_Verb_Step2c(modified_word)
+ #or next
+ else:
+ modified_word = self.__Suffix_Verb_Step2b(modified_word)
+ if not self.suffix_verb_step2b_success:
+ modified_word = self.__Suffix_Verb_Step2a(modified_word)
+ if self.is_noun:
+ modified_word = self.__Suffix_Noun_Step2c2(modified_word)
+ if not self.suffix_noun_step2c2_success:
+ if not self.is_defined:
+ modified_word = self.__Suffix_Noun_Step1a(modified_word)
+ #if self.suffix_noun_step1a_success:
+ modified_word = self.__Suffix_Noun_Step2a(modified_word)
+ if not self.suffix_noun_step2a_success:
+ modified_word = self.__Suffix_Noun_Step2b(modified_word)
+ if not self.suffix_noun_step2b_success and not self.suffix_noun_step2a_success:
+ modified_word = self.__Suffix_Noun_Step2c1(modified_word)
+ # or next ? todo : how to deal with or next
+ else:
+ modified_word = self.__Suffix_Noun_Step1b(modified_word)
+ if self.suffixe_noun_step1b_success:
+ modified_word = self.__Suffix_Noun_Step2a(modified_word)
+ if not self.suffix_noun_step2a_success:
+ modified_word = self.__Suffix_Noun_Step2b(modified_word)
+ if not self.suffix_noun_step2b_success and not self.suffix_noun_step2a_success:
+ modified_word = self.__Suffix_Noun_Step2c1(modified_word)
+ else:
+ if not self.is_defined:
+ modified_word = self.__Suffix_Noun_Step2a(modified_word)
+ modified_word = self.__Suffix_Noun_Step2b(modified_word)
+ modified_word = self.__Suffix_Noun_Step3(modified_word)
+ if not self.is_noun and self.is_verb:
+ modified_word = self.__Suffix_All_alef_maqsura(modified_word)
+
+ # prefixes
+ modified_word = self.__Prefix_Step1(modified_word)
+ modified_word = self.__Prefix_Step2a(modified_word)
+ if not self.prefix_step2a_success:
+ modified_word = self.__Prefix_Step2b(modified_word)
+ modified_word = self.__Prefix_Step3a_Noun(modified_word)
+ if not self.prefix_step3a_noun_success and self.is_noun:
+ modified_word = self.__Prefix_Step3b_Noun(modified_word)
+ else:
+ if not self.prefix_step3b_noun_success and self.is_verb:
+ modified_word = self.__Prefix_Step3_Verb(modified_word)
+ modified_word = self.__Prefix_Step4_Verb(modified_word)
+
+ # post normalization stemming
+ modified_word = self.__normalize_post(modified_word)
+ stemmed_word = modified_word
+ return stemmed_word
+
class DanishStemmer(_ScandinavianStemmer):
"""
@@ -3658,7 +4178,8 @@ def demo():
import re
from nltk.corpus import udhr
- udhr_corpus = {"danish": "Danish_Dansk-Latin1",
+ udhr_corpus = {"arabic": "Arabic_Alarabia-Arabic",
+ "danish": "Danish_Dansk-Latin1",
"dutch": "Dutch_Nederlands-Latin1",
"english": "English-Latin1",
"finnish": "Finnish_Suomi-Latin1",
diff --git a/nltk/stem/util.py b/nltk/stem/util.py
index c3d9b90..2ba8547 100644
--- a/nltk/stem/util.py
+++ b/nltk/stem/util.py
@@ -10,3 +10,13 @@ def suffix_replace(original, old, new):
Replaces the old suffix of the original string by a new suffix
"""
return original[:-len(old)] + new
+
+def prefix_replace(original, old, new):
+ """
+ Replaces the old prefix of the original string with a new prefix
+ :param original: string
+ :param old: string
+ :param new: string
+ :return: string
+ """
+ return new + original[len(old):]
\ No newline at end of file
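
Both helpers are plain string splices with no check that the affix is actually
present. An illustrative sketch with hypothetical English inputs, chosen only
to show the mechanics:

    >>> from nltk.stem.util import suffix_replace, prefix_replace
    >>> suffix_replace('walked', 'ed', 'ing')    # original[:-len(old)] + new
    'walking'
    >>> prefix_replace('untie', 'un', 're')      # new + original[len(old):]
    'retie'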
diff --git a/nltk/tag/__init__.py b/nltk/tag/__init__.py
index 0de452a..34c8798 100644
--- a/nltk/tag/__init__.py
+++ b/nltk/tag/__init__.py
@@ -19,13 +19,20 @@ the word ``'fly'`` with a noun part of speech tag (``'NN'``):
>>> tagged_tok = ('fly', 'NN')
-An off-the-shelf tagger is available. It uses the Penn Treebank tagset:
+An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset:
>>> from nltk import pos_tag, word_tokenize
>>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
[('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
+A Russian tagger is also available if you specify lang="rus". It uses
+the Russian National Corpus tagset:
+
+ >>> pos_tag(word_tokenize("Илья оторопел и дважды перечитал бумажку."), lang='rus') # doctest: +SKIP
+ [('Илья', 'S'), ('оторопел', 'V'), ('и', 'CONJ'), ('дважды', 'ADV'), ('перечитал', 'V'),
+ ('бумажку', 'S'), ('.', 'NONLEX')]
+
This package defines several taggers, which take a list of tokens,
assign a tag to each one, and return the resulting list of tagged tokens.
Most of the taggers are built automatically based on a training corpus.
diff --git a/nltk/tag/perceptron.py b/nltk/tag/perceptron.py
index b194ad0..4cedd8d 100644
--- a/nltk/tag/perceptron.py
+++ b/nltk/tag/perceptron.py
@@ -28,7 +28,7 @@ class AveragedPerceptron(object):
'''An averaged perceptron, as implemented by Matthew Honnibal.
See more implementation details here:
- http://spacy.io/blog/part-of-speech-POS-tagger-in-python/
+ https://explosion.ai/blog/part-of-speech-pos-tagger-in-python
'''
def __init__(self):
@@ -101,7 +101,7 @@ class PerceptronTagger(TaggerI):
'''
Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
See more implementation details here:
- http://spacy.io/blog/part-of-speech-POS-tagger-in-python/
+ https://explosion.ai/blog/part-of-speech-pos-tagger-in-python
>>> from nltk.tag.perceptron import PerceptronTagger
diff --git a/nltk/tag/stanford.py b/nltk/tag/stanford.py
index d055e5d..26ac640 100644
--- a/nltk/tag/stanford.py
+++ b/nltk/tag/stanford.py
@@ -27,6 +27,7 @@ from six import text_type
from nltk.internals import find_file, find_jar, config_java, java, _java_options
from nltk.tag.api import TaggerI
+from nltk.parse.corenlp import CoreNLPParser
_stanford_url = 'https://nlp.stanford.edu/software'
@@ -47,7 +48,14 @@ class StanfordTagger(TaggerI):
def __init__(self, model_filename, path_to_jar=None, encoding='utf8',
verbose=False, java_options='-mx1000m'):
-
+ # Raise deprecation warning.
+ warnings.simplefilter('always', DeprecationWarning)
+ warnings.warn(str("\nThe StanfordTagger will "
+ "be deprecated in version 3.2.5.\n"
+ "Please use \033[91mnltk.tag.stanford.CoreNLPPOSTagger\033[0m "
+ "or \033[91mnltk.tag.stanford.CoreNLPNERTagger\033[0m instead."),
+ DeprecationWarning, stacklevel=2)
+ warnings.simplefilter('ignore', DeprecationWarning)
if not self._JAR:
warnings.warn('The StanfordTagger class is not meant to be '
'instantiated directly. Did you mean '
@@ -204,6 +212,67 @@ class StanfordNERTagger(StanfordTagger):
raise NotImplementedError
+class CoreNLPTagger(CoreNLPParser, TaggerI):
+ def __init__(self, tagtype, url='http://localhost:9000', encoding='utf8'):
+ """
+ An abstract interface to POS/NER taggers of CoreNLP that returns the
+ POS/NER tags from the Stanford CoreNLP API at nltk.parse.corenlp.
+ """
+ self.tagtype = tagtype
+ super(CoreNLPTagger, self).__init__(url, encoding)
+
+ def tag_sents(self, sentences):
+ # Converting list(list(str)) -> list(str)
+ sentences = (' '.join(words) for words in sentences)
+ return list(self.raw_tag_sents(sentences))
+
+
+ def tag(self, sentence):
+ return self.tag_sents([sentence])[0]
+
+ def raw_tag_sents(self, sentences):
+ """
+ This function interfaces with `GenericCoreNLPParser.api_call` to
+ retrieve the JSON output and return the required annotations.
+ """
+ default_properties = {'ssplit.isOneSentence': 'true',
+ 'annotators': 'tokenize,ssplit,' }
+ # Supports only 'pos' or 'ner' tags.
+ assert self.tagtype in ['pos', 'ner']
+ default_properties['annotators'] += self.tagtype
+ for sentence in sentences:
+ tagged_data = self.api_call(sentence, properties=default_properties)
+ assert len(tagged_data['sentences']) == 1
+ # Taggers only need to return 1-best sentence.
+ yield [(token['word'], token[self.tagtype]) for token in tagged_data['sentences'][0]['tokens']]
+
+
+class CoreNLPPOSTagger(CoreNLPTagger):
+ """
+ This is a subclass of the CoreNLPTagger that wraps around the
+ nltk.parse.CoreNLPParser for Part-of-Speech tagging.
+
+ >>> from nltk.tag.stanford import CoreNLPPOSTagger
+ >>> CoreNLPPOSTagger(url='http://localhost:9000').tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
+ [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
+ """
+ def __init__(self, url='http://localhost:9000', encoding='utf8'):
+ super(CoreNLPPOSTagger, self).__init__('pos', url, encoding)
+
+
+class CoreNLPNERTagger(CoreNLPTagger):
+ """
+ This is a subclass of the CoreNLPTagger that wraps around the
+ nltk.parse.CoreNLPParser for Named-Entity tagging.
+
+ >>> from nltk.tag.stanford import CoreNLPNERTagger
+ >>> CoreNLPNERTagger(url='http://localhost:9000').tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
+ [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'O')]
+ """
+ def __init__(self, url='http://localhost:9000', encoding='utf8'):
+ super(CoreNLPNERTagger, self).__init__('ner', url, encoding)
+
+
def setup_module(module):
from nose import SkipTest
@@ -212,3 +281,10 @@ def setup_module(module):
except LookupError:
raise SkipTest('Doctests from nltk.tag.stanford are skipped because one \
of the stanford jars cannot be found.')
+
+ try:
+ CoreNLPPOSTagger()
+ CoreNLPNERTagger()
+ except LookupError:
+ raise SkipTest('Doctests from nltk.tag.stanford.CoreNLPTagger '
+ 'are skipped because the Stanford CoreNLP server was not started')
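
Beyond the doctests above, raw_tag_sents() can be called directly when the
sentences are already whitespace-joined strings. An illustrative sketch,
assuming a CoreNLP server is running at localhost:9000; the tags shown simply
mirror the CoreNLPPOSTagger doctest above:

    >>> from nltk.tag.stanford import CoreNLPPOSTagger
    >>> tagger = CoreNLPPOSTagger(url='http://localhost:9000')  # doctest: +SKIP
    >>> list(tagger.raw_tag_sents(['What is the airspeed of an unladen swallow ?']))  # doctest: +SKIP
    [[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]]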
diff --git a/nltk/test/corpus.doctest b/nltk/test/corpus.doctest
index b658e53..c21fedd 100644
--- a/nltk/test/corpus.doctest
+++ b/nltk/test/corpus.doctest
@@ -977,6 +977,12 @@ a given lemma belongs to:
>>> verbnet.classids('accept')
['approve-77', 'characterize-29.2-1-1', 'obtain-13.5.2']
+The `classids()` method may additionally be used to retrieve all classes
+within VerbNet when no argument is passed:
+
+ >>> verbnet.classids()
+ ['accompany-51.7', 'admire-31.2', 'admire-31.2-1', 'admit-65', 'adopt-93', 'advise-37.9', 'advise-37.9-1', 'allow-64', 'amalgamate-22.2', 'amalgamate-22.2-1', 'amalgamate-22.2-1-1', 'amalgamate-22.2-2', 'amalgamate-22.2-2-1', 'amalgamate-22.2-3', 'amalgamate-22.2-3-1', 'amalgamate-22.2-3-1-1', 'amalgamate-22.2-3-2', 'amuse-31.1', 'animal_sounds-38', 'appeal-31.4', 'appeal-31.4-1', 'appeal-31.4-2', 'appeal-31.4-3', 'appear-48.1.1', 'appoint-29.1', 'approve-77', 'assessment-34', 'assum [...]
+
The primary object in the lexicon is a class record, which is stored
as an ElementTree xml object. The class record for a given class
identifier is returned by the `vnclass()` method:
@@ -1017,18 +1023,42 @@ concise form. The simplest such method is `pprint()`:
* Theme[+concrete +force]
Frames:
Intransitive (Expletive Subject)
+ Example: It's raining.
Syntax: LEX[it] LEX[[+be]] VERB
Semantics:
* weather(during(E), Weather_type, ?Theme)
NP (Expletive Subject, Theme Object)
+ Example: It's raining cats and dogs.
Syntax: LEX[it] LEX[[+be]] VERB NP[Theme]
Semantics:
* weather(during(E), Weather_type, Theme)
PP (Expletive Subject, Theme-PP)
+ Example: It was pelting with rain.
Syntax: LEX[it[+be]] VERB PREP[with] NP[Theme]
Semantics:
* weather(during(E), Weather_type, Theme)
+VerbNet provides frames that link syntax and semantics through an example.
+These frames are part of the corpus, and we can use `frames()` to get the
+frames for a given VerbNet class.
+
+ >>> frame = verbnet.frames('57')
+ >>> frame == [{'semantics': [{'arguments': [{'value': 'during(E)', 'type': 'Event'}, {'value': 'Weather_type', 'type': 'VerbSpecific'}, {'value': '?Theme', 'type': 'ThemRole'}], 'predicate_value': 'weather'}], 'example': "It's raining.", 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'LEX', 'modifiers': {'value': '[+be]', 'synrestrs': [], 'selrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'synrestrs': [], 'sel [...]
+ True
+
+The VerbNet corpus lets us access thematic roles individually using `themroles()`.
+
+ >>> themroles = verbnet.themroles('57')
+ >>> themroles == [{'modifiers': [{'type': 'concrete', 'value': '+'}, {'type': 'force', 'value': '+'}], 'type': 'Theme'}]
+ True
+
+VerbNet classes may also have subclasses that share similar syntactic and semantic
+properties while differing from the superclass. The VerbNet corpus allows us to access
+these subclasses using `subclasses()`.
+
+ >>> print(verbnet.subclasses('9.1')) #Testing for 9.1 since '57' does not have subclasses
+ ['put-9.1-1', 'put-9.1-2']
+
nps_chat
--------
diff --git a/nltk/test/stem.doctest b/nltk/test/stem.doctest
index d8427af..eff4d2c 100644
--- a/nltk/test/stem.doctest
+++ b/nltk/test/stem.doctest
@@ -46,7 +46,7 @@ Unit tests for Snowball stemmer
See which languages are supported.
>>> print(" ".join(SnowballStemmer.languages))
- danish dutch english finnish french german hungarian italian
+ arabic danish dutch english finnish french german hungarian italian
norwegian porter portuguese romanian russian spanish swedish
Create a new instance of a language specific subclass.
diff --git a/nltk/test/tokenize.doctest b/nltk/test/tokenize.doctest
index 9d0d668..07ab178 100644
--- a/nltk/test/tokenize.doctest
+++ b/nltk/test/tokenize.doctest
@@ -193,7 +193,14 @@ The sentence splitter should remove whitespace following the sentence boundary.
['See Section 3.', ') Or Section 2.', ')']
-Regression Tests: aling_tokens
+Two instances of PunktSentenceTokenizer should not share PunktParameters.
+
+ >>> pst = PunktSentenceTokenizer()
+ >>> pst2 = PunktSentenceTokenizer()
+ >>> pst._params is pst2._params
+ False
+
+Regression Tests: align_tokens
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Post-hoc alignment of tokens with a source string
diff --git a/nltk/test/unit/test_corenlp.py b/nltk/test/unit/test_corenlp.py
new file mode 100644
index 0000000..feb84cd
--- /dev/null
+++ b/nltk/test/unit/test_corenlp.py
@@ -0,0 +1,412 @@
+# -*- coding: utf-8 -*-
+
+"""
+Mock test for Stanford CoreNLP wrappers.
+"""
+
+import sys
+from itertools import chain
+from unittest import TestCase, SkipTest
+
+try:
+ from unittest.mock import patch # Tries to import mock in Python3.
+except ImportError:
+ raise SkipTest('unittest.mock is not supported in Python 2')
+
+from nltk.tag.stanford import CoreNLPPOSTagger, CoreNLPNERTagger
+from nltk.tokenize.stanford import CoreNLPTokenizer
+
+
+class TestTokenizerAPI(TestCase):
+ @patch('nltk.tokenize.stanford.CoreNLPTokenizer')
+ def test_tokenize(self, MockTokenizer):
+ corenlp_tokenizer = MockTokenizer()
+ input_string = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
+ corenlp_tokenizer.api_call.return_value = {
+ u'sentences': [ { u'index': 0,
+ u'tokens': [ { u'after': u' ',
+ u'before': u'',
+ u'characterOffsetBegin': 0,
+ u'characterOffsetEnd': 4,
+ u'index': 1,
+ u'originalText': u'Good',
+ u'word': u'Good'},
+ { u'after': u' ',
+ u'before': u' ',
+ u'characterOffsetBegin': 5,
+ u'characterOffsetEnd': 12,
+ u'index': 2,
+ u'originalText': u'muffins',
+ u'word': u'muffins'},
+ { u'after': u' ',
+ u'before': u' ',
+ u'characterOffsetBegin': 13,
+ u'characterOffsetEnd': 17,
+ u'index': 3,
+ u'originalText': u'cost',
+ u'word': u'cost'},
+ { u'after': u'',
+ u'before': u' ',
+ u'characterOffsetBegin': 18,
+ u'characterOffsetEnd': 19,
+ u'index': 4,
+ u'originalText': u'$',
+ u'word': u'$'},
+ { u'after': u'\n',
+ u'before': u'',
+ u'characterOffsetBegin': 19,
+ u'characterOffsetEnd': 23,
+ u'index': 5,
+ u'originalText': u'3.88',
+ u'word': u'3.88'},
+ { u'after': u' ',
+ u'before': u'\n',
+ u'characterOffsetBegin': 24,
+ u'characterOffsetEnd': 26,
+ u'index': 6,
+ u'originalText': u'in',
+ u'word': u'in'},
+ { u'after': u' ',
+ u'before': u' ',
+ u'characterOffsetBegin': 27,
+ u'characterOffsetEnd': 30,
+ u'index': 7,
+ u'originalText': u'New',
+ u'word': u'New'},
+ { u'after': u'',
+ u'before': u' ',
+ u'characterOffsetBegin': 31,
+ u'characterOffsetEnd': 35,
+ u'index': 8,
+ u'originalText': u'York',
+ u'word': u'York'},
+ { u'after': u' ',
+ u'before': u'',
+ u'characterOffsetBegin': 35,
+ u'characterOffsetEnd': 36,
+ u'index': 9,
+ u'originalText': u'.',
+ u'word': u'.'}]},
+ { u'index': 1,
+ u'tokens': [ { u'after': u' ',
+ u'before': u' ',
+ u'characterOffsetBegin': 38,
+ u'characterOffsetEnd': 44,
+ u'index': 1,
+ u'originalText': u'Please',
+ u'word': u'Please'},
+ { u'after': u' ',
+ u'before': u' ',
+ u'characterOffsetBegin': 45,
+ u'characterOffsetEnd': 48,
+ u'index': 2,
+ u'originalText': u'buy',
+ u'word': u'buy'},
+ { u'after': u'\n',
+ u'before': u' ',
+ u'characterOffsetBegin': 49,
+ u'characterOffsetEnd': 51,
+ u'index': 3,
+ u'originalText': u'me',
+ u'word': u'me'},
+ { u'after': u' ',
+ u'before': u'\n',
+ u'characterOffsetBegin': 52,
+ u'characterOffsetEnd': 55,
+ u'index': 4,
+ u'originalText': u'two',
+ u'word': u'two'},
+ { u'after': u' ',
+ u'before': u' ',
+ u'characterOffsetBegin': 56,
+ u'characterOffsetEnd': 58,
+ u'index': 5,
+ u'originalText': u'of',
+ u'word': u'of'},
+ { u'after': u'',
+ u'before': u' ',
+ u'characterOffsetBegin': 59,
+ u'characterOffsetEnd': 63,
+ u'index': 6,
+ u'originalText': u'them',
+ u'word': u'them'},
+ { u'after': u'\n',
+ u'before': u'',
+ u'characterOffsetBegin': 63,
+ u'characterOffsetEnd': 64,
+ u'index': 7,
+ u'originalText': u'.',
+ u'word': u'.'}]},
+ { u'index': 2,
+ u'tokens': [ { u'after': u'',
+ u'before': u'\n',
+ u'characterOffsetBegin': 65,
+ u'characterOffsetEnd': 71,
+ u'index': 1,
+ u'originalText': u'Thanks',
+ u'word': u'Thanks'},
+ { u'after': u'',
+ u'before': u'',
+ u'characterOffsetBegin': 71,
+ u'characterOffsetEnd': 72,
+ u'index': 2,
+ u'originalText': u'.',
+ u'word': u'.'}]}]
+ }
+ # Should return the mocked json.
+ api_call_output = corenlp_tokenizer.api_call(input_string)
+ self.assertIsInstance(api_call_output, dict)
+ # Emulates the tokenization process.
+ # Note: We're not calling the corenlp_tokenizer.tokenize() directly because
+ # it will not return the desired value but a MagicMock object.
+ # >>> corenlp_tokenizer.tokenize(input_string)
+ # >>> <MagicMock name='CoreNLPTokenizer().tokenize()' id='140308440963224'>
+ print (corenlp_tokenizer.tokenize(input_string))
+ tokenized_output = [token['originalText'] or token['word']
+ for sentence in api_call_output['sentences']
+ for token in sentence['tokens']]
+ expected_output = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in',
+ u'New', u'York', u'.', u'Please', u'buy', u'me',
+ u'two', u'of', u'them', u'.', u'Thanks', u'.']
+ self.assertEqual(expected_output, tokenized_output)
+
+
+class TestTaggerAPI(TestCase):
+ @patch('nltk.tag.stanford.CoreNLPTagger')
+ def test_blog_posts(self, MockTagger):
+ corenlp_tagger = MockTagger()
+ input_tokens = 'What is the airspeed of an unladen swallow ?'.split()
+ corenlp_tagger.api_call.return_value = {
+ u'sentences': [ { u'basicDependencies': [ { u'dep': u'ROOT',
+ u'dependent': 1,
+ u'dependentGloss': u'What',
+ u'governor': 0,
+ u'governorGloss': u'ROOT'},
+ { u'dep': u'cop',
+ u'dependent': 2,
+ u'dependentGloss': u'is',
+ u'governor': 1,
+ u'governorGloss': u'What'},
+ { u'dep': u'det',
+ u'dependent': 3,
+ u'dependentGloss': u'the',
+ u'governor': 4,
+ u'governorGloss': u'airspeed'},
+ { u'dep': u'nsubj',
+ u'dependent': 4,
+ u'dependentGloss': u'airspeed',
+ u'governor': 1,
+ u'governorGloss': u'What'},
+ { u'dep': u'case',
+ u'dependent': 5,
+ u'dependentGloss': u'of',
+ u'governor': 8,
+ u'governorGloss': u'swallow'},
+ { u'dep': u'det',
+ u'dependent': 6,
+ u'dependentGloss': u'an',
+ u'governor': 8,
+ u'governorGloss': u'swallow'},
+ { u'dep': u'compound',
+ u'dependent': 7,
+ u'dependentGloss': u'unladen',
+ u'governor': 8,
+ u'governorGloss': u'swallow'},
+ { u'dep': u'nmod',
+ u'dependent': 8,
+ u'dependentGloss': u'swallow',
+ u'governor': 4,
+ u'governorGloss': u'airspeed'},
+ { u'dep': u'punct',
+ u'dependent': 9,
+ u'dependentGloss': u'?',
+ u'governor': 1,
+ u'governorGloss': u'What'}],
+ u'enhancedDependencies': [ { u'dep': u'ROOT',
+ u'dependent': 1,
+ u'dependentGloss': u'What',
+ u'governor': 0,
+ u'governorGloss': u'ROOT'},
+ { u'dep': u'cop',
+ u'dependent': 2,
+ u'dependentGloss': u'is',
+ u'governor': 1,
+ u'governorGloss': u'What'},
+ { u'dep': u'det',
+ u'dependent': 3,
+ u'dependentGloss': u'the',
+ u'governor': 4,
+ u'governorGloss': u'airspeed'},
+ { u'dep': u'nsubj',
+ u'dependent': 4,
+ u'dependentGloss': u'airspeed',
+ u'governor': 1,
+ u'governorGloss': u'What'},
+ { u'dep': u'case',
+ u'dependent': 5,
+ u'dependentGloss': u'of',
+ u'governor': 8,
+ u'governorGloss': u'swallow'},
+ { u'dep': u'det',
+ u'dependent': 6,
+ u'dependentGloss': u'an',
+ u'governor': 8,
+ u'governorGloss': u'swallow'},
+ { u'dep': u'compound',
+ u'dependent': 7,
+ u'dependentGloss': u'unladen',
+ u'governor': 8,
+ u'governorGloss': u'swallow'},
+ { u'dep': u'nmod:of',
+ u'dependent': 8,
+ u'dependentGloss': u'swallow',
+ u'governor': 4,
+ u'governorGloss': u'airspeed'},
+ { u'dep': u'punct',
+ u'dependent': 9,
+ u'dependentGloss': u'?',
+ u'governor': 1,
+ u'governorGloss': u'What'}],
+ u'enhancedPlusPlusDependencies': [ { u'dep': u'ROOT',
+ u'dependent': 1,
+ u'dependentGloss': u'What',
+ u'governor': 0,
+ u'governorGloss': u'ROOT'},
+ { u'dep': u'cop',
+ u'dependent': 2,
+ u'dependentGloss': u'is',
+ u'governor': 1,
+ u'governorGloss': u'What'},
+ { u'dep': u'det',
+ u'dependent': 3,
+ u'dependentGloss': u'the',
+ u'governor': 4,
+ u'governorGloss': u'airspeed'},
+ { u'dep': u'nsubj',
+ u'dependent': 4,
+ u'dependentGloss': u'airspeed',
+ u'governor': 1,
+ u'governorGloss': u'What'},
+ { u'dep': u'case',
+ u'dependent': 5,
+ u'dependentGloss': u'of',
+ u'governor': 8,
+ u'governorGloss': u'swallow'},
+ { u'dep': u'det',
+ u'dependent': 6,
+ u'dependentGloss': u'an',
+ u'governor': 8,
+ u'governorGloss': u'swallow'},
+ { u'dep': u'compound',
+ u'dependent': 7,
+ u'dependentGloss': u'unladen',
+ u'governor': 8,
+ u'governorGloss': u'swallow'},
+ { u'dep': u'nmod:of',
+ u'dependent': 8,
+ u'dependentGloss': u'swallow',
+ u'governor': 4,
+ u'governorGloss': u'airspeed'},
+ { u'dep': u'punct',
+ u'dependent': 9,
+ u'dependentGloss': u'?',
+ u'governor': 1,
+ u'governorGloss': u'What'}],
+ u'index': 0,
+ u'parse': u'(ROOT\n (SBARQ\n (WHNP (WP What))\n (SQ (VBZ is)\n (NP\n (NP (DT the) (NN airspeed))\n (PP (IN of)\n (NP (DT an) (NN unladen) (NN swallow)))))\n (. ?)))',
+ u'tokens': [ { u'after': u' ',
+ u'before': u'',
+ u'characterOffsetBegin': 0,
+ u'characterOffsetEnd': 4,
+ u'index': 1,
+ u'lemma': u'what',
+ u'originalText': u'What',
+ u'pos': u'WP',
+ u'word': u'What'},
+ { u'after': u' ',
+ u'before': u' ',
+ u'characterOffsetBegin': 5,
+ u'characterOffsetEnd': 7,
+ u'index': 2,
+ u'lemma': u'be',
+ u'originalText': u'is',
+ u'pos': u'VBZ',
+ u'word': u'is'},
+ { u'after': u' ',
+ u'before': u' ',
+ u'characterOffsetBegin': 8,
+ u'characterOffsetEnd': 11,
+ u'index': 3,
+ u'lemma': u'the',
+ u'originalText': u'the',
+ u'pos': u'DT',
+ u'word': u'the'},
+ { u'after': u' ',
+ u'before': u' ',
+ u'characterOffsetBegin': 12,
+ u'characterOffsetEnd': 20,
+ u'index': 4,
+ u'lemma': u'airspeed',
+ u'originalText': u'airspeed',
+ u'pos': u'NN',
+ u'word': u'airspeed'},
+ { u'after': u' ',
+ u'before': u' ',
+ u'characterOffsetBegin': 21,
+ u'characterOffsetEnd': 23,
+ u'index': 5,
+ u'lemma': u'of',
+ u'originalText': u'of',
+ u'pos': u'IN',
+ u'word': u'of'},
+ { u'after': u' ',
+ u'before': u' ',
+ u'characterOffsetBegin': 24,
+ u'characterOffsetEnd': 26,
+ u'index': 6,
+ u'lemma': u'a',
+ u'originalText': u'an',
+ u'pos': u'DT',
+ u'word': u'an'},
+ { u'after': u' ',
+ u'before': u' ',
+ u'characterOffsetBegin': 27,
+ u'characterOffsetEnd': 34,
+ u'index': 7,
+ u'lemma': u'unladen',
+ u'originalText': u'unladen',
+ u'pos': u'JJ',
+ u'word': u'unladen'},
+ { u'after': u' ',
+ u'before': u' ',
+ u'characterOffsetBegin': 35,
+ u'characterOffsetEnd': 42,
+ u'index': 8,
+ u'lemma': u'swallow',
+ u'originalText': u'swallow',
+ u'pos': u'VB',
+ u'word': u'swallow'},
+ { u'after': u'',
+ u'before': u' ',
+ u'characterOffsetBegin': 43,
+ u'characterOffsetEnd': 44,
+ u'index': 9,
+ u'lemma': u'?',
+ u'originalText': u'?',
+ u'pos': u'.',
+ u'word': u'?'}]}]
+ }
+ expected_output = [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
+ ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
+ ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
+ tagged_data = corenlp_tagger.api_call(input_tokens,
+ properties={'ssplit.isOneSentence': 'true',
+ 'annotators': 'tokenize,ssplit,pos' })
+ # Emulates the tagging function.
+ # Note: We're not calling the corenlp_tagger.tag() directly because
+ # it will not return the desired value but a MagicMock object.
+ # >>> corenlp_tagger.tag(input_tokens)
+ # >>> <MagicMock name='CoreNLPTagger().tag()' id='140395802719848'>
+ tagged_output = [(token['word'], token['pos'])
+ for token in tagged_data['sentences'][0]['tokens']]
+ self.assertEqual(expected_output, tagged_output)
diff --git a/nltk/test/unit/test_stem.py b/nltk/test/unit/test_stem.py
index 6287f42..5f359d4 100644
--- a/nltk/test/unit/test_stem.py
+++ b/nltk/test/unit/test_stem.py
@@ -10,6 +10,20 @@ import os
class SnowballTest(unittest.TestCase):
+ def test_arabic(self):
+ """
+ This unit test exercises the Snowball Arabic light stemmer.
+ The stemmer deals with prefixes and suffixes.
+ """
+ ar_stemmer = SnowballStemmer("arabic")
+ assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
+ assert ar_stemmer.stem("العربية") == "عرب"
+ assert ar_stemmer.stem("فقالوا") == "قال"
+ assert ar_stemmer.stem("الطالبات") == "طالب"
+ assert ar_stemmer.stem("فالطالبات") == "طالب"
+ assert ar_stemmer.stem("والطالبات") == "طالب"
+ assert ar_stemmer.stem("الطالبون") == "طالب"
+
def test_russian(self):
# Russian words both consisting of Cyrillic
# and Roman letters can be stemmed.
diff --git a/nltk/test/unit/test_tokenize.py b/nltk/test/unit/test_tokenize.py
index 45fba66..a46ec82 100644
--- a/nltk/test/unit/test_tokenize.py
+++ b/nltk/test/unit/test_tokenize.py
@@ -5,7 +5,7 @@ See also nltk/test/tokenize.doctest
"""
from __future__ import unicode_literals
-from nltk.tokenize import TweetTokenizer, StanfordSegmenter
+from nltk.tokenize import TweetTokenizer, StanfordSegmenter, TreebankWordTokenizer
from nose import SkipTest
import unittest
import os
@@ -107,3 +107,43 @@ class TestTokenize(unittest.TestCase):
expected = ['u', '@abcde', '@abcdefghijklmnopqrst', '@abcde', '_', '@abcde', '5', '@abcde']
result = tokenizer.tokenize(test7)
self.assertEqual(result, expected)
+
+ def test_treebank_span_tokenizer(self):
+ """
+ Test TreebankWordTokenizer.span_tokenize function
+ """
+
+ tokenizer = TreebankWordTokenizer()
+
+ # Test case in the docstring
+ test1 = "Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks)."
+ expected = [
+ (0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
+ (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
+ (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
+ (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)
+ ]
+ result = tokenizer.span_tokenize(test1)
+ self.assertEqual(result, expected)
+
+ # Test case with double quotation
+ test2 = "The DUP is similar to the \"religious right\" in the United States and takes a hardline stance on social issues"
+ expected = [
+ (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
+ (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64),
+ (65, 68), (69, 74), (75, 76), (77, 85), (86, 92), (93, 95), (96, 102),
+ (103, 109)
+ ]
+ result = tokenizer.span_tokenize(test2)
+ self.assertEqual(result, expected)
+
+ # Test case with double quotation marks as well as converted quotations
+ test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
+ expected = [
+ (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
+ (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64),
+ (65, 68), (69, 74), (75, 76), (77, 79), (79, 87), (87, 89), (90, 96),
+ (97, 99), (100, 106), (107, 113)
+ ]
+ result = tokenizer.span_tokenize(test3)
+ self.assertEqual(result, expected)
diff --git a/nltk/test/unit/test_wordnet.py b/nltk/test/unit/test_wordnet.py
new file mode 100644
index 0000000..c45ee3f
--- /dev/null
+++ b/nltk/test/unit/test_wordnet.py
@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+"""
+Unit tests for nltk.corpus.wordnet
+See also nltk/test/wordnet.doctest
+"""
+
+from __future__ import unicode_literals
+from nose import SkipTest
+import unittest
+import os
+
+from nltk.corpus.reader.wordnet import WordNetCorpusReader
+from nltk.corpus import wordnet as wn
+from nltk.corpus import wordnet_ic as wnic
+from nltk.data import find as find_data
+
+
+wn.ensure_loaded()
+S = wn.synset
+L = wn.lemma
+
+class WordNetDemo(unittest.TestCase):
+
+ def test_retrieve_synset(self):
+ move_synset = S('go.v.21')
+ self.assertEqual(move_synset.name(), "move.v.15")
+ self.assertEqual(move_synset.lemma_names(), ['move', 'go'])
+ self.assertEqual(move_synset.definition(), "have a turn; make one's move in a game")
+ self.assertEqual(move_synset.examples(), ['Can I go now?'])
+
+
+ def test_retrieve_synsets(self):
+ self.assertEqual(sorted(wn.synsets('zap', pos='n')),
+ [S('zap.n.01')])
+ self.assertEqual(sorted(wn.synsets('zap', pos='v')),
+ [S('microwave.v.01'), S('nuke.v.01'), S('zap.v.01'), S('zap.v.02')])
+
+ def test_hyperhyponyms(self):
+ # Not every synset has hypernyms()
+ self.assertEqual(S('travel.v.01').hypernyms(), [])
+ self.assertEqual(S('travel.v.02').hypernyms(),
+ [S('travel.v.03')])
+ self.assertEqual(S('travel.v.03').hypernyms(), [])
+
+ # Test hyper-/hyponyms.
+ self.assertEqual(S('breakfast.n.1').hypernyms(), [S('meal.n.01')])
+ first_five_meal_hypo = [S('banquet.n.02'), S('bite.n.04'), S('breakfast.n.01'), S('brunch.n.01'), S('buffet.n.02')]
+ self.assertEqual(sorted(S('meal.n.1').hyponyms()[:5]), first_five_meal_hypo)
+ self.assertEqual(S('Austen.n.1').instance_hypernyms(), [S('writer.n.01')])
+ first_five_composer_hypo = [S('ambrose.n.01'), S('bach.n.01'), S('barber.n.01'), S('bartok.n.01'), S('beethoven.n.01')]
+ self.assertEqual(S('composer.n.1').instance_hyponyms()[:5], first_five_composer_hypo)
+
+ # Test root hyper-/hyponyms
+ self.assertEqual(S('person.n.01').root_hypernyms(), [S('entity.n.01')])
+ self.assertEqual(S('sail.v.01').root_hypernyms(), [S('travel.v.01')])
+ self.assertEqual(S('fall.v.12').root_hypernyms(), [S('act.v.01'), S('fall.v.17')])
+
+ def test_derivationally_related_forms(self):
+ # Test `derivationally_related_forms()`
+ self.assertEqual(L('zap.v.03.nuke').derivationally_related_forms(),
+ [L('atomic_warhead.n.01.nuke')])
+ self.assertEqual(L('zap.v.03.atomize').derivationally_related_forms(),
+ [L('atomization.n.02.atomization')])
+ self.assertEqual(L('zap.v.03.atomise').derivationally_related_forms(),
+ [L('atomization.n.02.atomisation')])
+ self.assertEqual(L('zap.v.03.zap').derivationally_related_forms(),
+ [])
+
+ def test_meronyms_holonyms(self):
+ # Test meronyms, holonyms.
+ self.assertEqual(S('dog.n.01').member_holonyms(), [S('canis.n.01'), S('pack.n.06')])
+ self.assertEqual(S('dog.n.01').part_meronyms(), [S('flag.n.07')])
+
+ self.assertEqual(S('faculty.n.2').member_meronyms(),
+ [S('professor.n.01')])
+ self.assertEqual(S('copilot.n.1').member_holonyms(),
+ [S('crew.n.01')])
+
+ self.assertEqual(S('table.n.2').part_meronyms(),
+ [S('leg.n.03'), S('tabletop.n.01'), S('tableware.n.01')])
+ self.assertEqual(S('course.n.7').part_holonyms(),
+ [S('meal.n.01')])
+
+ self.assertEqual(S('water.n.1').substance_meronyms(),
+ [S('hydrogen.n.01'), S('oxygen.n.01')])
+ self.assertEqual(S('gin.n.1').substance_holonyms(),
+ [S('gin_and_it.n.01'), S('gin_and_tonic.n.01'),
+ S('martini.n.01'), S('pink_lady.n.01')])
+
+ def test_antonyms(self):
+ # Test antonyms.
+ self.assertEqual(L('leader.n.1.leader').antonyms(), [L('follower.n.01.follower')])
+ self.assertEqual(L('increase.v.1.increase').antonyms(), [L('decrease.v.01.decrease')])
+
+
+ def test_misc_relations(self):
+ # Test misc relations.
+ self.assertEqual(S('snore.v.1').entailments(), [S('sleep.v.01')])
+ self.assertEqual(S('heavy.a.1').similar_tos(),
+ [S('dense.s.03'), S('doughy.s.01'),
+ S('heavier-than-air.s.01'), S('hefty.s.02'),
+ S('massive.s.04'), S('non-buoyant.s.01'),
+ S('ponderous.s.02')])
+ self.assertEqual(S('light.a.1').attributes(), [S('weight.n.01')])
+ self.assertEqual(S('heavy.a.1').attributes(), [S('weight.n.01')])
+
+ # Test pertainyms.
+ self.assertEqual(L('English.a.1.English').pertainyms(),
+ [L('england.n.01.England')])
+
+ def test_lch(self):
+ # Test LCH.
+ self.assertEqual(S('person.n.01').lowest_common_hypernyms(S('dog.n.01')),
+ [S('organism.n.01')])
+ self.assertEqual(S('woman.n.01').lowest_common_hypernyms(S('girlfriend.n.02')),
+ [S('woman.n.01')])
+
+ def test_domains(self):
+ # Test domains.
+ self.assertEqual(S('code.n.03').topic_domains(), [S('computer_science.n.01')])
+ self.assertEqual(S('pukka.a.01').region_domains(), [S('india.n.01')])
+ self.assertEqual(S('freaky.a.01').usage_domains(), [S('slang.n.02')])
+
+ def test_wordnet_similarities(self):
+ # Path based similarities.
+ self.assertAlmostEqual(S('cat.n.01').path_similarity(S('cat.n.01')), 1.0)
+ self.assertAlmostEqual(S('dog.n.01').path_similarity(S('cat.n.01')), 0.2)
+ self.assertAlmostEqual(S('dog.n.01').lch_similarity(S('cat.n.01')), 2.028, places=3)
+ self.assertAlmostEqual(S('dog.n.01').wup_similarity(S('cat.n.01')), 0.8571, places=3)
+ # Information Content similarities.
+ brown_ic = wnic.ic('ic-brown.dat')
+ self.assertAlmostEqual(S('dog.n.01').jcn_similarity(S('cat.n.01'), brown_ic), 0.4497, places=3)
+ semcor_ic = wnic.ic('ic-semcor.dat')
+ self.assertAlmostEqual(S('dog.n.01').lin_similarity(S('cat.n.01'), semcor_ic), 0.8863, places=3)
diff --git a/nltk/tokenize/__init__.py b/nltk/tokenize/__init__.py
index b4b6dd7..6d03924 100644
--- a/nltk/tokenize/__init__.py
+++ b/nltk/tokenize/__init__.py
@@ -74,14 +74,12 @@ from nltk.tokenize.repp import ReppTokenizer
from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
from nltk.tokenize.simple import (SpaceTokenizer, TabTokenizer, LineTokenizer,
line_tokenize)
-from nltk.tokenize.stanford import StanfordTokenizer
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
-
# Standard sentence tokenizer.
def sent_tokenize(text, language='english'):
"""
diff --git a/nltk/tokenize/moses.py b/nltk/tokenize/moses.py
index 44fcace..0f7d31d 100644
--- a/nltk/tokenize/moses.py
+++ b/nltk/tokenize/moses.py
@@ -36,6 +36,19 @@ class MosesTokenizer(TokenizerI):
>>> m = MosesTokenizer()
>>> m.tokenize('abc def.')
[u'abc', u'def', u'.']
+
+ The nonbreaking prefixes should handle the case where a numeric-only prefix is the last token.
+ In the example below, "pp" is the last element, and there is no digit after it.
+
+ >>> m = MosesTokenizer()
+ >>> m.tokenize('2016, pp.')
+ [u'2016', u',', u'pp', u'.']
+
+ >>> sent = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
+ >>> m.tokenize(sent, escape=True)
+ ['This', 'ain', ''t', 'funny', '.', 'It', ''s', 'actually', 'hillarious', ',', 'yet', 'double', 'Ls', '.', '|', '[', ']', '<', '>', '[', ']', '&', 'You', ''re', 'gonna', 'shake', 'it', 'off', '?', 'Don', ''t', '?']
+ >>> m.tokenize(sent, escape=False)
+ ['This', 'ain', "'t", 'funny', '.', 'It', "'s", 'actually', 'hillarious', ',', 'yet', 'double', 'Ls', '.', '|', '[', ']', '<', '>', '[', ']', '&', 'You', "'re", 'gonna', 'shake', 'it', 'off', '?', 'Don', "'t", '?']
"""
# Perl Unicode Properties character sets.
@@ -234,7 +247,7 @@ class MosesTokenizer(TokenizerI):
super(MosesTokenizer, self).__init__()
self.lang = lang
# Initialize the language specific nonbreaking prefixes.
- self.NONBREAKING_PREFIXES = nonbreaking_prefixes.words(lang)
+ self.NONBREAKING_PREFIXES = [_nbp.strip() for _nbp in nonbreaking_prefixes.words(lang)]
self.NUMERIC_ONLY_PREFIXES = [w.rpartition(' ')[0] for w in
self.NONBREAKING_PREFIXES if
self.has_numeric_only(w)]
@@ -286,6 +299,7 @@ class MosesTokenizer(TokenizerI):
# Checks if the prefix is in NUMERIC_ONLY_PREFIXES
# and ensures that the next word is a digit.
elif (prefix in self.NUMERIC_ONLY_PREFIXES and
+ (i + 1) < num_tokens and
re.search(r'^[0-9]+', tokens[i+1])):
pass # No change to the token.
else: # Otherwise, adds a space after the tokens before a dot.
@@ -315,7 +329,7 @@ class MosesTokenizer(TokenizerI):
text = re.sub(regexp, substitution, text)
return text if return_str else text.split()
- def tokenize(self, text, agressive_dash_splits=False, return_str=False):
+ def tokenize(self, text, agressive_dash_splits=False, return_str=False, escape=True):
"""
Python port of the Moses tokenizer.
@@ -374,8 +388,9 @@ class MosesTokenizer(TokenizerI):
text = re.sub(regexp,substitution, text).strip()
# Restore multidots.
text = self.restore_multidots(text)
- # Escape XML symbols.
- text = self.escape_xml(text)
+ if escape:
+ # Escape XML symbols.
+ text = self.escape_xml(text)
return text if return_str else text.split()
@@ -408,6 +423,11 @@ class MosesDetokenizer(TokenizerI):
>>> detokens = d.detokenize(tokens)
>>> " ".join(detokens) == expected_detokens
True
+
+ >>> d.detokenize(expected_tokens, unescape=True)
+ ['This', "ain't", 'funny.', "It's", 'actually', 'hillarious,', 'yet', 'double', 'Ls.', '|', '[]', '<', '>', '[]', '&', "You're", 'gonna', 'shake', 'it', 'off?', "Don't?"]
+ >>> d.detokenize(expected_tokens, unescape=False)
+ ['This', 'ain', '&apos;t', 'funny.', 'It', '&apos;s', 'actually', 'hillarious,', 'yet', 'double', 'Ls.', '&#124;', '&#91;', '&#93;', '&lt;', '&gt;', '&#91;', '&#93;', '&amp;', 'You', '&apos;re', 'gonna', 'shake', 'it', 'off?', 'Don', '&apos;t?']
"""
# Currency Symbols.
IsAlnum = text_type(''.join(perluniprops.chars('IsAlnum')))
@@ -474,7 +494,7 @@ class MosesDetokenizer(TokenizerI):
return text
- def tokenize(self, tokens, return_str=False):
+ def tokenize(self, tokens, return_str=False, unescape=True):
"""
Python port of the Moses detokenizer.
@@ -489,8 +509,9 @@ class MosesDetokenizer(TokenizerI):
# Detokenize the agressive hyphen split.
regexp, substitution = self.AGGRESSIVE_HYPHEN_SPLIT
text = re.sub(regexp, substitution, text)
- # Unescape the XML symbols.
- text = self.unescape_xml(text)
+ if unescape:
+ # Unescape the XML symbols.
+ text = self.unescape_xml(text)
# Keep track of no. of quotation marks.
quote_counts = {u"'":0 , u'"':0, u"``":0, u"`":0, u"''":0}
@@ -608,6 +629,6 @@ class MosesDetokenizer(TokenizerI):
return detokenized_text if return_str else detokenized_text.split()
- def detokenize(self, tokens, return_str=False):
+ def detokenize(self, tokens, return_str=False, unescape=True):
""" Duck-typing the abstract *tokenize()*."""
- return self.tokenize(tokens, return_str)
+ return self.tokenize(tokens, return_str, unescape)
diff --git a/nltk/tokenize/nist.py b/nltk/tokenize/nist.py
new file mode 100644
index 0000000..0b2e5fb
--- /dev/null
+++ b/nltk/tokenize/nist.py
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Python port of the mteval-v14.pl tokenizer.
+#
+# Copyright (C) 2001-2015 NLTK Project
+# Author: Liling Tan (ported from ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-v14.pl)
+# Contributors: Ozan Caglayan, Wiktor Stribizew
+#
+# URL: <http://nltk.sourceforge.net>
+# For license information, see LICENSE.TXT
+
+"""
+This is an NLTK port of the tokenizer used in the NIST BLEU evaluation script,
+https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
+which was also ported into Python in
+https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
+"""
+
+from __future__ import unicode_literals
+
+import io
+import re
+from six import text_type
+
+from nltk.corpus import perluniprops
+from nltk.tokenize.api import TokenizerI
+from nltk.tokenize.util import xml_unescape
+
+
+class NISTTokenizer(TokenizerI):
+ """
+ This NIST tokenizer is sentence-based instead of the original
+ paragraph-based tokenization from mteval-14.pl; the sentence-based
+ tokenization is consistent with the other tokenizers available in NLTK.
+
+ >>> from six import text_type
+ >>> from nltk.tokenize.nist import NISTTokenizer
+ >>> nist = NISTTokenizer()
+ >>> s = "Good muffins cost $3.88 in New York."
+ >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
+ >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
+ >>> nist.tokenize(s, lowercase=False) == expected_cased
+ True
+ >>> nist.tokenize(s, lowercase=True) == expected_lower # Lowercased.
+ True
+
+ international_tokenize() is the preferred function when tokenizing
+ non-European text, e.g.
+
+ >>> from nltk.tokenize.nist import NISTTokenizer
+ >>> nist = NISTTokenizer()
+
+ # Input strings.
+ >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
+ >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
+ >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'
+
+ # Expected tokens.
+ >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'\u963f\u91cc\u5df4\u5df4\u96c6\u56e2\u63a7\u80a1', u'\u6709\u9650\u516c\u53f8', u')']
+ >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'\u02c8\xe6', u'm']
+ >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'\u697d\u5929\u682a\u5f0f\u4f1a\u793e', u'Rakuten', u'Kabushiki', u'-', u'gaisha']
+
+ >>> nist.international_tokenize(albb)[:10] == expected_albb
+ True
+ >>> nist.international_tokenize(amz)[:10] == expected_amz
+ True
+ >>> nist.international_tokenize(rkt)[:10] == expected_rkt
+ True
+ """
+ # Strip "skipped" tags
+ STRIP_SKIP = re.compile('<skipped>'), ''
+ # Strip end-of-line hyphenation and join lines
+ STRIP_EOL_HYPHEN = re.compile(u'\u2028'), ' '
+ # Tokenize punctuation.
+ PUNCT = re.compile('([\{-\~\[-\` -\&\(-\+\:-\@\/])'), ' \\1 '
+ # Tokenize period and comma unless preceded by a digit.
+ PERIOD_COMMA_PRECEED = re.compile('([^0-9])([\.,])'), '\\1 \\2 '
+ # Tokenize period and comma unless followed by a digit.
+ PERIOD_COMMA_FOLLOW = re.compile('([\.,])([^0-9])'), ' \\1 \\2'
+ # Tokenize dash when preceded by a digit
+ DASH_PRECEED_DIGIT = re.compile('([0-9])(-)'), '\\1 \\2 '
+
+ LANG_DEPENDENT_REGEXES = [PUNCT, PERIOD_COMMA_PRECEED,
+ PERIOD_COMMA_FOLLOW, DASH_PRECEED_DIGIT]
+
+ # Perluniprops characters used in NIST tokenizer.
+ pup_number = text_type(''.join(set(perluniprops.chars('Number')))) # i.e. \p{N}
+ pup_punct = text_type(''.join(set(perluniprops.chars('Punctuation')))) # i.e. \p{P}
+ pup_symbol = text_type(''.join(set(perluniprops.chars('Symbol')))) # i.e. \p{S}
+
+ # Python regexes need to escape some special symbols,
+ # see https://stackoverflow.com/q/45670950/610569
+ number_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_number)
+ punct_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_punct)
+ symbol_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_symbol)
+
+ # Note: In the original perl implementation, \p{Z} and \p{Zl} were used to
+ # (i) strip trailing and leading spaces and
+ # (ii) de-duplicate spaces.
+ # In Python, this would do: ' '.join(str.strip().split())
+ # Thus, the next two lines were commented out.
+ #Line_Separator = text_type(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl}
+ #Separator = text_type(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z}
+
+ # Pads runs of ascii characters with spaces, separating them from non-ascii text.
+ NONASCII = re.compile('([\x00-\x7f]+)'), r' \1 '
+ # Tokenize any punctuation unless followed AND preceded by a digit.
+ PUNCT_1 = re.compile(u"([{n}])([{p}])".format(n=number_regex, p=punct_regex)), '\\1 \\2 '
+ PUNCT_2 = re.compile(u"([{p}])([{n}])".format(n=number_regex, p=punct_regex)), ' \\1 \\2'
+ # Tokenize symbols
+ SYMBOLS = re.compile(u"({s})".format(s=symbol_regex)), ' \\1 '
+
+ INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]
+
+ def lang_independent_sub(self, text):
+ """Performs the language independent string substituitions. """
+ # It's a strange order of regexes.
+ # It'll be better to unescape after STRIP_EOL_HYPHEN
+ # but let's keep it close to the original NIST implementation.
+ regexp, substitution = self.STRIP_SKIP
+ text = regexp.sub(substitution, text)
+ text = xml_unescape(text)
+ regexp, substitution = self.STRIP_EOL_HYPHEN
+ text = regexp.sub(substitution, text)
+ return text
+
+ def tokenize(self, text, lowercase=False,
+ western_lang=True, return_str=False):
+ text = text_type(text)
+ # Language independent regex.
+ text = self.lang_independent_sub(text)
+ # Language dependent regex.
+ if western_lang:
+ # Pad string with whitespace.
+ text = ' ' + text + ' '
+ if lowercase:
+ text = text.lower()
+ for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
+ text = regexp.sub(substitution, text)
+ # Remove contiguous whitespaces.
+ text = ' '.join(text.split())
+ # Finally, strips leading and trailing spaces
+ # and converts the output string into unicode.
+ text = text_type(text.strip())
+ return text if return_str else text.split()
+
+ def international_tokenize(self, text, lowercase=False,
+ split_non_ascii=True,
+ return_str=False):
+ text = text_type(text)
+ # Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
+ # first before unescaping.
+ regexp, substitution = self.STRIP_SKIP
+ text = regexp.sub(substitution, text)
+ regexp, substitution = self.STRIP_EOL_HYPHEN
+ text = regexp.sub(substitution, text)
+ text = xml_unescape(text)
+
+ if lowercase:
+ text = text.lower()
+
+ for regexp, substitution in self.INTERNATIONAL_REGEXES:
+ text = regexp.sub(substitution, text)
+
+ # Make sure that there's only one space between words.
+ # Strip leading and trailing spaces.
+ text = ' '.join(text.strip().split())
+ return text if return_str else text.split()
diff --git a/nltk/tokenize/punkt.py b/nltk/tokenize/punkt.py
index b5b724c..afd73a1 100644
--- a/nltk/tokenize/punkt.py
+++ b/nltk/tokenize/punkt.py
@@ -521,7 +521,9 @@ class PunktBaseClass(object):
"""
def __init__(self, lang_vars=PunktLanguageVars(), token_cls=PunktToken,
- params=PunktParameters()):
+ params=None):
+ if params is None:
+ params = PunktParameters()
self._params = params
self._lang_vars = lang_vars
self._Token = token_cls
diff --git a/nltk/tokenize/stanford.py b/nltk/tokenize/stanford.py
index ec6b312..9ac8352 100644
--- a/nltk/tokenize/stanford.py
+++ b/nltk/tokenize/stanford.py
@@ -13,12 +13,13 @@ import tempfile
import os
import json
from subprocess import PIPE
+import warnings
from six import text_type
from nltk.internals import find_jar, config_java, java, _java_options
-
from nltk.tokenize.api import TokenizerI
+from nltk.parse.corenlp import CoreNLPParser
_stanford_url = 'https://nlp.stanford.edu/software/tokenizer.shtml'
@@ -26,7 +27,7 @@ class StanfordTokenizer(TokenizerI):
r"""
Interface to the Stanford Tokenizer
- >>> from nltk.tokenize import StanfordTokenizer
+ >>> from nltk.tokenize.stanford import StanfordTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
>>> StanfordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
@@ -38,6 +39,13 @@ class StanfordTokenizer(TokenizerI):
_JAR = 'stanford-postagger.jar'
def __init__(self, path_to_jar=None, encoding='utf8', options=None, verbose=False, java_options='-mx1000m'):
+ # Raise deprecation warning.
+ warnings.simplefilter('always', DeprecationWarning)
+ warnings.warn(str("\nThe StanfordTokenizer will "
+ "be deprecated in version 3.2.5.\n"
+ "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.'"),
+ DeprecationWarning, stacklevel=2)
+ warnings.simplefilter('ignore', DeprecationWarning)
self._stanford_jar = find_jar(
self._JAR, path_to_jar,
env_vars=('STANFORD_POSTAGGER',),
@@ -99,6 +107,28 @@ class StanfordTokenizer(TokenizerI):
return stdout
+class CoreNLPTokenizer(CoreNLPParser):
+ def __init__(self, url='http://localhost:9000', encoding='utf8'):
+ r"""
+ This is a duck-type of CoreNLPParser that provides tokenizing
+ functionality similar to the original StanfordTokenizer.
+
+ >>> from nltk.tokenize.stanford import CoreNLPTokenizer
+ >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
+ >>> CoreNLPTokenizer(url='http://localhost:9000').tokenize(s) # doctest: +SKIP
+ [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.', u'Please', u'buy', u'me', u'two', u'of', u'them', u'.', u'Thanks', u'.']
+ """
+ super(CoreNLPTokenizer, self).__init__(url, encoding)
+
+ def tokenize(self, text, properties=None):
+ """
+ Tokenize a string of text. Consistent with the StanfordTokenizer, this
+ function returns a list of strings. The original CoreNLPParser.tokenize()
+ returns a generator of strings.
+ """
+ return list(super(CoreNLPTokenizer, self).tokenize(text, properties))
+
+
def setup_module(module):
from nose import SkipTest
@@ -106,3 +136,9 @@ def setup_module(module):
StanfordTokenizer()
except LookupError:
raise SkipTest('doctests from nltk.tokenize.stanford are skipped because the stanford postagger jar doesn\'t exist')
+
+ try:
+ CoreNLPTokenizer()
+ except LookupError:
+ raise SkipTest('doctests from nltk.tokenize.stanford.CoreNLPTokenizer are skipped because the '
+ 'stanford corenlp server is not started')
diff --git a/nltk/tokenize/stanford_segmenter.py b/nltk/tokenize/stanford_segmenter.py
index 40613fc..077cbef 100644
--- a/nltk/tokenize/stanford_segmenter.py
+++ b/nltk/tokenize/stanford_segmenter.py
@@ -17,6 +17,7 @@ import tempfile
import os
import json
from subprocess import PIPE
+import warnings
from nltk import compat
from nltk.internals import find_jar, find_file, find_dir, \
@@ -29,8 +30,13 @@ _stanford_url = 'https://nlp.stanford.edu/software'
class StanfordSegmenter(TokenizerI):
- """
- Interface to the Stanford Segmenter
+ """Interface to the Stanford Segmenter
+
+ If the stanford-segmenter version is older than 2016-10-31, then path_to_slf4j
+ should be provided, for example::
+
+ seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')
+
>>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
>>> seg = StanfordSegmenter()
>>> seg.default_config('zh')
@@ -46,10 +52,10 @@ class StanfordSegmenter(TokenizerI):
"""
_JAR = 'stanford-segmenter.jar'
- _SLF4J = 'slf4j-api.jar'
def __init__(self,
- path_to_jar=None, path_to_slf4j=None,
+ path_to_jar=None,
+ path_to_slf4j=None,
java_class=None,
path_to_model=None,
path_to_dict=None,
@@ -58,21 +64,33 @@ class StanfordSegmenter(TokenizerI):
keep_whitespaces='false',
encoding='UTF-8', options=None,
verbose=False, java_options='-mx2g'):
+ # Raise deprecation warning.
+ warnings.simplefilter('always', DeprecationWarning)
+ warnings.warn(str("\nThe StanfordTokenizer will "
+ "be deprecated in version 3.2.5.\n"
+ "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.'"),
+ DeprecationWarning, stacklevel=2)
+ warnings.simplefilter('ignore', DeprecationWarning)
stanford_segmenter = find_jar(
self._JAR, path_to_jar,
env_vars=('STANFORD_SEGMENTER',),
searchpath=(), url=_stanford_url,
verbose=verbose)
- slf4j = find_jar(
- self._SLF4J, path_to_slf4j,
+ if path_to_slf4j is not None:
+ slf4j = find_jar(
+ 'slf4j-api.jar', path_to_slf4j,
env_vars=('SLF4J', 'STANFORD_SEGMENTER',),
searchpath=(), url=_stanford_url,
verbose=verbose)
+ else:
+ slf4j = None
- # This is passed to java as the -cp option, the segmenter needs slf4j.
+ # This is passed to java as the -cp option; the old version of the segmenter needs slf4j.
+ # The new stanford-segmenter-2016-10-31 doesn't need slf4j.
self._stanford_jar = os.pathsep.join(
- [_ for _ in [stanford_segmenter, slf4j] if not _ is None])
+ _ for _ in [stanford_segmenter, slf4j] if _ is not None
+ )
self._java_class = java_class
self._model = path_to_model
diff --git a/nltk/tokenize/treebank.py b/nltk/tokenize/treebank.py
index 2d7b162..f3ae637 100644
--- a/nltk/tokenize/treebank.py
+++ b/nltk/tokenize/treebank.py
@@ -163,7 +163,19 @@ class TreebankWordTokenizer(TokenizerI):
True
"""
- tokens = self.tokenize(text)
+ raw_tokens = self.tokenize(text)
+
+ # Convert the converted quotes back to the original double quotes.
+ # Do this only if the original text contains double quote(s).
+ if '"' in text:
+ # Find double quotes and converted quotes
+ matched = [m.group() for m in re.finditer(r'[(``)(\'\')(")]+', text)]
+
+ # Replace converted quotes back to double quotes
+ tokens = [matched.pop(0) if tok in ['"', "``", "''"] else tok for tok in raw_tokens]
+ else:
+ tokens = raw_tokens
+
return align_tokens(tokens, text)
@@ -202,6 +214,29 @@ class TreebankWordDetokenizer(TokenizerI):
>>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
>>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
True
+
+ During tokenization it's safe to add extra spaces, but during detokenization
+ simply undoing the padding doesn't really help.
+
+ - During tokenization, [!?] is padded on the left and right; when
+ detokenizing, only the left pad of [!?] needs to be removed.
+ Thus (re.compile(r'\s([?!])'), r'\g<1>')
+
+ - During tokenization, [:,] are padded on the left and right, but when
+ detokenizing, only the left pad is removed, and the right pad after a
+ comma/colon is kept if the following string is a non-digit.
+ Thus (re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')
+
+ >>> from nltk.tokenize.treebank import TreebankWordDetokenizer
+ >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
+ >>> twd = TreebankWordDetokenizer()
+ >>> twd.detokenize(toks)
+ "hello, i can't feel my feet! Help!!"
+
+ >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
+ ... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
+ >>> twd.detokenize(toks)
+ "hello, i can't feel; my feet! Help!! He said: Help, help?!"
"""
_contractions = MacIntyreContractions()
CONTRACTIONS2 = [re.compile(pattern.replace('(?#X)', '\s'))
@@ -235,7 +270,8 @@ class TreebankWordDetokenizer(TokenizerI):
#punctuation
PUNCTUATION = [
(re.compile(r"([^'])\s'\s"), r"\1' "),
- (re.compile(r'\s([?!])\s'), r'\g<1>'),
+ (re.compile(r'\s([?!])'), r'\g<1>'), # Strip left pad for [?!]
+ #(re.compile(r'\s([?!])\s'), r'\g<1>'),
(re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r'\1\2\3'),
# When tokenizing, [;@#$%&] are padded with whitespace regardless of
# whether there are spaces before or after them.
@@ -246,7 +282,8 @@ class TreebankWordDetokenizer(TokenizerI):
(re.compile(r'\s([&])\s'), r' \g<1> '), # Unknown pad.
(re.compile(r'\s\.\.\.\s'), r'...'),
(re.compile(r'\s([:,])\s$'), r'\1'),
- (re.compile(r'\s([:,])\s([^\d])'), r'\1\2')
+ (re.compile(r'\s([:,])\s([^\d])'), r'\1 \2') # Keep right pad after comma/colon before non-digits.
+ #(re.compile(r'\s([:,])\s([^\d])'), r'\1\2')
]
#starting quotes
diff --git a/nltk/tokenize/util.py b/nltk/tokenize/util.py
index 7229e21..f19894b 100644
--- a/nltk/tokenize/util.py
+++ b/nltk/tokenize/util.py
@@ -7,7 +7,7 @@
# For license information, see LICENSE.TXT
from re import finditer
-from xml.sax.saxutils import escape
+from xml.sax.saxutils import escape, unescape
def string_span_tokenize(s, sep):
r"""
@@ -193,6 +193,30 @@ def xml_escape(text):
r"[": r"[", r"]": r"]", })
+def xml_unescape(text):
+ """
+ This function transforms the "escaped" version suitable
+ for well-formed XML formatting into humanly-readable string.
+
+ Note that the default xml.sax.saxutils.unescape() function don't unescape
+ some characters that Moses does so we have to manually add them to the
+ entities dictionary.
+
+ >>> from xml.sax.saxutils import unescape
+ >>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
+ >>> expected = ''')| & < > \' " ] ['''
+ >>> xml_unescape(s) == expected
+ True
+
+ :param text: The text that needs to be unescaped.
+ :type text: str
+ :rtype: str
+ """
+ return unescape(text, entities={ r"&apos;": r"'", r"&quot;": r'"',
+ r"&#124;": r"|",
+ r"&#91;": r"[", r"&#93;": r"]", })
+
+
def align_tokens(tokens, sentence):
"""
This function attempts to find the offsets of the tokens in *s*, as a sequence
diff --git a/nltk/translate/ibm1.py b/nltk/translate/ibm1.py
index c516cf1..35e0420 100644
--- a/nltk/translate/ibm1.py
+++ b/nltk/translate/ibm1.py
@@ -16,7 +16,9 @@
Lexical translation model that ignores word order.
In IBM Model 1, word order is ignored for simplicity. Thus, the
-following two alignments are equally likely.
+following three alignments are equally likely. As long as the word
+alignments are equivalent, it doesn't matter where the word
+occurs in the source or target sentence.
Source: je mange du jambon
Target: i eat some ham
@@ -24,7 +26,11 @@ Alignment: (1,1) (2,2) (3,3) (4,4)
Source: je mange du jambon
Target: some ham eat i
-Alignment: (1,4) (2,3) (3,2) (4,1)
+Alignment: (1,4) (2,3) (3,1) (4,2)
+
+Source: du jambon je mange
+Target: eat i some ham
+Alignment: (1,3) (2,4) (3,2) (4,1)
The EM algorithm used in Model 1 is:
E step - In the training data, count how many times a source language
diff --git a/nltk/translate/ibm_model.py b/nltk/translate/ibm_model.py
index fa5312f..4dfe4e6 100644
--- a/nltk/translate/ibm_model.py
+++ b/nltk/translate/ibm_model.py
@@ -496,6 +496,9 @@ class AlignmentInfo(object):
def __eq__(self, other):
return self.alignment == other.alignment
+ def __ne__(self, other):
+ return not self == other
+
def __hash__(self):
return hash(self.alignment)
diff --git a/nltk/translate/nist_score.py b/nltk/translate/nist_score.py
new file mode 100644
index 0000000..1bedf65
--- /dev/null
+++ b/nltk/translate/nist_score.py
@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: NIST Score
+#
+# Copyright (C) 2001-2017 NLTK Project
+# Authors:
+# Contributors:
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""NIST score implementation."""
+from __future__ import division
+
+import math
+import fractions
+from collections import Counter
+
+from nltk.util import ngrams
+from nltk.translate.bleu_score import modified_precision, closest_ref_length
+
+try:
+ fractions.Fraction(0, 1000, _normalize=False)
+ from fractions import Fraction
+except TypeError:
+ from nltk.compat import Fraction
+
+
+def sentence_nist(references, hypothesis, n=5):
+ """
+ Calculate NIST score from
+ George Doddington. 2002. "Automatic evaluation of machine translation quality
+ using n-gram co-occurrence statistics." Proceedings of HLT.
+ Morgan Kaufmann Publishers Inc. http://dl.acm.org/citation.cfm?id=1289189.1289273
+
+ DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU
+ score. The official script used by NIST to compute BLEU and NIST score is
+ mteval-14.pl. The main differences are:
+
+ - BLEU uses the geometric mean of the ngram overlaps, NIST uses the arithmetic mean.
+ - NIST has a different brevity penalty.
+ - The NIST score from mteval-14.pl has a self-contained tokenizer.
+
+ Note: mteval-14.pl includes a smoothing function for the BLEU score that is NOT
+ used in the NIST score computation.
+
+ >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
+ ... 'ensures', 'that', 'the', 'military', 'always',
+ ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
+
+ >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
+ ... 'forever', 'hearing', 'the', 'activity', 'guidebook',
+ ... 'that', 'party', 'direct']
+
+ >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
+ ... 'ensures', 'that', 'the', 'military', 'will', 'forever',
+ ... 'heed', 'Party', 'commands']
+
+ >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
+ ... 'guarantees', 'the', 'military', 'forces', 'always',
+ ... 'being', 'under', 'the', 'command', 'of', 'the',
+ ... 'Party']
+
+ >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
+ ... 'army', 'always', 'to', 'heed', 'the', 'directions',
+ ... 'of', 'the', 'party']
+
+ >>> sentence_nist([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
+ 0.0854...
+
+ >>> sentence_nist([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS
+ 0.1485...
+
+ :param references: reference sentences
+ :type references: list(list(str))
+ :param hypothesis: a hypothesis sentence
+ :type hypothesis: list(str)
+ :param n: highest n-gram order
+ :type n: int
+ """
+ return corpus_nist([references], [hypothesis], n)
+
+def corpus_nist(list_of_references, hypotheses, n=5):
+ """
+ Calculate a single corpus-level NIST score (aka. system-level NIST) for all
+ the hypotheses and their respective references.
+
+ :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
+ :type list_of_references: list(list(list(str)))
+ :param hypotheses: a list of hypothesis sentences
+ :type hypotheses: list(list(str))
+ :param n: highest n-gram order
+ :type n: int
+ """
+ # Before proceeding to compute NIST, perform sanity checks.
+ assert len(list_of_references) == len(hypotheses), "The number of hypotheses and their reference(s) should be the same"
+
+ p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
+ p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
+ sysoutput_lengths = Counter() # Key = ngram order, and value = no. of ngram in hyp.
+ hyp_lengths, ref_lengths = 0, 0
+
+ # Iterate through each hypothesis and their corresponding references.
+ for references, hypothesis in zip(list_of_references, hypotheses):
+ # For each order of ngram, calculate the numerator and
+ # denominator for the corpus-level modified precision.
+ for i, _ in enumerate(range(1,n+1)):
+ p_i = modified_precision(references, hypothesis, i)
+ p_numerators[i] += p_i.numerator
+ p_denominators[i] += p_i.denominator
+ # Adds the no. of ngrams in the hypothesis.
+ sysoutput_lengths[i] += len(hypothesis) - (i - 1)
+
+ # Calculate the hypothesis length and the closest reference length.
+ # Adds them to the corpus-level hypothesis and reference counts.
+ hyp_len = len(hypothesis)
+ hyp_lengths += hyp_len
+ ref_lengths += closest_ref_length(references, hyp_len)
+
+ # Calculate corpus-level brevity penalty.
+ bp = nist_length_penalty(ref_lengths, hyp_lengths)
+
+ # Collects the various precision values for the different ngram orders.
+ p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
+ for i, _ in enumerate(range(1,n+1))]
+
+ # Eqn 2 in Doddington (2002):
+ # Info(w_1 ... w_n) = log_2 [ (# of occurrences of w_1 ... w_n-1) / (# of occurrences of w_1 ... w_n) ]
+ info = [0 if p_n[i].numerator == 0 or p_n[i+1].numerator == 0 # Handles math domain and zero division errors.
+ else math.log(p_n[i].numerator / p_n[i+1].numerator)
+ for i in range(len(p_n)-1)]
+ return sum(info_i/sysoutput_lengths[i] for i, info_i in enumerate(info)) * bp
+
+
+def nist_length_penalty(closest_ref_len, hyp_len):
+ """
+ Calculates the NIST length penalty, from Eq. 3 in Doddington (2002)
+
+ penalty = exp( beta * log( min( len(hyp)/len(ref) , 1.0 ))**2 )
+
+ where,
+
+ `beta` is chosen to make the brevity penalty factor = 0.5 when the
+ no. of words in the system output (hyp) is 2/3 of the average
+ no. of words in the reference translation (ref)
+
+ The NIST penalty differs from BLEU's in that it minimizes the impact
+ on the score of small variations in the length of a translation.
+ See Fig. 4 in Doddington (2002).
+ """
+ ratio = closest_ref_len / hyp_len
+ if 0 < ratio < 1:
+ ratio_x, score_x = 1.5, 0.5
+ beta = math.log(score_x) / math.log(score_x)**2
+ return math.exp(beta * math.log(ratio)**2)
+ else: # ratio <= 0 or ratio >= 1
+ return max(min(ratio, 1.0), 0.0)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/nltk.git