[nltk] 01/03: Imported Upstream version 3.0.4
Daniel Stender
danstender-guest at moszumanska.debian.org
Wed Jul 15 21:38:40 UTC 2015
This is an automated email from the git hooks/post-receive script.
danstender-guest pushed a commit to branch master
in repository nltk.
commit ac6b066117f555a1493fda00606cf6af4434f95d
Author: Daniel Stender <debian at danielstender.com>
Date: Wed Jul 15 23:21:26 2015 +0200
Imported Upstream version 3.0.4
---
 PKG-INFO                            |   2 +-
 README.txt                          |   4 +-
 nltk.egg-info/PKG-INFO              |   2 +-
 nltk/VERSION                        |   2 +-
 nltk/book.py                        |   3 -
 nltk/ccg/lexicon.py                 |   4 +-
 nltk/corpus/__init__.py             |   2 +-
 nltk/corpus/reader/bracket_parse.py |  49 ++++++++++++--
 nltk/draw/dispersion.py             |   4 +-
 nltk/sem/boxer.py                   |   7 +-
 nltk/tbl/demo.py                    |   9 ++-
 nltk/test/ccg.doctest               | 123 ++++++++++++++++++++++++++++++++++++
 setup.cfg                           |   2 +-
 13 files changed, 188 insertions(+), 25 deletions(-)
diff --git a/PKG-INFO b/PKG-INFO
index bdb38c0..4a7ba3b 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: nltk
-Version: 3.0.3
+Version: 3.0.4
Summary: Natural Language Toolkit
Home-page: http://nltk.org/
Author: Steven Bird
diff --git a/README.txt b/README.txt
index 5e5e9bd..9da482d 100644
--- a/README.txt
+++ b/README.txt
@@ -13,7 +13,7 @@ Python modules, data sets and tutorials supporting research and
development in Natural Language Processing.
Documentation: A substantial amount of documentation about how
-to use NLTK, including a textbook and API documention, is
+to use NLTK, including a textbook and API documentation, is
available from the NLTK website: http://nltk.org/
- The book covers a wide range of introductory topics in NLP, and
@@ -21,7 +21,7 @@ available from the NLTK website: http://nltk.org/
- The toolkit's reference documentation describes every module,
interface, class, method, function, and variable in the toolkit.
- This documentation should be useful to both users and developers.
+ This documentation should be useful to both users and developers.
Mailing Lists: There are several mailing lists associated with NLTK:
diff --git a/nltk.egg-info/PKG-INFO b/nltk.egg-info/PKG-INFO
index bdb38c0..4a7ba3b 100644
--- a/nltk.egg-info/PKG-INFO
+++ b/nltk.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: nltk
-Version: 3.0.3
+Version: 3.0.4
Summary: Natural Language Toolkit
Home-page: http://nltk.org/
Author: Steven Bird
diff --git a/nltk/VERSION b/nltk/VERSION
index 75a22a2..b0f2dcb 100644
--- a/nltk/VERSION
+++ b/nltk/VERSION
@@ -1 +1 @@
-3.0.3
+3.0.4
diff --git a/nltk/book.py b/nltk/book.py
index 3d6777e..8cded01 100644
--- a/nltk/book.py
+++ b/nltk/book.py
@@ -10,9 +10,6 @@ from __future__ import print_function
from nltk.corpus import (gutenberg, genesis, inaugural,
nps_chat, webtext, treebank, wordnet)
from nltk.text import Text
-from nltk.probability import FreqDist
-from nltk.util import bigrams
-from nltk.misc import babelize_shell
print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
diff --git a/nltk/ccg/lexicon.py b/nltk/ccg/lexicon.py
index 60badb5..5439a14 100644
--- a/nltk/ccg/lexicon.py
+++ b/nltk/ccg/lexicon.py
@@ -27,7 +27,7 @@ reNextPrim = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''')
reApp = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''')
# Parses the definition of the category of either a word or a family
-reLex = re.compile(r'''([A-Za-z_]+)\s*(::|[-=]+>)\s*(.+)''')
+reLex = re.compile(r'''([\w_]+)\s*(::|[-=]+>)\s*(.+)''', re.UNICODE)
# Strips comments from a line
reComm = re.compile('''([^#]*)(?:#.*)?''')
@@ -183,7 +183,7 @@ def parseLexicon(lex_str):
if line.startswith(':-'):
# A line of primitive categories.
- # The first line is the target category
+ # The first one is the target category
# ie, :- S, N, NP, VP
primitives = primitives + [ prim.strip() for prim in line[2:].strip().split(',') ]
else:
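
The widened pattern, plus re.UNICODE, lets lexicon entries use non-ASCII
words (exercised by the new Unicode doctest further down). A minimal
standalone sketch of the behaviour, not taken from this commit:

    import re
    # same pattern as the new reLex above; under re.UNICODE, \w also
    # matches accented letters such as the í in "panadería"
    reLex = re.compile(r'''([\w_]+)\s*(::|[-=]+>)\s*(.+)''', re.UNICODE)
    print(reLex.match(u'panadería => N').groups())
    # -> ('panadería', '=>', 'N')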
diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py
index 08d7011..33f1a8a 100644
--- a/nltk/corpus/__init__.py
+++ b/nltk/corpus/__init__.py
@@ -142,7 +142,7 @@ machado = LazyCorpusLoader(
r'(?!\.).*\.txt', cat_pattern=r'([a-z]*)/.*', encoding='latin-1')
masc_tagged = LazyCorpusLoader(
'masc_tagged', CategorizedTaggedCorpusReader, r'(spoken|written)/.*\.txt',
- cat_file='categories.txt', tagset='wsj', encoding="ascii", sep="_")
+ cat_file='categories.txt', tagset='wsj', encoding="utf-8", sep="_")
movie_reviews = LazyCorpusLoader(
'movie_reviews', CategorizedPlaintextCorpusReader,
r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
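
Switching the encoding from ASCII to UTF-8 means non-ASCII characters in
the MASC files no longer raise decode errors. A hedged sketch, assuming the
masc_tagged data package has been installed via nltk.download():

    from nltk.corpus import masc_tagged
    # the reader is lazy; files are now decoded as UTF-8 on first access
    print(masc_tagged.tagged_words()[:5])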
diff --git a/nltk/corpus/reader/bracket_parse.py b/nltk/corpus/reader/bracket_parse.py
index 999382b..7785d69 100644
--- a/nltk/corpus/reader/bracket_parse.py
+++ b/nltk/corpus/reader/bracket_parse.py
@@ -18,6 +18,7 @@ from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
# we use [^\s()]+ instead of \S+? to avoid matching ()
+SORTTAGWRD = re.compile(r'\((\d+) ([^\s()]+) ([^\s()]+)\)')
TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')
WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)')
EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')
@@ -97,9 +98,9 @@ class BracketParseCorpusReader(SyntaxCorpusReader):
return Tree('S', self._tag(t))
def _tag(self, t, tagset=None):
- tagged_sent = [(w,t) for (t,w) in TAGWORD.findall(self._normalize(t))]
+ tagged_sent = [(w,p) for (p,w) in TAGWORD.findall(self._normalize(t))]
if tagset and tagset != self._tagset:
- tagged_sent = [(w, map_tag(self._tagset, tagset, t)) for (w,t) in tagged_sent]
+ tagged_sent = [(w, map_tag(self._tagset, tagset, p)) for (w,p) in tagged_sent]
return tagged_sent
def _word(self, t):
@@ -165,6 +166,12 @@ class CategorizedBracketParseCorpusReader(CategorizedCorpusReader,
class AlpinoCorpusReader(BracketParseCorpusReader):
"""
Reader for the Alpino Dutch Treebank.
+ This corpus has an embedded lexical breakdown structure, as read by _parse.
+ Unfortunately this puts punctuation and some other words out of sentence
+ order in the xml element tree, which is no good for _tag and _word.
+ _tag and _word are therefore overridden to pass a new, non-default
+ 'ordered' parameter to the overridden _normalize function; the _parse
+ function itself can remain untouched.
"""
def __init__(self, root, encoding='ISO-8859-1', tagset=None):
BracketParseCorpusReader.__init__(self, root, 'alpino\.xml',
@@ -172,13 +179,47 @@ class AlpinoCorpusReader(BracketParseCorpusReader):
encoding=encoding,
tagset=tagset)
- def _normalize(self, t):
+ def _normalize(self, t, ordered = False):
+ """Normalize the xml sentence element in t.
+ The sentence elements <alpino_ds>, although embedded in a few overall
+ xml elements, are separated by blank lines. That's how the reader can
+ deliver them one at a time.
+ Each sentence has a few category subnodes that are of no use to us.
+ The remaining word nodes may or may not appear in the proper order.
+ Each word node has attributes, among which:
+ - begin : the position of the word in the sentence
+ - pos : Part of Speech: the Tag
+ - word : the actual word
+ The return value is a string with all xml elements replaced by
+ clauses: either a cat clause with nested clauses, or a word clause.
+ The order of the bracket clauses closely follows the xml.
+ If ordered == True, the word clauses include an order sequence number.
+ If ordered == False, the word clauses only have pos and word parts.
+ """
if t[:10] != "<alpino_ds":
return ""
# convert XML to sexpr notation
t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
- t = re.sub(r' <node .*? pos="(\w+)".*? word="([^"]+)".*/>', r"(\1 \2)", t)
+ if ordered:
+ t = re.sub(r' <node .*?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2 \3)", t)
+ else:
+ t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
t = re.sub(r" </node>", r")", t)
t = re.sub(r"<sentence>.*</sentence>", r"", t)
t = re.sub(r"</?alpino_ds.*>", r"", t)
return t
+
+ def _tag(self, t, tagset=None):
+ tagged_sent = [(int(o), w, p) for (o,p,w) in SORTTAGWRD.findall(self._normalize(t, ordered = True))]
+ tagged_sent.sort()
+ if tagset and tagset != self._tagset:
+ tagged_sent = [(w, map_tag(self._tagset, tagset, p)) for (o,w,p) in tagged_sent]
+ else:
+ tagged_sent = [(w,p) for (o,w,p) in tagged_sent]
+ return tagged_sent
+
+ def _word(self, t):
+ """Return a correctly ordered list if words"""
+ tagged_sent = self._tag(t)
+ return [w for (w,p) in tagged_sent]
+
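
To illustrate the new ordered path: _normalize(t, ordered=True) emits word
clauses of the form (begin pos word), which SORTTAGWRD captures so the
words can be re-sorted into sentence order. A self-contained sketch on a
made-up clause string (the sentence is hypothetical, not real Alpino
output):

    import re
    SORTTAGWRD = re.compile(r'\((\d+) ([^\s()]+) ([^\s()]+)\)')
    s = '(smain (1 verb zingt) (0 noun kat) (2 punt .))'
    # findall yields (begin, pos, word); sort on the integer position
    triples = sorted((int(o), w, p) for (o, p, w) in SORTTAGWRD.findall(s))
    print([(w, p) for (o, w, p) in triples])
    # -> [('kat', 'noun'), ('zingt', 'verb'), ('.', 'punt')]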
diff --git a/nltk/draw/dispersion.py b/nltk/draw/dispersion.py
index eddc36f..4de4744 100644
--- a/nltk/draw/dispersion.py
+++ b/nltk/draw/dispersion.py
@@ -9,7 +9,7 @@
A utility for displaying lexical dispersion.
"""
-def dispersion_plot(text, words, ignore_case=False):
+def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
"""
Generate a lexical dispersion plot.
@@ -47,7 +47,7 @@ def dispersion_plot(text, words, ignore_case=False):
pylab.plot(x, y, "b|", scalex=.1)
pylab.yticks(list(range(len(words))), words, color="b")
pylab.ylim(-1, len(words))
- pylab.title("Lexical Dispersion Plot")
+ pylab.title(title)
pylab.xlabel("Word Offset")
pylab.show()
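
The new keyword merely replaces the hard-coded plot title. A hedged usage
sketch, assuming matplotlib is installed and the nltk.book texts are
available:

    from nltk.book import text1
    from nltk.draw.dispersion import dispersion_plot
    # same call as before, but with a caller-supplied title
    dispersion_plot(text1, ["whale", "Ahab", "Starbuck"], title="Moby Dick")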
diff --git a/nltk/sem/boxer.py b/nltk/sem/boxer.py
index 41d74d7..b8685bb 100644
--- a/nltk/sem/boxer.py
+++ b/nltk/sem/boxer.py
@@ -51,7 +51,7 @@ class Boxer(object):
semantic parser that produces Discourse Representation Structures (DRSs).
"""
- def __init__(self, boxer_drs_interpreter=None, elimeq=False, bin_dir=None, verbose=False):
+ def __init__(self, boxer_drs_interpreter=None, elimeq=False, bin_dir=None, verbose=False, resolve=True):
"""
:param boxer_drs_interpreter: A class that converts from the
``AbstractBoxerDrs`` object hierarchy to a different object. The
@@ -60,11 +60,14 @@ class Boxer(object):
:param elimeq: When set to true, Boxer removes all equalities from the
DRSs and discourse referents standing in the equality relation are
unified, but only if this can be done in a meaning-preserving manner.
+ :param resolve: When set to true, Boxer will resolve all anaphoric DRSs and perform merge-reduction.
+ Resolution follows Van der Sandt's theory of binding and accommodation.
"""
if boxer_drs_interpreter is None:
boxer_drs_interpreter = NltkDrtBoxerDrsInterpreter()
self._boxer_drs_interpreter = boxer_drs_interpreter
+ self._resolve = resolve
self._elimeq = elimeq
self.set_bin_dir(bin_dir, verbose)
@@ -172,7 +175,7 @@ class Boxer(object):
args = ['--box', 'false',
'--semantics', 'drs',
#'--flat', 'false', # removed from boxer
- '--resolve', 'true',
+ '--resolve', ['false','true'][self._resolve],
'--elimeq', ['false','true'][self._elimeq],
'--format', 'prolog',
'--instantiate', 'true',
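
The resolve flag is now forwarded to the boxer binary as '--resolve false'
or '--resolve true' instead of being hard-wired to true. A hedged sketch,
assuming the external C&C/Boxer binaries are installed and located via
bin_dir or the usual environment variables:

    from nltk.sem.boxer import Boxer
    # resolve=False passes '--resolve false', skipping anaphora resolution
    b = Boxer(resolve=False)
    print(b.interpret("Every dog barks."))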
diff --git a/nltk/tbl/demo.py b/nltk/tbl/demo.py
index eee2c39..41b0a3f 100644
--- a/nltk/tbl/demo.py
+++ b/nltk/tbl/demo.py
@@ -268,7 +268,7 @@ def postag(
print("Wrote plot of learning curve to {0}".format(learning_curve_output))
else:
print("Tagging the test data")
- taggedtest = brill_tagger.batch_tag(testing_data)
+ taggedtest = brill_tagger.tag_sents(testing_data)
if template_stats:
brill_tagger.print_template_statistics()
@@ -276,20 +276,19 @@ def postag(
if error_output is not None:
with open(error_output, 'w') as f:
f.write('Errors for Brill Tagger %r\n\n' % serialize_output)
- for e in error_list(gold_data, taggedtest):
- f.write(e+'\n')
+ f.write(u'\n'.join(error_list(gold_data, taggedtest)).encode('utf-8') + '\n')
print("Wrote tagger errors including context to {0}".format(error_output))
# serializing the tagger to a pickle file and reloading (just to see it works)
if serialize_output is not None:
- taggedtest = brill_tagger.batch_tag(testing_data)
+ taggedtest = brill_tagger.tag_sents(testing_data)
with open(serialize_output, 'w') as print_rules:
pickle.dump(brill_tagger, print_rules)
print("Wrote pickled tagger to {0}".format(serialize_output))
with open(serialize_output, "r") as print_rules:
brill_tagger_reloaded = pickle.load(print_rules)
print("Reloaded pickled tagger from {0}".format(serialize_output))
- taggedtest_reloaded = brill_tagger.batch_tag(testing_data)
+ taggedtest_reloaded = brill_tagger.tag_sents(testing_data)
if taggedtest == taggedtest_reloaded:
print("Reloaded tagger tried on test set, results identical")
else:
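
batch_tag() was renamed tag_sents() in NLTK 3.0, and the demo now uses the
new name. A standalone sketch of the renamed API (the training slice is
arbitrary; assumes the treebank sample corpus is installed):

    from nltk.corpus import treebank
    from nltk.tag import UnigramTagger
    tagger = UnigramTagger(treebank.tagged_sents()[:500])
    # tag_sents() is the NLTK 3.x name for the old batch_tag()
    print(tagger.tag_sents([['The', 'dog', 'barks', '.']]))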
diff --git a/nltk/test/ccg.doctest b/nltk/test/ccg.doctest
index 716da2c..f79b6ec 100644
--- a/nltk/test/ccg.doctest
+++ b/nltk/test/ccg.doctest
@@ -271,3 +271,126 @@ Interesting to point that the two parses are clearly semantically different.
(N\N)
-----------------------------------------------------------------------------------------------------------------------------<
N
+
+
+Unicode support
+---------------
+
+Unicode words are supported.
+
+ >>> from nltk.ccg import chart, lexicon
+
+Lexicons for the tests:
+
+ >>> lex = lexicon.parseLexicon(u'''
+ ... :- S, N, NP, PP
+ ...
+ ... AdjI :: N\\N
+ ... AdjD :: N/N
+ ... AdvD :: S/S
+ ... AdvI :: S\\S
+ ... Det :: NP/N
+ ... PrepNPCompl :: PP/NP
+ ... PrepNAdjN :: S\\S/N
+ ... PrepNAdjNP :: S\\S/NP
+ ... VPNP :: S\\NP/NP
+ ... VPPP :: S\\NP/PP
+ ... VPser :: S\\NP/AdjI
+ ...
+ ... auto => N
+ ... bebidas => N
+ ... cine => N
+ ... ley => N
+ ... libro => N
+ ... ministro => N
+ ... panadería => N
+ ... presidente => N
+ ... super => N
+ ...
+ ... el => Det
+ ... la => Det
+ ... las => Det
+ ... un => Det
+ ...
+ ... Ana => NP
+ ... Pablo => NP
+ ...
+ ... y => var\\.,var/.,var
+ ...
+ ... pero => (S/NP)\\(S/NP)/(S/NP)
+ ...
+ ... anunció => VPNP
+ ... compró => VPNP
+ ... cree => S\\NP/S[dep]
+ ... desmintió => VPNP
+ ... lee => VPNP
+ ... fueron => VPPP
+ ...
+ ... es => VPser
+ ...
+ ... interesante => AdjD
+ ... interesante => AdjI
+ ... nueva => AdjD
+ ... nueva => AdjI
+ ...
+ ... a => PrepNPCompl
+ ... en => PrepNAdjN
+ ... en => PrepNAdjNP
+ ...
+ ... ayer => AdvI
+ ...
+ ... que => (NP\\NP)/(S/NP)
+ ... que => S[dep]/S
+ ... ''')
+
+ >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
+ >>> for parse in parser.parse(u"el ministro anunció pero el presidente desmintió la nueva ley".split()):
+ ... printCCGDerivation(parse)
+ ... break
+ el ministro anunció pero el presidente desmintió la nueva ley
+ (NP/N) N ((S\NP)/NP) (((S/NP)\(S/NP))/(S/NP)) (NP/N) N ((S\NP)/NP) (NP/N) (N/N) N
+ --------Leaf
+ (NP/N)
+ ----------Leaf
+ N
+ ------------------>
+ NP
+ ------------------>T
+ (S/(S\NP))
+ -------------Leaf
+ ((S\NP)/NP)
+ --------------------------Leaf
+ (((S/NP)\(S/NP))/(S/NP))
+ --------Leaf
+ (NP/N)
+ ------------Leaf
+ N
+ -------------------->
+ NP
+ -------------------->T
+ (S/(S\NP))
+ -------------Leaf
+ ((S\NP)/NP)
+ --------------------------------->B
+ (S/NP)
+ ----------------------------------------------------------->
+ ((S/NP)\(S/NP))
+ --------Leaf
+ (NP/N)
+ -------Leaf
+ (N/N)
+ -----Leaf
+ N
+ ------------>
+ N
+ -------------------->
+ NP
+ --------------------<T
+ (S\(S/NP))
+ -------------------------------------------------------------------------------<B
+ (S\(S/NP))
+ --------------------------------------------------------------------------------------------<B
+ (S/NP)
+ -------------------------------------------------------------------------------------------------------------->
+ S
+
diff --git a/setup.cfg b/setup.cfg
index 6bc2ff3..861a9f5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[egg_info]
-tag_date = 0
tag_build =
+tag_date = 0
tag_svn_revision = 0
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/nltk.git