[nltk] 01/03: Imported Upstream version 3.0.4
Daniel Stender
danstender-guest at moszumanska.debian.org
Wed Jul 15 21:38:40 UTC 2015
This is an automated email from the git hooks/post-receive script.
danstender-guest pushed a commit to branch master
in repository nltk.
commit ac6b066117f555a1493fda00606cf6af4434f95d
Author: Daniel Stender <debian at danielstender.com>
Date: Wed Jul 15 23:21:26 2015 +0200
Imported Upstream version 3.0.4
---
 PKG-INFO                            |   2 +-
 README.txt                          |   4 +-
 nltk.egg-info/PKG-INFO              |   2 +-
 nltk/VERSION                        |   2 +-
 nltk/book.py                        |   3 -
 nltk/ccg/lexicon.py                 |   4 +-
 nltk/corpus/__init__.py             |   2 +-
 nltk/corpus/reader/bracket_parse.py |  49 ++++++++++++--
 nltk/draw/dispersion.py             |   4 +-
 nltk/sem/boxer.py                   |   7 +-
 nltk/tbl/demo.py                    |   9 ++-
 nltk/test/ccg.doctest               | 123 ++++++++++++++++++++++++++++++++++++
 setup.cfg                           |   2 +-
 13 files changed, 188 insertions(+), 25 deletions(-)
diff --git a/PKG-INFO b/PKG-INFO
index bdb38c0..4a7ba3b 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: nltk
-Version: 3.0.3
+Version: 3.0.4
Summary: Natural Language Toolkit
Home-page: http://nltk.org/
Author: Steven Bird
diff --git a/README.txt b/README.txt
index 5e5e9bd..9da482d 100644
--- a/README.txt
+++ b/README.txt
@@ -13,7 +13,7 @@ Python modules, data sets and tutorials supporting research and
development in Natural Language Processing.
Documentation: A substantial amount of documentation about how
-to use NLTK, including a textbook and API documention, is
+to use NLTK, including a textbook and API documentation, is
available from the NLTK website: http://nltk.org/
- The book covers a wide range of introductory topics in NLP, and
@@ -21,7 +21,7 @@ available from the NLTK website: http://nltk.org/
- The toolkit's reference documentation describes every module,
interface, class, method, function, and variable in the toolkit.
- This documentation should be useful to both users and developers.
+ This documentation should be useful to both users and developers.
Mailing Lists: There are several mailing lists associated with NLTK:
diff --git a/nltk.egg-info/PKG-INFO b/nltk.egg-info/PKG-INFO
index bdb38c0..4a7ba3b 100644
--- a/nltk.egg-info/PKG-INFO
+++ b/nltk.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: nltk
-Version: 3.0.3
+Version: 3.0.4
Summary: Natural Language Toolkit
Home-page: http://nltk.org/
Author: Steven Bird
diff --git a/nltk/VERSION b/nltk/VERSION
index 75a22a2..b0f2dcb 100644
--- a/nltk/VERSION
+++ b/nltk/VERSION
@@ -1 +1 @@
-3.0.3
+3.0.4
diff --git a/nltk/book.py b/nltk/book.py
index 3d6777e..8cded01 100644
--- a/nltk/book.py
+++ b/nltk/book.py
@@ -10,9 +10,6 @@ from __future__ import print_function
from nltk.corpus import (gutenberg, genesis, inaugural,
nps_chat, webtext, treebank, wordnet)
from nltk.text import Text
-from nltk.probability import FreqDist
-from nltk.util import bigrams
-from nltk.misc import babelize_shell
print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
diff --git a/nltk/ccg/lexicon.py b/nltk/ccg/lexicon.py
index 60badb5..5439a14 100644
--- a/nltk/ccg/lexicon.py
+++ b/nltk/ccg/lexicon.py
@@ -27,7 +27,7 @@ reNextPrim = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''')
reApp = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''')
# Parses the definition of the category of either a word or a family
-reLex = re.compile(r'''([A-Za-z_]+)\s*(::|[-=]+>)\s*(.+)''')
+reLex = re.compile(r'''([\w_]+)\s*(::|[-=]+>)\s*(.+)''', re.UNICODE)
# Strips comments from a line
reComm = re.compile('''([^#]*)(?:#.*)?''')
@@ -183,7 +183,7 @@ def parseLexicon(lex_str):
if line.startswith(':-'):
# A line of primitive categories.
- # The first line is the target category
+ # The first one is the target category
# ie, :- S, N, NP, VP
primitives = primitives + [ prim.strip() for prim in line[2:].strip().split(',') ]
else:
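
The widened pattern, plus re.UNICODE, lets lexicon entries use non-ASCII
words (exercised by the new Unicode doctest further down). A minimal
standalone sketch of the behaviour, not taken from this commit:

    import re
    # same pattern as the new reLex above; under re.UNICODE, \w also
    # matches accented letters such as the í in "panadería"
    reLex = re.compile(r'''([\w_]+)\s*(::|[-=]+>)\s*(.+)''', re.UNICODE)
    print(reLex.match(u'panadería => N').groups())
    # -> ('panadería', '=>', 'N')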
diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py
index 08d7011..33f1a8a 100644
--- a/nltk/corpus/__init__.py
+++ b/nltk/corpus/__init__.py
@@ -142,7 +142,7 @@ machado = LazyCorpusLoader(
r'(?!\.).*\.txt', cat_pattern=r'([a-z]*)/.*', encoding='latin-1')
masc_tagged = LazyCorpusLoader(
'masc_tagged', CategorizedTaggedCorpusReader, r'(spoken|written)/.*\.txt',
- cat_file='categories.txt', tagset='wsj', encoding="ascii", sep="_")
+ cat_file='categories.txt', tagset='wsj', encoding="utf-8", sep="_")
movie_reviews = LazyCorpusLoader(
'movie_reviews', CategorizedPlaintextCorpusReader,
r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
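
Switching the encoding from ASCII to UTF-8 means non-ASCII characters in
the MASC files no longer raise decode errors. A hedged sketch, assuming the
masc_tagged data package has been installed via nltk.download():

    from nltk.corpus import masc_tagged
    # the reader is lazy; files are now decoded as UTF-8 on first access
    print(masc_tagged.tagged_words()[:5])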
diff --git a/nltk/corpus/reader/bracket_parse.py b/nltk/corpus/reader/bracket_parse.py
index 999382b..7785d69 100644
--- a/nltk/corpus/reader/bracket_parse.py
+++ b/nltk/corpus/reader/bracket_parse.py
@@ -18,6 +18,7 @@ from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
# we use [^\s()]+ instead of \S+? to avoid matching ()
+SORTTAGWRD = re.compile(r'\((\d+) ([^\s()]+) ([^\s()]+)\)')
TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')
WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)')
EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')
@@ -97,9 +98,9 @@ class BracketParseCorpusReader(SyntaxCorpusReader):
return Tree('S', self._tag(t))
def _tag(self, t, tagset=None):
- tagged_sent = [(w,t) for (t,w) in TAGWORD.findall(self._normalize(t))]
+ tagged_sent = [(w,p) for (p,w) in TAGWORD.findall(self._normalize(t))]
if tagset and tagset != self._tagset:
- tagged_sent = [(w, map_tag(self._tagset, tagset, t)) for (w,t) in tagged_sent]
+ tagged_sent = [(w, map_tag(self._tagset, tagset, p)) for (w,p) in tagged_sent]
return tagged_sent
def _word(self, t):
@@ -165,6 +166,12 @@ class CategorizedBracketParseCorpusReader(CategorizedCorpusReader,
class AlpinoCorpusReader(BracketParseCorpusReader):
"""
Reader for the Alpino Dutch Treebank.
+ This corpus has an embedded lexical breakdown structure, as read by _parse.
+ Unfortunately this puts punctuation and some other words out of sentence
+ order in the xml element tree, which is no good for _tag and _word.
+ _tag and _word are therefore overridden to pass a new, non-default
+ 'ordered' parameter to the overridden _normalize function; the _parse
+ function itself can remain untouched.
"""
def __init__(self, root, encoding='ISO-8859-1', tagset=None):
BracketParseCorpusReader.__init__(self, root, 'alpino\.xml',
@@ -172,13 +179,47 @@ class AlpinoCorpusReader(BracketParseCorpusReader):
encoding=encoding,
tagset=tagset)
- def _normalize(self, t):
+ def _normalize(self, t, ordered = False):
+ """Normalize the xml sentence element in t.
+ The sentence elements <alpino_ds>, although embedded in a few overall
+ xml elements, are separated by blank lines. That's how the reader can
+ deliver them one at a time.
+ Each sentence has a few category subnodes that are of no use to us.
+ The remaining word nodes may or may not appear in the proper order.
+ Each word node has attributes, among which:
+ - begin : the position of the word in the sentence
+ - pos : Part of Speech: the Tag
+ - word : the actual word
+ The return value is a string with all xml elements replaced by
+ clauses: either a cat clause with nested clauses, or a word clause.
+ The order of the bracket clauses closely follows the xml.
+ If ordered == True, the word clauses include an order sequence number.
+ If ordered == False, the word clauses only have pos and word parts.
+ """
if t[:10] != "<alpino_ds":
return ""
# convert XML to sexpr notation
t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
- t = re.sub(r' <node .*? pos="(\w+)".*? word="([^"]+)".*/>', r"(\1 \2)", t)
+ if ordered:
+ t = re.sub(r' <node .*?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2 \3)", t)
+ else:
+ t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
t = re.sub(r" </node>", r")", t)
t = re.sub(r"<sentence>.*</sentence>", r"", t)
t = re.sub(r"</?alpino_ds.*>", r"", t)
return t
+
+ def _tag(self, t, tagset=None):
+ tagged_sent = [(int(o), w, p) for (o,p,w) in SORTTAGWRD.findall(self._normalize(t, ordered = True))]
+ tagged_sent.sort()
+ if tagset and tagset != self._tagset:
+ tagged_sent = [(w, map_tag(self._tagset, tagset, p)) for (o,w,p) in tagged_sent]
+ else:
+ tagged_sent = [(w,p) for (o,w,p) in tagged_sent]
+ return tagged_sent
+
+ def _word(self, t):
+ """Return a correctly ordered list if words"""
+ tagged_sent = self._tag(t)
+ return [w for (w,p) in tagged_sent]
+
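
To illustrate the new ordered path: _normalize(t, ordered=True) emits word
clauses of the form (begin pos word), which SORTTAGWRD captures so the
words can be re-sorted into sentence order. A self-contained sketch on a
made-up clause string (the sentence is hypothetical, not real Alpino
output):

    import re
    SORTTAGWRD = re.compile(r'\((\d+) ([^\s()]+) ([^\s()]+)\)')
    s = '(smain (1 verb zingt) (0 noun kat) (2 punt .))'
    # findall yields (begin, pos, word); sort on the integer position
    triples = sorted((int(o), w, p) for (o, p, w) in SORTTAGWRD.findall(s))
    print([(w, p) for (o, w, p) in triples])
    # -> [('kat', 'noun'), ('zingt', 'verb'), ('.', 'punt')]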
diff --git a/nltk/draw/dispersion.py b/nltk/draw/dispersion.py
index eddc36f..4de4744 100644
--- a/nltk/draw/dispersion.py
+++ b/nltk/draw/dispersion.py
@@ -9,7 +9,7 @@
A utility for displaying lexical dispersion.
"""
-def dispersion_plot(text, words, ignore_case=False):
+def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
"""
Generate a lexical dispersion plot.
@@ -47,7 +47,7 @@ def dispersion_plot(text, words, ignore_case=False):
pylab.plot(x, y, "b|", scalex=.1)
pylab.yticks(list(range(len(words))), words, color="b")
pylab.ylim(-1, len(words))
- pylab.title("Lexical Dispersion Plot")
+ pylab.title(title)
pylab.xlabel("Word Offset")
pylab.show()
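
The new keyword merely replaces the hard-coded plot title. A hedged usage
sketch, assuming matplotlib is installed and the nltk.book texts are
available:

    from nltk.book import text1
    from nltk.draw.dispersion import dispersion_plot
    # same call as before, but with a caller-supplied title
    dispersion_plot(text1, ["whale", "Ahab", "Starbuck"], title="Moby Dick")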
diff --git a/nltk/sem/boxer.py b/nltk/sem/boxer.py
index 41d74d7..b8685bb 100644
--- a/nltk/sem/boxer.py
+++ b/nltk/sem/boxer.py
@@ -51,7 +51,7 @@ class Boxer(object):
semantic parser that produces Discourse Representation Structures (DRSs).
"""
- def __init__(self, boxer_drs_interpreter=None, elimeq=False, bin_dir=None, verbose=False):
+ def __init__(self, boxer_drs_interpreter=None, elimeq=False, bin_dir=None, verbose=False, resolve=True):
"""
:param boxer_drs_interpreter: A class that converts from the
``AbstractBoxerDrs`` object hierarchy to a different object. The
@@ -60,11 +60,14 @@ class Boxer(object):
:param elimeq: When set to true, Boxer removes all equalities from the
DRSs and discourse referents standing in the equality relation are
unified, but only if this can be done in a meaning-preserving manner.
+ :param resolve: When set to true, Boxer will resolve all anaphoric DRSs and perform merge-reduction.
+ Resolution follows Van der Sandt's theory of binding and accommodation.
"""
if boxer_drs_interpreter is None:
boxer_drs_interpreter = NltkDrtBoxerDrsInterpreter()
self._boxer_drs_interpreter = boxer_drs_interpreter
+ self._resolve = resolve
self._elimeq = elimeq
self.set_bin_dir(bin_dir, verbose)
@@ -172,7 +175,7 @@ class Boxer(object):
args = ['--box', 'false',
'--semantics', 'drs',
#'--flat', 'false', # removed from boxer
- '--resolve', 'true',
+ '--resolve', ['false','true'][self._resolve],
'--elimeq', ['false','true'][self._elimeq],
'--format', 'prolog',
'--instantiate', 'true',
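
The resolve flag is now forwarded to the boxer binary as '--resolve false'
or '--resolve true' instead of being hard-wired to true. A hedged sketch,
assuming the external C&C/Boxer binaries are installed and located via
bin_dir or the usual environment variables:

    from nltk.sem.boxer import Boxer
    # resolve=False passes '--resolve false', skipping anaphora resolution
    b = Boxer(resolve=False)
    print(b.interpret("Every dog barks."))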
diff --git a/nltk/tbl/demo.py b/nltk/tbl/demo.py
index eee2c39..41b0a3f 100644
--- a/nltk/tbl/demo.py
+++ b/nltk/tbl/demo.py
@@ -268,7 +268,7 @@ def postag(
print("Wrote plot of learning curve to {0}".format(learning_curve_output))
else:
print("Tagging the test data")
- taggedtest = brill_tagger.batch_tag(testing_data)
+ taggedtest = brill_tagger.tag_sents(testing_data)
if template_stats:
brill_tagger.print_template_statistics()
@@ -276,20 +276,19 @@ def postag(
if error_output is not None:
with open(error_output, 'w') as f:
f.write('Errors for Brill Tagger %r\n\n' % serialize_output)
- for e in error_list(gold_data, taggedtest):
- f.write(e+'\n')
+ f.write(u'\n'.join(error_list(gold_data, taggedtest)).encode('utf-8') + '\n')
print("Wrote tagger errors including context to {0}".format(error_output))
# serializing the tagger to a pickle file and reloading (just to see it works)
if serialize_output is not None:
- taggedtest = brill_tagger.batch_tag(testing_data)
+ taggedtest = brill_tagger.tag_sents(testing_data)
with open(serialize_output, 'w') as print_rules:
pickle.dump(brill_tagger, print_rules)
print("Wrote pickled tagger to {0}".format(serialize_output))
with open(serialize_output, "r") as print_rules:
brill_tagger_reloaded = pickle.load(print_rules)
print("Reloaded pickled tagger from {0}".format(serialize_output))
- taggedtest_reloaded = brill_tagger.batch_tag(testing_data)
+ taggedtest_reloaded = brill_tagger.tag_sents(testing_data)
if taggedtest == taggedtest_reloaded:
print("Reloaded tagger tried on test set, results identical")
else:
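
batch_tag() was renamed tag_sents() in NLTK 3.0, and the demo now uses the
new name. A standalone sketch of the renamed API (the training slice is
arbitrary; assumes the treebank sample corpus is installed):

    from nltk.corpus import treebank
    from nltk.tag import UnigramTagger
    tagger = UnigramTagger(treebank.tagged_sents()[:500])
    # tag_sents() is the NLTK 3.x name for the old batch_tag()
    print(tagger.tag_sents([['The', 'dog', 'barks', '.']]))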
diff --git a/nltk/test/ccg.doctest b/nltk/test/ccg.doctest
index 716da2c..f79b6ec 100644
--- a/nltk/test/ccg.doctest
+++ b/nltk/test/ccg.doctest
@@ -271,3 +271,126 @@ Interesting to point that the two parses are clearly semantically different.
(N\N)
-----------------------------------------------------------------------------------------------------------------------------<
N
+
+
+Unicode support
+---------------
+
+Unicode words are supported.
+
+ >>> from nltk.ccg import chart, lexicon
+
+Lexicons for the tests:
+
+ >>> lex = lexicon.parseLexicon(u'''
+ ... :- S, N, NP, PP
+ ...
+ ... AdjI :: N\\N
+ ... AdjD :: N/N
+ ... AdvD :: S/S
+ ... AdvI :: S\\S
+ ... Det :: NP/N
+ ... PrepNPCompl :: PP/NP
+ ... PrepNAdjN :: S\\S/N
+ ... PrepNAdjNP :: S\\S/NP
+ ... VPNP :: S\\NP/NP
+ ... VPPP :: S\\NP/PP
+ ... VPser :: S\\NP/AdjI
+ ...
+ ... auto => N
+ ... bebidas => N
+ ... cine => N
+ ... ley => N
+ ... libro => N
+ ... ministro => N
+ ... panadería => N
+ ... presidente => N
+ ... super => N
+ ...
+ ... el => Det
+ ... la => Det
+ ... las => Det
+ ... un => Det
+ ...
+ ... Ana => NP
+ ... Pablo => NP
+ ...
+ ... y => var\\.,var/.,var
+ ...
+ ... pero => (S/NP)\\(S/NP)/(S/NP)
+ ...
+ ... anunció => VPNP
+ ... compró => VPNP
+ ... cree => S\\NP/S[dep]
+ ... desmintió => VPNP
+ ... lee => VPNP
+ ... fueron => VPPP
+ ...
+ ... es => VPser
+ ...
+ ... interesante => AdjD
+ ... interesante => AdjI
+ ... nueva => AdjD
+ ... nueva => AdjI
+ ...
+ ... a => PrepNPCompl
+ ... en => PrepNAdjN
+ ... en => PrepNAdjNP
+ ...
+ ... ayer => AdvI
+ ...
+ ... que => (NP\\NP)/(S/NP)
+ ... que => S[dep]/S
+ ... ''')
+
+ >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
+ >>> for parse in parser.parse(u"el ministro anunció pero el presidente desmintió la nueva ley".split()):
+ ... printCCGDerivation(parse)
+ ... break
+ el ministro anunció pero el presidente desmintió la nueva ley
+ (NP/N) N ((S\NP)/NP) (((S/NP)\(S/NP))/(S/NP)) (NP/N) N ((S\NP)/NP) (NP/N) (N/N) N
+ --------Leaf
+ (NP/N)
+ ----------Leaf
+ N
+ ------------------>
+ NP
+ ------------------>T
+ (S/(S\NP))
+ -------------Leaf
+ ((S\NP)/NP)
+ --------------------------Leaf
+ (((S/NP)\(S/NP))/(S/NP))
+ --------Leaf
+ (NP/N)
+ ------------Leaf
+ N
+ -------------------->
+ NP
+ -------------------->T
+ (S/(S\NP))
+ -------------Leaf
+ ((S\NP)/NP)
+ --------------------------------->B
+ (S/NP)
+ ----------------------------------------------------------->
+ ((S/NP)\(S/NP))
+ --------Leaf
+ (NP/N)
+ -------Leaf
+ (N/N)
+ -----Leaf
+ N
+ ------------>
+ N
+ -------------------->
+ NP
+ --------------------<T
+ (S\(S/NP))
+ -------------------------------------------------------------------------------<B
+ (S\(S/NP))
+ --------------------------------------------------------------------------------------------<B
+ (S/NP)
+ -------------------------------------------------------------------------------------------------------------->
+ S
+
diff --git a/setup.cfg b/setup.cfg
index 6bc2ff3..861a9f5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[egg_info]
-tag_date = 0
tag_build =
+tag_date = 0
tag_svn_revision = 0
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/nltk.git