[python-pynlpl] 01/03: New upstream version 1.1.2
Maarten van Gompel
proycon-guest at moszumanska.debian.org
Mon Jan 2 11:23:15 UTC 2017
This is an automated email from the git hooks/post-receive script.
proycon-guest pushed a commit to branch master
in repository python-pynlpl.
commit 749b9475534fcdda59afed895e300f930ca7ac96
Author: proycon <proycon at anaproy.nl>
Date: Mon Jan 2 12:22:15 2017 +0100
New upstream version 1.1.2
---
PKG-INFO | 2 +-
PyNLPl.egg-info/PKG-INFO | 2 +-
PyNLPl.egg-info/SOURCES.txt | 1 +
PyNLPl.egg-info/requires.txt | 1 +
pynlpl/__init__.py | 2 +-
pynlpl/docs/conf.py | 4 +-
pynlpl/formats/folia.py | 499 +++++++++++++++++--------------------------
pynlpl/formats/foliaset.py | 456 +++++++++++++++++++++++++++++++++++++++
pynlpl/formats/fql.py | 375 +++++++++++++++++---------------
pynlpl/formats/timbl.py | 7 +-
pynlpl/lm/srilm.py | 18 +-
pynlpl/tests/folia.py | 58 +++--
pynlpl/tests/fql.py | 122 +++++++++++
setup.py | 4 +-
14 files changed, 1050 insertions(+), 501 deletions(-)
diff --git a/PKG-INFO b/PKG-INFO
index 41044d6..3a329b7 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: PyNLPl
-Version: 1.0.9
+Version: 1.1.2
Summary: PyNLPl, pronounced as 'pineapple', is a Python library for Natural Language Processing. It contains various modules useful for common, and less common, NLP tasks. PyNLPl contains modules for basic tasks, clients for interfacing with servers, and modules for parsing several file formats common in NLP, most notably FoLiA.
Home-page: https://github.com/proycon/pynlpl
Author: Maarten van Gompel
diff --git a/PyNLPl.egg-info/PKG-INFO b/PyNLPl.egg-info/PKG-INFO
index 41044d6..3a329b7 100644
--- a/PyNLPl.egg-info/PKG-INFO
+++ b/PyNLPl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: PyNLPl
-Version: 1.0.9
+Version: 1.1.2
Summary: PyNLPl, pronounced as 'pineapple', is a Python library for Natural Language Processing. It contains various modules useful for common, and less common, NLP tasks. PyNLPl contains modules for basic tasks, clients for interfacing with servers, and modules for parsing several file formats common in NLP, most notably FoLiA.
Home-page: https://github.com/proycon/pynlpl
Author: Maarten van Gompel
diff --git a/PyNLPl.egg-info/SOURCES.txt b/PyNLPl.egg-info/SOURCES.txt
index 2161340..e9d9144 100644
--- a/PyNLPl.egg-info/SOURCES.txt
+++ b/PyNLPl.egg-info/SOURCES.txt
@@ -32,6 +32,7 @@ pynlpl/formats/cgn.py
pynlpl/formats/cql.py
pynlpl/formats/dutchsemcor.py
pynlpl/formats/folia.py
+pynlpl/formats/foliaset.py
pynlpl/formats/fql.py
pynlpl/formats/giza.py
pynlpl/formats/imdi.py
diff --git a/PyNLPl.egg-info/requires.txt b/PyNLPl.egg-info/requires.txt
index 05787db..5e36e8a 100644
--- a/PyNLPl.egg-info/requires.txt
+++ b/PyNLPl.egg-info/requires.txt
@@ -1,2 +1,3 @@
lxml >= 2.2
httplib2 >= 0.6
+rdflib
diff --git a/pynlpl/__init__.py b/pynlpl/__init__.py
index aca03ae..0087ef6 100644
--- a/pynlpl/__init__.py
+++ b/pynlpl/__init__.py
@@ -2,4 +2,4 @@
The library is divided into several packages and modules. It is designed for Python 2.6 and upwards. Including Python 3."""
-VERSION = "1.0.9"
+VERSION = "1.1.2"
diff --git a/pynlpl/docs/conf.py b/pynlpl/docs/conf.py
index 5b260f4..f052138 100644
--- a/pynlpl/docs/conf.py
+++ b/pynlpl/docs/conf.py
@@ -12,13 +12,15 @@
# serve to show the default.
import sys, os
-from pynlpl import VERSION
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.append(os.path.abspath('.'))
+sys.path.append(os.path.abspath('../../'))
+from pynlpl import VERSION
+
# -- General configuration -----------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be extensions
diff --git a/pynlpl/formats/folia.py b/pynlpl/formats/folia.py
index da9df0e..5dcba30 100644
--- a/pynlpl/formats/folia.py
+++ b/pynlpl/formats/folia.py
@@ -62,6 +62,7 @@ else:
stdout = sys.stdout
from pynlpl.common import u, isstring
+from pynlpl.formats.foliaset import SetDefinition, DeepValidationError
import pynlpl.algorithms
@@ -69,9 +70,9 @@ LXE=True #use lxml instead of built-in ElementTree (default)
#foliaspec:version:FOLIAVERSION
#The FoLiA version
-FOLIAVERSION = "1.3.2"
+FOLIAVERSION = "1.4.0"
-LIBVERSION = FOLIAVERSION + '.83' #== FoLiA version + library revision
+LIBVERSION = FOLIAVERSION + '.84' #== FoLiA version + library revision
#0.9.1.31 is the first version with Python 3 support
@@ -79,6 +80,7 @@ LIBVERSION = FOLIAVERSION + '.83' #== FoLiA version + library revision
#The FoLiA XML namespace
NSFOLIA = "http://ilk.uvt.nl/folia"
+
NSDCOI = "http://lands.let.ru.nl/projects/d-coi/ns/1.0"
nslen = len(NSFOLIA) + 2
nslendcoi = len(NSDCOI) + 2
@@ -109,9 +111,9 @@ class Mode:
XPATH = 1 #The full XML structure will be loaded into memory, but conversion to FoLiA objects occurs only upon querying. The full power of XPath is available.
class AnnotatorType:
- UNSET = 0
- AUTO = 1
- MANUAL = 2
+ UNSET = None
+ AUTO = "auto"
+ MANUAL = "manual"
#foliaspec:attributes
@@ -164,12 +166,6 @@ class UnresolvableTextContent(Exception):
class MalformedXMLError(Exception):
pass
-class DeepValidationError(Exception):
- pass
-
-class SetDefinitionError(DeepValidationError):
- pass
-
class ModeError(Exception):
pass
@@ -597,11 +593,12 @@ class AbstractElement(object):
isinstance(x, AbstractElement)
Generic FoLiA attributes can be accessed on all instances derived from this class:
+
* ``element.id`` (str) - The unique identifier of the element
* ``element.set`` (str) - The set the element pertains to.
- * ``element.cls`` (str) - The assigned class, i.e. the actual value of
- the annotation, defined in the set. Classes correspond with tagsets in this case of many annotation types.
- Note that since *class* is already a reserved keyword in python, the library consistently uses ``cls`` everywhere.
+ * ``element.cls`` (str) - The assigned class, i.e. the actual value of \
+ the annotation, defined in the set. Classes correspond with tagsets in this case of many annotation types. \
+ Note that since *class* is already a reserved keyword in python, the library consistently uses ``cls`` everywhere.
* ``element.annotator`` (str) - The name or ID of the annotator who added/modified this element
* ``element.annotatortype`` - The type of annotator, can be either ``folia.AnnotatorType.MANUAL`` or ``folia.AnnotatorType.AUTO``
* ``element.confidence`` (float) - A confidence value expressing
@@ -669,7 +666,7 @@ class AbstractElement(object):
def __getattr__(self, attr):
"""Internal method"""
#overriding getattr so we can get defaults here rather than needing a copy on each element, saves memory
- if attr in ('set','cls','confidence','annotator','annotatortype','datetime','n','href','src','speaker','begintime','endtime','xlinktype','xlinktitle','xlinklabel','xlinkrole','xlinkshow'):
+ if attr in ('set','cls','confidence','annotator','annotatortype','datetime','n','href','src','speaker','begintime','endtime','xlinktype','xlinktitle','xlinklabel','xlinkrole','xlinkshow','label'):
return None
else:
return super(AbstractElement, self).__getattribute__(attr)
@@ -1353,8 +1350,14 @@ class AbstractElement(object):
try:
self.doc.setdefinitions[self.set].testclass(self.cls)
except KeyError:
- if not self.doc.allowadhocsets:
- raise DeepValidationError("Set definition for " + self.set + " not loaded!")
+ if self.cls and not self.doc.allowadhocsets:
+ raise DeepValidationError("Set definition " + self.set + " for " + self.XMLTAG + " not loaded!")
+ except DeepValidationError as e:
+ errormsg = str(e) + " (in set " + self.set+" for " + self.XMLTAG
+ if self.id:
+ errormsg += " with ID " + self.id
+ errormsg += ")"
+ raise DeepValidationError(errormsg)
def append(self, child, *args, **kwargs):
"""Append a child element.
@@ -1564,39 +1567,54 @@ class AbstractElement(object):
:meth:`insert`
:meth:`replace`
""".format(generic_attribs=DOCSTRING_GENERIC_ATTRIBS)
- addspanfromspanned = False
- if isinstance(self,AbstractStructureElement):
- if inspect.isclass(child):
- if issubclass(child, AbstractSpanAnnotation):
- layerclass = ANNOTATIONTYPE2LAYERCLASS[child.ANNOTATIONTYPE]
- addspanfromspanned = True
- elif isinstance(child, AbstractSpanAnnotation):
- layerclass = ANNOTATIONTYPE2LAYERCLASS[child.ANNOTATIONTYPE]
+
+ addspanfromspanned = False #add a span annotation element from that which is spanned (i.e. a Word, Morpheme)
+ addspanfromstructure = False #add a span annotation elements from a structural parent which holds the span layers? (e.g. a Sentence, Paragraph)
+ if (inspect.isclass(child) and issubclass(child, AbstractSpanAnnotation)) or (not inspect.isclass(child) and isinstance(child, AbstractSpanAnnotation)):
+ layerclass = ANNOTATIONTYPE2LAYERCLASS[child.ANNOTATIONTYPE]
+ if isinstance(self, (Word, Morpheme)):
addspanfromspanned = True
+ elif isinstance(self,AbstractStructureElement): #add a span
+ addspanfromstructure = True
- if not addspanfromspanned: #pylint: disable=too-many-nested-blocks
- return self.append(child,*args,**kwargs)
- else:
+ if addspanfromspanned or addspanfromstructure:
+ #get the set
+ if 'set' in kwargs:
+ set = kwargs['set']
+ else:
+ try:
+ set = self.doc.defaultset(layerclass)
+ except KeyError:
+ raise Exception("No set defined when adding span annotation and none could be inferred")
+
+ if addspanfromspanned: #pylint: disable=too-many-nested-blocks
#collect ancestors of the current element,
allowedparents = [self] + list(self.ancestors(AbstractStructureElement))
#find common ancestors of structure elements in the arguments, and check whether it has the required annotation layer, create one if necessary
for e in commonancestors(AbstractStructureElement, *[ x for x in args if isinstance(x, AbstractStructureElement)] ):
if e in allowedparents: #is the element in the list of allowed parents according to this element?
if AbstractAnnotationLayer in e.ACCEPTED_DATA or layerclass in e.ACCEPTED_DATA:
- if 'set' in kwargs:
- set = kwargs['set']
- else:
- try:
- set = self.doc.defaultset(layerclass)
- except KeyError:
- raise Exception("No set defined when adding span annotation and none could be inferred")
try:
layer = next(e.select(layerclass,set,True))
except StopIteration:
layer = e.append(layerclass)
- return layer.append(child,*args,**kwargs)
+ if 'emptyspan' in kwargs and kwargs['emptyspan']:
+ del kwargs['emptyspan']
+ return layer.append(child,*[],**kwargs)
+ else:
+ return layer.append(child,*args,**kwargs)
raise Exception("Unable to find suitable common ancestor to create annotation layer")
+ elif addspanfromstructure:
+ layer = None
+ for layer in self.layers(child.ANNOTATIONTYPE, set):
+ pass #last one (only one actually) should be available in outer context
+ if layer is None:
+ layer = self.append(layerclass)
+ return layer.append(child,*args,**kwargs)
+ else:
+ #normal behaviour, append
+ return self.append(child,*args,**kwargs)
@@ -2117,6 +2135,7 @@ class AbstractElement(object):
returnnext = False
for e in order(parent):
if e is child:
+ #we found the current item, next item will be the one to return
returnnext = True
elif returnnext and e.auth and not isinstance(e,AbstractAnnotationLayer) and (not structural or (structural and (not isinstance(e,(AbstractTokenAnnotation,TextContent)) ) )):
if structural and isinstance(e,Correction):
@@ -2141,7 +2160,7 @@ class AbstractElement(object):
#generational iteration
child = parent
- if scope is not None and child in scope:
+ if scope is not None and child.__class__ in scope:
#you shall not pass!
break
parent = parent.parent
@@ -2357,7 +2376,7 @@ class AbstractElement(object):
"""Internal class method used for turning an XML element into an instance of the Class.
Args:
- * ``node`' - XML Element
+ * ``node`` - XML Element
* ``doc`` - Document
Returns:
@@ -3191,6 +3210,7 @@ class TextContent(AbstractElement):
on structure elements like :class:`Paragraph` and :class:`Sentence` are by definition untokenised. Only on :class:`Word`` level and deeper they are by definition tokenised.
Text content elements can specify offset that refer to text at a higher parent level. Use the following keyword arguments:
+
* ``ref=``: The instance to point to, this points to the element holding the text content element, not the text content element itself.
* ``offset=``: The offset where this text is found, offsets start at 0
"""
@@ -3276,8 +3296,8 @@ class TextContent(AbstractElement):
#finally, we made it!
return True
-
-
+ def deepvalidation(self):
+ return True
def __unicode__(self):
@@ -3405,9 +3425,10 @@ class TextContent(AbstractElement):
class PhonContent(AbstractElement):
"""Phonetic content element (``ph``), holds a phonetic representation to be associated with whatever element the phonetic content element is a child of.
- Phonetic content elements behave much like text content elements
+ Phonetic content elements behave much like text content elements.
Phonetic content elements can specify offset that refer to phonetic content at a higher parent level. Use the following keyword arguments:
+
* ``ref=``: The instance to point to, this points to the element holding the text content element, not the text content element itself.
* ``offset=``: The offset where this text is found, offsets start at 0
"""
@@ -3486,6 +3507,8 @@ class PhonContent(AbstractElement):
#finally, we made it!
return True
+ def deepvalidation(self):
+ return True
def __unicode__(self):
@@ -3895,7 +3918,7 @@ class Word(AbstractStructureElement, AllowCorrections):
if not attribs: attribs = {}
if not self.space:
attribs['space'] = 'no'
- return super(Word,self).json(attribs, recurse)
+ return super(Word,self).json(attribs, recurse,ignorelist)
@classmethod
def relaxng(cls, includechildren=True,extraattribs = None, extraelements=None):
@@ -3989,7 +4012,7 @@ class Feature(AbstractElement):
elif 'subset' in kwargs:
self.subset = kwargs['subset']
else:
- raise Exception("No subset specified for " + + self.__class__.__name__)
+ raise Exception("No subset specified for " + self.__class__.__name__)
if 'cls' in kwargs:
self.cls = kwargs['cls']
elif 'class' in kwargs:
@@ -4009,7 +4032,7 @@ class Feature(AbstractElement):
return makeelement(E,'{' + NSFOLIA + '}' + self.XMLTAG, **attribs)
def json(self,attribs=None, recurse=True, ignorelist=False):
- jsonnode= {'type': self.XMLTAG}
+ jsonnode= {'type': Feature.XMLTAG}
jsonnode['subset'] = self.subset
jsonnode['class'] = self.cls
return jsonnode
@@ -4019,6 +4042,25 @@ class Feature(AbstractElement):
E = ElementMaker(namespace="http://relaxng.org/ns/structure/1.0",nsmap={None:'http://relaxng.org/ns/structure/1.0' , 'folia': "http://ilk.uvt.nl/folia", 'xml' : "http://www.w3.org/XML/1998/namespace"})
return E.define( E.element(E.attribute(name='subset'), E.attribute(name='class'),name=cls.XMLTAG), name=cls.XMLTAG,ns=NSFOLIA)
+ def deepvalidation(self):
+ """Perform deep validation of this element.
+
+ Raises:
+ :class:`DeepValidationError`
+ """
+ if self.doc and self.doc.deepvalidation and self.parent.set and self.parent.set[0] != '_':
+ try:
+ self.doc.setdefinitions[self.parent.set].testsubclass(self.parent.cls, self.subset, self.cls)
+ except KeyError as e:
+ if self.parent.cls and not self.doc.allowadhocsets:
+ raise DeepValidationError("Set definition " + self.parent.set + " for " + self.parent.XMLTAG + " not loaded (feature validation failed)!")
+ except DeepValidationError as e:
+ errormsg = str(e) + " (in set " + self.parent.set+" for " + self.parent.XMLTAG
+ if self.parent.id:
+ errormsg += " with ID " + self.parent.id
+ errormsg += ")"
+ raise DeepValidationError(errormsg)
+
class ValueFeature(Feature):
"""Value feature, to be used within :class:`Metric`"""
@@ -4108,7 +4150,7 @@ class AbstractSpanAnnotation(AbstractElement, AllowGenerateID, AllowCorrections)
if not found:
raise NoSuchAnnotation()
- def _helper_wrefs(self, targets):
+ def _helper_wrefs(self, targets, recurse=True):
"""Internal helper function"""
for c in self:
if isinstance(c,Word) or isinstance(c,Morpheme) or isinstance(c, Phoneme):
@@ -4118,7 +4160,7 @@ class AbstractSpanAnnotation(AbstractElement, AllowGenerateID, AllowCorrections)
targets.append(self.doc[c.id]) #try to resolve
except KeyError:
targets.append(c) #add unresolved
- elif isinstance(c, AbstractSpanAnnotation):
+ elif isinstance(c, AbstractSpanAnnotation) and recurse:
#recursion
c._helper_wrefs(targets) #pylint: disable=protected-access
elif isinstance(c, Correction) and c.auth: #recurse into corrections
@@ -4129,14 +4171,14 @@ class AbstractSpanAnnotation(AbstractElement, AllowGenerateID, AllowCorrections)
#recursion
e2._helper_wrefs(targets) #pylint: disable=protected-access
- def wrefs(self, index = None):
+ def wrefs(self, index = None, recurse=True):
"""Returns a list of word references, these can be Words but also Morphemes or Phonemes.
Arguments:
index (int or None): If set to an integer, will retrieve and return the n'th element (starting at 0) instead of returning the list of all
"""
targets =[]
- self._helper_wrefs(targets)
+ self._helper_wrefs(targets, recurse)
if index is None:
return targets
else:
@@ -4164,6 +4206,21 @@ class AbstractSpanAnnotation(AbstractElement, AllowGenerateID, AllowCorrections)
else:
yield c.copy(newdoc,idsuffix)
+ def postappend(self):
+ super(AbstractSpanAnnotation,self).postappend()
+
+ #If a span annotation element with wrefs x y z is added in the scope of parent span annotation element with wrefs u v w x y z, then x y z is removed from the parent span (no duplication, implicit through recursion)
+ e = self.parent
+ directwrefs = None #will be populated on first iteration
+ while isinstance(e, AbstractSpanAnnotation):
+ if directwrefs is None:
+ directwrefs = self.wrefs(recurse=False)
+ for wref in directwrefs:
+ try:
+ e.data.remove(wref)
+ except ValueError:
+ pass
+ e = e.parent
class AbstractAnnotationLayer(AbstractElement, AllowGenerateID, AllowCorrections):
"""Annotation layers for Span Annotation are derived from this abstract base class"""
@@ -4289,6 +4346,9 @@ class AbstractAnnotationLayer(AbstractElement, AllowGenerateID, AllowCorrections
extraattribs.append(E.optional(E.attribute(E.text(), name='set')) )
return AbstractElement.relaxng(includechildren, extraattribs, extraelements, cls)
+ def deepvalidation(self):
+ return True
+
# class AbstractSubtokenAnnotationLayer(AbstractElement, AllowGenerateID):
# """Annotation layers for Subtoken Annotation are derived from this abstract base class"""
# OPTIONAL_ATTRIBS = ()
@@ -4311,6 +4371,9 @@ class AbstractCorrectionChild(AbstractElement):
#Delegate ID generation to parent
return self.parent.generate_id(cls)
+ def deepvalidation(self):
+ return True
+
class Reference(AbstractStructureElement):
"""A structural element that denotes a reference, internal or external. Examples are references to footnotes, bibliographies, hyperlinks."""
@@ -4827,7 +4890,7 @@ class Correction(AbstractElement, AllowGenerateID):
"""Get suggestions for correction.
Yields:
- :class:`Suggestion` element that encapsulate the suggested annotations (if index is ``None`, default)
+ :class:`Suggestion` element that encapsulate the suggested annotations (if index is ``None``, default)
Returns:
a :class:`Suggestion` element that encapsulate the suggested annotations (if index is set)
@@ -4906,13 +4969,16 @@ class Alternative(AbstractElement, AllowTokenAnnotation, AllowGenerateID):
pos tag alternative is tied to a particular lemma.
"""
- pass
+ def deepvalidation(self):
+ return True
class AlternativeLayers(AbstractElement):
"""Element grouping alternative subtoken annotation(s). Multiple altlayers elements may occur, each denoting a different alternative. Elements grouped inside an alternative block are considered dependent."""
- pass
+
+ def deepvalidation(self):
+ return True
@@ -6008,6 +6074,11 @@ class Document(object):
else:
self.debug = False
+ if 'verbose' in kwargs:
+ self.verbose = kwargs['verbose']
+ else:
+ self.verbose = False
+
if 'mode' in kwargs:
self.mode = int(kwargs['mode'])
else:
@@ -6039,10 +6110,12 @@ class Document(object):
if 'deepvalidation' in kwargs:
self.deepvalidation = bool(kwargs['deepvalidation'])
- self.loadsetdefinitions = True
else:
self.deepvalidation = False
+ if self.deepvalidation:
+ self.loadsetdefinitions = True
+
if 'allowadhocsets' in kwargs:
self.allowadhocsets = bool(kwargs['allowadhocsets'])
else:
@@ -6450,7 +6523,7 @@ class Document(object):
if set and self.loadsetdefinitions and set not in self.setdefinitions:
if set[:7] == "http://" or set[:8] == "https://" or set[:6] == "ftp://":
try:
- self.setdefinitions[set] = loadsetdefinition(set) #will raise exception on error
+ self.setdefinitions[set] = SetDefinition(set,verbose=self.verbose) #will raise exception on error
except DeepValidationError:
print("WARNING: Set " + set + " could not be downloaded, ignoring!",file=sys.stderr) #warning and ignore
@@ -7163,252 +7236,8 @@ class CorpusProcessor(object):
-class SetType:
- CLOSED, OPEN, MIXED = range(3)
-
-class AbstractDefinition(object):
- pass
-
-class ConstraintDefinition(object):
- def __init__(self, id, restrictions = {}, exceptions = {}):
- self.id = id
- self.restrictions = restrictions
- self.exceptions = exceptions
-
- @classmethod
- def parsexml(Class, node, constraintindex):
- assert node.tag == '{' + NSFOLIA + '}constraint'
-
- if 'ref' in node.attrib:
- try:
- return constraintindex[node.attrib['ref']]
- except KeyError:
- raise KeyError("Unresolvable constraint: " + node.attrib['ref'])
-
-
-
- restrictions = []
- exceptions = []
- for subnode in node:
- if isinstance(subnode.tag, str) or (sys.version < '3' and isinstance(subnode.tag, unicode)):
- if subnode.tag == '{' + NSFOLIA + '}restrict':
- if 'subset' in subnode.attrib:
- restrictions.append( (subnode.attrib['subset'], subnode.attrib['class']) )
- else:
- restrictions.append( (None, subnode.attrib['class']) )
- elif subnode.tag == '{' + NSFOLIA + '}except':
- if 'subset' in subnode.attrib:
- exceptions.append( (subnode.attrib['subset'], subnode.attrib['class']) )
- else:
- exceptions.append( (None, subnode.attrib['class']) )
-
- if '{http://www.w3.org/XML/1998/namespace}id' in node.attrib:
- id = node.attrib['{http://www.w3.org/XML/1998/namespace}id']
- instance = ConstraintDefinition(id, restrictions,exceptions)
- constraintindex[id] = instance
- else:
- instance = ConstraintDefinition(None, restrictions,exceptions)
- return instance
-
-
- def json(self):
- return {'id': self.id} #TODO: Implement
-
-class ClassDefinition(AbstractDefinition):
- def __init__(self,id, label, constraints=[], subclasses=[]):
- self.id = id
- self.label = label
- self.constraints = constraints
- self.subclasses = subclasses
-
- @classmethod
- def parsexml(Class, node, constraintindex):
- if not node.tag == '{' + NSFOLIA + '}class':
- raise Exception("Expected class tag for this xml node, got" + node.tag)
-
- if 'label' in node.attrib:
- label = node.attrib['label']
- else:
- label = ""
-
- constraints = []
- subclasses= []
- for subnode in node:
- if isinstance(subnode.tag, str) or (sys.version < '3' and isinstance(subnode.tag, unicode)):
- if subnode.tag == '{' + NSFOLIA + '}constraint':
- constraints.append( ConstraintDefinition.parsexml(subnode, constraintindex) )
- elif subnode.tag == '{' + NSFOLIA + '}class':
- subclasses.append( ClassDefinition.parsexml(subnode, constraintindex) )
- elif subnode.tag[:len(NSFOLIA) +2] == '{' + NSFOLIA + '}':
- raise Exception("Invalid tag in Class definition: " + subnode.tag)
-
- return ClassDefinition(node.attrib['{http://www.w3.org/XML/1998/namespace}id'],label, constraints, subclasses)
-
-
- def __iter__(self):
- for c in self.subclasses:
- yield c
-
- def json(self):
- jsonnode = {'id': self.id, 'label': self.label}
- jsonnode['constraints'] = []
- jsonnode['subclasses'] = []
- for constraint in self.constraints:
- jsonnode['constaints'].append(constraint.json())
- for subclass in self.subclasses:
- jsonnode['subclasses'].append(subclass.json())
- return jsonnode
-
-class SubsetDefinition(AbstractDefinition):
- def __init__(self, id, type, classes = [], constraints = []):
- self.id = id
- self.type = type
- self.classes = classes
- self.constraints = constraints
-
- @classmethod
- def parsexml(Class, node, constraintindex= {}):
- if not node.tag == '{' + NSFOLIA + '}subset':
- raise Exception("Expected subset tag for this xml node, got" + node.tag)
-
- if 'type' in node.attrib:
- if node.attrib['type'] == 'open':
- type = SetType.OPEN
- elif node.attrib['type'] == 'closed':
- type = SetType.CLOSED
- elif node.attrib['type'] == 'mixed':
- type = SetType.MIXED
- else:
- raise Exception("Invalid set type: ", type)
- else:
- type = SetType.MIXED
-
- classes = []
- constraints = []
- for subnode in node:
- if isinstance(subnode.tag, str) or (sys.version < '3' and isinstance(subnode.tag, unicode)):
- if subnode.tag == '{' + NSFOLIA + '}class':
- classes.append( ClassDefinition.parsexml(subnode, constraintindex) )
- elif subnode.tag == '{' + NSFOLIA + '}constraint':
- constraints.append( ConstraintDefinition.parsexml(subnode, constraintindex) )
- elif subnode.tag[:len(NSFOLIA) +2] == '{' + NSFOLIA + '}':
- raise Exception("Invalid tag in Set definition: " + subnode.tag)
-
- return SubsetDefinition(node.attrib['{http://www.w3.org/XML/1998/namespace}id'],type,classes, constraints)
-
-
- def json(self):
- jsonnode = {'id': self.id}
- if self.type == SetType.OPEN:
- jsonnode['type'] = 'open'
- elif self.type == SetType.CLOSED:
- jsonnode['type'] = 'closed'
- elif self.type == SetType.MIXED:
- jsonnode['type'] = 'mixed'
- jsonnode['constraints'] = []
- for constraint in self.constraints:
- jsonnode['constraints'].append(constraint.json())
- jsonnode['classes'] = {}
- for c in self.classes:
- jsonnode['classes'][c.id] = c.json()
- return jsonnode
-
-class SetDefinition(AbstractDefinition):
- def __init__(self, id, type, classes = [], subsets = [], constraintindex = {}, label =None):
- isncname(id)
- self.id = id
- self.type = type
- self.label = label
- self.classes = classes
- self.subsets = subsets
- self.constraintindex = constraintindex
-
-
- @classmethod
- def parsexml(Class, node):
- assert node.tag == '{' + NSFOLIA + '}set'
- classes = []
- subsets= []
- constraintindex = {}
- if 'type' in node.attrib:
- if node.attrib['type'] == 'open':
- type = SetType.OPEN
- elif node.attrib['type'] == 'closed':
- type = SetType.CLOSED
- elif node.attrib['type'] == 'mixed':
- type = SetType.MIXED
- else:
- raise Exception("Invalid set type: ", type)
- else:
- type = SetType.MIXED
-
- if 'label' in node.attrib:
- label = node.attrib['label']
- else:
- label = None
-
- for subnode in node:
- if isinstance(subnode.tag, str) or (sys.version < '3' and isinstance(subnode.tag, unicode)):
- if subnode.tag == '{' + NSFOLIA + '}class':
- classes.append( ClassDefinition.parsexml(subnode, constraintindex) )
- elif subnode.tag == '{' + NSFOLIA + '}subset':
- subsets.append( SubsetDefinition.parsexml(subnode, constraintindex) )
- elif subnode.tag == '{' + NSFOLIA + '}constraint':
- pass
- elif subnode.tag[:len(NSFOLIA) +2] == '{' + NSFOLIA + '}':
- raise SetDefinitionError("Invalid tag in Set definition: " + subnode.tag)
-
- return SetDefinition(node.attrib['{http://www.w3.org/XML/1998/namespace}id'],type,classes, subsets, constraintindex, label)
-
- def testclass(self,cls):
- raise NotImplementedError #TODO, IMPLEMENT!
-
- def testsubclass(self, cls, subset, subclass):
- raise NotImplementedError #TODO, IMPLEMENT!
-
- def json(self):
- jsonnode = {'id': self.id}
- if self.label:
- jsonnode['label'] = self.label
- if self.type == SetType.OPEN:
- jsonnode['type'] = 'open'
- elif self.type == SetType.CLOSED:
- jsonnode['type'] = 'closed'
- elif self.type == SetType.MIXED:
- jsonnode['type'] = 'mixed'
- jsonnode['subsets'] = {}
- for subset in self.subsets:
- jsonnode['subsets'][subset.id] = subset.json()
- jsonnode['classes'] = {}
- jsonnode['classorder'] = []
- for c in sorted(self.classes, key=lambda x: x.label):
- jsonnode['classes'][c.id] = c.json()
- jsonnode['classorder'].append( c.id )
- return jsonnode
-
-def loadsetdefinition(filename):
- if filename[0] == '/' or filename[0] == '.':
- try:
- tree = ElementTree.parse(filename, ElementTree.XMLParser(collect_ids=False))
- except TypeError:
- tree = ElementTree.parse(filename, ElementTree.XMLParser())
- else:
- try:
- f = urlopen(filename)
- except:
- raise DeepValidationError("Unable to download " + filename)
- try:
- tree = xmltreefromstring(f.read())
- except IOError:
- raise DeepValidationError("Unable to download " + filename)
- f.close()
- root = tree.getroot()
- if root.tag != '{' + NSFOLIA + '}set':
- raise SetDefinitionError("Not a FoLiA Set Definition! Unexpected root tag:"+ root.tag)
-
- return SetDefinition.parsexml(root)
def relaxng_declarations():
@@ -7744,7 +7573,7 @@ def validate(filename,schema=None,deep=False):
#================================= FOLIA SPECIFICATION ==========================================================
#foliaspec:header
-#This file was last updated according to the FoLiA specification for version 1.3.2 on 2016-10-11 11:17:52, using foliaspec.py
+#This file was last updated according to the FoLiA specification for version 1.4.0 on 2016-12-09 14:31:07, using foliaspec.py
#Code blocks after a foliaspec comment (until the next newline) are automatically generated. **DO NOT EDIT THOSE** and **DO NOT REMOVE ANY FOLIASPEC COMMENTS** !!!
#foliaspec:structurescope:STRUCTURESCOPE
@@ -8006,6 +7835,7 @@ AlignReference.XMLTAG = "aref"
#------ Alignment -------
Alignment.ACCEPTED_DATA = (AlignReference, Comment, Description, Feature, ForeignData, Metric,)
Alignment.ANNOTATIONTYPE = AnnotationType.ALIGNMENT
+Alignment.LABEL = "Alignment"
Alignment.OPTIONAL_ATTRIBS = (Attrib.ID, Attrib.CLASS, Attrib.ANNOTATOR, Attrib.N, Attrib.CONFIDENCE, Attrib.DATETIME, Attrib.SRC, Attrib.BEGINTIME, Attrib.ENDTIME, Attrib.SPEAKER,)
Alignment.PRINTABLE = False
Alignment.REQUIRED_ATTRIBS = None
@@ -8015,6 +7845,7 @@ Alignment.XMLTAG = "alignment"
#------ Alternative -------
Alternative.ACCEPTED_DATA = (AbstractTokenAnnotation, Comment, Correction, Description, ForeignData, MorphologyLayer, PhonologyLayer,)
Alternative.AUTH = False
+Alternative.LABEL = "Alternative"
Alternative.OPTIONAL_ATTRIBS = (Attrib.ID, Attrib.CLASS, Attrib.ANNOTATOR, Attrib.N, Attrib.CONFIDENCE, Attrib.DATETIME, Attrib.SRC, Attrib.BEGINTIME, Attrib.ENDTIME, Attrib.SPEAKER,)
Alternative.PRINTABLE = False
Alternative.REQUIRED_ATTRIBS = None
@@ -8023,6 +7854,7 @@ Alternative.XMLTAG = "alt"
#------ AlternativeLayers -------
AlternativeLayers.ACCEPTED_DATA = (AbstractAnnotationLayer, Comment, Description, ForeignData,)
AlternativeLayers.AUTH = False
+AlternativeLayers.LABEL = "Alternative Layers"
AlternativeLayers.OPTIONAL_ATTRIBS = (Attrib.ID, Attrib.CLASS, Attrib.ANNOTATOR, Attrib.N, Attrib.CONFIDENCE, Attrib.DATETIME, Attrib.SRC, Attrib.BEGINTIME, Attrib.ENDTIME, Attrib.SPEAKER,)
AlternativeLayers.PRINTABLE = False
AlternativeLayers.REQUIRED_ATTRIBS = None
@@ -8033,15 +7865,18 @@ BegindatetimeFeature.SUBSET = "begindatetime"
BegindatetimeFeature.XMLTAG = None
#------ Caption -------
Caption.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Feature, ForeignData, Gap, Linebreak, Metric, Part, PhonContent, Reference, Sentence, String, TextContent, Whitespace,)
+Caption.LABEL = "Caption"
Caption.OCCURRENCES = 1
Caption.XMLTAG = "caption"
#------ Cell -------
Cell.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Entry, Event, Example, Feature, ForeignData, Gap, Head, Linebreak, Metric, Note, Paragraph, Part, Reference, Sentence, String, TextContent, Whitespace, Word,)
+Cell.LABEL = "Cell"
Cell.TEXTDELIMITER = " | "
Cell.XMLTAG = "cell"
#------ Chunk -------
Chunk.ACCEPTED_DATA = (AlignReference, Alignment, Comment, Description, Feature, ForeignData, Metric, WordReference,)
Chunk.ANNOTATIONTYPE = AnnotationType.CHUNKING
+Chunk.LABEL = "Chunk"
Chunk.XMLTAG = "chunk"
#------ ChunkingLayer -------
ChunkingLayer.ACCEPTED_DATA = (Chunk, Comment, Correction, Description, ForeignData,)
@@ -8049,11 +7884,13 @@ ChunkingLayer.ANNOTATIONTYPE = AnnotationType.CHUNKING
ChunkingLayer.PRIMARYELEMENT = False
ChunkingLayer.XMLTAG = "chunking"
#------ Comment -------
+Comment.LABEL = "Comment"
Comment.OPTIONAL_ATTRIBS = (Attrib.ID, Attrib.ANNOTATOR, Attrib.CONFIDENCE, Attrib.DATETIME, Attrib.N,)
Comment.XMLTAG = "comment"
#------ ComplexAlignment -------
ComplexAlignment.ACCEPTED_DATA = (Alignment, Comment, Description, Feature, ForeignData, Metric,)
ComplexAlignment.ANNOTATIONTYPE = AnnotationType.COMPLEXALIGNMENT
+ComplexAlignment.LABEL = "Complex Alignment"
ComplexAlignment.OPTIONAL_ATTRIBS = (Attrib.ID, Attrib.CLASS, Attrib.ANNOTATOR, Attrib.N, Attrib.CONFIDENCE, Attrib.DATETIME, Attrib.SRC, Attrib.BEGINTIME, Attrib.ENDTIME, Attrib.SPEAKER,)
ComplexAlignment.PRINTABLE = False
ComplexAlignment.REQUIRED_ATTRIBS = None
@@ -8065,11 +7902,13 @@ ComplexAlignmentLayer.ANNOTATIONTYPE = AnnotationType.COMPLEXALIGNMENT
ComplexAlignmentLayer.PRIMARYELEMENT = False
ComplexAlignmentLayer.XMLTAG = "complexalignments"
#------ Content -------
+Content.LABEL = "Gap Content"
Content.OCCURRENCES = 1
Content.XMLTAG = "content"
#------ CoreferenceChain -------
CoreferenceChain.ACCEPTED_DATA = (AlignReference, Alignment, Comment, CoreferenceLink, Description, Feature, ForeignData, Metric,)
CoreferenceChain.ANNOTATIONTYPE = AnnotationType.COREFERENCE
+CoreferenceChain.LABEL = "Coreference Chain"
CoreferenceChain.REQUIRED_DATA = (CoreferenceLink,)
CoreferenceChain.XMLTAG = "coreferencechain"
#------ CoreferenceLayer -------
@@ -8080,11 +7919,13 @@ CoreferenceLayer.XMLTAG = "coreferences"
#------ CoreferenceLink -------
CoreferenceLink.ACCEPTED_DATA = (AlignReference, Alignment, Comment, Description, Feature, ForeignData, Headspan, LevelFeature, Metric, ModalityFeature, TimeFeature, WordReference,)
CoreferenceLink.ANNOTATIONTYPE = AnnotationType.COREFERENCE
+CoreferenceLink.LABEL = "Coreference Link"
CoreferenceLink.PRIMARYELEMENT = False
CoreferenceLink.XMLTAG = "coreferencelink"
#------ Correction -------
Correction.ACCEPTED_DATA = (Comment, Current, Description, ErrorDetection, Feature, ForeignData, Metric, New, Original, Suggestion,)
Correction.ANNOTATIONTYPE = AnnotationType.CORRECTION
+Correction.LABEL = "Correction"
Correction.OPTIONAL_ATTRIBS = (Attrib.ID, Attrib.CLASS, Attrib.ANNOTATOR, Attrib.N, Attrib.CONFIDENCE, Attrib.DATETIME, Attrib.SRC, Attrib.BEGINTIME, Attrib.ENDTIME, Attrib.SPEAKER,)
Correction.PRINTABLE = True
Correction.SPEAKABLE = True
@@ -8097,6 +7938,7 @@ Current.XMLTAG = "current"
#------ Definition -------
Definition.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Feature, Figure, ForeignData, List, Metric, Paragraph, Part, PhonContent, Reference, Sentence, String, Table, TextContent, Utterance, Word,)
Definition.ANNOTATIONTYPE = AnnotationType.DEFINITION
+Definition.LABEL = "Definition"
Definition.XMLTAG = "def"
#------ DependenciesLayer -------
DependenciesLayer.ACCEPTED_DATA = (Comment, Correction, Dependency, Description, ForeignData,)
@@ -8106,21 +7948,27 @@ DependenciesLayer.XMLTAG = "dependencies"
#------ Dependency -------
Dependency.ACCEPTED_DATA = (AlignReference, Alignment, Comment, DependencyDependent, Description, Feature, ForeignData, Headspan, Metric,)
Dependency.ANNOTATIONTYPE = AnnotationType.DEPENDENCY
+Dependency.LABEL = "Dependency"
Dependency.REQUIRED_DATA = (DependencyDependent, Headspan,)
Dependency.XMLTAG = "dependency"
#------ DependencyDependent -------
+DependencyDependent.LABEL = "Dependent"
+DependencyDependent.OCCURRENCES = 1
DependencyDependent.XMLTAG = "dep"
#------ Description -------
+Description.LABEL = "Description"
Description.OCCURRENCES = 1
Description.OPTIONAL_ATTRIBS = (Attrib.ID, Attrib.ANNOTATOR, Attrib.CONFIDENCE, Attrib.DATETIME, Attrib.N,)
Description.XMLTAG = "desc"
#------ Division -------
Division.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Division, Entry, Event, Example, Feature, Figure, ForeignData, Gap, Head, Linebreak, List, Metric, Note, Paragraph, Part, PhonContent, Quote, Reference, Sentence, Table, TextContent, Utterance, Whitespace,)
Division.ANNOTATIONTYPE = AnnotationType.DIVISION
+Division.LABEL = "Division"
Division.TEXTDELIMITER = "\n\n\n"
Division.XMLTAG = "div"
#------ DomainAnnotation -------
DomainAnnotation.ANNOTATIONTYPE = AnnotationType.DOMAIN
+DomainAnnotation.LABEL = "Domain"
DomainAnnotation.OCCURRENCES_PER_SET = 0
DomainAnnotation.XMLTAG = "domain"
#------ EnddatetimeFeature -------
@@ -8134,36 +7982,44 @@ EntitiesLayer.XMLTAG = "entities"
#------ Entity -------
Entity.ACCEPTED_DATA = (AlignReference, Alignment, Comment, Description, Feature, ForeignData, Metric, WordReference,)
Entity.ANNOTATIONTYPE = AnnotationType.ENTITY
+Entity.LABEL = "Entity"
Entity.XMLTAG = "entity"
#------ Entry -------
Entry.ACCEPTED_DATA = (AbstractAnnotationLayer, Alignment, Alternative, AlternativeLayers, Comment, Correction, Definition, Description, Example, Feature, ForeignData, Metric, Part, Term,)
Entry.ANNOTATIONTYPE = AnnotationType.ENTRY
+Entry.LABEL = "Entry"
Entry.XMLTAG = "entry"
#------ ErrorDetection -------
ErrorDetection.ANNOTATIONTYPE = AnnotationType.ERRORDETECTION
+ErrorDetection.LABEL = "Error Detection"
ErrorDetection.OCCURRENCES_PER_SET = 0
ErrorDetection.XMLTAG = "errordetection"
#------ Event -------
Event.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, ActorFeature, Alignment, Alternative, AlternativeLayers, BegindatetimeFeature, Comment, Correction, Description, Division, EnddatetimeFeature, Event, Example, Feature, Figure, ForeignData, Head, Linebreak, List, Metric, Paragraph, Part, PhonContent, Reference, Sentence, String, Table, TextContent, Utterance, Whitespace, Word,)
Event.ANNOTATIONTYPE = AnnotationType.EVENT
+Event.LABEL = "Event"
Event.XMLTAG = "event"
#------ Example -------
Example.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Feature, Figure, ForeignData, Linebreak, List, Metric, Paragraph, Part, PhonContent, Reference, Sentence, String, Table, TextContent, Utterance, Whitespace, Word,)
Example.ANNOTATIONTYPE = AnnotationType.EXAMPLE
+Example.LABEL = "Example"
Example.XMLTAG = "ex"
#------ External -------
External.ACCEPTED_DATA = (Comment, Description,)
External.AUTH = True
+External.LABEL = "External"
External.OPTIONAL_ATTRIBS = None
External.PRINTABLE = True
External.REQUIRED_ATTRIBS = (Attrib.SRC,)
External.SPEAKABLE = False
External.XMLTAG = "external"
#------ Feature -------
+Feature.LABEL = "Feature"
Feature.XMLTAG = "feat"
#------ Figure -------
Figure.ACCEPTED_DATA = (AbstractAnnotationLayer, Alignment, Alternative, AlternativeLayers, Caption, Comment, Correction, Description, Feature, ForeignData, Metric, Part, Sentence, String, TextContent,)
Figure.ANNOTATIONTYPE = AnnotationType.FIGURE
+Figure.LABEL = "Figure"
Figure.SPEAKABLE = False
Figure.TEXTDELIMITER = "\n\n"
Figure.XMLTAG = "figure"
@@ -8175,10 +8031,12 @@ FunctionFeature.XMLTAG = None
#------ Gap -------
Gap.ACCEPTED_DATA = (Comment, Content, Description, Feature, ForeignData, Metric, Part,)
Gap.ANNOTATIONTYPE = AnnotationType.GAP
+Gap.LABEL = "Gap"
Gap.OPTIONAL_ATTRIBS = (Attrib.ID, Attrib.CLASS, Attrib.ANNOTATOR, Attrib.N, Attrib.DATETIME, Attrib.SRC, Attrib.BEGINTIME, Attrib.ENDTIME,)
Gap.XMLTAG = "gap"
#------ Head -------
Head.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Event, Feature, ForeignData, Gap, Linebreak, Metric, Part, PhonContent, Reference, Sentence, String, TextContent, Whitespace, Word,)
+Head.LABEL = "Head"
Head.OCCURRENCES = 1
Head.TEXTDELIMITER = "\n\n"
Head.XMLTAG = "head"
@@ -8186,35 +8044,44 @@ Head.XMLTAG = "head"
HeadFeature.SUBSET = "head"
HeadFeature.XMLTAG = None
#------ Headspan -------
+Headspan.LABEL = "Head"
+Headspan.OCCURRENCES = 1
Headspan.XMLTAG = "hd"
#------ Label -------
Label.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Feature, ForeignData, Metric, Part, PhonContent, Reference, String, TextContent, Word,)
+Label.LABEL = "Label"
Label.XMLTAG = "label"
#------ LangAnnotation -------
LangAnnotation.ANNOTATIONTYPE = AnnotationType.LANG
+LangAnnotation.LABEL = "Language"
LangAnnotation.XMLTAG = "lang"
#------ LemmaAnnotation -------
LemmaAnnotation.ANNOTATIONTYPE = AnnotationType.LEMMA
+LemmaAnnotation.LABEL = "Lemma"
LemmaAnnotation.XMLTAG = "lemma"
#------ LevelFeature -------
LevelFeature.SUBSET = "level"
LevelFeature.XMLTAG = None
#------ Linebreak -------
Linebreak.ANNOTATIONTYPE = AnnotationType.LINEBREAK
+Linebreak.LABEL = "Linebreak"
Linebreak.TEXTDELIMITER = ""
Linebreak.XMLTAG = "br"
#------ List -------
List.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Caption, Comment, Correction, Description, Event, Feature, ForeignData, ListItem, Metric, Note, Part, PhonContent, Reference, String, TextContent,)
List.ANNOTATIONTYPE = AnnotationType.LIST
+List.LABEL = "List"
List.TEXTDELIMITER = "\n\n"
List.XMLTAG = "list"
#------ ListItem -------
ListItem.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Event, Feature, ForeignData, Gap, Label, Linebreak, List, Metric, Note, Part, PhonContent, Reference, Sentence, String, TextContent, Whitespace,)
+ListItem.LABEL = "List Item"
ListItem.TEXTDELIMITER = "\n"
ListItem.XMLTAG = "item"
#------ Metric -------
Metric.ACCEPTED_DATA = (Comment, Description, Feature, ForeignData, ValueFeature,)
Metric.ANNOTATIONTYPE = AnnotationType.METRIC
+Metric.LABEL = "Metric"
Metric.OPTIONAL_ATTRIBS = (Attrib.ID, Attrib.CLASS, Attrib.ANNOTATOR, Attrib.N, Attrib.CONFIDENCE, Attrib.DATETIME, Attrib.SRC, Attrib.BEGINTIME, Attrib.ENDTIME, Attrib.SPEAKER,)
Metric.XMLTAG = "metric"
#------ ModalityFeature -------
@@ -8223,6 +8090,7 @@ ModalityFeature.XMLTAG = None
#------ Morpheme -------
Morpheme.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Feature, ForeignData, FunctionFeature, Metric, Morpheme, Part, PhonContent, String, TextContent,)
Morpheme.ANNOTATIONTYPE = AnnotationType.MORPHOLOGICAL
+Morpheme.LABEL = "Morpheme"
Morpheme.TEXTDELIMITER = ""
Morpheme.XMLTAG = "morpheme"
#------ MorphologyLayer -------
@@ -8237,10 +8105,12 @@ New.XMLTAG = "new"
#------ Note -------
Note.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Example, Feature, Figure, ForeignData, Head, Linebreak, List, Metric, Paragraph, Part, PhonContent, Reference, Sentence, String, Table, TextContent, Utterance, Whitespace, Word,)
Note.ANNOTATIONTYPE = AnnotationType.NOTE
+Note.LABEL = "Note"
Note.XMLTAG = "note"
#------ Observation -------
Observation.ACCEPTED_DATA = (AlignReference, Alignment, Comment, Description, Feature, ForeignData, Metric, WordReference,)
Observation.ANNOTATIONTYPE = AnnotationType.OBSERVATION
+Observation.LABEL = "Observation"
Observation.XMLTAG = "observation"
#------ ObservationLayer -------
ObservationLayer.ACCEPTED_DATA = (Comment, Correction, Description, ForeignData, Observation,)
@@ -8255,16 +8125,19 @@ Original.XMLTAG = "original"
#------ Paragraph -------
Paragraph.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Entry, Event, Example, Feature, Figure, ForeignData, Gap, Head, Linebreak, List, Metric, Note, Part, PhonContent, Quote, Reference, Sentence, String, TextContent, Whitespace, Word,)
Paragraph.ANNOTATIONTYPE = AnnotationType.PARAGRAPH
+Paragraph.LABEL = "Paragraph"
Paragraph.TEXTDELIMITER = "\n\n"
Paragraph.XMLTAG = "p"
#------ Part -------
Part.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, AbstractStructureElement, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Feature, ForeignData, Metric, Part,)
Part.ANNOTATIONTYPE = AnnotationType.PART
+Part.LABEL = "Part"
Part.TEXTDELIMITER = None
Part.XMLTAG = "part"
#------ PhonContent -------
PhonContent.ACCEPTED_DATA = (Comment, Description,)
PhonContent.ANNOTATIONTYPE = AnnotationType.PHON
+PhonContent.LABEL = "Phonetic Content"
PhonContent.OCCURRENCES = 0
PhonContent.OPTIONAL_ATTRIBS = (Attrib.CLASS, Attrib.ANNOTATOR, Attrib.CONFIDENCE, Attrib.DATETIME,)
PhonContent.PHONCONTAINER = True
@@ -8274,6 +8147,7 @@ PhonContent.XMLTAG = "ph"
#------ Phoneme -------
Phoneme.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Feature, ForeignData, FunctionFeature, Metric, Part, PhonContent, Phoneme, String, TextContent,)
Phoneme.ANNOTATIONTYPE = AnnotationType.PHONOLOGICAL
+Phoneme.LABEL = "Phoneme"
Phoneme.TEXTDELIMITER = ""
Phoneme.XMLTAG = "phoneme"
#------ PhonologyLayer -------
@@ -8287,27 +8161,35 @@ PolarityFeature.XMLTAG = None
#------ PosAnnotation -------
PosAnnotation.ACCEPTED_DATA = (Comment, Description, Feature, ForeignData, HeadFeature, Metric,)
PosAnnotation.ANNOTATIONTYPE = AnnotationType.POS
+PosAnnotation.LABEL = "Part-of-Speech"
PosAnnotation.XMLTAG = "pos"
#------ Predicate -------
Predicate.ACCEPTED_DATA = (AlignReference, Alignment, Comment, Description, Feature, ForeignData, Metric, SemanticRole, WordReference,)
Predicate.ANNOTATIONTYPE = AnnotationType.PREDICATE
+Predicate.LABEL = "Predicate"
Predicate.XMLTAG = "predicate"
#------ Quote -------
Quote.ACCEPTED_DATA = (AbstractAnnotationLayer, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Division, Feature, ForeignData, Gap, Metric, Paragraph, Part, Quote, Sentence, String, TextContent, Utterance, Word,)
+Quote.LABEL = "Quote"
Quote.XMLTAG = "quote"
#------ Reference -------
Reference.ACCEPTED_DATA = (AbstractAnnotationLayer, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Feature, ForeignData, Metric, Paragraph, Part, PhonContent, Quote, Sentence, String, TextContent, Utterance, Word,)
+Reference.LABEL = "Reference"
Reference.TEXTDELIMITER = None
Reference.XMLTAG = "ref"
#------ Relation -------
+Relation.LABEL = "Relation"
+Relation.OCCURRENCES = 1
Relation.XMLTAG = "relation"
#------ Row -------
Row.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Cell, Comment, Correction, Description, Feature, ForeignData, Metric, Part,)
+Row.LABEL = "Table Row"
Row.TEXTDELIMITER = "\n"
Row.XMLTAG = "row"
#------ SemanticRole -------
SemanticRole.ACCEPTED_DATA = (AlignReference, Alignment, Comment, Description, Feature, ForeignData, Headspan, Metric, WordReference,)
SemanticRole.ANNOTATIONTYPE = AnnotationType.SEMROLE
+SemanticRole.LABEL = "Semantic Role"
SemanticRole.REQUIRED_ATTRIBS = (Attrib.CLASS,)
SemanticRole.XMLTAG = "semrole"
#------ SemanticRolesLayer -------
@@ -8318,16 +8200,19 @@ SemanticRolesLayer.XMLTAG = "semroles"
#------ SenseAnnotation -------
SenseAnnotation.ACCEPTED_DATA = (Comment, Description, Feature, ForeignData, Metric, SynsetFeature,)
SenseAnnotation.ANNOTATIONTYPE = AnnotationType.SENSE
+SenseAnnotation.LABEL = "Semantic Sense"
SenseAnnotation.OCCURRENCES_PER_SET = 0
SenseAnnotation.XMLTAG = "sense"
#------ Sentence -------
Sentence.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Entry, Event, Example, Feature, ForeignData, Gap, Linebreak, Metric, Note, Part, PhonContent, Quote, Reference, String, TextContent, Whitespace, Word,)
Sentence.ANNOTATIONTYPE = AnnotationType.SENTENCE
+Sentence.LABEL = "Sentence"
Sentence.TEXTDELIMITER = " "
Sentence.XMLTAG = "s"
#------ Sentiment -------
Sentiment.ACCEPTED_DATA = (AlignReference, Alignment, Comment, Description, Feature, ForeignData, Headspan, Metric, PolarityFeature, Source, StrengthFeature, Target, WordReference,)
Sentiment.ANNOTATIONTYPE = AnnotationType.SENTIMENT
+Sentiment.LABEL = "Sentiment"
Sentiment.XMLTAG = "sentiment"
#------ SentimentLayer -------
SentimentLayer.ACCEPTED_DATA = (Comment, Correction, Description, ForeignData, Sentiment,)
@@ -8335,14 +8220,18 @@ SentimentLayer.ANNOTATIONTYPE = AnnotationType.SENTIMENT
SentimentLayer.PRIMARYELEMENT = False
SentimentLayer.XMLTAG = "sentiments"
#------ Source -------
+Source.LABEL = "Source"
+Source.OCCURRENCES = 1
Source.XMLTAG = "source"
#------ Speech -------
Speech.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Division, Entry, Event, Example, External, Feature, ForeignData, Gap, List, Metric, Note, Paragraph, Part, PhonContent, Quote, Reference, Sentence, String, TextContent, Utterance, Word,)
+Speech.LABEL = "Speech Body"
Speech.TEXTDELIMITER = "\n\n\n"
Speech.XMLTAG = "speech"
#------ Statement -------
Statement.ACCEPTED_DATA = (AlignReference, Alignment, Comment, Description, Feature, ForeignData, Headspan, Metric, Relation, Source, WordReference,)
Statement.ANNOTATIONTYPE = AnnotationType.STATEMENT
+Statement.LABEL = "Statement"
Statement.XMLTAG = "statement"
#------ StatementLayer -------
StatementLayer.ACCEPTED_DATA = (Comment, Correction, Description, ForeignData, Statement,)
@@ -8355,6 +8244,7 @@ StrengthFeature.XMLTAG = None
#------ String -------
String.ACCEPTED_DATA = (AbstractExtendedTokenAnnotation, Alignment, Comment, Correction, Description, Feature, ForeignData, Metric, PhonContent, TextContent,)
String.ANNOTATIONTYPE = AnnotationType.STRING
+String.LABEL = "String"
String.OCCURRENCES = 0
String.OPTIONAL_ATTRIBS = (Attrib.ID, Attrib.CLASS, Attrib.ANNOTATOR, Attrib.CONFIDENCE, Attrib.DATETIME, Attrib.N, Attrib.SRC, Attrib.BEGINTIME, Attrib.ENDTIME,)
String.PRINTABLE = True
@@ -8364,6 +8254,7 @@ StyleFeature.SUBSET = "style"
StyleFeature.XMLTAG = None
#------ SubjectivityAnnotation -------
SubjectivityAnnotation.ANNOTATIONTYPE = AnnotationType.SUBJECTIVITY
+SubjectivityAnnotation.LABEL = "Subjectivity/Sentiment"
SubjectivityAnnotation.XMLTAG = "subjectivity"
#------ Suggestion -------
Suggestion.AUTH = False
@@ -8375,6 +8266,7 @@ SynsetFeature.XMLTAG = None
#------ SyntacticUnit -------
SyntacticUnit.ACCEPTED_DATA = (AlignReference, Alignment, Comment, Description, Feature, ForeignData, Metric, SyntacticUnit, WordReference,)
SyntacticUnit.ANNOTATIONTYPE = AnnotationType.SYNTAX
+SyntacticUnit.LABEL = "Syntactic Unit"
SyntacticUnit.XMLTAG = "su"
#------ SyntaxLayer -------
SyntaxLayer.ACCEPTED_DATA = (Comment, Correction, Description, ForeignData, SyntacticUnit,)
@@ -8384,23 +8276,30 @@ SyntaxLayer.XMLTAG = "syntax"
#------ Table -------
Table.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Feature, ForeignData, Metric, Part, Row, TableHead,)
Table.ANNOTATIONTYPE = AnnotationType.TABLE
+Table.LABEL = "Table"
Table.XMLTAG = "table"
#------ TableHead -------
TableHead.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Feature, ForeignData, Metric, Part, Row,)
+TableHead.LABEL = "Table Header"
TableHead.XMLTAG = "tablehead"
#------ Target -------
+Target.LABEL = "Target"
+Target.OCCURRENCES = 1
Target.XMLTAG = "target"
#------ Term -------
Term.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Event, Feature, Figure, ForeignData, Gap, List, Metric, Paragraph, Part, PhonContent, Reference, Sentence, String, Table, TextContent, Utterance, Word,)
Term.ANNOTATIONTYPE = AnnotationType.TERM
+Term.LABEL = "Term"
Term.XMLTAG = "term"
#------ Text -------
Text.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Division, Entry, Event, Example, External, Feature, Figure, ForeignData, Gap, List, Metric, Note, Paragraph, Part, PhonContent, Quote, Reference, Sentence, String, Table, TextContent, Word,)
+Text.LABEL = "Text Body"
Text.TEXTDELIMITER = "\n\n\n"
Text.XMLTAG = "text"
#------ TextContent -------
TextContent.ACCEPTED_DATA = (AbstractTextMarkup, Comment, Description, Linebreak,)
TextContent.ANNOTATIONTYPE = AnnotationType.TEXT
+TextContent.LABEL = "Text"
TextContent.OCCURRENCES = 0
TextContent.OPTIONAL_ATTRIBS = (Attrib.CLASS, Attrib.ANNOTATOR, Attrib.CONFIDENCE, Attrib.DATETIME,)
TextContent.PRINTABLE = True
@@ -8434,6 +8333,7 @@ TimeFeature.XMLTAG = None
#------ TimeSegment -------
TimeSegment.ACCEPTED_DATA = (ActorFeature, AlignReference, Alignment, BegindatetimeFeature, Comment, Description, EnddatetimeFeature, Feature, ForeignData, Metric, WordReference,)
TimeSegment.ANNOTATIONTYPE = AnnotationType.TIMESEGMENT
+TimeSegment.LABEL = "Time Segment"
TimeSegment.XMLTAG = "timesegment"
#------ TimingLayer -------
TimingLayer.ACCEPTED_DATA = (Comment, Correction, Description, ForeignData, TimeSegment,)
@@ -8443,6 +8343,7 @@ TimingLayer.XMLTAG = "timing"
#------ Utterance -------
Utterance.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractExtendedTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Feature, ForeignData, Gap, Metric, Note, Part, PhonContent, Quote, Reference, Sentence, String, TextContent, Word,)
Utterance.ANNOTATIONTYPE = AnnotationType.UTTERANCE
+Utterance.LABEL = "Utterance"
Utterance.TEXTDELIMITER = " "
Utterance.XMLTAG = "utt"
#------ ValueFeature -------
@@ -8450,11 +8351,13 @@ ValueFeature.SUBSET = "value"
ValueFeature.XMLTAG = None
#------ Whitespace -------
Whitespace.ANNOTATIONTYPE = AnnotationType.WHITESPACE
+Whitespace.LABEL = "Whitespace"
Whitespace.TEXTDELIMITER = ""
Whitespace.XMLTAG = "whitespace"
#------ Word -------
Word.ACCEPTED_DATA = (AbstractAnnotationLayer, AbstractTokenAnnotation, Alignment, Alternative, AlternativeLayers, Comment, Correction, Description, Feature, ForeignData, Metric, Part, PhonContent, Reference, String, TextContent,)
Word.ANNOTATIONTYPE = AnnotationType.TOKEN
+Word.LABEL = "Word/Token"
Word.TEXTDELIMITER = " "
Word.XMLTAG = "w"
#------ WordReference -------
diff --git a/pynlpl/formats/foliaset.py b/pynlpl/formats/foliaset.py
new file mode 100644
index 0000000..5589ec7
--- /dev/null
+++ b/pynlpl/formats/foliaset.py
@@ -0,0 +1,456 @@
+# -*- coding: utf-8 -*-
+#----------------------------------------------------------------
+# PyNLPl - FoLiA Set Definition Module
+# by Maarten van Gompel
+# Centre for Language Studies
+# Radboud University Nijmegen
+#
+# https://proycon.github.io/folia
+# https://github.com/proycon/pynlpl
+# proycon AT anaproy DOT nl
+#
+# Module for reading, editing and writing FoLiA XML
+#
+# Licensed under GPLv3
+#
+#----------------------------------------------------------------
+
+#pylint: disable=redefined-builtin,trailing-whitespace,superfluous-parens,bad-classmethod-argument,wrong-import-order,wrong-import-position,ungrouped-imports
+
+from __future__ import print_function
+from __future__ import unicode_literals
+from __future__ import division
+from __future__ import absolute_import
+
+
+import sys
+import io
+import rdflib
+from lxml import etree as ElementTree
+if sys.version < '3':
+ from StringIO import StringIO #pylint: disable=import-error,wrong-import-order
+ from urllib import urlopen #pylint: disable=no-name-in-module,wrong-import-order
+ from urllib2 import HTTPError
+else:
+ from io import StringIO, BytesIO #pylint: disable=wrong-import-order,ungrouped-imports
+ from urllib.request import urlopen #pylint: disable=E0611,wrong-import-order,ungrouped-imports
+ from urllib.error import HTTPError
+
+
+#foliaspec:namespace:NSFOLIA
+#The FoLiA XML namespace
+NSFOLIA = "http://ilk.uvt.nl/folia"
+
+#foliaspec:setdefinitionnamespace:NSFOLIASETDEFINITION
+NSFOLIASETDEFINITION = "http://folia.science.ru.nl/setdefinition"
+NSSKOS = "http://www.w3.org/2004/02/skos/core"
+
+class DeepValidationError(Exception):
+ pass
+
+class SetDefinitionError(DeepValidationError):
+ pass
+
+class SetType: #legacy only
+ CLOSED, OPEN, MIXED, EMPTY = range(4)
+
+class LegacyClassDefinition(object):
+ def __init__(self,id, label, subclasses=None):
+ self.id = id
+ self.label = label
+ if subclasses:
+ self.subclasses = subclasses
+ else:
+ self.subclasses = []
+
+ @classmethod
+ def parsexml(Class, node):
+ if not node.tag == '{' + NSFOLIA + '}class':
+ raise Exception("Expected class tag for this xml node, got" + node.tag)
+
+ if 'label' in node.attrib:
+ label = node.attrib['label']
+ else:
+ label = ""
+
+ subclasses= []
+ for subnode in node:
+ if isinstance(subnode.tag, str) or (sys.version < '3' and isinstance(subnode.tag, unicode)): #pylint: disable=undefined-variable
+ if subnode.tag == '{' + NSFOLIA + '}class':
+ subclasses.append( LegacyClassDefinition.parsexml(subnode) )
+ elif subnode.tag[:len(NSFOLIA) +2] == '{' + NSFOLIA + '}':
+ raise Exception("Invalid tag in Class definition: " + subnode.tag)
+ if '{http://www.w3.org/XML/1998/namespace}id' in node.attrib:
+ idkey = '{http://www.w3.org/XML/1998/namespace}id'
+ else:
+ idkey = 'id'
+ return LegacyClassDefinition(node.attrib[idkey],label, subclasses)
+
+
+ def __iter__(self):
+ for c in self.subclasses:
+ yield c
+
+ def json(self):
+ jsonnode = {'id': self.id, 'label': self.label}
+ jsonnode['subclasses'] = []
+ for subclass in self.subclasses:
+ jsonnode['subclasses'].append(subclass.json())
+ return jsonnode
+
+ def rdf(self,graph, basens,parentseturi, parentclass=None, seqnr=None):
+ graph.add((rdflib.term.URIRef(basens + '#' + self.id), rdflib.RDF.type, rdflib.term.URIRef(NSSKOS + '#Concept')))
+ graph.add((rdflib.term.URIRef(basens + '#' + self.id), rdflib.term.URIRef(NSSKOS + '#notation'), rdflib.term.Literal(self.id)))
+ graph.add((rdflib.term.URIRef(basens + '#' + self.id), rdflib.term.URIRef(NSSKOS + '#prefLabel'), rdflib.term.Literal(self.label)))
+ graph.add((parentseturi , rdflib.term.URIRef(NSSKOS + '#member'), rdflib.term.URIRef(basens + '#' + self.id)))
+ if seqnr is not None:
+ graph.add((rdflib.term.URIRef(basens + '#' + self.id), rdflib.term.URIRef(NSFOLIASETDEFINITION + '#sequenceNumber'), rdflib.term.Literal(seqnr) ))
+ if parentclass:
+ graph.add((rdflib.term.URIRef(basens + '#' + self.id), rdflib.term.URIRef(NSSKOS + '#broader'), rdflib.term.URIRef(basens + '#' + parentclass) ))
+
+ for subclass in self.subclasses:
+ subclass.rdf(graph,basens,parentseturi, self.id)
+
+class LegacySetDefinition(object):
+ def __init__(self, id, type, classes = None, subsets = None, label =None):
+ self.id = id
+ self.type = type
+ self.label = label
+ if classes:
+ self.classes = classes
+ else:
+ self.classes = []
+ if subsets:
+ self.subsets = subsets
+ else:
+ self.subsets = []
+
+ @classmethod
+ def parsexml(Class, node):
+ issubset = node.tag == '{' + NSFOLIA + '}subset'
+ if not issubset:
+ assert node.tag == '{' + NSFOLIA + '}set'
+ classes = []
+ subsets= []
+ if 'type' in node.attrib:
+ if node.attrib['type'] == 'open':
+ type = SetType.OPEN
+ elif node.attrib['type'] == 'closed':
+ type = SetType.CLOSED
+ elif node.attrib['type'] == 'mixed':
+ type = SetType.MIXED
+ elif node.attrib['type'] == 'empty':
+ type = SetType.EMPTY
+ else:
+ raise Exception("Invalid set type: ", type)
+ else:
+ type = SetType.CLOSED
+
+ if 'label' in node.attrib:
+ label = node.attrib['label']
+ else:
+ label = None
+
+ for subnode in node:
+ if isinstance(subnode.tag, str) or (sys.version < '3' and isinstance(subnode.tag, unicode)): #pylint: disable=undefined-variable
+ if subnode.tag == '{' + NSFOLIA + '}class':
+ classes.append( LegacyClassDefinition.parsexml(subnode) )
+ elif not issubset and subnode.tag == '{' + NSFOLIA + '}subset':
+ subsets.append( LegacySetDefinition.parsexml(subnode) )
+ elif subnode.tag == '{' + NSFOLIA + '}constraint':
+ pass
+ elif subnode.tag[:len(NSFOLIA) +2] == '{' + NSFOLIA + '}':
+ raise SetDefinitionError("Invalid tag in Set definition: " + subnode.tag)
+
+ return LegacySetDefinition(node.attrib['{http://www.w3.org/XML/1998/namespace}id'],type,classes, subsets, label)
+
+
+ def json(self):
+ jsonnode = {'id': self.id}
+ if self.label:
+ jsonnode['label'] = self.label
+ if self.type == SetType.OPEN:
+ jsonnode['type'] = 'open'
+ elif self.type == SetType.CLOSED:
+ jsonnode['type'] = 'closed'
+ elif self.type == SetType.MIXED:
+ jsonnode['type'] = 'mixed'
+ elif self.type == SetType.EMPTY:
+ jsonnode['type'] = 'empty'
+ jsonnode['subsets'] = {}
+ for subset in self.subsets:
+ jsonnode['subsets'][subset.id] = subset.json()
+ jsonnode['classes'] = {}
+ jsonnode['classorder'] = []
+ for c in sorted(self.classes, key=lambda x: x.label):
+ jsonnode['classes'][c.id] = c.json()
+ jsonnode['classorder'].append( c.id )
+ return jsonnode
+
+ def rdf(self,graph, basens="",parenturi=None):
+ if not basens:
+ basens = NSFOLIASETDEFINITION + "/" + self.id
+ if not parenturi:
+ graph.bind( self.id, basens + '#', override=True ) #set a prefix for our namespace (does not use @base because of issue RDFLib/rdflib#559 )
+ seturi = rdflib.term.URIRef(basens + '#Set')
+ else:
+ seturi = rdflib.term.URIRef(basens + '#Subset.' + self.id)
+
+ graph.add((seturi, rdflib.RDF.type, rdflib.term.URIRef(NSSKOS + '#Collection')))
+ if self.id:
+ graph.add((seturi, rdflib.term.URIRef(NSSKOS + '#notation'), rdflib.term.Literal(self.id)))
+ if self.type == SetType.OPEN:
+ graph.add((seturi, rdflib.term.URIRef(NSFOLIASETDEFINITION + '#open'), rdflib.term.Literal(True)))
+ elif self.type == SetType.EMPTY:
+ graph.add((seturi, rdflib.term.URIRef(NSFOLIASETDEFINITION + '#empty'), rdflib.term.Literal(True)))
+ if self.label:
+ graph.add((seturi, rdflib.term.URIRef(NSSKOS + '#prefLabel'), rdflib.term.Literal(self.label)))
+ if parenturi:
+ graph.add((parenturi, rdflib.term.URIRef(NSSKOS + '#member'), seturi))
+
+ for i, c in enumerate(self.classes):
+ c.rdf(graph, basens, seturi, None, i+1)
+
+ for s in self.subsets:
+ s.rdf(graph, basens, seturi)
+
+
+def xmltreefromstring(s):
+ """Internal function, deals with different Python versions, unicode strings versus bytes, and with the leak bug in lxml"""
+ if sys.version < '3':
+ #Python 2
+ if isinstance(s,unicode): #pylint: disable=undefined-variable
+ s = s.encode('utf-8')
+ try:
+ return ElementTree.parse(StringIO(s), ElementTree.XMLParser(collect_ids=False))
+ except TypeError:
+ return ElementTree.parse(StringIO(s), ElementTree.XMLParser()) #older lxml, may leak!!!!
+ else:
+ #Python 3
+ if isinstance(s,str):
+ s = s.encode('utf-8')
+ try:
+ return ElementTree.parse(BytesIO(s), ElementTree.XMLParser(collect_ids=False))
+ except TypeError:
+ return ElementTree.parse(BytesIO(s), ElementTree.XMLParser()) #older lxml, may leak!!!!
+
+class SetDefinition(object):
+ def __init__(self, url, format=None, basens="",verbose=False):
+ self.graph = rdflib.Graph()
+ self.basens = basens
+ self.mainsetcache = {}
+ self.subsetcache = {}
+ self.set_id_uri_cache = {}
+ self.verbose = verbose
+ self.graph.bind( 'fsd', NSFOLIASETDEFINITION+'#', override=True)
+ self.graph.bind( 'skos', NSSKOS+'#', override=True)
+ if not format:
+ #try to guess format from URL
+ if url.endswith('.ttl'):
+ format = 'text/turtle'
+ elif url.endswith('.n3'):
+ format = 'text/n3'
+ elif url.endswith('.rdf.xml') or url.endswith('.rdf'):
+ format = 'application/rdf+xml'
+ elif url.endswith('.xml'): #other XML will be considered legacy
+ format = 'application/foliaset+xml' #legacy
+
+ if format in ('application/foliaset+xml','legacy',None):
+ #legacy format, has some checks and fallbacks if the format turns out to be RDF anyway
+ self.legacyset = None
+ if url[0] == '/' or url[0] == '.':
+ #local file
+ f = io.open(url,'r',encoding='utf-8')
+ else:
+ #remote URL
+ if not self.basens:
+ self.basens = url
+ try:
+ f = urlopen(url)
+ except:
+ raise DeepValidationError("Unable to download " + url)
+ try:
+ data = f.read()
+ except IOError:
+ raise DeepValidationError("Unable to download " + url)
+ finally:
+ f.close()
+ if data[0] in ('@',b'@',64):
+ #this is not gonna be valid XML, but looks like turtle/n3 RDF
+ self.graph.parse(location=url, format='text/turtle')
+ if self.verbose:
+ print("Loaded set " + url + " (" + str(len(self.graph)) + " triples)",file=sys.stderr)
+ return
+ tree = xmltreefromstring(data)
+ root = tree.getroot()
+ if root.tag != '{' + NSFOLIA + '}set':
+ if root.tag.lower().find('rdf') != 1:
+ #well, this is RDF after all...
+ self.graph.parse(location=url, format='rdf')
+ return
+ else:
+ raise SetDefinitionError("Not a FoLiA Set Definition! Unexpected root tag:"+ root.tag)
+ legacyset = LegacySetDefinition.parsexml(root)
+ legacyset.rdf(self.graph, self.basens)
+ if self.verbose:
+ print("Loaded legacy set " + url + " (" + str(len(self.graph)) + " triples)",file=sys.stderr)
+ else:
+ try:
+ self.graph.parse(location=url, format=format)
+ except HTTPError:
+ raise DeepValidationError("Unable to download " + url)
+ if self.verbose:
+ print("Loaded set " + url + " (" + str(len(self.graph)) + " triples)",file=sys.stderr)
+
+ def testclass(self,cls):
+ """Test for the presence of the class, returns the full URI or raises an exception"""
+ mainsetinfo = self.mainset()
+ if mainsetinfo['open']:
+ return cls #everything is okay
+ elif mainsetinfo['empty']:
+ if cls:
+ raise DeepValidationError("Expected an empty class, got \"" + cls + "\"")
+ else:
+ if not cls:
+ raise DeepValidationError("No class specified")
+ #closed set
+ set_uri = mainsetinfo['uri']
+ for row in self.graph.query("SELECT ?c WHERE { ?c rdf:type skos:Concept ; skos:notation \"" + cls + "\". <" + str(set_uri) + "> skos:member ?c }"):
+ return str(row.c)
+ raise DeepValidationError("Not a valid class: " + cls)
+
+ def testsubclass(self, cls, subset, subclass):
+ """Test for the presence of a class in a subset (used with features), returns the full URI or raises an exception"""
+ subsetinfo = self.subset(subset)
+ if subsetinfo['open']:
+ return subclass #everything is okay
+ else:
+ subset_uri = subsetinfo['uri']
+ if not subset_uri:
+ raise DeepValidationError("Not a valid subset: " + subset)
+
+ query = "SELECT ?c WHERE { ?c rdf:type skos:Concept ; skos:notation \"" + subclass + "\" . <" + str(subset_uri) + "> skos:member ?c }"
+ for row in self.graph.query(query):
+ return str(row.c)
+ raise DeepValidationError("Not a valid class in subset " + subset + ": " + subclass)
+
+ def get_set_uri(self, set_id=None):
+ if set_id in self.set_id_uri_cache:
+ return self.set_id_uri_cache[set_id]
+ if set_id:
+ for row in self.graph.query("SELECT ?s WHERE { ?s rdf:type skos:Collection ; skos:notation \"" + set_id + "\" }"):
+ self.set_id_uri_cache[set_id] = row.s
+ return row.s
+ raise DeepValidationError("No such set: " + str(set_id))
+ else:
+ for row in self.graph.query("SELECT ?s WHERE { ?s rdf:type skos:Collection . FILTER NOT EXISTS { ?y rdf:type skos:Collection . ?y skos:member ?s } }"):
+ self.set_id_uri_cache[set_id] = row.s
+ return row.s
+ raise DeepValidationError("Main set not found")
+
+ def mainset(self):
+ """Returns information regarding the set"""
+ if self.mainsetcache:
+ return self.mainsetcache
+ set_uri = self.get_set_uri()
+ for row in self.graph.query("SELECT ?seturi ?setid ?setlabel ?setopen ?setempty WHERE { ?seturi rdf:type skos:Collection . OPTIONAL { ?seturi skos:notation ?setid } OPTIONAL { ?seturi skos:prefLabel ?setlabel } OPTIONAL { ?seturi fsd:open ?setopen } OPTIONAL { ?seturi fsd:empty ?setempty } FILTER NOT EXISTS { ?y skos:member ?seturi . ?y rdf:type skos:Collection } }"):
+ self.mainsetcache = {'uri': str(row.seturi), 'id': str(row.setid), 'label': str(row.setlabel) if row.setlabel else "", 'open': bool(row.setopen), 'empty': bool(row.setempty) }
+ return self.mainsetcache
+ raise DeepValidationError("Unable to find main set (set_uri=" + str(set_uri)+"), this should not happen")
+
+ def subset(self, subset_id):
+ """Returns information regarding the set"""
+ if subset_id in self.subsetcache:
+ return self.subsetcache[subset_id]
+ set_uri = self.get_set_uri(subset_id)
+ for row in self.graph.query("SELECT ?seturi ?setid ?setlabel ?setopen WHERE { ?seturi rdf:type skos:Collection . OPTIONAL { ?seturi skos:notation ?setid } OPTIONAL { ?seturi skos:prefLabel ?setlabel } OPTIONAL { ?seturi fsd:open ?setopen } FILTER (?seturi = <" + str(set_uri)+">) }"):
+ self.subsetcache[str(row.setid)] = {'uri': str(row.seturi), 'id': str(row.setid), 'label': str(row.setlabel) if row.setlabel else "", 'open': bool(row.setopen) }
+ return self.subsetcache[str(row.setid)]
+ raise DeepValidationError("Unable to find subset (set_uri=" + str(set_uri)+")")
+
+ def orderedclasses(self, set_uri_or_id=None, nestedhierarchy=False):
+ """Higher-order generator function that yields class information in the right order, combines calls to :meth:`SetDefinition.classes` and :meth:`SetDefinition.classorder`"""
+ classes = self.classes(set_uri_or_id, nestedhierarchy)
+ for classid in self.classorder(classes):
+ yield classes[classid]
+
+ def __iter__(self):
+ """Alias for :meth:`SetDefinition.orderedclasses`"""
+ return self.orderedclasses()
+
+ def classes(self, set_uri_or_id=None, nestedhierarchy=False):
+ """Returns a dictionary of classes for the specified (sub)set (if None, default, the main set is selected)"""
+ if set_uri_or_id and set_uri_or_id.startswith(('http://','https://')):
+ set_uri = set_uri_or_id
+ else:
+ set_uri = self.get_set_uri(set_uri_or_id)
+
+ assert set_uri is not None
+
+ classes= {}
+ uri2idmap = {}
+ for row in self.graph.query("SELECT ?classuri ?classid ?classlabel ?parentclass ?seqnr WHERE { ?classuri rdf:type skos:Concept ; skos:notation ?classid. <" + str(set_uri) + "> skos:member ?classuri . OPTIONAL { ?classuri skos:prefLabel ?classlabel } OPTIONAL { ?classuri skos:broader ?parentclass } OPTIONAL { ?classuri fsd:sequenceNumber ?seqnr } }"):
+ classinfo = {'uri': str(row.classuri), 'id': str(row.classid),'label': str(row.classlabel) if row.classlabel else "" }
+ if nestedhierarchy:
+ uri2idmap[str(row.classuri)] = str(row.classid)
+ if row.parentclass:
+ classinfo['parentclass'] = str(row.parentclass) #uri
+ if row.seqnr:
+ classinfo['seqnr'] = int(row.seqnr)
+ classes[str(row.classid)] = classinfo
+
+ if nestedhierarchy:
+ #build hierarchy
+ removekeys = []
+ for classid, classinfo in classes.items():
+ if 'parentclass' in classinfo:
+ removekeys.append(classid)
+ parentclassid = uri2idmap[classinfo['parentclass']]
+ if 'subclasses' not in classes[parentclassid]:
+ classes[parentclassid]['subclasses'] = {}
+ classes[parentclassid]['subclasses'][classid] = classinfo
+ for key in removekeys:
+ del classes[key]
+ return classes
+
+ def classorder(self,classes):
+ """Return a list of class IDs in order for presentational purposes: order is determined first and foremost by explicit ordering, else alphabetically by label or as a last resort by class ID"""
+ return [ classid for classid, classitem in sorted( ((classid, classitem) for classid, classitem in classes.items() if 'seqnr' in classitem) , key=lambda pair: pair[1]['seqnr'] )] + \
+ [ classid for classid, classitem in sorted( ((classid, classitem) for classid, classitem in classes.items() if 'seqnr' not in classitem) , key=lambda pair: pair[1]['label'] if 'label' in pair[1] else pair[1]['id']) ]
+
+
+
+ def subsets(self, set_uri_or_id=None):
+ if set_uri_or_id and set_uri_or_id.startswith(('http://', 'https://')):
+ set_uri = set_uri_or_id
+ else:
+ set_uri = self.get_set_uri(set_uri_or_id)
+
+ assert set_uri is not None
+
+ for row in self.graph.query("SELECT ?seturi ?setid ?setlabel ?setopen WHERE { ?seturi rdf:type skos:Collection . <" + str(set_uri) + "> skos:member ?seturi . OPTIONAL { ?seturi skos:notation ?setid } OPTIONAL { ?seturi skos:prefLabel ?setlabel } OPTIONAL { ?seturi fsd:open ?setopen } }"):
+ yield {'uri': str(row.seturi), 'id': str(row.setid), 'label': str(row.setlabel) if row.setlabel else "", 'open': bool(row.setopen) }
+
+ def json(self):
+ data = {'subsets': {}}
+ setinfo = self.mainset()
+ #backward compatibility, set type:
+ if setinfo['open']:
+ setinfo['type'] = 'open'
+ else:
+ setinfo['type'] = 'closed'
+ data.update(setinfo)
+ classes = self.classes()
+ data['classes'] = classes
+ data['classorder'] = self.classorder(classes)
+ for subsetinfo in self.subsets():
+ #backward compatibility, set type:
+ if subsetinfo['open']:
+ subsetinfo['type'] = 'open'
+ else:
+ subsetinfo['type'] = 'closed'
+ data['subsets'][subsetinfo['id']] = subsetinfo
+ classes = self.classes(subsetinfo['uri'])
+ data['subsets'][subsetinfo['id']]['classes'] = classes
+ data['subsets'][subsetinfo['id']]['classorder'] = self.classorder(classes)
+ return data
diff --git a/pynlpl/formats/fql.py b/pynlpl/formats/fql.py
index 78e8807..eed347d 100644
--- a/pynlpl/formats/fql.py
+++ b/pynlpl/formats/fql.py
@@ -35,8 +35,8 @@ MASK_LITERAL = 1
MASK_EXPRESSION = 2
MAXEXPANSION = 99
-FOLIAVERSION = '1.3.1'
-FQLVERSION = '0.3.1'
+FOLIAVERSION = '1.4.0'
+FQLVERSION = '0.4.0'
class SyntaxError(Exception):
pass
@@ -224,8 +224,8 @@ class Filter(object): #WHERE ....
operator = q[i+1]
if q[i] == "class":
v = lambda x,y='cls': getattr(x,y)
- elif q[i] in ("text","value"):
- v = lambda x,y='text': getattr(x,'value') if isinstance(x, (folia.Description, folia.Comment, folia.Content)) else getattr(x,'text')()
+ elif q[i] in ("text","value","phon"):
+ v = lambda x,y='text': getattr(x,'value') if isinstance(x, (folia.Description, folia.Comment, folia.Content)) else getattr(x,'phon') if isinstance(x,folia.PhonContent) else getattr(x,'text')()
else:
v = lambda x,y=q[i]: getattr(x,y)
if q[i] == 'confidence':
@@ -463,8 +463,9 @@ class Selector(object):
selector.Class = candidate.__class__
if not selector.filter or selector.filter(query,candidate, debug):
if debug: print("[FQL EVALUATION DEBUG] Select - Yielding (by ID) ", repr(candidate),file=sys.stderr)
- yield candidate, None
+ yield candidate, e
except KeyError:
+ if debug: print("[FQL EVALUATION DEBUG] Select - Selecting by ID failed for ID " + selector.id,file=sys.stderr)
pass #silently ignore ID mismatches
elif selector.Class == "ALL":
for candidate in e:
@@ -558,6 +559,9 @@ class Span(object):
def __init__(self, targets, intervals = []):
self.targets = targets #Selector instances making up the span
+ def __len__(self):
+ return len(self.targets)
+
@staticmethod
def parse(q, i=0):
targets = []
@@ -569,6 +573,9 @@ class Span(object):
elif q.kw(i,"&"):
#we're gonna have more targets
i += 1
+ elif q.kw(i,"NONE"):
+ #empty span
+ return Span([]), i+1
else:
break
@@ -582,179 +589,183 @@ class Span(object):
backtrack = []
l = len(self.targets)
+ if l == 0:
+ #span is explicitly empty, this is allowed in RESPAN context
+ if debug: print("[FQL EVALUATION DEBUG] Span - Yielding explicitly empty SpanSet",file=sys.stderr)
+ yield SpanSet()
+ else:
+ #find the first non-optional element, it will be our pivot:
+ pivotindex = None
+ for i, target in enumerate(self.targets):
+ if self.targets[i].id or not self.targets[i].expansion or self.targets[i].expansion[0] > 0:
+ pivotindex = i
+ break
+ if pivotindex is None:
+ raise QueryError("All parts in the SPAN expression are optional, at least one non-optional component is required")
- #find the first non-optional element, it will be our pivot:
- pivotindex = None
- for i, target in enumerate(self.targets):
- if self.targets[i].id or not self.targets[i].expansion or self.targets[i].expansion[0] > 0:
- pivotindex = i
- break
- if pivotindex is None:
- raise QueryError("All parts in the SPAN expression are optional, at least one non-optional component is required")
+ #get first target
+ for element, target in self.targets[pivotindex](query, contextselector, recurse,debug):
+ if debug: print("[FQL EVALUATION DEBUG] Span - First item of span found (pivotindex=" + str(pivotindex) + ",l=" + str(l) + "," + str(repr(element)) + ")",file=sys.stderr)
+                    spanset = SpanSet() #element is added later
- #get first target
- for element, target in self.targets[pivotindex](query, contextselector, recurse,debug):
- if debug: print("[FQL EVALUATION DEBUG] Span - First item of span found (pivotindex=" + str(pivotindex) + ",l=" + str(l) + "," + str(repr(element)) + ")",file=sys.stderr)
- spanset = SpanSet() #elemnent is added later
+ match = True #we attempt to disprove this
- match = True #we attempt to disprove this
+ #now see if consecutive elements match up
- #now see if consecutive elements match up
+ #--- matching prior to pivot -------
- #--- matching prior to pivot -------
+ #match optional elements before pivotindex
+ i = pivotindex
+ currentelement = element
+ while i > 0:
+ i -= 1
+ if i < 0: break
+ selector = self.targets[i]
+ minmatches = selector.expansion[0]
+ assert minmatches == 0 #everything before pivot has to have minmatches 0
+ maxmatches = selector.expansion[1]
+ done = False
- #match optional elements before pivotindex
- i = pivotindex
- currentelement = element
- while i > 0:
- i -= 1
- if i < 0: break
- selector = self.targets[i]
- minmatches = selector.expansion[0]
- assert minmatches == 0 #everything before pivot has to have minmatches 0
- maxmatches = selector.expansion[1]
- done = False
+ matches = 0
+ while True:
+ prevelement = element
+ element = element.previous(selector.Class, None)
+ if not element or (target and target not in element.ancestors()):
+ if debug: print("[FQL EVALUATION DEBUG] Span - Prior element not found or out of scope",file=sys.stderr)
+ done = True #no more elements left
+ break
+ elif element and not selector.match(query, element,debug):
+ if debug: print("[FQL EVALUATION DEBUG] Span - Prior element does not match filter",file=sys.stderr)
+ element = prevelement #reset
+ break
- matches = 0
- while True:
- prevelement = element
- element = element.previous(selector.Class, None)
- if not element or (target and target not in element.ancestors()):
- if debug: print("[FQL EVALUATION DEBUG] Span - Prior element not found or out of scope",file=sys.stderr)
- done = True #no more elements left
- break
- elif element and not selector.match(query, element,debug):
- if debug: print("[FQL EVALUATION DEBUG] Span - Prior element does not match filter",file=sys.stderr)
- element = prevelement #reset
- break
+ if debug: print("[FQL EVALUATION DEBUG] Span - Prior element matches",file=sys.stderr)
+ #we have a match
+ matches += 1
+ spanset.insert(0,element)
+ if matches >= maxmatches:
+ if debug: print("[FQL EVALUATION DEBUG] Span - Maximum threshold reached for span selector " + str(i) + ", breaking", file=sys.stderr)
+ break
- if debug: print("[FQL EVALUATION DEBUG] Span - Prior element matches",file=sys.stderr)
- #we have a match
- matches += 1
- spanset.insert(0,element)
- if matches >= maxmatches:
- if debug: print("[FQL EVALUATION DEBUG] Span - Maximum threshold reached for span selector " + str(i) + ", breaking", file=sys.stderr)
+ if done:
break
- if done:
- break
-
- #--- matching pivot and selectors after pivot -------
+ #--- matching pivot and selectors after pivot -------
- done = False #are we done with this selector?
- element = currentelement
- i = pivotindex - 1 #loop does +1 at the start of each iteration, we want to start with the pivotindex
- while i < l:
- i += 1
- if i == l:
- if debug: print("[FQL EVALUATION DEBUG] Span - No more selectors to try",i,l, file=sys.stderr)
- break
- selector = self.targets[i]
- if selector.id: #selection by ID, don't care about consecutiveness
- try:
- element = query.doc[selector.id]
- if debug: print("[FQL EVALUATION DEBUG] Span - Obtained subsequent span item from ID: ", repr(element), file=sys.stderr)
- except KeyError:
- if debug: print("[FQL EVALUATION DEBUG] Span - Obtained subsequent with specified ID does not exist ", file=sys.stderr)
- match = False
+ done = False #are we done with this selector?
+ element = currentelement
+ i = pivotindex - 1 #loop does +1 at the start of each iteration, we want to start with the pivotindex
+ while i < l:
+ i += 1
+ if i == l:
+ if debug: print("[FQL EVALUATION DEBUG] Span - No more selectors to try",i,l, file=sys.stderr)
break
- if element and not selector.match(query, element,debug):
- if debug: print("[FQL EVALUATION DEBUG] Span - Subsequent element does not match filter",file=sys.stderr)
- else:
- spanset.append(element)
-
- else: #element must be consecutive
- if selector.expansion:
- minmatches = selector.expansion[0]
- maxmatches = selector.expansion[1]
- else:
- minmatches = maxmatches = 1
-
- if debug: print("[FQL EVALUATION DEBUG] Span - Preparing to match selector " + str(i) + " of span, expansion={" + str(minmatches) + "," + str(maxmatches) + "}", file=sys.stderr)
- matches = 0
-
- while True:
- submatch = True #does the element currenty under consideration match? (the match variable is reserved for the entire match)
- done = False #are we done with this span selector?
- holdelement = False #do not go to next element
-
- if debug: print("[FQL EVALUATION DEBUG] Span - Processing element with span selector " + str(i) + ": ", repr(element), file=sys.stderr)
-
- if not element or (target and target not in element.ancestors()):
- if debug:
- if not element:
- print("[FQL EVALUATION DEBUG] Span - Element not found",file=sys.stderr)
- elif target and not target in element.ancestors():
- print("[FQL EVALUATION DEBUG] Span - Element out of scope",file=sys.stderr)
- submatch = False
- elif element and not selector.match(query, element,debug):
- if debug: print("[FQL EVALUATION DEBUG] Span - Element does not match filter",file=sys.stderr)
- submatch = False
-
- if submatch:
- matches += 1
- if debug: print("[FQL EVALUATION DEBUG] Span - Element is a match, got " + str(matches) + " match(es) now", file=sys.stderr)
-
- if matches > minmatches:
- #check if the next selector(s) match too, then we have a point where we might branch two ways
- #j = 1
- #while i+j < len(self.targets):
- # nextselector = self.targets[i+j]
- # if nextselector.match(query, element,debug):
- # #save this point for backtracking, when we get stuck, we'll roll back to this point
- # backtrack.append( (i+j, prevelement, copy(spanset) ) ) #using prevelement, nextelement will be recomputed after backtracking, using different selector
- # if not nextselector.expansion or nextselector.expansion[0] > 0:
- # break
- # j += 1
- #TODO: implement
- pass
- elif matches < minmatches:
- if debug: print("[FQL EVALUATION DEBUG] Span - Minimum threshold not reached yet for span selector " + str(i), file=sys.stderr)
-
+ selector = self.targets[i]
+ if selector.id: #selection by ID, don't care about consecutiveness
+ try:
+ element = query.doc[selector.id]
+ if debug: print("[FQL EVALUATION DEBUG] Span - Obtained subsequent span item from ID: ", repr(element), file=sys.stderr)
+ except KeyError:
+ if debug: print("[FQL EVALUATION DEBUG] Span - Obtained subsequent with specified ID does not exist ", file=sys.stderr)
+ match = False
+ break
+ if element and not selector.match(query, element,debug):
+ if debug: print("[FQL EVALUATION DEBUG] Span - Subsequent element does not match filter",file=sys.stderr)
+ else:
spanset.append(element)
- if matches >= maxmatches:
- if debug: print("[FQL EVALUATION DEBUG] Span - Maximum threshold reached for span selector " + str(i) + ", breaking", file=sys.stderr)
- done = True #done with this selector
+
+ else: #element must be consecutive
+ if selector.expansion:
+ minmatches = selector.expansion[0]
+ maxmatches = selector.expansion[1]
else:
- if matches < minmatches:
- #can we backtrack?
- if backtrack: #(not reached currently)
- if debug: print("[FQL EVALUATION DEBUG] Span - Backtracking",file=sys.stderr)
- index, element, spanset = backtrack.pop()
- i = index - 1 #next iteration will do +1 again
- match = True #default
- continue
+ minmatches = maxmatches = 1
+
+ if debug: print("[FQL EVALUATION DEBUG] Span - Preparing to match selector " + str(i) + " of span, expansion={" + str(minmatches) + "," + str(maxmatches) + "}", file=sys.stderr)
+ matches = 0
+
+ while True:
+                            submatch = True #does the element currently under consideration match? (the match variable is reserved for the entire match)
+ done = False #are we done with this span selector?
+ holdelement = False #do not go to next element
+
+ if debug: print("[FQL EVALUATION DEBUG] Span - Processing element with span selector " + str(i) + ": ", repr(element), file=sys.stderr)
+
+ if not element or (target and target not in element.ancestors()):
+ if debug:
+ if not element:
+ print("[FQL EVALUATION DEBUG] Span - Element not found",file=sys.stderr)
+ elif target and not target in element.ancestors():
+ print("[FQL EVALUATION DEBUG] Span - Element out of scope",file=sys.stderr)
+ submatch = False
+ elif element and not selector.match(query, element,debug):
+ if debug: print("[FQL EVALUATION DEBUG] Span - Element does not match filter",file=sys.stderr)
+ submatch = False
+
+ if submatch:
+ matches += 1
+ if debug: print("[FQL EVALUATION DEBUG] Span - Element is a match, got " + str(matches) + " match(es) now", file=sys.stderr)
+
+ if matches > minmatches:
+ #check if the next selector(s) match too, then we have a point where we might branch two ways
+ #j = 1
+ #while i+j < len(self.targets):
+ # nextselector = self.targets[i+j]
+ # if nextselector.match(query, element,debug):
+ # #save this point for backtracking, when we get stuck, we'll roll back to this point
+ # backtrack.append( (i+j, prevelement, copy(spanset) ) ) #using prevelement, nextelement will be recomputed after backtracking, using different selector
+ # if not nextselector.expansion or nextselector.expansion[0] > 0:
+ # break
+ # j += 1
+ #TODO: implement
+ pass
+ elif matches < minmatches:
+ if debug: print("[FQL EVALUATION DEBUG] Span - Minimum threshold not reached yet for span selector " + str(i), file=sys.stderr)
+
+ spanset.append(element)
+ if matches >= maxmatches:
+ if debug: print("[FQL EVALUATION DEBUG] Span - Maximum threshold reached for span selector " + str(i) + ", breaking", file=sys.stderr)
+ done = True #done with this selector
+ else:
+ if matches < minmatches:
+ #can we backtrack?
+ if backtrack: #(not reached currently)
+ if debug: print("[FQL EVALUATION DEBUG] Span - Backtracking",file=sys.stderr)
+ index, element, spanset = backtrack.pop()
+ i = index - 1 #next iteration will do +1 again
+ match = True #default
+ continue
+ else:
+ #nope, all is lost, we have no match
+ if debug: print("[FQL EVALUATION DEBUG] Span - Minimum threshold could not be attained for span selector " + str(i), file=sys.stderr)
+ match = False
+ break
else:
- #nope, all is lost, we have no match
- if debug: print("[FQL EVALUATION DEBUG] Span - Minimum threshold could not be attained for span selector " + str(i), file=sys.stderr)
- match = False
+ if debug: print("[FQL EVALUATION DEBUG] Span - No match for span selector " + str(i) + ", but no problem since matching threshold was already reached", file=sys.stderr)
+ holdelement = True
+ done = True
break
- else:
- if debug: print("[FQL EVALUATION DEBUG] Span - No match for span selector " + str(i) + ", but no problem since matching threshold was already reached", file=sys.stderr)
- holdelement = True
- done = True
- break
- if not holdelement:
- prevelement = element
- #get next element
- element = element.next(selector.Class, None)
- if debug: print("[FQL EVALUATION DEBUG] Span - Selecting next element for next round", repr(element), file=sys.stderr)
+ if not holdelement:
+ prevelement = element
+ #get next element
+ element = element.next(selector.Class, None)
+ if debug: print("[FQL EVALUATION DEBUG] Span - Selecting next element for next round", repr(element), file=sys.stderr)
- if done or not match:
- if debug: print("[FQL EVALUATION DEBUG] Span - Done with span selector " + str(i), repr(element), file=sys.stderr)
- break
+ if done or not match:
+ if debug: print("[FQL EVALUATION DEBUG] Span - Done with span selector " + str(i), repr(element), file=sys.stderr)
+ break
- if not match: break
+ if not match: break
- if match:
- if debug: print("[FQL EVALUATION DEBUG] Span - Span found, returning spanset (" + repr(spanset) + ")",file=sys.stderr)
- yield spanset
- else:
- if debug: print("[FQL EVALUATION DEBUG] Span - Span not found",file=sys.stderr)
+ if match:
+ if debug: print("[FQL EVALUATION DEBUG] Span - Span found, returning spanset (" + repr(spanset) + ")",file=sys.stderr)
+ yield spanset
+ else:
+ if debug: print("[FQL EVALUATION DEBUG] Span - Span not found",file=sys.stderr)
@@ -859,7 +870,7 @@ class Target(object): #FOR/IN... expression
if debug: print("[FQL EVALUATION DEBUG] Target - Matched end! Breaking after yielding...",e, file=sys.stderr)
started = False
dobreak = True
- if debug: print("[FQL EVALUATION DEBUG] Target - Yielding ",e, file=sys.stderr)
+ if debug: print("[FQL EVALUATION DEBUG] Target - Yielding ",repr(e), file=sys.stderr)
yield e
if dobreak and not self.repeat:
break
@@ -1069,12 +1080,12 @@ class Correction(object): #AS CORRECTION/SUGGESTION expression...
inheritchildren = []
if focus and not self.bare: #copy all data within
inheritchildren = list(focus.copychildren(query.doc, True))
- if action.action == "EDIT" and 'respan' in action.extra:
+ if action.action == "EDIT" and action.span: #respan
#delete all word references from the copy first, we will add new ones
inheritchildren = [ c for c in inheritchildren if not isinstance(c, folia.WordReference) ]
if not isinstance(focus, folia.AbstractSpanAnnotation): raise QueryError("Can only perform RESPAN on span annotation elements!")
contextselector = target if target else query.doc
- spanset = next(action.extra['respan'](query, contextselector, True, debug)) #there can be only one
+ spanset = next(action.span(query, contextselector, True, debug)) #there can be only one
for w in spanset:
inheritchildren.append(w)
@@ -1315,7 +1326,7 @@ class Correction(object): #AS CORRECTION/SUGGESTION expression...
def getassignments(q, i, assignments, focus=None):
l = len(q)
while i < l:
- if q.kw(i, ('id','set','annotator','class','n')):
+ if q.kw(i, ('id','set','subset','annotator','class','n')):
if q[i+1] == 'NONE':
assignments[q[i]] = None
else:
@@ -1340,9 +1351,11 @@ def getassignments(q, i, assignments, focus=None):
else:
raise SyntaxError("Invalid value for annotatortype: " + str(q[i+1]))
i+=2
- elif q.kw(i,'text'):
+ elif q.kw(i,('text','value','phon')):
if not focus is None and focus.Class in (folia.TextContent, folia.Description, folia.Comment):
key = 'value'
+ elif not focus is None and focus.Class is folia.PhonContent:
+ key = 'phon'
else:
key = 'text'
assignments[key] = q[i+1]
@@ -1377,8 +1390,7 @@ class Action(object): #Action expression
self.form = None
self.subactions = []
self.nextaction = None
- self.respan = []
- self.extra = {}
+ self.span = None #encodes an extra SPAN/RESPAN action
@staticmethod
@@ -1410,8 +1422,8 @@ class Action(object): #Action expression
#we have enough to set up the action now
action = Action(action, focus, assignments)
- if action.action == "EDIT" and q.kw(i,"RESPAN"):
- action.extra['respan'], i = Span.parse(q,i+1)
+ if action.action in ("EDIT","ADD", "APPEND","PREPEND") and q.kw(i,("RESPAN","SPAN")):
+ action.span, i = Span.parse(q,i+1)
done = False
while not done:
@@ -1565,10 +1577,13 @@ class Action(object): #Action expression
if action.action == "EDIT":
if debug: print("[FQL EVALUATION DEBUG] Action - Applying EDIT to focus ", repr(focus),file=sys.stderr)
for attr, value in action.assignments.items():
- if attr in ("text","value"):
+ if attr in ("text","value","phon"):
if isinstance(focus, (folia.Description, folia.Comment, folia.Content)):
if debug: print("[FQL EVALUATION DEBUG] Action - setting value ("+ value+ ") on focus ", repr(focus),file=sys.stderr)
focus.value = value
+ elif isinstance(focus, (folia.PhonContent)):
+ if debug: print("[FQL EVALUATION DEBUG] Action - setphon("+ value+ ") on focus ", repr(focus),file=sys.stderr)
+ focus.setphon(value)
else:
if debug: print("[FQL EVALUATION DEBUG] Action - settext("+ value+ ") on focus ", repr(focus),file=sys.stderr)
focus.settext(value)
@@ -1578,9 +1593,9 @@ class Action(object): #Action expression
else:
if debug: print("[FQL EVALUATION DEBUG] Action - " + attr + " = " + value + " on focus ", repr(focus),file=sys.stderr)
setattr(focus, attr, value)
- if 'respan' in action.extra:
+ if action.span is not None: #respan
if not isinstance(focus, folia.AbstractSpanAnnotation): raise QueryError("Can only perform RESPAN on span annotation elements!")
- spanset = next(action.extra['respan'](query, contextselector, True, debug)) #there can be only one
+ spanset = next(action.span(query, contextselector, True, debug)) #there can be only one
focus.setspan(*spanset)
query._touch(focus)
@@ -1623,8 +1638,8 @@ class Action(object): #Action expression
raise QueryError("Focus of action has no class!")
isspan = issubclass(action.focus.Class, folia.AbstractSpanAnnotation)
-
- if not 'set' in action.assignments and action.focus.Class not in (folia.Description, folia.Comment):
+ isspanrole = issubclass(action.focus.Class, folia.AbstractSpanRole)
+ if 'set' not in action.assignments and action.focus.Class not in (folia.Description, folia.Comment, folia.Feature) and not isspanrole:
if action.focus.set and action.focus.set != "undefined":
action.assignments['set'] = action.focus.set
elif action.focus.Class.XMLTAG in query.defaultsets:
@@ -1646,6 +1661,8 @@ class Action(object): #Action expression
if isinstance(target, SpanSet):
if action.action == "ADD" or action.action == "EDIT":
if debug: print("[FQL EVALUATION DEBUG] Action - Applying " + action.action + " of " + action.focus.Class.__name__ + " to target spanset " + repr(target),file=sys.stderr)
+ if action.span is not None and len(action.span) == 0:
+ action.assignments['emptyspan'] = True
focusselection.append( target[0].add(action.focus.Class, *target, **action.assignments) ) #handles span annotation too
query._touch(focusselection[-1])
else:
@@ -1674,6 +1691,12 @@ class Action(object): #Action expression
elif not any(x is target for x in constrainedtargetselection):
constrainedtargetselection.append(target)
+ if focusselection and action.span: #process SPAN keyword (ADD .. SPAN .. FOR .. rather than ADD ... FOR SPAN ..)
+ if not isspan: raise QueryError("Can only use SPAN with span annotation elements!")
+ for focus in focusselection:
+ spanset = next(action.span(query, contextselector, True, debug)) #there can be only one
+ focus.setspan(*spanset)
+
if focusselection and action.subactions and not substitution:
for subaction in action.subactions:
#check if set is declared, if not, auto-declare
@@ -1743,7 +1766,7 @@ class Context(object):
class Query(object):
"""This class represents an FQL query.
-
+
Selecting a word with a particular text is done as follows, ``doc`` is an instance of :class:`pynlpl.formats.folia.Document`::
query = fql.Query('SELECT w WHERE text = "house"')
@@ -1754,13 +1777,13 @@ class Query(object):
query = fql.Query('SELECT w WHERE text MATCHES "^house.*$"')
for word in query(doc):
- print(word)
+ print(word)
The classes of other annotation types can be easily queried as follows::
query = fql.Query('SELECT w WHERE :pos = "v"' AND :lemma = "be"')
for word in query(doc):
- print(word)
+ print(word)
You can constrain your queries to a particular target selection using the ``FOR`` keyword::
@@ -1778,8 +1801,8 @@ class Query(object):
query = fql.Query('SELECT entity WHERE class = "person" FOR w WHERE text != "John" FOR div ID "section.21"')
for entity in query(doc):
- print(entity)
-
+ print(entity)
+
Sets are specified using the **OF** keyword, it can be omitted if there is only one for the annotation type, but will be required otherwise::
query = fql.Query('SELECT su OF "http://some/syntax/set" WHERE class = "np"')
@@ -1820,6 +1843,7 @@ class Query(object):
i += 2
defaults = {}
+ decset = None
if q.kw(i,"OF") and q[i+1]:
i += 1
decset = q[i]
@@ -1827,6 +1851,9 @@ class Query(object):
if q.kw(i,"WITH"):
i = getassignments(q,i+1,defaults)
+ if not decset:
+ raise SyntaxError("DECLARE statement must state a set")
+
self.declarations.append( (Class, decset, defaults) )
if i < l:
diff --git a/pynlpl/formats/timbl.py b/pynlpl/formats/timbl.py
index 2a4e1a6..f6855f0 100644
--- a/pynlpl/formats/timbl.py
+++ b/pynlpl/formats/timbl.py
@@ -42,8 +42,6 @@ class TimblOutput(object):
def __iter__(self):
# Note: distance parsing (+v+di) works only if distributions (+v+db) are also enabled!
-
-
for line in self.stream:
endfvec = None
line = line.strip()
@@ -64,8 +62,8 @@ class TimblOutput(object):
#endfvec = segments.index("{")
except ValueError:
endfvec = None
-
- if endfvec > 2: #only for +v+db
+
+ if endfvec and endfvec > 2: # only for +v+db
try:
enddistr = segments.index('}',endfvec)
except ValueError:
@@ -106,4 +104,3 @@ class TimblOutput(object):
print("ERROR: pynlpl.input.timbl.TimblOutput -- Did not find class distribution for ", instance,file=stderr)
return Distribution(dist)
-
diff --git a/pynlpl/lm/srilm.py b/pynlpl/lm/srilm.py
index 9298bc0..a379de7 100644
--- a/pynlpl/lm/srilm.py
+++ b/pynlpl/lm/srilm.py
@@ -18,11 +18,26 @@ from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
-import srilmcc
+try:
+ import srilmcc
+except ImportError:
+ import warnings
+ warnings.warn("srilmcc module is not compiled")
+ srilmcc = None
+
from pynlpl.textprocessors import Windower
+
+class SRILMException(Exception):
+ """Base Exception for SRILM."""
+
+
class SRILM:
def __init__(self, filename, n):
+ if not srilmcc:
+ raise SRILMException(
+ "SRILM is not downloaded and compiled."
+ "Please follow the instructions in makesrilmcc")
self.model = srilmcc.LanguageModel(filename, n)
self.n = n
@@ -55,4 +70,3 @@ class SRILM:
raise KeyError
else:
raise Exception("Not an " + str(self.n) + "-gram")
-
diff --git a/pynlpl/tests/folia.py b/pynlpl/tests/folia.py
index 4eb07e8..870b534 100755
--- a/pynlpl/tests/folia.py
+++ b/pynlpl/tests/folia.py
@@ -36,7 +36,7 @@ import bz2
import re
-FOLIARELEASE = "v1.3.2.52"
+FOLIARELEASE = "v1.4.0.53"
if os.path.exists('../../FoLiA'):
FOLIAPATH = '../../FoLiA/'
@@ -266,8 +266,8 @@ class Test2Sanity(unittest.TestCase):
self.assertEqual( w.annotation(folia.PosAnnotation).cls, 'N(soort,ev,basis,onz,stan)' ) #cls is used everywhere instead of class, since class is a reserved keyword in python
self.assertEqual( w.pos(),'N(soort,ev,basis,onz,stan)' ) #w.pos() is just a direct shortcut for getting the class
- self.assertEqual( w.annotation(folia.PosAnnotation).set, 'cgn-combinedtags' )
- self.assertEqual( w.annotation(folia.PosAnnotation).annotator, 'tadpole' )
+ self.assertEqual( w.annotation(folia.PosAnnotation).set, 'https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/frog-mbpos-cgn' )
+ self.assertEqual( w.annotation(folia.PosAnnotation).annotator, 'frog' )
self.assertEqual( w.annotation(folia.PosAnnotation).annotatortype, folia.AnnotatorType.AUTO )
@@ -366,9 +366,9 @@ class Test2Sanity(unittest.TestCase):
"""Sanity Check - Subtoken annotation (part of speech)"""
w= self.doc['WR-P-E-J-0000000001.p.1.s.2.w.5']
p = w.annotation(folia.PosAnnotation)
- self.assertEqual( p.feat('role'), 'pv' )
- self.assertEqual( p.feat('tense'), 'tgw' )
- self.assertEqual( p.feat('form'), 'met-t' )
+ self.assertEqual( p.feat('wvorm'), 'pv' )
+ self.assertEqual( p.feat('pvtijd'), 'tgw' )
+ self.assertEqual( p.feat('pvagr'), 'met-t' )
def test019_alignment(self):
"""Sanity Check - Alignment in same document"""
@@ -506,6 +506,12 @@ class Test2Sanity(unittest.TestCase):
self.assertTrue( isinstance(prevw, folia.Word) )
self.assertEqual( prevw.text(), "." )
+ def test021c_previousword_constrained(self):
+ """Sanity Check - Obtaining non-existing previous word with scope constraint"""
+ w = self.doc['WR-P-E-J-0000000001.p.1.s.4.w.1']
+ prevw = w.previous(folia.Word, [folia.Sentence])
+ self.assertEqual(prevw, None)
+
def test022_nextword(self):
"""Sanity Check - Obtaining next word"""
w = self.doc['WR-P-E-J-0000000001.p.1.s.2.w.7']
@@ -1618,7 +1624,7 @@ class Test4Edit(unittest.TestCase):
self.assertTrue( isinstance(l, folia.LemmaAnnotation) )
self.assertEqual( l.cls, 'NAAM' )
- self.assertTrue( xmlcheck(w.xmlstring(), '<w xmlns="http://ilk.uvt.nl/folia" xml:id="WR-P-E-J-0000000001.p.1.s.2.w.11"><t>naam</t><pos class="N(soort,ev,basis,zijd,stan)" set="cgn-combinedtags"/><lemma class="naam" set="lemmas-nl"/><pos class="NOUN" set="adhocpos" annotatortype="auto" annotator="testscript"/><lemma set="adhoclemma" class="NAAM" datetime="1982-12-15T19:00:01" annotatortype="auto" annotator="testscript"/></w>') )
+ self.assertTrue( xmlcheck(w.xmlstring(), '<w xmlns="http://ilk.uvt.nl/folia" xml:id="WR-P-E-J-0000000001.p.1.s.2.w.11"><t>naam</t><pos class="N(soort,ev,basis,zijd,stan)" set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/frog-mbpos-cgn"/><lemma class="naam" set="lemmas-nl"/><pos class="NOUN" set="adhocpos" annotatortype="auto" annotator="testscript"/><lemma set="adhoclemma" class="NAAM" datetime="1982-12-15T19:00:01" annotatortype="auto" annotator="testscript"/></w>') )
def test002b_addannotation(self):
"""Edit Check - Adding a token annotation (pos, lemma) (instances generated on the fly)"""
@@ -1642,7 +1648,7 @@ class Test4Edit(unittest.TestCase):
self.assertTrue( isinstance(l, folia.LemmaAnnotation) )
self.assertEqual( l.cls, 'NAAM' )
- self.assertTrue( xmlcheck(w.xmlstring(), '<w xmlns="http://ilk.uvt.nl/folia" xml:id="WR-P-E-J-0000000001.p.1.s.2.w.11"><t>naam</t><pos class="N(soort,ev,basis,zijd,stan)" set="cgn-combinedtags"/><lemma class="naam" set="lemmas-nl"/><pos class="NOUN" set="adhocpos" annotatortype="auto" annotator="testscript"/><lemma class="NAAM" set="adhoclemma" annotatortype="auto" annotator="testscript"/></w>'))
+ self.assertTrue( xmlcheck(w.xmlstring(), '<w xmlns="http://ilk.uvt.nl/folia" xml:id="WR-P-E-J-0000000001.p.1.s.2.w.11"><t>naam</t><pos class="N(soort,ev,basis,zijd,stan)" set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/frog-mbpos-cgn"/><lemma class="naam" set="lemmas-nl"/><pos class="NOUN" set="adhocpos" annotatortype="auto" annotator="testscript"/><lemma class="NAAM" set="adhoclemma" annotatortype="auto" annotator="testscript"/></w>'))
def test002c_addannotation(self):
"""Edit Check - Adding a token annotation (pos, lemma) (using add instead of append)"""
@@ -1666,7 +1672,7 @@ class Test4Edit(unittest.TestCase):
self.assertTrue( isinstance(l, folia.LemmaAnnotation) )
self.assertEqual( l.cls, 'NAAM' )
- self.assertTrue( xmlcheck(w.xmlstring(), '<w xmlns="http://ilk.uvt.nl/folia" xml:id="WR-P-E-J-0000000001.p.1.s.2.w.11"><t>naam</t><pos class="N(soort,ev,basis,zijd,stan)" set="cgn-combinedtags"/><lemma class="naam" set="lemmas-nl"/><pos class="NOUN" set="adhocpos" annotatortype="auto" annotator="testscript"/><lemma set="adhoclemma" class="NAAM" datetime="1982-12-15T19:00:01" annotatortype="auto" annotator="testscript"/></w>') )
+ self.assertTrue( xmlcheck(w.xmlstring(), '<w xmlns="http://ilk.uvt.nl/folia" xml:id="WR-P-E-J-0000000001.p.1.s.2.w.11"><t>naam</t><pos class="N(soort,ev,basis,zijd,stan)" set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/frog-mbpos-cgn"/><lemma class="naam" set="lemmas-nl"/><pos class="NOUN" set="adhocpos" annotatortype="auto" annotator="testscript"/><lemma set="adhoclemma" class="NAAM" datetime="1982-12-15T19:00:01" annotatortype="auto" annotator="testscript"/></w>') )
def test004_addinvalidannotation(self):
"""Edit Check - Adding a token default-set annotation that clashes with the existing one"""
@@ -1895,6 +1901,30 @@ class Test4Edit(unittest.TestCase):
self.assertTrue( xmlcheck(l.xmlstring(), '<entities xmlns="http://ilk.uvt.nl/folia"><correction xml:id="example.cell.correction.1" class="wrongclass"><new><entity class="loc" set="http://raw.github.com/proycon/folia/master/setdefinitions/namedentities.foliaset.xml"><wref t="Radboud" id="example.table.1.w.6"/><wref t="University" id="example.table.1.w.7"/><wref t="Nijmegen" id="example.table.1.w.8"/></entity></new><original auth="no"><entity xml:id="example.radboud.university.nijm [...]
+ def test013d_spanannot(self):
+ """Edit Check - Adding Span Annotation (entity, from sentence using add)"""
+ sentence = self.doc["WR-P-E-J-0000000001.p.1.s.4"]
+ word = self.doc["WR-P-E-J-0000000001.p.1.s.4.w.2"] #hoofdletter
+ word2 = self.doc["WR-P-E-J-0000000001.p.1.s.4.w.3"] #A
+ entity = sentence.add(folia.Entity, word, word2, cls="misc",set="http://raw.github.com/proycon/folia/master/setdefinitions/namedentities.foliaset.xml")
+
+ self.assertIsInstance(entity, folia.Entity)
+ self.assertTrue(xmlcheck(entity.parent.parent.xmlstring(),'<s xmlns="http://ilk.uvt.nl/folia" xml:id="WR-P-E-J-0000000001.p.1.s.4"><t>De hoofdletter A wordt gebruikt voor het originele handschrift.</t><t class="original">De hoofdletter A wordt gebruikt voor het originele handschrift.</t><t class="translate">Uppercase A is used for the original.</t><part xml:id="WR-P-E-J-0000000001.p.1.s.4.part.1"><w xml:id="WR-P-E-J-0000000001.p.1.s.4.w.1"><t offset="0">De</t><t class="original" [...]
+
+
+ def test013e_spanannot(self):
+ """Edit Check - Adding nested Span Annotation"""
+ word = self.doc["WR-P-E-J-0000000001.p.1.s.1.w.7"] #stamboom
+ for su in word.findspans(folia.SyntacticUnit):
+ if su.cls == 'pp':
+ parentspan = su
+ self.assertIsInstance(parentspan, folia.SyntacticUnit)
+ self.assertEqual(parentspan.wrefs(recurse=False) , [self.doc["WR-P-E-J-0000000001.p.1.s.1.w.6"],self.doc["WR-P-E-J-0000000001.p.1.s.1.w.7"]]) #prior to adding
+ newspan = parentspan.add(folia.SyntacticUnit, word, cls='np')
+ self.assertEqual(parentspan.wrefs(recurse=False) , [self.doc["WR-P-E-J-0000000001.p.1.s.1.w.6"]]) #after adding, parent span wref gone (moved to child)
+ self.assertEqual(parentspan.wrefs(recurse=True) , [self.doc["WR-P-E-J-0000000001.p.1.s.1.w.6"],self.doc["WR-P-E-J-0000000001.p.1.s.1.w.7"]]) #result is still the same with recursion
+ self.assertEqual(newspan.wrefs() , [self.doc["WR-P-E-J-0000000001.p.1.s.1.w.7"]])
+
def test014_replace(self):
"""Edit Check - Replacing an annotation"""
word = self.doc['WR-P-E-J-0000000001.p.1.s.3.w.14']
@@ -2474,14 +2504,10 @@ class Test8Validation(unittest.TestCase):
doc = folia.Document(file=os.path.join(TMPDIR,'foliatest.xml'), loadsetdefinitions=True)
assert isinstance( doc.setdefinitions["http://raw.github.com/proycon/folia/master/setdefinitions/namedentities.foliaset.xml"], folia.SetDefinition)
- def test003_deepvalidation(self):
+class Test9Validation(unittest.TestCase):
+ def test001_deepvalidation(self):
"""Validation - Deep Validation"""
- try:
- doc = folia.Document(file=os.path.join(TMPDIR,'foliatest.xml'), deepvalidation=True, allowadhocsets=True)
- assert isinstance( doc.setdefinitions["http://raw.github.com/proycon/folia/master/setdefinitions/namedentities.foliaset.xml"], folia.SetDefinition)
- except NotImplementedError:
- print("Deep validation not implemented yet! (not failing over this)",file=sys.stderr)
- return
+ doc = folia.Document(file=os.path.join(FOLIAPATH,'test/example.deep.xml'), deepvalidation=True, allowadhocsets=True)
f = io.open(FOLIAPATH + '/test/example.xml', 'r',encoding='utf-8')
diff --git a/pynlpl/tests/fql.py b/pynlpl/tests/fql.py
index b675496..7bad0cd 100755
--- a/pynlpl/tests/fql.py
+++ b/pynlpl/tests/fql.py
@@ -83,6 +83,15 @@ Qselect_span2_returntarget = "SELECT entity OF \"http://raw.github.com/proycon/f
Qadd_span = "ADD entity OF \"http://raw.github.com/proycon/folia/master/setdefinitions/namedentities.foliaset.xml\" WITH class \"misc\" FOR SPAN ID \"WR-P-E-J-0000000001.p.1.s.4.w.2\" & ID \"WR-P-E-J-0000000001.p.1.s.4.w.3\""
Qadd_span_returntarget = "ADD entity OF \"http://raw.github.com/proycon/folia/master/setdefinitions/namedentities.foliaset.xml\" WITH class \"misc\" FOR SPAN ID \"WR-P-E-J-0000000001.p.1.s.4.w.2\" & ID \"WR-P-E-J-0000000001.p.1.s.4.w.3\" RETURN target"
Qadd_span_returnancestortarget = "ADD entity OF \"http://raw.github.com/proycon/folia/master/setdefinitions/namedentities.foliaset.xml\" WITH class \"misc\" FOR SPAN ID \"WR-P-E-J-0000000001.p.1.s.4.w.2\" & ID \"WR-P-E-J-0000000001.p.1.s.4.w.3\" RETURN ancestor-target"
+Qadd_span2 = "ADD entity OF \"http://raw.github.com/proycon/folia/master/setdefinitions/namedentities.foliaset.xml\" WITH class \"misc\" SPAN ID \"WR-P-E-J-0000000001.p.1.s.4.w.2\" & ID \"WR-P-E-J-0000000001.p.1.s.4.w.3\" FOR ID \"WR-P-E-J-0000000001.p.1.s.4\""
+Qadd_span3 = "ADD entity OF \"http://raw.github.com/proycon/folia/master/setdefinitions/namedentities.foliaset.xml\" WITH class \"misc\" RESPAN ID \"WR-P-E-J-0000000001.p.1.s.4.w.3\" FOR SPAN ID \"WR-P-E-J-0000000001.p.1.s.4.w.2\" & ID \"WR-P-E-J-0000000001.p.1.s.4.w.3\""
+Qadd_span4 = "ADD entity OF \"http://raw.github.com/proycon/folia/master/setdefinitions/namedentities.foliaset.xml\" WITH class \"misc\" RESPAN NONE FOR SPAN ID \"WR-P-E-J-0000000001.p.1.s.4.w.2\" & ID \"WR-P-E-J-0000000001.p.1.s.4.w.3\""
+
+Qadd_span_subqueries = "ADD dependency OF alpino-set WITH class \"test\" RESPAN NONE (ADD dep SPAN ID WR-P-E-J-0000000001.p.1.s.2.w.6) (ADD hd SPAN ID WR-P-E-J-0000000001.p.1.s.2.w.7) FOR SPAN ID WR-P-E-J-0000000001.p.1.s.2.w.6 & ID WR-P-E-J-0000000001.p.1.s.2.w.7 RETURN focus"
+Qedit_spanrole = "EDIT hd SPAN ID \"WR-P-E-J-0000000001.p.1.s.1.w.3\" & ID \"WR-P-E-J-0000000001.p.1.s.1.w.4\" & ID \"WR-P-E-J-0000000001.p.1.s.1.w.5\" FOR dependency ID \"WR-P-E-J-0000000001.p.1.s.1.dep.2\" RETURN target"
+Qedit_spanrole_id = "EDIT hd ID \"test\" SPAN ID \"WR-P-E-J-0000000001.p.1.s.1.w.3\" & ID \"WR-P-E-J-0000000001.p.1.s.1.w.4\" & ID \"WR-P-E-J-0000000001.p.1.s.1.w.5\" FOR dependency ID \"WR-P-E-J-0000000001.p.1.s.1.dep.2\" RETURN target"
+
+Qadd_nested_span = "ADD su OF \"syntax-set\" WITH class \"np\" SPAN ID \"WR-P-E-J-0000000001.p.1.s.1.w.4\" & ID \"WR-P-E-J-0000000001.p.1.s.1.w.5\" FOR ID \"WR-P-E-J-0000000001.p.1.s.1.su.0\""
Qalt = "EDIT lemma WHERE class = \"terweil\" WITH class \"terwijl\" (AS ALTERNATIVE WITH confidence 0.9)"
@@ -142,6 +151,11 @@ Qsuggest_insertion2 = "APPEND (AS CORRECTION OF \"http://raw.github.com/proycon/
Qcomment = "ADD comment WITH text \"This is our university!\" FOR entity ID \"example.radboud.university.nijmegen.org\""
+Qfeat = "SELECT feat WHERE subset = \"wvorm\" FOR pos WHERE class = \"WW(pv,tgw,met-t)\" FOR ID \"WR-P-E-J-0000000001.p.1.s.2.w.5\""
+Qfeat2 = "EDIT feat WHERE subset = \"wvorm\" WITH class \"inf\" FOR pos WHERE class = \"WW(pv,tgw,met-t)\" FOR ID \"WR-P-E-J-0000000001.p.1.s.2.w.5\""
+Qfeat3 = "ADD feat WITH subset \"wvorm\" class \"inf\" FOR pos WHERE class = \"WW(inf,vrij,zonder)\" FOR ID \"WR-P-E-J-0000000001.p.1.s.2.w.28\""
+Qfeat4 = "EDIT feat WHERE subset = \"strength\" AND class = \"strong\" WITH class \"verystrong\" FOR ID \"WR-P-E-J-0000000001.text.sentiment.1\""
+
class Test1UnparsedQuery(unittest.TestCase):
@@ -340,6 +354,7 @@ class Test3Evaluation(unittest.TestCase):
q = fql.Query(Qedittext4)
results = q(self.doc)
self.assertEqual(results[0].text(), "ter\nwijl")
+ self.assertEqual(results[0].xmlstring(), "<w xmlns=\"http://ilk.uvt.nl/folia\" xml:id=\"WR-P-E-J-0000000001.p.1.s.8.w.9\"><t>ter\nwijl</t><errordetection class=\"spelling\"/><pos class=\"VG(onder)\"/><lemma class=\"terweil\"/></w>")
def test13_subfilter(self):
q = fql.Query(Qhas)
@@ -454,6 +469,40 @@ class Test3Evaluation(unittest.TestCase):
results = q(self.doc)
self.assertIsInstance(results[0], folia.Part )
+ def test20d_add_span(self):
+ """Add span (using SPAN instead of FOR SPAN)"""
+ q = fql.Query(Qadd_span2)
+ results = q(self.doc)
+ self.assertIsInstance(results[0], folia.Entity)
+ self.assertEqual(results[0].cls, 'misc')
+ results = list(results[0].wrefs())
+ self.assertIsInstance(results[0], folia.Word)
+ self.assertEqual(results[0].text(), "hoofdletter")
+ self.assertIsInstance(results[1], folia.Word)
+ self.assertEqual(results[1].text(), "A")
+ self.assertEqual(len(results), 2)
+
+ def test20e_add_span(self):
+ """Add span (using RESPAN and FOR SPAN, immediately respanning)"""
+ q = fql.Query(Qadd_span3)
+ results = q(self.doc)
+ self.assertIsInstance(results[0], folia.Entity)
+ self.assertEqual(results[0].cls, 'misc')
+ results = list(results[0].wrefs())
+ self.assertIsInstance(results[0], folia.Word)
+ self.assertEqual(len(results), 1)
+ self.assertEqual(results[0].text(), "A")
+
+ def test20f_add_span(self):
+ """Add span (using RESPAN NONE, immediately respanning)"""
+ q = fql.Query(Qadd_span4)
+ results = q(self.doc)
+ self.assertIsInstance(results[0], folia.Entity)
+ self.assertEqual(results[0].cls, 'misc')
+ results = list(results[0].wrefs())
+ self.assertEqual(len(results), 0)
+
+
def test21_edit_alt(self):
"""Add alternative token annotation"""
q = fql.Query(Qalt)
@@ -704,6 +753,79 @@ class Test3Evaluation(unittest.TestCase):
self.assertEqual(results[0].value, "This is our university!")
self.assertEqual(results[0].parent.id, "example.radboud.university.nijmegen.org")
+ def test36_feature(self):
+ """Selecting a feature"""
+ q = fql.Query(Qfeat)
+ results = q(self.doc)
+ self.assertIsInstance(results[0], folia.Feature)
+ self.assertEqual(results[0].subset, "wvorm")
+ self.assertEqual(results[0].cls, "pv")
+
+ def test36b_feature(self):
+ """Editing a feature"""
+ q = fql.Query(Qfeat2)
+ results = q(self.doc)
+ self.assertIsInstance(results[0], folia.Feature)
+ self.assertEqual(results[0].subset, "wvorm")
+ self.assertEqual(results[0].cls, "inf")
+
+ def test36c_feature(self):
+ """Adding a feature"""
+ q = fql.Query(Qfeat3)
+ results = q(self.doc)
+ self.assertIsInstance(results[0], folia.Feature)
+ self.assertEqual(results[0].subset, "wvorm")
+ self.assertEqual(results[0].cls, "inf")
+
+ def test36d_feature(self):
+ """Editing a feature that has a predefined subset"""
+ q = fql.Query(Qfeat4)
+ results = q(self.doc)
+ self.assertIsInstance(results[0], folia.Feature)
+ self.assertEqual(results[0].subset, "strength")
+ self.assertEqual(results[0].cls, "verystrong")
+
+ def test37_subqueries(self):
+ """Adding a complex span annotation with span roles, using subqueries"""
+ q = fql.Query(Qadd_span_subqueries)
+ results = q(self.doc)
+ self.assertIsInstance(results[0], folia.Dependency)
+ self.assertEqual(results[0].cls, "test")
+ self.assertEqual(list(results[0].annotation(folia.Headspan).wrefs()), [ results[0].doc['WR-P-E-J-0000000001.p.1.s.2.w.7'] ] )
+ self.assertEqual(list(results[0].annotation(folia.DependencyDependent).wrefs()), [ results[0].doc['WR-P-E-J-0000000001.p.1.s.2.w.6'] ] )
+ self.assertEqual(results[0].ancestor(folia.AbstractStructureElement).id, 'WR-P-E-J-0000000001.p.1.s.2')
+
+ def test38_nested_span(self):
+ """Adding a nested span"""
+ q = fql.Query(Qadd_nested_span)
+ results = q(self.doc)
+ self.assertIsInstance(results[0], folia.SyntacticUnit)
+ self.assertIsInstance(results[0].parent, folia.SyntacticUnit)
+ self.assertEqual(results[0].parent.id, "WR-P-E-J-0000000001.p.1.s.1.su.0")
+ self.assertEqual(list(results[0].wrefs()), [ results[0].doc['WR-P-E-J-0000000001.p.1.s.1.w.4'],results[0].doc['WR-P-E-J-0000000001.p.1.s.1.w.5'] ] )
+
+ def test39_edit_spanrole(self):
+ """Editing a spanrole"""
+ q = fql.Query(Qedit_spanrole)
+ results = q(self.doc)
+ self.assertIsInstance(results[0], folia.Dependency)
+ self.assertEqual(list(results[0].annotation(folia.Headspan).wrefs()), [ results[0].doc['WR-P-E-J-0000000001.p.1.s.1.w.3'], results[0].doc['WR-P-E-J-0000000001.p.1.s.1.w.4'], results[0].doc['WR-P-E-J-0000000001.p.1.s.1.w.5'] ] )
+ self.assertEqual(results[0].ancestor(folia.AbstractStructureElement).id, 'WR-P-E-J-0000000001.p.1.s.1')
+
+ def test39b_edit_spanrole(self):
+ """Editing a spanrole (with ID)"""
+ #ID does not exist yet, we add it first:
+ q = fql.Query("SELECT hd FOR ID \"WR-P-E-J-0000000001.p.1.s.1.dep.2\"")
+ hd = q(self.doc)[0]
+ hd.id = "test"
+ self.doc.index["test"] = hd
+ #now the actual test:
+ q = fql.Query(Qedit_spanrole_id)
+ results = q(self.doc)
+ self.assertIsInstance(results[0], folia.Dependency)
+ self.assertEqual(list(results[0].annotation(folia.Headspan).wrefs()), [ results[0].doc['WR-P-E-J-0000000001.p.1.s.1.w.3'], results[0].doc['WR-P-E-J-0000000001.p.1.s.1.w.4'], results[0].doc['WR-P-E-J-0000000001.p.1.s.1.w.5'] ] )
+ self.assertEqual(results[0].ancestor(folia.AbstractStructureElement).id, 'WR-P-E-J-0000000001.p.1.s.1')
+
class Test4CQL(unittest.TestCase):
def setUp(self):
self.doc = folia.Document(string=FOLIAEXAMPLE)
diff --git a/setup.py b/setup.py
index c712603..5074c91 100755
--- a/setup.py
+++ b/setup.py
@@ -41,7 +41,7 @@ if sys.version > '3':
setup(
name = "PyNLPl",
- version = "1.0.9", #edit version in __init__.py as well and ensure tests/folia.py FOLIARELEASE points to the right version!
+ version = "1.1.2", #edit version in __init__.py as well and ensure tests/folia.py FOLIARELEASE points to the right version!
author = "Maarten van Gompel",
author_email = "proycon at anaproy.nl",
description = ("PyNLPl, pronounced as 'pineapple', is a Python library for Natural Language Processing. It contains various modules useful for common, and less common, NLP tasks. PyNLPl contains modules for basic tasks, clients for interfacting with server, and modules for parsing several file formats common in NLP, most notably FoLiA."),
@@ -63,6 +63,6 @@ setup(
zip_safe=False,
include_package_data=True,
package_data = {'pynlpl': ['tests/test.sh', 'tests/evaluation_timbl/*'] },
- install_requires=['lxml >= 2.2','httplib2 >= 0.6'],
+ install_requires=['lxml >= 2.2','httplib2 >= 0.6','rdflib'],
entry_points = entry_points
)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/python-pynlpl.git
More information about the debian-science-commits
mailing list