[frog] 01/03: New upstream version 0.13.8
Maarten van Gompel
proycon-guest at moszumanska.debian.org
Tue Nov 7 11:45:47 UTC 2017
This is an automated email from the git hooks/post-receive script.
proycon-guest pushed a commit to branch master
in repository frog.
commit 7f8c0f4dc45d6b2c37bb616adced1b130bccc0dc
Author: proycon <proycon at anaproy.nl>
Date: Tue Nov 7 12:33:45 2017 +0100
New upstream version 0.13.8
---
ChangeLog | 705 +++++++++++++++++++++++
Makefile.in | 20 +-
NEWS | 29 +-
README | 105 +---
aclocal.m4 | 1 -
bootstrap.sh | 27 +-
config.guess | 165 ++++--
config.h.in | 3 -
config.sub | 56 +-
configure | 504 +++++++---------
configure.ac | 84 +--
docs/Makefile.in | 12 +-
docs/frog.1 | 60 +-
include/Makefile.in | 12 +-
include/frog/FrogAPI.h | 17 +-
include/frog/Makefile.am | 5 +-
include/frog/Makefile.in | 17 +-
include/frog/Parser.h | 15 +-
include/frog/cgn_tagger_mod.h | 11 +-
include/frog/ckyparser.h | 4 +-
include/frog/csidp.h | 3 +-
include/frog/iob_tagger_mod.h | 19 +-
include/frog/mblem_mod.h | 10 +-
include/frog/mbma_brackets.h | 13 +-
include/frog/mbma_mod.h | 3 +-
include/frog/mwu_chunker_mod.h | 4 +-
include/frog/ner_tagger_mod.h | 31 +-
include/frog/{pos_tagger_mod.h => tagger_base.h} | 36 +-
include/frog/ucto_tokenizer_mod.h | 6 +-
install-sh | 23 +-
ltmain.sh | 39 +-
m4/Makefile.in | 12 +-
m4/ax_icu_check.m4 | 86 ---
m4/libtool.m4 | 27 +-
m4/ltsugar.m4 | 7 +-
m4/lt~obsolete.m4 | 7 +-
m4/pkg.m4 | 217 +++----
src/Frog.cxx | 190 ++++--
src/FrogAPI.cxx | 526 ++++++++++-------
src/Makefile.am | 6 +-
src/Makefile.in | 22 +-
src/Parser.cxx | 119 ++--
src/cgn_tagger_mod.cxx | 98 +++-
src/ckyparser.cxx | 45 +-
src/csidp.cxx | 46 +-
src/iob_tagger_mod.cxx | 233 +++-----
src/mblem_mod.cxx | 98 ++--
src/mblem_prog.cxx | 12 +-
src/mbma_brackets.cxx | 307 +++++-----
src/mbma_mod.cxx | 170 +++---
src/mbma_prog.cxx | 19 +-
src/mbma_rule.cxx | 20 +-
src/mwu_chunker_mod.cxx | 79 +--
src/ner_prog.cxx | 12 +-
src/ner_tagger_mod.cxx | 477 +++++++--------
src/pos_tagger_mod.cxx | 295 ----------
src/tagger_base.cxx | 275 +++++++++
src/ucto_tokenizer_mod.cxx | 40 +-
tests/Makefile.in | 12 +-
59 files changed, 3055 insertions(+), 2441 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index be384cf..d78ed92 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,708 @@
+2017-10-26 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * NEWS: typos
+
+2017-10-26 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* NEWS: updated NEWS, also stuff Maarten forgot in earlier releases!
+
+2017-10-26 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/ner_tagger_mod.h, src/mwu_chunker_mod.cxx,
+ src/ner_tagger_mod.cxx: enforce correct set in entity layers
+
+2017-10-25 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/FrogAPI.cxx, src/ckyparser.cxx, src/mblem_mod.cxx,
+ src/mblem_prog.cxx, src/mbma_mod.cxx, src/mbma_prog.cxx,
+ src/mbma_rule.cxx, src/mwu_chunker_mod.cxx, src/ner_prog.cxx,
+ src/ucto_tokenizer_mod.cxx: more refactoring...
+
+2017-10-25 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/Frog.cxx, src/cgn_tagger_mod.cxx, src/csidp.cxx,
+ src/mblem_mod.cxx, src/mblem_prog.cxx, src/mbma_mod.cxx,
+ src/mbma_prog.cxx: some code refactoring
+
+2017-10-25 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * tests/tst.ok: make check: result has changed!
+
+2017-10-25 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/FrogAPI.h, include/frog/iob_tagger_mod.h,
+ include/frog/ner_tagger_mod.h, src/Frog.cxx, src/FrogAPI.cxx,
+ src/Parser.cxx, src/cgn_tagger_mod.cxx, src/iob_tagger_mod.cxx,
+ src/mblem_mod.cxx, src/mblem_prog.cxx, src/mbma_brackets.cxx,
+ src/mbma_mod.cxx, src/mbma_prog.cxx, src/mbma_rule.cxx,
+ src/mwu_chunker_mod.cxx, src/ner_prog.cxx, src/ner_tagger_mod.cxx,
+	src/tagger_base.cxx, src/ucto_tokenizer_mod.cxx: A lot of
+	refactoring. Using namespaces more consistently
+
+2017-10-24 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * NEWS: Updated NEWS
+
+2017-10-24 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/FrogAPI.cxx, src/iob_tagger_mod.cxx, src/ner_prog.cxx,
+ src/ner_tagger_mod.cxx: adapted to changed filenames
+
+2017-10-24 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/Makefile.am, include/frog/enr_iob_tagger_mod.h,
+ include/frog/enr_ner_tagger_mod.h, include/frog/iob_tagger_mod.h,
+ include/frog/ner_tagger_mod.h, src/Makefile.am,
+ src/enr_iob_tagger_mod.cxx, src/enr_ner_tagger_mod.cxx,
+ src/iob_tagger_mod.cxx, src/ner_tagger_mod.cxx: renaming files
+
+2017-10-23 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .gitignore, README, bootstrap.sh, include/frog/Makefile.am,
+ src/Makefile.am, src/ner_prog.cxx: start cleaning up old unused
+ modules
+
+2017-10-20 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/FrogAPI.h, include/frog/enr_iob_tagger_mod.h,
+	src/FrogAPI.cxx, src/enr_iob_tagger_mod.cxx: oops, lost textclass...
+
+2017-10-19 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * tests/tst.ok: make check result changed after NER update
+
+2017-10-19 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/enr_iob_tagger_mod.cxx, src/enr_ner_tagger_mod.cxx: fixed
+ generate_id problem in NER. On the fly also for IOB :)
+
+2017-10-19 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* src/enr_ner_tagger_mod.cxx: implemented smarter NER gazetteer
+	searching. Saves half of the time :)
+
+2017-10-19 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/enr_ner_tagger_mod.h, src/enr_ner_tagger_mod.cxx:
+ attempt to document some stuff. Also small refactoring which may
+ improve speed a bit.
+
+2017-10-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * tests/tst.ok: updates test result
+
+2017-10-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/enr_ner_tagger_mod.cxx: add textclass attribute to entities
+
+2017-10-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * configure.ac, include/frog/FrogAPI.h,
+ include/frog/enr_ner_tagger_mod.h, src/FrogAPI.cxx,
+ src/Makefile.am, src/enr_ner_tagger_mod.cxx: switched to newer
+ enriched NER. Needs new FrogData too!
+
+2017-10-10 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/FrogAPI.cxx: set the default textredundancy to 'minimal'
+
+2017-10-10 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * configure.ac, include/frog/FrogAPI.h,
+ include/frog/ucto_tokenizer_mod.h, src/Frog.cxx, src/FrogAPI.cxx,
+ src/ucto_tokenizer_mod.cxx: added -T / --textredundancy option which
+ is passed to ucto. Also improved error handling
+
+2017-09-14 Maarten van Gompel <proycon at anaproy.nl>
+
+ * src/Frog.cxx: fixes and cleanup for --override parameter
+
+2017-09-14 Maarten van Gompel <proycon at anaproy.nl>
+
+ * src/Frog.cxx: added --override option (to be tested still)
+
+2017-09-14 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/FrogAPI.h, include/frog/enr_ner_tagger_mod.h,
+ include/frog/tagger_base.h, src/FrogAPI.cxx,
+ src/enr_iob_tagger_mod.cxx, src/enr_ner_tagger_mod.cxx,
+ src/tagger_base.cxx: code cleanup, using BaseTagger for NER and IOB.
+ Added checks for correct FrogData versions
+
+2017-09-13 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/FrogAPI.h, include/frog/enr_iob_tagger_mod.h,
+ include/frog/tagger_base.h, src/FrogAPI.cxx,
+ src/enr_iob_tagger_mod.cxx: added conditional compilation of
+ enriched NER and IOB. (default = NO!!!).
+
+2017-09-13 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/Makefile.am, src/Makefile.am: build (but don't use
+ yet) enriched variants of NER and IOB
+
+2017-09-13 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/enr_iob_tagger_mod.h,
+ include/frog/enr_ner_tagger_mod.h, src/enr_iob_tagger_mod.cxx,
+ src/enr_ner_tagger_mod.cxx: added enriched variants for iob and ner
+ tagger. Not enabled yet...
+
+2017-09-11 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * configure.ac, src/mblem_mod.cxx, src/mbma_mod.cxx: make sure that
+ the textclass of the Word matches the text when using the Word's
+ class
+
+2017-08-31 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* src/cgn_tagger_mod.cxx: small fix, to play it safe MT-wise
+
+2017-08-30 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * configure.ac: simplified config. accepting ICU 50 on the fly
+
+2017-08-28 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* src/Parser.cxx: also remove spaces when parsing 'multi word' FoLiA
+
+2017-08-28 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/Frog.cxx: changed include file name
+
+2017-08-28 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * : commit 5660c58ce9adbd7525d54436228ca04b0a5aa9fc Author: Ko van
+ der Sloot <K.vanderSloot at let.ru.nl> Date: Mon Aug 28 12:44:03 2017
+ +0200
+
+2017-08-28 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/cgn_tagger_mod.h, include/frog/iob_tagger_mod.h,
+ include/frog/ner_tagger_mod.h, include/frog/tagger_base.h,
+ src/cgn_tagger_mod.cxx, src/iob_tagger_mod.cxx,
+ src/ner_tagger_mod.cxx, src/tagger_base.cxx: more refactoring and
+ cleanup
+
+2017-08-28 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/ner_tagger_mod.h, src/ner_tagger_mod.cxx,
+ src/tagger_base.cxx: more cleanup
+
+2017-08-28 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/FrogAPI.h, include/frog/Makefile.am,
+ include/frog/cgn_tagger_mod.h, include/frog/iob_tagger_mod.h,
+ include/frog/ner_tagger_mod.h, include/frog/tagger_base.h,
+ src/FrogAPI.cxx, src/Makefile.am, src/cgn_tagger_mod.cxx,
+ src/iob_tagger_mod.cxx, src/ner_tagger_mod.cxx, src/tagger_base.cxx:
+ class POSTagger renamed to BaseTagger
+
+2017-08-28 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/{pos_tagger_mod.h => tagger_base.h},
+ src/{pos_tagger_mod.cxx => tagger_base.cxx}: rename some files
+
+2017-08-28 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/cgn_tagger_mod.h, include/frog/iob_tagger_mod.h,
+ include/frog/ner_tagger_mod.h, include/frog/pos_tagger_mod.h,
+ src/FrogAPI.cxx, src/cgn_tagger_mod.cxx, src/iob_tagger_mod.cxx,
+ src/mblem_prog.cxx, src/mbma_prog.cxx, src/ner_prog.cxx,
+ src/ner_tagger_mod.cxx, src/pos_tagger_mod.cxx: refactoring,
+ cleaning up most
+
+2017-08-28 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/ner_tagger_mod.h, src/FrogAPI.cxx, src/ner_prog.cxx,
+ src/ner_tagger_mod.cxx: ner tagger now also based on the common
+ tagger base
+
+2017-08-24 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/iob_tagger_mod.h, include/frog/pos_tagger_mod.h,
+ src/FrogAPI.cxx, src/cgn_tagger_mod.cxx, src/iob_tagger_mod.cxx,
+ src/pos_tagger_mod.cxx: started refactoring iob tagger
+
+2017-08-24 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/cgn_tagger_mod.h, include/frog/iob_tagger_mod.h,
+ include/frog/pos_tagger_mod.h, src/FrogAPI.cxx,
+ src/cgn_tagger_mod.cxx, src/iob_tagger_mod.cxx, src/mblem_prog.cxx,
+	src/mbma_prog.cxx, src/pos_tagger_mod.cxx: did some refactoring; all
+	tagger modules should inherit from a base module
+
+2017-08-24 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/iob_tagger_mod.cxx, src/mbma_mod.cxx, src/ner_tagger_mod.cxx,
+ src/pos_tagger_mod.cxx: let mbma and all taggers handle words with
+ spaces as if the spaces aren't there
+
+2017-08-24 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/pos_tagger_mod.cxx: improved error message. use correct
+ textclass
+
+2017-08-22 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/mblem_mod.cxx: run timbl with TABBED settings
+
+2017-08-22 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/mblem_mod.cxx: start working on spaces aware frog
+
+2017-08-21 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* src/mblem_mod.cxx: quick hack to prevent lemmas with an empty
+ class (https://github.com/LanguageMachines/frog/issues/38)
+
+2017-08-18 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml, src/FrogAPI.cxx: reverted last edits. didn't help :{
+
+2017-08-18 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* .travis.yml: enabled running all tests again. Just wait and see
+
+2017-08-18 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* src/FrogAPI.cxx: removed an OMP call. Maybe this fixes problems
+	with clang and OpenMP
+
+2017-08-18 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* .travis.yml: latest last attempt
+
+2017-08-18 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* .travis.yml: okay, a last attempt
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: fix .yml?
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: giving up
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: now?
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: try another libomp
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml, configure.ac: ugly hack
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml, configure.ac: try another OPENMP check
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: dammit
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: closing in
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: grmbl
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: next attempt
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* configure.ac: desperate
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: still a typo left :{
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: '"' ?
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: hmm. how to add " in ""?
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: fixing..
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: more syntax. space matters
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: syntax...
+
+2017-08-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* .travis.yml: try a different approach
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* .travis.yml: last commit today
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: travis is driving me nuts.
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: ???
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* .travis.yml: hate!
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: err
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: fixing the fixes
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: libiomp?
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: fixes in .travis file
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: next attempt
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* .travis.yml: attempt to check with clang 4.0
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/FrogAPI.cxx, src/pos_tagger_mod.cxx: removed debug lines. We
+ know enough :{
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/pos_tagger_mod.cxx: small patch
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/pos_tagger_mod.cxx: last resort
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/Parser.cxx, src/cgn_tagger_mod.cxx, src/iob_tagger_mod.cxx,
+ src/mblem_mod.cxx, src/mbma_brackets.cxx, src/mbma_mod.cxx,
+ src/mwu_chunker_mod.cxx, src/ner_tagger_mod.cxx,
+	src/pos_tagger_mod.cxx: desperately seeking bug
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* configure.ac: we need the leanest, meanest libfolia (because of
+ textclass attribute)
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* src/FrogAPI.cxx: more debug lines...
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/FrogAPI.cxx: more debug lines...
+
+2017-08-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/FrogAPI.cxx: some extra debug lines
+
+2017-08-15 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/FrogAPI.cxx: added more debug lines
+
+2017-08-15 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* src/Frog.cxx: display some more timing...
+
+2017-08-15 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/FrogAPI.cxx: a bit more output when running a server
+
+2017-08-15 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* .travis.yml: ok, last fix was stupid. Try again
+
+2017-08-15 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: simpler
+
+2017-08-15 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* .travis.yml: next attempt to clarify what is going on
+
+2017-08-15 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: stupid typo
+
+2017-08-15 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: improve actions on failure
+
+2017-08-14 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/Parser.cxx: added textclass to Parser dependencies too
+
+2017-08-14 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/mwu_chunker_mod.h, src/iob_tagger_mod.cxx,
+ src/mblem_mod.cxx, src/mwu_chunker_mod.cxx, src/ner_tagger_mod.cxx,
+ src/pos_tagger_mod.cxx: added textclass attribute to MWU, POS,
+	lemma, Chunker and NER annotations (when != current)
+
+2017-07-18 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: attempt to fix OpenMP in clang
+
+2017-07-18 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: added dependency
+
+2017-07-18 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/Frog.cxx: avoid creating empty files on failure
+
+2017-07-06 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/FrogAPI.cxx: adapted Frogtostring() method to honour doXMLout
+ option.
+
+2017-06-22 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/mbma_brackets.cxx, src/mbma_mod.cxx: small fix in compound
+ calculation
+
+2017-06-21 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: added libtar-dev to the dependencies. added a
+ 'group: edge' directive as advised by
+ https://blog.travis-ci.com/2017-06-19-trusty-updates-2017-Q2
+
+2017-06-21 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* src/mbma_brackets.cxx: on output always precede inflections with a
+ '/'. For 'inflection' nodes, don't output the inflection codes as
+ <t> nodes in the FoLiA
+
+2017-06-21 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/ner_tagger_mod.h, src/ner_tagger_mod.cxx: added
+ possibility to add NE's from more than one file
+
+2017-06-19 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* src/mbma_brackets.cxx: sharper condition for VV compounds
+
+2017-06-19 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* src/mbma_brackets.cxx: simplified compound calculation
+
+2017-06-19 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/mbma_brackets.cxx: some code reshuffling/simplification
+ regarding compounds
+
+2017-06-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/ner_tagger_mod.cxx: ignore single-word NE's from wordlist
+
+2017-06-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/ner_tagger_mod.h, src/ner_tagger_mod.cxx:
+	max_ner_size is configurable now. Fixed an off-by-one bug in merge()
+
+2017-06-06 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: added 'expect' to the required packages
+
+2017-06-06 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: closing in on the problem
+
+2017-06-06 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: still clueless
+
+2017-06-06 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: grrrrr
+
+2017-06-06 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: still failing test....
+
+2017-06-06 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: attempt to fix the test
+
+2017-06-06 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* .travis.yml: ask for some more output on failure
+
+2017-06-06 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: improved testing in travis. cross fingers now...
+
+2017-05-22 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/Parser.cxx: fix
+
+2017-05-22 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/Parser.cxx: Small refactoring to avoid Multi-Threading
+ problems.
+
+2017-05-22 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/ckyparser.h, include/frog/csidp.h, src/FrogAPI.cxx,
+ src/Parser.cxx, src/ckyparser.cxx, src/csidp.cxx,
+	src/mbma_brackets.cxx, src/mbma_mod.cxx: improved/standardized
+ logging. Small refactoring to avoid Multi-Threading problems
+
+2017-05-17 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/FrogAPI.cxx: (re)throw an error when the FoLiA is invalid.
+
+2017-05-09 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* .travis.yml: fixing the fixes
+
+2017-05-09 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: updated IRC notice
+
+2017-05-09 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: don't use notices. too verbose
+
+2017-05-09 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: change notice behaviour
+
+2017-05-09 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * .travis.yml: add irc notification (experimental)
+
+2017-05-09 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/FrogAPI.cxx: changed comment
+
+2017-05-09 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * configure.ac: modernize configuration. Makes it work with newest
+ ucto too
+
+2017-05-01 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * docs/frog.1: updated man page
+
+2017-05-01 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/FrogAPI.h, src/Frog.cxx, src/FrogAPI.cxx: added two
+ more options: --retry and --nostdout
+
+2017-05-01 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/pos_tagger_mod.cxx: typo
+
+2017-04-20 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/mbma_brackets.cxx, src/mbma_mod.cxx: small
+ refactoring. Prefer calling folia::settext() over creating a
+ TextContent directly
+
+2017-02-16 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/FrogAPI.cxx, src/cgn_tagger_mod.cxx, src/ner_tagger_mod.cxx,
+ src/pos_tagger_mod.cxx: some refactoring. Added more OpenMP
+ safeguards
+
+2017-02-14 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/Frog.cxx: added a debug
+
+2017-02-13 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/FrogAPI.cxx: removed debug lines
+
+2017-02-13 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/FrogAPI.h, src/Frog.cxx, src/FrogAPI.cxx,
+ src/mbma_mod.cxx: honour textclass/inputclass/outputclass everywhere
+
+2017-02-13 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/FrogAPI.cxx: set correct default values
+
+2017-02-13 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/Frog.cxx, src/FrogAPI.cxx, src/pos_tagger_mod.cxx,
+ src/ucto_tokenizer_mod.cxx: fixed some problems with
+ inputclass/outputclass. Messy code now. Need rethinking!
+
+2017-02-13 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/FrogAPI.h, include/frog/ucto_tokenizer_mod.h,
+ src/Frog.cxx, src/FrogAPI.cxx, src/iob_tagger_mod.cxx,
+ src/mblem_mod.cxx, src/mbma_mod.cxx, src/mwu_chunker_mod.cxx,
+ src/ner_tagger_mod.cxx, src/pos_tagger_mod.cxx,
+	src/ucto_tokenizer_mod.cxx: added --inputclass and --outputclass
+ options
+
+2017-02-08 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/Parser.h, include/frog/iob_tagger_mod.h,
+ include/frog/mblem_mod.h, include/frog/mbma_mod.h,
+ include/frog/mwu_chunker_mod.h, include/frog/ner_tagger_mod.h,
+ include/frog/pos_tagger_mod.h, src/Frog.cxx, src/FrogAPI.cxx,
+ src/Parser.cxx, src/iob_tagger_mod.cxx, src/mblem_mod.cxx,
+ src/mbma_mod.cxx, src/mwu_chunker_mod.cxx, src/ner_tagger_mod.cxx,
+ src/pos_tagger_mod.cxx: when --textclass is specified, use that for
+	ALL text lookup. (not INSIDE morphemes. needed???)
+
+2017-01-26 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/pos_tagger_mod.h, include/frog/ucto_tokenizer_mod.h,
+ src/iob_tagger_mod.cxx: still some refactoring to satisfy scan-build
+
+2017-01-25 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/mblem_mod.h, include/frog/mbma_brackets.h:
+ re-refactor
+
+2017-01-25 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * include/frog/Parser.h, include/frog/cgn_tagger_mod.h,
+ include/frog/iob_tagger_mod.h, include/frog/mblem_mod.h,
+ include/frog/mbma_mod.h, include/frog/mwu_chunker_mod.h,
+ include/frog/ner_tagger_mod.h, include/frog/pos_tagger_mod.h,
+ src/csidp.cxx, src/iob_tagger_mod.cxx, src/mbma_brackets.cxx,
+ src/ner_tagger_mod.cxx: next refactoring step
+
+2017-01-23 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * configure.ac: bumped version after release and removed unneeded
+ lines
+
2017-01-23 Maarten van Gompel <proycon at anaproy.nl>
* README.md: minor readme update
diff --git a/Makefile.in b/Makefile.in
index 8bf0929..d86c053 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -90,8 +90,7 @@ build_triplet = @build@
host_triplet = @host@
subdir = .
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \
- $(top_srcdir)/m4/ax_lib_readline.m4 \
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_lib_readline.m4 \
$(top_srcdir)/m4/ax_pthread.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
@@ -269,13 +268,7 @@ EXEEXT = @EXEEXT@
FGREP = @FGREP@
GREP = @GREP@
ICU_CFLAGS = @ICU_CFLAGS@
-ICU_CONFIG = @ICU_CONFIG@
-ICU_CPPSEARCHPATH = @ICU_CPPSEARCHPATH@
-ICU_CXXFLAGS = @ICU_CXXFLAGS@
-ICU_IOLIBS = @ICU_IOLIBS@
-ICU_LIBPATH = @ICU_LIBPATH@
ICU_LIBS = @ICU_LIBS@
-ICU_VERSION = @ICU_VERSION@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -319,7 +312,6 @@ SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
STRIP = @STRIP@
-SYSCONFDIR = @SYSCONFDIR@
VERSION = @VERSION@
XML2_CFLAGS = @XML2_CFLAGS@
XML2_LIBS = @XML2_LIBS@
@@ -372,10 +364,10 @@ mbt_LIBS = @mbt_LIBS@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
-pkgconfigpath = @pkgconfigpath@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
+runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
@@ -651,7 +643,7 @@ distdir: $(DISTFILES)
! -type d ! -perm -444 -exec $(install_sh) -c -m a+r {} {} \; \
|| chmod -R a+r "$(distdir)"
dist-gzip: distdir
- tardir=$(distdir) && $(am__tar) | eval GZIP= gzip $(GZIP_ENV) -c >$(distdir).tar.gz
+ tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
$(am__post_remove_distdir)
dist-bzip2: distdir
@@ -677,7 +669,7 @@ dist-shar: distdir
@echo WARNING: "Support for shar distribution archives is" \
"deprecated." >&2
@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
- shar $(distdir) | eval GZIP= gzip $(GZIP_ENV) -c >$(distdir).shar.gz
+ shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
$(am__post_remove_distdir)
dist-zip: distdir
@@ -695,7 +687,7 @@ dist dist-all:
distcheck: dist
case '$(DIST_ARCHIVES)' in \
*.tar.gz*) \
- eval GZIP= gzip $(GZIP_ENV) -dc $(distdir).tar.gz | $(am__untar) ;;\
+ GZIP=$(GZIP_ENV) gzip -dc $(distdir).tar.gz | $(am__untar) ;;\
*.tar.bz2*) \
bzip2 -dc $(distdir).tar.bz2 | $(am__untar) ;;\
*.tar.lz*) \
@@ -705,7 +697,7 @@ distcheck: dist
*.tar.Z*) \
uncompress -c $(distdir).tar.Z | $(am__untar) ;;\
*.shar.gz*) \
- eval GZIP= gzip $(GZIP_ENV) -dc $(distdir).shar.gz | unshar ;;\
+ GZIP=$(GZIP_ENV) gzip -dc $(distdir).shar.gz | unshar ;;\
*.zip*) \
unzip $(distdir).zip ;;\
esac
diff --git a/NEWS b/NEWS
index 0d85088..2725b90 100644
--- a/NEWS
+++ b/NEWS
@@ -1,4 +1,24 @@
-0.13.5 - 2017-01-05
+0.13.8 - 2017-10-26
+[Ko vd Sloot]
+* Now with new and enhanced NER and IOB chunker. (needs Frogdata >0.15)
+* added -T / --textredundancy option, which is passed to ucto
+* set textclass attributes on entities (folia 1.5 feature)
+* better textclass handling in general
+* multiple types of entities (setnames) are stored in different layers
+* some small provisions for 'multi word' words added. mblem may use them;
+  other modules just ignore them (treating a multi-word token as multiple words)
+* added --inputclass and --outputclass options (preferred over --textclass)
+* added a --retry option, to redo complete directories, skipping what is done.
+* added a --nostdout option to suppress the tabbed output to stdout
+* refactoring and small fixes
+
+[Maarten van Gompel]
+* new --override option
+
+0.13.7 - 2017-01-23
+* Data files are now in share/ rather than etc/ (requires frogdata >= v0.13)
+
+0.13.6 - 2017-01-05
[Ko van der Sloot]
* rework done on compounding in MBMA. (still work in progress)
* lots of improvement in MBMA rule handling. (but still work in progress)
@@ -11,7 +31,12 @@
- detect multiple languages
 - handle a selected language and ignore the rest
-- some minor code refactoring. )logging etc.)
+0.13.5 - 2016-09-13
+* Added safeguards against faulty data
+* Added manpage for ner tool (issue #8)
+* Added some more compounding rules
+* Read and display frogdata version
+
0.13.4 - 2016-07-11
[Ko van der Sloot]
- added long options --help and --version
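A quick usage sketch of the new 0.13.8 options listed in the NEWS entries above. The option names come from NEWS and the ChangeLog; the input/output flags shown here (-t, -x, -X, --testdir, --outputdir), the exact value forms, and the --override key are assumptions, so treat the frog(1) man page in docs/ as authoritative.

    # hand text-redundancy handling down to ucto (the ChangeLog sets the default to 'minimal')
    $ frog -t input.txt -T full

    # read text from one FoLiA text class and write annotations to another
    $ frog -x input.folia.xml -X output.folia.xml --inputclass=OCR --outputclass=current

    # redo a whole directory, skip files that are already done, keep stdout quiet
    $ frog --testdir=in --outputdir=out --retry --nostdout

    # override a single configuration value from the command line (section/key is hypothetical)
    $ frog -t input.txt --override=tagger.debug=1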
diff --git a/README b/README
index 1acc9f9..5505612 100644
--- a/README
+++ b/README
@@ -1,104 +1 @@
-[](https://travis-ci.org/LanguageMachines/frog) [](http://applejack.science.ru.nl/languagemachines/)
-
-==============================================================================
-Frog - A Tagger-Lemmatizer-Morphological-Analyzer-Dependency-Parser for Dutch
-==============================================================================
-
- Copyright 2006-2017
- Bertjan Busser, Antal van den Bosch, Ko van der Sloot, Maarten van Gompel
-
- Centre for Language and Speech Technology, Radboud University Nijmegen
- Induction of Linguistic Knowledge Research Group, Tilburg University
-
-**Website:** https://languagemachines.github.io/frog
-
-Frog is an integration of memory-based natural language processing (NLP)
-modules developed for Dutch. All NLP modules are based on Timbl, the Tilburg
-memory-based learning software package. Most modules were created in the 1990s
-at the ILK Research Group (Tilburg University, the Netherlands) and the CLiPS
-Research Centre (University of Antwerp, Belgium). Over the years they have been
-integrated into a single text processing tool, which is currently maintained
-and developed by the Language Machines Research Group and the Centre for
-Language and Speech Technology at Radboud University Nijmegen. A dependency
-parser, a base phrase chunker, and a named-entity recognizer module were added
-more recently. Where possible, Frog makes use of multi-processor support to run
-subtasks in parallel.
-
-Various (re)programming rounds have been made possible through funding by NWO,
-the Netherlands Organisation for Scientific Research, particularly under the
-CGN project, the IMIX programme, the Implicit Linguistics project, the
-CLARIN-NL programme and the CLARIAH programme.
-
------------------------------------------------------------------------------
-
-Frog is free software; you can redistribute it and/or modify it under the terms
-of the GNU General Public License as published by the Free Software Foundation;
-either version 3 of the License, or (at your option) any later version (see the file COPYING)
-
-frog is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-Comments and bug-reports are welcome at our issue tracker at
-https://github.com/LanguageMachines/frog/issues or by mailing
-lamasoftware (at) science.ru.nl.
-Updates and more info may be found on
-https://languagemachines.github.io/frog .
-
-
-----------------------------------------------------------------------------
-
-This software has been tested on:
-- Intel platforms running several versions of Linux, including Ubuntu, Debian,
- Arch Linux, Fedora (both 32 and 64 bits)
-- Apple platform running Mac OS X 10.10
-
-Contents of this distribution:
-- Sources
-- Licensing information ( COPYING )
-- Installation instructions ( INSTALL )
-- Build system based on GNU Autotools
-- Example data files ( in the demos directory )
-- Documentation ( in the docs directory )
-
-To install Frog, first consult whether your distribution's package manager has
-an up-to-date package. If not, for easy installation of Frog and its many
-dependencies, it is included as part of our software distribution
-**LaMachine**: https://proycon.github.io/LaMachine .
-
-To be able to succesfully build Frog from source instead, you need the following dependencies:
-- A sane C++ build enviroment with autoconf, automake, autoconf-archive, pkg-config, gcc or clang, libtool
-- libxml2-dev
-- libicu-dev
-- [ticcutils](https://github.com/LanguageMachines/ticcutils)
-- [libfolia](https://github.com/LanguageMachines/libfolia)
-- [uctodata](https://github.com/LanguageMachines/uctodata)
-- [ucto](https://github.com/LanguageMachines/ucto)
-- [timbl](https://github.com/LanguageMachines/timbl)
-- [mbt](https://github.com/LanguageMachines/mbt)
-- [frogdata](https://github.com/LanguageMachines/frogdata)
-
-The data for Frog is packaged seperately and needs to be installed prior to installing frog:
-- [frogdata](https://github.com/LanguageMachines/frogdata)
-
-To compile and install manually from source instead, provided you have all the dependencies installed:
-
- $ bash bootstrap.sh
- $ ./configure
- $ make
- $ make install
-
-and optionally:
- $ make check
-
-
--------------------------------------------------------------------------------
-Credits
--------------------------------------------------------------------------------
-
-Many thanks go out to the people who made the developments of the Frog
-components possible: Walter Daelemans, Jakub Zavrel, Ko van der Sloot, Sabine
-Buchholz, Sander Canisius, Gert Durieux, Peter Berck and Maarten van Gompel.
-
-Thanks to Erik Tjong Kim Sang and Lieve Macken for stress-testing the first
-versions of Tadpole, the predecessor of Frog
+Please see README.md for more information.
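This commit reduces the README to a pointer at README.md, so the build routine the removed text documented is summarised here for convenience. It assumes the dependencies listed in the old README (ticcutils, libfolia, ucto, uctodata, timbl, mbt and frogdata) are already installed; the --prefix hint is an added suggestion rather than part of the original instructions.

    $ bash bootstrap.sh   # only needed for a git checkout; release tarballs already ship ./configure
    $ ./configure         # add e.g. --prefix=/usr/local to choose an install location
    $ make
    $ make check          # optional: runs the tests under tests/
    $ make install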
diff --git a/aclocal.m4 b/aclocal.m4
index 86e8443..af5b8c4 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -1150,7 +1150,6 @@ AC_SUBST([am__tar])
AC_SUBST([am__untar])
]) # _AM_PROG_TAR
-m4_include([m4/ax_icu_check.m4])
m4_include([m4/ax_lib_readline.m4])
m4_include([m4/ax_pthread.m4])
m4_include([m4/libtool.m4])
diff --git a/bootstrap.sh b/bootstrap.sh
index ed99b2f..769f503 100644
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -1,6 +1,3 @@
-# $Id$
-# $URL$
-
# bootstrap - script to bootstrap the distribution rolling engine
# usage:
@@ -21,25 +18,6 @@
automake=automake
aclocal=aclocal
-ln -s README.md README
-
-# if you want to autogenerate a ChangeLog form svn:
-#
-# svn2cl, a python script, as used in the GNU Enterprise project.
-# By jcater (Jason Cater), contributions by reinhard (Reinhard Müller).
-# Get it from
-# http://www.gnuenterprise.org/cgi-bin/viewcvs.cgi/*checkout*/gnue/trunk/gnue-common/utils/svn2cl .
-# svn2cl is used in Makefile.am too.
-#
-# (Another svn2cl implementation, in perl, is at
-# http://www.contactor.se/~dast/svn/archive-2002-04/0910.shtml)
-#
-# see also toplevel Makefile.am
-
-# test -f ChangeLog || {
-# svn log --verbose > ChangeLog
-#}
-
# inspired by hack as used in mcl (from http://micans.org/)
# autoconf-archive Debian package, aclocal-archive RPM, obsolete/badly supported OS, installed in home dir
@@ -60,9 +38,9 @@ ln -s README.md README
cat <<EOT
You need the autoconf-archive Debian package, or the aclocal-archive
RPM package. Alternatively, you could install the GNU Autoconf Macro
-Archive's http://autoconf-archive.cryp.to/ac_path_lib.html
-as `pwd`/acinclude.m4.
+Archive: https://www.gnu.org/software/autoconf-archive/
EOT
+ exit 1
fi
@@ -98,4 +76,3 @@ AUTOMAKE=automake ACLOCAL=aclocal autoreconf --install \
# aclocal-1.9 \
# && automake-1.9 --add-missing --verbose --gnu \
# && autoconf
-
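The bootstrap.sh hunk above turns a missing autoconf-archive from a printed hint into a hard failure (exit 1). On Debian/Ubuntu the package named in the message can be installed before re-running the script; on RPM-based systems the message points to the aclocal-archive package instead.

    $ sudo apt-get install autoconf-archive
    $ bash bootstrap.sh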
diff --git a/config.guess b/config.guess
index 6c32c86..2e9ad7f 100755
--- a/config.guess
+++ b/config.guess
@@ -1,8 +1,8 @@
#! /bin/sh
# Attempt to guess a canonical system name.
-# Copyright 1992-2014 Free Software Foundation, Inc.
+# Copyright 1992-2016 Free Software Foundation, Inc.
-timestamp='2014-11-04'
+timestamp='2016-10-02'
# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
@@ -27,7 +27,7 @@ timestamp='2014-11-04'
# Originally written by Per Bothner; maintained since 2000 by Ben Elliston.
#
# You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
#
# Please send patches to <config-patches at gnu.org>.
@@ -50,7 +50,7 @@ version="\
GNU config.guess ($timestamp)
Originally written by Per Bothner.
-Copyright 1992-2014 Free Software Foundation, Inc.
+Copyright 1992-2016 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -168,19 +168,29 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
# Note: NetBSD doesn't particularly care about the vendor
# portion of the name. We always set it to "unknown".
sysctl="sysctl -n hw.machine_arch"
- UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
- /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
+ UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \
+ /sbin/$sysctl 2>/dev/null || \
+ /usr/sbin/$sysctl 2>/dev/null || \
+ echo unknown)`
case "${UNAME_MACHINE_ARCH}" in
armeb) machine=armeb-unknown ;;
arm*) machine=arm-unknown ;;
sh3el) machine=shl-unknown ;;
sh3eb) machine=sh-unknown ;;
sh5el) machine=sh5le-unknown ;;
+ earmv*)
+ arch=`echo ${UNAME_MACHINE_ARCH} | sed -e 's,^e\(armv[0-9]\).*$,\1,'`
+ endian=`echo ${UNAME_MACHINE_ARCH} | sed -ne 's,^.*\(eb\)$,\1,p'`
+ machine=${arch}${endian}-unknown
+ ;;
*) machine=${UNAME_MACHINE_ARCH}-unknown ;;
esac
# The Operating System including object format, if it has switched
- # to ELF recently, or will in the future.
+ # to ELF recently (or will in the future) and ABI.
case "${UNAME_MACHINE_ARCH}" in
+ earm*)
+ os=netbsdelf
+ ;;
arm*|i386|m68k|ns32k|sh3*|sparc|vax)
eval $set_cc_for_build
if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
@@ -197,6 +207,13 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
os=netbsd
;;
esac
+ # Determine ABI tags.
+ case "${UNAME_MACHINE_ARCH}" in
+ earm*)
+ expr='s/^earmv[0-9]/-eabi/;s/eb$//'
+ abi=`echo ${UNAME_MACHINE_ARCH} | sed -e "$expr"`
+ ;;
+ esac
# The OS release
# Debian GNU/NetBSD machines have a different userland, and
# thus, need a distinct triplet. However, they do not need
@@ -207,13 +224,13 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
release='-gnu'
;;
*)
- release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+ release=`echo ${UNAME_RELEASE} | sed -e 's/[-_].*//' | cut -d. -f1,2`
;;
esac
# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
# contains redundant information, the shorter form:
# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
- echo "${machine}-${os}${release}"
+ echo "${machine}-${os}${release}${abi}"
exit ;;
*:Bitrig:*:*)
UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
@@ -223,6 +240,10 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
exit ;;
+ *:LibertyBSD:*:*)
+ UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'`
+ echo ${UNAME_MACHINE_ARCH}-unknown-libertybsd${UNAME_RELEASE}
+ exit ;;
*:ekkoBSD:*:*)
echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
exit ;;
@@ -235,6 +256,9 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
*:MirBSD:*:*)
echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
exit ;;
+ *:Sortix:*:*)
+ echo ${UNAME_MACHINE}-unknown-sortix
+ exit ;;
alpha:OSF1:*:*)
case $UNAME_RELEASE in
*4.0)
@@ -251,42 +275,42 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1`
case "$ALPHA_CPU_TYPE" in
"EV4 (21064)")
- UNAME_MACHINE="alpha" ;;
+ UNAME_MACHINE=alpha ;;
"EV4.5 (21064)")
- UNAME_MACHINE="alpha" ;;
+ UNAME_MACHINE=alpha ;;
"LCA4 (21066/21068)")
- UNAME_MACHINE="alpha" ;;
+ UNAME_MACHINE=alpha ;;
"EV5 (21164)")
- UNAME_MACHINE="alphaev5" ;;
+ UNAME_MACHINE=alphaev5 ;;
"EV5.6 (21164A)")
- UNAME_MACHINE="alphaev56" ;;
+ UNAME_MACHINE=alphaev56 ;;
"EV5.6 (21164PC)")
- UNAME_MACHINE="alphapca56" ;;
+ UNAME_MACHINE=alphapca56 ;;
"EV5.7 (21164PC)")
- UNAME_MACHINE="alphapca57" ;;
+ UNAME_MACHINE=alphapca57 ;;
"EV6 (21264)")
- UNAME_MACHINE="alphaev6" ;;
+ UNAME_MACHINE=alphaev6 ;;
"EV6.7 (21264A)")
- UNAME_MACHINE="alphaev67" ;;
+ UNAME_MACHINE=alphaev67 ;;
"EV6.8CB (21264C)")
- UNAME_MACHINE="alphaev68" ;;
+ UNAME_MACHINE=alphaev68 ;;
"EV6.8AL (21264B)")
- UNAME_MACHINE="alphaev68" ;;
+ UNAME_MACHINE=alphaev68 ;;
"EV6.8CX (21264D)")
- UNAME_MACHINE="alphaev68" ;;
+ UNAME_MACHINE=alphaev68 ;;
"EV6.9A (21264/EV69A)")
- UNAME_MACHINE="alphaev69" ;;
+ UNAME_MACHINE=alphaev69 ;;
"EV7 (21364)")
- UNAME_MACHINE="alphaev7" ;;
+ UNAME_MACHINE=alphaev7 ;;
"EV7.9 (21364A)")
- UNAME_MACHINE="alphaev79" ;;
+ UNAME_MACHINE=alphaev79 ;;
esac
# A Pn.n version is a patched version.
# A Vn.n version is a released version.
# A Tn.n version is a released field test version.
# A Xn.n version is an unreleased experimental baselevel.
# 1.2 uses "1.2" for uname -r.
- echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+ echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
# Reset EXIT trap before exiting to avoid spurious non-zero exit code.
exitcode=$?
trap '' 0
@@ -359,16 +383,16 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
exit ;;
i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
eval $set_cc_for_build
- SUN_ARCH="i386"
+ SUN_ARCH=i386
# If there is a compiler, see if it is configured for 64-bit objects.
# Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
# This test works for both compilers.
- if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+ if [ "$CC_FOR_BUILD" != no_compiler_found ]; then
if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
- (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+ (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
grep IS_64BIT_ARCH >/dev/null
then
- SUN_ARCH="x86_64"
+ SUN_ARCH=x86_64
fi
fi
echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
@@ -393,7 +417,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
exit ;;
sun*:*:4.2BSD:*)
UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
- test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
+ test "x${UNAME_RELEASE}" = x && UNAME_RELEASE=3
case "`/bin/arch`" in
sun3)
echo m68k-sun-sunos${UNAME_RELEASE}
@@ -618,13 +642,13 @@ EOF
sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
case "${sc_cpu_version}" in
- 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
- 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
+ 523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0
+ 528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1
532) # CPU_PA_RISC2_0
case "${sc_kernel_bits}" in
- 32) HP_ARCH="hppa2.0n" ;;
- 64) HP_ARCH="hppa2.0w" ;;
- '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20
+ 32) HP_ARCH=hppa2.0n ;;
+ 64) HP_ARCH=hppa2.0w ;;
+ '') HP_ARCH=hppa2.0 ;; # HP-UX 10.20
esac ;;
esac
fi
@@ -663,11 +687,11 @@ EOF
exit (0);
}
EOF
- (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
+ (CCOPTS="" $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
test -z "$HP_ARCH" && HP_ARCH=hppa
fi ;;
esac
- if [ ${HP_ARCH} = "hppa2.0w" ]
+ if [ ${HP_ARCH} = hppa2.0w ]
then
eval $set_cc_for_build
@@ -680,12 +704,12 @@ EOF
# $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
# => hppa64-hp-hpux11.23
- if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
+ if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) |
grep -q __LP64__
then
- HP_ARCH="hppa2.0w"
+ HP_ARCH=hppa2.0w
else
- HP_ARCH="hppa64"
+ HP_ARCH=hppa64
fi
fi
echo ${HP_ARCH}-hp-hpux${HPUX_REV}
@@ -790,14 +814,14 @@ EOF
echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
exit ;;
F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
- FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
- FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+ FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
+ FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
exit ;;
5000:UNIX_System_V:4.*:*)
- FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
- FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
+ FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
+ FUJITSU_REL=`echo ${UNAME_RELEASE} | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'`
echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
exit ;;
i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
@@ -879,7 +903,7 @@ EOF
exit ;;
*:GNU/*:*:*)
# other systems with GNU libc and userland
- echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC}
+ echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC}
exit ;;
i*86:Minix:*:*)
echo ${UNAME_MACHINE}-pc-minix
@@ -902,7 +926,7 @@ EOF
EV68*) UNAME_MACHINE=alphaev68 ;;
esac
objdump --private-headers /bin/sh | grep -q ld.so.1
- if test "$?" = 0 ; then LIBC="gnulibc1" ; fi
+ if test "$?" = 0 ; then LIBC=gnulibc1 ; fi
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
exit ;;
arc:Linux:*:* | arceb:Linux:*:*)
@@ -933,6 +957,9 @@ EOF
crisv32:Linux:*:*)
echo ${UNAME_MACHINE}-axis-linux-${LIBC}
exit ;;
+ e2k:Linux:*:*)
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ exit ;;
frv:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
exit ;;
@@ -945,6 +972,9 @@ EOF
ia64:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
exit ;;
+ k1om:Linux:*:*)
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ exit ;;
m32r*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
exit ;;
@@ -970,6 +1000,9 @@ EOF
eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
;;
+ mips64el:Linux:*:*)
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ exit ;;
openrisc*:Linux:*:*)
echo or1k-unknown-linux-${LIBC}
exit ;;
@@ -1002,6 +1035,9 @@ EOF
ppcle:Linux:*:*)
echo powerpcle-unknown-linux-${LIBC}
exit ;;
+ riscv32:Linux:*:* | riscv64:Linux:*:*)
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ exit ;;
s390:Linux:*:* | s390x:Linux:*:*)
echo ${UNAME_MACHINE}-ibm-linux-${LIBC}
exit ;;
@@ -1021,7 +1057,7 @@ EOF
echo ${UNAME_MACHINE}-dec-linux-${LIBC}
exit ;;
x86_64:Linux:*:*)
- echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+ echo ${UNAME_MACHINE}-pc-linux-${LIBC}
exit ;;
xtensa*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
@@ -1100,7 +1136,7 @@ EOF
# uname -m prints for DJGPP always 'pc', but it prints nothing about
# the processor, so we play safe by assuming i586.
# Note: whatever this is, it MUST be the same as what config.sub
- # prints for the "djgpp" host, or else GDB configury will decide that
+ # prints for the "djgpp" host, or else GDB configure will decide that
# this is a cross-build.
echo i586-pc-msdosdjgpp
exit ;;
@@ -1249,6 +1285,9 @@ EOF
SX-8R:SUPER-UX:*:*)
echo sx8r-nec-superux${UNAME_RELEASE}
exit ;;
+ SX-ACE:SUPER-UX:*:*)
+ echo sxace-nec-superux${UNAME_RELEASE}
+ exit ;;
Power*:Rhapsody:*:*)
echo powerpc-apple-rhapsody${UNAME_RELEASE}
exit ;;
@@ -1262,9 +1301,9 @@ EOF
UNAME_PROCESSOR=powerpc
fi
if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then
- if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+ if [ "$CC_FOR_BUILD" != no_compiler_found ]; then
if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
- (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+ (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
grep IS_64BIT_ARCH >/dev/null
then
case $UNAME_PROCESSOR in
@@ -1286,7 +1325,7 @@ EOF
exit ;;
*:procnto*:*:* | *:QNX:[0123456789]*:*)
UNAME_PROCESSOR=`uname -p`
- if test "$UNAME_PROCESSOR" = "x86"; then
+ if test "$UNAME_PROCESSOR" = x86; then
UNAME_PROCESSOR=i386
UNAME_MACHINE=pc
fi
@@ -1317,7 +1356,7 @@ EOF
# "uname -m" is not consistent, so use $cputype instead. 386
# is converted to i386 for consistency with other x86
# operating systems.
- if test "$cputype" = "386"; then
+ if test "$cputype" = 386; then
UNAME_MACHINE=i386
else
UNAME_MACHINE="$cputype"
@@ -1359,7 +1398,7 @@ EOF
echo i386-pc-xenix
exit ;;
i*86:skyos:*:*)
- echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
+ echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE} | sed -e 's/ .*$//'`
exit ;;
i*86:rdos:*:*)
echo ${UNAME_MACHINE}-pc-rdos
@@ -1370,23 +1409,25 @@ EOF
x86_64:VMkernel:*:*)
echo ${UNAME_MACHINE}-unknown-esx
exit ;;
+ amd64:Isilon\ OneFS:*:*)
+ echo x86_64-unknown-onefs
+ exit ;;
esac
cat >&2 <<EOF
$0: unable to guess system type
-This script, last modified $timestamp, has failed to recognize
-the operating system you are using. It is advised that you
-download the most up to date version of the config scripts from
+This script (version $timestamp), has failed to recognize the
+operating system you are using. If your script is old, overwrite
+config.guess and config.sub with the latest versions from:
- http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
+ http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
and
- http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
+ http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
-If the version you run ($0) is already up to date, please
-send the following data and any information you think might be
-pertinent to <config-patches at gnu.org> in order to provide the needed
-information to handle your system.
+If $0 has already been updated, send the following data and any
+information you think might be pertinent to config-patches at gnu.org to
+provide the necessary information to handle your system.
config.guess timestamp = $timestamp
diff --git a/config.h.in b/config.h.in
index eddcd69..cf9718c 100644
--- a/config.h.in
+++ b/config.h.in
@@ -13,9 +13,6 @@
/* Define to 1 if you have the <history.h> header file. */
#undef HAVE_HISTORY_H
-/* we want to use ICU */
-#undef HAVE_ICU
-
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
diff --git a/config.sub b/config.sub
index 7ffe373..dd2ca93 100755
--- a/config.sub
+++ b/config.sub
@@ -1,8 +1,8 @@
#! /bin/sh
# Configuration validation subroutine script.
-# Copyright 1992-2014 Free Software Foundation, Inc.
+# Copyright 1992-2016 Free Software Foundation, Inc.
-timestamp='2014-12-03'
+timestamp='2016-11-04'
# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
@@ -33,7 +33,7 @@ timestamp='2014-12-03'
# Otherwise, we print the canonical config type on stdout and succeed.
# You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
# This file is supposed to be the same for all GNU packages
# and recognize all the CPU types, system types and aliases
@@ -53,8 +53,7 @@ timestamp='2014-12-03'
me=`echo "$0" | sed -e 's,.*/,,'`
usage="\
-Usage: $0 [OPTION] CPU-MFR-OPSYS
- $0 [OPTION] ALIAS
+Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS
Canonicalize a configuration name.
@@ -68,7 +67,7 @@ Report bugs and patches to <config-patches at gnu.org>."
version="\
GNU config.sub ($timestamp)
-Copyright 1992-2014 Free Software Foundation, Inc.
+Copyright 1992-2016 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -117,8 +116,8 @@ maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
case $maybe_os in
nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \
linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \
- knetbsd*-gnu* | netbsd*-gnu* | \
- kopensolaris*-gnu* | \
+ knetbsd*-gnu* | netbsd*-gnu* | netbsd*-eabi* | \
+ kopensolaris*-gnu* | cloudabi*-eabi* | \
storm-chaos* | os2-emx* | rtmk-nova*)
os=-$maybe_os
basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
@@ -255,12 +254,13 @@ case $basic_machine in
| arc | arceb \
| arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \
| avr | avr32 \
+ | ba \
| be32 | be64 \
| bfin \
| c4x | c8051 | clipper \
| d10v | d30v | dlx | dsp16xx \
- | epiphany \
- | fido | fr30 | frv \
+ | e2k | epiphany \
+ | fido | fr30 | frv | ft32 \
| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
| hexagon \
| i370 | i860 | i960 | ia64 \
@@ -301,11 +301,12 @@ case $basic_machine in
| open8 | or1k | or1knd | or32 \
| pdp10 | pdp11 | pj | pjl \
| powerpc | powerpc64 | powerpc64le | powerpcle \
+ | pru \
| pyramid \
| riscv32 | riscv64 \
| rl78 | rx \
| score \
- | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
+ | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[234]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
| sh64 | sh64le \
| sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
| sparcv8 | sparcv9 | sparcv9b | sparcv9v \
@@ -376,12 +377,13 @@ case $basic_machine in
| alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \
| arm-* | armbe-* | armle-* | armeb-* | armv*-* \
| avr-* | avr32-* \
+ | ba-* \
| be32-* | be64-* \
| bfin-* | bs2000-* \
| c[123]* | c30-* | [cjt]90-* | c4x-* \
| c8051-* | clipper-* | craynv-* | cydra-* \
| d10v-* | d30v-* | dlx-* \
- | elxsi-* \
+ | e2k-* | elxsi-* \
| f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
| h8300-* | h8500-* \
| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
@@ -427,13 +429,15 @@ case $basic_machine in
| orion-* \
| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
+ | pru-* \
| pyramid-* \
+ | riscv32-* | riscv64-* \
| rl78-* | romp-* | rs6000-* | rx-* \
| sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
| sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
| sparclite-* \
- | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \
+ | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx*-* \
| tahoe-* \
| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
| tile*-* \
@@ -518,6 +522,9 @@ case $basic_machine in
basic_machine=i386-pc
os=-aros
;;
+ asmjs)
+ basic_machine=asmjs-unknown
+ ;;
aux)
basic_machine=m68k-apple
os=-aux
@@ -638,6 +645,14 @@ case $basic_machine in
basic_machine=m68k-bull
os=-sysv3
;;
+ e500v[12])
+ basic_machine=powerpc-unknown
+ os=$os"spe"
+ ;;
+ e500v[12]-*)
+ basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
+ os=$os"spe"
+ ;;
ebmon29k)
basic_machine=a29k-amd
os=-ebmon
@@ -1017,7 +1032,7 @@ case $basic_machine in
ppc-* | ppcbe-*)
basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
;;
- ppcle | powerpclittle | ppc-le | powerpc-little)
+ ppcle | powerpclittle)
basic_machine=powerpcle-unknown
;;
ppcle-* | powerpclittle-*)
@@ -1027,7 +1042,7 @@ case $basic_machine in
;;
ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'`
;;
- ppc64le | powerpc64little | ppc64-le | powerpc64-little)
+ ppc64le | powerpc64little)
basic_machine=powerpc64le-unknown
;;
ppc64le-* | powerpc64little-*)
@@ -1373,18 +1388,18 @@ case $os in
| -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \
| -sym* | -kopensolaris* | -plan9* \
| -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
- | -aos* | -aros* \
+ | -aos* | -aros* | -cloudabi* | -sortix* \
| -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
| -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
| -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
- | -bitrig* | -openbsd* | -solidbsd* \
+ | -bitrig* | -openbsd* | -solidbsd* | -libertybsd* \
| -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \
| -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
| -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
| -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
| -chorusos* | -chorusrdb* | -cegcc* \
| -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
- | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \
+ | -midipix* | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \
| -linux-newlib* | -linux-musl* | -linux-uclibc* \
| -uxpv* | -beos* | -mpeix* | -udk* | -moxiebox* \
| -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
@@ -1393,7 +1408,8 @@ case $os in
| -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
| -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
| -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
- | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* | -tirtos*)
+ | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* \
+ | -onefs* | -tirtos* | -phoenix* | -fuchsia*)
# Remember, each alternative MUST END IN *, to match a version number.
;;
-qnx*)
@@ -1525,6 +1541,8 @@ case $os in
;;
-nacl*)
;;
+ -ios)
+ ;;
-none)
;;
*)
diff --git a/configure b/configure
index ec58e35..efd4e00 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for frog 0.13.7.
+# Generated by GNU Autoconf 2.69 for frog 0.13.8.
#
# Report bugs to <lamasoftware at science.ru.nl>.
#
@@ -590,8 +590,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='frog'
PACKAGE_TARNAME='frog'
-PACKAGE_VERSION='0.13.7'
-PACKAGE_STRING='frog 0.13.7'
+PACKAGE_VERSION='0.13.8'
+PACKAGE_STRING='frog 0.13.8'
PACKAGE_BUGREPORT='lamasoftware at science.ru.nl'
PACKAGE_URL=''
@@ -648,17 +648,10 @@ timbl_LIBS
timbl_CFLAGS
ticcutils_LIBS
ticcutils_CFLAGS
-ICU_IOLIBS
ICU_LIBS
-ICU_LIBPATH
-ICU_VERSION
-ICU_CPPSEARCHPATH
-ICU_CXXFLAGS
ICU_CFLAGS
-ICU_CONFIG
XML2_LIBS
XML2_CFLAGS
-pkgconfigpath
PKG_CONFIG_LIBDIR
PKG_CONFIG_PATH
PKG_CONFIG
@@ -667,7 +660,6 @@ PTHREAD_CFLAGS
PTHREAD_LIBS
PTHREAD_CC
ax_pthread_config
-SYSCONFDIR
CXXCPP
CPP
LT_SYS_LIBRARY_PATH
@@ -769,6 +761,7 @@ infodir
docdir
oldincludedir
includedir
+runstatedir
localstatedir
sharedstatedir
sysconfdir
@@ -802,12 +795,6 @@ with_gnu_ld
with_sysroot
enable_libtool_lock
enable_openmp
-with_icu
-with_ticcutils
-with_timbl
-with_mbt
-with_folia
-with_ucto
'
ac_precious_vars='build_alias
host_alias
@@ -828,6 +815,8 @@ PKG_CONFIG_PATH
PKG_CONFIG_LIBDIR
XML2_CFLAGS
XML2_LIBS
+ICU_CFLAGS
+ICU_LIBS
ticcutils_CFLAGS
ticcutils_LIBS
timbl_CFLAGS
@@ -878,6 +867,7 @@ datadir='${datarootdir}'
sysconfdir='${prefix}/etc'
sharedstatedir='${prefix}/com'
localstatedir='${prefix}/var'
+runstatedir='${localstatedir}/run'
includedir='${prefix}/include'
oldincludedir='/usr/include'
docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
@@ -1130,6 +1120,15 @@ do
| -silent | --silent | --silen | --sile | --sil)
silent=yes ;;
+ -runstatedir | --runstatedir | --runstatedi | --runstated \
+ | --runstate | --runstat | --runsta | --runst | --runs \
+ | --run | --ru | --r)
+ ac_prev=runstatedir ;;
+ -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
+ | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
+ | --run=* | --ru=* | --r=*)
+ runstatedir=$ac_optarg ;;
+
-sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
ac_prev=sbindir ;;
-sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
@@ -1267,7 +1266,7 @@ fi
for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \
datadir sysconfdir sharedstatedir localstatedir includedir \
oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
- libdir localedir mandir
+ libdir localedir mandir runstatedir
do
eval ac_val=\$$ac_var
# Remove trailing slashes.
@@ -1380,7 +1379,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures frog 0.13.7 to adapt to many kinds of systems.
+\`configure' configures frog 0.13.8 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1420,6 +1419,7 @@ Fine tuning of the installation directories:
--sysconfdir=DIR read-only single-machine data [PREFIX/etc]
--sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com]
--localstatedir=DIR modifiable single-machine data [PREFIX/var]
+ --runstatedir=DIR modifiable per-process data [LOCALSTATEDIR/run]
--libdir=DIR object code libraries [EPREFIX/lib]
--includedir=DIR C header files [PREFIX/include]
--oldincludedir=DIR C header files for non-gcc [/usr/include]
@@ -1450,7 +1450,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of frog 0.13.7:";;
+ short | recursive ) echo "Configuration of frog 0.13.8:";;
esac
cat <<\_ACEOF
@@ -1482,12 +1482,6 @@ Optional Packages:
--with-gnu-ld assume the C compiler uses GNU ld [default=no]
--with-sysroot[=DIR] Search for dependent libraries within DIR (or the
compiler's sysroot if not specified).
- --with-icu=DIR use ICU installed in <DIR>
- --with-ticcutils=DIR use ticcutils installed in <DIR>
- --with-timbl=DIR use timbl installed in <DIR>
- --with-mbt=DIR use mbt installed in <DIR>
- --with-folia=DIR use libfolia installed in <DIR>
- --with-ucto=DIR use ucto installed in <DIR>
Some influential environment variables:
CXX C++ compiler command
@@ -1510,6 +1504,8 @@ Some influential environment variables:
path overriding pkg-config's built-in search path
XML2_CFLAGS C compiler flags for XML2, overriding pkg-config
XML2_LIBS linker flags for XML2, overriding pkg-config
+ ICU_CFLAGS C compiler flags for ICU, overriding pkg-config
+ ICU_LIBS linker flags for ICU, overriding pkg-config
ticcutils_CFLAGS
C compiler flags for ticcutils, overriding pkg-config
ticcutils_LIBS
@@ -1595,7 +1591,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-frog configure 0.13.7
+frog configure 0.13.8
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2248,7 +2244,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by frog $as_me 0.13.7, which was
+It was created by frog $as_me 0.13.8, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -3111,7 +3107,7 @@ fi
# Define the identity of the package.
PACKAGE='frog'
- VERSION='0.13.7'
+ VERSION='0.13.8'
cat >>confdefs.h <<_ACEOF
@@ -3211,9 +3207,9 @@ ac_config_headers="$ac_config_headers config.h"
if test x"${CXXFLAGS+set}" = xset; then
# the user set CXXFLAGS; don't override it.
- cxx_flags_were_set=true
-else
cxx_flags_were_set=false
+else
+ cxx_flags_were_set=true
fi
# Checks for programs.
@@ -5956,7 +5952,7 @@ linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
lt_cv_deplibs_check_method=pass_all
;;
-netbsd*)
+netbsd* | netbsdelf*-gnu)
if echo __ELF__ | $CC -E - | $GREP __ELF__ > /dev/null; then
lt_cv_deplibs_check_method='match_pattern /lib[^/]+(\.so\.[0-9]+\.[0-9]+|_pic\.a)$'
else
@@ -9660,6 +9656,9 @@ $as_echo_n "checking whether the $compiler linker ($LD) supports shared librarie
openbsd* | bitrig*)
with_gnu_ld=no
;;
+ linux* | k*bsd*-gnu | gnu*)
+ link_all_deplibs=no
+ ;;
esac
ld_shlibs=yes
@@ -9914,7 +9913,7 @@ _LT_EOF
fi
;;
- netbsd*)
+ netbsd* | netbsdelf*-gnu)
if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
archive_cmds='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib'
wlarc=
@@ -10584,6 +10583,7 @@ $as_echo "$lt_cv_irix_exported_symbol" >&6; }
if test yes = "$lt_cv_irix_exported_symbol"; then
archive_expsym_cmds='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations $wl-exports_file $wl$export_symbols -o $lib'
fi
+ link_all_deplibs=no
else
archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib'
archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -exports_file $export_symbols -o $lib'
@@ -10605,7 +10605,7 @@ $as_echo "$lt_cv_irix_exported_symbol" >&6; }
esac
;;
- netbsd*)
+ netbsd* | netbsdelf*-gnu)
if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags' # a.out
else
@@ -11720,6 +11720,18 @@ fi
dynamic_linker='GNU/Linux ld.so'
;;
+netbsdelf*-gnu)
+ version_type=linux
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
+ soname_spec='${libname}${release}${shared_ext}$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ shlibpath_overrides_runpath=no
+ hardcode_into_libs=yes
+ dynamic_linker='NetBSD ld.elf_so'
+ ;;
+
netbsd*)
version_type=sunos
need_lib_prefix=no
@@ -14614,7 +14626,7 @@ lt_prog_compiler_static_CXX=
;;
esac
;;
- netbsd*)
+ netbsd* | netbsdelf*-gnu)
;;
*qnx* | *nto*)
# QNX uses GNU C++, but need to define -shared option too, otherwise
@@ -14989,6 +15001,9 @@ $as_echo_n "checking whether the $compiler linker ($LD) supports shared librarie
;;
esac
;;
+ linux* | k*bsd*-gnu | gnu*)
+ link_all_deplibs_CXX=no
+ ;;
*)
export_symbols_cmds_CXX='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
;;
@@ -15682,6 +15697,18 @@ fi
dynamic_linker='GNU/Linux ld.so'
;;
+netbsdelf*-gnu)
+ version_type=linux
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
+ soname_spec='${libname}${release}${shared_ext}$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ shlibpath_overrides_runpath=no
+ hardcode_into_libs=yes
+ dynamic_linker='NetBSD ld.elf_so'
+ ;;
+
netbsd*)
version_type=sunos
need_lib_prefix=no
@@ -16033,9 +16060,6 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
-SYSCONFDIR=$sysconfdir
-
-
# when running tests, use CXX
ac_ext=cpp
ac_cpp='$CXXCPP $CPPFLAGS'
@@ -17310,8 +17334,15 @@ if test "x$ac_cv_prog_cxx_openmp" != "xunsupported"; then
$as_echo "#define HAVE_OPENMP 1 " >>confdefs.h
else
- { $as_echo "$as_me:${as_lineno-$LINENO}: We don't have OpenMP. Multithreaded operation is disabled" >&5
+ if test "$CXX" = "clang++-4.0"; then
+ CXXFLAGS="$CXXFLAGS -fopenmp"
+
+$as_echo "#define HAVE_OPENMP 1 " >>confdefs.h
+
+ else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: We don't have OpenMP. Multithreaded operation is disabled" >&5
$as_echo "$as_me: We don't have OpenMP. Multithreaded operation is disabled" >&6;}
+ fi
fi
if test $prefix = "NONE"; then
@@ -17438,55 +17469,13 @@ $as_echo "no" >&6; }
PKG_CONFIG=""
fi
fi
-# Extract the first word of "pkg-config", so it can be a program name with args.
-set dummy pkg-config; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_path_pkgconfigpath+:} false; then :
- $as_echo_n "(cached) " >&6
+if test "x$PKG_CONFIG_PATH" = x; then
+ export PKG_CONFIG_PATH="$prefix/lib/pkgconfig"
else
- case $pkgconfigpath in
- [\\/]* | ?:[\\/]*)
- ac_cv_path_pkgconfigpath="$pkgconfigpath" # Let the user override the test with a path.
- ;;
- *)
- as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
- IFS=$as_save_IFS
- test -z "$as_dir" && as_dir=.
- for ac_exec_ext in '' $ac_executable_extensions; do
- if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
- ac_cv_path_pkgconfigpath="$as_dir/$ac_word$ac_exec_ext"
- $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
- break 2
- fi
-done
- done
-IFS=$as_save_IFS
-
- test -z "$ac_cv_path_pkgconfigpath" && ac_cv_path_pkgconfigpath="NONE"
- ;;
-esac
-fi
-pkgconfigpath=$ac_cv_path_pkgconfigpath
-if test -n "$pkgconfigpath"; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pkgconfigpath" >&5
-$as_echo "$pkgconfigpath" >&6; }
-else
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+ export PKG_CONFIG_PATH="$prefix/lib/pkgconfig:$PKG_CONFIG_PATH"
fi
-if test "$pkgconfigpath" != "NONE"; then
-# ugly hack when PKG_CONFIG_PATH isn't defined.
-# couldn't get it to work otherwise
- if test "x$PKG_CONFIG_PATH" = x; then
- export PKG_CONFIG_PATH=""
- fi
-fi
-
pkg_failed=no
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for XML2" >&5
$as_echo_n "checking for XML2... " >&6; }
@@ -17581,6 +17570,100 @@ CXXFLAGS="$CXXFLAGS $XML2_CFLAGS"
LIBS="$XML2_LIBS $LIBS"
+pkg_failed=no
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ICU" >&5
+$as_echo_n "checking for ICU... " >&6; }
+
+if test -n "$ICU_CFLAGS"; then
+ pkg_cv_ICU_CFLAGS="$ICU_CFLAGS"
+ elif test -n "$PKG_CONFIG"; then
+ if test -n "$PKG_CONFIG" && \
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"icu-uc >= 50 icu-io \""; } >&5
+ ($PKG_CONFIG --exists --print-errors "icu-uc >= 50 icu-io ") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; then
+ pkg_cv_ICU_CFLAGS=`$PKG_CONFIG --cflags "icu-uc >= 50 icu-io " 2>/dev/null`
+ test "x$?" != "x0" && pkg_failed=yes
+else
+ pkg_failed=yes
+fi
+ else
+ pkg_failed=untried
+fi
+if test -n "$ICU_LIBS"; then
+ pkg_cv_ICU_LIBS="$ICU_LIBS"
+ elif test -n "$PKG_CONFIG"; then
+ if test -n "$PKG_CONFIG" && \
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"icu-uc >= 50 icu-io \""; } >&5
+ ($PKG_CONFIG --exists --print-errors "icu-uc >= 50 icu-io ") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; then
+ pkg_cv_ICU_LIBS=`$PKG_CONFIG --libs "icu-uc >= 50 icu-io " 2>/dev/null`
+ test "x$?" != "x0" && pkg_failed=yes
+else
+ pkg_failed=yes
+fi
+ else
+ pkg_failed=untried
+fi
+
+
+
+if test $pkg_failed = yes; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+ _pkg_short_errors_supported=yes
+else
+ _pkg_short_errors_supported=no
+fi
+ if test $_pkg_short_errors_supported = yes; then
+ ICU_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "icu-uc >= 50 icu-io " 2>&1`
+ else
+ ICU_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "icu-uc >= 50 icu-io " 2>&1`
+ fi
+ # Put the nasty error message in config.log where it belongs
+ echo "$ICU_PKG_ERRORS" >&5
+
+ as_fn_error $? "Package requirements (icu-uc >= 50 icu-io ) were not met:
+
+$ICU_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+Alternatively, you may set the environment variables ICU_CFLAGS
+and ICU_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details." "$LINENO" 5
+elif test $pkg_failed = untried; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "The pkg-config script could not be found or is too old. Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+Alternatively, you may set the environment variables ICU_CFLAGS
+and ICU_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.
+
+To get pkg-config, see <http://pkg-config.freedesktop.org/>.
+See \`config.log' for more details" "$LINENO" 5; }
+else
+ ICU_CFLAGS=$pkg_cv_ICU_CFLAGS
+ ICU_LIBS=$pkg_cv_ICU_LIBS
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+fi
+CXXFLAGS="$CXXFLAGS $ICU_CFLAGS"
+LIBS="$ICU_LIBS $LIBS"
+
+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a readline compatible library" >&5
$as_echo_n "checking for a readline compatible library... " >&6; }
if ${ax_cv_lib_readline+:} false; then :
@@ -17710,165 +17793,6 @@ done
-#check for needed dependencies
-useICU=1;
-# inspired by feh-1.3.4/configure.ac. Tnx Tom Gilbert and feh hackers.
-
-# Check whether --with-icu was given.
-if test "${with_icu+set}" = set; then :
- withval=$with_icu; if test "$with_icu" = "no"; then
- useICU=0
- else
- CXXFLAGS="$CXXFLAGS -I$withval/include"
- LIBS="-L$withval/lib $LIBS"
- fi
-fi
-
-
-if test "$useICU" = "1"; then
-
- succeeded=no
-
- if test -z "$ICU_CONFIG"; then
- # Extract the first word of "icu-config", so it can be a program name with args.
-set dummy icu-config; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_path_ICU_CONFIG+:} false; then :
- $as_echo_n "(cached) " >&6
-else
- case $ICU_CONFIG in
- [\\/]* | ?:[\\/]*)
- ac_cv_path_ICU_CONFIG="$ICU_CONFIG" # Let the user override the test with a path.
- ;;
- *)
- as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
- IFS=$as_save_IFS
- test -z "$as_dir" && as_dir=.
- for ac_exec_ext in '' $ac_executable_extensions; do
- if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
- ac_cv_path_ICU_CONFIG="$as_dir/$ac_word$ac_exec_ext"
- $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
- break 2
- fi
-done
- done
-IFS=$as_save_IFS
-
- test -z "$ac_cv_path_ICU_CONFIG" && ac_cv_path_ICU_CONFIG="no"
- ;;
-esac
-fi
-ICU_CONFIG=$ac_cv_path_ICU_CONFIG
-if test -n "$ICU_CONFIG"; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ICU_CONFIG" >&5
-$as_echo "$ICU_CONFIG" >&6; }
-else
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-fi
-
-
- fi
-
- if test "$ICU_CONFIG" = "no" ; then
- echo "*** The icu-config script could not be found. Make sure it is"
- echo "*** in your path, and that taglib is properly installed."
- echo "*** Or see http://www.icu-project.org/"
- else
- ICU_VERSION=`$ICU_CONFIG --version`
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for ICU >= 4.2" >&5
-$as_echo_n "checking for ICU >= 4.2... " >&6; }
- VERSION_CHECK=`expr $ICU_VERSION \>\= 4.2`
- if test "$VERSION_CHECK" = "1" ; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
- succeeded=yes
-
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking ICU_CFLAGS" >&5
-$as_echo_n "checking ICU_CFLAGS... " >&6; }
- ICU_CFLAGS=`$ICU_CONFIG --cflags`
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ICU_CFLAGS" >&5
-$as_echo "$ICU_CFLAGS" >&6; }
-
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking ICU_CPPSEARCHPATH" >&5
-$as_echo_n "checking ICU_CPPSEARCHPATH... " >&6; }
- ICU_CPPSEARCHPATH=`$ICU_CONFIG --cppflags-searchpath`
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ICU_CPPSEARCHPATH" >&5
-$as_echo "$ICU_CPPSEARCHPATH" >&6; }
-
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking ICU_CXXFLAGS" >&5
-$as_echo_n "checking ICU_CXXFLAGS... " >&6; }
- ICU_CXXFLAGS=`$ICU_CONFIG --cxxflags`
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ICU_CXXFLAGS" >&5
-$as_echo "$ICU_CXXFLAGS" >&6; }
-
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking ICU_LIBS" >&5
-$as_echo_n "checking ICU_LIBS... " >&6; }
- ICU_LIBS=`$ICU_CONFIG --ldflags-libsonly`
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ICU_LIBS" >&5
-$as_echo "$ICU_LIBS" >&6; }
-
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking ICU_LIBPATH" >&5
-$as_echo_n "checking ICU_LIBPATH... " >&6; }
- ICU_LIBPATH=`$ICU_CONFIG --ldflags-searchpath`
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ICU_LIBPATH" >&5
-$as_echo "$ICU_LIBPATH" >&6; }
-
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking ICU_IOLIBS" >&5
-$as_echo_n "checking ICU_IOLIBS... " >&6; }
- ICU_IOLIBS=`$ICU_CONFIG --ldflags-icuio`
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ICU_IOLIBS" >&5
-$as_echo "$ICU_IOLIBS" >&6; }
- else
- ICU_CFLAGS=""
- ICU_CXXFLAGS=""
- ICU_CPPSEARCHPATH=""
- ICU_LIBPATH=""
- ICU_LIBS=""
- ICU_IOLIBS=""
- ## If we have a custom action on failure, don't print errors, but
- ## do set a variable so people can do so.
-
- fi
-
-
-
-
-
-
-
-
- fi
-
- if test $succeeded = yes; then
- CXXFLAGS="$CXXFLAGS $ICU_CPPSEARCHPATH"
- LIBS="$ICU_LIBPATH $ICU_LIBS $ICU_IOLIBS $LIBS"
- else
- { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "\"No ICU development environment found. Please check if libicu-ev or the like is installed\"
-See \`config.log' for more details" "$LINENO" 5; }
- fi
-
-
-$as_echo "#define HAVE_ICU 1" >>confdefs.h
-
-else
- as_fn_error $? "\"ICU support is required\"" "$LINENO" 5
-fi
-
-
-# Check whether --with-ticcutils was given.
-if test "${with_ticcutils+set}" = set; then :
- withval=$with_ticcutils; PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$withval/lib/pkgconfig"
-else
- PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$prefix/lib/pkgconfig"
-fi
-
-
pkg_failed=no
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ticcutils" >&5
$as_echo_n "checking for ticcutils... " >&6; }
@@ -17963,14 +17887,6 @@ CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS"
LIBS="$ticcutils_LIBS $LIBS"
-# Check whether --with-timbl was given.
-if test "${with_timbl+set}" = set; then :
- withval=$with_timbl; PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$withval/lib/pkgconfig"
-else
- PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$prefix/lib/pkgconfig"
-fi
-
-
pkg_failed=no
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for timbl" >&5
$as_echo_n "checking for timbl... " >&6; }
@@ -18065,14 +17981,6 @@ CXXFLAGS="$CXXFLAGS $timbl_CFLAGS"
LIBS="$timbl_LIBS $LIBS"
-# Check whether --with-mbt was given.
-if test "${with_mbt+set}" = set; then :
- withval=$with_mbt; PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$withval/lib/pkgconfig"
-else
- PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$prefix/lib/pkgconfig"
-fi
-
-
pkg_failed=no
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for mbt" >&5
$as_echo_n "checking for mbt... " >&6; }
@@ -18167,16 +18075,6 @@ CXXFLAGS="$CXXFLAGS $mbt_CFLAGS"
LIBS="$mbt_LIBS $LIBS"
-# Check whether --with-folia was given.
-if test "${with_folia+set}" = set; then :
- withval=$with_folia; PKG_CONFIG_PATH="$withval/lib/pkgconfig:$PKG_CONFIG_PATH"
-else
- PKG_CONFIG_PATH="$prefix/lib/pkgconfig:$PKG_CONFIG_PATH"
-fi
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: pkg-config search path: $PKG_CONFIG_PATH " >&5
-$as_echo "$as_me: pkg-config search path: $PKG_CONFIG_PATH " >&6;}
-
pkg_failed=no
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for folia" >&5
$as_echo_n "checking for folia... " >&6; }
@@ -18185,12 +18083,12 @@ if test -n "$folia_CFLAGS"; then
pkg_cv_folia_CFLAGS="$folia_CFLAGS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"folia >= 1.0\""; } >&5
- ($PKG_CONFIG --exists --print-errors "folia >= 1.0") 2>&5
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"folia >= 1.10\""; } >&5
+ ($PKG_CONFIG --exists --print-errors "folia >= 1.10") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
- pkg_cv_folia_CFLAGS=`$PKG_CONFIG --cflags "folia >= 1.0" 2>/dev/null`
+ pkg_cv_folia_CFLAGS=`$PKG_CONFIG --cflags "folia >= 1.10" 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
@@ -18202,12 +18100,12 @@ if test -n "$folia_LIBS"; then
pkg_cv_folia_LIBS="$folia_LIBS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"folia >= 1.0\""; } >&5
- ($PKG_CONFIG --exists --print-errors "folia >= 1.0") 2>&5
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"folia >= 1.10\""; } >&5
+ ($PKG_CONFIG --exists --print-errors "folia >= 1.10") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
- pkg_cv_folia_LIBS=`$PKG_CONFIG --libs "folia >= 1.0" 2>/dev/null`
+ pkg_cv_folia_LIBS=`$PKG_CONFIG --libs "folia >= 1.10" 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
@@ -18228,14 +18126,14 @@ else
_pkg_short_errors_supported=no
fi
if test $_pkg_short_errors_supported = yes; then
- folia_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "folia >= 1.0" 2>&1`
+ folia_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "folia >= 1.10" 2>&1`
else
- folia_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "folia >= 1.0" 2>&1`
+ folia_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "folia >= 1.10" 2>&1`
fi
# Put the nasty error message in config.log where it belongs
echo "$folia_PKG_ERRORS" >&5
- as_fn_error $? "Package requirements (folia >= 1.0) were not met:
+ as_fn_error $? "Package requirements (folia >= 1.10) were not met:
$folia_PKG_ERRORS
@@ -18271,14 +18169,6 @@ CXXFLAGS="$CXXFLAGS $folia_CFLAGS"
LIBS="$folia_LIBS $LIBS"
-# Check whether --with-ucto was given.
-if test "${with_ucto+set}" = set; then :
- withval=$with_ucto; PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$withval/lib/pkgconfig"
-else
- PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$prefix/lib/pkgconfig"
-fi
-
-
pkg_failed=no
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ucto" >&5
$as_echo_n "checking for ucto... " >&6; }
@@ -18287,12 +18177,12 @@ if test -n "$ucto_CFLAGS"; then
pkg_cv_ucto_CFLAGS="$ucto_CFLAGS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ucto >= 0.9.6 \""; } >&5
- ($PKG_CONFIG --exists --print-errors "ucto >= 0.9.6 ") 2>&5
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ucto >= 0.9.7 \""; } >&5
+ ($PKG_CONFIG --exists --print-errors "ucto >= 0.9.7 ") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
- pkg_cv_ucto_CFLAGS=`$PKG_CONFIG --cflags "ucto >= 0.9.6 " 2>/dev/null`
+ pkg_cv_ucto_CFLAGS=`$PKG_CONFIG --cflags "ucto >= 0.9.7 " 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
@@ -18304,12 +18194,12 @@ if test -n "$ucto_LIBS"; then
pkg_cv_ucto_LIBS="$ucto_LIBS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ucto >= 0.9.6 \""; } >&5
- ($PKG_CONFIG --exists --print-errors "ucto >= 0.9.6 ") 2>&5
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ucto >= 0.9.7 \""; } >&5
+ ($PKG_CONFIG --exists --print-errors "ucto >= 0.9.7 ") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
- pkg_cv_ucto_LIBS=`$PKG_CONFIG --libs "ucto >= 0.9.6 " 2>/dev/null`
+ pkg_cv_ucto_LIBS=`$PKG_CONFIG --libs "ucto >= 0.9.7 " 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
@@ -18330,14 +18220,14 @@ else
_pkg_short_errors_supported=no
fi
if test $_pkg_short_errors_supported = yes; then
- ucto_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "ucto >= 0.9.6 " 2>&1`
+ ucto_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "ucto >= 0.9.7 " 2>&1`
else
- ucto_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "ucto >= 0.9.6 " 2>&1`
+ ucto_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "ucto >= 0.9.7 " 2>&1`
fi
# Put the nasty error message in config.log where it belongs
echo "$ucto_PKG_ERRORS" >&5
- as_fn_error $? "Package requirements (ucto >= 0.9.6 ) were not met:
+ as_fn_error $? "Package requirements (ucto >= 0.9.7 ) were not met:
$ucto_PKG_ERRORS
@@ -18381,12 +18271,12 @@ if test -n "$frogdata_CFLAGS"; then
pkg_cv_frogdata_CFLAGS="$frogdata_CFLAGS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"frogdata >= 0.13 \""; } >&5
- ($PKG_CONFIG --exists --print-errors "frogdata >= 0.13 ") 2>&5
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"frogdata >= 0.15 \""; } >&5
+ ($PKG_CONFIG --exists --print-errors "frogdata >= 0.15 ") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
- pkg_cv_frogdata_CFLAGS=`$PKG_CONFIG --cflags "frogdata >= 0.13 " 2>/dev/null`
+ pkg_cv_frogdata_CFLAGS=`$PKG_CONFIG --cflags "frogdata >= 0.15 " 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
@@ -18398,12 +18288,12 @@ if test -n "$frogdata_LIBS"; then
pkg_cv_frogdata_LIBS="$frogdata_LIBS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"frogdata >= 0.13 \""; } >&5
- ($PKG_CONFIG --exists --print-errors "frogdata >= 0.13 ") 2>&5
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"frogdata >= 0.15 \""; } >&5
+ ($PKG_CONFIG --exists --print-errors "frogdata >= 0.15 ") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
- pkg_cv_frogdata_LIBS=`$PKG_CONFIG --libs "frogdata >= 0.13 " 2>/dev/null`
+ pkg_cv_frogdata_LIBS=`$PKG_CONFIG --libs "frogdata >= 0.15 " 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
@@ -18424,14 +18314,14 @@ else
_pkg_short_errors_supported=no
fi
if test $_pkg_short_errors_supported = yes; then
- frogdata_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "frogdata >= 0.13 " 2>&1`
+ frogdata_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "frogdata >= 0.15 " 2>&1`
else
- frogdata_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "frogdata >= 0.13 " 2>&1`
+ frogdata_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "frogdata >= 0.15 " 2>&1`
fi
# Put the nasty error message in config.log where it belongs
echo "$frogdata_PKG_ERRORS" >&5
- as_fn_error $? "Package requirements (frogdata >= 0.13 ) were not met:
+ as_fn_error $? "Package requirements (frogdata >= 0.15 ) were not met:
$frogdata_PKG_ERRORS
@@ -18463,7 +18353,7 @@ else
$as_echo "yes" >&6; }
fi
-#
+
ac_config_files="$ac_config_files Makefile frog.pc m4/Makefile docs/Makefile tests/Makefile src/Makefile include/Makefile include/frog/Makefile"
@@ -19001,7 +18891,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by frog $as_me 0.13.7, which was
+This file was extended by frog $as_me 0.13.8, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -19067,7 +18957,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-frog config.status 0.13.7
+frog config.status 0.13.8
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
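
The removed pkgconfigpath probe above is replaced by a much simpler rule:
configure now seeds PKG_CONFIG_PATH itself. In shell terms the new logic is
roughly this (a condensed restatement of the lines visible in the hunks above):

    if test "x$PKG_CONFIG_PATH" = x; then
      export PKG_CONFIG_PATH="$prefix/lib/pkgconfig"
    else
      export PKG_CONFIG_PATH="$prefix/lib/pkgconfig:$PKG_CONFIG_PATH"
    fi

so ticcutils, timbl, mbt, folia and ucto installed under the same prefix are
found without the old --with-* options.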
diff --git a/configure.ac b/configure.ac
index 4559869..b8bd64a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ(2.59)
-AC_INIT([frog], [0.13.7], [lamasoftware at science.ru.nl])
+AC_INIT([frog], [0.13.8], [lamasoftware at science.ru.nl])
AM_INIT_AUTOMAKE
AC_CONFIG_SRCDIR([configure.ac])
AC_CONFIG_MACRO_DIR([m4])
@@ -10,9 +10,9 @@ AC_CONFIG_HEADER([config.h])
if test x"${CXXFLAGS+set}" = xset; then
# the user set CXXFLAGS; don't override it.
- cxx_flags_were_set=true
-else
cxx_flags_were_set=false
+else
+ cxx_flags_were_set=true
fi
# Checks for programs.
@@ -26,9 +26,6 @@ fi
AC_PROG_LIBTOOL
LT_INIT
-SYSCONFDIR=$sysconfdir
-AC_SUBST([SYSCONFDIR])
-
# when running tests, use CXX
AC_LANG([C++])
@@ -57,7 +54,12 @@ if test "x$ac_cv_prog_cxx_openmp" != "xunsupported"; then
CXXFLAGS="$CXXFLAGS $OPENMP_CXXFLAGS"
AC_DEFINE(HAVE_OPENMP, 1 , Define to 1 if you have OpenMP )
else
- AC_MSG_NOTICE(We don't have OpenMP. Multithreaded operation is disabled)
+ if test "$CXX" = "clang++-4.0"; then
+ CXXFLAGS="$CXXFLAGS -fopenmp"
+ AC_DEFINE(HAVE_OPENMP, 1 , Define to 1 if you have OpenMP )
+ else
+ AC_MSG_NOTICE(We don't have OpenMP. Multithreaded operation is disabled)
+ fi
fi
if test $prefix = "NONE"; then
@@ -65,86 +67,44 @@ if test $prefix = "NONE"; then
fi
PKG_PROG_PKG_CONFIG
-AC_PATH_PROG(pkgconfigpath, pkg-config, NONE)
-if test "$pkgconfigpath" != "NONE"; then
-# ugly hack when PKG_CONFIG_PATH isn't defined.
-# couldn't get it to work otherwise
- if test "x$PKG_CONFIG_PATH" = x; then
- export PKG_CONFIG_PATH=""
- fi
+if test "x$PKG_CONFIG_PATH" = x; then
+ export PKG_CONFIG_PATH="$prefix/lib/pkgconfig"
+else
+ export PKG_CONFIG_PATH="$prefix/lib/pkgconfig:$PKG_CONFIG_PATH"
fi
+
PKG_CHECK_MODULES([XML2], [libxml-2.0 >= 2.6.16] )
CXXFLAGS="$CXXFLAGS $XML2_CFLAGS"
LIBS="$XML2_LIBS $LIBS"
-AX_LIB_READLINE
-
+PKG_CHECK_MODULES([ICU], [icu-uc >= 50 icu-io] )
+CXXFLAGS="$CXXFLAGS $ICU_CFLAGS"
+LIBS="$ICU_LIBS $LIBS"
-#check for needed dependencies
-useICU=1;
-# inspired by feh-1.3.4/configure.ac. Tnx Tom Gilbert and feh hackers.
-AC_ARG_WITH(icu,
- [ --with-icu=DIR use ICU installed in <DIR>],
- [if test "$with_icu" = "no"; then
- useICU=0
- else
- CXXFLAGS="$CXXFLAGS -I$withval/include"
- LIBS="-L$withval/lib $LIBS"
- fi] )
-
-if test "$useICU" = "1"; then
- AX_ICU_CHECK( [4.2],
- [CXXFLAGS="$CXXFLAGS $ICU_CPPSEARCHPATH"
- LIBS="$ICU_LIBPATH $ICU_LIBS $ICU_IOLIBS $LIBS"],
- [AC_MSG_FAILURE( "No ICU development environment found. Please check if libicu-ev or the like is installed" )] )
- AC_DEFINE(HAVE_ICU, 1, we want to use ICU )
-else
- AC_MSG_ERROR("ICU support is required")
-fi
+AX_LIB_READLINE
-AC_ARG_WITH(ticcutils,
- [ --with-ticcutils=DIR use ticcutils installed in <DIR>],
- [PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$withval/lib/pkgconfig"],
- [PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$prefix/lib/pkgconfig"])
PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.9] )
CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS"
LIBS="$ticcutils_LIBS $LIBS"
-AC_ARG_WITH(timbl,
- [ --with-timbl=DIR use timbl installed in <DIR>],
- [PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$withval/lib/pkgconfig"],
- [PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$prefix/lib/pkgconfig"])
PKG_CHECK_MODULES([timbl], [timbl >= 6.4.4] )
CXXFLAGS="$CXXFLAGS $timbl_CFLAGS"
LIBS="$timbl_LIBS $LIBS"
-AC_ARG_WITH(mbt,
- [ --with-mbt=DIR use mbt installed in <DIR>],
- [PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$withval/lib/pkgconfig"],
- [PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$prefix/lib/pkgconfig"])
PKG_CHECK_MODULES([mbt], [mbt >= 3.2.14] )
CXXFLAGS="$CXXFLAGS $mbt_CFLAGS"
LIBS="$mbt_LIBS $LIBS"
-AC_ARG_WITH(folia,
- [ --with-folia=DIR use libfolia installed in <DIR>],
- [PKG_CONFIG_PATH="$withval/lib/pkgconfig:$PKG_CONFIG_PATH"],
- [PKG_CONFIG_PATH="$prefix/lib/pkgconfig:$PKG_CONFIG_PATH"])
-AC_MSG_NOTICE( [pkg-config search path: $PKG_CONFIG_PATH] )
-PKG_CHECK_MODULES([folia],[folia >= 1.0])
+PKG_CHECK_MODULES([folia],[folia >= 1.10])
CXXFLAGS="$CXXFLAGS $folia_CFLAGS"
LIBS="$folia_LIBS $LIBS"
-AC_ARG_WITH(ucto,
- [ --with-ucto=DIR use ucto installed in <DIR>],
- [PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$withval/lib/pkgconfig"],
- [PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$prefix/lib/pkgconfig"])
-PKG_CHECK_MODULES([ucto], [ucto >= 0.9.6] )
+PKG_CHECK_MODULES([ucto], [ucto >= 0.9.7] )
CXXFLAGS="$CXXFLAGS $ucto_CFLAGS"
LIBS="$ucto_LIBS $LIBS"
-PKG_CHECK_MODULES([frogdata], [frogdata >= 0.13] )
-#
+PKG_CHECK_MODULES([frogdata], [frogdata >= 0.15] )
+
AC_OUTPUT([
Makefile
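
For reference: the new PKG_CHECK_MODULES([ICU], [icu-uc >= 50 icu-io]) check
replaces the old icu-config probe and boils down to ordinary pkg-config
queries, roughly as follows (a sketch; it assumes pkg-config and the ICU .pc
files are installed, and the variable names are illustrative):

    pkg-config --exists --print-errors "icu-uc >= 50 icu-io"
    ICU_CFLAGS=`pkg-config --cflags "icu-uc >= 50 icu-io"`
    ICU_LIBS=`pkg-config --libs "icu-uc >= 50 icu-io"`

The resulting flags are appended to CXXFLAGS and LIBS, just as the folia,
ucto and frogdata checks already do.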
diff --git a/docs/Makefile.in b/docs/Makefile.in
index 4fe49eb..eb1e040 100644
--- a/docs/Makefile.in
+++ b/docs/Makefile.in
@@ -89,8 +89,7 @@ build_triplet = @build@
host_triplet = @host@
subdir = docs
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \
- $(top_srcdir)/m4/ax_lib_readline.m4 \
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_lib_readline.m4 \
$(top_srcdir)/m4/ax_pthread.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
@@ -186,13 +185,7 @@ EXEEXT = @EXEEXT@
FGREP = @FGREP@
GREP = @GREP@
ICU_CFLAGS = @ICU_CFLAGS@
-ICU_CONFIG = @ICU_CONFIG@
-ICU_CPPSEARCHPATH = @ICU_CPPSEARCHPATH@
-ICU_CXXFLAGS = @ICU_CXXFLAGS@
-ICU_IOLIBS = @ICU_IOLIBS@
-ICU_LIBPATH = @ICU_LIBPATH@
ICU_LIBS = @ICU_LIBS@
-ICU_VERSION = @ICU_VERSION@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -236,7 +229,6 @@ SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
STRIP = @STRIP@
-SYSCONFDIR = @SYSCONFDIR@
VERSION = @VERSION@
XML2_CFLAGS = @XML2_CFLAGS@
XML2_LIBS = @XML2_LIBS@
@@ -289,10 +281,10 @@ mbt_LIBS = @mbt_LIBS@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
-pkgconfigpath = @pkgconfigpath@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
+runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
diff --git a/docs/frog.1 b/docs/frog.1
index 28ad0e4..e5de2bb 100644
--- a/docs/frog.1
+++ b/docs/frog.1
@@ -1,4 +1,4 @@
-.TH frog 1 "2016 march 10"
+.TH frog 1 "2017 may 1"
.SH NAME
frog \- Dutch Natural Language Toolkit
@@ -45,7 +45,7 @@ The default tabbed output is also more detailed in the Morpheme field.
set input encoding. (default UTF8)
.RE
-.BR \-h
+.BR \-h " or " \-\-help
.RS
give some help
.RE
@@ -55,6 +55,18 @@ give some help
keep the intermediate files from the parser. Last sentence only!
.RE
+.BR \-\-language='comma separated list of languages'
+.RS
+Set the languages to work on. This parameter is also passed to the tokenizer.
+The strings are assumed to be ISO 639\-2 codes.
+
+The first language in the list will be the default; unspecified languages are
+assumed to be that default.
+
+e.g. \-\-language=nld,eng,por
+means: detect Dutch, English and Portuguese, with Dutch being the default.
+.RE
+
.BR \-n
.RS
assume inputfile to hold one sentence per line.
@@ -63,6 +75,15 @@ Very useful when running interactive, otherwise an empty line is needed to
signal end of input.
.RE
+.BR \-\-nostdout
+.RS
+suppress the columned output to stdout (when no output file is specified with
+\-o or \-\-outputdir).
+
+Especially useful when XML output is specified with \-X or \-\-xmldir.
+.RE
+
+
.BR \-o " <file>"
.RS
send output to 'file' instead of stdout. Defaults to the name of the inputfile with '.out' appended.
@@ -73,6 +94,15 @@ send output to 'file' instead of stdout. Defaults to the name of the inputfile w
send all output to 'dir' instead of stdout. Creates filenames from the inputfilename(s) with '.out' appended.
.RE
+.BR \-\-retry
+.RS
+assume a re-run on the same input file(s). Frog will only process those files
+that haven't been processed yet. This is accomplished by looking at the output
+file names (so this has no effect if none of \-o, \-\-outputdir, \-X or
+\-\-xmldir is used).
+.RE
+
+
.BR \-\-skip =[aclmnpt]
.RS
skip parts of the process: Tokenizer (t), Chunker (c), Lemmatizer (l), Morphological Analyzer (a), Multi\(hyWord unit (m), Named\(hyEntity recognizer (n) or Parser (p)
@@ -106,10 +136,22 @@ is provided, all '.xml' files in 'dir' will be processed as FoLia XML.
.RS
When
.BR \-x
-is given, use 'cls' to find text in the FoLiA document(s).
+is given, use 'cls' to find AND store text in the FoLiA document(s).
+Using \-\-inputclass and \-\-outputclass is in general a better choice.
.RE
+.BR \-\-inputclass "=<cls>"
+.RS
+use 'cls' to find text in the FoLiA input document(s).
+.RE
+
+.BR \-\-outputclass "=<cls>"
+.RS
+use 'cls' to output text in the FoLiA input document(s).
+Preferably this is a different class than the inputclass.
+.RE
+
.BR \-\-testdir =<dir>
.RS
process all files in 'dir'. When the input mode is XML, only '.xml' files are taken from 'dir'. See also
@@ -121,6 +163,11 @@ process all files in 'dir'. When the input mode is XML, only '.xml' files are te
location to store intermediate files. Default /tmp.
.RE
+.BR \-\-uttmarker =<mark>
+.RS
+assume all utterances are separated by 'mark' (the default is none).
+.RE
+
.BR \-\-threads =<n>
.RS
use a maximum of 'n' threads. The default is to take whatever is needed.
@@ -154,11 +201,12 @@ for FoLia is given, use 'id' to give the doc an ID.
likely
.SH AUTHORS
-Maarten van Gompel proycon at anaproy.nl
+Maarten van Gompel
-Ko van der Sloot Timbl at uvt.nl
+Ko van der Sloot
-Antal van den Bosch Timbl at uvt.nl
+Antal van den Bosch
+e\-mail: lamasoftware at science.ru.nl
.SH SEE ALSO
.BR ucto (1)
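
Taken together, the newly documented options allow an invocation along these
lines (an illustrative example only; 'corpus' and 'out' are placeholder
directory names):

    frog --language=nld,eng,por --testdir=corpus --outputdir=out --retry

This processes every file in 'corpus', writes the tabbed output for each file
to 'out' (input name plus '.out'), treats Dutch as the default of the three
listed languages, and on a re-run skips any file whose output already exists.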
diff --git a/include/Makefile.in b/include/Makefile.in
index 606d5f6..e71eafa 100644
--- a/include/Makefile.in
+++ b/include/Makefile.in
@@ -94,8 +94,7 @@ build_triplet = @build@
host_triplet = @host@
subdir = include
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \
- $(top_srcdir)/m4/ax_lib_readline.m4 \
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_lib_readline.m4 \
$(top_srcdir)/m4/ax_pthread.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
@@ -220,13 +219,7 @@ EXEEXT = @EXEEXT@
FGREP = @FGREP@
GREP = @GREP@
ICU_CFLAGS = @ICU_CFLAGS@
-ICU_CONFIG = @ICU_CONFIG@
-ICU_CPPSEARCHPATH = @ICU_CPPSEARCHPATH@
-ICU_CXXFLAGS = @ICU_CXXFLAGS@
-ICU_IOLIBS = @ICU_IOLIBS@
-ICU_LIBPATH = @ICU_LIBPATH@
ICU_LIBS = @ICU_LIBS@
-ICU_VERSION = @ICU_VERSION@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -270,7 +263,6 @@ SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
STRIP = @STRIP@
-SYSCONFDIR = @SYSCONFDIR@
VERSION = @VERSION@
XML2_CFLAGS = @XML2_CFLAGS@
XML2_LIBS = @XML2_LIBS@
@@ -323,10 +315,10 @@ mbt_LIBS = @mbt_LIBS@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
-pkgconfigpath = @pkgconfigpath@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
+runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
diff --git a/include/frog/FrogAPI.h b/include/frog/FrogAPI.h
index 52311cb..22253b9 100644
--- a/include/frog/FrogAPI.h
+++ b/include/frog/FrogAPI.h
@@ -46,7 +46,7 @@ class Mbma;
class Mblem;
class Mwu;
class Parser;
-class POSTagger;
+class CGNTagger;
class IOBTagger;
class NERTagger;
@@ -63,10 +63,11 @@ class FrogOptions {
bool doSentencePerLine;
bool doQuoteDetection;
bool doDirTest;
- bool doServer;
-
+ bool doRetry;
+ bool noStdOut;
bool doXMLin;
bool doXMLout;
+ bool doServer;
bool doKanon;
int debugFlag;
@@ -77,8 +78,10 @@ class FrogOptions {
std::string uttmark;
std::string listenport;
std::string docid;
- std::string textclass;
+ std::string inputclass;
+ std::string outputclass;
std::string language;
+ std::string textredundancy;
unsigned int maxParserTokens;
@@ -104,6 +107,7 @@ class FrogAPI {
std::string Frogtostringfromfile( const std::string& );
private:
+ void test_version( const std::string&, double );
// functions
bool TestSentence( folia::Sentence*, TimerBlock&);
void FrogStdin( bool prompt );
@@ -128,13 +132,16 @@ class FrogAPI {
Mblem *myMblem;
Mwu *myMwu;
Parser *myParser;
- POSTagger *myPoSTagger;
+ CGNTagger *myCGNTagger;
IOBTagger *myIOBTagger;
NERTagger *myNERTagger;
UctoTokenizer *tokenizer;
};
std::vector<std::string> get_full_morph_analysis( folia::Word *, bool = false );
+std::vector<std::string> get_full_morph_analysis( folia::Word *,
+ const std::string&,
+ bool = false );
std::vector<std::string> get_compound_analysis( folia::Word * );
#endif
diff --git a/include/frog/Makefile.am b/include/frog/Makefile.am
index 4f0711e..18d2c13 100644
--- a/include/frog/Makefile.am
+++ b/include/frog/Makefile.am
@@ -1,4 +1,5 @@
pkginclude_HEADERS = FrogAPI.h Frog.h mblem_mod.h \
mbma_rule.h mbma_mod.h mbma_brackets.h clex.h mwu_chunker_mod.h \
- pos_tagger_mod.h cgn_tagger_mod.h iob_tagger_mod.h Parser.h \
- ucto_tokenizer_mod.h ner_tagger_mod.h csidp.h ckyparser.h
+ tagger_base.h cgn_tagger_mod.h iob_tagger_mod.h \
+ Parser.h ucto_tokenizer_mod.h ner_tagger_mod.h \
+ csidp.h ckyparser.h
diff --git a/include/frog/Makefile.in b/include/frog/Makefile.in
index 47e48c2..2991af7 100644
--- a/include/frog/Makefile.in
+++ b/include/frog/Makefile.in
@@ -90,8 +90,7 @@ build_triplet = @build@
host_triplet = @host@
subdir = include/frog
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \
- $(top_srcdir)/m4/ax_lib_readline.m4 \
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_lib_readline.m4 \
$(top_srcdir)/m4/ax_pthread.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
@@ -204,13 +203,7 @@ EXEEXT = @EXEEXT@
FGREP = @FGREP@
GREP = @GREP@
ICU_CFLAGS = @ICU_CFLAGS@
-ICU_CONFIG = @ICU_CONFIG@
-ICU_CPPSEARCHPATH = @ICU_CPPSEARCHPATH@
-ICU_CXXFLAGS = @ICU_CXXFLAGS@
-ICU_IOLIBS = @ICU_IOLIBS@
-ICU_LIBPATH = @ICU_LIBPATH@
ICU_LIBS = @ICU_LIBS@
-ICU_VERSION = @ICU_VERSION@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -254,7 +247,6 @@ SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
STRIP = @STRIP@
-SYSCONFDIR = @SYSCONFDIR@
VERSION = @VERSION@
XML2_CFLAGS = @XML2_CFLAGS@
XML2_LIBS = @XML2_LIBS@
@@ -307,10 +299,10 @@ mbt_LIBS = @mbt_LIBS@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
-pkgconfigpath = @pkgconfigpath@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
+runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
@@ -327,8 +319,9 @@ ucto_CFLAGS = @ucto_CFLAGS@
ucto_LIBS = @ucto_LIBS@
pkginclude_HEADERS = FrogAPI.h Frog.h mblem_mod.h \
mbma_rule.h mbma_mod.h mbma_brackets.h clex.h mwu_chunker_mod.h \
- pos_tagger_mod.h cgn_tagger_mod.h iob_tagger_mod.h Parser.h \
- ucto_tokenizer_mod.h ner_tagger_mod.h csidp.h ckyparser.h
+ tagger_base.h cgn_tagger_mod.h iob_tagger_mod.h \
+ Parser.h ucto_tokenizer_mod.h ner_tagger_mod.h \
+ csidp.h ckyparser.h
all: all-am
diff --git a/include/frog/Parser.h b/include/frog/Parser.h
index bab76b0..1e40750 100644
--- a/include/frog/Parser.h
+++ b/include/frog/Parser.h
@@ -36,9 +36,16 @@ struct parseData;
class Parser {
public:
- Parser(TiCC::LogStream* logstream):pairs(0),dir(0),rels(0),isInit(false) {
- parseLog = new TiCC::LogStream(logstream, "parser-");
- };
+ explicit Parser( TiCC::LogStream* logstream ):
+ pairs(0),
+ dir(0),
+ rels(0),
+ maxDepSpan( 0 ),
+ isInit( false ),
+ filter( 0 )
+ {
+ parseLog = new TiCC::LogStream(logstream, "parser-");
+ };
~Parser();
bool init( const TiCC::Configuration& );
void addDeclaration( folia::Document& doc ) const;
@@ -63,7 +70,9 @@ class Parser {
std::string dep_tagset;
std::string POS_tagset;
std::string MWU_tagset;
+ std::string textclass;
Tokenizer::UnicodeFilter *filter;
+ Parser( const Parser& ){}; // inhibit copies
};
diff --git a/include/frog/cgn_tagger_mod.h b/include/frog/cgn_tagger_mod.h
index 1f02bca..dd73b9f 100644
--- a/include/frog/cgn_tagger_mod.h
+++ b/include/frog/cgn_tagger_mod.h
@@ -32,15 +32,16 @@
#ifndef CGN_TAGGER_MOD_H
#define CGN_TAGGER_MOD_H
-#include "frog/pos_tagger_mod.h"
+#include "frog/tagger_base.h"
-class CGNTagger: public POSTagger {
+class CGNTagger: public BaseTagger {
public:
- CGNTagger( TiCC::LogStream *l ): POSTagger( l ){};
+ explicit CGNTagger( TiCC::LogStream *l ): BaseTagger( l, "tagger" ){};
bool init( const TiCC::Configuration& );
- void Classify( const std::vector<folia::Word *>& );
- void post_process( const std::vector<folia::Word *>& words );
+ void addDeclaration( folia::Document& ) const;
+ void post_process( const std::vector<folia::Word*>& );
private:
+ void addTag( folia::Word *, const std::string&, double );
void fillSubSetTable();
std::string getSubSet( const std::string& , const std::string& );
std::multimap<std::string,std::string> cgnSubSets;
diff --git a/include/frog/ckyparser.h b/include/frog/ckyparser.h
index 0f3720e..55e945c 100644
--- a/include/frog/ckyparser.h
+++ b/include/frog/ckyparser.h
@@ -110,7 +110,8 @@ class chart_rec {
class CKYParser {
public:
- CKYParser( size_t, const std::vector<const Constraint*>& );
+ CKYParser( size_t, const std::vector<const Constraint*>&, TiCC::LogStream* );
+ ~CKYParser(){ delete ckyLog; };
void parse();
void leftIncomplete( int , int , std::vector<parsrel>& );
void rightIncomplete( int , int , std::vector<parsrel>& );
@@ -127,6 +128,7 @@ private:
std::vector< std::vector< std::vector<const Constraint*>>> edgeConstraints;
std::vector< std::vector<chart_rec>> chart;
+ TiCC::LogStream *ckyLog;
};
#endif
diff --git a/include/frog/csidp.h b/include/frog/csidp.h
index 2ec5efd..fa315b4 100644
--- a/include/frog/csidp.h
+++ b/include/frog/csidp.h
@@ -22,6 +22,7 @@ std::vector<parsrel> parse( const std::vector<timbl_result>&,
const std::vector<timbl_result>&,
const std::vector<timbl_result>&,
size_t,
- int );
+ int,
+ TiCC::LogStream* );
#endif
diff --git a/include/frog/iob_tagger_mod.h b/include/frog/iob_tagger_mod.h
index 0389327..f38c099 100644
--- a/include/frog/iob_tagger_mod.h
+++ b/include/frog/iob_tagger_mod.h
@@ -32,29 +32,24 @@
#ifndef IOB_TAGGER_MOD_H
#define IOB_TAGGER_MOD_H
-class IOBTagger {
+#include "frog/tagger_base.h"
+
+class IOBTagger: public BaseTagger {
public:
- IOBTagger(TiCC::LogStream *);
- ~IOBTagger();
+ explicit IOBTagger( TiCC::LogStream *l ): BaseTagger( l, "IOB" ){};
bool init( const TiCC::Configuration& );
void addDeclaration( folia::Document& ) const;
- void Classify( const std::vector<folia::Word *>& );
- std::string getTagset() const { return tagset; };
- std::string set_eos_mark( const std::string& );
+ void Classify( const std::vector<folia::Word*>& );
+ void post_process( const std::vector<folia::Word*>& );
private:
void addChunk( folia::ChunkingLayer *,
const std::vector<folia::Word*>&,
const std::vector<double>&,
+ const std::string&,
const std::string& );
void addIOBTags( const std::vector<folia::Word*>&,
const std::vector<std::string>&,
const std::vector<double>& );
- MbtAPI *tagger;
- TiCC::LogStream *iobLog;
- int debug;
- std::string version;
- std::string tagset;
- Tokenizer::UnicodeFilter *filter;
};
#endif // IOB_TAGGER_MOD_H
diff --git a/include/frog/mblem_mod.h b/include/frog/mblem_mod.h
index 2d59c42..af0cee9 100644
--- a/include/frog/mblem_mod.h
+++ b/include/frog/mblem_mod.h
@@ -34,10 +34,9 @@
class mblemData {
public:
- mblemData( const std::string& l, const std::string& t ){
- lemma = l;
- tag = t;
- };
+ mblemData( const std::string& l, const std::string& t ):
+ lemma( l ),
+ tag( t ) { };
std::string getLemma() const { return lemma; };
std::string getTag() const { return tag; };
private:
@@ -47,7 +46,7 @@ class mblemData {
class Mblem {
public:
- Mblem(TiCC::LogStream *);
+ explicit Mblem( TiCC::LogStream * );
~Mblem();
bool init( const TiCC::Configuration& );
void addDeclaration( folia::Document& doc ) const;
@@ -78,6 +77,7 @@ class Mblem {
std::string version;
std::string tagset;
std::string POS_tagset;
+ std::string textclass;
TiCC::LogStream *mblemLog;
Tokenizer::UnicodeFilter *filter;
};
diff --git a/include/frog/mbma_brackets.h b/include/frog/mbma_brackets.h
index 5b2a45b..b4535bb 100644
--- a/include/frog/mbma_brackets.h
+++ b/include/frog/mbma_brackets.h
@@ -55,18 +55,21 @@ namespace folia {
class RulePart;
class BaseBracket {
-public:
+ public:
BaseBracket( CLEX::Type t, const std::vector<CLEX::Type>& R, int flag,
TiCC::LogStream& l ):
RightHand(R),
cls(t),
+ _status( FAILED ),
debugFlag(flag),
myLog(l)
-
- {};
+ {};
BaseBracket( CLEX::Type t, int flag, TiCC::LogStream& l ):
- cls(t), debugFlag(flag), myLog(l)
- {};
+ cls(t),
+ _status( FAILED ),
+ debugFlag(flag),
+ myLog(l)
+ {};
virtual ~BaseBracket() {};
Status status() const { return _status; };
diff --git a/include/frog/mbma_mod.h b/include/frog/mbma_mod.h
index 0b5bb6f..c5f3e6b 100644
--- a/include/frog/mbma_mod.h
+++ b/include/frog/mbma_mod.h
@@ -60,7 +60,7 @@ namespace folia {
class Mbma {
public:
- Mbma(TiCC::LogStream *);
+ explicit Mbma( TiCC::LogStream * );
~Mbma();
bool init( const TiCC::Configuration& );
void addDeclaration( folia::Document& doc ) const;
@@ -104,6 +104,7 @@ class Mbma {
Timbl::TimblAPI *MTree;
std::vector<Rule*> analysis;
std::string version;
+ std::string textclass;
TiCC::LogStream *mbmaLog;
Transliterator *transliterator;
Tokenizer::UnicodeFilter *filter;
diff --git a/include/frog/mwu_chunker_mod.h b/include/frog/mwu_chunker_mod.h
index f3e85c8..c0e4d8f 100644
--- a/include/frog/mwu_chunker_mod.h
+++ b/include/frog/mwu_chunker_mod.h
@@ -46,6 +46,7 @@ class mwuAna {
bool isSpec(){ return spec; };
folia::EntitiesLayer *addEntity( const std::string&,
+ const std::string&,
folia::Sentence *,
folia::EntitiesLayer * );
@@ -61,7 +62,7 @@ class mwuAna {
class Mwu {
friend std::ostream& operator<< (std::ostream&, const Mwu& );
public:
- Mwu(TiCC::LogStream*);
+ explicit Mwu(TiCC::LogStream*);
~Mwu();
void reset();
bool init( const TiCC::Configuration& );
@@ -80,6 +81,7 @@ class Mwu {
mymap2 MWUs;
TiCC::LogStream *mwuLog;
std::string version;
+ std::string textclass;
std::string mwu_tagset;
std::string glue_tag;
Tokenizer::UnicodeFilter *filter;
diff --git a/include/frog/ner_tagger_mod.h b/include/frog/ner_tagger_mod.h
index 18dde68..b37e4fc 100644
--- a/include/frog/ner_tagger_mod.h
+++ b/include/frog/ner_tagger_mod.h
@@ -32,33 +32,26 @@
#ifndef NER_TAGGER_MOD_H
#define NER_TAGGER_MOD_H
-class NERTagger {
+#include <unordered_map>
+#include "frog/tagger_base.h"
+
+class NERTagger: public BaseTagger {
public:
- NERTagger(TiCC::LogStream *);
- ~NERTagger();
+ explicit NERTagger( TiCC::LogStream * );
bool init( const TiCC::Configuration& );
void Classify( const std::vector<folia::Word *>& );
+ void post_process( const std::vector<folia::Word*>& );
void addDeclaration( folia::Document& ) const;
void addNERTags( const std::vector<folia::Word*>&,
const std::vector<std::string>&,
const std::vector<double>& );
- std::string getTagset() const { return tagset; };
- std::vector<Tagger::TagResult> tagLine( const std::string& );
- bool fill_known_ners( const std::string& );
- void handle_known_ners( const std::vector<std::string>&,
- std::vector<std::string>& );
- void merge( const std::vector<std::string>&,
- std::vector<std::string>& tags,
- std::vector<double>& );
- std::string set_eos_mark( const std::string& );
+ bool read_gazets( const std::string&, const std::string& );
+ std::vector<std::string> create_ner_list( const std::vector<std::string>& );
+ bool Generate( const std::string& );
private:
- MbtAPI *tagger;
- TiCC::LogStream *nerLog;
- int debug;
- std::string version;
- std::string tagset;
- std::vector<std::map<std::string,std::string>> known_ners;
- Tokenizer::UnicodeFilter *filter;
+ bool fill_ners( const std::string&, const std::string&, const std::string& );
+ std::vector<std::unordered_map<std::string,std::set<std::string>>> known_ners;
+ int max_ner_size;
};
#endif // NER_TAGGER_MOD_H
diff --git a/include/frog/pos_tagger_mod.h b/include/frog/tagger_base.h
similarity index 68%
rename from include/frog/pos_tagger_mod.h
rename to include/frog/tagger_base.h
index 352fe2f..d8cef29 100644
--- a/include/frog/pos_tagger_mod.h
+++ b/include/frog/tagger_base.h
@@ -29,33 +29,43 @@
*/
-#ifndef POS_TAGGER_MOD_H
-#define POS_TAGGER_MOD_H
+#ifndef TAGGER_BASE_H
+#define TAGGER_BASE_H
#include "mbt/MbtAPI.h"
-class POSTagger {
+class BaseTagger {
public:
- POSTagger(TiCC::LogStream*);
- virtual ~POSTagger();
+ explicit BaseTagger( TiCC::LogStream *, const std::string& );
+ virtual ~BaseTagger();
virtual bool init( const TiCC::Configuration& );
- virtual void Classify( const std::vector<folia::Word *>& );
+ virtual void post_process( const std::vector<folia::Word*>& ) = 0;
+ virtual void Classify( const std::vector<folia::Word*>& );
void addDeclaration( folia::Document& ) const;
- void addTag( folia::Word *, const std::string&, double, bool );
- std::vector<Tagger::TagResult> tagLine( const std::string& );
std::string getTagset() const { return tagset; };
- bool fill_map( const std::string&, std::map<std::string,std::string>& );
std::string set_eos_mark( const std::string& );
+ bool fill_map( const std::string&, std::map<std::string,std::string>& );
+ std::vector<Tagger::TagResult> tagLine( const std::string& );
+ private:
+ std::string extract_sentence( const std::vector<folia::Word*>&,
+ std::vector<std::string>& );
protected:
+ void extract_words_tags( const std::vector<folia::Word *>&,
+ const std::string&,
+ std::vector<std::string>&,
+ std::vector<std::string>& );
int debug;
+ std::string _label;
std::string tagset;
+ std::string version;
+ std::string textclass;
TiCC::LogStream *tag_log;
- private:
MbtAPI *tagger;
- std::string version;
Tokenizer::UnicodeFilter *filter;
+ std::vector<std::string> _words;
+ std::vector<Tagger::TagResult> _tag_result;
std::map<std::string,std::string> token_tag_map;
- std::set<std::string> valid_tags;
+ BaseTagger( const BaseTagger& ){} // inhibit copies
};
-#endif // POS_TAGGER_MOD_H
+#endif // TAGGER_BASE_H
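
The former POSTagger code is generalized into BaseTagger, which factors out the shared MBT plumbing so the CGN, IOB and NER taggers can reuse it; a concrete tagger passes a label to the constructor and implements post_process() to turn the tag results into FoLiA annotations. A minimal sketch against the header above (the class name, the "demo" label and the empty body are made up for illustration):

#include "frog/tagger_base.h"

class DemoTagger: public BaseTagger {
public:
  explicit DemoTagger( TiCC::LogStream *log ):
    BaseTagger( log, "demo" ){}   // "demo" is assumed to act as the module label
  void post_process( const std::vector<folia::Word*>& words ){
    // BaseTagger::Classify() has already run the MBT tagger;
    // a real module would attach its annotations to 'words' here.
    (void)words;
  }
};
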
diff --git a/include/frog/ucto_tokenizer_mod.h b/include/frog/ucto_tokenizer_mod.h
index 90a1c51..9ed483d 100644
--- a/include/frog/ucto_tokenizer_mod.h
+++ b/include/frog/ucto_tokenizer_mod.h
@@ -32,7 +32,7 @@
class UctoTokenizer {
public:
- UctoTokenizer(TiCC::LogStream *);
+ explicit UctoTokenizer( TiCC::LogStream * );
~UctoTokenizer() { delete tokenizer; };
bool init( const TiCC::Configuration& );
void setUttMarker( const std::string& );
@@ -42,8 +42,10 @@ class UctoTokenizer {
void setInputEncoding( const std::string& );
void setQuoteDetection( bool );
void setInputXml( bool );
- void setTextClass( const std::string& );
+ void setInputClass( const std::string& );
+ void setOutputClass( const std::string& );
void setDocID( const std::string& );
+ void setTextRedundancy( const std::string& );
folia::Document *tokenizestring( const std::string& );
folia::Document *tokenize( std::istream& );
bool tokenize( folia::Document& );
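
UctoTokenizer now distinguishes the FoLiA text class it reads from the one it writes and gains a text-redundancy setting. The FrogAPI.cxx changes further below call the new setters roughly as in this sketch (theErrLog and configuration stand in for the caller's TiCC::LogStream* and TiCC::Configuration):

UctoTokenizer tokenizer( theErrLog );
if ( tokenizer.init( configuration ) ){
  tokenizer.setInputClass( "OCR" );          // class to read text from
  tokenizer.setOutputClass( "current" );     // class to write text to
  tokenizer.setTextRedundancy( "minimal" );  // full | minimal | none
}
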
diff --git a/install-sh b/install-sh
index 0b0fdcb..59990a1 100755
--- a/install-sh
+++ b/install-sh
@@ -1,7 +1,7 @@
#!/bin/sh
# install - install a program, script, or datafile
-scriptversion=2013-12-25.23; # UTC
+scriptversion=2014-09-12.12; # UTC
# This originates from X11R5 (mit/util/scripts/install.sh), which was
# later released in X11R6 (xc/config/util/install.sh) with the
@@ -324,34 +324,41 @@ do
# is incompatible with FreeBSD 'install' when (umask & 300) != 0.
;;
*)
+ # $RANDOM is not portable (e.g. dash); use it when possible to
+ # lower collision chance
tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$
- trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0
+ trap 'ret=$?; rmdir "$tmpdir/a/b" "$tmpdir/a" "$tmpdir" 2>/dev/null; exit $ret' 0
+ # As "mkdir -p" follows symlinks and we work in /tmp possibly; so
+ # create the $tmpdir first (and fail if unsuccessful) to make sure
+ # that nobody tries to guess the $tmpdir name.
if (umask $mkdir_umask &&
- exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1
+ $mkdirprog $mkdir_mode "$tmpdir" &&
+ exec $mkdirprog $mkdir_mode -p -- "$tmpdir/a/b") >/dev/null 2>&1
then
if test -z "$dir_arg" || {
# Check for POSIX incompatibilities with -m.
# HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or
# other-writable bit of parent directory when it shouldn't.
# FreeBSD 6.1 mkdir -m -p sets mode of existing directory.
- ls_ld_tmpdir=`ls -ld "$tmpdir"`
+ test_tmpdir="$tmpdir/a"
+ ls_ld_tmpdir=`ls -ld "$test_tmpdir"`
case $ls_ld_tmpdir in
d????-?r-*) different_mode=700;;
d????-?--*) different_mode=755;;
*) false;;
esac &&
- $mkdirprog -m$different_mode -p -- "$tmpdir" && {
- ls_ld_tmpdir_1=`ls -ld "$tmpdir"`
+ $mkdirprog -m$different_mode -p -- "$test_tmpdir" && {
+ ls_ld_tmpdir_1=`ls -ld "$test_tmpdir"`
test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1"
}
}
then posix_mkdir=:
fi
- rmdir "$tmpdir/d" "$tmpdir"
+ rmdir "$tmpdir/a/b" "$tmpdir/a" "$tmpdir"
else
# Remove any dirs left behind by ancient mkdir implementations.
- rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null
+ rmdir ./$mkdir_mode ./-p ./-- "$tmpdir" 2>/dev/null
fi
trap '' 0;;
esac;;
diff --git a/ltmain.sh b/ltmain.sh
index 0f0a2da..a736cf9 100644
--- a/ltmain.sh
+++ b/ltmain.sh
@@ -31,7 +31,7 @@
PROGRAM=libtool
PACKAGE=libtool
-VERSION=2.4.6
+VERSION="2.4.6 Debian-2.4.6-2"
package_revision=2.4.6
@@ -2068,12 +2068,12 @@ include the following information:
compiler: $LTCC
compiler flags: $LTCFLAGS
linker: $LD (gnu? $with_gnu_ld)
- version: $progname (GNU libtool) 2.4.6
+ version: $progname $scriptversion Debian-2.4.6-2
automake: `($AUTOMAKE --version) 2>/dev/null |$SED 1q`
autoconf: `($AUTOCONF --version) 2>/dev/null |$SED 1q`
Report bugs to <bug-libtool at gnu.org>.
-GNU libtool home page: <http://www.gnu.org/software/libtool/>.
+GNU libtool home page: <http://www.gnu.org/s/libtool/>.
General help using GNU software: <http://www.gnu.org/gethelp/>."
exit 0
}
@@ -7272,10 +7272,13 @@ func_mode_link ()
# -tp=* Portland pgcc target processor selection
# --sysroot=* for sysroot support
# -O*, -g*, -flto*, -fwhopr*, -fuse-linker-plugin GCC link-time optimization
+ # -specs=* GCC specs files
# -stdlib=* select c++ std lib with clang
+ # -fsanitize=* Clang/GCC memory and address sanitizer
-64|-mips[0-9]|-r[0-9][0-9]*|-xarch=*|-xtarget=*|+DA*|+DD*|-q*|-m*| \
-t[45]*|-txscale*|-p|-pg|--coverage|-fprofile-*|-F*|@*|-tp=*|--sysroot=*| \
- -O*|-g*|-flto*|-fwhopr*|-fuse-linker-plugin|-fstack-protector*|-stdlib=*)
+ -O*|-g*|-flto*|-fwhopr*|-fuse-linker-plugin|-fstack-protector*|-stdlib=*| \
+ -specs=*|-fsanitize=*)
func_quote_for_eval "$arg"
arg=$func_quote_for_eval_result
func_append compile_command " $arg"
@@ -7568,7 +7571,10 @@ func_mode_link ()
case $pass in
dlopen) libs=$dlfiles ;;
dlpreopen) libs=$dlprefiles ;;
- link) libs="$deplibs %DEPLIBS% $dependency_libs" ;;
+ link)
+ libs="$deplibs %DEPLIBS%"
+ test "X$link_all_deplibs" != Xno && libs="$libs $dependency_libs"
+ ;;
esac
fi
if test lib,dlpreopen = "$linkmode,$pass"; then
@@ -7887,19 +7893,19 @@ func_mode_link ()
# It is a libtool convenience library, so add in its objects.
func_append convenience " $ladir/$objdir/$old_library"
func_append old_convenience " $ladir/$objdir/$old_library"
+ tmp_libs=
+ for deplib in $dependency_libs; do
+ deplibs="$deplib $deplibs"
+ if $opt_preserve_dup_deps; then
+ case "$tmp_libs " in
+ *" $deplib "*) func_append specialdeplibs " $deplib" ;;
+ esac
+ fi
+ func_append tmp_libs " $deplib"
+ done
elif test prog != "$linkmode" && test lib != "$linkmode"; then
func_fatal_error "'$lib' is not a convenience library"
fi
- tmp_libs=
- for deplib in $dependency_libs; do
- deplibs="$deplib $deplibs"
- if $opt_preserve_dup_deps; then
- case "$tmp_libs " in
- *" $deplib "*) func_append specialdeplibs " $deplib" ;;
- esac
- fi
- func_append tmp_libs " $deplib"
- done
continue
fi # $pass = conv
@@ -8823,6 +8829,9 @@ func_mode_link ()
revision=$number_minor
lt_irix_increment=no
;;
+ *)
+ func_fatal_configuration "$modename: unknown library version type '$version_type'"
+ ;;
esac
;;
no)
diff --git a/m4/Makefile.in b/m4/Makefile.in
index 3937844..3197e6a 100644
--- a/m4/Makefile.in
+++ b/m4/Makefile.in
@@ -92,8 +92,7 @@ build_triplet = @build@
host_triplet = @host@
subdir = m4
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \
- $(top_srcdir)/m4/ax_lib_readline.m4 \
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_lib_readline.m4 \
$(top_srcdir)/m4/ax_pthread.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
@@ -158,13 +157,7 @@ EXEEXT = @EXEEXT@
FGREP = @FGREP@
GREP = @GREP@
ICU_CFLAGS = @ICU_CFLAGS@
-ICU_CONFIG = @ICU_CONFIG@
-ICU_CPPSEARCHPATH = @ICU_CPPSEARCHPATH@
-ICU_CXXFLAGS = @ICU_CXXFLAGS@
-ICU_IOLIBS = @ICU_IOLIBS@
-ICU_LIBPATH = @ICU_LIBPATH@
ICU_LIBS = @ICU_LIBS@
-ICU_VERSION = @ICU_VERSION@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -208,7 +201,6 @@ SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
STRIP = @STRIP@
-SYSCONFDIR = @SYSCONFDIR@
VERSION = @VERSION@
XML2_CFLAGS = @XML2_CFLAGS@
XML2_LIBS = @XML2_LIBS@
@@ -261,10 +253,10 @@ mbt_LIBS = @mbt_LIBS@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
-pkgconfigpath = @pkgconfigpath@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
+runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
diff --git a/m4/ax_icu_check.m4 b/m4/ax_icu_check.m4
deleted file mode 100644
index 3ffe425..0000000
--- a/m4/ax_icu_check.m4
+++ /dev/null
@@ -1,86 +0,0 @@
-dnl @synopsis AX_ICU_CHECK([version], [action-if], [action-if-not])
-dnl
-dnl Test for ICU support
-dnl
-dnl This will define ICU_LIBS, ICU_CFLAGS, ICU_CXXFLAGS, ICU_IOLIBS.
-dnl
-dnl Based on ac_check_icu (http://autoconf-archive.cryp.to/ac_check_icu.html)
-dnl by Akos Maroy <darkeye at tyrell.hu>.
-dnl
-dnl Portions Copyright 2005 Akos Maroy <darkeye at tyrell.hu>
-dnl Copying and distribution of this file, with or without modification,
-dnl are permitted in any medium without royalty provided the copyright
-dnl notice and this notice are preserved.
-dnl
-dnl @author Hunter Morris <huntermorris at gmail.com>
-dnl @version 2008-03-18
-AC_DEFUN([AX_ICU_CHECK], [
- succeeded=no
-
- if test -z "$ICU_CONFIG"; then
- AC_PATH_PROG(ICU_CONFIG, icu-config, no)
- fi
-
- if test "$ICU_CONFIG" = "no" ; then
- echo "*** The icu-config script could not be found. Make sure it is"
- echo "*** in your path, and that taglib is properly installed."
- echo "*** Or see http://www.icu-project.org/"
- else
- ICU_VERSION=`$ICU_CONFIG --version`
- AC_MSG_CHECKING(for ICU >= $1)
- VERSION_CHECK=`expr $ICU_VERSION \>\= $1`
- if test "$VERSION_CHECK" = "1" ; then
- AC_MSG_RESULT(yes)
- succeeded=yes
-
- AC_MSG_CHECKING(ICU_CFLAGS)
- ICU_CFLAGS=`$ICU_CONFIG --cflags`
- AC_MSG_RESULT($ICU_CFLAGS)
-
- AC_MSG_CHECKING(ICU_CPPSEARCHPATH)
- ICU_CPPSEARCHPATH=`$ICU_CONFIG --cppflags-searchpath`
- AC_MSG_RESULT($ICU_CPPSEARCHPATH)
-
- AC_MSG_CHECKING(ICU_CXXFLAGS)
- ICU_CXXFLAGS=`$ICU_CONFIG --cxxflags`
- AC_MSG_RESULT($ICU_CXXFLAGS)
-
- AC_MSG_CHECKING(ICU_LIBS)
- ICU_LIBS=`$ICU_CONFIG --ldflags-libsonly`
- AC_MSG_RESULT($ICU_LIBS)
-
- AC_MSG_CHECKING(ICU_LIBPATH)
- ICU_LIBPATH=`$ICU_CONFIG --ldflags-searchpath`
- AC_MSG_RESULT($ICU_LIBPATH)
-
- AC_MSG_CHECKING(ICU_IOLIBS)
- ICU_IOLIBS=`$ICU_CONFIG --ldflags-icuio`
- AC_MSG_RESULT($ICU_IOLIBS)
- else
- ICU_CFLAGS=""
- ICU_CXXFLAGS=""
- ICU_CPPSEARCHPATH=""
- ICU_LIBPATH=""
- ICU_LIBS=""
- ICU_IOLIBS=""
- ## If we have a custom action on failure, don't print errors, but
- ## do set a variable so people can do so.
- ifelse([$3], ,echo "can't find ICU >= $1",)
- fi
-
- AC_SUBST(ICU_CFLAGS)
- AC_SUBST(ICU_CXXFLAGS)
- AC_SUBST(ICU_CPPSEARCHPATH)
- AC_SUBST(ICU_VERSION)
- AC_SUBST(ICU_LIBPATH)
- AC_SUBST(ICU_LIBS)
- AC_SUBST(ICU_IOLIBS)
- fi
-
- if test $succeeded = yes; then
- ifelse([$2], , :, [$2])
- else
- ifelse([$3], , AC_MSG_ERROR([Library requirements (ICU) not met.]), [$3])
- fi
-])
-
diff --git a/m4/libtool.m4 b/m4/libtool.m4
index a3bc337..10ab284 100644
--- a/m4/libtool.m4
+++ b/m4/libtool.m4
@@ -2887,6 +2887,18 @@ linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
dynamic_linker='GNU/Linux ld.so'
;;
+netbsdelf*-gnu)
+ version_type=linux
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
+ soname_spec='${libname}${release}${shared_ext}$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ shlibpath_overrides_runpath=no
+ hardcode_into_libs=yes
+ dynamic_linker='NetBSD ld.elf_so'
+ ;;
+
netbsd*)
version_type=sunos
need_lib_prefix=no
@@ -3546,7 +3558,7 @@ linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
lt_cv_deplibs_check_method=pass_all
;;
-netbsd*)
+netbsd* | netbsdelf*-gnu)
if echo __ELF__ | $CC -E - | $GREP __ELF__ > /dev/null; then
lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so\.[[0-9]]+\.[[0-9]]+|_pic\.a)$'
else
@@ -4424,7 +4436,7 @@ m4_if([$1], [CXX], [
;;
esac
;;
- netbsd*)
+ netbsd* | netbsdelf*-gnu)
;;
*qnx* | *nto*)
# QNX uses GNU C++, but need to define -shared option too, otherwise
@@ -4936,6 +4948,9 @@ m4_if([$1], [CXX], [
;;
esac
;;
+ linux* | k*bsd*-gnu | gnu*)
+ _LT_TAGVAR(link_all_deplibs, $1)=no
+ ;;
*)
_LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
;;
@@ -4998,6 +5013,9 @@ dnl Note also adjust exclude_expsyms for C++ above.
openbsd* | bitrig*)
with_gnu_ld=no
;;
+ linux* | k*bsd*-gnu | gnu*)
+ _LT_TAGVAR(link_all_deplibs, $1)=no
+ ;;
esac
_LT_TAGVAR(ld_shlibs, $1)=yes
@@ -5252,7 +5270,7 @@ _LT_EOF
fi
;;
- netbsd*)
+ netbsd* | netbsdelf*-gnu)
if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
_LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib'
wlarc=
@@ -5773,6 +5791,7 @@ _LT_EOF
if test yes = "$lt_cv_irix_exported_symbol"; then
_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags $wl-soname $wl$soname `test -n "$verstring" && func_echo_all "$wl-set_version $wl$verstring"` $wl-update_registry $wl$output_objdir/so_locations $wl-exports_file $wl$export_symbols -o $lib'
fi
+ _LT_TAGVAR(link_all_deplibs, $1)=no
else
_LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -o $lib'
_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry $output_objdir/so_locations -exports_file $export_symbols -o $lib'
@@ -5794,7 +5813,7 @@ _LT_EOF
esac
;;
- netbsd*)
+ netbsd* | netbsdelf*-gnu)
if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
_LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags' # a.out
else
diff --git a/m4/ltsugar.m4 b/m4/ltsugar.m4
index 48bc934..9000a05 100644
--- a/m4/ltsugar.m4
+++ b/m4/ltsugar.m4
@@ -1,7 +1,6 @@
# ltsugar.m4 -- libtool m4 base layer. -*-Autoconf-*-
#
-# Copyright (C) 2004-2005, 2007-2008, 2011-2015 Free Software
-# Foundation, Inc.
+# Copyright (C) 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
# Written by Gary V. Vaughan, 2004
#
# This file is free software; the Free Software Foundation gives
@@ -34,7 +33,7 @@ m4_define([_lt_join],
# ------------
# Manipulate m4 lists.
# These macros are necessary as long as will still need to support
-# Autoconf-2.59, which quotes differently.
+# Autoconf-2.59 which quotes differently.
m4_define([lt_car], [[$1]])
m4_define([lt_cdr],
[m4_if([$#], 0, [m4_fatal([$0: cannot be called without arguments])],
@@ -45,7 +44,7 @@ m4_define([lt_unquote], $1)
# lt_append(MACRO-NAME, STRING, [SEPARATOR])
# ------------------------------------------
-# Redefine MACRO-NAME to hold its former content plus 'SEPARATOR''STRING'.
+# Redefine MACRO-NAME to hold its former content plus `SEPARATOR'`STRING'.
# Note that neither SEPARATOR nor STRING are expanded; they are appended
# to MACRO-NAME as is (leaving the expansion for when MACRO-NAME is invoked).
# No SEPARATOR is output if MACRO-NAME was previously undefined (different
diff --git a/m4/lt~obsolete.m4 b/m4/lt~obsolete.m4
index c6b26f8..c573da9 100644
--- a/m4/lt~obsolete.m4
+++ b/m4/lt~obsolete.m4
@@ -1,7 +1,6 @@
# lt~obsolete.m4 -- aclocal satisfying obsolete definitions. -*-Autoconf-*-
#
-# Copyright (C) 2004-2005, 2007, 2009, 2011-2015 Free Software
-# Foundation, Inc.
+# Copyright (C) 2004, 2005, 2007, 2009 Free Software Foundation, Inc.
# Written by Scott James Remnant, 2004.
#
# This file is free software; the Free Software Foundation gives
@@ -12,7 +11,7 @@
# These exist entirely to fool aclocal when bootstrapping libtool.
#
-# In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN),
+# In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN)
# which have later been changed to m4_define as they aren't part of the
# exported API, or moved to Autoconf or Automake where they belong.
#
@@ -26,7 +25,7 @@
# included after everything else. This provides aclocal with the
# AC_DEFUNs it wants, but when m4 processes it, it doesn't do anything
# because those macros already exist, or will be overwritten later.
-# We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6.
+# We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6.
#
# Anytime we withdraw an AC_DEFUN or AU_DEFUN, remember to add it here.
# Yes, that means every name once taken will need to remain here until
diff --git a/m4/pkg.m4 b/m4/pkg.m4
index 82bea96..c5b26b5 100644
--- a/m4/pkg.m4
+++ b/m4/pkg.m4
@@ -1,60 +1,29 @@
-dnl pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*-
-dnl serial 11 (pkg-config-0.29.1)
-dnl
-dnl Copyright © 2004 Scott James Remnant <scott at netsplit.com>.
-dnl Copyright © 2012-2015 Dan Nicholson <dbn.lists at gmail.com>
-dnl
-dnl This program is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU General Public License as published by
-dnl the Free Software Foundation; either version 2 of the License, or
-dnl (at your option) any later version.
-dnl
-dnl This program is distributed in the hope that it will be useful, but
-dnl WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU General Public License
-dnl along with this program; if not, write to the Free Software
-dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
-dnl 02111-1307, USA.
-dnl
-dnl As a special exception to the GNU General Public License, if you
-dnl distribute this file as part of a program that contains a
-dnl configuration script generated by Autoconf, you may include it under
-dnl the same distribution terms that you use for the rest of that
-dnl program.
-
-dnl PKG_PREREQ(MIN-VERSION)
-dnl -----------------------
-dnl Since: 0.29
-dnl
-dnl Verify that the version of the pkg-config macros are at least
-dnl MIN-VERSION. Unlike PKG_PROG_PKG_CONFIG, which checks the user's
-dnl installed version of pkg-config, this checks the developer's version
-dnl of pkg.m4 when generating configure.
-dnl
-dnl To ensure that this macro is defined, also add:
-dnl m4_ifndef([PKG_PREREQ],
-dnl [m4_fatal([must install pkg-config 0.29 or later before running autoconf/autogen])])
-dnl
-dnl See the "Since" comment for each macro you use to see what version
-dnl of the macros you require.
-m4_defun([PKG_PREREQ],
-[m4_define([PKG_MACROS_VERSION], [0.29.1])
-m4_if(m4_version_compare(PKG_MACROS_VERSION, [$1]), -1,
- [m4_fatal([pkg.m4 version $1 or higher is required but ]PKG_MACROS_VERSION[ found])])
-])dnl PKG_PREREQ
-
-dnl PKG_PROG_PKG_CONFIG([MIN-VERSION])
-dnl ----------------------------------
-dnl Since: 0.16
-dnl
-dnl Search for the pkg-config tool and set the PKG_CONFIG variable to
-dnl first found in the path. Checks that the version of pkg-config found
-dnl is at least MIN-VERSION. If MIN-VERSION is not specified, 0.9.0 is
-dnl used since that's the first version where most current features of
-dnl pkg-config existed.
+# pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*-
+# serial 1 (pkg-config-0.24)
+#
+# Copyright © 2004 Scott James Remnant <scott at netsplit.com>.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# PKG_PROG_PKG_CONFIG([MIN-VERSION])
+# ----------------------------------
AC_DEFUN([PKG_PROG_PKG_CONFIG],
[m4_pattern_forbid([^_?PKG_[A-Z_]+$])
m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$])
@@ -76,19 +45,18 @@ if test -n "$PKG_CONFIG"; then
PKG_CONFIG=""
fi
fi[]dnl
-])dnl PKG_PROG_PKG_CONFIG
-
-dnl PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
-dnl -------------------------------------------------------------------
-dnl Since: 0.18
-dnl
-dnl Check to see whether a particular set of modules exists. Similar to
-dnl PKG_CHECK_MODULES(), but does not set variables or print errors.
-dnl
-dnl Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG])
-dnl only at the first occurence in configure.ac, so if the first place
-dnl it's called might be skipped (such as if it is within an "if", you
-dnl have to call PKG_CHECK_EXISTS manually
+])# PKG_PROG_PKG_CONFIG
+
+# PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+#
+# Check to see whether a particular set of modules exists. Similar
+# to PKG_CHECK_MODULES(), but does not set variables or print errors.
+#
+# Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG])
+# only at the first occurence in configure.ac, so if the first place
+# it's called might be skipped (such as if it is within an "if", you
+# have to call PKG_CHECK_EXISTS manually
+# --------------------------------------------------------------
AC_DEFUN([PKG_CHECK_EXISTS],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
if test -n "$PKG_CONFIG" && \
@@ -98,10 +66,8 @@ m4_ifvaln([$3], [else
$3])dnl
fi])
-dnl _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
-dnl ---------------------------------------------
-dnl Internal wrapper calling pkg-config via PKG_CONFIG and setting
-dnl pkg_failed based on the result.
+# _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
+# ---------------------------------------------
m4_define([_PKG_CONFIG],
[if test -n "$$1"; then
pkg_cv_[]$1="$$1"
@@ -113,11 +79,10 @@ m4_define([_PKG_CONFIG],
else
pkg_failed=untried
fi[]dnl
-])dnl _PKG_CONFIG
+])# _PKG_CONFIG
-dnl _PKG_SHORT_ERRORS_SUPPORTED
-dnl ---------------------------
-dnl Internal check to see if pkg-config supports short errors.
+# _PKG_SHORT_ERRORS_SUPPORTED
+# -----------------------------
AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])
if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
@@ -125,17 +90,19 @@ if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
else
_pkg_short_errors_supported=no
fi[]dnl
-])dnl _PKG_SHORT_ERRORS_SUPPORTED
-
-
-dnl PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
-dnl [ACTION-IF-NOT-FOUND])
-dnl --------------------------------------------------------------
-dnl Since: 0.4.0
-dnl
-dnl Note that if there is a possibility the first call to
-dnl PKG_CHECK_MODULES might not happen, you should be sure to include an
-dnl explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
+])# _PKG_SHORT_ERRORS_SUPPORTED
+
+
+# PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
+# [ACTION-IF-NOT-FOUND])
+#
+#
+# Note that if there is a possibility the first call to
+# PKG_CHECK_MODULES might not happen, you should be sure to include an
+# explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
+#
+#
+# --------------------------------------------------------------
AC_DEFUN([PKG_CHECK_MODULES],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl
@@ -189,40 +156,16 @@ else
AC_MSG_RESULT([yes])
$3
fi[]dnl
-])dnl PKG_CHECK_MODULES
-
-
-dnl PKG_CHECK_MODULES_STATIC(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
-dnl [ACTION-IF-NOT-FOUND])
-dnl ---------------------------------------------------------------------
-dnl Since: 0.29
-dnl
-dnl Checks for existence of MODULES and gathers its build flags with
-dnl static libraries enabled. Sets VARIABLE-PREFIX_CFLAGS from --cflags
-dnl and VARIABLE-PREFIX_LIBS from --libs.
-dnl
-dnl Note that if there is a possibility the first call to
-dnl PKG_CHECK_MODULES_STATIC might not happen, you should be sure to
-dnl include an explicit call to PKG_PROG_PKG_CONFIG in your
-dnl configure.ac.
-AC_DEFUN([PKG_CHECK_MODULES_STATIC],
-[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
-_save_PKG_CONFIG=$PKG_CONFIG
-PKG_CONFIG="$PKG_CONFIG --static"
-PKG_CHECK_MODULES($@)
-PKG_CONFIG=$_save_PKG_CONFIG[]dnl
-])dnl PKG_CHECK_MODULES_STATIC
+])# PKG_CHECK_MODULES
-dnl PKG_INSTALLDIR([DIRECTORY])
-dnl -------------------------
-dnl Since: 0.27
-dnl
-dnl Substitutes the variable pkgconfigdir as the location where a module
-dnl should install pkg-config .pc files. By default the directory is
-dnl $libdir/pkgconfig, but the default can be changed by passing
-dnl DIRECTORY. The user can override through the --with-pkgconfigdir
-dnl parameter.
+# PKG_INSTALLDIR(DIRECTORY)
+# -------------------------
+# Substitutes the variable pkgconfigdir as the location where a module
+# should install pkg-config .pc files. By default the directory is
+# $libdir/pkgconfig, but the default can be changed by passing
+# DIRECTORY. The user can override through the --with-pkgconfigdir
+# parameter.
AC_DEFUN([PKG_INSTALLDIR],
[m4_pushdef([pkg_default], [m4_default([$1], ['${libdir}/pkgconfig'])])
m4_pushdef([pkg_description],
@@ -233,18 +176,16 @@ AC_ARG_WITH([pkgconfigdir],
AC_SUBST([pkgconfigdir], [$with_pkgconfigdir])
m4_popdef([pkg_default])
m4_popdef([pkg_description])
-])dnl PKG_INSTALLDIR
+]) dnl PKG_INSTALLDIR
-dnl PKG_NOARCH_INSTALLDIR([DIRECTORY])
-dnl --------------------------------
-dnl Since: 0.27
-dnl
-dnl Substitutes the variable noarch_pkgconfigdir as the location where a
-dnl module should install arch-independent pkg-config .pc files. By
-dnl default the directory is $datadir/pkgconfig, but the default can be
-dnl changed by passing DIRECTORY. The user can override through the
-dnl --with-noarch-pkgconfigdir parameter.
+# PKG_NOARCH_INSTALLDIR(DIRECTORY)
+# -------------------------
+# Substitutes the variable noarch_pkgconfigdir as the location where a
+# module should install arch-independent pkg-config .pc files. By
+# default the directory is $datadir/pkgconfig, but the default can be
+# changed by passing DIRECTORY. The user can override through the
+# --with-noarch-pkgconfigdir parameter.
AC_DEFUN([PKG_NOARCH_INSTALLDIR],
[m4_pushdef([pkg_default], [m4_default([$1], ['${datadir}/pkgconfig'])])
m4_pushdef([pkg_description],
@@ -255,15 +196,13 @@ AC_ARG_WITH([noarch-pkgconfigdir],
AC_SUBST([noarch_pkgconfigdir], [$with_noarch_pkgconfigdir])
m4_popdef([pkg_default])
m4_popdef([pkg_description])
-])dnl PKG_NOARCH_INSTALLDIR
+]) dnl PKG_NOARCH_INSTALLDIR
-dnl PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE,
-dnl [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
-dnl -------------------------------------------
-dnl Since: 0.28
-dnl
-dnl Retrieves the value of the pkg-config variable for the given module.
+# PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE,
+# [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+# -------------------------------------------
+# Retrieves the value of the pkg-config variable for the given module.
AC_DEFUN([PKG_CHECK_VAR],
[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
AC_ARG_VAR([$1], [value of $3 for $2, overriding pkg-config])dnl
@@ -272,4 +211,4 @@ _PKG_CONFIG([$1], [variable="][$3]["], [$2])
AS_VAR_COPY([$1], [pkg_cv_][$1])
AS_VAR_IF([$1], [""], [$5], [$4])dnl
-])dnl PKG_CHECK_VAR
+])# PKG_CHECK_VAR
diff --git a/src/Frog.cxx b/src/Frog.cxx
index 4f87918..8b33374 100644
--- a/src/Frog.cxx
+++ b/src/Frog.cxx
@@ -53,16 +53,14 @@
#include "frog/Frog.h" //internal interface, included by all modules
#include "frog/FrogAPI.h" //public API interface
#include "frog/ucto_tokenizer_mod.h"
-#include "frog/pos_tagger_mod.h"
+#include "frog/tagger_base.h"
#include "ticcutils/StringOps.h"
#include "ticcutils/CommandLine.h"
#include "ticcutils/FileUtils.h"
using namespace std;
-using namespace folia;
-using namespace TiCC;
-#define LOG *Log(theErrLog)
+#define LOG *TiCC::Log(theErrLog)
string testDirName;
string outputFileName;
@@ -91,7 +89,7 @@ set<string> fileNames;
*/
-Configuration configuration;
+TiCC::Configuration configuration;
void usage( ) {
cout << endl << "Options:\n";
@@ -99,22 +97,32 @@ void usage( ) {
<< "\t -e <encoding> specify encoding of the input (default UTF-8)\n"
<< "\t -t <testfile> Run frog on this file\n"
<< "\t -x <testfile> Run frog on this FoLiA XML file. Or the files from 'testdir'\n"
- << "\t --textclass=<cls> use the specified class to search for text in the the FoLia docs.\n"
+ << "\t --textclass=<cls> use the specified class to search for text in the the FoLiA docs. (default 'current').\n"
+ << "\t\t\t the same value is used for output too.\n"
+ << "\t --inputclass=<cls> use the specified class to search for text in the the FoLiA docs. (default 'current') \n"
+ << "\t --outputclass=<cls> use the specified class to output text in the the FoLia docs. (default 'inputclass') \n"
<< "\t --testdir=<directory> All files in this dir will be tested\n"
<< "\t --uttmarker=<mark> utterances are separated by 'mark' symbols"
<< "\t (default none)\n"
<< "\t -n Assume input file to hold one sentence per line\n"
+ << "\t --retry assume frog is running again on the same input,\n"
+ << "\t already done files are skipped. (detected on the basis of already existing output files)\n"
<< "\t --max-parser-tokens=<n> inhibit parsing when a sentence contains over 'n' tokens. (default: 500, needs already 16Gb of memory!)\n"
<< "\t -Q Enable quote detection in tokeniser.\n"
+ << "\t-T or --textredundancy=[full|minimal|none] - set text redundancy level in the tokenizer for text nodes in FoLiA output: " << endl
+ << "\t 'full' - add text to all levels: <p> <s> <w> etc." << endl
+ << "\t 'minimal' - don't introduce text on higher levels, but retain what is already there." << endl
+ << "\t 'none' - only introduce text on <w>, AND remove all text from higher levels" << endl
<< "\t============= MODULE SELECTION ==========================================\n"
<< "\t --skip=[mptncla] Skip Tokenizer (t), Lemmatizer (l), Morphological Analyzer (a), Chunker (c), Multi-Word Units (m), Named Entity Recognition (n), or Parser (p) \n"
<< "\t============= CONFIGURATION OPTIONS =====================================\n"
- << "\t -c <filename> Set configuration file (default "
- << FrogAPI::defaultConfigFile() << ")\n"
+ << "\t -c <filename> Set configuration file (default " << FrogAPI::defaultConfigFile() << ")\n"
+ << "\t --override <section>.<parameter>=<value> Override a configuration option, can be used multiple times\n"
<< "\t --language <language-list> Set the languages. e.g. --language=nld,eng,por"
<< "\t\t The first language in the list will be the default. (default dutch).\n"
<< "\t============= OUTPUT OPTIONS ============================================\n"
<< "\t -o <outputfile> Output columned output to file, instead of default stdout\n"
+ << "\t --nostdout suppress columned output to stdout\n"
<< "\t -X <xmlfile> Output also to an XML file in FoLiA format\n"
<< "\t --id=<docid> Document ID, used in FoLiA output. (Default 'untitled')\n"
<< "\t --outputdir=<dir> Output to dir, instead of default stdout\n"
@@ -126,16 +134,16 @@ void usage( ) {
<< "\t -d <debug level> (for more verbosity)\n"
<< "\t --debug=<module><level>,<module><level>... (eg --debug=l5,n3) \n"
<< "\t\t Set debug value for Tokenizer (t), Lemmatizer (l), Morphological Analyzer (a), Chunker (c), Multi-Word Units (m), Named Entity Recognition (n), or Parser (p) \n"
- << "\t -S <port> Run as server instead of reading from testfile\n"
+ << "\t -S <port> Run as server instead of reading from testfile\n"
#ifdef HAVE_OPENMP
- << "\t --threads=<n> Use a maximum of 'n' threads. Default: 8. \n"
+ << "\t --threads=<n> Use a maximum of 'n' threads. Default: 8. \n"
#endif
- << "\t (but always 1 for server mode)\n";
+ << "\t (but always 1 for server mode)\n";
}
bool parse_args( TiCC::CL_Options& Opts,
FrogOptions& options,
- LogStream* theErrLog ) {
+ TiCC::LogStream* theErrLog ) {
// process the command line and fill FrogOptions to initialize the API
// also fill some globals we use for our own main.
@@ -151,15 +159,14 @@ bool parse_args( TiCC::CL_Options& Opts,
language = "none";
}
else {
- vector<string> lang;
- int num = split_at( languages, lang, "," );
- if ( num < 0 ){
+ vector<string> lang_v = TiCC::split_at( languages, "," );
+ if ( lang_v.empty() ){
cerr<< "invalid value in --languages=" << languages
<< " option. " << endl;
return false;
}
- language = lang[0]; // the first mentioned is the default.
- if ( num > 1 ){
+ language = lang_v[0]; // the first mentioned is the default.
+ if ( lang_v.size() > 1 ){
cerr << "WARNING: you used the --language=" << languages << " option"
<< " with more then one language " << endl
<< "\t specified. These values will be handled to the tokenizer,"
@@ -198,7 +205,7 @@ bool parse_args( TiCC::CL_Options& Opts,
string value;
// debug opts
if ( Opts.extract ('d', value) ) {
- if ( !stringTo<int>( value, options.debugFlag ) ){
+ if ( !TiCC::stringTo<int>( value, options.debugFlag ) ){
LOG << "-d value should be an integer" << endl;
return false;
}
@@ -209,13 +216,12 @@ bool parse_args( TiCC::CL_Options& Opts,
}
if ( Opts.extract( "debug", value ) ) {
value = TiCC::lowercase( value );
- vector<string> vec;
- TiCC::split_at( value, vec, "," );
+ vector<string> vec = TiCC::split_at( value, "," );
for ( const auto& val : vec ){
char mod = val[0];
string value = val.substr(1);
int dbval = 0;
- if ( !stringTo<int>( value, dbval ) ){
+ if ( !TiCC::stringTo<int>( value, dbval ) ){
cerr << "expected integer value for --debug=" << mod << value << endl;
return false;
}
@@ -247,7 +253,18 @@ bool parse_args( TiCC::CL_Options& Opts,
}
}
}
-
+ string redundancy;
+ Opts.extract( 'T', redundancy );
+ Opts.extract( "textredundancy", redundancy );
+ if ( !redundancy.empty() ){
+ if ( redundancy != "full"
+ && redundancy != "minimal"
+ && redundancy != "none" ){
+ LOG << "unknown textredundancy level: " << redundancy << endl;
+ return false;
+ }
+ options.textredundancy = redundancy;
+ }
options.doSentencePerLine = Opts.extract( 'n' );
options.doQuoteDetection = Opts.extract( 'Q' );
if ( Opts.extract( "skip", value )) {
@@ -277,11 +294,12 @@ bool parse_args( TiCC::CL_Options& Opts,
options.doDeepMorph = true;
options.doMorph = true;
}
-
+ options.doRetry = Opts.extract( "retry" );
+ options.noStdOut = Opts.extract( "nostdout" );
Opts.extract( 'e', options.encoding );
if ( Opts.extract( "max-parser-tokens", value ) ){
- if ( !stringTo<unsigned int>( value, options.maxParserTokens ) ){
+ if ( !TiCC::stringTo<unsigned int>( value, options.maxParserTokens ) ){
LOG << "max-parser-tokens value should be an integer" << endl;
return false;
}
@@ -297,7 +315,7 @@ bool parse_args( TiCC::CL_Options& Opts,
}
else if ( Opts.extract( "threads", value ) ){
int num;
- if ( !stringTo<int>( value, num ) || num < 1 ){
+ if ( !TiCC::stringTo<int>( value, num ) || num < 1 ){
LOG << "threads value should be a positive integer" << endl;
return false;
}
@@ -382,13 +400,52 @@ bool parse_args( TiCC::CL_Options& Opts,
}
}
}
- if ( Opts.extract( "textclass", options.textclass ) ){
- if ( !options.doXMLin ){
+ string textclass;
+ string inputclass;
+ string outputclass;
+ Opts.extract( "textclass", textclass );
+ Opts.extract( "inputclass", inputclass );
+ Opts.extract( "outputclass", outputclass );
+ if ( !options.doXMLin ){
+ if ( !textclass.empty() ){
LOG << "--textclass is only valid when -x is also present" << endl;
return false;
}
+ if ( !inputclass.empty() ){
+ LOG << "--inputclass is only valid when -x is also present" << endl;
+ return false;
+ }
+ if ( !outputclass.empty() ){
+ LOG << "--outputclass is only valid when -x is also present" << endl;
+ return false;
+ }
+ }
+ else { // FoLiA files...
+ if ( !textclass.empty() ){
+ if ( !inputclass.empty() || !outputclass.empty() ){
+ LOG << "when --textclass is specified, --inputclass or --outputclass may NOT be present." << endl;
+ return false;
+ }
+ options.inputclass = textclass;
+ options.outputclass = textclass;
+ configuration.setatt( "inputclass", textclass );
+ configuration.setatt( "outputclass", textclass );
+ }
+ else {
+ if ( !inputclass.empty() ){
+ options.inputclass = inputclass;
+ configuration.setatt( "inputclass", inputclass );
+ if ( outputclass.empty() ){
+ options.outputclass = inputclass;
+ configuration.setatt( "outputclass", inputclass );
+ }
+ }
+ if ( !outputclass.empty() ){
+ options.outputclass = outputclass;
+ configuration.setatt( "outputclass", outputclass );
+ }
+ }
}
-
if ( !XMLoutFileName.empty() && !testDirName.empty() ){
LOG << "useless -X value" << endl;
return false;
@@ -424,6 +481,26 @@ bool parse_args( TiCC::CL_Options& Opts,
return false;
}
}
+ string overridestatement;
+ while ( Opts.extract("override", overridestatement )) {
+ vector<string> values;
+ const int num = TiCC::split_at( overridestatement,values, "=" );
+ if ( num == 2 ) {
+ vector<string> module_param;
+ const int num2 = TiCC::split_at(values[0], module_param, "." );
+ if (num2 == 2) {
+ LOG << "Overriding configuration parameter " << module_param[0] << "." << module_param[1] << " with " << values[1] << endl;
+ configuration.setatt( module_param[1] , values[1], module_param[0] );
+ } else if (num2 == 1) {
+ LOG << "Overriding configuration parameter " << module_param[0] << " with " << values[1] << endl;
+ configuration.setatt( module_param[0] , values[1]);
+ } else {
+ LOG << "Invalid syntax for --override option" << endl;
+ }
+ } else {
+ LOG << "Invalid syntax for --override option" << endl;
+ }
+ }
if ( !Opts.empty() ){
LOG << "unhandled commandline options: " << Opts.toString() << endl;
@@ -432,11 +509,11 @@ bool parse_args( TiCC::CL_Options& Opts,
return true;
}
-bool StillRunning = true;
+static bool StillRunning = true;
void KillServerFun( int Signal ){
if ( Signal == SIGTERM ){
- cerr << "KillServerFun caught a signal SIGTERM" << endl;
+ cerr << TiCC::Timer::now() << " KillServerFun caught a signal SIGTERM" << endl;
sleep(5); // give children some spare time...
StillRunning = false;
}
@@ -454,16 +531,19 @@ int main(int argc, char *argv[]) {
<< Timbl::VersionName() << ", "
<< TimblServer::VersionName() << ", "
<< Tagger::VersionName() << "]" << endl;
- LogStream *theErrLog = new LogStream( cerr, "frog-", StampMessage );
+ TiCC::LogStream *theErrLog
+ = new TiCC::LogStream( cerr, "frog-", StampMessage );
std::ios_base::sync_with_stdio(false);
FrogOptions options;
try {
- TiCC::CL_Options Opts("c:e:o:t:x::X::nQhVd:S:",
- "textclass:,testdir:,uttmarker:,max-parser-tokens:,"
+ TiCC::CL_Options Opts("c:e:o:t:T:x::X::nQhVd:S:",
+ "textclass:,inputclass:,outputclass:,testdir:,"
+ "uttmarker:,max-parser-tokens:,textredundancy:,"
"skip:,id:,outputdir:,xmldir:,tmpdir:,deep-morph,"
- "help,language:,"
- "debug:,keep-parser-files,version,threads:,KANON");
+ "help,language:,retry,nostdout,"
+ "debug:,keep-parser-files,version,threads:,"
+ "override:,KANON");
Opts.init(argc, argv);
if ( Opts.is_present('V' ) || Opts.is_present("version" ) ){
// we already did show what we wanted.
@@ -486,18 +566,25 @@ int main(int argc, char *argv[]) {
ostream *outS = 0;
if ( !outputFileName.empty() ){
+ if ( options.doRetry && TiCC::isFile( outputFileName ) ){
+ LOG << "retry, skip: " << outputFileName << " already exists" << endl;
+ return EXIT_SUCCESS;
+ }
if ( !TiCC::createPath( outputFileName ) ) {
LOG << "problem: unable to create outputfile: "
- << outputFileName << endl;
+ << outputFileName << endl;
return EXIT_FAILURE;
}
outS = new ofstream( outputFileName );
}
+ if ( fileNames.size() > 1 ){
+ LOG << "start procesessing " << fileNames.size() << " files..." << endl;
+ }
for ( auto const& name : fileNames ){
string testName = testDirName + name;
if ( !TiCC::isFile( testName ) ){
LOG << "skip " << testName << " (file not found )"
- << endl;
+ << endl;
continue;
}
string outName;
@@ -510,6 +597,10 @@ int main(int argc, char *argv[]) {
else {
outName = outPath + name + ".out";
}
+ if ( options.doRetry && TiCC::isFile( outName ) ){
+ LOG << "retry, skip: " << outName << " already exists" << endl;
+ continue;
+ }
if ( !TiCC::createPath( outName ) ) {
LOG << "problem frogging: " << name << endl
<< "unable to create outputfile: " << outName
@@ -517,7 +608,8 @@ int main(int argc, char *argv[]) {
continue;
}
outS = new ofstream( outName );
- } else {
+ }
+ else {
outS = &cout;
}
}
@@ -532,11 +624,20 @@ int main(int argc, char *argv[]) {
else if ( options.doXMLout )
xmlOutName = name + ".xml"; // do not clobber the inputdir!
}
- if ( !xmlOutName.empty() && !TiCC::createPath( xmlOutName ) ){
- LOG << "problem frogging: " << name << endl
- << "unable to create outputfile: " << xmlOutName
- << endl;
- continue;
+ if ( !xmlOutName.empty() ){
+ if ( options.doRetry && TiCC::isFile( xmlOutName ) ){
+ LOG << "retry, skip: " << xmlOutName << " already exists" << endl;
+ continue;
+ }
+ if ( !TiCC::createPath( xmlOutName ) ){
+ LOG << "problem frogging: " << name << endl
+ << "unable to create outputfile: " << xmlOutName
+ << endl;
+ continue;
+ }
+ else {
+ remove( xmlOutName.c_str() );
+ }
}
LOG << "Frogging " << testName << endl;
try {
@@ -605,8 +706,9 @@ int main(int argc, char *argv[]) {
throw( runtime_error( "Accept failed" ) );
}
}
+ LOG << TiCC::Timer::now() << " server terminated by SIGTERM" << endl;
}
- catch ( std::exception& e ) {
+ catch ( exception& e ) {
LOG << "Server error:" << e.what() << " Exiting." << endl;
throw;
}
diff --git a/src/FrogAPI.cxx b/src/FrogAPI.cxx
index 4c0ea6e..8c8b90b 100644
--- a/src/FrogAPI.cxx
+++ b/src/FrogAPI.cxx
@@ -39,6 +39,7 @@
#include <sstream>
#include <fstream>
#include <vector>
+#include <unordered_map>
#include "config.h"
#ifdef HAVE_OPENMP
#include <omp.h>
@@ -78,11 +79,10 @@
using namespace std;
-using namespace folia;
-using namespace TiCC;
using namespace Tagger;
+using TiCC::operator<<;
-#define LOG *Log(theErrLog)
+#define LOG *TiCC::Log(theErrLog)
string configDir = string(SYSCONF_PATH) + "/" + PACKAGE + "/";
string configFileName = configDir + "frog.cfg";
@@ -106,6 +106,8 @@ FrogOptions::FrogOptions() {
doSentencePerLine = false;
doQuoteDetection = false;
doDirTest = false;
+ doRetry = false;
+ noStdOut = false;
doServer = false;
doXMLin = false;
doXMLout = false;
@@ -123,12 +125,56 @@ FrogOptions::FrogOptions() {
uttmark = "<utt>";
listenport = "void";
docid = "untitled";
+ inputclass="current";
+ outputclass="current";
+ textredundancy="minimal";
debugFlag = 0;
}
+void FrogAPI::test_version( const string& where, double minimum ){
+ string version = configuration.lookUp( "version", where );
+ double v = 0.0;
+ if ( !version.empty() ){
+ if ( !TiCC::stringTo( version, v ) ){
+ v = 0.5;
+ }
+ }
+ if ( where == "IOB" ){
+ if ( v < minimum ){
+ LOG << "[[" << where << "]] Wrong FrogData!. "
+ << "Expected version " << minimum << " or higher for module: "
+ << where << endl;
+ if ( version.empty() ) {
+ LOG << "but no version info was found!." << endl;
+ }
+ else {
+ LOG << "but found version " << v << endl;
+ }
+ throw runtime_error( "Frog initialization failed" );
+ }
+ }
+ else if ( where == "NER" ){
+ if ( v < minimum ){
+ LOG << "[[" << where << "]] Wrong FrogData!. "
+ << "Expected version " << minimum << " or higher for module: "
+ << where << endl;
+ if ( version.empty() ) {
+ LOG << "but no version info was found!." << endl;
+ }
+ else {
+ LOG << "but found version " << v << endl;
+ }
+ throw runtime_error( "Frog initialization failed" );
+ }
+ }
+ else {
+ throw logic_error( "unknown where:" + where );
+ }
+}
+
FrogAPI::FrogAPI( FrogOptions &opt,
- const Configuration &conf,
- LogStream *log ):
+ const TiCC::Configuration &conf,
+ TiCC::LogStream *log ):
configuration(conf),
options(opt),
theErrLog(log),
@@ -136,7 +182,7 @@ FrogAPI::FrogAPI( FrogOptions &opt,
myMblem(0),
myMwu(0),
myParser(0),
- myPoSTagger(0),
+ myCGNTagger(0),
myIOBTagger(0),
myNERTagger(0),
tokenizer(0)
@@ -154,15 +200,25 @@ FrogAPI::FrogAPI( FrogOptions &opt,
LOG << "Disabled the Morhological analyzer." << endl;
options.doMorph = false;
}
- if ( options.doIOB && !configuration.hasSection("IOB") ){
- LOG << "Missing [[IOB]] section in config file." << endl;
- LOG << "Disabled the IOB Chunker." << endl;
- options.doIOB = false;
+ if ( options.doIOB ){
+ if ( !configuration.hasSection("IOB") ){
+ LOG << "Missing [[IOB]] section in config file." << endl;
+ LOG << "Disabled the IOB Chunker." << endl;
+ options.doIOB = false;
+ }
+ else {
+ test_version( "IOB", 2.0 );
+ }
}
- if ( options.doNER && !configuration.hasSection("NER") ){
- LOG << "Missing [[NER]] section in config file." << endl;
- LOG << "Disabled the NER." << endl;
- options.doNER = false;
+ if ( options.doNER ) {
+ if ( !configuration.hasSection("NER") ){
+ LOG << "Missing [[NER]] section in config file." << endl;
+ LOG << "Disabled the NER." << endl;
+ options.doNER = false;
+ }
+ else {
+ test_version( "NER", 2.0 );
+ }
}
if ( options.doMwu && !configuration.hasSection("mwu") ){
LOG << "Missing [[mwu]] section in config file." << endl;
@@ -191,20 +247,22 @@ FrogAPI::FrogAPI( FrogOptions &opt,
tokenizer->setInputEncoding( options.encoding );
tokenizer->setInputXml( options.doXMLin );
tokenizer->setUttMarker( options.uttmark );
- tokenizer->setTextClass( options.textclass );
- myPoSTagger = new CGNTagger(theErrLog);
- stat = myPoSTagger->init( configuration );
+ tokenizer->setInputClass( options.inputclass );
+ tokenizer->setOutputClass( options.outputclass );
+ tokenizer->setTextRedundancy( options.textredundancy );
+ myCGNTagger = new CGNTagger( theErrLog );
+ stat = myCGNTagger->init( configuration );
if ( stat ){
- myPoSTagger->set_eos_mark( options.uttmark );
+ myCGNTagger->set_eos_mark( options.uttmark );
if ( options.doIOB ){
- myIOBTagger = new IOBTagger(theErrLog);
+ myIOBTagger = new IOBTagger( theErrLog );
stat = myIOBTagger->init( configuration );
if ( stat ){
myIOBTagger->set_eos_mark( options.uttmark );
}
}
if ( stat && options.doNER ){
- myNERTagger = new NERTagger(theErrLog);
+ myNERTagger = new NERTagger( theErrLog );
stat = myNERTagger->init( configuration );
if ( stat ){
myNERTagger->set_eos_mark( options.uttmark );
@@ -254,20 +312,34 @@ FrogAPI::FrogAPI( FrogOptions &opt,
#endif
bool tokStat = true;
+ string tokWhat;
bool lemStat = true;
+ string lemWhat;
bool mwuStat = true;
+ string mwuWhat;
bool mbaStat = true;
+ string mbaWhat;
bool parStat = true;
+ string parWhat;
bool tagStat = true;
+ string tagWhat;
bool iobStat = true;
+ string iobWhat;
bool nerStat = true;
+ string nerWhat;
#pragma omp parallel sections
{
#pragma omp section
{
- tokenizer = new UctoTokenizer(theErrLog);
- tokStat = tokenizer->init( configuration );
+ try {
+ tokenizer = new UctoTokenizer(theErrLog);
+ tokStat = tokenizer->init( configuration );
+ }
+ catch ( const exception& e ){
+ tokWhat = e.what();
+ tokStat = false;
+ }
if ( tokStat ){
tokenizer->setPassThru( !options.doTok );
tokenizer->setDocID( options.docid );
@@ -276,56 +348,100 @@ FrogAPI::FrogAPI( FrogOptions &opt,
tokenizer->setInputEncoding( options.encoding );
tokenizer->setInputXml( options.doXMLin );
tokenizer->setUttMarker( options.uttmark );
- tokenizer->setTextClass( options.textclass );
+ tokenizer->setInputClass( options.inputclass );
+ tokenizer->setOutputClass( options.outputclass );
+ tokenizer->setTextRedundancy( options.textredundancy );
}
}
#pragma omp section
{
if ( options.doLemma ){
- myMblem = new Mblem(theErrLog);
- lemStat = myMblem->init( configuration );
+ try {
+ myMblem = new Mblem(theErrLog);
+ lemStat = myMblem->init( configuration );
+ }
+ catch ( const exception& e ){
+ lemWhat = e.what();
+ lemStat = false;
+ }
}
}
#pragma omp section
{
if ( options.doMorph ){
- myMbma = new Mbma(theErrLog);
- mbaStat = myMbma->init( configuration );
- if ( options.doDeepMorph )
- myMbma->setDeepMorph(true);
+ try {
+ myMbma = new Mbma(theErrLog);
+ mbaStat = myMbma->init( configuration );
+ if ( options.doDeepMorph )
+ myMbma->setDeepMorph(true);
+ }
+ catch ( const exception& e ){
+ mbaWhat = e.what();
+ mbaStat = false;
+ }
}
}
#pragma omp section
{
- myPoSTagger = new CGNTagger(theErrLog);
- tagStat = myPoSTagger->init( configuration );
+ try {
+ myCGNTagger = new CGNTagger( theErrLog );
+ tagStat = myCGNTagger->init( configuration );
+ }
+ catch ( const exception& e ){
+ tagWhat = e.what();
+ tagStat = false;
+ }
}
#pragma omp section
{
if ( options.doIOB ){
- myIOBTagger = new IOBTagger(theErrLog);
- iobStat = myIOBTagger->init( configuration );
+ try {
+ myIOBTagger = new IOBTagger( theErrLog );
+ iobStat = myIOBTagger->init( configuration );
+ }
+ catch ( const exception& e ){
+ iobWhat = e.what();
+ iobStat = false;
+ }
}
}
#pragma omp section
{
if ( options.doNER ){
- myNERTagger = new NERTagger(theErrLog);
- nerStat = myNERTagger->init( configuration );
+ try {
+ myNERTagger = new NERTagger( theErrLog );
+ nerStat = myNERTagger->init( configuration );
+ }
+ catch ( const exception& e ){
+ nerWhat = e.what();
+ nerStat = false;
+ }
}
}
#pragma omp section
{
if ( options.doMwu ){
- myMwu = new Mwu(theErrLog);
- mwuStat = myMwu->init( configuration );
- if ( mwuStat && options.doParse ){
- Timer initTimer;
- initTimer.start();
- myParser = new Parser(theErrLog);
- parStat = myParser->init( configuration );
- initTimer.stop();
- LOG << "init Parse took: " << initTimer << endl;
+ try {
+ myMwu = new Mwu(theErrLog);
+ mwuStat = myMwu->init( configuration );
+ if ( mwuStat && options.doParse ){
+ TiCC::Timer initTimer;
+ initTimer.start();
+ try {
+ myParser = new Parser(theErrLog);
+ parStat = myParser->init( configuration );
+ initTimer.stop();
+ LOG << "init Parse took: " << initTimer << endl;
+ }
+ catch ( const exception& e ){
+ parWhat = e.what();
+ parStat = false;
+ }
+ }
+ }
+ catch ( const exception& e ){
+ mwuWhat = e.what();
+ mwuStat = false;
}
}
}
@@ -334,28 +450,28 @@ FrogAPI::FrogAPI( FrogOptions &opt,
&& mbaStat && mwuStat && parStat ) ){
string out = "Initialization failed for: ";
if ( !tokStat ){
- out += "[tokenizer] ";
+ out += "[tokenizer] " + tokWhat;
}
if ( !tagStat ){
- out += "[tagger] ";
+ out += "[tagger] " + tagWhat;
}
if ( !iobStat ){
- out += "[IOB] ";
+ out += "[IOB] " + iobWhat;
}
if ( !nerStat ){
- out += "[NER] ";
+ out += "[NER] " + nerWhat;
}
if ( !lemStat ){
- out += "[lemmatizer] ";
+ out += "[lemmatizer] " + lemWhat;
}
if ( !mbaStat ){
- out += "[morphology] ";
+ out += "[morphology] " + mbaWhat;
}
if ( !mwuStat ){
- out += "[multiword unit] ";
+ out += "[multiword unit] " + mwuWhat;
}
if ( !parStat ){
- out += "[parser] ";
+ out += "[parser] " + parWhat;
}
LOG << out << endl;
throw runtime_error( "Frog init failed" );
@@ -368,15 +484,15 @@ FrogAPI::~FrogAPI() {
delete myMbma;
delete myMblem;
delete myMwu;
- delete myPoSTagger;
+ delete myCGNTagger;
delete myIOBTagger;
delete myNERTagger;
delete myParser;
delete tokenizer;
}
-bool FrogAPI::TestSentence( Sentence* sent, TimerBlock& timers){
- vector<Word*> swords;
+bool FrogAPI::TestSentence( folia::Sentence* sent, TimerBlock& timers){
+ vector<folia::Word*> swords;
if ( options.doQuoteDetection ){
swords = sent->wordParts();
}
@@ -387,13 +503,13 @@ bool FrogAPI::TestSentence( Sentence* sent, TimerBlock& timers){
bool all_well = true;
string exs;
if ( !swords.empty() ) {
-#pragma omp parallel sections shared(all_well,exs)
+#pragma omp parallel sections shared(all_well,exs,swords)
{
#pragma omp section
{
timers.tagTimer.start();
try {
- myPoSTagger->Classify( swords );
+ myCGNTagger->Classify( swords );
}
catch ( exception&e ){
all_well = false;
@@ -401,34 +517,6 @@ bool FrogAPI::TestSentence( Sentence* sent, TimerBlock& timers){
}
timers.tagTimer.stop();
}
-#pragma omp section
- {
- if ( options.doIOB ){
- timers.iobTimer.start();
- try {
- myIOBTagger->Classify( swords );
- }
- catch ( exception&e ){
- all_well = false;
- exs += string(e.what()) + " ";
- }
- timers.iobTimer.stop();
- }
- }
-#pragma omp section
- {
- if ( options.doNER ){
- timers.nerTimer.start();
- try {
- myNERTagger->Classify( swords );
- }
- catch ( exception&e ){
- all_well = false;
- exs += string(e.what()) + " ";
- }
- timers.nerTimer.stop();
- }
- }
} // parallel sections
if ( !all_well ){
throw runtime_error( exs );
@@ -471,25 +559,61 @@ bool FrogAPI::TestSentence( Sentence* sent, TimerBlock& timers){
}
}
} // omp parallel sections
- } //for int i = 0 to num_words
+ } //for all words
if ( !all_well ){
throw runtime_error( exs );
}
-
- if ( options.doMwu ){
- if ( swords.size() > 0 ){
- timers.mwuTimer.start();
- myMwu->Classify( swords );
- timers.mwuTimer.stop();
+#pragma omp parallel sections
+ {
+#pragma omp section
+ {
+ if ( options.doNER ){
+ timers.nerTimer.start();
+ if (options.debugFlag) {
+ LOG << "Calling NER..." << endl;
+ }
+ try {
+ myNERTagger->Classify( swords );
+ }
+ catch ( exception&e ){
+ all_well = false;
+ exs += string(e.what()) + " ";
+ }
+ timers.nerTimer.stop();
+ }
}
- }
- if ( options.doParse ){
- if ( options.maxParserTokens != 0
- && swords.size() > options.maxParserTokens ){
- showParse = false;
+#pragma omp section
+ {
+ if ( options.doIOB ){
+ timers.iobTimer.start();
+ try {
+ myIOBTagger->Classify( swords );
+ }
+ catch ( exception&e ){
+ all_well = false;
+ exs += string(e.what()) + " ";
+ }
+ timers.iobTimer.stop();
+ }
}
- else {
- myParser->Parse( swords, timers );
+#pragma omp section
+ {
+ if ( options.doMwu ){
+ if ( swords.size() > 0 ){
+ timers.mwuTimer.start();
+ myMwu->Classify( swords );
+ timers.mwuTimer.stop();
+ }
+ }
+ if ( options.doParse ){
+ if ( options.maxParserTokens != 0
+ && swords.size() > options.maxParserTokens ){
+ showParse = false;
+ }
+ else {
+ myParser->Parse( swords, timers );
+ }
+ }
}
}
}
@@ -516,7 +640,7 @@ void FrogAPI::FrogServer( Sockets::ServerSocket &conn ){
if ( options.debugFlag ){
LOG << "received data [" << result << "]" << endl;
}
- Document doc;
+ folia::Document doc;
try {
doc.readFromString( result );
}
@@ -524,7 +648,7 @@ void FrogAPI::FrogServer( Sockets::ServerSocket &conn ){
LOG << "FoLiaParsing failed:" << endl << e.what() << endl;
throw;
}
- LOG << "Processing... " << endl;
+ LOG << "Processing XML... " << endl;
timers.reset();
timers.tokTimer.start();
tokenizer->tokenize( doc );
@@ -536,6 +660,7 @@ void FrogAPI::FrogServer( Sockets::ServerSocket &conn ){
else {
showResults( outputstream, doc );
}
+ // LOG << "Done Processing XML... " << endl;
}
else {
string data = "";
@@ -556,11 +681,11 @@ void FrogAPI::FrogServer( Sockets::ServerSocket &conn ){
if ( options.debugFlag ){
LOG << "Received: [" << data << "]" << endl;
}
- LOG << "Processing... " << endl;
+ LOG << TiCC::Timer::now() << " Processing... " << endl;
istringstream inputstream(data,istringstream::in);
timers.reset();
timers.tokTimer.start();
- Document *doc = tokenizer->tokenize( inputstream );
+ folia::Document *doc = tokenizer->tokenize( inputstream );
timers.tokTimer.stop();
FrogDoc( *doc );
if ( options.doXMLout ){
@@ -570,6 +695,7 @@ void FrogAPI::FrogServer( Sockets::ServerSocket &conn ){
showResults( outputstream, *doc );
}
delete doc;
+ // LOG << "Done Processing... " << endl;
}
if (!conn.write( (outputstream.str()) ) || !(conn.write("READY\n")) ){
if (options.debugFlag) {
@@ -630,7 +756,7 @@ void FrogAPI::FrogStdin( bool prompt ) {
cout << "Processing... " << endl;
}
istringstream inputstream(data,istringstream::in);
- Document *doc = tokenizer->tokenize( inputstream );
+ folia::Document *doc = tokenizer->tokenize( inputstream );
FrogDoc( *doc, true );
showResults( cout, *doc );
delete doc;
@@ -702,7 +828,7 @@ void FrogAPI::FrogInteractive(){
}
cout << "Processing... '" << data << "'" << endl;
istringstream inputstream(data,istringstream::in);
- Document *doc = tokenizer->tokenize( inputstream );
+ folia::Document *doc = tokenizer->tokenize( inputstream );
FrogDoc( *doc, true );
showResults( cout, *doc );
delete doc;
@@ -713,49 +839,49 @@ void FrogAPI::FrogInteractive(){
#endif
}
-vector<Word*> FrogAPI::lookup( Word *word,
- const vector<Entity*>& entities ) const {
+vector<folia::Word*> FrogAPI::lookup( folia::Word *word,
+ const vector<folia::Entity*>& entities ) const {
for ( const auto& ent : entities ){
- vector<Word*> vec = ent->select<Word>();
+ vector<folia::Word*> vec = ent->select<folia::Word>();
if ( !vec.empty() ){
if ( vec[0]->id() == word->id() ) {
return vec;
}
}
}
- vector<Word*> vec;
+ vector<folia::Word*> vec;
vec.push_back( word ); // single unit
return vec;
}
-Dependency *FrogAPI::lookupDep( const Word *word,
- const vector<Dependency*>&dependencies ) const{
+folia::Dependency *FrogAPI::lookupDep( const folia::Word *word,
+ const vector<folia::Dependency*>&dependencies ) const{
if (dependencies.size() == 0 ){
return 0;
}
int dbFlag = 0;
try {
- dbFlag = stringTo<int>( configuration.lookUp( "debug", "parser" ) );
+ dbFlag = TiCC::stringTo<int>( configuration.lookUp( "debug", "parser" ) );
}
catch (exception & e) {
dbFlag = 0;
}
if ( dbFlag ){
- using TiCC::operator<<;
- LOG << "\nDependency-lookup "<< word << " in " << dependencies << endl;
+ LOG << endl << "Dependency-lookup "<< word << " in " << dependencies << endl;
}
for ( const auto& dep : dependencies ){
if ( dbFlag ){
LOG << "Dependency try: " << dep << endl;
}
try {
- vector<DependencyDependent*> dv = dep->select<DependencyDependent>();
+ vector<folia::DependencyDependent*> dv
+ = dep->select<folia::DependencyDependent>();
if ( !dv.empty() ){
- vector<Word*> wv = dv[0]->select<Word>();
+ vector<folia::Word*> wv = dv[0]->select<folia::Word>();
for ( const auto& w : wv ){
if ( w == word ){
if ( dbFlag ){
- LOG << "\nDependency found word " << w << endl;
+ LOG << "Dependency found word " << w << endl;
}
return dep;
}
@@ -764,28 +890,26 @@ Dependency *FrogAPI::lookupDep( const Word *word,
}
catch ( exception& e ){
if (dbFlag > 0){
- LOG << "get Dependency results failed: "
- << e.what() << endl;
+ LOG << "get Dependency results failed: " << e.what() << endl;
}
}
}
return 0;
}
-string FrogAPI::lookupNEREntity( const vector<Word *>& mwus,
- const vector<Entity*>& entities ) const {
+string FrogAPI::lookupNEREntity( const vector<folia::Word *>& mwus,
+ const vector<folia::Entity*>& entities ) const {
string endresult;
int dbFlag = 0;
try{
- dbFlag = stringTo<int>( configuration.lookUp( "debug", "NER" ) );
+ dbFlag = TiCC::stringTo<int>( configuration.lookUp( "debug", "NER" ) );
}
catch (exception & e) {
dbFlag = 0;
}
for ( const auto& mwu : mwus ){
if ( dbFlag ){
- using TiCC::operator<<;
- LOG << "\nNER: lookup "<< mwu << " in " << entities << endl;
+ LOG << endl << "NER: lookup "<< mwu << " in " << entities << endl;
}
string result;
for ( const auto& entity :entities ){
@@ -793,7 +917,7 @@ string FrogAPI::lookupNEREntity( const vector<Word *>& mwus,
LOG << "NER: try: " << entity << endl;
}
try {
- vector<Word*> wv = entity->select<Word>();
+ vector<folia::Word*> wv = entity->select<folia::Word>();
bool first = true;
for ( const auto& word : wv ){
if ( word == mwu ){
@@ -801,10 +925,10 @@ string FrogAPI::lookupNEREntity( const vector<Word *>& mwus,
LOG << "NER found word " << word << endl;
}
if ( first ){
- result += "B-" + uppercase(entity->cls());
+ result += "B-" + TiCC::uppercase(entity->cls());
}
else {
- result += "I-" + uppercase(entity->cls());
+ result += "I-" + TiCC::uppercase(entity->cls());
}
break;
}
@@ -834,19 +958,18 @@ string FrogAPI::lookupNEREntity( const vector<Word *>& mwus,
}
-string FrogAPI::lookupIOBChunk( const vector<Word *>& mwus,
- const vector<Chunk*>& chunks ) const{
+string FrogAPI::lookupIOBChunk( const vector<folia::Word *>& mwus,
+ const vector<folia::Chunk*>& chunks ) const{
string endresult;
int dbFlag = 0;
try {
- dbFlag = stringTo<int>( configuration.lookUp( "debug", "IOB" ) );
+ dbFlag = TiCC::stringTo<int>( configuration.lookUp( "debug", "IOB" ) );
}
catch (exception & e) {
dbFlag = 0;
}
for ( const auto& mwu : mwus ){
if ( dbFlag ){
- using TiCC::operator<<;
LOG << "IOB lookup "<< mwu << " in " << chunks << endl;
}
string result;
@@ -855,7 +978,7 @@ string FrogAPI::lookupIOBChunk( const vector<Word *>& mwus,
LOG << "IOB try: " << chunk << endl;
}
try {
- vector<Word*> wv = chunk->select<Word>();
+ vector<folia::Word*> wv = chunk->select<folia::Word>();
bool first = true;
for ( const auto& word : wv ){
if ( word == mwu ){
@@ -897,17 +1020,16 @@ string FrogAPI::lookupIOBChunk( const vector<Word *>& mwus,
vector<string> get_compound_analysis( folia::Word* word ){
vector<string> result;
- vector<MorphologyLayer*> layers
- = word->annotations<MorphologyLayer>( Mbma::mbma_tagset );
+ vector<folia::MorphologyLayer*> layers
+ = word->annotations<folia::MorphologyLayer>( Mbma::mbma_tagset );
for ( const auto& layer : layers ){
- vector<Morpheme*> m =
- layer->select<Morpheme>( Mbma::mbma_tagset, false );
+ vector<folia::Morpheme*> m =
+ layer->select<folia::Morpheme>( Mbma::mbma_tagset, false );
if ( m.size() == 1 ) {
// check for top layer compound
- PosAnnotation *postag = 0;
+ folia::PosAnnotation *postag = 0;
try {
- postag = m[0]->annotation<PosAnnotation>( Mbma::clex_tagset );
- // cerr << "found a clex postag!" << endl;
+ postag = m[0]->annotation<folia::PosAnnotation>( Mbma::clex_tagset );
result.push_back( postag->feat( "compound" ) ); // might be empty
}
catch (...){
@@ -943,12 +1065,18 @@ string flatten( const string& s ){
}
vector<string> get_full_morph_analysis( folia::Word* w, bool flat ){
+ return get_full_morph_analysis( w, "current", flat );
+}
+
+vector<string> get_full_morph_analysis( folia::Word* w,
+ const string& cls,
+ bool flat ){
vector<string> result;
- vector<MorphologyLayer*> layers
- = w->annotations<MorphologyLayer>( Mbma::mbma_tagset );
+ vector<folia::MorphologyLayer*> layers
+ = w->annotations<folia::MorphologyLayer>( Mbma::mbma_tagset );
for ( const auto& layer : layers ){
- vector<Morpheme*> m =
- layer->select<Morpheme>( Mbma::mbma_tagset, false );
+ vector<folia::Morpheme*> m =
+ layer->select<folia::Morpheme>( Mbma::mbma_tagset, false );
bool is_deep = false;
if ( m.size() == 1 ) {
// check for top layer from deep morph analysis
@@ -964,9 +1092,10 @@ vector<string> get_full_morph_analysis( folia::Word* w, bool flat ){
if ( !is_deep ){
// flat structure
string morph;
- vector<Morpheme*> m = layer->select<Morpheme>( Mbma::mbma_tagset );
+ vector<folia::Morpheme*> m
+ = layer->select<folia::Morpheme>( Mbma::mbma_tagset );
for ( const auto& mor : m ){
- string txt = UnicodeToUTF8( mor->text() );
+ string txt = folia::UnicodeToUTF8( mor->text( cls ) );
morph += "[" + txt + "]";
}
result.push_back( morph );
@@ -977,7 +1106,7 @@ vector<string> get_full_morph_analysis( folia::Word* w, bool flat ){
void FrogAPI::displayMWU( ostream& os,
size_t index,
- const vector<Word*>& mwu ) const {
+ const vector<folia::Word*>& mwu ) const {
string wrd;
string pos;
string lemma;
@@ -986,8 +1115,9 @@ void FrogAPI::displayMWU( ostream& os,
double conf = 1;
for ( const auto& word : mwu ){
try {
- wrd += word->str();
- PosAnnotation *postag = word->annotation<PosAnnotation>( myPoSTagger->getTagset() );
+ wrd += word->str( options.outputclass );
+ folia::PosAnnotation *postag
+ = word->annotation<folia::PosAnnotation>( myCGNTagger->getTagset() );
pos += postag->cls();
if ( &word != &mwu.back() ){
wrd += "_";
@@ -1018,7 +1148,7 @@ void FrogAPI::displayMWU( ostream& os,
if ( options.doMorph ){
// also covers doDeepMorph
try {
- vector<string> morphs = get_full_morph_analysis( word );
+ vector<string> morphs = get_full_morph_analysis( word, options.outputclass );
for ( const auto& m : morphs ){
morph += m;
if ( &m != &morphs.back() ){
@@ -1073,54 +1203,54 @@ void FrogAPI::displayMWU( ostream& os,
}
ostream& FrogAPI::showResults( ostream& os,
- Document& doc ) const {
- vector<Sentence*> sentences = doc.sentences();
+ folia::Document& doc ) const {
+ vector<folia::Sentence*> sentences = doc.sentences();
for ( auto const& sentence : sentences ){
- vector<Word*> words = sentence->words();
- vector<Entity*> mwu_entities;
+ vector<folia::Word*> words = sentence->words();
+ vector<folia::Entity*> mwu_entities;
if (myMwu){
- mwu_entities = sentence->select<Entity>( myMwu->getTagset() );
+ mwu_entities = sentence->select<folia::Entity>( myMwu->getTagset() );
}
- vector<Dependency*> dependencies;
+ vector<folia::Dependency*> dependencies;
if (myParser){
- dependencies = sentence->select<Dependency>( myParser->getTagset() );
+ dependencies = sentence->select<folia::Dependency>( myParser->getTagset() );
}
- vector<Chunk*> iob_chunking;
+ vector<folia::Chunk*> iob_chunking;
if ( myIOBTagger ){
- iob_chunking = sentence->select<Chunk>( myIOBTagger->getTagset() );
+ iob_chunking = sentence->select<folia::Chunk>( myIOBTagger->getTagset() );
}
- vector<Entity*> ner_entities;
+ vector<folia::Entity*> ner_entities;
if (myNERTagger){
- ner_entities = sentence->select<Entity>( myNERTagger->getTagset() );
+ ner_entities = sentence->select<folia::Entity>( myNERTagger->getTagset() );
}
- static set<ElementType> excludeSet;
- vector<Sentence*> parts = sentence->select<Sentence>( excludeSet );
+ static set<folia::ElementType> excludeSet;
+ vector<folia::Sentence*> parts = sentence->select<folia::Sentence>( excludeSet );
if ( !options.doQuoteDetection ){
assert( parts.size() == 0 );
}
for ( auto const& part : parts ){
- vector<Entity*> ents;
+ vector<folia::Entity*> ents;
if (myMwu){
- ents = part->select<Entity>( myMwu->getTagset() );
+ ents = part->select<folia::Entity>( myMwu->getTagset() );
}
mwu_entities.insert( mwu_entities.end(), ents.begin(), ents.end() );
- vector<Dependency*> deps = part->select<Dependency>();
+ vector<folia::Dependency*> deps = part->select<folia::Dependency>();
dependencies.insert( dependencies.end(), deps.begin(), deps.end() );
- vector<Chunk*> chunks = part->select<Chunk>();
+ vector<folia::Chunk*> chunks = part->select<folia::Chunk>();
iob_chunking.insert( iob_chunking.end(), chunks.begin(), chunks.end() );
- vector<Entity*> ners ;
+ vector<folia::Entity*> ners ;
if (myNERTagger) {
- ners = part->select<Entity>( myNERTagger->getTagset() );
+ ners = part->select<folia::Entity>( myNERTagger->getTagset() );
}
ner_entities.insert( ner_entities.end(), ners.begin(), ners.end() );
}
size_t index = 1;
- map<FoliaElement*, int> enumeration;
- vector<vector<Word*> > mwus;
+ unordered_map<folia::FoliaElement*, int> enumeration;
+ vector<vector<folia::Word*> > mwus;
for ( size_t i=0; i < words.size(); ++i ){
- Word *word = words[i];
- vector<Word*> mwu = lookup( word, mwu_entities );
+ folia::Word *word = words[i];
+ vector<folia::Word*> mwu = lookup( word, mwu_entities );
for ( size_t j=0; j < mwu.size(); ++j ){
enumeration[mwu[j]] = index;
}
@@ -1149,13 +1279,13 @@ ostream& FrogAPI::showResults( ostream& os,
}
if ( options.doParse ){
string cls;
- Dependency *dep = lookupDep( mwu[0], dependencies);
+ folia::Dependency *dep = lookupDep( mwu[0], dependencies);
if ( dep ){
- vector<Headspan*> w = dep->select<Headspan>();
+ vector<folia::Headspan*> w = dep->select<folia::Headspan>();
size_t num;
- if ( w[0]->index(0)->isinstance( PlaceHolder_t ) ){
+ if ( w[0]->index(0)->isinstance( folia::PlaceHolder_t ) ){
string indexS = w[0]->index(0)->str();
- FoliaElement *pnt = w[0]->index(0)->doc()->index(indexS);
+ folia::FoliaElement *pnt = w[0]->index(0)->doc()->index(indexS);
num = enumeration.find(pnt->index(0))->second;
}
else {
@@ -1180,10 +1310,15 @@ ostream& FrogAPI::showResults( ostream& os,
}
string FrogAPI::Frogtostring( const string& s ){
- Document *doc = tokenizer->tokenizestring( s );
+ folia::Document *doc = tokenizer->tokenizestring( s );
stringstream ss;
FrogDoc( *doc, true );
- showResults( ss, *doc );
+ if ( options.doXMLout ){
+ doc->save( ss, options.doKanon );
+ }
+ else {
+ showResults( ss, *doc );
+ }
delete doc;
return ss.str();
}
@@ -1194,13 +1329,13 @@ string FrogAPI::Frogtostringfromfile( const string& name ){
return ss.str();
}
-void FrogAPI::FrogDoc( Document& doc,
+void FrogAPI::FrogDoc( folia::Document& doc,
bool hidetimers ){
timers.frogTimer.start();
// first we make sure that the doc will accept our annotations, by
// declaring them in the doc
- if (myPoSTagger){
- myPoSTagger->addDeclaration( doc );
+ if (myCGNTagger){
+ myCGNTagger->addDeclaration( doc );
}
if ( options.doLemma && myMblem ) {
myMblem->addDeclaration( doc );
@@ -1223,8 +1358,7 @@ void FrogAPI::FrogDoc( Document& doc,
if ( options.debugFlag > 5 ){
LOG << "Testing document :" << doc << endl;
}
-
- vector<Sentence*> sentences;
+ vector<folia::Sentence*> sentences;
if ( options.doQuoteDetection ){
sentences = doc.sentenceParts();
}
@@ -1299,30 +1433,30 @@ void FrogAPI::FrogDoc( Document& doc,
}
void FrogAPI::FrogFile( const string& infilename,
- ostream &os,
+ ostream& os,
const string& xmlOutF ) {
// stuff the whole input into one FoLiA document.
// This is not a good idea in the long term, I think (agreed [proycon] )
string xmlOutFile = xmlOutF;
if ( options.doXMLin && !xmlOutFile.empty() ){
- if ( match_back( infilename, ".gz" ) ){
- if ( !match_back( xmlOutFile, ".gz" ) )
+ if ( TiCC::match_back( infilename, ".gz" ) ){
+ if ( !TiCC::match_back( xmlOutFile, ".gz" ) )
xmlOutFile += ".gz";
}
- else if ( match_back( infilename, ".bz2" ) ){
- if ( !match_back( xmlOutFile, ".bz2" ) )
+ else if ( TiCC::match_back( infilename, ".bz2" ) ){
+ if ( !TiCC::match_back( xmlOutFile, ".bz2" ) )
xmlOutFile += ".bz2";
}
}
if ( options.doXMLin ){
- Document doc;
+ folia::Document doc;
try {
doc.readFromFile( infilename );
}
catch ( exception &e ){
LOG << "retrieving FoLiA from '" << infilename << "' failed with exception:" << endl;
- cerr << e.what() << endl;
- return;
+ LOG << e.what() << endl;
+ throw ( runtime_error( "read failed" ) );
}
timers.reset();
timers.tokTimer.start();
@@ -1333,20 +1467,24 @@ void FrogAPI::FrogFile( const string& infilename,
doc.save( xmlOutFile, options.doKanon );
LOG << "resulting FoLiA doc saved in " << xmlOutFile << endl;
}
- showResults( os, doc );
+ if ( !options.noStdOut ){
+ showResults( os, doc );
+ }
}
else {
ifstream IN( infilename );
timers.reset();
timers.tokTimer.start();
- Document *doc = tokenizer->tokenize( IN );
+ folia::Document *doc = tokenizer->tokenize( IN );
timers.tokTimer.stop();
FrogDoc( *doc );
if ( !xmlOutFile.empty() ){
doc->save( xmlOutFile, options.doKanon );
LOG << "resulting FoLiA doc saved in " << xmlOutFile << endl;
}
- showResults( os, *doc );
+ if ( !options.noStdOut ){
+ showResults( os, *doc );
+ }
delete doc;
}
}
diff --git a/src/Makefile.am b/src/Makefile.am
index 73ce53d..58a45cf 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,5 +1,5 @@
AM_CPPFLAGS = -I@top_srcdir@/include
-AM_CXXFLAGS = -DSYSCONF_PATH=\"$(datadir)\" -std=c++0x # -Weffc++
+AM_CXXFLAGS = -DSYSCONF_PATH=\"$(datadir)\" -std=c++11 -W -Wall -pedantic -g -O3 # -Weffc++
bin_PROGRAMS = frog mbma mblem ner
frog_SOURCES = Frog.cxx
@@ -15,7 +15,9 @@ libfrog_la_SOURCES = FrogAPI.cxx \
mbma_rule.cxx mbma_mod.cxx mbma_brackets.cxx clex.cxx \
mblem_mod.cxx csidp.cxx ckyparser.cxx \
Frog-util.cxx mwu_chunker_mod.cxx Parser.cxx \
- pos_tagger_mod.cxx cgn_tagger_mod.cxx iob_tagger_mod.cxx ner_tagger_mod.cxx \
+ tagger_base.cxx cgn_tagger_mod.cxx \
+ iob_tagger_mod.cxx \
+ ner_tagger_mod.cxx \
ucto_tokenizer_mod.cxx
diff --git a/src/Makefile.in b/src/Makefile.in
index 027c6aa..904dda4 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -92,8 +92,7 @@ host_triplet = @host@
bin_PROGRAMS = frog$(EXEEXT) mbma$(EXEEXT) mblem$(EXEEXT) ner$(EXEEXT)
subdir = src
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \
- $(top_srcdir)/m4/ax_lib_readline.m4 \
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_lib_readline.m4 \
$(top_srcdir)/m4/ax_pthread.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
@@ -137,7 +136,7 @@ LTLIBRARIES = $(lib_LTLIBRARIES)
libfrog_la_LIBADD =
am_libfrog_la_OBJECTS = FrogAPI.lo mbma_rule.lo mbma_mod.lo \
mbma_brackets.lo clex.lo mblem_mod.lo csidp.lo ckyparser.lo \
- Frog-util.lo mwu_chunker_mod.lo Parser.lo pos_tagger_mod.lo \
+ Frog-util.lo mwu_chunker_mod.lo Parser.lo tagger_base.lo \
cgn_tagger_mod.lo iob_tagger_mod.lo ner_tagger_mod.lo \
ucto_tokenizer_mod.lo
libfrog_la_OBJECTS = $(am_libfrog_la_OBJECTS)
@@ -438,13 +437,7 @@ EXEEXT = @EXEEXT@
FGREP = @FGREP@
GREP = @GREP@
ICU_CFLAGS = @ICU_CFLAGS@
-ICU_CONFIG = @ICU_CONFIG@
-ICU_CPPSEARCHPATH = @ICU_CPPSEARCHPATH@
-ICU_CXXFLAGS = @ICU_CXXFLAGS@
-ICU_IOLIBS = @ICU_IOLIBS@
-ICU_LIBPATH = @ICU_LIBPATH@
ICU_LIBS = @ICU_LIBS@
-ICU_VERSION = @ICU_VERSION@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -488,7 +481,6 @@ SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
STRIP = @STRIP@
-SYSCONFDIR = @SYSCONFDIR@
VERSION = @VERSION@
XML2_CFLAGS = @XML2_CFLAGS@
XML2_LIBS = @XML2_LIBS@
@@ -541,10 +533,10 @@ mbt_LIBS = @mbt_LIBS@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
-pkgconfigpath = @pkgconfigpath@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
+runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
@@ -560,7 +552,7 @@ top_srcdir = @top_srcdir@
ucto_CFLAGS = @ucto_CFLAGS@
ucto_LIBS = @ucto_LIBS@
AM_CPPFLAGS = -I@top_srcdir@/include
-AM_CXXFLAGS = -DSYSCONF_PATH=\"$(datadir)\" -std=c++0x # -Weffc++
+AM_CXXFLAGS = -DSYSCONF_PATH=\"$(datadir)\" -std=c++11 -W -Wall -pedantic -g -O3 # -Weffc++
frog_SOURCES = Frog.cxx
mbma_SOURCES = mbma_prog.cxx
mblem_SOURCES = mblem_prog.cxx
@@ -572,7 +564,9 @@ libfrog_la_SOURCES = FrogAPI.cxx \
mbma_rule.cxx mbma_mod.cxx mbma_brackets.cxx clex.cxx \
mblem_mod.cxx csidp.cxx ckyparser.cxx \
Frog-util.cxx mwu_chunker_mod.cxx Parser.cxx \
- pos_tagger_mod.cxx cgn_tagger_mod.cxx iob_tagger_mod.cxx ner_tagger_mod.cxx \
+ tagger_base.cxx cgn_tagger_mod.cxx \
+ iob_tagger_mod.cxx \
+ ner_tagger_mod.cxx \
ucto_tokenizer_mod.cxx
TESTS = tst.sh
@@ -739,7 +733,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mwu_chunker_mod.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ner_prog.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ner_tagger_mod.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pos_tagger_mod.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tagger_base.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ucto_tokenizer_mod.Plo@am__quote@
.cxx.o:
diff --git a/src/Parser.cxx b/src/Parser.cxx
index 39360c9..8ff7864 100644
--- a/src/Parser.cxx
+++ b/src/Parser.cxx
@@ -35,6 +35,7 @@
#include <string>
#include <iostream>
#include <fstream>
+#include <algorithm>
#include "config.h"
#include "ticcutils/Configuration.h"
@@ -45,7 +46,6 @@
#include "frog/csidp.h"
using namespace std;
-using namespace folia;
using TiCC::operator<<;
@@ -62,7 +62,7 @@ struct parseData {
vector<string> words;
vector<string> heads;
vector<string> mods;
- vector<vector<Word*> > mwus;
+ vector<vector<folia::Word*> > mwus;
};
ostream& operator<<( ostream& os, const parseData& pd ){
@@ -92,7 +92,16 @@ bool Parser::init( const TiCC::Configuration& configuration ){
bool problem = false;
LOG << "initiating parser ... " << endl;
string cDir = configuration.configDir();
- string val = configuration.lookUp( "version", "parser" );
+ string val = configuration.lookUp( "debug", "parser" );
+ if ( !val.empty() ){
+ int level;
+ if ( TiCC::stringTo<int>( val, level ) ){
+ if ( level > 5 ){
+ parseLog->setlevel( LogLevel::LogDebug );
+ }
+ }
+ }
+ val = configuration.lookUp( "version", "parser" );
if ( val.empty() ){
version = "1.0";
}
@@ -181,6 +190,13 @@ bool Parser::init( const TiCC::Configuration& configuration ){
return false;
}
+ string cls = configuration.lookUp( "outputclass" );
+ if ( !cls.empty() ){
+ textclass = cls;
+ }
+ else {
+ textclass = "current";
+ }
bool happy = true;
pairs = new Timbl::TimblAPI( pairsOptions );
if ( pairs->Valid() ){
@@ -227,11 +243,11 @@ Parser::~Parser(){
delete filter;
}
-static vector<Word *> lookup( Word *word,
- const vector<Entity*>& entities ){
- vector<Word*> vec;
+static vector<folia::Word *> lookup( folia::Word *word,
+ const vector<folia::Entity*>& entities ){
+ vector<folia::Word*> vec;
for ( const auto& ent : entities ){
- vec = ent->select<Word>();
+ vec = ent->select<folia::Word>();
if ( !vec.empty() ){
if ( vec[0]->id() == word->id() ) {
// cerr << "found " << vec << endl;
@@ -796,44 +812,47 @@ vector<string> Parser::createRelInstances( const parseData& pd ){
}
-void Parser::addDeclaration( Document& doc ) const {
-#pragma omp critical(foliaupdate)
+void Parser::addDeclaration( folia::Document& doc ) const {
+#pragma omp critical (foliaupdate)
{
- doc.declare( AnnotationType::DEPENDENCY, dep_tagset,
+ doc.declare( folia::AnnotationType::DEPENDENCY, dep_tagset,
"annotator='frog-depparse-" + version
+ "', annotatortype='auto'");
}
}
-parseData Parser::prepareParse( const vector<Word *>& fwords ){
+parseData Parser::prepareParse( const vector<folia::Word *>& fwords ){
parseData pd;
- Sentence *sent = 0;
- vector<Entity*> entities;
-#pragma omp critical(foliaupdate)
+ folia::Sentence *sent = 0;
+ vector<folia::Entity*> entities;
+#pragma omp critical (foliaupdate)
{
sent = fwords[0]->sentence();
- entities = sent->select<Entity>(MWU_tagset);
+ entities = sent->select<folia::Entity>(MWU_tagset);
}
for ( size_t i=0; i < fwords.size(); ++i ){
- Word *word = fwords[i];
- vector<Word*> mwuv = lookup( word, entities );
+ folia::Word *word = fwords[i];
+ vector<folia::Word*> mwuv = lookup( word, entities );
if ( !mwuv.empty() ){
string multi_word;
string head;
string mod;
for ( const auto& mwu : mwuv ){
UnicodeString tmp;
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
- tmp = mwu->text();
+ tmp = mwu->text( textclass );
}
if ( filter )
tmp = filter->filter( tmp );
- string ms = UnicodeToUTF8( tmp );
+ string ms = folia::UnicodeToUTF8( tmp );
+ // the word may contain spaces, remove them all!
+ ms.erase(remove_if(ms.begin(), ms.end(), ::isspace), ms.end());
multi_word += ms;
- PosAnnotation *postag = mwu->annotation<PosAnnotation>( POS_tagset );
+ folia::PosAnnotation *postag
+ = mwu->annotation<folia::PosAnnotation>( POS_tagset );
head += postag->feat("head");
- vector<Feature*> feats = postag->select<Feature>();
+ vector<folia::Feature*> feats = postag->select<folia::Feature>();
for ( const auto& feat : feats ){
mod += feat->cls();
if ( &feat != &feats.back() ){
@@ -854,19 +873,22 @@ parseData Parser::prepareParse( const vector<Word *>& fwords ){
}
else {
UnicodeString tmp;
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
- tmp = word->text();
+ tmp = word->text( textclass );
}
if ( filter )
tmp = filter->filter( tmp );
- string ms = UnicodeToUTF8( tmp );
+ string ms = folia::UnicodeToUTF8( tmp );
+ // the word may contain spaces, remove them all!
+ ms.erase(remove_if(ms.begin(), ms.end(), ::isspace), ms.end());
pd.words.push_back( ms );
- PosAnnotation *postag = word->annotation<PosAnnotation>( POS_tagset );
+ folia::PosAnnotation *postag
+ = word->annotation<folia::PosAnnotation>( POS_tagset );
string head = postag->feat("head");
pd.heads.push_back( head );
string mod;
- vector<Feature*> feats = postag->select<Feature>();
+ vector<folia::Feature*> feats = postag->select<folia::Feature>();
if ( feats.empty() ){
mod = "__";
}
@@ -879,7 +901,7 @@ parseData Parser::prepareParse( const vector<Word *>& fwords ){
}
}
pd.mods.push_back( mod );
- vector<Word*> vec;
+ vector<folia::Word*> vec;
vec.push_back(word);
pd.mwus.push_back( vec );
}
@@ -887,36 +909,45 @@ parseData Parser::prepareParse( const vector<Word *>& fwords ){
return pd;
}
-void appendResult( const vector<Word *>& words,
+void appendResult( const vector<folia::Word *>& words,
parseData& pd,
const string& tagset,
+ const string& textclass,
const vector<int>& nums,
const vector<string>& roles ){
- Sentence *sent = words[0]->sentence();
- KWargs args;
+ folia::Sentence *sent = 0;
+#pragma omp critical (foliaupdate)
+ {
+ sent = words[0]->sentence();
+ }
+ folia::DependenciesLayer *dl = 0;
+ folia::KWargs args;
args["generate_id"] = sent->id();
args["set"] = tagset;
- DependenciesLayer *dl = new DependenciesLayer( args, sent->doc() );
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
+ dl = new folia::DependenciesLayer( args, sent->doc() );
sent->append( dl );
}
for ( size_t i=0; i < nums.size(); ++i ){
if ( nums[i] != 0 ){
- KWargs args;
+ folia::KWargs args;
args["generate_id"] = dl->id();
args["class"] = roles[i];
args["set"] = tagset;
-#pragma omp critical(foliaupdate)
+ if ( textclass != "current" ){
+ args["textclass"] = textclass;
+ }
+#pragma omp critical (foliaupdate)
{
- Dependency *d = new Dependency( args, sent->doc() );
+ folia::Dependency *d = new folia::Dependency( args, sent->doc() );
dl->append( d );
- Headspan *dh = new Headspan();
+ folia::Headspan *dh = new folia::Headspan();
for ( const auto& wrd : pd.mwus[nums[i]-1] ){
dh->append( wrd );
}
d->append( dh );
- DependencyDependent *dd = new DependencyDependent();
+ folia::DependencyDependent *dd = new folia::DependencyDependent();
for ( const auto& it : pd.mwus[i] ){
dd->append( it );
}
@@ -926,9 +957,10 @@ void appendResult( const vector<Word *>& words,
}
}
-void appendParseResult( const vector<Word *>& words,
+void appendParseResult( const vector<folia::Word *>& words,
parseData& pd,
const string& tagset,
+ const string& textclass,
const vector<parsrel>& res ){
vector<int> nums;
vector<string> roles;
@@ -936,7 +968,7 @@ void appendParseResult( const vector<Word *>& words,
nums.push_back( it.head );
roles.push_back( it.deprel );
}
- appendResult( words, pd, tagset, nums, roles );
+ appendResult( words, pd, tagset, textclass, nums, roles );
}
void timbl( Timbl::TimblAPI* tim,
@@ -950,7 +982,7 @@ void timbl( Timbl::TimblAPI* tim,
}
}
-void Parser::Parse( const vector<Word*>& words,
+void Parser::Parse( const vector<folia::Word*>& words,
TimerBlock& timers ){
timers.parseTimer.start();
if ( !isInit ){
@@ -997,8 +1029,9 @@ void Parser::Parse( const vector<Word*>& words,
r_results,
d_results,
pd.words.size(),
- maxDepSpan );
+ maxDepSpan,
+ parseLog );
timers.csiTimer.stop();
- appendParseResult( words, pd, dep_tagset, res );
+ appendParseResult( words, pd, dep_tagset, textclass, res );
timers.parseTimer.stop();
}
diff --git a/src/cgn_tagger_mod.cxx b/src/cgn_tagger_mod.cxx
index b60b99c..10f5264 100644
--- a/src/cgn_tagger_mod.cxx
+++ b/src/cgn_tagger_mod.cxx
@@ -35,11 +35,9 @@
#include "frog/cgn_tagger_mod.h"
using namespace std;
-using namespace folia;
-using namespace TiCC;
using namespace Tagger;
-#define LOG *Log(tag_log)
+#define LOG *TiCC::Log(tag_log)
void CGNTagger::fillSubSetTable(){
// should become a config file!
@@ -167,24 +165,35 @@ void CGNTagger::fillSubSetTable(){
}
-bool CGNTagger::init( const Configuration& config ){
- if ( debug ){
+bool CGNTagger::init( const TiCC::Configuration& config ){
+ if ( debug ){
LOG << "INIT CGN Tagger." << endl;
}
- if ( POSTagger::init( config ) ){
+ if ( BaseTagger::init( config ) ){
fillSubSetTable();
if ( debug ){
- LOG << "DONE CGN Tagger." << endl;
+ LOG << "DONE Init CGN Tagger." << endl;
}
return true;
}
return false;
}
+void CGNTagger::addDeclaration( folia::Document& doc ) const {
+#pragma omp critical (foliaupdate)
+ {
+ doc.declare( folia::AnnotationType::POS,
+ tagset,
+ "annotator='frog-mbpos-" + version
+ + "', annotatortype='auto', datetime='" + getTime() + "'");
+ }
+}
+
string CGNTagger::getSubSet( const string& val, const string& head ){
auto it = cgnSubSets.find( val );
- if ( it == cgnSubSets.end() )
+ if ( it == cgnSubSets.end() ){
throw folia::ValueError( "unknown cgn subset for class: '" + val + "'" );
+ }
string result;
while ( it != cgnSubSets.upper_bound(val) ){
result = it->second;
@@ -208,30 +217,71 @@ string CGNTagger::getSubSet( const string& val, const string& head ){
"' whithin the constraints for '" + head + "'" );
}
-void CGNTagger::post_process( const vector<Word *>& words ){
+void CGNTagger::addTag( folia::Word *word,
+ const string& inputTag,
+ double confidence ){
+ string pos_tag = inputTag;
+ string ucto_class = word->cls();
+ if ( debug ){
+ LOG << "lookup ucto class= " << ucto_class << endl;
+ }
+ auto const tt = token_tag_map.find( ucto_class );
+ if ( tt != token_tag_map.end() ){
+ if ( debug ){
+ LOG << "found translation ucto class= " << ucto_class
+ << " to POS-Tag=" << tt->second << endl;
+ }
+ pos_tag = tt->second;
+ confidence = 1.0;
+ }
+ folia::KWargs args;
+ args["set"] = tagset;
+ args["class"] = pos_tag;
+ args["confidence"]= TiCC::toString(confidence);
+ if ( textclass != "current" ){
+ args["textclass"] = textclass;
+ }
+#pragma omp critical (foliaupdate)
+ {
+ word->addPosAnnotation( args );
+ }
+}
+
+
+void CGNTagger::post_process( const vector<folia::Word *>& words ){
+ for ( size_t i=0; i < _tag_result.size(); ++i ){
+ addTag( words[i],
+ _tag_result[i].assignedTag(),
+ _tag_result[i].confidence() );
+ }
for ( auto const& word : words ){
- PosAnnotation *postag = word->annotation<PosAnnotation>( );
+ folia::PosAnnotation *postag = 0;
+#pragma omp critical (foliaupdate)
+ {
+ postag = word->annotation<folia::PosAnnotation>( );
+ }
string cls = postag->cls();
- vector<string> parts;
- TiCC::split_at_first_of( cls, parts, "()" );
+ vector<string> parts = TiCC::split_at_first_of( cls, "()" );
string head = parts[0];
- KWargs args;
+ folia::KWargs args;
args["class"] = head;
args["set"] = tagset;
folia::Feature *feat = new folia::HeadFeature( args );
- postag->append( feat );
- if ( head == "SPEC" ){
- postag->confidence(1.0);
+#pragma omp critical (foliaupdate)
+ {
+ postag->append( feat );
+ if ( head == "SPEC" ){
+ postag->confidence(1.0);
+ }
}
if ( parts.size() > 1 ){
- vector<string> tagParts;
- TiCC::split_at( parts[1], tagParts, "," );
+ vector<string> tagParts = TiCC::split_at( parts[1], "," );
for ( auto const& part : tagParts ){
- KWargs args;
+ folia::KWargs args;
args["set"] = tagset;
args["subset"] = getSubSet( part, head );
args["class"] = part;
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
folia::Feature *feat = new folia::Feature( args );
postag->append( feat );
@@ -240,11 +290,3 @@ void CGNTagger::post_process( const vector<Word *>& words ){
}
}
}
-
-void CGNTagger::Classify( const vector<Word*>& swords ){
- POSTagger::Classify( swords );
- if ( debug ){
- LOG << "POS Classify done:" << endl;
- }
- post_process( swords );
-}
diff --git a/src/ckyparser.cxx b/src/ckyparser.cxx
index 8c84be5..97c6e53 100644
--- a/src/ckyparser.cxx
+++ b/src/ckyparser.cxx
@@ -30,14 +30,17 @@
*/
#include <iostream>
#include <vector>
-#include <map>
#include <string>
#include "ticcutils/PrettyPrint.h"
+#include "ticcutils/LogStream.h"
#include "frog/ckyparser.h"
using namespace std;
+#define LOG *TiCC::Log(ckyLog)
+#define DBG *TiCC::Dbg(ckyLog)
+
ostream& operator<<( ostream& os, const Constraint* c ){
if ( c ){
c->put( os );
@@ -51,6 +54,8 @@ ostream& operator<<( ostream& os, const Constraint& c ){
return os << &c;
}
+using TiCC::operator<<;
+
void HasIncomingRel::put( ostream& os ) const {
Constraint::put( os );
os << " incoming rel=" << relType;
@@ -68,7 +73,9 @@ void DependencyDirection::put( ostream & os ) const {
}
-CKYParser::CKYParser( size_t num, const vector<const Constraint*>& constraints ):
+CKYParser::CKYParser( size_t num,
+ const vector<const Constraint*>& constraints,
+ TiCC::LogStream* log ):
numTokens(num)
{
inDepConstraints.resize( numTokens + 1 );
@@ -84,7 +91,7 @@ CKYParser::CKYParser( size_t num, const vector<const Constraint*>& constraints )
for ( const auto& constraint : constraints ){
addConstraint( constraint );
}
-
+ ckyLog = new TiCC::LogStream( log, "cky:" );
}
@@ -100,7 +107,7 @@ void CKYParser::addConstraint( const Constraint *c ){
outDepConstraints[c->tIndex()].push_back( c );
break;
default:
- cerr << "UNSUPPORTED constraint type" << endl;
+ LOG << "UNSUPPORTED constraint type" << endl;
abort();
}
}
@@ -111,25 +118,25 @@ string CKYParser::bestEdge( const SubTree& leftSubtree,
set<const Constraint*>& bestConstraints,
double& bestScore ){
bestConstraints.clear();
- // cerr << "BESTEDGE " << headIndex << " <> " << depIndex << endl;
+ DBG << "BESTEDGE " << headIndex << " <> " << depIndex << endl;
if ( headIndex == 0 ){
bestScore = 0.0;
for ( auto const& constraint : outDepConstraints[depIndex] ){
- // cerr << "CHECK " << constraint << endl;
+ DBG << "CHECK " << constraint << endl;
if ( constraint->direct() == dirType::ROOT ){
- // cerr << "head outdep matched " << constraint << endl;
+ DBG << "head outdep matched " << constraint << endl;
bestScore = constraint->wght();
bestConstraints.insert( constraint );
}
}
string label = "ROOT";
for ( auto const& constraint : edgeConstraints[depIndex][0] ){
- // cerr << "head edge matched " << constraint << endl;
+ DBG << "head edge matched " << constraint << endl;
bestScore += constraint->wght();
bestConstraints.insert( constraint );
label = constraint->rel();
}
- // cerr << "best HEAD==>" << label << " " << bestScore << " " << bestConstraints << endl;
+ DBG << "best HEAD==>" << label << " " << bestScore << " " << bestConstraints << endl;
return label;
}
bestScore = -0.5;
@@ -143,7 +150,7 @@ string CKYParser::bestEdge( const SubTree& leftSubtree,
if ( constraint->rel() == my_label &&
leftSubtree.satisfiedConstraints.find( constraint ) == leftSubtree.satisfiedConstraints.end() &&
rightSubtree.satisfiedConstraints.find( constraint ) == rightSubtree.satisfiedConstraints.end() ){
- // cerr << "inDep matched: " << constraint << endl;
+ DBG << "inDep matched: " << constraint << endl;
my_score += constraint->wght();
my_constraints.insert(constraint);
}
@@ -156,7 +163,7 @@ string CKYParser::bestEdge( const SubTree& leftSubtree,
headIndex > depIndex ) )
&& leftSubtree.satisfiedConstraints.find( constraint ) == leftSubtree.satisfiedConstraints.end()
&& rightSubtree.satisfiedConstraints.find( constraint ) == rightSubtree.satisfiedConstraints.end() ){
- // cerr << "outdep matched: " << constraint << endl;
+ DBG << "outdep matched: " << constraint << endl;
my_score += constraint->wght();
my_constraints.insert(constraint);
}
@@ -165,10 +172,10 @@ string CKYParser::bestEdge( const SubTree& leftSubtree,
bestScore = my_score;
bestLabel = my_label;
bestConstraints = std::move(my_constraints);
- // cerr << "UPDATE BEst " << bestLabel << " " << bestScore << " " << bestConstraints << endl;
+ DBG << "UPDATE BEst " << bestLabel << " " << bestScore << " " << bestConstraints << endl;
}
}
- // cerr << "GRAND TOTAL " << bestLabel << " " << bestScore << " " << bestConstraints << endl;
+ DBG << "GRAND TOTAL " << bestLabel << " " << bestScore << " " << bestConstraints << endl;
return bestLabel;
}
@@ -186,7 +193,7 @@ void CKYParser::parse(){
string label = bestEdge( chart[s][r].r_True,
chart[r+1][t].l_True,
t, s, constraints, edgeScore );
- // cerr << "STEP 1 BEST EDGE==> " << label << " ( " << edgeScore << ")" << endl;
+ DBG << "STEP 1 BEST EDGE==> " << label << " ( " << edgeScore << ")" << endl;
double score = chart[s][r].r_True.score() + chart[r+1][t].l_True.score() + edgeScore;
if ( score > bestScore ){
bestScore = score;
@@ -195,7 +202,7 @@ void CKYParser::parse(){
bestConstraints = std::move(constraints);
}
}
- // cerr << "STEP 1 ADD: " << bestScore <<"-" << bestI << "-" << bestL << endl;
+ DBG << "STEP 1 ADD: " << bestScore <<"-" << bestI << "-" << bestL << endl;
chart[s][t].l_False = SubTree( bestScore, bestI, bestL );
chart[s][t].l_False.satisfiedConstraints.insert( chart[s][bestI].r_True.satisfiedConstraints.begin(), chart[s][bestI].r_True.satisfiedConstraints.end() );
chart[s][t].l_False.satisfiedConstraints.insert( chart[bestI+1][t].l_True.satisfiedConstraints.begin(), chart[bestI+1][t].l_True.satisfiedConstraints.end() );
@@ -211,7 +218,7 @@ void CKYParser::parse(){
string label = bestEdge( chart[s][r].r_True,
chart[r+1][t].l_True,
s, t, constraints, edgeScore );
- // cerr << "STEP 2 BEST EDGE==> " << label << " ( " << edgeScore << ")" << endl;
+ DBG << "STEP 2 BEST EDGE==> " << label << " ( " << edgeScore << ")" << endl;
double score = chart[s][r].r_True.score() + chart[r+1][t].l_True.score() + edgeScore;
if ( score > bestScore ){
bestScore = score;
@@ -221,7 +228,7 @@ void CKYParser::parse(){
}
}
- // cerr << "STEP 2 ADD: " << bestScore <<"-" << bestI << "-" << bestL << endl;
+ DBG << "STEP 2 ADD: " << bestScore <<"-" << bestI << "-" << bestL << endl;
chart[s][t].r_False = SubTree( bestScore, bestI, bestL );
chart[s][t].r_False.satisfiedConstraints.insert( chart[s][bestI].r_True.satisfiedConstraints.begin(), chart[s][bestI].r_True.satisfiedConstraints.end() );
chart[s][t].r_False.satisfiedConstraints.insert( chart[bestI+1][t].l_True.satisfiedConstraints.begin(), chart[bestI+1][t].l_True.satisfiedConstraints.end() );
@@ -237,7 +244,7 @@ void CKYParser::parse(){
bestI = r;
}
}
- // cerr << "STEP 3 ADD: " << bestScore <<"-" << bestI << "-" << bestL << endl;
+ DBG << "STEP 3 ADD: " << bestScore <<"-" << bestI << "-" << bestL << endl;
chart[s][t].l_True = SubTree( bestScore, bestI, bestL );
chart[s][t].l_True.satisfiedConstraints.insert( chart[s][bestI].l_True.satisfiedConstraints.begin(), chart[s][bestI].l_True.satisfiedConstraints.end() );
chart[s][t].l_True.satisfiedConstraints.insert( chart[bestI][t].l_False.satisfiedConstraints.begin(), chart[bestI][t].l_False.satisfiedConstraints.end() );
@@ -253,7 +260,7 @@ void CKYParser::parse(){
}
}
- // cerr << "STEP 4 ADD: " << bestScore <<"-" << bestI << "-" << bestL << endl;
+ DBG << "STEP 4 ADD: " << bestScore <<"-" << bestI << "-" << bestL << endl;
chart[s][t].r_True = SubTree( bestScore, bestI, bestL );
chart[s][t].r_True.satisfiedConstraints.insert( chart[s][bestI].r_False.satisfiedConstraints.begin(), chart[s][bestI].r_False.satisfiedConstraints.end() );
chart[s][t].r_True.satisfiedConstraints.insert( chart[bestI][t].r_True.satisfiedConstraints.begin(), chart[bestI][t].r_True.satisfiedConstraints.end() );
diff --git a/src/csidp.cxx b/src/csidp.cxx
index 4ab1ba0..ca3fded 100644
--- a/src/csidp.cxx
+++ b/src/csidp.cxx
@@ -32,44 +32,47 @@
#include <fstream>
#include <string>
#include <vector>
-#include <map>
+#include <unordered_map>
#include "ticcutils/StringOps.h"
#include "ticcutils/PrettyPrint.h"
+#include "ticcutils/LogStream.h"
#include "frog/csidp.h"
using namespace std;
-void split_dist( const vector< pair<string,double>>& dist,
- map<string,double>& result ){
- result.clear();
+#define LOG *TiCC::Log(log)
+#define DBG *TiCC::Dbg(log)
+
+unordered_map<string,double> split_dist( const vector< pair<string,double>>& dist ){
+ unordered_map<string,double> result;
for( const auto& it : dist ){
double d = it.second;
- vector<string> tags;
- TiCC::split_at( it.first, tags, "|" );
+ vector<string> tags = TiCC::split_at( it.first, "|" );
for( const auto& t : tags ){
result[t] += d;
}
}
+ return result;
}
vector<const Constraint*> formulateWCSP( const vector<timbl_result>& d_res,
- const vector<timbl_result>& r_res,
- const vector<timbl_result>& p_res,
- size_t sent_len,
- size_t maxDist ){
+ const vector<timbl_result>& r_res,
+ const vector<timbl_result>& p_res,
+ size_t sent_len,
+ size_t maxDist,
+ TiCC::LogStream *log ){
vector<const Constraint*> constraints;
vector<timbl_result>::const_iterator pit = p_res.begin();
for ( size_t dependent_id = 1;
dependent_id <= sent_len;
++dependent_id ){
- int headId = 0;
string top_class = pit->cls();
double conf = pit->confidence();
++pit;
- // cerr << "class=" << top_class << " met conf " << conf << endl;
+ DBG << "class=" << top_class << " met conf " << conf << endl;
if ( top_class != "__" ){
- constraints.push_back(new HasDependency(dependent_id,headId,top_class,conf));
+ constraints.push_back( new HasDependency( dependent_id, 0 ,top_class, conf ) );
}
}
@@ -83,13 +86,13 @@ vector<const Constraint*> formulateWCSP( const vector<timbl_result>& d_res,
if ( diff != 0 && diff <= maxDist ){
string line;
if ( pit == p_res.end() ){
- cerr << "OEPS p_res leeg? " << endl;
+ LOG << "OEPS p_res leeg? " << endl;
break;
}
string top_class = pit->cls();
double conf = pit->confidence();
++pit;
- // cerr << "class=" << top_class << " met conf " << conf << endl;
+ DBG << "class=" << top_class << " met conf " << conf << endl;
if ( top_class != "__" ){
constraints.push_back( new HasDependency(dependent_id,headId,top_class,conf));
}
@@ -115,10 +118,8 @@ vector<const Constraint*> formulateWCSP( const vector<timbl_result>& d_res,
}
string top_class = rit->cls();
if ( top_class != "__" ){
- map<string,double> splits;
- split_dist( rit->dist(), splits );
- vector<string> clss;
- TiCC::split_at( top_class, clss, "|" );
+ unordered_map<string,double> splits = split_dist( rit->dist() );
+ vector<string> clss = TiCC::split_at( top_class, "|" );
for( const auto& rel : clss ){
constraints.push_back( new HasIncomingRel( rel_id, rel, splits[rel] ) );
}
@@ -144,10 +145,11 @@ vector<parsrel> parse( const vector<timbl_result>& p_res,
const vector<timbl_result>& r_res,
const vector<timbl_result>& d_res,
size_t parse_size,
- int maxDist ){
+ int maxDist,
+ TiCC::LogStream *log ){
vector<const Constraint*> constraints
- = formulateWCSP( d_res, r_res, p_res, parse_size, maxDist );
- CKYParser parser( parse_size, constraints );
+ = formulateWCSP( d_res, r_res, p_res, parse_size, maxDist, log );
+ CKYParser parser( parse_size, constraints, log );
parser.parse();
vector<parsrel> result( parse_size );
parser.rightComplete(0, parse_size, result );
diff --git a/src/iob_tagger_mod.cxx b/src/iob_tagger_mod.cxx
index 50a022e..436769f 100644
--- a/src/iob_tagger_mod.cxx
+++ b/src/iob_tagger_mod.cxx
@@ -35,116 +35,40 @@
#include "frog/iob_tagger_mod.h"
using namespace std;
-using namespace folia;
-using namespace TiCC;
using namespace Tagger;
-#define LOG *Log(iobLog)
+#define LOG *TiCC::Log(tag_log)
-IOBTagger::IOBTagger(TiCC::LogStream * logstream){
- tagger = 0;
- iobLog = new LogStream( logstream, "iob-" );
- filter = 0;
-}
-
-IOBTagger::~IOBTagger(){
- delete tagger;
- delete iobLog;
- delete filter;
-}
+const string cgn_tagset = "http://ilk.uvt.nl/folia/sets/frog-mbpos-cgn";
-bool IOBTagger::init( const Configuration& config ){
- debug = 0;
- string val = config.lookUp( "debug", "IOB" );
- if ( val.empty() ){
- val = config.lookUp( "debug" );
- }
- if ( !val.empty() ){
- debug = TiCC::stringTo<int>( val );
- }
- switch ( debug ){
- case 0:
- case 1:
- iobLog->setlevel(LogNormal);
- break;
- case 2:
- case 3:
- case 4:
- iobLog->setlevel(LogDebug);
- break;
- case 5:
- case 6:
- case 7:
- iobLog->setlevel(LogHeavy);
- break;
- default:
- iobLog->setlevel(LogExtreme);
- }
- if (debug) {
- LOG << "IOB Chunker Init" << endl;
- }
- if ( tagger != 0 ){
- LOG << "IOBTagger is already initialized!" << endl;
- return false;
- }
- val = config.lookUp( "settings", "IOB" );
- if ( val.empty() ){
- LOG << "Unable to find settings for IOB" << endl;
- return false;
- }
- string settings;
- if ( val[0] == '/' ) {
- // an absolute path
- settings = val;
- }
- else {
- settings = config.configDir() + val;
- }
-
- val = config.lookUp( "version", "IOB" );
- if ( val.empty() ){
- version = "1.0";
- }
- else {
- version = val;
- }
- val = config.lookUp( "set", "IOB" );
- if ( val.empty() ){
- tagset = "http://ilk.uvt.nl/folia/sets/frog-chunker-nl";
- }
- else {
- tagset = val;
- }
- string charFile = config.lookUp( "char_filter_file", "IOB" );
- if ( charFile.empty() )
- charFile = config.lookUp( "char_filter_file" );
- if ( !charFile.empty() ){
- charFile = prefix( config.configDir(), charFile );
- filter = new Tokenizer::UnicodeFilter();
- filter->fill( charFile );
- }
- string init = "-s " + settings + " -vcf";
- tagger = new MbtAPI( init, *iobLog );
- return tagger->isInit();
+bool IOBTagger::init( const TiCC::Configuration& config ){
+ return BaseTagger::init( config );
}
-void IOBTagger::addChunk( ChunkingLayer *chunks,
- const vector<Word*>& words,
+void IOBTagger::addChunk( folia::ChunkingLayer *chunks,
+ const vector<folia::Word*>& words,
const vector<double>& confs,
- const string& IOB ){
+ const string& IOB,
+ const string& textclass ){
double conf = 1;
for ( auto const& val : confs )
conf *= val;
- KWargs args;
+ folia::KWargs args;
args["class"] = IOB;
args["set"] = tagset;
- args["confidence"] = toString(conf);
- args["generate_id"] = chunks->id();
- Chunk *chunk = 0;
+ args["confidence"] = TiCC::toString(conf);
+ string parent_id = chunks->id();
+ if ( !parent_id.empty() ){
+ args["generate_id"] = chunks->id();
+ }
+ if ( textclass != "current" ){
+ args["textclass"] = textclass;
+ }
+ folia::Chunk *chunk = 0;
#pragma omp critical(foliaupdate)
{
try {
- chunk = new Chunk( args, chunks->doc() );
+ chunk = new folia::Chunk( args, chunks->doc() );
chunks->append( chunk );
}
catch ( exception& e ){
@@ -153,7 +77,7 @@ void IOBTagger::addChunk( ChunkingLayer *chunks,
}
}
for ( const auto& word : words ){
- if ( word->isinstance(PlaceHolder_t) ){
+ if ( word->isinstance(folia::PlaceHolder_t) ){
continue;
}
#pragma omp critical(foliaupdate)
@@ -163,61 +87,54 @@ void IOBTagger::addChunk( ChunkingLayer *chunks,
}
}
-void IOBTagger::addIOBTags( const vector<Word*>& words,
+void IOBTagger::addIOBTags( const vector<folia::Word*>& words,
const vector<string>& tags,
const vector<double>& confs ){
if ( words.empty() ){
return;
}
- ChunkingLayer *el = 0;
+ folia::ChunkingLayer *el = 0;
#pragma omp critical(foliaupdate)
{
- Sentence *sent = words[0]->sentence();
+ folia::Sentence *sent = words[0]->sentence();
try {
- el = sent->annotation<ChunkingLayer>(tagset);
+ el = sent->annotation<folia::ChunkingLayer>(tagset);
}
catch(...){
- KWargs args;
+ folia::KWargs args;
args["generate_id"] = sent->id();
args["set"] = tagset;
- el = new ChunkingLayer( args, sent->doc() );
+ el = new folia::ChunkingLayer( args, sent->doc() );
sent->append( el );
}
}
- vector<Word*> stack;
+ vector<folia::Word*> stack;
vector<double> dstack;
string curIOB;
- for ( size_t i=0; i < tags.size(); ++i ){
- if (debug){
- LOG << "tag = " << tags[i] << endl;
- }
- vector<string> tagwords;
- size_t num_words = TiCC::split_at( tags[i], tagwords, "_" );
- if ( num_words != 2 ){
- LOG << "expected <POS>_<IOB>, got: " << tags[i] << endl;
- throw;
- }
+ int i = 0;
+ for ( const auto& tag : tags ){
vector<string> iob;
- if (debug){
- LOG << "IOB = " << tagwords[1] << endl;
+ if ( debug){
+ LOG << "word=" << words[i]->text() << " IOB TAG = " << tag << endl;
}
- if ( tagwords[1] == "O" ){
+ if ( tag == "O" ){
if ( !stack.empty() ){
- if (debug) {
+ if ( debug) {
LOG << "O spit out " << curIOB << endl;
using TiCC::operator<<;
LOG << "spit out " << stack << endl;
}
- addChunk( el, stack, dstack, curIOB );
+ addChunk( el, stack, dstack, curIOB, textclass );
dstack.clear();
stack.clear();
}
+ ++i;
continue;
}
else {
- num_words = TiCC::split_at( tagwords[1], iob, "-" );
+ int num_words = TiCC::split_at( tag, iob, "-" );
if ( num_words != 2 ){
- LOG << "expected <IOB>-tag, got: " << tagwords[1] << endl;
+ LOG << "expected <IOB>-tag, got: " << tag << endl;
throw;
}
}
@@ -232,7 +149,7 @@ void IOBTagger::addIOBTags( const vector<Word*>& words,
using TiCC::operator<<;
LOG << "spit out " << stack << endl;
}
- addChunk( el, stack, dstack, curIOB );
+ addChunk( el, stack, dstack, curIOB, textclass );
dstack.clear();
stack.clear();
}
@@ -240,6 +157,7 @@ void IOBTagger::addIOBTags( const vector<Word*>& words,
}
dstack.push_back( confs[i] );
stack.push_back( words[i] );
+ ++i;
}
if ( !stack.empty() ){
if ( debug ){
@@ -247,66 +165,65 @@ void IOBTagger::addIOBTags( const vector<Word*>& words,
using TiCC::operator<<;
LOG << "spit out " << stack << endl;
}
- addChunk( el, stack, dstack, curIOB );
+ addChunk( el, stack, dstack, curIOB, textclass );
}
}
-void IOBTagger::addDeclaration( Document& doc ) const {
+void IOBTagger::addDeclaration( folia::Document& doc ) const {
#pragma omp critical(foliaupdate)
{
- doc.declare( AnnotationType::CHUNKING,
+ doc.declare( folia::AnnotationType::CHUNKING,
tagset,
"annotator='frog-chunker-" + version
+ "', annotatortype='auto', datetime='" + getTime() + "'");
}
}
-void IOBTagger::Classify( const vector<Word *>& swords ){
+void IOBTagger::Classify( const vector<folia::Word *>& swords ){
if ( !swords.empty() ) {
- string sentence; // the tagger needs the whole sentence
- for ( const auto& sword : swords ){
- UnicodeString word;
-#pragma omp critical(foliaupdate)
- {
- word = sword->text();
+ vector<string> words;
+ vector<string> ptags;
+ extract_words_tags( swords, cgn_tagset, words, ptags );
+ string text_block;
+ string prev = "_";
+ for ( size_t i=0; i < swords.size(); ++i ){
+ string word = words[i];
+ string pos = ptags[i];
+ text_block += word + "\t" + prev + "\t" + pos + "\t";
+ prev = pos;
+ if ( i < swords.size() - 1 ){
+ text_block += ptags[i+1];
}
- if ( filter )
- word = filter->filter( word );
- sentence += UnicodeToUTF8(word);
- if ( &sword != &swords.back() ){
- sentence += " ";
+ else {
+ text_block += "_";
}
+ text_block += "\t??\n";
}
- if (debug){
- LOG << "IOB in: " << sentence << endl;
- }
- vector<TagResult> tagv = tagger->TagLine(sentence);
- if ( tagv.size() != swords.size() ){
- throw runtime_error( "IOB tagger is confused" );
+ if ( debug ){
+ LOG << "TAGGING TEXT_BLOCK\n" << text_block << endl;
}
+ _tag_result = tagger->TagLine( text_block );
if ( debug ){
LOG << "IOB tagger out: " << endl;
- for ( size_t i=0; i < tagv.size(); ++i ){
- LOG << "[" << i << "] : word=" << tagv[i].word()
- << " tag=" << tagv[i].assignedTag()
- << " confidence=" << tagv[i].confidence() << endl;
+ for ( size_t i=0; i < _tag_result.size(); ++i ){
+ LOG << "[" << i << "] : word=" << _tag_result[i].word()
+ << " tag=" << _tag_result[i].assignedTag()
+ << " confidence=" << _tag_result[i].confidence() << endl;
}
}
- vector<double> conf;
- vector<string> tags;
- for ( const auto& tag : tagv ){
- tags.push_back( tag.assignedTag() );
- conf.push_back( tag.confidence() );
- }
- addIOBTags( swords, tags, conf );
}
+ post_process( swords );
}
-string IOBTagger::set_eos_mark( const string& eos ){
- if ( tagger ){
- return tagger->set_eos_mark(eos);
+void IOBTagger::post_process( const std::vector<folia::Word*>& swords ){
+ if ( debug ){
+ LOG << "IOB postprocess...." << endl;
}
- else {
- throw runtime_error( "IOBTagger is not initialized" );
+ vector<double> conf;
+ vector<string> tags;
+ for ( const auto& tag : _tag_result ){
+ tags.push_back( tag.assignedTag() );
+ conf.push_back( tag.confidence() );
}
+ addIOBTags( swords, tags, conf );
}
diff --git a/src/mblem_mod.cxx b/src/mblem_mod.cxx
index b643cf4..a180a6d 100644
--- a/src/mblem_mod.cxx
+++ b/src/mblem_mod.cxx
@@ -29,7 +29,6 @@
*/
-#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
@@ -41,12 +40,10 @@
#include "frog/mblem_mod.h"
using namespace std;
-using namespace TiCC;
-using namespace folia;
-#define LOG *Log(mblemLog)
+#define LOG *TiCC::Log(mblemLog)
-Mblem::Mblem( LogStream *logstream ):
+Mblem::Mblem( TiCC::LogStream *logstream ):
myLex(0),
punctuation( "?...,:;\\'`(){}[]%#+-_=/!" ),
history(20),
@@ -54,7 +51,7 @@ Mblem::Mblem( LogStream *logstream ):
keep_case( false ),
filter(0)
{
- mblemLog = new LogStream( logstream, "mblem" );
+ mblemLog = new TiCC::LogStream( logstream, "mblem" );
}
bool Mblem::fill_ts_map( const string& file ){
@@ -67,8 +64,8 @@ bool Mblem::fill_ts_map( const string& file ){
while ( getline( is, line ) ){
if ( line.empty() || line[0] == '#' )
continue;
- vector<string> parts;
- if ( TiCC::split( line, parts ) != 3 ){
+ vector<string> parts = TiCC::split( line );
+ if ( parts.size() != 3 ){
LOG << "invalid line in: '" << file << "' (expected 3 parts)" << endl;
return false;
}
@@ -80,7 +77,7 @@ bool Mblem::fill_ts_map( const string& file ){
return true;
}
-bool Mblem::init( const Configuration& config ) {
+bool Mblem::init( const TiCC::Configuration& config ) {
LOG << "Initiating lemmatizer..." << endl;
debug = 0;
string val = config.lookUp( "debug", "mblem" );
@@ -139,8 +136,7 @@ bool Mblem::init( const Configuration& config ) {
string one_one_tagS = config.lookUp( "one_one_tags", "mblem" );
if ( !one_one_tagS.empty() ){
- vector<string> tags;
- TiCC::split_at( one_one_tagS, tags, "," );
+ vector<string> tags = TiCC::split_at( one_one_tagS, "," );
for ( auto const& t : tags ){
one_one_tags.insert( t );
}
@@ -148,14 +144,22 @@ bool Mblem::init( const Configuration& config ) {
string par = config.lookUp( "keep_case", "mblem" );
if ( !par.empty() ){
- keep_case = stringTo<bool>( par );
+ keep_case = TiCC::stringTo<bool>( par );
+ }
+
+ string cls = config.lookUp( "outputclass" );
+ if ( !cls.empty() ){
+ textclass = cls;
+ }
+ else {
+ textclass = "current";
}
string opts = config.lookUp( "timblOpts", "mblem" );
if ( opts.empty() )
opts = "-a1";
//make it silent
- opts += " +vs -vf";
+ opts += " +vs -vf -F TABBED";
//Read in (igtree) data
myLex = new Timbl::TimblAPI(opts);
return myLex->GetInstanceBase(treeName);
@@ -179,25 +183,29 @@ string Mblem::make_instance( const UnicodeString& in ) {
size_t j = length - history + i;
if (( i < history - length ) &&
(length<history))
- instance += "= ";
+ instance += "=\t";
else {
instance += in[j];
- instance += ' ';
+ instance += '\t';
}
}
instance += "?";
- string result = UnicodeToUTF8(instance);
+ string result = folia::UnicodeToUTF8(instance);
if ( debug ){
LOG << "inst: " << instance << endl;
}
return result;
}
-void Mblem::addLemma( Word *word, const string& cls ){
- KWargs args;
+void Mblem::addLemma( folia::Word *word, const string& cls ){
+ folia::KWargs args;
args["set"]=tagset;
args["class"]=cls;
-#pragma omp critical(foliaupdate)
+ if ( textclass != "current" ){
+ args["textclass"] = textclass;
+ }
+
+#pragma omp critical (foliaupdate)
{
try {
word->addLemmaAnnotation( args );
@@ -266,10 +274,10 @@ void Mblem::makeUnique( ){
}
}
-void Mblem::getFoLiAResult( Word *word, const UnicodeString& uWord ){
+void Mblem::getFoLiAResult( folia::Word *word, const UnicodeString& uWord ){
if ( mblemResult.empty() ){
// just return the word as a lemma
- string result = UnicodeToUTF8( uWord );
+ string result = folia::UnicodeToUTF8( uWord );
addLemma( word, result );
}
else {
@@ -281,27 +289,32 @@ void Mblem::getFoLiAResult( Word *word, const UnicodeString& uWord ){
}
-void Mblem::addDeclaration( Document& doc ) const {
+void Mblem::addDeclaration( folia::Document& doc ) const {
#pragma omp critical (foliaupdate)
{
- doc.declare( AnnotationType::LEMMA,
+ doc.declare( folia::AnnotationType::LEMMA,
tagset,
"annotator='frog-mblem-" + version
+ "', annotatortype='auto', datetime='" + getTime() + "'");
}
}
-void Mblem::Classify( Word *sword ){
- if ( sword->isinstance(PlaceHolder_t ) )
+void Mblem::Classify( folia::Word *sword ){
+ if ( sword->isinstance( folia::PlaceHolder_t ) )
return;
UnicodeString uword;
string pos;
string token_class;
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
- uword = sword->text();
+ uword = sword->text( textclass );
pos = sword->pos();
- token_class = sword->cls();
+ string txtcls = sword->textclass();
+ if ( txtcls == textclass ){
+ // so only use the word class if the textclass of the word
+ // matches the wanted text
+ token_class = sword->cls();
+ }
}
if (debug){
LOG << "Classify " << uword << "(" << pos << ") ["
@@ -312,7 +325,7 @@ void Mblem::Classify( Word *sword ){
if ( token_class == "ABBREVIATION" ){
// We dont handle ABBREVIATION's so just take the word as such
- string word = UnicodeToUTF8(uword);
+ string word = folia::UnicodeToUTF8(uword);
addLemma( sword, word );
return;
}
@@ -322,15 +335,18 @@ void Mblem::Classify( Word *sword ){
// we have to strip a few letters to get a lemma
auto const& it2 = it1->second.find( token_class );
if ( it2 != it1->second.end() ){
- uword = UnicodeString( uword, 0, uword.length() - it2->second );
- string word = UnicodeToUTF8(uword);
+ UnicodeString uword2 = UnicodeString( uword, 0, uword.length() - it2->second );
+ if ( uword2.isEmpty() ){
+ uword2 = uword;
+ }
+ string word = folia::UnicodeToUTF8(uword2);
addLemma( sword, word );
return;
}
}
if ( one_one_tags.find(pos) != one_one_tags.end() ){
// some tags are just taken as such
- string word = UnicodeToUTF8(uword);
+ string word = folia::UnicodeToUTF8(uword);
addLemma( sword, word );
return;
}
@@ -348,12 +364,12 @@ void Mblem::Classify( const UnicodeString& uWord ){
string inst = make_instance(uWord);
string classString;
myLex->Classify( inst, classString );
- if (debug){
+ if ( debug){
LOG << "class: " << classString << endl;
}
// 1st find all alternatives
vector<string> parts;
- int numParts = split_at( classString, parts, "|" );
+ int numParts = TiCC::split_at( classString, parts, "|" );
if ( numParts < 1 ){
LOG << "no alternatives found" << endl;
}
@@ -370,10 +386,10 @@ void Mblem::Classify( const UnicodeString& uWord ){
}
else {
// some edit info available, like: WW(27)+Dgekomen+Ikomen
- vector<string> edits;
- size_t n = split_at( partS, edits, "+" );
- if ( n < 1 )
+ vector<string> edits = TiCC::split_at( partS, "+" );
+ if ( edits.empty() ){
throw runtime_error( "invalid editstring: " + partS );
+ }
restag = edits[0]; // the first one is the POS tag
UnicodeString insstr;
@@ -385,13 +401,13 @@ void Mblem::Classify( const UnicodeString& uWord ){
}
switch ( edit[0] ){
case 'P':
- prefix = UTF8ToUnicode( edit.substr( 1 ) );
+ prefix = folia::UTF8ToUnicode( edit.substr( 1 ) );
break;
case 'I':
- insstr = UTF8ToUnicode( edit.substr( 1 ) );
+ insstr = folia::UTF8ToUnicode( edit.substr( 1 ) );
break;
case 'D':
- delstr = UTF8ToUnicode( edit.substr( 1 ) );
+ delstr = folia::UTF8ToUnicode( edit.substr( 1 ) );
break;
default:
LOG << "Error: strange value in editstring: " << edit
@@ -478,7 +494,7 @@ void Mblem::Classify( const UnicodeString& uWord ){
if ( debug ){
LOG << "appending lemma " << lemma << " and tag " << restag << endl;
}
- mblemResult.push_back( mblemData( UnicodeToUTF8(lemma), restag ) );
+ mblemResult.push_back( mblemData( folia::UnicodeToUTF8(lemma), restag ) );
} // while
if ( debug ) {
LOG << "stored lemma and tag options: " << mblemResult.size()
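
A recurring change in this commit is the move from the older ticcutils split calls, which fill an output vector and return a count, to overloads that return the vector directly. Below is a minimal sketch contrasting the two styles as they appear in the hunks above; the wrapper function name is purely illustrative:

    #include <string>
    #include <vector>
    #include "ticcutils/StringOps.h"

    // illustrative only: contrasts the two TiCC::split_at call styles used above
    void split_styles( const std::string& classString ){
      // old style (removed above): fill a vector and check the returned count
      std::vector<std::string> parts;
      int numParts = TiCC::split_at( classString, parts, "|" );
      if ( numParts < 1 ){
        // no alternatives found
      }
      // new style (added above): the overload returns the vector itself
      std::vector<std::string> alts = TiCC::split_at( classString, "|" );
      if ( alts.empty() ){
        // no alternatives found
      }
    }
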
diff --git a/src/mblem_prog.cxx b/src/mblem_prog.cxx
index cc7ab51..d9d5ff6 100644
--- a/src/mblem_prog.cxx
+++ b/src/mblem_prog.cxx
@@ -31,10 +31,8 @@
#include <string>
#include <iostream>
-#include <sstream>
#include <fstream>
#include <vector>
-#include <map>
#include "config.h"
#include "ticcutils/LogStream.h"
@@ -48,17 +46,16 @@
using namespace std;
using namespace Timbl;
-using namespace TiCC;
using namespace Tagger;
-LogStream my_default_log( cerr, "", StampMessage ); // fall-back
-LogStream *theErrLog = &my_default_log; // fill the externals
+TiCC::LogStream my_default_log( cerr, "", StampMessage ); // fall-back
+TiCC::LogStream *theErrLog = &my_default_log; // fill the externals
vector<string> fileNames;
bool useTagger = true;
bool useTokenizer = true;
-Configuration configuration;
+TiCC::Configuration configuration;
static string configDir = string(SYSCONF_PATH) + "/" + PACKAGE + "/nld/";
static string configFileName = configDir + "frog.cfg";
@@ -181,8 +178,7 @@ void Test( istream& in ){
}
}
else {
- vector<string> parts;
- TiCC::split( s, parts );
+ vector<string> parts = TiCC::split( s );
for ( const auto& w : parts ){
UnicodeString uWord = folia::UTF8ToUnicode(w);
myMblem.Classify( uWord );
diff --git a/src/mbma_brackets.cxx b/src/mbma_brackets.cxx
index 60cd196..c5ea022 100644
--- a/src/mbma_brackets.cxx
+++ b/src/mbma_brackets.cxx
@@ -29,6 +29,7 @@
*/
+#include <cassert>
#include <string>
#include <vector>
#include <list>
@@ -45,10 +46,9 @@
#include "frog/mbma_brackets.h"
using namespace std;
-using namespace TiCC;
-using namespace folia;
+using TiCC::operator<<;
-#define LOG *Log(myLog)
+#define LOG *TiCC::Log(myLog)
string toString( const Compound::Type& ct ){
switch ( ct ){
@@ -235,7 +235,7 @@ ostream& operator<<( ostream& os, const Status& st ){
return os;
}
-BracketLeaf::BracketLeaf( const RulePart& p, int flag, LogStream& l ):
+BracketLeaf::BracketLeaf( const RulePart& p, int flag, TiCC::LogStream& l ):
BaseBracket(p.ResultClass, p.RightHand, flag, l ),
glue(false),
morph(p.morpheme )
@@ -285,7 +285,7 @@ BracketLeaf::BracketLeaf( const RulePart& p, int flag, LogStream& l ):
BracketLeaf::BracketLeaf( CLEX::Type t,
const UnicodeString& us,
int flag,
- LogStream& l ):
+ TiCC::LogStream& l ):
BaseBracket( t, vector<CLEX::Type>(), flag, l ),
morph( us )
{
@@ -297,8 +297,9 @@ BracketLeaf::BracketLeaf( CLEX::Type t,
BracketNest::BracketNest( CLEX::Type t,
Compound::Type c,
int flag,
- LogStream& l ): BaseBracket( t, flag, l ),
- _compound( c )
+ TiCC::LogStream& l ):
+ BaseBracket( t, flag, l ),
+ _compound( c )
{
_status = Status::COMPLEX;
}
@@ -317,7 +318,7 @@ BracketNest::~BracketNest(){
UnicodeString BaseBracket::put( bool full ) const {
UnicodeString result = "[err?]";
if ( full ){
- UnicodeString s = UTF8ToUnicode(toString(cls));
+ UnicodeString s = folia::UTF8ToUnicode(toString(cls));
result += s;
}
return result;
@@ -332,10 +333,16 @@ UnicodeString BracketLeaf::put( bool full ) const {
}
if ( full ){
if ( orig.empty() ){
- result += UTF8ToUnicode(inflect);
+ UnicodeString s = folia::UTF8ToUnicode(toString(cls));
+ if ( s == "/" ){
+ result += s + folia::UTF8ToUnicode(inflect);
+ }
+ else {
+ result += s + "/" + folia::UTF8ToUnicode(inflect);
+ }
}
else {
- result += UTF8ToUnicode(orig);
+ result += folia::UTF8ToUnicode(orig);
}
}
return result;
@@ -346,16 +353,19 @@ UnicodeString BracketNest::put( bool full ) const {
for ( auto const& it : parts ){
UnicodeString m = it->put( full );
if ( !m.isEmpty() ){
- result += m + " ";
+ result += m + " ";
+ // if (&it != &parts.back() ){
+ // result += " ";
+ // }
}
}
result += "]";
if ( full ){
if ( cls != CLEX::UNASS ){
- result += UTF8ToUnicode(toString(cls));
+ result += folia::UTF8ToUnicode(toString(cls));
}
if ( _compound != Compound::Type::NONE ){
- result += " " + UTF8ToUnicode(toString(_compound)) + "-compound";
+ result += " " + folia::UTF8ToUnicode(toString(_compound)) + "-compound";
}
}
return result;
@@ -379,7 +389,10 @@ ostream& operator<< ( ostream& os, const BaseBracket *c ){
void prettyP( ostream& os, const list<BaseBracket*>& v ){
os << "[";
for ( auto const& it : v ){
- os << it << " ";
+ os << it;
+ if ( &it != &v.back() ){
+ os << " ";
+ }
}
os << "]";
}
@@ -442,7 +455,7 @@ bool BracketNest::testMatch( list<BaseBracket*>& result,
return true;
}
-Compound::Type construct( const vector<CLEX::Type> tags ){
+Compound::Type construct( const vector<CLEX::Type>& tags ){
string s;
for ( const auto& t : tags ){
s += toString( t );
@@ -462,6 +475,7 @@ Compound::Type construct( const CLEX::Type tag1, const CLEX::Type tag2 ){
return construct( v );
}
+bool TEST = 1;
Compound::Type BracketNest::getCompoundType(){
if ( debugFlag > 5 ){
LOG << "get compoundType: " << this << endl;
@@ -485,24 +499,16 @@ Compound::Type BracketNest::getCompoundType(){
LOG << "tag2 :" << tag2 << " stat2: " << st2 << " cp2: " << cp2 << endl;
}
if ( st1 != Status::FAILED
- && st2 != Status::FAILED ){
- if ( tag1 == CLEX::N
- && st1 != Status::DERIVATIONAL
- && st1 != Status::PARTICLE
- && st1 != Status::PARTICIPLE ){
- if ( st2 == Status::STEM ){
- compound = construct( tag1, tag2 );
- }
- else if ( st2 == Status::DERIVATIONAL
- || st2 == Status::INFO
- || st2 == Status::INFLECTION ){
- compound = cp1;
+ && st2 != Status::FAILED
+ && st1 != Status::PARTICLE
+ && st1 != Status::PARTICIPLE ){
+ switch ( tag1 ){
+ case CLEX::N:
+ case CLEX::A:
+ if ( st1 == Status::DERIVATIONAL ){
+ compound = cp2;
}
- }
- else if ( tag1 == CLEX::A
- && st1 != Status::PARTICLE
- && st1 != Status::PARTICIPLE ){
- if ( st2 == Status::STEM ){
+ else if ( st2 == Status::STEM ){
compound = construct( tag1, tag2 );
}
else if ( st2 == Status::DERIVATIONAL
@@ -510,33 +516,31 @@ Compound::Type BracketNest::getCompoundType(){
|| st2 == Status::INFLECTION ){
compound = cp1;
}
- }
- else if ( tag1 == CLEX::B ){
+ break;
+ case CLEX::B:
if ( st2 == Status::STEM ){
compound = construct( tag1, tag2 );
}
- }
- else if ( tag1 == CLEX::P ){
+ break;
+ case CLEX::P:
if ( st2 == Status::STEM ){
compound = construct( tag1, tag2 );
}
- else if ( tag2 == CLEX::NEUTRAL || tag2 == CLEX::UNASS ){
+ else if ( tag2 == CLEX::NEUTRAL
+ || tag2 == CLEX::UNASS ){
compound = cp1;
}
- }
- else if ( tag1 == CLEX::V ){
- if ( st1 != Status::PARTICLE
- && st1 != Status::PARTICIPLE ){
- if ( st1 == Status::DERIVATIONAL ){
- compound = cp2;
- }
- else if ( st2 == Status::STEM ){
- compound = construct( tag1, tag2 );
- }
+ break;
+ case CLEX::V:
+ if ( st1 == Status::DERIVATIONAL ){
+ compound = cp2;
}
- else if ( st2 == Status::COMPLEX ) {
+ else if ( st2 == Status::STEM ){
compound = construct( tag1, tag2 );
}
+ break;
+ default:
+ break;
}
}
}
@@ -561,8 +565,13 @@ Compound::Type BracketNest::getCompoundType(){
&& st3 != Status::FAILED
&& st1 != Status::PARTICLE
&& st1 != Status::PARTICIPLE ){
- if ( tag1 == CLEX::N ){
- if ( st2 == Status::STEM &&
+ switch ( tag1 ){
+ case CLEX::N:
+ if ( st2 == Status::STEM && tag2 == CLEX::N
+ && st3 == Status::STEM && tag3 == CLEX::N ){
+ compound = Compound::Type::NNN;
+ }
+ else if ( st1 != Status::DERIVATIONAL && st2 == Status::STEM &&
( st3 == Status::INFLECTION || tag3 == CLEX::NEUTRAL ) ){
compound = construct( tag1, tag2 );
}
@@ -603,11 +612,8 @@ Compound::Type BracketNest::getCompoundType(){
compound = Compound::Type::NN;
}
}
- else if ( tag2 == CLEX::N && tag3 == CLEX::N ){
- compound = Compound::Type::NNN;
- }
- }
- else if ( tag1 == CLEX::A ){
+ break;
+ case CLEX::A:
if ( st2 == Status::STEM &&
( st3 == Status::INFLECTION || tag3 == CLEX::NEUTRAL ) ){
compound = construct( tag1, tag2 );
@@ -625,8 +631,8 @@ Compound::Type BracketNest::getCompoundType(){
compound = cp1;
}
}
- }
- else if ( tag1 == CLEX::P ){
+ break;
+ case CLEX::P:
if ( st2 == Status::STEM &&
( st3 == Status::INFLECTION || tag3 == CLEX::NEUTRAL ) ){
compound = construct( tag1, tag2 );
@@ -640,30 +646,35 @@ Compound::Type BracketNest::getCompoundType(){
else if ( st3 == Status::DERIVATIONAL ){
compound = construct( tag1, tag3 );
}
- }
- else if ( tag1 == CLEX::B && st1 == Status::STEM ){
- if ( ( st2 == Status::STEM
- && ( st3 == Status::INFLECTION || tag3 == CLEX::NEUTRAL ) ) ){
- compound = construct( tag1, tag2 );
- }
- else if ( st2 == Status::COMPLEX ){
- if ( tag2 == CLEX::N ){
- compound = Compound::Type::BN;
+ break;
+ case CLEX::B:
+ if ( st1 == Status::STEM ){
+ if ( ( st2 == Status::STEM
+ && ( st3 == Status::INFLECTION || tag3 == CLEX::NEUTRAL ) ) ){
+ compound = construct( tag1, tag2 );
}
- else {
- compound = cp2;
+ else if ( st2 == Status::COMPLEX ){
+ if ( tag2 == CLEX::N ){
+ compound = Compound::Type::BN;
+ }
+ else {
+ compound = cp2;
+ }
}
}
- }
- else if ( tag1 == CLEX::V ){
+ break;
+ case CLEX::V:
if ( st2 == Status::STEM &&
( st3 == Status::INFLECTION || tag3 == CLEX::NEUTRAL ) ){
compound = construct( tag1, tag2 );
}
else if ( st3 == Status::STEM &&
- ( st2 == Status::INFLECTION ) ){
+ ( st2 == Status::INFLECTION ) ){
compound = construct( tag1, tag3 );
}
+ break;
+ default:
+ break;
}
}
}
@@ -674,16 +685,16 @@ Compound::Type BracketNest::getCompoundType(){
return compound;
}
-Morpheme *BracketLeaf::createMorpheme( Document *doc ) const {
+folia::Morpheme *BracketLeaf::createMorpheme( folia::Document *doc ) const {
string desc;
int cnt = 0;
return createMorpheme( doc, desc, cnt );
}
-Morpheme *BracketLeaf::createMorpheme( Document *doc,
- string& desc,
- int& cnt ) const {
- Morpheme *result = 0;
+folia::Morpheme *BracketLeaf::createMorpheme( folia::Document *doc,
+ string& desc,
+ int& cnt ) const {
+ folia::Morpheme *result = 0;
desc.clear();
string::size_type pos = orig.find( "^" );
bool glue = ( pos != string::npos );
@@ -692,20 +703,17 @@ Morpheme *BracketLeaf::createMorpheme( Document *doc,
}
else if ( _status == Status::STEM
|| ( _status == Status::DERIVATIONAL && glue ) ){
- KWargs args;
- args["set"] = Mbma::mbma_tagset;
- args["class"] = "stem";
- result = new Morpheme( args, doc );
- args.clear();
- string out = UnicodeToUTF8(morph);
+ string out = folia::UnicodeToUTF8(morph);
if ( out.empty() ){
throw logic_error( "stem has empty morpheme" );
}
- args["value"] = out;
- TextContent *t = new TextContent( args );
-#pragma omp critical(foliaupdate)
+ folia::KWargs args;
+ args["set"] = Mbma::mbma_tagset;
+ args["class"] = "stem";
+#pragma omp critical (foliaupdate)
{
- result->append( t );
+ result = new folia::Morpheme( args, doc );
+ result->settext( out );
}
++cnt;
args.clear();
@@ -721,58 +729,48 @@ Morpheme *BracketLeaf::createMorpheme( Document *doc,
args["class"] = toString( tag() );
desc = "[" + out + "]" + CLEX::get_tDescr( tag() ); // spread the word upwards!
}
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
result->addPosAnnotation( args );
}
}
else if ( _status == Status::PARTICLE ){
- KWargs args;
- args["set"] = Mbma::mbma_tagset;
- args["class"] = "particle";
- result = new Morpheme( args, doc );
- args.clear();
- string out = UnicodeToUTF8(morph);
+ string out = folia::UnicodeToUTF8(morph);
if ( out.empty() ){
throw logic_error( "particle has empty morpheme" );
}
- args["value"] = out;
- TextContent *t = new TextContent( args );
-#pragma omp critical(foliaupdate)
+ folia::KWargs args;
+ args["set"] = Mbma::mbma_tagset;
+ args["class"] = "particle";
+#pragma omp critical (foliaupdate)
{
- result->append( t );
+ result = new folia::Morpheme( args, doc );
+ result->settext( out );
}
++cnt;
args.clear();
args["set"] = Mbma::clex_tagset;
args["class"] = toString( tag() );
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
result->addPosAnnotation( args );
}
desc = "[" + out + "]"; // spread the word upwards! maybe add 'part' ??
}
else if ( _status == Status::INFLECTION ){
- KWargs args;
- args["class"] = "inflection";
- args["set"] = Mbma::mbma_tagset;
- result = new Morpheme( args, doc );
- args.clear();
- string out = UnicodeToUTF8(morph);
- if ( out.empty() ){
- out = inflect;
- }
- else {
+ string out = folia::UnicodeToUTF8(morph);
+ if ( !out.empty() ){
desc = "[" + out + "]";
}
- if ( out.empty() ){
- throw logic_error( "Inflection and morpheme empty" );
- }
- args["value"] = out;
- TextContent *t = new TextContent( args );
-#pragma omp critical(foliaupdate)
+ folia::KWargs args;
+ args["class"] = "inflection";
+ args["set"] = Mbma::mbma_tagset;
+#pragma omp critical (foliaupdate)
{
- result->append( t );
+ result = new folia::Morpheme( args, doc );
+ if ( !out.empty() ){
+ result->settext( out );
+ }
}
++cnt;
args.clear();
@@ -784,9 +782,9 @@ Morpheme *BracketLeaf::createMorpheme( Document *doc,
// happens sometimes when there is faulty data
args["class"] = d;
desc += "/" + d;
- folia::Feature *feat = new folia::Feature( args );
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
+ folia::Feature *feat = new folia::Feature( args );
result->append( feat );
}
}
@@ -796,7 +794,11 @@ Morpheme *BracketLeaf::createMorpheme( Document *doc,
else if ( _status == Status::DERIVATIONAL
|| _status == Status::PARTICIPLE
|| _status == Status::FAILED ){
- KWargs args;
+ string out = folia::UnicodeToUTF8(morph);
+ if ( out.empty() ){
+ throw logic_error( "Derivation with empty morpheme" );
+ }
+ folia::KWargs args;
if ( _status == Status::DERIVATIONAL ){
args["class"] = "affix";
}
@@ -807,18 +809,10 @@ Morpheme *BracketLeaf::createMorpheme( Document *doc,
args["class"] = "derivational";
}
args["set"] = Mbma::mbma_tagset;
- result = new Morpheme( args, doc );
- args.clear();
- string out = UnicodeToUTF8(morph);
- if ( out.empty() ){
- LOG << "problem: " << this << endl;
- throw logic_error( "Derivation with empty morpheme" );
- }
- args["value"] = out;
- TextContent *t = new TextContent( args );
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
- result->append( t );
+ result = new folia::Morpheme( args, doc );
+ result->settext( out );
}
++cnt;
desc = "[" + out + "]"; // pass it up!
@@ -834,7 +828,7 @@ Morpheme *BracketLeaf::createMorpheme( Document *doc,
args.clear();
args["subset"] = "structure";
args["class"] = desc;
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
folia::Feature *feat = new folia::Feature( args );
result->append( feat );
@@ -842,16 +836,19 @@ Morpheme *BracketLeaf::createMorpheme( Document *doc,
args.clear();
// args["set"] = Mbma::clex_tagset;
// args["class"] = orig;
-// #pragma omp critical(foliaupdate)
+// #pragma omp critical (foliaupdate)
// {
// result->addPosAnnotation( args );
// }
}
else if ( _status == Status::INFO ){
- KWargs args;
+ folia::KWargs args;
args["class"] = "inflection";
args["set"] = Mbma::mbma_tagset;
- result = new Morpheme( args, doc );
+#pragma omp critical (foliaupdate)
+ {
+ result = new folia::Morpheme( args, doc );
+ }
args.clear();
args["subset"] = "inflection";
for ( const auto& inf : inflect ){
@@ -861,9 +858,9 @@ Morpheme *BracketLeaf::createMorpheme( Document *doc,
// happens sometimes when there is faulty data
desc += "/" + d;
args["class"] = d;
- folia::Feature *feat = new folia::Feature( args );
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
+ folia::Feature *feat = new folia::Feature( args );
result->append( feat );
}
}
@@ -873,36 +870,39 @@ Morpheme *BracketLeaf::createMorpheme( Document *doc,
return result;
}
-Morpheme *BracketNest::createMorpheme( Document *doc ) const {
+folia::Morpheme *BracketNest::createMorpheme( folia::Document *doc ) const {
string desc;
int cnt = 0;
return createMorpheme( doc, desc, cnt );
}
-Morpheme *BracketNest::createMorpheme( Document *doc,
- string& desc,
- int& cnt ) const {
- KWargs args;
+folia::Morpheme *BracketNest::createMorpheme( folia::Document *doc,
+ string& desc,
+ int& cnt ) const {
+ folia::Morpheme *result = 0;
+ folia::KWargs args;
args["class"] = "complex";
args["set"] = Mbma::mbma_tagset;
- Morpheme *result = new Morpheme( args, doc );
- string mor;
+#pragma omp critical (foliaupdate)
+ {
+ result = new folia::Morpheme( args, doc );
+ }
cnt = 0;
desc.clear();
- vector<Morpheme*> stack;
+ vector<folia::Morpheme*> stack;
for ( auto const& it : parts ){
string deeper_desc;
int deep_cnt = 0;
- Morpheme *m = it->createMorpheme( doc,
- deeper_desc,
- deep_cnt );
+ folia::Morpheme *m = it->createMorpheme( doc,
+ deeper_desc,
+ deep_cnt );
if ( it->status() == Status::DERIVATIONAL
|| it->status() == Status::PARTICIPLE ){
if ( !it->original().empty() ){
args.clear();
args["subset"] = "applied_rule";
args["class"] = it->original();
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
folia::Feature *feat = new folia::Feature( args );
result->append( feat );
@@ -922,7 +922,7 @@ Morpheme *BracketNest::createMorpheme( Document *doc,
args.clear();
args["subset"] = "structure";
args["class"] = desc;
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
folia::Feature *feat = new folia::Feature( args );
result->append( feat );
@@ -930,8 +930,8 @@ Morpheme *BracketNest::createMorpheme( Document *doc,
args.clear();
args["set"] = Mbma::clex_tagset;
args["class"] = toString( tag() );
- PosAnnotation *pos = 0;
-#pragma omp critical(foliaupdate)
+ folia::PosAnnotation *pos = 0;
+#pragma omp critical (foliaupdate)
{
pos = result->addPosAnnotation( args );
}
@@ -940,13 +940,13 @@ Morpheme *BracketNest::createMorpheme( Document *doc,
args.clear();
args["subset"] = "compound";
args["class"] = toString(ct);
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
folia::Feature *feat = new folia::Feature( args );
pos->append( feat );
}
}
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
for ( const auto& s : stack ){
result->append( s );
}
@@ -1009,8 +1009,7 @@ void BracketNest::resolveNouns( ){
&& ( (*it)->tag() == CLEX::N && (*it)->status() == Status::STEM )
&& (*it)->RightHand.size() == 0 ){
Compound::Type newt = Compound::Type::NN;
- if ( (*prev)->compound() == Compound::Type::NN
- || (*prev)->compound() == Compound::Type::NN ){
+ if ( (*prev)->compound() == Compound::Type::NN ){
newt = Compound::Type::NNN;
}
BaseBracket *tmp = new BracketNest( CLEX::N, newt, debugFlag, myLog );
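
The createMorpheme() rewrites above all follow one new pattern: instead of appending a separate TextContent built from args["value"], the folia::Morpheme is now constructed inside the shared 'foliaupdate' critical section and receives its text through settext(). A condensed sketch of that pattern, restricted to calls that occur in the hunks above; the helper name is hypothetical:

    #include <string>
    #include "libfolia/folia.h"

    // hypothetical helper mirroring the stem/particle branches above
    folia::Morpheme *new_morpheme( folia::Document *doc,
                                   const std::string& tagset,
                                   const std::string& cls,
                                   const std::string& text ){
      folia::KWargs args;
      args["set"] = tagset;
      args["class"] = cls;
      folia::Morpheme *result = 0;
    #pragma omp critical (foliaupdate)
      {
        // construction and text assignment both happen under the FoLiA update lock
        result = new folia::Morpheme( args, doc );
        result->settext( text );
      }
      return result;
    }
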
diff --git a/src/mbma_mod.cxx b/src/mbma_mod.cxx
index a109dc5..17e5e5d 100644
--- a/src/mbma_mod.cxx
+++ b/src/mbma_mod.cxx
@@ -35,31 +35,30 @@
#include <iostream>
#include <algorithm>
#include <fstream>
-#include <sstream>
#include "timbl/TimblAPI.h"
#include "ucto/unicode.h"
#include "ticcutils/Configuration.h"
+#include "ticcutils/StringOps.h"
#include "frog/Frog.h"
#include "frog/mbma_mod.h"
using namespace std;
-using namespace folia;
-using namespace TiCC;
+using TiCC::operator<<;
const long int LEFT = 6; // left context
const long int RIGHT = 6; // right context
-#define LOG *Log(mbmaLog)
+#define LOG *TiCC::Log(mbmaLog)
-Mbma::Mbma(LogStream * logstream):
+Mbma::Mbma( TiCC::LogStream * logstream):
MTreeFilename( "dm.igtree" ),
MTree(0),
transliterator(0),
filter(0),
doDeepMorph(false)
{
- mbmaLog = new LogStream( logstream, "mbma-" );
+ mbmaLog = new TiCC::LogStream( logstream, "mbma-" );
}
// define the statics
@@ -82,7 +81,7 @@ void Mbma::init_cgn( const string& main, const string& sub ) {
string line;
while( getline( tc, line) ) {
vector<string> tmp;
- size_t num = split_at(line, tmp, " ");
+ size_t num = TiCC::split_at(line, tmp, " ");
if ( num < 2 ){
LOG << "splitting '" << line << "' failed" << endl;
throw ( runtime_error("panic") );
@@ -98,7 +97,7 @@ void Mbma::init_cgn( const string& main, const string& sub ) {
string line;
while( getline(tc1, line) ) {
vector<string> tmp;
- size_t num = split_at(line, tmp, " ");
+ size_t num = TiCC::split_at(line, tmp, " ");
if ( num == 2 ){
TAGconv.insert( make_pair( tmp[0], tmp[1] ) );
}
@@ -120,7 +119,7 @@ Transliterator *Mbma::init_trans( ){
return t;
}
-bool Mbma::init( const Configuration& config ) {
+bool Mbma::init( const TiCC::Configuration& config ) {
LOG << "Initiating morphological analyzer..." << endl;
debugFlag = 0;
string val = config.lookUp( "debug", "mbma" );
@@ -191,11 +190,20 @@ bool Mbma::init( const Configuration& config ) {
MTreeFilename = prefix( config.configDir(), tfName );
string dof = config.lookUp( "filter_diacritics", "mbma" );
if ( !dof.empty() ){
- bool b = stringTo<bool>( dof );
+ bool b = TiCC::stringTo<bool>( dof );
if ( b ){
transliterator = init_trans();
}
}
+
+ string cls = config.lookUp( "outputclass" );
+ if ( !cls.empty() ){
+ textclass = cls;
+ }
+ else {
+ textclass = "current";
+ }
+
//Read in (igtree) data
string opts = config.lookUp( "timblOpts", "mbma" );
if ( opts.empty() ){
@@ -242,7 +250,7 @@ vector<string> Mbma::make_instances( const UnicodeString& word ){
}
}
inst += "?";
- insts.push_back( UnicodeToUTF8(inst) );
+ insts.push_back( folia::UnicodeToUTF8(inst) );
}
return insts;
}
@@ -267,13 +275,13 @@ string find_class( unsigned int step,
vector<vector<string> > generate_all_perms( const vector<string>& classes ){
// determine all alternative analyses, remember the largest
// and store every part in a vector of string vectors
- int largest_anal=1;
+ size_t largest_anal=1;
vector<vector<string> > classParts;
classParts.reserve( classes.size() );
for ( const auto& cl : classes ){
- vector<string> parts;
- int num = split_at( cl, parts, "|" );
- if ( num > 0 ){
+ vector<string> parts = TiCC::split_at( cl, "|" );
+ size_t num = parts.size();
+ if ( num > 1 ){
classParts.push_back( parts );
if ( num > largest_anal ){
largest_anal = num;
@@ -290,7 +298,7 @@ vector<vector<string> > generate_all_perms( const vector<string>& classes ){
// now expand the result
vector<vector<string> > result;
result.reserve( largest_anal );
- for ( int step=0; step < largest_anal; ++step ){
+ for ( size_t step=0; step < largest_anal; ++step ){
vector<string> item;
for ( const auto& cp : classParts ){
item.push_back( find_class( step, cp, largest_anal ) );
@@ -315,16 +323,11 @@ Rule* Mbma::matchRule( const std::vector<std::string>& ana,
if ( debugFlag ){
LOG << "after reduction: " << rule << endl;
}
-#ifdef EXPERIMENT
- rule->resolveBrackets( doDeepMorph );
-#endif
rule->resolve_inflections();
if ( debugFlag ){
LOG << "after resolving: " << rule << endl;
}
-#ifndef EXPERIMENT
rule->resolveBrackets( doDeepMorph );
-#endif
rule->getCleanInflect();
if ( debugFlag ){
LOG << "1 added Inflection: " << rule << endl;
@@ -344,7 +347,8 @@ vector<Rule*> Mbma::execute( const UnicodeString& word,
const vector<string>& classes ){
vector<vector<string> > allParts = generate_all_perms( classes );
if ( debugFlag ){
- string out = "alternatives: word=" + UnicodeToUTF8(word) + ", classes=<";
+ string out = "alternatives: word="
+ + folia::UnicodeToUTF8(word) + ", classes=<";
for ( const auto& cls : classes ){
out += cls + ",";
}
@@ -364,12 +368,12 @@ vector<Rule*> Mbma::execute( const UnicodeString& word,
return accepted;
}
-void Mbma::addMorph( Word *word,
+void Mbma::addMorph( folia::Word *word,
const vector<string>& morphs ) const {
- KWargs args;
+ folia::KWargs args;
args["set"] = mbma_tagset;
- MorphologyLayer *ml;
-#pragma omp critical(foliaupdate)
+ folia::MorphologyLayer *ml;
+#pragma omp critical (foliaupdate)
{
try {
ml = word->addMorphologyLayer( args );
@@ -382,7 +386,7 @@ void Mbma::addMorph( Word *word,
addMorph( ml, morphs );
}
-void Mbma::addBracketMorph( Word *word,
+void Mbma::addBracketMorph( folia::Word *word,
const string& wrd,
const string& tag ) const {
if (debugFlag){
@@ -395,9 +399,9 @@ void Mbma::addBracketMorph( Word *word,
}
else if ( head == "X" ) {
// unanalysed, so trust the TAGGER
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
- const auto pos = word->annotation<PosAnnotation>( cgn_tagset );
+ const auto pos = word->annotation<folia::PosAnnotation>( cgn_tagset );
head = pos->feat("head");
}
if (debugFlag){
@@ -406,7 +410,7 @@ void Mbma::addBracketMorph( Word *word,
const auto tagIt = TAGconv.find( head );
if ( tagIt == TAGconv.end() ) {
// this should never happen
- throw ValueError( "unknown head feature '" + head + "'" );
+ throw folia::ValueError( "unknown head feature '" + head + "'" );
}
celex_tag = tagIt->second;
head = CLEX::get_tDescr(CLEX::toCLEX(tagIt->second));
@@ -415,10 +419,10 @@ void Mbma::addBracketMorph( Word *word,
}
}
- KWargs args;
+ folia::KWargs args;
args["set"] = mbma_tagset;
- MorphologyLayer *ml;
-#pragma omp critical(foliaupdate)
+ folia::MorphologyLayer *ml;
+#pragma omp critical (foliaupdate)
{
try {
ml = word->addMorphologyLayer( args );
@@ -429,18 +433,16 @@ void Mbma::addBracketMorph( Word *word,
}
}
args["class"] = "stem";
- Morpheme *result = new Morpheme( args, word->doc() );
- args.clear();
- args["value"] = wrd;
- TextContent *t = new TextContent( args );
-#pragma omp critical(foliaupdate)
+ folia::Morpheme *result = 0;
+#pragma omp critical (foliaupdate)
{
- result->append( t );
+ result = new folia::Morpheme( args, word->doc() );
+ result->settext( wrd, textclass );
}
args.clear();
args["subset"] = "structure";
args["class"] = "[" + wrd + "]" + head;
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
folia::Feature *feat = new folia::Feature( args );
result->append( feat );
@@ -448,26 +450,26 @@ void Mbma::addBracketMorph( Word *word,
args.clear();
args["set"] = clex_tagset;
args["class"] = celex_tag;
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
result->addPosAnnotation( args );
}
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
ml->append( result );
}
}
-void Mbma::addBracketMorph( Word *word,
+void Mbma::addBracketMorph( folia::Word *word,
const string& orig_word,
const BracketNest *brackets ) const {
if (debugFlag){
LOG << "addBracketMorph(" << word << "," << orig_word << "," << brackets << ")" << endl;
}
- KWargs args;
+ folia::KWargs args;
args["set"] = mbma_tagset;
- MorphologyLayer *ml;
-#pragma omp critical(foliaupdate)
+ folia::MorphologyLayer *ml;
+#pragma omp critical (foliaupdate)
{
try {
ml = word->addMorphologyLayer( args );
@@ -477,43 +479,34 @@ void Mbma::addBracketMorph( Word *word,
throw;
}
}
- Morpheme *m = 0;
+ folia::Morpheme *m = 0;
try {
m = brackets->createMorpheme( word->doc() );
}
catch( const exception& e ){
- cerr << "createMorpheme failed: " << e.what() << endl;
+ LOG << "createMorpheme failed: " << e.what() << endl;
throw;
}
if ( m ){
- args.clear();
- args["value"] = orig_word;
- TextContent *t = new TextContent( args );
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
- m->append( t );
+ m->settext( orig_word, textclass );
ml->append( m );
}
}
}
-void Mbma::addMorph( MorphologyLayer *ml,
+void Mbma::addMorph( folia::MorphologyLayer *ml,
const vector<string>& morphs ) const {
for ( const auto& mor : morphs ){
- KWargs args;
+ folia::KWargs args;
args["set"] = mbma_tagset;
- Morpheme *m = new Morpheme( args, ml->doc() );
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
+ folia::Morpheme *m = new folia::Morpheme( args, ml->doc() );
+ m->settext( mor, textclass );
ml->append( m );
}
- args.clear();
- args["value"] = mor;
- TextContent *t = new TextContent( args );
-#pragma omp critical(foliaupdate)
- {
- m->append( t );
- }
}
}
@@ -534,7 +527,7 @@ void Mbma::filterHeadTag( const string& head ){
map<string,string>::const_iterator tagIt = TAGconv.find( head );
if ( tagIt == TAGconv.end() ) {
// this should never happen
- throw ValueError( "unknown head feature '" + head + "'" );
+ throw folia::ValueError( "unknown head feature '" + head + "'" );
}
string celex_tag = tagIt->second;
if (debugFlag){
@@ -737,7 +730,8 @@ void Mbma::assign_compounds(){
}
}
-void Mbma::getFoLiAResult( Word *fword, const UnicodeString& uword ) const {
+void Mbma::getFoLiAResult( folia::Word *fword,
+ const UnicodeString& uword ) const {
if ( analysis.size() == 0 ){
// fallback option: use the word and pretend it's a morpheme ;-)
if ( debugFlag ){
@@ -745,18 +739,18 @@ void Mbma::getFoLiAResult( Word *fword, const UnicodeString& uword ) const {
<< uword << endl;
}
if ( doDeepMorph ){
- addBracketMorph( fword, UnicodeToUTF8(uword), "X" );
+ addBracketMorph( fword, folia::UnicodeToUTF8(uword), "X" );
}
else {
vector<string> tmp;
- tmp.push_back( UnicodeToUTF8(uword) );
+ tmp.push_back( folia::UnicodeToUTF8(uword) );
addMorph( fword, tmp );
}
}
else {
for ( auto const& sit : analysis ){
if ( doDeepMorph ){
- addBracketMorph( fword, UnicodeToUTF8(uword), sit->brackets );
+ addBracketMorph( fword, folia::UnicodeToUTF8(uword), sit->brackets );
}
else {
addMorph( fword, sit->extract_morphemes() );
@@ -765,14 +759,14 @@ void Mbma::getFoLiAResult( Word *fword, const UnicodeString& uword ) const {
}
}
-void Mbma::addDeclaration( Document& doc ) const {
-#pragma omp critical(foliaupdate)
+void Mbma::addDeclaration( folia::Document& doc ) const {
+#pragma omp critical (foliaupdate)
{
- doc.declare( AnnotationType::MORPHOLOGICAL, mbma_tagset,
+ doc.declare( folia::AnnotationType::MORPHOLOGICAL, mbma_tagset,
"annotator='frog-mbma-" + version +
+ "', annotatortype='auto', datetime='" + getTime() + "'");
if ( doDeepMorph ){
- doc.declare( AnnotationType::POS, clex_tagset,
+ doc.declare( folia::AnnotationType::POS, clex_tagset,
"annotator='frog-mbma-" + version +
+ "', annotatortype='auto', datetime='" + getTime() + "'");
}
@@ -790,20 +784,25 @@ UnicodeString Mbma::filterDiacritics( const UnicodeString& in ) const {
}
}
-void Mbma::Classify( Word* sword ){
- if ( sword->isinstance(PlaceHolder_t) ){
+void Mbma::Classify( folia::Word* sword ){
+ if ( sword->isinstance(folia::PlaceHolder_t) ){
return;
}
UnicodeString uWord;
- PosAnnotation *pos;
+ folia::PosAnnotation *pos;
string head;
string token_class;
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
- uWord = sword->text();
- pos = sword->annotation<PosAnnotation>( cgn_tagset );
+ uWord = sword->text( textclass );
+ pos = sword->annotation<folia::PosAnnotation>( cgn_tagset );
head = pos->feat("head");
- token_class = sword->cls();
+ string txtcls = sword->textclass();
+ if ( txtcls == textclass ){
+ // so only use the word class is the textclass of the word
+ // matches the wanted text
+ token_class = sword->cls();
+ }
}
if (debugFlag ){
LOG << "Classify " << uWord << "(" << pos << ") ["
@@ -812,10 +811,17 @@ void Mbma::Classify( Word* sword ){
if ( filter ){
uWord = filter->filter( uWord );
}
+ string word_s = folia::UnicodeToUTF8( uWord );
+ vector<string> parts = TiCC::split( word_s );
+ word_s.clear();
+ for ( const auto& p : parts ){
+ word_s += p;
+ }
+ uWord = folia::UTF8ToUnicode( word_s );
if ( head == "LET" || head == "SPEC" || token_class == "ABBREVIATION" ){
// take over the letter/word 'as-is'.
// also ABBREVIATION's aren't handled bij mbma-rules
- string word = UnicodeToUTF8( uWord );
+ string word = folia::UnicodeToUTF8( uWord );
if ( doDeepMorph ){
addBracketMorph( sword, word, head );
}
@@ -832,7 +838,7 @@ void Mbma::Classify( Word* sword ){
}
Classify( lWord );
vector<string> featVals;
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
vector<folia::Feature*> feats = pos->select<folia::Feature>();
featVals.reserve( feats.size() );
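
Mblem::Classify() and Mbma::Classify() above now read the word text in a configurable text class and only trust the tokenizer's token class when the word's own textclass matches the configured one. A small sketch of that guard, reusing only the accessors shown in the hunks above (UnicodeString as in the surrounding code; the function name is illustrative):

    #include <string>
    #include "ucto/unicode.h"
    #include "libfolia/folia.h"

    // illustrative only: the textclass guard used by the Classify() methods above
    void fetch_word_info( folia::Word *sword,
                          const std::string& textclass,
                          UnicodeString& uWord,
                          std::string& token_class ){
    #pragma omp critical (foliaupdate)
      {
        uWord = sword->text( textclass );  // text in the configured class
        std::string txtcls = sword->textclass();
        if ( txtcls == textclass ){
          // only use the token class if the word's textclass matches
          token_class = sword->cls();
        }
      }
    }
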
diff --git a/src/mbma_prog.cxx b/src/mbma_prog.cxx
index 82b53f8..e5e60fb 100644
--- a/src/mbma_prog.cxx
+++ b/src/mbma_prog.cxx
@@ -31,10 +31,7 @@
#include <string>
#include <iostream>
-#include <sstream>
-#include <fstream>
#include <vector>
-#include <map>
#include "config.h"
#include "ticcutils/LogStream.h"
@@ -48,11 +45,10 @@
using namespace std;
using namespace Timbl;
-using namespace TiCC;
using namespace Tagger;
-LogStream my_default_log( cerr, "", StampMessage ); // fall-back
-LogStream *theErrLog = &my_default_log; // fill the externals
+TiCC::LogStream my_default_log( cerr, "", StampMessage ); // fall-back
+TiCC::LogStream *theErrLog = &my_default_log; // fill the externals
vector<string> fileNames;
bool useTagger = true;
@@ -60,7 +56,7 @@ bool useTokenizer = true;
bool bulk = false;
bool verbose = false;
-Configuration configuration;
+TiCC::Configuration configuration;
static string configDir = string(SYSCONF_PATH) + "/" + PACKAGE + "/nld/";
static string configFileName = configDir + "frog.cfg";
@@ -188,10 +184,8 @@ void Test( istream& in ){
vector<TagResult> tagv = tagger.tagLine( s );
for ( const auto& tr : tagv ){
UnicodeString uWord = folia::UTF8ToUnicode( tr.word() );
- vector<string> v;
- size_t num = TiCC::split_at_first_of( tr.assignedTag(),
- v, "(,)" );
- if ( num < 1 ){
+ vector<string> v = TiCC::split_at_first_of( tr.assignedTag(), "(,)" );
+ if ( v.empty() ){
throw runtime_error( "error: tag not in right format " );
}
string head = v[0];
@@ -222,8 +216,7 @@ void Test( istream& in ){
}
}
else {
- vector<string> parts;
- TiCC::split( s, parts );
+ vector<string> parts = TiCC::split( s );
for ( auto const& w : parts ){
UnicodeString uWord = folia::UTF8ToUnicode(w);
uWord.toLower();
diff --git a/src/mbma_rule.cxx b/src/mbma_rule.cxx
index d41be81..b5f5d8f 100644
--- a/src/mbma_rule.cxx
+++ b/src/mbma_rule.cxx
@@ -31,7 +31,6 @@
#include <string>
#include <vector>
-#include <list>
#include <iostream>
#include "ticcutils/LogStream.h"
#include "ticcutils/StringOps.h"
@@ -42,7 +41,6 @@
#include "frog/mbma_rule.h"
using namespace std;
-using namespace folia;
using TiCC::operator<<;
#define LOG *TiCC::Log(myLog)
@@ -100,22 +98,22 @@ ostream& operator<<( ostream& os, const RulePart *r ){
void RulePart::get_edits( const string& edit ){
if (edit[0]=='D') { // delete operation
string s = edit.substr(1);
- ins = UTF8ToUnicode( s );
+ ins = folia::UTF8ToUnicode( s );
}
else if ( edit[0]=='I') { // insert operation
string s = edit.substr(1);
- del = UTF8ToUnicode( s );
+ del = folia::UTF8ToUnicode( s );
}
else if ( edit[0]=='H') { // hidden morpheme
string s = edit.substr(1);
- hide = UTF8ToUnicode( s );
+ hide = folia::UTF8ToUnicode( s );
}
else if ( edit[0]=='R') { // replace operation
string::size_type pos = edit.find( ">" );
string s = edit.substr( 1, pos-1 );
- ins = UTF8ToUnicode( s );
+ ins = folia::UTF8ToUnicode( s );
s = edit.substr( pos+1 );
- del = UTF8ToUnicode( s );
+ del = folia::UTF8ToUnicode( s );
}
}
@@ -303,7 +301,7 @@ vector<string> Rule::extract_morphemes( ) const {
for ( const auto& it : rules ){
UnicodeString morpheme = it.morpheme;
if ( !morpheme.isEmpty() ){
- morphemes.push_back( UnicodeToUTF8(morpheme) );
+ morphemes.push_back( folia::UnicodeToUTF8(morpheme) );
}
}
return morphemes;
@@ -313,7 +311,7 @@ string Rule::morpheme_string( bool structured ) const {
string result;
if ( structured ){
UnicodeString us = brackets->put(true);
- result = UnicodeToUTF8( us );
+ result = folia::UnicodeToUTF8( us );
}
else {
vector<string> vec = extract_morphemes();
@@ -496,7 +494,7 @@ UnicodeString Rule::getKey( bool deep ){
UnicodeString tmp;
stringstream ss;
ss << brackets << endl;
- tmp = UTF8ToUnicode(ss.str());
+ tmp = folia::UTF8ToUnicode(ss.str());
sortkey = tmp;
}
return sortkey;
@@ -506,7 +504,7 @@ UnicodeString Rule::getKey( bool deep ){
UnicodeString tmp;
// create an unique string
for ( auto const& mor : morphs ){
- tmp += UTF8ToUnicode(mor) + "++";
+ tmp += folia::UTF8ToUnicode(mor) + "++";
}
return tmp;
}
diff --git a/src/mwu_chunker_mod.cxx b/src/mwu_chunker_mod.cxx
index f14b732..cf7341e 100644
--- a/src/mwu_chunker_mod.cxx
+++ b/src/mwu_chunker_mod.cxx
@@ -44,18 +44,16 @@
#include "frog/mwu_chunker_mod.h"
using namespace std;
-using namespace TiCC;
-using namespace folia;
-#define LOG *Log(mwuLog)
+#define LOG *TiCC::Log(mwuLog)
-mwuAna::mwuAna( Word *fwrd, const string& txt, const string& glue_tag ){
+mwuAna::mwuAna( folia::Word *fwrd, const string& txt, const string& glue_tag ){
spec = false;
word = txt;
string tag;
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
- tag = fwrd->annotation<PosAnnotation>()->cls();
+ tag = fwrd->annotation<folia::PosAnnotation>()->cls();
}
spec = ( tag == glue_tag );
fwords.push_back( fwrd );
@@ -66,29 +64,35 @@ void mwuAna::merge( const mwuAna *add ){
delete add;
}
-EntitiesLayer *mwuAna::addEntity( const std::string& tagset,
- Sentence *sent, EntitiesLayer *el ){
+folia::EntitiesLayer *mwuAna::addEntity( const string& tagset,
+ const string& textclass,
+ folia::Sentence *sent,
+ folia::EntitiesLayer *el ){
if ( fwords.size() > 1 ){
if ( el == 0 ){
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
- KWargs args;
+ folia::KWargs args;
args["generate_id"] = sent->id();
- el = new EntitiesLayer( args, sent->doc() );
+ args["set"] = tagset;
+ el = new folia::EntitiesLayer( args, sent->doc() );
sent->append( el );
}
}
- KWargs args;
+ folia::KWargs args;
args["set"] = tagset;
args["generate_id"] = el->id();
- Entity *e=0;
-#pragma omp critical(foliaupdate)
+ if ( textclass != "current" ){
+ args["textclass"] = textclass;
+ }
+ folia::Entity *e=0;
+#pragma omp critical (foliaupdate)
{
- e = new Entity( args, el->doc() );
+ e = new folia::Entity( args, el->doc() );
el->append( e );
}
for ( const auto& fw : fwords ){
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
e->append( fw );
}
@@ -97,8 +101,8 @@ EntitiesLayer *mwuAna::addEntity( const std::string& tagset,
return el;
}
-Mwu::Mwu(LogStream * logstream){
- mwuLog = new LogStream( logstream, "mwu-" );
+Mwu::Mwu( TiCC::LogStream * logstream ){
+ mwuLog = new TiCC::LogStream( logstream, "mwu-" );
filter = 0;
}
@@ -115,15 +119,15 @@ void Mwu::reset(){
mWords.clear();
}
-void Mwu::add( Word *word ){
+void Mwu::add( folia::Word *word ){
UnicodeString tmp;
-#pragma omp critical(foliaupdate)
+#pragma omp critical (foliaupdate)
{
- tmp = word->text();
+ tmp = word->text( textclass );
}
if ( filter )
tmp = filter->filter( tmp );
- string txt = UnicodeToUTF8( tmp );
+ string txt = folia::UnicodeToUTF8( tmp );
mWords.push_back( new mwuAna( word, txt, glue_tag ) );
}
@@ -137,8 +141,8 @@ bool Mwu::read_mwus( const string& fname) {
string line;
while( getline( mwufile, line ) ) {
vector<string> res1, res2; //res1 has mwus and tags, res2 has ind. words
- if ( ( split_at(line, res1, " ") == 2 ) &&
- ( split_at(res1[0], res2, "_") >= 2 ) ){
+ if ( ( TiCC::split_at(line, res1, " ") == 2 ) &&
+ ( TiCC::split_at(res1[0], res2, "_") >= 2 ) ){
string key = res2[0];
res2.erase(res2.begin());
MWUs.insert( make_pair( key, res2 ) );
@@ -151,7 +155,7 @@ bool Mwu::read_mwus( const string& fname) {
return true;
}
-bool Mwu::init( const Configuration& config ) {
+bool Mwu::init( const TiCC::Configuration& config ) {
LOG << "initiating mwuChunker..." << endl;
debug = 0;
string val = config.lookUp( "debug", "mwu" );
@@ -201,6 +205,14 @@ bool Mwu::init( const Configuration& config ) {
glue_tag = val;
}
+ string cls = config.lookUp( "outputclass" );
+ if ( !cls.empty() ){
+ textclass = cls;
+ }
+ else {
+ textclass = "current";
+ }
+
return true;
}
@@ -210,17 +222,17 @@ ostream &operator<<( ostream& os, const Mwu& mwu ){
return os;
}
-void Mwu::addDeclaration( Document& doc ) const {
-#pragma omp critical(foliaupdate)
+void Mwu::addDeclaration( folia::Document& doc ) const {
+#pragma omp critical (foliaupdate)
{
- doc.declare( AnnotationType::ENTITY,
+ doc.declare( folia::AnnotationType::ENTITY,
mwu_tagset,
"annotator='frog-mwu-" + version
+ "', annotatortype='auto', datetime='" + getTime() + "'");
}
}
-void Mwu::Classify( const vector<Word*>& words ){
+void Mwu::Classify( const vector<folia::Word*>& words ){
if ( words.empty() ){
return;
}
@@ -229,14 +241,14 @@ void Mwu::Classify( const vector<Word*>& words ){
add( word );
}
Classify();
- EntitiesLayer *el = 0;
- Sentence *sent;
-#pragma omp critical(foliaupdate)
+ folia::EntitiesLayer *el = 0;
+ folia::Sentence *sent;
+#pragma omp critical (foliaupdate)
{
sent = words[0]->sentence();
}
for ( const auto& mword : mWords ){
- el = mword->addEntity( mwu_tagset, sent, el );
+ el = mword->addEntity( mwu_tagset, textclass, sent, el );
}
}
@@ -267,7 +279,6 @@ void Mwu::Classify(){
if ( debug ){
LOG << "checking word[" << i <<"]: " << word << endl;
}
- // pair<mymap2::iterator, mymap2::iterator> matches = MWUs.equal_range(word);
const auto matches = MWUs.equal_range(word);
if ( matches.first != MWUs.end() ) {
//match
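
The mwu module above gains the same 'outputclass' handling as Mbma::init() earlier in this commit: the configured class decides which FoLiA text class is read and written, falling back to "current" (the FoLiA default), and the class is only stored explicitly on new elements when it differs from that default. A sketch combining the two fragments from the hunks above; the helper names are hypothetical:

    #include <string>
    #include "ticcutils/Configuration.h"
    #include "libfolia/folia.h"

    // hypothetical helpers: how the configured text class flows into new entities
    std::string pick_textclass( const TiCC::Configuration& config ){
      std::string cls = config.lookUp( "outputclass" );
      return cls.empty() ? "current" : cls;   // "current" is the FoLiA default
    }

    folia::KWargs entity_args( const std::string& tagset,
                               const std::string& parent_id,
                               const std::string& textclass ){
      folia::KWargs args;
      args["set"] = tagset;
      args["generate_id"] = parent_id;
      if ( textclass != "current" ){
        // only record a textclass that differs from the default
        args["textclass"] = textclass;
      }
      return args;
    }
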
diff --git a/src/ner_prog.cxx b/src/ner_prog.cxx
index 71da5a1..2069b32 100644
--- a/src/ner_prog.cxx
+++ b/src/ner_prog.cxx
@@ -31,10 +31,7 @@
#include <string>
#include <iostream>
-#include <sstream>
-#include <fstream>
#include <vector>
-#include <map>
#include "config.h"
#include "ticcutils/LogStream.h"
@@ -42,21 +39,20 @@
#include "ticcutils/CommandLine.h"
#include "ticcutils/PrettyPrint.h"
#include "libfolia/folia.h"
-#include "frog/ucto_tokenizer_mod.h"
#include "mbt/MbtAPI.h"
+#include "frog/ucto_tokenizer_mod.h"
#include "frog/ner_tagger_mod.h"
using namespace std;
using namespace Timbl;
-using namespace TiCC;
using namespace Tagger;
-LogStream my_default_log( cerr, "", StampMessage ); // fall-back
-LogStream *theErrLog = &my_default_log; // fill the externals
+TiCC::LogStream my_default_log( cerr, "", StampMessage ); // fall-back
+TiCC::LogStream *theErrLog = &my_default_log; // fill the externals
vector<string> fileNames;
-Configuration configuration;
+TiCC::Configuration configuration;
static string configDir = string(SYSCONF_PATH) + "/" + PACKAGE + "/";
static string configFileName = configDir + "frog.cfg";
diff --git a/src/ner_tagger_mod.cxx b/src/ner_tagger_mod.cxx
index 38f0f9c..04e2c48 100644
--- a/src/ner_tagger_mod.cxx
+++ b/src/ner_tagger_mod.cxx
@@ -32,269 +32,212 @@
#include "mbt/MbtAPI.h"
#include "frog/Frog.h"
#include "ucto/unicode.h"
+#include "ticcutils/FileUtils.h"
+#include "ticcutils/StringOps.h"
#include "frog/ner_tagger_mod.h"
using namespace std;
-using namespace folia;
-using namespace TiCC;
using namespace Tagger;
+using TiCC::operator<<;
-const int KNOWN_NERS_SIZE = 10;
+#define LOG *TiCC::Log(tag_log)
-#define LOG *Log(nerLog)
+// should come from the config!
+const string cgn_tagset = "http://ilk.uvt.nl/folia/sets/frog-mbpos-cgn";
-NERTagger::NERTagger(TiCC::LogStream * logstream){
- tagger = 0;
- filter = 0;
- nerLog = new LogStream( logstream, "ner-" );
- known_ners.resize( KNOWN_NERS_SIZE+1 );
+NERTagger::NERTagger( TiCC::LogStream *l ):
+ BaseTagger( l, "NER" ),
+ max_ner_size(20)
+{ known_ners.resize( max_ner_size + 1 );
}
-NERTagger::~NERTagger(){
- delete tagger;
- delete nerLog;
- delete filter;
-}
-
-bool NERTagger::init( const Configuration& config ){
- debug = 0;
- string val = config.lookUp( "debug", "NER" );
- if ( val.empty() ){
- val = config.lookUp( "debug" );
- }
- if ( !val.empty() ){
- debug = TiCC::stringTo<int>( val );
- }
- switch ( debug ){
- case 0:
- case 1:
- nerLog->setlevel(LogNormal);
- break;
- case 2:
- case 3:
- case 4:
- nerLog->setlevel(LogDebug);
- break;
- case 5:
- case 6:
- case 7:
- nerLog->setlevel(LogHeavy);
- break;
- default:
- nerLog->setlevel(LogExtreme);
- }
- if (debug){
- LOG << "NER Tagger Init" << endl;
- }
- if ( tagger != 0 ){
- LOG << "NER Tagger is already initialized!" << endl;
- return false;
- }
- val = config.lookUp( "settings", "NER" );
- if ( val.empty() ){
- LOG << "Unable to find settings for NER" << endl;
+bool NERTagger::init( const TiCC::Configuration& config ){
+ if ( !BaseTagger::init( config ) ){
return false;
}
- string settings;
- if ( val[0] == '/' ){
- // an absolute path
- settings = val;
- }
- else {
- settings = config.configDir() + val;
- }
- val = config.lookUp( "version", "NER" );
- if ( val.empty() ){
- version = "1.0";
- }
- else {
- version = val;
- }
- val = config.lookUp( "set", "NER" );
- if ( val.empty() ){
- tagset = "http://ilk.uvt.nl/folia/sets/frog-ner-nl";
- }
- else {
- tagset = val;
- }
- string charFile = config.lookUp( "char_filter_file", "NER" );
- if ( charFile.empty() )
- charFile = config.lookUp( "char_filter_file" );
- if ( !charFile.empty() ){
- charFile = prefix( config.configDir(), charFile );
- filter = new Tokenizer::UnicodeFilter();
- filter->fill( charFile );
+ string val = config.lookUp( "max_ner_size", "NER" );
+ if ( !val.empty() ){
+ max_ner_size = TiCC::stringTo<int>( val );
}
val = config.lookUp( "known_ners", "NER" );
if ( !val.empty() ){
- string file_name;
- if ( val[0] == '/' ) {
- // an absolute path
- file_name = val;
- }
- else {
- file_name = config.configDir() + val;
- }
- if ( !fill_known_ners( file_name ) ){
- LOG << "Unable to fill known NER's from file: '" << file_name << "'" << endl;
+ if ( !read_gazets( val, config.configDir() ) ){
return false;
}
}
+ return true;
+}
- string init = "-s " + settings + " -vcf";
- tagger = new MbtAPI( init, *nerLog );
- return tagger->isInit();
+bool NERTagger::fill_ners( const string& cat,
+ const string& name,
+ const string& config_dir ){
+ string file_name = name;
+ if ( !TiCC::isFile( file_name ) ){
+ file_name = config_dir + "/" + name;
+ if ( !TiCC::isFile( file_name ) ){
+ LOG << "unable to load additional NE from file: " << file_name << endl;
+ return false;
+ }
+ }
+ ifstream is( file_name );
+ int long_err_cnt = 0;
+ size_t ner_cnt = 0;
+ string line;
+ while ( getline( is, line ) ){
+ if ( line.empty() || line[0] == '#' ){
+ continue;
+ }
+ else {
+ vector<string> parts;
+ size_t num = TiCC::split( line, parts );
+ if ( num > (unsigned)max_ner_size ){
+ // LOG << "expected 1 to " << max_ner_size
+ // << " SPACE-separated parts in line: '" << line
+ // << "'" << endl;
+ if ( ++long_err_cnt > 50 ){
+ LOG << "too many long entries in additional wordlist file. " << file_name << endl;
+ LOG << "consider raising the max_ner_size in the configuration. (now "
+ << max_ner_size << ")" << endl;
+ return false;
+ }
+ else {
+ // LOG << "ignoring entry" << endl;
+ continue;
+ }
+ }
+ // reconstruct the NER with single spaces
+ line = "";
+ for ( const auto& part : parts ){
+ line += part;
+ if ( &part != &parts.back() ){
+ line += " ";
+ }
+ }
+ known_ners[num][line].insert( cat );
+ ++ner_cnt;
+ }
+ }
+ LOG << "loaded " << ner_cnt << " additional " << cat
+ << " Named Entities from: " << file_name << endl;
+ return true;
}
-bool NERTagger::fill_known_ners( const string& file_name ){
+bool NERTagger::read_gazets( const string& name, const string& config_dir ){
+ string file_name = name;
+ if ( name[0] != '/' ) {
+ file_name = config_dir + "/" + file_name;
+ }
ifstream is( file_name );
if ( !is ){
+ LOG << "Unable to find Named Entities file " << file_name << endl;
return false;
}
+ LOG << "READ " << file_name << endl;
+ int err_cnt = 0;
+ size_t file_cnt = 0;
string line;
while ( getline( is, line ) ){
if ( line.empty() || line[0] == '#' ){
continue;
}
+ // we search for entries of the form 'category\tfilename'
vector<string> parts;
if ( TiCC::split_at( line, parts, "\t" ) != 2 ){
LOG << "expected 2 TAB-separated parts in line: '" << line << "'" << endl;
- return false;
- }
- line = parts[0];
- string ner_value = parts[1];
- size_t num = TiCC::split( line, parts );
- if ( num < 1 || num > KNOWN_NERS_SIZE ){
- LOG << "expected 1 to " << KNOWN_NERS_SIZE
- << " SPACE-separated parts in line: '" << line
- << "'" << endl;
- return false;
- }
- line = "";
- for ( const auto& part : parts ){
- line += part;
- if ( &part != &parts.back() ){
- line += " ";
+ if ( ++err_cnt > 50 ){
+ LOG << "too many errors in additional wordlist file: " << file_name << endl;
+ return false;
+ }
+ else {
+ LOG << "ignoring entry" << endl;
+ continue;
}
}
- known_ners[num][line] = ner_value;
- }
- return true;
-}
-
-size_t count_sp( const string& sentence, string::size_type pos ){
- int sp = 0;
- for ( string::size_type i=0; i < pos; ++i ){
- if ( sentence[i] == ' ' ){
- ++sp;
+ string cat = parts[0];
+ string file = parts[1];
+ if ( fill_ners( cat, file, config_dir ) ){
+ ++file_cnt;
}
}
- return sp;
-}
-
-void NERTagger::handle_known_ners( const vector<string>& words,
- vector<string>& tags ){
- if ( debug ){
- LOG << "search for known NER's" << endl;
- }
- string sentence = " ";
- for ( const auto& w : words ){
- sentence += w + " ";
+ if ( file_cnt < 1 ){
+ LOG << "unable to load any additional Named Entities." << endl;
+ return false;
}
- // so sentence starts AND ends with a space!
- if ( debug ){
- LOG << "Sentence = " << sentence << endl;
+ else {
+ LOG << "loaded " << file_cnt << " additional Named Entities files" << endl;
+ return true;
}
- for ( size_t i = KNOWN_NERS_SIZE; i > 0; --i ){
- auto const& mp = known_ners[i];
- if ( mp.empty() ){
- continue;
- }
- for( auto const& it : mp ){
- string blub = " " + it.first + " ";
- string::size_type pos = sentence.find( blub );
- while ( pos != string::npos ){
- size_t sp = count_sp( sentence, pos );
- if ( debug ){
- LOG << "matched '" << it.first << "' to '"
- << sentence << "' at position " << sp
- << " : " << it.second << endl;
- }
- bool safe = true;
- for ( size_t j=0; j < i && safe; ++j ){
- safe = ( tags[sp+j] == "O" );
- }
- if ( safe ){
- // we can safely change the tag (don't trample upon hits of longer known ners!)
- tags[sp] = "B-" + it.second;
- for ( size_t j=1; j < i; ++j ){
- tags[sp+j] = "I-" + it.second;
- }
- }
- pos = sentence.find( blub, pos + blub.length() );
+}
+
+static vector<string> serialize( const vector<set<string>>& stags ){
+ // for every non-empty set {el1,el2,..}, we compose a string like: el1+el2+...
+ vector<string> ambitags( stags.size(), "O" );
+ size_t pos = 0;
+ for ( const auto& it : stags ){
+ if ( !it.empty() ){
+ string res;
+ for ( const auto& s : it ){
+ res += s + "+";
}
+ ambitags[pos] = res;
+ // cerr << "set ambi[" << pos << " to " << res << endl;
}
+ ++pos;
}
+ return ambitags;
}
-void NERTagger::merge( const vector<string>& ktags, vector<string>& tags,
- vector<double>& conf ){
+vector<string> NERTagger::create_ner_list( const vector<string>& words ){
+ vector<set<string>> stags( words.size() );
if ( debug ){
- using TiCC::operator<<;
- LOG << "merge " << ktags << endl << "with " << tags << endl;
+ LOG << "search for known NER's" << endl;
}
- for ( size_t i=0; i < ktags.size(); ++i ){
- if ( ktags[i] == "O" ){
- if ( i > 0 && ktags[i-1] != "O" ){
- // so we did some merging. check that we aren't in the middle of some tag now
- size_t j = i;
- while ( j < tags.size() && tags[j][0] == 'I' ) {
- tags[j] = "O";
- ++j;
- }
+ for ( size_t j=0; j < words.size(); ++j ){
+ // cycle through the words
+ string seq;
+ size_t len = 1;
+ for ( size_t i = 0; i < min( words.size() - j, (size_t)max_ner_size); ++i ){
+ // start looking for sequences of length len
+ auto const& mp = known_ners[len++];
+ if ( mp.empty() ){
+ continue;
}
- continue;
- }
- else if ( ktags[i][0] == 'B' ){
- // maybe we landed in the middel of some tag.
- if ( tags[i][0] == 'I' ){
- //indeed, so erase it backwards
- size_t j = i;
- while ( tags[j][0] == 'I' ){
- tags[j] = "O";
- --j;
+ seq += words[j+i];
+ if ( debug ){
+ LOG << "sequence = '" << seq << "'" << endl;
+ }
+ auto const& tags = mp.find(seq);
+ if ( tags != mp.end() ){
+ if ( debug ){
+ LOG << "FOUND tags " << tags->first << "-" << tags->second << endl;
+ }
+ for ( size_t k = 0; k <= i; ++k ){
+ stags[k+j].insert( tags->second.begin(), tags->second.end() );
}
- tags[j] = "O";
}
- // now copy
- tags[i] = ktags[i];
- conf[i] = 1.0;
+ seq += " ";
}
- else {
- tags[i] = ktags[i];
- conf[i] = 1.0;
- }
- }
- if ( debug ){
- LOG << "Merge gave " << tags << endl;
}
+ return serialize( stags );
}
-static void addEntity( Sentence *sent,
+static void addEntity( folia::Sentence *sent,
const string& tagset,
- const vector<Word*>& words,
+ const vector<folia::Word*>& words,
const vector<double>& confs,
- const string& NER ){
- EntitiesLayer *el = 0;
+ const string& NER,
+ const string& textclass ){
+ folia::EntitiesLayer *el = 0;
#pragma omp critical(foliaupdate)
{
try {
- el = sent->annotation<EntitiesLayer>();
+ el = sent->annotation<folia::EntitiesLayer>(tagset);
}
catch(...){
- KWargs args;
+ folia::KWargs args;
args["generate_id"] = sent->id();
- el = new EntitiesLayer( args, sent->doc() );
+ args["set"] = tagset;
+ el = new folia::EntitiesLayer( args, sent->doc() );
sent->append( el );
}
}
@@ -303,15 +246,21 @@ static void addEntity( Sentence *sent,
c += val;
}
c /= confs.size();
- KWargs args;
+ folia::KWargs args;
args["class"] = NER;
- args["confidence"] = toString(c);
+ args["confidence"] = TiCC::toString(c);
args["set"] = tagset;
- args["generate_id"] = el->id();
- Entity *e = 0;
+ string parent_id = el->id();
+ if ( !parent_id.empty() ){
+ args["generate_id"] = el->id();
+ }
+ if ( textclass != "current" ){
+ args["textclass"] = textclass;
+ }
+ folia::Entity *e = 0;
#pragma omp critical(foliaupdate)
{
- e = new Entity( args, el->doc() );
+ e = new folia::Entity( args, el->doc() );
el->append( e );
}
for ( const auto& word : words ){
@@ -322,14 +271,14 @@ static void addEntity( Sentence *sent,
}
}
-void NERTagger::addNERTags( const vector<Word*>& words,
+void NERTagger::addNERTags( const vector<folia::Word*>& words,
const vector<string>& tags,
const vector<double>& confs ){
if ( words.empty() ) {
return;
}
- Sentence *sent = words[0]->sentence();
- vector<Word*> stack;
+ folia::Sentence *sent = words[0]->sentence();
+ vector<folia::Word*> stack;
vector<double> dstack;
string curNER;
for ( size_t i=0; i < tags.size(); ++i ){
@@ -345,7 +294,7 @@ void NERTagger::addNERTags( const vector<Word*>& words,
LOG << "ners " << stack << endl;
LOG << "confs " << dstack << endl;
}
- addEntity( sent, tagset, stack, dstack, curNER );
+ addEntity( sent, tagset, stack, dstack, curNER, textclass );
dstack.clear();
stack.clear();
}
@@ -370,7 +319,7 @@ void NERTagger::addNERTags( const vector<Word*>& words,
using TiCC::operator<<;
LOG << "spit out " << stack << endl;
}
- addEntity( sent, tagset, stack, dstack, curNER );
+ addEntity( sent, tagset, stack, dstack, curNER, textclass );
dstack.clear();
stack.clear();
}
@@ -385,86 +334,78 @@ void NERTagger::addNERTags( const vector<Word*>& words,
using TiCC::operator<<;
LOG << "spit out " << stack << endl;
}
- addEntity( sent, tagset, stack, dstack, curNER );
+ addEntity( sent, tagset, stack, dstack, curNER, textclass );
}
}
-void NERTagger::addDeclaration( Document& doc ) const {
+void NERTagger::addDeclaration( folia::Document& doc ) const {
#pragma omp critical(foliaupdate)
{
- doc.declare( AnnotationType::ENTITY,
+ doc.declare( folia::AnnotationType::ENTITY,
tagset,
"annotator='frog-ner-" + version
+ "', annotatortype='auto', datetime='" + getTime() + "'");
}
}
-void NERTagger::Classify( const vector<Word *>& swords ){
+void NERTagger::Classify( const vector<folia::Word *>& swords ){
if ( !swords.empty() ) {
vector<string> words;
- string sentence; // the tagger needs the whole sentence
- for ( const auto& sw : swords ){
- UnicodeString word;
-#pragma omp critical(foliaupdate)
- {
- word = sw->text();
+ vector<string> ptags;
+ extract_words_tags( swords, cgn_tagset, words, ptags );
+ vector<string> ktags = create_ner_list( words );
+ string text_block;
+ string prev = "_";
+ string prevN = "_";
+ for ( size_t i=0; i < swords.size(); ++i ){
+ string word = words[i];
+ string pos = ptags[i];
+ text_block += word + "\t" + prev + "\t" + pos + "\t";
+ prev = pos;
+ if ( i < swords.size() - 1 ){
+ text_block += ptags[i+1];
}
- if ( filter )
- word = filter->filter( word );
- sentence += UnicodeToUTF8(word);
- words.push_back( UnicodeToUTF8(word) );
- if ( &sw != &swords.back() ){
- sentence += " ";
+ else {
+ text_block += "_";
}
- }
- if (debug){
- LOG << "NER in: " << sentence << endl;
- }
- vector<TagResult> tagv = tagger->TagLine(sentence);
- if ( tagv.size() != swords.size() ){
- string out;
- for ( const auto& val : tagv ){
- out += val.word() + "//" + val.assignedTag() + " ";
+ string ktag = ktags[i];
+ text_block += "\t" + prevN + "\t" + ktag + "\t";
+ prevN = ktag;
+ if ( i < swords.size() - 1 ){
+ text_block += ktags[i+1];
}
- if ( debug ){
- LOG << "NER tagger is confused" << endl;
- LOG << "sentences was: '" << sentence << "'" << endl;
- LOG << "but tagged:" << endl;
- LOG << out << endl;
+ else {
+ text_block += "_";
}
- throw runtime_error( "NER failed: '" + sentence + "' ==> '" + out + "'" );
+
+ text_block += "\t??\n";
}
if ( debug ){
+ LOG << "TAGGING TEXT_BLOCK\n" << text_block << endl;
+ }
+ _tag_result = tagger->TagLine( text_block );
+ if ( debug ){
LOG << "NER tagger out: " << endl;
- for ( size_t i=0; i < tagv.size(); ++i ){
- LOG << "[" << i << "] : word=" << tagv[i].word()
- << " tag=" << tagv[i].assignedTag()
- << " confidence=" << tagv[i].confidence() << endl;
+ for ( size_t i=0; i < _tag_result.size(); ++i ){
+ LOG << "[" << i << "] : word=" << _tag_result[i].word()
+ << " tag=" << _tag_result[i].assignedTag()
+ << " confidence=" << _tag_result[i].confidence() << endl;
}
}
- vector<double> conf;
- vector<string> tags;
- for ( const auto& tag : tagv ){
- tags.push_back( tag.assignedTag() );
- conf.push_back( tag.confidence() );
- }
- vector<string> ktags( tagv.size(), "O" );
- handle_known_ners( words, ktags );
- merge( ktags, tags, conf );
- addNERTags( swords, tags, conf );
}
+ post_process( swords );
}
-vector<TagResult> NERTagger::tagLine( const string& line ){
- if ( tagger ){
- return tagger->TagLine(line);
- }
- throw runtime_error( "NERTagger is not initialized" );
+void NERTagger::post_process( const std::vector<folia::Word*>& swords ){
+ vector<string> tags;
+ vector<double> conf;
+ for ( const auto& tag : _tag_result ){
+ tags.push_back( tag.assignedTag() );
+ conf.push_back( tag.confidence() );
+ }
+ addNERTags( swords, tags, conf );
}
-string NERTagger::set_eos_mark( const string& eos ){
- if ( tagger ){
- return tagger->set_eos_mark(eos);
- }
- throw runtime_error( "NERTagger is not initialized" );
+bool NERTagger::Generate( const std::string& opt_line ){
+ return tagger->GenerateTagger( opt_line );
}
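
[Editor's note, not part of the patch] The rewritten NERTagger::Classify() above no longer hands MBT a plain sentence: it assembles a tab-separated feature block in which every line carries the word, the previous/current/next CGN POS tag, the previous/current/next gazetteer ("known NER") tag and a trailing "??" slot, and only post_process() turns the result into FoLiA entities. The standalone sketch below mirrors that column layout; build_feature_block and the Dutch sample data are invented for illustration and are not part of the frog API.

// Standalone illustration of the feature block built by the new
// NERTagger::Classify(); only the column layout follows the patch above.
#include <iostream>
#include <string>
#include <vector>

static std::string build_feature_block( const std::vector<std::string>& words,
                                        const std::vector<std::string>& ptags,
                                        const std::vector<std::string>& ktags ){
  std::string text_block;
  std::string prev = "_";
  std::string prevN = "_";
  for ( size_t i = 0; i < words.size(); ++i ){
    // word, previous POS, current POS, next POS ("_" at the sentence edges)
    text_block += words[i] + "\t" + prev + "\t" + ptags[i] + "\t";
    prev = ptags[i];
    text_block += ( i + 1 < words.size() ) ? ptags[i+1] : "_";
    // previous, current and next gazetteer tag, then the "??" slot
    text_block += "\t" + prevN + "\t" + ktags[i] + "\t";
    prevN = ktags[i];
    text_block += ( i + 1 < words.size() ) ? ktags[i+1] : "_";
    text_block += "\t??\n";
  }
  return text_block;
}

int main(){
  std::vector<std::string> words = { "Jan", "woont", "in", "Nijmegen" };
  std::vector<std::string> ptags = { "N(eigen)", "WW", "VZ", "N(eigen)" };
  std::vector<std::string> ktags = { "B-per", "O", "O", "B-loc" };
  std::cout << build_feature_block( words, ptags, ktags );
}
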
diff --git a/src/pos_tagger_mod.cxx b/src/pos_tagger_mod.cxx
deleted file mode 100644
index 4776070..0000000
--- a/src/pos_tagger_mod.cxx
+++ /dev/null
@@ -1,295 +0,0 @@
-/* ex: set tabstop=8 expandtab: */
-/*
- Copyright (c) 2006 - 2017
- CLST - Radboud University
- ILK - Tilburg University
-
- This file is part of frog:
-
- A Tagger-Lemmatizer-Morphological-Analyzer-Dependency-Parser for
- several languages
-
- frog is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 3 of the License, or
- (at your option) any later version.
-
- frog is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-
- For questions and suggestions, see:
- https://github.com/LanguageMachines/frog/issues
- or send mail to:
- lamasoftware (at ) science.ru.nl
-
-*/
-
-#include "mbt/MbtAPI.h"
-#include "ucto/unicode.h"
-#include "frog/Frog.h"
-#include "frog/ucto_tokenizer_mod.h"
-#include "frog/pos_tagger_mod.h"
-
-using namespace std;
-using namespace folia;
-using namespace TiCC;
-using namespace Tagger;
-
-
-#define LOG *Log(tag_log)
-
-POSTagger::POSTagger(TiCC::LogStream * logstream){
- debug = 0;
- tagger = 0;
- filter = 0;
- tag_log = new LogStream( logstream, "pos-tagger-" );
-}
-
-POSTagger::~POSTagger(){
- delete tagger;
- delete filter;
- delete tag_log;
-}
-
-bool fill_set( const string& file, set<string>& st ){
- ifstream is( file );
- if ( !is ){
- return false;
- }
- string line;
- while( getline( is, line ) ){
- if ( line.empty() || line[0] == '#' )
- continue;
- line = TiCC::trim( line );
- st.insert( line );
- }
- return true;
-}
-
-bool POSTagger::fill_map( const string& file, map<string,string>& mp ){
- ifstream is( file );
- if ( !is ){
- return false;
- }
- string line;
- while( getline( is, line ) ){
- if ( line.empty() || line[0] == '#' )
- continue;
- vector<string> parts;
- size_t num = TiCC::split_at( line, parts, "\t" );
- if ( num != 2 ){
- LOG << "invalid entry in '" << file << "'" << endl;
- LOG << "expected 2 tab-separated values, but got: '"
- << line << "'" << endl;
- return false;
- }
- mp[ parts[0] ] = parts[1];
- }
- return true;
-}
-
-bool POSTagger::init( const Configuration& config ){
- debug = 0;
- string val = config.lookUp( "debug", "tagger" );
- if ( val.empty() ){
- val = config.lookUp( "debug" );
- }
- if ( !val.empty() ){
- debug = TiCC::stringTo<int>( val );
- }
- switch ( debug ){
- case 0:
- case 1:
- break;
- case 2:
- case 3:
- case 4:
- tag_log->setlevel(LogDebug);
- break;
- case 5:
- case 6:
- case 7:
- tag_log->setlevel(LogHeavy);
- break;
- default:
- tag_log->setlevel(LogExtreme);
- }
- if ( tagger != 0 ){
- LOG << "POS-Tagger is already initialized!" << endl;
- return false;
- }
- val = config.lookUp( "settings", "tagger" );
- if ( val.empty() ){
- LOG << "Unable to find settings for Tagger" << endl;
- return false;
- }
- string settings;
- if ( val[0] == '/' ) // an absolute path
- settings = val;
- else
- settings = config.configDir() + val;
-
- val = config.lookUp( "version", "tagger" );
- if ( val.empty() ){
- version = "1.0";
- }
- else
- version = val;
- val = config.lookUp( "set", "tagger" );
- if ( val.empty() ){
- LOG << "missing set declaration in config" << endl;
- return false;
- }
- else {
- tagset = val;
- }
- string charFile = config.lookUp( "char_filter_file", "tagger" );
- if ( charFile.empty() )
- charFile = config.lookUp( "char_filter_file" );
- if ( !charFile.empty() ){
- charFile = prefix( config.configDir(), charFile );
- filter = new Tokenizer::UnicodeFilter();
- filter->fill( charFile );
- }
- string tokFile = config.lookUp( "token_trans_file", "tagger" );
- if ( tokFile.empty() )
- tokFile = config.lookUp( "token_trans_file" );
- if ( !tokFile.empty() ){
- tokFile = prefix( config.configDir(), tokFile );
- if ( !fill_map( tokFile, token_tag_map ) ){
- LOG << "failed to load a token translation file from: '"
- << tokFile << "'"<< endl;
- return false;
- }
- }
- string tagsFile = config.lookUp( "tags_file", "tagger" );
- if ( tagsFile.empty() )
- tagsFile = config.lookUp( "tags_file" );
- if ( !tagsFile.empty() ){
- tagsFile = prefix( config.configDir(), tagsFile );
- if ( !fill_set( tagsFile, valid_tags ) ){
- LOG << "failed to load a tags file from: '"
- << tagsFile << "'"<< endl;
- return false;
- }
- }
- string init = "-s " + settings + " -vcf";
- tagger = new MbtAPI( init, *tag_log );
- return tagger->isInit();
-}
-
-void POSTagger::addTag( Word *word,
- const string& inputTag,
- double confidence,
- bool /*known NOT USED yet*/ ){
- string pos_tag = inputTag;
- string ucto_class = word->cls();
- if ( debug ){
- LOG << "lookup ucto class= " << ucto_class << endl;
- }
- auto const tt = token_tag_map.find( ucto_class );
- if ( tt != token_tag_map.end() ){
- if ( debug ){
- LOG << "found translation ucto class= " << ucto_class
- << " to POS-Tag=" << tt->second << endl;
- }
- pos_tag = tt->second;
- confidence = 1.0;
- }
- KWargs args;
- args["set"] = tagset;
- args["class"] = pos_tag;
- args["confidence"]= toString(confidence);
-#pragma omp critical(foliaupdate)
- {
- word->addPosAnnotation( args );
- }
- // folia::FoliaElement *pos = 0;
- //#pragma omp critical(foliaupdate)
- // {
- // pos = word->addPosAnnotation( args );
- // }
- // if ( !known ){
- // args.clear();
- // args["class"] = "yes";
- // args["subset"] = "unknown_word";
- // folia::Feature *feat = new folia::Feature( args );
- // pos->append( feat );
- // }
-}
-
-void POSTagger::addDeclaration( Document& doc ) const {
-#pragma omp critical(foliaupdate)
- {
- doc.declare( AnnotationType::POS, tagset,
- "annotator='frog-mbpos-" + version
- + "', annotatortype='auto', datetime='" + getTime() + "'");
- }
-}
-
-vector<TagResult> POSTagger::tagLine( const string& line ){
- if ( tagger )
- return tagger->TagLine(line);
- else
- throw runtime_error( "POSTagger is not initialized" );
-}
-
-string POSTagger::set_eos_mark( const std::string& eos ){
- if ( tagger )
- return tagger->set_eos_mark( eos );
- else
- throw runtime_error( "POSTagger is not initialized" );
-}
-
-void POSTagger::Classify( const vector<Word*>& swords ){
- if ( !swords.empty() ) {
- string sentence; // the tagger needs the whole sentence
- for ( size_t w = 0; w < swords.size(); ++w ) {
- UnicodeString word;
-#pragma omp critical(foliaupdate)
- {
- word = swords[w]->text();
- }
- if ( filter )
- word = filter->filter( word );
- sentence += UnicodeToUTF8(word);
- if ( w < swords.size()-1 )
- sentence += " ";
- }
- if (debug){
- LOG << "POS tagger in: " << sentence << endl;
- }
- vector<TagResult> tagv = tagger->TagLine(sentence);
- if ( tagv.size() != swords.size() ){
- LOG << "mismatch between number of <w> tags and the tagger result." << endl;
- LOG << "words according to <w> tags: " << endl;
- for ( size_t w = 0; w < swords.size(); ++w ) {
- LOG << "w[" << w << "]= " << swords[w]->str() << endl;
- }
- LOG << "words according to POS tagger: " << endl;
- for ( size_t i=0; i < tagv.size(); ++i ){
- LOG << "word[" << i << "]=" << tagv[i].word() << endl;
- }
- throw runtime_error( "POS tagger is confused" );
- }
- if ( debug ){
- LOG << "POS tagger out: " << endl;
- for ( size_t i=0; i < tagv.size(); ++i ){
- LOG << "[" << i << "] : word=" << tagv[i].word()
- << " tag=" << tagv[i].assignedTag()
- << " confidence=" << tagv[i].confidence() << endl;
- }
- }
- for ( size_t i=0; i < tagv.size(); ++i ){
- addTag( swords[i],
- tagv[i].assignedTag(),
- tagv[i].confidence(),
- tagv[i].isKnown() );
- }
- }
-}
diff --git a/src/tagger_base.cxx b/src/tagger_base.cxx
new file mode 100644
index 0000000..9760e22
--- /dev/null
+++ b/src/tagger_base.cxx
@@ -0,0 +1,275 @@
+/* ex: set tabstop=8 expandtab: */
+/*
+ Copyright (c) 2006 - 2017
+ CLST - Radboud University
+ ILK - Tilburg University
+
+ This file is part of frog:
+
+ A Tagger-Lemmatizer-Morphological-Analyzer-Dependency-Parser for
+ several languages
+
+ frog is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ frog is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ For questions and suggestions, see:
+ https://github.com/LanguageMachines/frog/issues
+ or send mail to:
+ lamasoftware (at ) science.ru.nl
+
+*/
+
+#include <algorithm>
+#include "mbt/MbtAPI.h"
+#include "ucto/unicode.h"
+#include "frog/Frog.h"
+#include "frog/ucto_tokenizer_mod.h"
+#include "frog/tagger_base.h"
+
+using namespace std;
+using namespace Tagger;
+
+
+#define LOG *TiCC::Log(tag_log)
+
+BaseTagger::BaseTagger( TiCC::LogStream *logstream, const string& label ){
+ debug = 0;
+ tagger = 0;
+ filter = 0;
+ _label = label;
+ tag_log = new TiCC::LogStream( logstream, _label + "-tagger-" );
+}
+
+BaseTagger::~BaseTagger(){
+ delete tagger;
+ delete filter;
+ delete tag_log;
+}
+
+bool fill_set( const string& file, set<string>& st ){
+ ifstream is( file );
+ if ( !is ){
+ return false;
+ }
+ string line;
+ while( getline( is, line ) ){
+ if ( line.empty() || line[0] == '#' )
+ continue;
+ line = TiCC::trim( line );
+ st.insert( line );
+ }
+ return true;
+}
+
+bool BaseTagger::fill_map( const string& file, map<string,string>& mp ){
+ ifstream is( file );
+ if ( !is ){
+ return false;
+ }
+ string line;
+ while( getline( is, line ) ){
+ if ( line.empty() || line[0] == '#' )
+ continue;
+ vector<string> parts;
+ size_t num = TiCC::split_at( line, parts, "\t" );
+ if ( num != 2 ){
+ LOG << "invalid entry in '" << file << "'" << endl;
+ LOG << "expected 2 tab-separated values, but got: '"
+ << line << "'" << endl;
+ return false;
+ }
+ mp[ parts[0] ] = parts[1];
+ }
+ return true;
+}
+
+bool BaseTagger::init( const TiCC::Configuration& config ){
+ debug = 0;
+ if ( tagger != 0 ){
+ LOG << _label << "-tagger is already initialized!" << endl;
+ return false;
+ }
+ string val = config.lookUp( "debug", _label );
+ if ( val.empty() ){
+ val = config.lookUp( "debug" );
+ }
+ if ( !val.empty() ){
+ debug = TiCC::stringTo<int>( val );
+ }
+ switch ( debug ){
+ case 0:
+ case 1:
+ break;
+ case 2:
+ case 3:
+ case 4:
+ tag_log->setlevel(LogDebug);
+ break;
+ case 5:
+ case 6:
+ case 7:
+ tag_log->setlevel(LogHeavy);
+ break;
+ default:
+ tag_log->setlevel(LogExtreme);
+ }
+ val = config.lookUp( "settings", _label );
+ if ( val.empty() ){
+ LOG << "Unable to find settings for: " << _label << endl;
+ return false;
+ }
+ string settings;
+ if ( val[0] == '/' ) // an absolute path
+ settings = val;
+ else
+ settings = config.configDir() + val;
+
+ val = config.lookUp( "version", _label );
+ if ( val.empty() ){
+ version = "1.0";
+ }
+ else
+ version = val;
+ val = config.lookUp( "set", _label );
+ if ( val.empty() ){
+ LOG << "missing 'set' declaration in config" << endl;
+ return false;
+ }
+ else {
+ tagset = val;
+ }
+ string charFile = config.lookUp( "char_filter_file", _label );
+ if ( charFile.empty() )
+ charFile = config.lookUp( "char_filter_file" );
+ if ( !charFile.empty() ){
+ charFile = prefix( config.configDir(), charFile );
+ filter = new Tokenizer::UnicodeFilter();
+ filter->fill( charFile );
+ }
+ string tokFile = config.lookUp( "token_trans_file", _label );
+ if ( tokFile.empty() )
+ tokFile = config.lookUp( "token_trans_file" );
+ if ( !tokFile.empty() ){
+ tokFile = prefix( config.configDir(), tokFile );
+ if ( !fill_map( tokFile, token_tag_map ) ){
+ LOG << "failed to load a token translation file from: '"
+ << tokFile << "'"<< endl;
+ return false;
+ }
+ }
+ string cls = config.lookUp( "outputclass" );
+ if ( !cls.empty() ){
+ textclass = cls;
+ }
+ else {
+ textclass = "current";
+ }
+ if ( debug ){
+ LOG << _label << "-taggger textclass= " << textclass << endl;
+ }
+ string init = "-s " + settings + " -vcf";
+ tagger = new MbtAPI( init, *tag_log );
+ return tagger->isInit();
+}
+
+vector<TagResult> BaseTagger::tagLine( const string& line ){
+ if ( tagger )
+ return tagger->TagLine(line);
+ else
+ throw runtime_error( _label + "-tagger is not initialized" );
+}
+
+string BaseTagger::set_eos_mark( const std::string& eos ){
+ if ( tagger )
+ return tagger->set_eos_mark( eos );
+ else
+ throw runtime_error( _label + "-tagger is not initialized" );
+}
+
+string BaseTagger::extract_sentence( const vector<folia::Word*>& swords,
+ vector<string>& words ){
+ words.clear();
+ string sentence;
+ for ( const auto& sword : swords ){
+ UnicodeString word;
+#pragma omp critical (foliaupdate)
+ {
+ word = sword->text( textclass );
+ }
+ if ( filter )
+ word = filter->filter( word );
+ string word_s = folia::UnicodeToUTF8( word );
+ // the word may contain spaces, remove them all!
+ word_s.erase(remove_if(word_s.begin(), word_s.end(), ::isspace), word_s.end());
+ sentence += word_s;
+ if ( &sword != &swords.back() ){
+ sentence += " ";
+ }
+ }
+ return sentence;
+}
+
+void BaseTagger::extract_words_tags( const vector<folia::Word *>& swords,
+ const string& tagset,
+ vector<string>& words,
+ vector<string>& ptags ){
+ for ( size_t i=0; i < swords.size(); ++i ){
+ folia::Word *sw = swords[i];
+ folia::PosAnnotation *postag = 0;
+ UnicodeString word;
+#pragma omp critical(foliaupdate)
+ {
+ word = sw->text( textclass );
+ postag = sw->annotation<folia::PosAnnotation>( tagset );
+ }
+ if ( filter ){
+ word = filter->filter( word );
+ }
+ // the word may contain spaces, remove them all!
+ string word_s = folia::UnicodeToUTF8( word );
+ word_s.erase(remove_if(word_s.begin(), word_s.end(), ::isspace), word_s.end());
+ words.push_back( word_s );
+ ptags.push_back( postag->cls() );
+ }
+}
+
+void BaseTagger::Classify( const vector<folia::Word*>& swords ){
+ if ( !swords.empty() ) {
+ string sentence = extract_sentence( swords, _words );
+ if (debug){
+ LOG << _label << "-tagger in: " << sentence << endl;
+ }
+ _tag_result = tagger->TagLine(sentence);
+ if ( _tag_result.size() != swords.size() ){
+ LOG << _label << "-tagger mismatch between number of <w> tags and the tagger result." << endl;
+ LOG << "words according to <w> tags: " << endl;
+ for ( size_t w = 0; w < swords.size(); ++w ) {
+ LOG << "w[" << w << "]= " << swords[w]->str( textclass ) << endl;
+ }
+ LOG << "words according to " << _label << "-tagger: " << endl;
+ for ( size_t i=0; i < _tag_result.size(); ++i ){
+ LOG << "word[" << i << "]=" << _tag_result[i].word() << endl;
+ }
+ throw runtime_error( _label + "-tagger is confused" );
+ }
+ if ( debug ){
+ LOG << _label + "-tagger out: " << endl;
+ for ( size_t i=0; i < _tag_result.size(); ++i ){
+ LOG << "[" << i << "] : word=" << _tag_result[i].word()
+ << " tag=" << _tag_result[i].assignedTag()
+ << " confidence=" << _tag_result[i].confidence() << endl;
+ }
+ }
+ post_process( swords );
+ }
+}
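
[Editor's note, not part of the patch] The new tagger_base.cxx factors the MBT plumbing that pos_tagger_mod.cxx, iob_tagger_mod.cxx and ner_tagger_mod.cxx used to duplicate into one BaseTagger: Classify() extracts the words, runs the tagger, stores the outcome in _tag_result and then defers to a module-specific post_process(), as NERTagger::post_process() earlier in this diff shows. A minimal standalone model of that division of labour is sketched below; the class and member names are invented stand-ins and the tagger call is stubbed, so this is not the real frog or MBT API.

// Simplified standalone model of the BaseTagger / derived-tagger split.
#include <iostream>
#include <string>
#include <vector>

struct TagResult { std::string word, tag; double confidence; };

class BaseTaggerModel {
public:
  explicit BaseTaggerModel( std::string label ) : _label(std::move(label)) {}
  virtual ~BaseTaggerModel() = default;

  // Shared driver: build the tagger input, run the (stubbed) tagger,
  // then let the concrete module interpret the result.
  void Classify( const std::vector<std::string>& words ){
    if ( words.empty() ) return;
    _tag_result.clear();
    for ( const auto& w : words ){
      _tag_result.push_back( { w, _label + "-TAG", 1.0 } ); // stand-in for MbtAPI::TagLine
    }
    post_process( words );
  }

protected:
  virtual void post_process( const std::vector<std::string>& words ) = 0;
  std::string _label;
  std::vector<TagResult> _tag_result;
};

class DemoNerTagger : public BaseTaggerModel {
public:
  DemoNerTagger() : BaseTaggerModel( "NER" ) {}
protected:
  void post_process( const std::vector<std::string>& ) override {
    // the real NERTagger::post_process() converts _tag_result into
    // FoLiA entity layers; here we just print the pairs
    for ( const auto& tr : _tag_result ){
      std::cout << tr.word << " -> " << tr.tag << "\n";
    }
  }
};

int main(){
  DemoNerTagger ner;
  ner.Classify( { "Jan", "woont", "in", "Nijmegen" } );
}
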
diff --git a/src/ucto_tokenizer_mod.cxx b/src/ucto_tokenizer_mod.cxx
index d8df81c..0b6d517 100644
--- a/src/ucto_tokenizer_mod.cxx
+++ b/src/ucto_tokenizer_mod.cxx
@@ -29,7 +29,6 @@
*/
-#include <map>
#include <string>
#include "timbl/TimblAPI.h"
#include "frog/Frog.h"
@@ -38,13 +37,13 @@
#include "frog/ucto_tokenizer_mod.h"
using namespace std;
-using namespace TiCC;
+using TiCC::operator<<;
-#define LOG *Log(uctoLog)
+#define LOG *TiCC::Log(uctoLog)
-UctoTokenizer::UctoTokenizer(LogStream * logstream) {
+UctoTokenizer::UctoTokenizer( TiCC::LogStream * logstream ) {
tokenizer = 0;
- uctoLog = new LogStream( logstream, "tok-" );
+ uctoLog = new TiCC::LogStream( logstream, "tok-" );
}
string resolve_configdir( const string& rules_name, const string& dir ){
@@ -59,7 +58,7 @@ string resolve_configdir( const string& rules_name, const string& dir ){
return rules_name;
}
-bool UctoTokenizer::init( const Configuration& config ){
+bool UctoTokenizer::init( const TiCC::Configuration& config ){
if ( tokenizer )
throw runtime_error( "ucto tokenizer is already initialized" );
tokenizer = new Tokenizer::TokenizerClass();
@@ -78,8 +77,7 @@ bool UctoTokenizer::init( const Configuration& config ){
string languages = config.lookUp( "languages", "tokenizer" );
vector<string> language_list;
if ( !languages.empty() ){
- split_at( languages, language_list, "," );
- using TiCC::operator<<;
+ language_list = TiCC::split_at( languages, "," );
LOG << "Language List =" << language_list << endl;
}
if ( tokenizer->getPassThru() ){
@@ -111,7 +109,10 @@ bool UctoTokenizer::init( const Configuration& config ){
}
}
}
-
+ string textredundancy = config.lookUp( "textredundancy", "tokenizer" );
+ if ( !textredundancy.empty() ){
+ tokenizer->setTextRedundancy( textredundancy );
+ }
tokenizer->setEosMarker( "" );
tokenizer->setVerbose( false );
tokenizer->setSentenceDetection( true ); //detection of sentences
@@ -152,10 +153,19 @@ void UctoTokenizer::setInputEncoding( const std::string & enc ){
throw runtime_error( "ucto tokenizer not initialized" );
}
-void UctoTokenizer::setTextClass( const std::string& cls ){
+void UctoTokenizer::setInputClass( const std::string& cls ){
if ( tokenizer ){
if ( !cls.empty() )
- tokenizer->setTextClass( cls );
+ tokenizer->setInputClass( cls );
+ }
+ else
+ throw runtime_error( "ucto tokenizer not initialized" );
+}
+
+void UctoTokenizer::setOutputClass( const std::string& cls ){
+ if ( tokenizer ){
+ if ( !cls.empty() )
+ tokenizer->setOutputClass( cls );
}
else
throw runtime_error( "ucto tokenizer not initialized" );
@@ -178,6 +188,14 @@ void UctoTokenizer::setInputXml( bool b ){
throw runtime_error( "ucto tokenizer not initialized" );
}
+void UctoTokenizer::setTextRedundancy( const string& tr ) {
+ if ( tokenizer ){
+ tokenizer->setTextRedundancy( tr );
+ }
+ else
+ throw runtime_error( "ucto tokenizer not initialized" );
+}
+
void UctoTokenizer::setPassThru( const bool b ) {
if ( tokenizer ){
tokenizer->setPassThru( b );
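
[Editor's note, not part of the patch] The UctoTokenizer changes above split setTextClass into a setInputClass/setOutputClass pair and add a setTextRedundancy forwarder; each new method follows the module's usual guard-or-throw shape around the wrapped tokenizer. The stripped-down standalone model below shows only that shape; TokenizerStub and TokenizerFacade are illustrative and not the real ucto or frog classes.

// Standalone model of the guard-or-throw forwarding setters.
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>

struct TokenizerStub {
  void setInputClass( const std::string& c )  { input_class = c; }
  void setOutputClass( const std::string& c ) { output_class = c; }
  void setTextRedundancy( const std::string& t ) { redundancy = t; }
  std::string input_class = "current", output_class = "current", redundancy = "minimal";
};

class TokenizerFacade {
public:
  void init() { tokenizer = std::make_unique<TokenizerStub>(); }

  void setInputClass( const std::string& cls ){
    if ( !tokenizer ) throw std::runtime_error( "tokenizer not initialized" );
    if ( !cls.empty() ) tokenizer->setInputClass( cls );
  }
  void setOutputClass( const std::string& cls ){
    if ( !tokenizer ) throw std::runtime_error( "tokenizer not initialized" );
    if ( !cls.empty() ) tokenizer->setOutputClass( cls );
  }
  void setTextRedundancy( const std::string& tr ){
    if ( !tokenizer ) throw std::runtime_error( "tokenizer not initialized" );
    tokenizer->setTextRedundancy( tr );
  }
private:
  std::unique_ptr<TokenizerStub> tokenizer;
};

int main(){
  TokenizerFacade tok;
  tok.init();
  tok.setInputClass( "OCR" );      // read text from the "OCR" class
  tok.setOutputClass( "current" ); // write tokenized text to "current"
  tok.setTextRedundancy( "full" );
  std::cout << "configured\n";
}
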
diff --git a/tests/Makefile.in b/tests/Makefile.in
index 1a431fb..aec23c3 100644
--- a/tests/Makefile.in
+++ b/tests/Makefile.in
@@ -92,8 +92,7 @@ build_triplet = @build@
host_triplet = @host@
subdir = tests
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \
- $(top_srcdir)/m4/ax_lib_readline.m4 \
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_lib_readline.m4 \
$(top_srcdir)/m4/ax_pthread.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
@@ -158,13 +157,7 @@ EXEEXT = @EXEEXT@
FGREP = @FGREP@
GREP = @GREP@
ICU_CFLAGS = @ICU_CFLAGS@
-ICU_CONFIG = @ICU_CONFIG@
-ICU_CPPSEARCHPATH = @ICU_CPPSEARCHPATH@
-ICU_CXXFLAGS = @ICU_CXXFLAGS@
-ICU_IOLIBS = @ICU_IOLIBS@
-ICU_LIBPATH = @ICU_LIBPATH@
ICU_LIBS = @ICU_LIBS@
-ICU_VERSION = @ICU_VERSION@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -208,7 +201,6 @@ SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
STRIP = @STRIP@
-SYSCONFDIR = @SYSCONFDIR@
VERSION = @VERSION@
XML2_CFLAGS = @XML2_CFLAGS@
XML2_LIBS = @XML2_LIBS@
@@ -261,10 +253,10 @@ mbt_LIBS = @mbt_LIBS@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
-pkgconfigpath = @pkgconfigpath@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
+runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@