[ucto] 66/69: Imported Upstream version 0.5.3

Joost van Baal joostvb at moszumanska.debian.org
Wed Sep 3 17:14:47 UTC 2014


This is an automated email from the git hooks/post-receive script.

joostvb pushed a commit to branch master
in repository ucto.

commit 2ebceaf679e9cffd9ff5215032a1f78ad3233af1
Author: Joost van Baal-Ilić <joostvb at nusku.mdcc.cx>
Date:   Wed Sep 3 19:00:16 2014 +0200

    Imported Upstream version 0.5.3
---
 AUTHORS                               |    1 +
 ChangeLog                             |  642 +++++++++++++++++
 INSTALL                               |    9 +-
 Makefile.in                           |   88 ++-
 NEWS                                  |   11 +
 aclocal.m4                            |   69 +-
 bootstrap                             |    6 +-
 config.guess                          |   49 +-
 config.h.in                           |    3 -
 config.sub                            |   65 +-
 config/Makefile.am                    |    6 +-
 config/Makefile.in                    |   49 +-
 config/es.abr                         |  205 ++++++
 config/nl_afk.abr                     |  463 ++++++++++++
 config/smiley.rule                    |    3 +-
 config/standard-quotes.quote          |    3 +-
 config/tokconfig-de                   |   23 +-
 config/tokconfig-en                   |  107 +--
 config/{tokconfig-de => tokconfig-es} |  161 +----
 config/tokconfig-fr                   |  106 +--
 config/tokconfig-fy                   |    9 +-
 config/tokconfig-it                   |    3 +-
 config/tokconfig-nl                   |  506 +------------
 config/tokconfig-nl-sonarchat         |  472 +------------
 config/tokconfig-nl-twitter           |  495 +------------
 configure                             |  298 ++++----
 configure.ac                          |   21 +-
 depcomp                               |   74 +-
 docs/Makefile.in                      |   43 +-
 docs/ucto.1                           |   37 +-
 include/Makefile.in                   |   33 +-
 include/ucto/Makefile.in              |   43 +-
 include/ucto/tokenize.h               |  137 ++--
 include/ucto/unicode.h                |   10 +-
 ltmain.sh                             |   95 ++-
 m4/Makefile.in                        |   33 +-
 m4/ac_define_dir.m4                   |   45 --
 m4/ltversion.m4                       |   10 +-
 missing                               |   53 +-
 src/Makefile.am                       |    5 +-
 src/Makefile.in                       |   61 +-
 src/tokenize.cxx                      | 1255 ++++++++++++++++++---------------
 src/ucto.cxx                          |   13 +-
 src/unicode.cxx                       |    4 +-
 tests/Makefile.in                     |   33 +-
 45 files changed, 2930 insertions(+), 2927 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 99e309e..bc94e27 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,2 +1,3 @@
 Maarten van Gompel
 Ko van der Sloot
+Folgert Karsdorp
diff --git a/ChangeLog b/ChangeLog
index dfc8b95..2e0bcb3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,645 @@
+2013-04-03 15:55  sloot
+
+	* [r15925] src/Makefile.am: increment current library version
+
+2013-04-03 14:36  sloot
+
+	* [r15918] tests/test.nl.tok.V: adapt to new tokenisation rules
+
+2013-04-03 13:57  sloot
+
+	* [r15910] include/ucto/tokenize.h, src/tokenize.cxx, src/ucto.cxx,
+	  src/unicode.cxx: changed folia include path
+
+2013-04-03 13:03  sloot
+
+	* [r15905] NEWS, configure.ac: NEWS
+
+2013-04-02 16:09  sloot
+
+	* [r15885] configure.ac: we need the most recent ticcutile
+
+2013-04-02 15:51  sloot
+
+	* [r15875] include/ucto/tokenize.h, src/tokenize.cxx, src/ucto.cxx,
+	  src/unicode.cxx: modified folia include path
+
+2013-04-02 08:55  sloot
+
+	* [r15851] include/ucto/Makefile.am, src/tst.sh: tags
+
+2013-03-25 16:09  sloot
+
+	* [r15810] src/tokenize.cxx: fixed WORD-WITHSUFFIX
+
+2013-03-25 14:38  sloot
+
+	* [r15809] src/tokenize.cxx: more quote hacking
+
+2013-03-25 14:20  sloot
+
+	* [r15807] tests/testquotes.ok, tests/testslash.ok: some rule neams
+	  have changed
+
+2013-03-25 14:18  sloot
+
+	* [r15806] config/tokconfig-nl: small edits
+
+2013-03-25 13:45  sloot
+
+	* [r15805] config/tokconfig-nl, src/tokenize.cxx,
+	  tests/testquotes.ok, tests/testslash.ok: some rexexp patches for
+	  words like A50, vanessa23 etc.
+	  adaptes tests
+
+2013-03-25 12:19  sloot
+
+	* [r15803] config/tokconfig-nl: more quotes for '40 etc.
+
+2013-03-25 11:20  sloot
+
+	* [r15802] config/tokconfig-nl: small fix for prime's/quotes
+
+2013-03-21 14:03  sloot
+
+	* [r15797] src/tokenize.cxx: added small hack to appen 'dangling'
+	  quotes to the previous sentence.
+	  This avoids sentences with only a ' or a ". (extend to exotic
+	  quotest too?)
+
+2013-03-06 13:50  sloot
+
+	* [r15774] src/ucto.cxx: be sure to take "current" as the default
+	  class for text searching
+
+2013-03-06 13:49  sloot
+
+	* [r15773] tests/testfoliain.ok: oesp.
+
+2013-03-06 12:23  mvgompel
+
+	* [r15769] config/tokconfig-de, config/tokconfig-en,
+	  config/tokconfig-es, config/tokconfig-fr, config/tokconfig-it:
+	  fixed ABBREVIATION rule for other languages
+
+2013-03-06 10:40  sloot
+
+	* [r15766] docs/ucto.1: Updated.
+
+2013-03-06 10:28  sloot
+
+	* [r15765] include/ucto/tokenize.h, src/tokenize.cxx, src/ucto.cxx:
+	  added a --textclass option
+
+2013-03-06 10:28  sloot
+
+	* [r15764] tests/folia5.xml, tests/testfoliain,
+	  tests/testfoliain.ok, tests/testoption1.ok, tests/testusage.ok:
+	  added test for the new --textclass option
+
+2013-02-21 14:55  sloot
+
+	* [r15693] src/tokenize.cxx: added the passthru fix for FoLia docs
+	  also elsewhere
+
+2013-02-18 13:42  sloot
+
+	* [r15683] src/tokenize.cxx: passtru mode was handled incorrectly
+	  for FoLiA input
+
+2013-02-18 11:12  sloot
+
+	* [r15682] src/tokenize.cxx: make sure that we tokenize existing
+	  FoLiA in our own set!
+
+2013-02-14 12:28  mvgompel
+
+	* [r15675] config/Makefile.am: added to makefile
+
+2013-02-14 12:26  mvgompel
+
+	* [r15674] config/es.abr, config/tokconfig-de, config/tokconfig-en,
+	  config/tokconfig-es, config/tokconfig-fr: updated foreign
+	  tokenisation configurations
+
+2013-02-07 13:11  sloot
+
+	* [r15648] src/tokenize.cxx: some debugging lines silenced
+
+2013-02-07 11:19  sloot
+
+	* [r15646] tests/bracket.nl.tok.V, tests/bracket.nl.txt: added a
+	  test for the EOS in brackets problem.
+
+2013-02-07 11:11  sloot
+
+	* [r15645] src/tokenize.cxx, tests/test.py: added some trickery to
+	  transfer EOS markers to next token if it is
+	  a closing bracket. EXPERIMENTAL
+
+2013-01-10 11:03  sloot
+
+	* [r15587] tests/testone: improved test
+
+2013-01-10 10:52  sloot
+
+	* [r15586] tests/utf16bom.nl, tests/utf8bom.nl: we need this files
+	  too
+
+2013-01-10 10:48  sloot
+
+	* [r15585] include/ucto/tokenize.h, src/tokenize.cxx: now we handle
+	  (and use) BOM markers.
+
+2013-01-10 10:48  sloot
+
+	* [r15584] tests/testencoding2, tests/testencoding2.ok: added a
+	  test for the BOM
+
+2013-01-07 14:54  sloot
+
+	* [r15571] include/ucto/tokenize.h, include/ucto/unicode.h,
+	  src/tokenize.cxx, src/ucto.cxx, src/unicode.cxx: Bump year
+
+2012-12-06 16:10  mvgompel
+
+	* [r15532] docs/ucto.1: wrong man
+
+2012-12-06 14:43  sloot
+
+	* [r15529] config/nl_afk.abr: removed 'al' beacuse it isn't an
+	  abrivation, ans it IS a word!
+
+2012-12-05 22:23  sloot
+
+	* [r15528] src/tokenize.cxx, tests/testfolia.ok,
+	  tests/testfolia2.ok, tests/testfoliain.ok, tests/testslash.ok:
+	  use datetime='now()' in declarations. And test that too
+
+2012-12-05 16:16  sloot
+
+	* [r15522] src/tokenize.cxx: add datetime attribute to
+	  token-annotation declaration
+
+2012-12-05 14:08  sloot
+
+	* [r15520] src/tokenize.cxx: only add a set default definition when
+	  there is something to do
+
+2012-12-05 14:07  sloot
+
+	* [r15519] config/tokconfig-nl-twitter: removed odd typo
+
+2012-12-05 14:04  sloot
+
+	* [r15518] config/Makefile.am, config/nl_afk, config/nl_afk.abr:
+	  fixed
+
+2012-11-30 19:13  mvgompel
+
+	* [r15490] config/Makefile.am: added to config data
+
+2012-11-30 19:12  mvgompel
+
+	* [r15489] config/nl_afk, config/tokconfig-nl-twitter: adding
+	  forgotten file
+
+2012-11-28 10:52  sloot
+
+	* [r15480] config/tokconfig-nl-sonarchat,
+	  config/tokconfig-nl-twitter: use the nl_afk abbreviations file
+
+2012-11-28 10:29  sloot
+
+	* [r15479] config/tokconfig-nl, include/ucto/tokenize.h,
+	  src/tokenize.cxx: implemented %include for ABBREVIATIONS
+
+2012-11-28 10:06  mvgompel
+
+	* [r15478] docs/ucto_manual.pdf, docs/ucto_manual.tex: docupdate
+
+2012-11-26 22:38  mvgompel
+
+	* [r15472] docs/ucto_manual.pdf, docs/ucto_manual.tex: updated
+	  after comments
+
+2012-11-26 13:17  mvgompel
+
+	* [r15468] docs/ucto_manual.bib, docs/ucto_manual.pdf,
+	  docs/ucto_manual.tex: Added documentation (finally)
+
+2012-11-01 14:51  sloot
+
+	* [r15397] tests/reverse-smiley.nl.tok.V,
+	  tests/reverse-smiley.nl.txt, tests/smileys.nl.tok.V,
+	  tests/smileys.nl.txt: added a test for reversed smileys.
+	  addapted the smiley testbtoo.
+
+2012-11-01 14:43  sloot
+
+	* [r15396] config/smiley.rule: attempt to support 'repetitive'
+	  smileys like this :)))))
+
+2012-11-01 14:13  sloot
+
+	* [r15395] config/tokconfig-nl-sonarchat,
+	  config/tokconfig-nl-twitter: added REVERSE-SMILEY rule
+
+2012-11-01 14:09  sloot
+
+	* [r15394] config/smiley.rule, config/tokconfig-nl: attempt to
+	  define a rule for reverse smileys {-8.
+
+2012-11-01 09:33  sloot
+
+	* [r15387] config/tokconfig-nl-twitter: get in line with standard
+	  tokconfig-nl
+
+2012-10-31 17:04  sloot
+
+	* [r15386] tests/testall, tests/testtwitter, tests/testtwitter.ok,
+	  tests/twitter.txt: added a test
+
+2012-10-30 12:36  mvgompel
+
+	* [r15361] bootstrap: bootstrap fix (matched failed on automake
+	  1.11.6 with Ubuntu 12.10)
+
+2012-10-30 09:56  sloot
+
+	* [r15355] src/tokenize.cxx: adapted to changed FoLiA stylesheet
+	  handling
+
+2012-10-11 09:42  sloot
+
+	* [r15278] tests/testfolia.ok, tests/testfolia2.ok,
+	  tests/testfoliain.ok, tests/testslash.ok: adapted to improved
+	  libfolia output (explicit UTF-8 encoding)
+
+2012-10-10 12:45  sloot
+
+	* [r15275] include/ucto/tokenize.h, include/ucto/unicode.h:
+	  safeguarded the safeguards
+
+2012-09-04 14:23  sloot
+
+	* [r15158] configure.ac: unneeded check
+
+2012-08-08 09:04  sloot
+
+	* [r15062] COPYING, configure.ac, include/ucto/tokenize.h,
+	  src/tokenize.cxx: use ticcutils in TiCC namespace
+
+2012-08-06 13:27  sloot
+
+	* [r15027] tests/test.es.txt, tests/test.es.txt.disabled,
+	  tests/test.fr.txt, tests/test.fr.txt.disabled, tests/test.pt.txt,
+	  tests/test.pt.txt.disabled: teporary disabled until Proycon does
+	  his job
+
+2012-08-06 12:54  sloot
+
+	* [r15026] config/tokconfig-en: improved english tokenization rules
+
+2012-08-06 12:53  sloot
+
+	* [r15025] tests/test.en.tok.V: accaptable result
+
+2012-08-06 10:20  sloot
+
+	* [r15024] config/tokconfig-de: V.S. en d.h. gaan nu goed.
+	  a.d.Donau misschien niet
+
+2012-08-06 09:45  sloot
+
+	* [r15023] tests/test.de.tok.V: ik denk dat dit dichter tegen de
+	  waarheid zit.
+	  Nu nog zorgen dat het er uit komt :{
+
+2012-08-01 16:21  sloot
+
+	* [r15022] tests/test.de.tok.V: new
+
+2012-08-01 16:10  sloot
+
+	* [r15021] config/tokconfig-de, src/tokenize.cxx: attempt to fix
+	  .de
+
+2012-08-01 13:21  sloot
+
+	* [r15020] include/ucto/tokenize.h: keep it private
+
+2012-08-01 12:54  sloot
+
+	* [r15019] include/ucto/tokenize.h, src/tokenize.cxx: save
+	  intermediate refactored state
+
+2012-07-31 10:44  sloot
+
+	* [r15016] src/tokenize.cxx: some cleanup
+
+2012-07-31 10:02  sloot
+
+	* [r15015] include/ucto/tokenize.h, src/tokenize.cxx: fixed eos
+	  problems. '-s' option seems to work now.
+
+2012-07-31 10:01  sloot
+
+	* [r15014] tests/testall: test the new test
+
+2012-07-31 10:00  sloot
+
+	* [r15013] tests/eos.txt, tests/testoption-s,
+	  tests/testoption-s.ok, tests/utt.txt: added a new test
+
+2012-07-30 15:59  sloot
+
+	* [r15012] include/ucto/tokenize.h, src/tokenize.cxx: better info.
+
+2012-07-30 15:42  sloot
+
+	* [r15011] src/tokenize.cxx: typo, and some refactoring.
+
+2012-07-30 13:44  sloot
+
+	* [r15010] include/ucto/tokenize.h, src/tokenize.cxx: small cleanup
+
+2012-07-30 13:28  sloot
+
+	* [r15009] include/ucto/tokenize.h, src/tokenize.cxx: further
+	  refactoring
+
+2012-07-23 14:57  sloot
+
+	* [r15005] include/ucto/tokenize.h, src/tokenize.cxx:
+	  re-re-re-factored :P
+
+2012-07-23 14:13  sloot
+
+	* [r15004] src/tokenize.cxx: one small step…
+
+2012-07-23 13:09  sloot
+
+	* [r15003] include/ucto/tokenize.h, src/tokenize.cxx: more cleanup.
+
+2012-07-23 09:55  sloot
+
+	* [r15001] include/ucto/tokenize.h, src/tokenize.cxx: some more
+	  refactoring, and added some comment
+
+2012-07-18 15:20  sloot
+
+	* [r15000] src/tokenize.cxx: ok, more next week
+
+2012-07-18 14:48  sloot
+
+	* [r14999] include/ucto/tokenize.h, src/tokenize.cxx: next refactor
+	  step
+
+2012-07-18 13:31  sloot
+
+	* [r14998] include/ucto/tokenize.h, src/tokenize.cxx: hmm.
+	  progress…
+
+2012-07-18 12:59  sloot
+
+	* [r14997] src/tokenize.cxx: save intermediate result during
+	  cleanup
+
+2012-07-18 12:33  sloot
+
+	* [r14996] include/ucto/tokenize.h, src/tokenize.cxx: refactoring:
+	  removed parameters without any visable changes in behaviour.
+
+2012-07-18 08:44  fkarsdorp
+
+	* [r14995] tests/quotetest_folgert8.nl.tok.V: Correct the test
+	  case.
+
+2012-07-18 08:25  sloot
+
+	* [r14994] AUTHORS: Honour Folgert
+
+2012-07-17 15:36  sloot
+
+	* [r14993] include/ucto/tokenize.h, src/tokenize.cxx: indentation
+	  and such
+
+2012-07-17 15:13  sloot
+
+	* [r14992] include/ucto/tokenize.h, src/tokenize.cxx: some renaming
+	  for my own mental health
+
+2012-07-17 14:40  sloot
+
+	* [r14991] src/tokenize.cxx: remove unused functionm
+
+2012-07-17 14:26  sloot
+
+	* [r14990] include/ucto/tokenize.h, src/tokenize.cxx: started some
+	  refactoring
+
+2012-07-17 12:25  sloot
+
+	* [r14989] tests/folia2.txt, tests/testall, tests/testfolia2,
+	  tests/testfolia2.ok: added a complex folia test. asume the result
+	  is OK for now.
+
+2012-07-16 15:26  sloot
+
+	* [r14988] tests/bug0083.nl.tok.V: I think this more in line with
+	  wat we want.
+	  still doubtfull quotation
+
+2012-07-16 15:22  sloot
+
+	* [r14987] src/tokenize.cxx: attempt to fix bug94 (fixes
+	  quotetest_folgert7 too??)
+
+2012-07-16 15:00  sloot
+
+	* [r14986] tests/quotetest_folgert7.nl.tok.V: I think the correct
+	  file wasn't correct at all. Folgert??
+
+2012-07-10 13:44  sloot
+
+	* [r14972] src/tokenize.cxx: fixed debugging lines
+
+2012-07-05 13:50  sloot
+
+	* [r14962] NEWS: updated NEWS
+
+2012-07-05 13:47  sloot
+
+	* [r14961] src/tokenize.cxx: fixed folia id's
+
+2012-07-03 12:54  mvgompel
+
+	* [r14947] config/tokconfig-fy: partial revert
+
+2012-07-03 12:53  mvgompel
+
+	* [r14946] config/tokconfig-fy: fix
+
+2012-07-03 12:48  mvgompel
+
+	* [r14945] config/tokconfig-fy: update
+
+2012-06-13 16:05  sloot
+
+	* [r14861] tests/bug0094.nl.tok.V, tests/bug0094.nl.txt,
+	  tests/quotetest_folgert.nl.tok.V, tests/quotetest_folgert.nl.txt,
+	  tests/quotetest_folgert2.nl.tok.V,
+	  tests/quotetest_folgert2.nl.txt,
+	  tests/quotetest_folgert3.nl.tok.V,
+	  tests/quotetest_folgert3.nl.txt,
+	  tests/quotetest_folgert4.nl.tok.V,
+	  tests/quotetest_folgert4.nl.txt,
+	  tests/quotetest_folgert5.nl.tok.V,
+	  tests/quotetest_folgert5.nl.txt,
+	  tests/quotetest_folgert6.nl.tok.V,
+	  tests/quotetest_folgert6.nl.txt,
+	  tests/quotetest_folgert7.nl.tok.V,
+	  tests/quotetest_folgert7.nl.txt,
+	  tests/quotetest_folgert8.nl.tok.V,
+	  tests/quotetest_folgert8.nl.txt, tests/testquotes,
+	  tests/testquotes.ok: properties
+
+2012-06-13 16:03  sloot
+
+	* [r14860] tests/bug0094.nl.tok.V, tests/bug0094.nl.txt: new bug
+	  found
+
+2012-06-11 13:01  sloot
+
+	* [r14847] tests/bug0051.nl.tok.V, tests/bug0052.nl.tok.V,
+	  tests/bug0065b.nl.tok.V, tests/quotetest_folgert.nl.tok.V,
+	  tests/quotetest_folgert2.nl.tok.V,
+	  tests/quotetest_folgert3.nl.tok.V,
+	  tests/quotetest_folgert4.nl.tok.V,
+	  tests/quotetest_folgert5.nl.tok.V,
+	  tests/quotetest_folgert6.nl.tok.V,
+	  tests/quotetest_folgert7.nl.tok.V,
+	  tests/quotetest_folgert8.nl.tok.V,
+	  tests/quotetest_multiline.nl.tok.V,
+	  tests/quotetest_multisentence.nl.tok.V,
+	  tests/quotetest_nested2.nl.tok.V,
+	  tests/quotetest_onesentence.nl.tok.V, tests/test.nl.tok.V,
+	  tests/testquotes.ok: addapted to changed verbose output
+
+2012-06-11 12:50  sloot
+
+	* [r14846] tests/bug0083.nl.tok.V: ok, we are more happy now, but
+	  there is still a BEGINOFSENTENCE missing I think
+
+2012-06-11 12:42  sloot
+
+	* [r14845] src/tokenize.cxx: fixed indentation of the source
+	  also always put an extra newline after ENDOFSENNTENCE in verbose
+	  mode
+
+2012-06-11 11:12  sloot
+
+	* [r14844] tests/nu.nl.tok.V: dit is al meer wat we willen
+
+2012-06-11 11:04  sloot
+
+	* [r14843] tests/test.py: reversed parameters to diff.
+	  now we do 'diff <what we got> <what we want>"
+	  This is more in line with other tests
+
+2012-06-04 11:25  fkarsdorp
+
+	* [r14833] COPYING, config/tokconfig-nl, src/tokenize.cxx,
+	  tests/quotetest_folgert5.nl.tok.V,
+	  tests/quotetest_folgert5.nl.txt,
+	  tests/quotetest_folgert6.nl.tok.V,
+	  tests/quotetest_folgert6.nl.txt,
+	  tests/quotetest_folgert7.nl.tok.V,
+	  tests/quotetest_folgert7.nl.txt,
+	  tests/quotetest_folgert8.nl.tok.V,
+	  tests/quotetest_folgert8.nl.txt: Fix nested quoted sentences;
+	  added testcases; small additions to tokconfig-nl
+
+2012-05-31 13:56  fkarsdorp
+
+	* [r14826] include/ucto/tokenize.h: update header, u_isquote
+
+2012-05-31 13:56  fkarsdorp
+
+	* [r14825] src/tokenize.cxx: getting used to svn...
+
+2012-05-31 12:55  fkarsdorp
+
+	* [r14824] tests/quotetest_folgert4.nl.tok.V,
+	  tests/quotetest_folgert4.nl.txt: small fix in quote detection,
+	  testcase added.
+
+2012-05-30 15:11  sloot
+
+	* [r14823] configure.ac, src/Makefile.am: exterminated
+	  AC_DEFINE_DIR here too
+
+2012-05-30 13:50  mvgompel
+
+	* [r14820] NEWS: updated
+
+2012-05-30 13:48  mvgompel
+
+	* [r14819] src/tokenize.cxx, tests/bug0065b.nl.tok.V,
+	  tests/quotetest_folgert.nl.tok.V, tests/quotetest_folgert.nl.txt,
+	  tests/quotetest_folgert2.nl.tok.V,
+	  tests/quotetest_folgert2.nl.txt,
+	  tests/quotetest_folgert3.nl.tok.V,
+	  tests/quotetest_folgert3.nl.txt, tests/test.py: Folgert fixed
+	  quote detection, extra tests added
+
+2012-05-22 13:34  sloot
+
+	* [r14777] src/tokenize.cxx: handle ' just like " in
+	  quote-detection
+
+2012-05-22 13:34  sloot
+
+	* [r14776] config/standard-quotes.quote: updated comment
+
+2012-05-22 13:33  sloot
+
+	* [r14775] tests/testquotes.ok: let's call this OK, for now.
+
+2012-05-22 13:32  sloot
+
+	* [r14774] tests/qtest.nl, tests/qtest.spec, tests/testall,
+	  tests/testquotes: added tests for quotes
+
+2012-05-07 14:52  sloot
+
+	* [r14721] config/tokconfig-nl: attempt to fix 'time' problem
+
+2012-05-07 14:47  sloot
+
+	* [r14720] config/smiley.rule, tests/smileys.nl.tok.V: fixed smiley
+
+2012-05-07 14:40  sloot
+
+	* [r14719] tests/datetime.nl.tok.V, tests/datetime.nl.txt,
+	  tests/smileys.nl.tok.V, tests/smileys.nl.txt, tests/testall,
+	  tests/testconf2.ok: added tests (still fail!)
+
+2012-04-26 15:57  sloot
+
+	* [r14703] tests/slashes.xml, tests/testfolia.ok,
+	  tests/testfoliain.ok, tests/testslash.ok: adapted to new folia
+	  version
+
+2012-03-29 09:04  sloot
+
+	* [r14587] configure.ac: bump version after release
+
+2012-03-29 08:48  sloot
+
+	* [r14585] NEWS, configure.ac: get ready for release
+
 2012-03-19 10:54  sloot
 
 	* [r14472] src/ucto.cxx: numb change
diff --git a/INSTALL b/INSTALL
index 7d1c323..a1e89e1 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,8 +1,8 @@
 Installation Instructions
 *************************
 
-Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005,
-2006, 2007, 2008, 2009 Free Software Foundation, Inc.
+Copyright (C) 1994-1996, 1999-2002, 2004-2011 Free Software Foundation,
+Inc.
 
    Copying and distribution of this file, with or without modification,
 are permitted in any medium without royalty provided the copyright
@@ -226,6 +226,11 @@ order to use an ANSI C compiler:
 
 and if that doesn't work, install pre-built binaries of GCC for HP-UX.
 
+   HP-UX `make' updates targets which have the same time stamps as
+their prerequisites, which makes it generally unusable when shipped
+generated files such as `configure' are involved.  Use GNU `make'
+instead.
+
    On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot
 parse its `<wchar.h>' header file.  The option `-nodtk' can be used as
 a workaround.  If GNU CC is not installed, it is therefore recommended
diff --git a/Makefile.in b/Makefile.in
index 9edc563..a89271f 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -1,9 +1,9 @@
-# Makefile.in generated by automake 1.11.1 from Makefile.am.
+# Makefile.in generated by automake 1.11.3 from Makefile.am.
 # @configure_input@
 
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
-# 2003, 2004, 2005, 2006, 2007, 2008, 2009  Free Software Foundation,
-# Inc.
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
@@ -45,11 +45,11 @@ DIST_COMMON = README $(am__configure_deps) $(srcdir)/Makefile.am \
 	TODO config.guess config.sub depcomp install-sh ltmain.sh \
 	missing
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ac_define_dir.m4 \
-	$(top_srcdir)/m4/ax_icu_check.m4 $(top_srcdir)/m4/libtool.m4 \
-	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
-	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-	$(top_srcdir)/m4/pkg.m4 $(top_srcdir)/configure.ac
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \
+	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
+	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
+	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/m4/pkg.m4 \
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
@@ -88,6 +88,12 @@ am__nobase_list = $(am__nobase_strip_setup); \
 am__base_list = \
   sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
   sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
 am__installdirs = "$(DESTDIR)$(pkgconfigdir)"
 DATA = $(pkgconfig_DATA)
 RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
@@ -102,9 +108,11 @@ DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 distdir = $(PACKAGE)-$(VERSION)
 top_distdir = $(distdir)
 am__remove_distdir = \
-  { test ! -d "$(distdir)" \
-    || { find "$(distdir)" -type d ! -perm -200 -exec chmod u+w {} ';' \
-         && rm -fr "$(distdir)"; }; }
+  if test -d "$(distdir)"; then \
+    find "$(distdir)" -type d ! -perm -200 -exec chmod u+w {} ';' \
+      && rm -rf "$(distdir)" \
+      || { sleep 5 && rm -rf "$(distdir)"; }; \
+  else :; fi
 am__relativize = \
   dir0=`pwd`; \
   sed_first='s,^\([^/]*\)/.*$$,\1,'; \
@@ -133,6 +141,8 @@ am__relativize = \
 DIST_ARCHIVES = $(distdir).tar.gz
 GZIP_ENV = --best
 distuninstallcheck_listfiles = find . -type f -print
+am__distuninstallcheck_listfiles = $(distuninstallcheck_listfiles) \
+  | sed 's|^\./|$(prefix)/|' | grep -v '$(infodir)/dir$$'
 distcleancheck_listfiles = find . -type f -print
 ACLOCAL = @ACLOCAL@
 AMTAR = @AMTAR@
@@ -210,7 +220,7 @@ SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
 STRIP = @STRIP@
-SYSCONF_PATH = @SYSCONF_PATH@
+SYSCONFDIR = @SYSCONFDIR@
 VERSION = @VERSION@
 XML2_CFLAGS = @XML2_CFLAGS@
 XML2_LIBS = @XML2_LIBS@
@@ -266,6 +276,8 @@ sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
 sysconfdir = @sysconfdir@
 target_alias = @target_alias@
+ticcutils_CFLAGS = @ticcutils_CFLAGS@
+ticcutils_LIBS = @ticcutils_LIBS@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
@@ -278,7 +290,7 @@ all: config.h
 	$(MAKE) $(AM_MAKEFLAGS) all-recursive
 
 .SUFFIXES:
-am--refresh:
+am--refresh: Makefile
 	@:
 $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	@for dep in $?; do \
@@ -314,10 +326,8 @@ $(ACLOCAL_M4):  $(am__aclocal_m4_deps)
 $(am__aclocal_m4_deps):
 
 config.h: stamp-h1
-	@if test ! -f $@; then \
-	  rm -f stamp-h1; \
-	  $(MAKE) $(AM_MAKEFLAGS) stamp-h1; \
-	else :; fi
+	@if test ! -f $@; then rm -f stamp-h1; else :; fi
+	@if test ! -f $@; then $(MAKE) $(AM_MAKEFLAGS) stamp-h1; else :; fi
 
 stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status
 	@rm -f stamp-h1
@@ -359,9 +369,7 @@ uninstall-pkgconfigDATA:
 	@$(NORMAL_UNINSTALL)
 	@list='$(pkgconfig_DATA)'; test -n "$(pkgconfigdir)" || list=; \
 	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
-	test -n "$$files" || exit 0; \
-	echo " ( cd '$(DESTDIR)$(pkgconfigdir)' && rm -f" $$files ")"; \
-	cd "$(DESTDIR)$(pkgconfigdir)" && rm -f $$files
+	dir='$(DESTDIR)$(pkgconfigdir)'; $(am__uninstall_files_from_dir)
 
 # This directory's subdirectories are mostly independent; you can cd
 # into them and run `make' without going through this Makefile.
@@ -570,7 +578,11 @@ dist-gzip: distdir
 	$(am__remove_distdir)
 
 dist-bzip2: distdir
-	tardir=$(distdir) && $(am__tar) | bzip2 -9 -c >$(distdir).tar.bz2
+	tardir=$(distdir) && $(am__tar) | BZIP2=$${BZIP2--9} bzip2 -c >$(distdir).tar.bz2
+	$(am__remove_distdir)
+
+dist-lzip: distdir
+	tardir=$(distdir) && $(am__tar) | lzip -c $${LZIP_OPT--9} >$(distdir).tar.lz
 	$(am__remove_distdir)
 
 dist-lzma: distdir
@@ -578,7 +590,7 @@ dist-lzma: distdir
 	$(am__remove_distdir)
 
 dist-xz: distdir
-	tardir=$(distdir) && $(am__tar) | xz -c >$(distdir).tar.xz
+	tardir=$(distdir) && $(am__tar) | XZ_OPT=$${XZ_OPT--e} xz -c >$(distdir).tar.xz
 	$(am__remove_distdir)
 
 dist-tarZ: distdir
@@ -609,6 +621,8 @@ distcheck: dist
 	  bzip2 -dc $(distdir).tar.bz2 | $(am__untar) ;;\
 	*.tar.lzma*) \
 	  lzma -dc $(distdir).tar.lzma | $(am__untar) ;;\
+	*.tar.lz*) \
+	  lzip -dc $(distdir).tar.lz | $(am__untar) ;;\
 	*.tar.xz*) \
 	  xz -dc $(distdir).tar.xz | $(am__untar) ;;\
 	*.tar.Z*) \
@@ -628,6 +642,7 @@ distcheck: dist
 	  && am__cwd=`pwd` \
 	  && $(am__cd) $(distdir)/_build \
 	  && ../configure --srcdir=.. --prefix="$$dc_install_base" \
+	    $(AM_DISTCHECK_CONFIGURE_FLAGS) \
 	    $(DISTCHECK_CONFIGURE_FLAGS) \
 	  && $(MAKE) $(AM_MAKEFLAGS) \
 	  && $(MAKE) $(AM_MAKEFLAGS) dvi \
@@ -656,8 +671,16 @@ distcheck: dist
 	  list='$(DIST_ARCHIVES)'; for i in $$list; do echo $$i; done) | \
 	  sed -e 1h -e 1s/./=/g -e 1p -e 1x -e '$$p' -e '$$x'
 distuninstallcheck:
-	@$(am__cd) '$(distuninstallcheck_dir)' \
-	&& test `$(distuninstallcheck_listfiles) | wc -l` -le 1 \
+	@test -n '$(distuninstallcheck_dir)' || { \
+	  echo 'ERROR: trying to run $@ with an empty' \
+	       '$$(distuninstallcheck_dir)' >&2; \
+	  exit 1; \
+	}; \
+	$(am__cd) '$(distuninstallcheck_dir)' || { \
+	  echo 'ERROR: cannot chdir into $(distuninstallcheck_dir)' >&2; \
+	  exit 1; \
+	}; \
+	test `$(am__distuninstallcheck_listfiles) | wc -l` -eq 0 \
 	   || { echo "ERROR: files left after uninstall:" ; \
 	        if test -n "$(DESTDIR)"; then \
 	          echo "  (check DESTDIR support)"; \
@@ -691,10 +714,15 @@ install-am: all-am
 
 installcheck: installcheck-recursive
 install-strip:
-	$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	  install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	  `test -z '$(STRIP)' || \
-	    echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
 mostlyclean-generic:
 
 clean-generic:
@@ -782,8 +810,8 @@ uninstall-am: uninstall-pkgconfigDATA
 .PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \
 	all all-am am--refresh check check-am clean clean-generic \
 	clean-libtool ctags ctags-recursive dist dist-all dist-bzip2 \
-	dist-gzip dist-lzma dist-shar dist-tarZ dist-xz dist-zip \
-	distcheck distclean distclean-generic distclean-hdr \
+	dist-gzip dist-lzip dist-lzma dist-shar dist-tarZ dist-xz \
+	dist-zip distcheck distclean distclean-generic distclean-hdr \
 	distclean-libtool distclean-tags distcleancheck distdir \
 	distuninstallcheck dvi dvi-am html html-am info info-am \
 	install install-am install-data install-data-am install-dvi \
diff --git a/NEWS b/NEWS
index 6e7a6b6..aa50a2c 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,14 @@
+0.5.3 2013-04-04
+[Folgert Karsdorp]
+* Fixed quote detection, added tests. still shaky and default disabled 
+[Ko van der Sloot]
+* changed verbose output slightly
+* fixed id's in folia output
+* various folia fixes
+* honour BOM markers in input file
+* lots of configuration updates
+* some fixes in handling if RULES
+
 0.5.2 2012-03-29
 [Ko vd Sloot]
 * some small changes. Made it work with libfolia 0.9
diff --git a/aclocal.m4 b/aclocal.m4
index d1d0f69..cef13c0 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -1,7 +1,8 @@
-# generated automatically by aclocal 1.11.1 -*- Autoconf -*-
+# generated automatically by aclocal 1.11.3 -*- Autoconf -*-
 
 # Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
-# 2005, 2006, 2007, 2008, 2009  Free Software Foundation, Inc.
+# 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation,
+# Inc.
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
@@ -19,12 +20,15 @@ You have another version of autoconf.  It may work, but is not guaranteed to.
 If you have problems, you may need to regenerate the build system entirely.
 To do so, use the procedure documented by the package, typically `autoreconf'.])])
 
-# Copyright (C) 2002, 2003, 2005, 2006, 2007, 2008  Free Software Foundation, Inc.
+# Copyright (C) 2002, 2003, 2005, 2006, 2007, 2008, 2011 Free Software
+# Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
+# serial 1
+
 # AM_AUTOMAKE_VERSION(VERSION)
 # ----------------------------
 # Automake X.Y traces this macro to ensure aclocal.m4 has been
@@ -34,7 +38,7 @@ AC_DEFUN([AM_AUTOMAKE_VERSION],
 [am__api_version='1.11'
 dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
 dnl require some minimum version.  Point them to the right macro.
-m4_if([$1], [1.11.1], [],
+m4_if([$1], [1.11.3], [],
       [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
 ])
 
@@ -50,19 +54,21 @@ m4_define([_AM_AUTOCONF_VERSION], [])
 # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
 # This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
 AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
-[AM_AUTOMAKE_VERSION([1.11.1])dnl
+[AM_AUTOMAKE_VERSION([1.11.3])dnl
 m4_ifndef([AC_AUTOCONF_VERSION],
   [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
 
 # AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-
 
-# Copyright (C) 2001, 2003, 2005  Free Software Foundation, Inc.
+# Copyright (C) 2001, 2003, 2005, 2011 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
+# serial 1
+
 # For projects using AC_CONFIG_AUX_DIR([foo]), Autoconf sets
 # $ac_aux_dir to `$srcdir/foo'.  In other projects, it is set to
 # `$srcdir', `$srcdir/..', or `$srcdir/../..'.
@@ -144,14 +150,14 @@ AC_CONFIG_COMMANDS_PRE(
 Usually this means the macro was only invoked conditionally.]])
 fi])])
 
-# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2009
-# Free Software Foundation, Inc.
+# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2009,
+# 2010, 2011 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 10
+# serial 12
 
 # There are a few dirty hacks below to avoid letting `AC_PROG_CC' be
 # written in clear, in which case automake, when reading aclocal.m4,
@@ -191,6 +197,7 @@ AC_CACHE_CHECK([dependency style of $depcc],
   # instance it was reported that on HP-UX the gcc test will end up
   # making a dummy file named `D' -- because `-MD' means `put the output
   # in D'.
+  rm -rf conftest.dir
   mkdir conftest.dir
   # Copy depcomp to subdir because otherwise we won't find it if we're
   # using a relative directory.
@@ -255,7 +262,7 @@ AC_CACHE_CHECK([dependency style of $depcc],
 	break
       fi
       ;;
-    msvisualcpp | msvcmsys)
+    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
       # This compiler won't grok `-c -o', but also, the minuso test has
       # not run yet.  These depmodes are late enough in the game, and
       # so weak that their functioning should not be impacted.
@@ -320,10 +327,13 @@ AC_DEFUN([AM_DEP_TRACK],
 if test "x$enable_dependency_tracking" != xno; then
   am_depcomp="$ac_aux_dir/depcomp"
   AMDEPBACKSLASH='\'
+  am__nodep='_no'
 fi
 AM_CONDITIONAL([AMDEP], [test "x$enable_dependency_tracking" != xno])
 AC_SUBST([AMDEPBACKSLASH])dnl
 _AM_SUBST_NOTMAKE([AMDEPBACKSLASH])dnl
+AC_SUBST([am__nodep])dnl
+_AM_SUBST_NOTMAKE([am__nodep])dnl
 ])
 
 # Generate code to set up dependency tracking.              -*- Autoconf -*-
@@ -545,12 +555,15 @@ for _am_header in $config_headers :; do
 done
 echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
 
-# Copyright (C) 2001, 2003, 2005, 2008  Free Software Foundation, Inc.
+# Copyright (C) 2001, 2003, 2005, 2008, 2011 Free Software Foundation,
+# Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
+# serial 1
+
 # AM_PROG_INSTALL_SH
 # ------------------
 # Define $install_sh.
@@ -682,12 +695,15 @@ else
 fi
 ])
 
-# Copyright (C) 2003, 2004, 2005, 2006  Free Software Foundation, Inc.
+# Copyright (C) 2003, 2004, 2005, 2006, 2011 Free Software Foundation,
+# Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
+# serial 1
+
 # AM_PROG_MKDIR_P
 # ---------------
 # Check for `mkdir -p'.
@@ -710,13 +726,14 @@ esac
 
 # Helper functions for option handling.                     -*- Autoconf -*-
 
-# Copyright (C) 2001, 2002, 2003, 2005, 2008  Free Software Foundation, Inc.
+# Copyright (C) 2001, 2002, 2003, 2005, 2008, 2010 Free Software
+# Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 4
+# serial 5
 
 # _AM_MANGLE_OPTION(NAME)
 # -----------------------
@@ -724,13 +741,13 @@ AC_DEFUN([_AM_MANGLE_OPTION],
 [[_AM_OPTION_]m4_bpatsubst($1, [[^a-zA-Z0-9_]], [_])])
 
 # _AM_SET_OPTION(NAME)
-# ------------------------------
+# --------------------
 # Set option NAME.  Presently that only means defining a flag for this option.
 AC_DEFUN([_AM_SET_OPTION],
 [m4_define(_AM_MANGLE_OPTION([$1]), 1)])
 
 # _AM_SET_OPTIONS(OPTIONS)
-# ----------------------------------
+# ------------------------
 # OPTIONS is a space-separated list of Automake options.
 AC_DEFUN([_AM_SET_OPTIONS],
 [m4_foreach_w([_AM_Option], [$1], [_AM_SET_OPTION(_AM_Option)])])
@@ -806,12 +823,14 @@ Check your system clock])
 fi
 AC_MSG_RESULT(yes)])
 
-# Copyright (C) 2001, 2003, 2005  Free Software Foundation, Inc.
+# Copyright (C) 2001, 2003, 2005, 2011 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
+# serial 1
+
 # AM_PROG_INSTALL_STRIP
 # ---------------------
 # One issue with vendor `install' (even GNU) is that you can't
@@ -834,13 +853,13 @@ fi
 INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
 AC_SUBST([INSTALL_STRIP_PROGRAM])])
 
-# Copyright (C) 2006, 2008  Free Software Foundation, Inc.
+# Copyright (C) 2006, 2008, 2010 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-# serial 2
+# serial 3
 
 # _AM_SUBST_NOTMAKE(VARIABLE)
 # ---------------------------
@@ -849,13 +868,13 @@ AC_SUBST([INSTALL_STRIP_PROGRAM])])
 AC_DEFUN([_AM_SUBST_NOTMAKE])
 
 # AM_SUBST_NOTMAKE(VARIABLE)
-# ---------------------------
+# --------------------------
 # Public sister of _AM_SUBST_NOTMAKE.
 AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
 
 # Check how to create a tarball.                            -*- Autoconf -*-
 
-# Copyright (C) 2004, 2005  Free Software Foundation, Inc.
+# Copyright (C) 2004, 2005, 2012 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -877,10 +896,11 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
 # a tarball read from stdin.
 #     $(am__untar) < result.tar
 AC_DEFUN([_AM_PROG_TAR],
-[# Always define AMTAR for backward compatibility.
-AM_MISSING_PROG([AMTAR], [tar])
+[# Always define AMTAR for backward compatibility.  Yes, it's still used
+# in the wild :-(  We should find a proper way to deprecate it ...
+AC_SUBST([AMTAR], ['$${TAR-tar}'])
 m4_if([$1], [v7],
-     [am__tar='${AMTAR} chof - "$$tardir"'; am__untar='${AMTAR} xf -'],
+     [am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'],
      [m4_case([$1], [ustar],, [pax],,
               [m4_fatal([Unknown tar format])])
 AC_MSG_CHECKING([how to create a $1 tar archive])
@@ -949,7 +969,6 @@ AC_SUBST([am__tar])
 AC_SUBST([am__untar])
 ]) # _AM_PROG_TAR
 
-m4_include([m4/ac_define_dir.m4])
 m4_include([m4/ax_icu_check.m4])
 m4_include([m4/libtool.m4])
 m4_include([m4/ltoptions.m4])
diff --git a/bootstrap b/bootstrap
index 21c8158..c5bba62 100644
--- a/bootstrap
+++ b/bootstrap
@@ -1,4 +1,4 @@
-# $Id: bootstrap 8036 2011-01-27 11:56:11Z joostvb $
+# $Id: bootstrap 15361 2012-10-30 12:36:03Z mvgompel $
 # $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/bootstrap $
 
 # bootstrap - script to bootstrap the distribution rolling engine
@@ -64,7 +64,7 @@ EOT
    fi
 
 
-if $automake --version|head -1 |grep '1\.[4-8]'; then
+if $automake --version|head -1 |grep ' 1\.[4-8]'; then
     echo "automake 1.4-1.8 is active. You should use automake 1.9 or later"
     if test -f /etc/debian_version; then
         echo " sudo apt-get install automake1.9"
@@ -73,7 +73,7 @@ if $automake --version|head -1 |grep '1\.[4-8]'; then
     exit 1
 fi
 
-if $aclocal --version|head -1 |grep '1\.[4-8]'; then
+if $aclocal --version|head -1 |grep ' 1\.[4-8]'; then
     echo "aclocal 1.4-1.8 is active. You should use aclocal 1.9 or later"
     if test -f /etc/debian_version; then	
         echo " sudo apt-get install aclocal1.9"
diff --git a/config.guess b/config.guess
index 40eaed4..d622a44 100755
--- a/config.guess
+++ b/config.guess
@@ -2,9 +2,9 @@
 # Attempt to guess a canonical system name.
 #   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
 #   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
-#   2011 Free Software Foundation, Inc.
+#   2011, 2012 Free Software Foundation, Inc.
 
-timestamp='2011-05-11'
+timestamp='2012-02-10'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
@@ -17,9 +17,7 @@ timestamp='2011-05-11'
 # General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
-# 02110-1301, USA.
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
 #
 # As a special exception to the GNU General Public License, if you
 # distribute this file as part of a program that contains a
@@ -57,8 +55,8 @@ GNU config.guess ($timestamp)
 
 Originally written by Per Bothner.
 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
-2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free
-Software Foundation, Inc.
+2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
+Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -145,7 +143,7 @@ UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
 case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
     *:NetBSD:*:*)
 	# NetBSD (nbsd) targets should (where applicable) match one or
-	# more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*,
+	# more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*,
 	# *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
 	# switched to ELF, *-*-netbsd* would select the old
 	# object file format.  This provides both forward
@@ -792,13 +790,12 @@ EOF
 	echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
 	exit ;;
     *:FreeBSD:*:*)
-	case ${UNAME_MACHINE} in
-	    pc98)
-		echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+	UNAME_PROCESSOR=`/usr/bin/uname -p`
+	case ${UNAME_PROCESSOR} in
 	    amd64)
 		echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
 	    *)
-		echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+		echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
 	esac
 	exit ;;
     i*:CYGWIN*:*)
@@ -807,6 +804,9 @@ EOF
     *:MINGW*:*)
 	echo ${UNAME_MACHINE}-pc-mingw32
 	exit ;;
+    i*:MSYS*:*)
+	echo ${UNAME_MACHINE}-pc-msys
+	exit ;;
     i*:windows32*:*)
 	# uname -m includes "-pc" on this system.
 	echo ${UNAME_MACHINE}-mingw32
@@ -861,6 +861,13 @@ EOF
     i*86:Minix:*:*)
 	echo ${UNAME_MACHINE}-pc-minix
 	exit ;;
+    aarch64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    aarch64_be:Linux:*:*)
+	UNAME_MACHINE=aarch64_be
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
     alpha:Linux:*:*)
 	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
 	  EV5)   UNAME_MACHINE=alphaev5 ;;
@@ -895,13 +902,16 @@ EOF
 	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     cris:Linux:*:*)
-	echo cris-axis-linux-gnu
+	echo ${UNAME_MACHINE}-axis-linux-gnu
 	exit ;;
     crisv32:Linux:*:*)
-	echo crisv32-axis-linux-gnu
+	echo ${UNAME_MACHINE}-axis-linux-gnu
 	exit ;;
     frv:Linux:*:*)
-	echo frv-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    hexagon:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     i*86:Linux:*:*)
 	LIBC=gnu
@@ -943,7 +953,7 @@ EOF
 	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
 	;;
     or32:Linux:*:*)
-	echo or32-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     padre:Linux:*:*)
 	echo sparc-unknown-linux-gnu
@@ -978,13 +988,13 @@ EOF
 	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     tile*:Linux:*:*)
-	echo ${UNAME_MACHINE}-tilera-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     vax:Linux:*:*)
 	echo ${UNAME_MACHINE}-dec-linux-gnu
 	exit ;;
     x86_64:Linux:*:*)
-	echo x86_64-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     xtensa*:Linux:*:*)
 	echo ${UNAME_MACHINE}-unknown-linux-gnu
@@ -1315,6 +1325,9 @@ EOF
     i*86:AROS:*:*)
 	echo ${UNAME_MACHINE}-pc-aros
 	exit ;;
+    x86_64:VMkernel:*:*)
+	echo ${UNAME_MACHINE}-unknown-esx
+	exit ;;
 esac
 
 #echo '(No uname command or uname output not recognized.)' 1>&2
diff --git a/config.h.in b/config.h.in
index edd537a..9bffbfb 100644
--- a/config.h.in
+++ b/config.h.in
@@ -67,9 +67,6 @@
 /* Define to 1 if you have the ANSI C header files. */
 #undef STDC_HEADERS
 
-/* sysconfdir */
-#undef SYSCONF_PATH
-
 /* Version number of package */
 #undef VERSION
 
diff --git a/config.sub b/config.sub
index 30fdca8..c894da4 100755
--- a/config.sub
+++ b/config.sub
@@ -2,9 +2,9 @@
 # Configuration validation subroutine script.
 #   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
 #   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
-#   2011 Free Software Foundation, Inc.
+#   2011, 2012 Free Software Foundation, Inc.
 
-timestamp='2011-03-23'
+timestamp='2012-02-10'
 
 # This file is (in principle) common to ALL GNU software.
 # The presence of a machine in this file suggests that SOME GNU software
@@ -21,9 +21,7 @@ timestamp='2011-03-23'
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
-# 02110-1301, USA.
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
 #
 # As a special exception to the GNU General Public License, if you
 # distribute this file as part of a program that contains a
@@ -76,8 +74,8 @@ version="\
 GNU config.sub ($timestamp)
 
 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
-2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free
-Software Foundation, Inc.
+2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
+Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -132,6 +130,10 @@ case $maybe_os in
     os=-$maybe_os
     basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
     ;;
+  android-linux)
+    os=-linux-android
+    basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`-unknown
+    ;;
   *)
     basic_machine=`echo $1 | sed 's/-[^-]*$//'`
     if [ $basic_machine != $1 ]
@@ -247,17 +249,22 @@ case $basic_machine in
 	# Some are omitted here because they have special meanings below.
 	1750a | 580 \
 	| a29k \
+	| aarch64 | aarch64_be \
 	| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
 	| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
 	| am33_2.0 \
 	| arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \
+        | be32 | be64 \
 	| bfin \
 	| c4x | clipper \
 	| d10v | d30v | dlx | dsp16xx \
+	| epiphany \
 	| fido | fr30 | frv \
 	| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
+	| hexagon \
 	| i370 | i860 | i960 | ia64 \
 	| ip2k | iq2000 \
+	| le32 | le64 \
 	| lm32 \
 	| m32c | m32r | m32rle | m68000 | m68k | m88k \
 	| maxq | mb | microblaze | mcore | mep | metag \
@@ -291,7 +298,7 @@ case $basic_machine in
 	| pdp10 | pdp11 | pj | pjl \
 	| powerpc | powerpc64 | powerpc64le | powerpcle \
 	| pyramid \
-	| rx \
+	| rl78 | rx \
 	| score \
 	| sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
 	| sh64 | sh64le \
@@ -300,7 +307,7 @@ case $basic_machine in
 	| spu \
 	| tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \
 	| ubicom32 \
-	| v850 | v850e \
+	| v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \
 	| we32k \
 	| x86 | xc16x | xstormy16 | xtensa \
 	| z8k | z80)
@@ -315,8 +322,7 @@ case $basic_machine in
 	c6x)
 		basic_machine=tic6x-unknown
 		;;
-	m6811 | m68hc11 | m6812 | m68hc12 | picochip)
-		# Motorola 68HC11/12.
+	m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | picochip)
 		basic_machine=$basic_machine-unknown
 		os=-none
 		;;
@@ -329,7 +335,10 @@ case $basic_machine in
 	strongarm | thumb | xscale)
 		basic_machine=arm-unknown
 		;;
-
+	xgate)
+		basic_machine=$basic_machine-unknown
+		os=-none
+		;;
 	xscaleeb)
 		basic_machine=armeb-unknown
 		;;
@@ -352,11 +361,13 @@ case $basic_machine in
 	# Recognize the basic CPU types with company name.
 	580-* \
 	| a29k-* \
+	| aarch64-* | aarch64_be-* \
 	| alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
 	| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
 	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
 	| arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
 	| avr-* | avr32-* \
+	| be32-* | be64-* \
 	| bfin-* | bs2000-* \
 	| c[123]* | c30-* | [cjt]90-* | c4x-* \
 	| clipper-* | craynv-* | cydra-* \
@@ -365,8 +376,10 @@ case $basic_machine in
 	| f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
 	| h8300-* | h8500-* \
 	| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
+	| hexagon-* \
 	| i*86-* | i860-* | i960-* | ia64-* \
 	| ip2k-* | iq2000-* \
+	| le32-* | le64-* \
 	| lm32-* \
 	| m32c-* | m32r-* | m32rle-* \
 	| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
@@ -400,7 +413,7 @@ case $basic_machine in
 	| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
 	| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
 	| pyramid-* \
-	| romp-* | rs6000-* | rx-* \
+	| rl78-* | romp-* | rs6000-* | rx-* \
 	| sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
 	| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
 	| sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
@@ -408,10 +421,11 @@ case $basic_machine in
 	| sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \
 	| tahoe-* \
 	| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
-	| tile-* | tilegx-* \
+	| tile*-* \
 	| tron-* \
 	| ubicom32-* \
-	| v850-* | v850e-* | vax-* \
+	| v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \
+	| vax-* \
 	| we32k-* \
 	| x86-* | x86_64-* | xc16x-* | xps100-* \
 	| xstormy16-* | xtensa*-* \
@@ -711,7 +725,6 @@ case $basic_machine in
 	i370-ibm* | ibm*)
 		basic_machine=i370-ibm
 		;;
-# I'm not sure what "Sysv32" means.  Should this be sysv3.2?
 	i*86v32)
 		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
 		os=-sysv32
@@ -808,10 +821,18 @@ case $basic_machine in
 	ms1-*)
 		basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
 		;;
+	msys)
+		basic_machine=i386-pc
+		os=-msys
+		;;
 	mvs)
 		basic_machine=i370-ibm
 		os=-mvs
 		;;
+	nacl)
+		basic_machine=le32-unknown
+		os=-nacl
+		;;
 	ncr3000)
 		basic_machine=i486-ncr
 		os=-sysv4
@@ -1120,13 +1141,8 @@ case $basic_machine in
 		basic_machine=t90-cray
 		os=-unicos
 		;;
-	# This must be matched before tile*.
-	tilegx*)
-		basic_machine=tilegx-unknown
-		os=-linux-gnu
-		;;
 	tile*)
-		basic_machine=tile-unknown
+		basic_machine=$basic_machine-unknown
 		os=-linux-gnu
 		;;
 	tx39)
@@ -1336,7 +1352,7 @@ case $os in
 	      | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
 	      | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
 	      | -chorusos* | -chorusrdb* | -cegcc* \
-	      | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
+	      | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
 	      | -mingw32* | -linux-gnu* | -linux-android* \
 	      | -linux-newlib* | -linux-uclibc* \
 	      | -uxpv* | -beos* | -mpeix* | -udk* \
@@ -1548,9 +1564,6 @@ case $basic_machine in
 		;;
 	m68000-sun)
 		os=-sunos3
-		# This also exists in the configure program, but was not the
-		# default.
-		# os=-sunos4
 		;;
 	m68*-cisco)
 		os=-aout
diff --git a/config/Makefile.am b/config/Makefile.am
index fc60d71..d01711d 100644
--- a/config/Makefile.am
+++ b/config/Makefile.am
@@ -1,11 +1,11 @@
-# $Id: Makefile.am 14432 2012-03-09 10:17:55Z mvgompel $
+# $Id: Makefile.am 15675 2013-02-14 12:28:20Z mvgompel $
 # $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/config/Makefile.am $
 
-config_DATA = tokconfig-en tokconfig-nl tokconfig-fr tokconfig-it \
+config_DATA = tokconfig-en tokconfig-nl tokconfig-fr tokconfig-it tokconfig-es \
 	tokconfig-de tokconfig-sv tokconfig-nl-twitter tokconfig-nl-sonarchat tokconfig-fy \
 	url.rule e-mail.rule smiley.rule \
 	ligatures.filter standard-quotes.quote \
-	exotic-quotes.quote standard-eos.eos exotic-eos.eos
+	exotic-quotes.quote standard-eos.eos exotic-eos.eos nl_afk.abr es.abr
 
 configdir = $(sysconfdir)/$(PACKAGE)
 
diff --git a/config/Makefile.in b/config/Makefile.in
index e3c4d89..05c6551 100644
--- a/config/Makefile.in
+++ b/config/Makefile.in
@@ -1,9 +1,9 @@
-# Makefile.in generated by automake 1.11.1 from Makefile.am.
+# Makefile.in generated by automake 1.11.3 from Makefile.am.
 # @configure_input@
 
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
-# 2003, 2004, 2005, 2006, 2007, 2008, 2009  Free Software Foundation,
-# Inc.
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
@@ -15,7 +15,7 @@
 
 @SET_MAKE@
 
-# $Id: Makefile.am 14432 2012-03-09 10:17:55Z mvgompel $
+# $Id: Makefile.am 15675 2013-02-14 12:28:20Z mvgompel $
 # $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/config/Makefile.am $
 
 VPATH = @srcdir@
@@ -40,11 +40,11 @@ host_triplet = @host@
 subdir = config
 DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ac_define_dir.m4 \
-	$(top_srcdir)/m4/ax_icu_check.m4 $(top_srcdir)/m4/libtool.m4 \
-	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
-	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-	$(top_srcdir)/m4/pkg.m4 $(top_srcdir)/configure.ac
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \
+	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
+	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
+	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/m4/pkg.m4 \
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -74,6 +74,12 @@ am__nobase_list = $(am__nobase_strip_setup); \
 am__base_list = \
   sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
   sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
 am__installdirs = "$(DESTDIR)$(configdir)"
 DATA = $(config_DATA)
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
@@ -153,7 +159,7 @@ SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
 STRIP = @STRIP@
-SYSCONF_PATH = @SYSCONF_PATH@
+SYSCONFDIR = @SYSCONFDIR@
 VERSION = @VERSION@
 XML2_CFLAGS = @XML2_CFLAGS@
 XML2_LIBS = @XML2_LIBS@
@@ -209,14 +215,16 @@ sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
 sysconfdir = @sysconfdir@
 target_alias = @target_alias@
+ticcutils_CFLAGS = @ticcutils_CFLAGS@
+ticcutils_LIBS = @ticcutils_LIBS@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
-config_DATA = tokconfig-en tokconfig-nl tokconfig-fr tokconfig-it \
+config_DATA = tokconfig-en tokconfig-nl tokconfig-fr tokconfig-it tokconfig-es \
 	tokconfig-de tokconfig-sv tokconfig-nl-twitter tokconfig-nl-sonarchat tokconfig-fy \
 	url.rule e-mail.rule smiley.rule \
 	ligatures.filter standard-quotes.quote \
-	exotic-quotes.quote standard-eos.eos exotic-eos.eos
+	exotic-quotes.quote standard-eos.eos exotic-eos.eos nl_afk.abr es.abr
 
 configdir = $(sysconfdir)/$(PACKAGE)
 EXTRA_DIST = $(config_DATA)
@@ -276,9 +284,7 @@ uninstall-configDATA:
 	@$(NORMAL_UNINSTALL)
 	@list='$(config_DATA)'; test -n "$(configdir)" || list=; \
 	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
-	test -n "$$files" || exit 0; \
-	echo " ( cd '$(DESTDIR)$(configdir)' && rm -f" $$files ")"; \
-	cd "$(DESTDIR)$(configdir)" && rm -f $$files
+	dir='$(DESTDIR)$(configdir)'; $(am__uninstall_files_from_dir)
 tags: TAGS
 TAGS:
 
@@ -333,10 +339,15 @@ install-am: all-am
 
 installcheck: installcheck-am
 install-strip:
-	$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	  install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	  `test -z '$(STRIP)' || \
-	    echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
 mostlyclean-generic:
 
 clean-generic:
diff --git a/config/es.abr b/config/es.abr
new file mode 100644
index 0000000..c21eb25
--- /dev/null
+++ b/config/es.abr
@@ -0,0 +1,205 @@
+aa
+vv
+Abg
+Abd.do
+Abg.da
+acept
+a/f
+afmo
+admón.
+amdr
+amd.or
+amd.ora
+ap
+Alfz
+Almte
+apdo
+Arq
+Arz
+atte
+atto
+atta
+av
+avd
+avda
+Bco
+Bibl
+Bmo
+Bo
+Brig
+Bto
+c.
+c/
+cl
+ctv
+ctvo
+caj
+cap
+Cap
+c/c
+Cdad
+céntimo
+Cía
+cje
+Cmdt
+Cmte
+cf
+cfr
+Cnel
+cód
+col
+Col
+Comod
+Comp
+Comte
+cónf
+cónfr
+Contralmte
+coord
+cp
+crec
+cta
+cte
+Cte
+c/u
+don
+Dña
+dcho
+del
+depto
+d/f
+dicc
+diag
+Dir
+doc
+dpto
+dto
+dupdo
+d/v
+e/
+e/c
+ed
+edit
+edo
+ef
+ej
+Emmo
+entlo
+etc
+excl
+Excmo
+Excma
+f
+fra
+fasc
+fca
+Fdo
+féc
+figura
+Fr
+fra
+Gdor
+Gdora
+Gob
+g/p
+Gral
+H
+Hno
+Hna
+ib
+ibíd
+ibid
+id
+íd
+igla
+imp
+impr
+impto
+incl
+Ing
+Inst
+izdo
+izda
+izq
+izqdo
+izqda
+Jhs
+Lcdo
+Lcda
+Ldo
+Lda
+Lic
+loc
+cit
+Ltd
+Ltdo
+Ltda
+Magfco
+Magfca
+máx
+Mín
+Mons
+mr
+ms
+núm
+Ob
+pág
+párr
+Pat
+Pbro
+pdo
+Pdte
+Pdta
+pg
+pl
+plza
+Pnt
+ppal
+pral
+Presb
+Prof
+prov
+prólogo
+pza
+Rdo
+Rda
+Rev
+Rvd
+Rvdo
+reg
+Rep
+Rte
+Rvdmo
+s
+S
+Sdad
+Soc
+Sgto
+sig
+Sr
+Sra
+Srta
+Sto
+Sta
+tel
+teléf
+tfno
+tít
+trad
+Tte
+U
+Ud
+Uds
+V
+Vd
+Vds
+Univ
+Valmte
+Vdo
+Vda
+vid 
+vol
+vs
+vto
+vta
+Xto
diff --git a/config/nl_afk.abr b/config/nl_afk.abr
new file mode 100644
index 0000000..2238e3a
--- /dev/null
+++ b/config/nl_afk.abr
@@ -0,0 +1,463 @@
+Aardoliemij
+Adm
+Adriaansz
+Afd
+Am
+Ant
+Anthoniszn
+Ave
+BMCie
+Bel
+Belastinggr
+Bfr
+Bijv
+Bk
+Blvd
+Br
+Bros
+Burg
+CHR
+Ch
+Chr
+Cie
+Co
+Com
+Corneliszn
+Corp
+CvN
+Cy
+Dep
+Dept
+Di
+Do
+Dhr
+Dr
+Drs
+Ed
+Em
+Eng
+Esq
+Eur
+Exc
+Exp
+F
+Fa
+Fam
+Fed
+Fl
+Fr
+Fred
+Gebr
+Gem
+Gen
+Gld
+H
+HH
+Hd
+Herv
+Hoogl
+Hr
+Hub
+Hzn
+Inc
+Ing
+Inl
+Inst
+Int
+Ir
+Isr
+It
+J-P
+Jac
+Jacq
+Jan
+Jhr
+Jkvr
+Joh
+Jr
+Jul
+Jzn
+KLu
+Kcal
+Kon
+Krj
+L
+Lat
+Ltd
+M
+Ma
+Mad
+Mass
+Mej
+Mevr
+Mgr
+Mij
+Min
+Mr
+Mrs
+Ms
+Mus
+Mw
+N
+NH
+NL
+Nd
+Ndl
+Ned
+Nic
+Nov
+O
+Oct
+Olym
+Org
+Oud-Eng
+P
+PE
+Pct
+PepsiCo
+Ph
+Phs
+Pol
+Prof
+Prov
+RED
+Red
+Rijkscomm
+Rom
+SEPT
+Sept
+Sj
+Sp
+Sr
+St
+Stbl
+Stct
+Sted
+TH
+Tel
+Th
+Tijdschr
+Tj
+Uitg
+Univ
+VS
+Ver
+Vic
+Vl
+Vlnr
+Vr
+Vz
+W
+Werkn
+Wo
+Z
+Za
+Zl
+Zn
+a
+aanv
+acad
+acc
+adj
+adm
+adv
+afb
+afd
+afk
+afl
+afz
+alg
+alt
+arr
+art
+asp
+ass
+atm
+aug
+beh
+beheerscomm
+ben
+benod
+betr
+bijv
+bijz
+bl
+blz
+br
+brab
+brandm
+btw
+bur
+bv
+c
+ca
+cal
+cand
+cao
+cap
+cat
+cc
+cf
+chr
+cm
+cod
+com
+commer
+comp
+coop
+cq
+ct
+deb
+dec
+derg
+dgl
+dgs
+dhr
+di
+dipl
+dir
+distr
+div
+do
+don
+dr
+drs
+ds
+dw
+ed
+eerste-luit
+eerw
+eig
+em
+enk
+enz
+etc
+ev
+evt
+ex
+excl
+f
+fa
+feb
+febr
+fec
+fig
+fl
+fol
+fr
+geb
+gebr
+gedipl
+geh
+gem
+gep
+gesch
+get
+gez
+gld
+gr
+gymn
+h
+herv
+hh
+hoogl
+hs
+ib
+ibid
+id
+ill
+imp
+impr
+inc
+incl
+indiv
+inf
+ing
+ink
+inl
+insp
+int
+intr
+inw
+inz
+ir
+it
+j
+jan
+jg
+jhr
+jl
+joh
+jr
+kHz
+kand
+kath
+kcal
+kg
+kl
+km
+l
+lb
+lib
+lic
+ll
+lt
+ltd
+m
+ma
+maj
+max
+med
+medew
+mej
+mevr
+mg
+mgr
+mil
+milj
+mld
+mln
+mm
+mnd
+mr
+mrd
+mrs
+mrt
+ms
+mtr
+muz
+mv
+mw
+n
+ned
+nl
+nom
+nov
+nr
+o
+oa
+ob
+obl
+okt
+olv
+ong
+ongeh
+onz
+opm
+opp
+or
+org
+oud-bevelv
+oud-penn
+oud-secr
+oud-voorz
+oud-vrijw
+oud-vrz
+p
+pCt
+pag
+par
+pct
+pd
+penn
+penningm
+perf
+persc
+pl
+plm
+plv
+pnt
+pr
+praes
+pres
+prk
+proc
+prof
+prot
+prov
+ps
+pt
+r
+re
+reg
+resp
+ret
+rk
+sc
+scholengem
+schr
+scr
+sec
+sept
+seq
+ser
+sin
+sing
+soc
+spr
+sq
+sr
+st
+subs
+subst
+sup
+t
+tab
+td
+tech
+temp
+terugbez
+tg
+tgov
+theel
+tit
+tv
+tw
+v
+vac
+var
+vdt
+verb
+verg
+versch
+vert
+vgl
+vice-voorz
+vice-vrz
+vid
+vlg
+vlgg
+vlnr
+vml
+vnl
+vnlr
+vnw
+voc
+voorl
+voorm
+voorw
+voorz
+vorstverl
+vr
+vrijw
+vrijwil
+vrijwill
+vrz
+vs
+wd
+weled
+weledelgeb
+weledelgestr
+weleerw
+werkg
+wo
+wsch
+z
+za
+zelfst
+zg
+zgn
+zn
+zog
+zw
+zwemb
diff --git a/config/smiley.rule b/config/smiley.rule
index c282ce9..e2db4d3 100644
--- a/config/smiley.rule
+++ b/config/smiley.rule
@@ -1 +1,2 @@
-SMILEY=^(?:[oO>}\])]?)(?:[\:;8][',]?[-\^]?(?:[sSdDpPcCoO#@*$|]|\)\)?|\{|\[|\(\(?)=?)$
+SMILEY=^(?:[oO>}\])]?)(?:[\:;8][',]?[-\^]?(?:[sSdDpPcCoO#@*$|?]|\)\)*|\{|\[|\(\(*)=?)$
+REVERSE-SMILEY=^(?:\(|\)|\}*)(?:[sScCoO#@*$|?]?|\{|\[|\(\(?)=?(?:[',]?[-\^]?[\:;8])(?:[oO<}\[)]?)$
diff --git a/config/standard-quotes.quote b/config/standard-quotes.quote
index 1eb5c1a..6e68d3e 100644
--- a/config/standard-quotes.quote
+++ b/config/standard-quotes.quote
@@ -1,7 +1,8 @@
 # all quotes must be entered as pairs of open en close quotes
-# separated by a ||
+# separated by a space
 # When more opening quotes match a single closing quote (or visa versa)
 # they must be aggregated in one string!
+# the ambiguous quotes " and ' are handled automaticly
 
 ‘ ’
 “„‟ ”
diff --git a/config/tokconfig-de b/config/tokconfig-de
index 450671c..7f829fb 100644
--- a/config/tokconfig-de
+++ b/config/tokconfig-de
@@ -1,11 +1,18 @@
+[RULE-ORDER]
+WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
+E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
+ABBREVIATION INITIALS INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
+NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+
 [RULES]
 %include url
 %include e-mail
+%include smiley
 
-#Ex (oud)-studente(s)
+#Ex (alt)-studente()
 WORD-PARPREFIX-PARSUFFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})
 
-#Ex: (oud)-studente, (on)zin,
+#Ex: (un)verstehbar,
 WORD-PARPREFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*
 
 #Ex: könig(in)
@@ -15,7 +22,7 @@ WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{
 WORD-COMPOUND=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)+
 
 #Abbreviations with multiple periods
-ABBREVIATION=\p{L}{1,3}(?:\.\p{L}{1,3})+\.?
+ABBREVIATION=^(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)\Z
 
 #retain initials
 INITIAL=^(?:\p{Lt}|\p{Lu})\.$
@@ -206,9 +213,17 @@ zus
 Ztr
 zzgl
 
+[FILTER]
+fl fl
+ff ff
+ffi ffi
+ffl ffl
+# also filter soft hyphen 
+\u00AD
+
+
 [EOSMARKERS]
 %include standard-eos
-%include exotic-eos
 
 [QUOTES]
 %include standard-quotes
diff --git a/config/tokconfig-en b/config/tokconfig-en
index e2db952..586dc5f 100644
--- a/config/tokconfig-en
+++ b/config/tokconfig-en
@@ -1,6 +1,13 @@
+[RULE-ORDER]
+WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
+E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
+ABBREVIATION INITIALS INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
+NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+
 [RULES]
 %include url
 %include e-mail
+%include smiley
 
 #Ex: (dis)information
 WORD-PARPREFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*
@@ -12,7 +19,7 @@ WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{
 WORD-COMPOUND=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)+
 
 #Abbreviations with multiple periods
-ABBREVIATION=\p{L}{1,3}(?:\.\p{L}{1,3})+\.?
+ABBREVIATION=^(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)\Z
 
 #retain initials
 INITIAL=^(?:\p{Lt}|\p{Lu})\.$
@@ -21,8 +28,8 @@ INITIAL=^(?:\p{Lt}|\p{Lu})\.$
 PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 
 #Date
-DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
-DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
+DATE=\p{N}{1,2}/\p{Ps}?\p{N}{1,2}[/]\p{Ps}?\p{N}{2,4}
+DATE-REVERSE=\p{N}{4}/\p{N}{1,2}/\p{N}{1,2}
 
 NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
 #NUMBER-YEAR=('\p{N}{2})\P{N}
@@ -50,6 +57,7 @@ n't
 's
 'd
 've
+'ll
 
 [ORDINALS]
 st
@@ -246,88 +254,17 @@ Wis
 Wyo
 
 
-[EOSMARKERS]
-# Character: !
-# Name: EXCLAMATION MARK
-# Code: 33 (0x21) 
-\u0021
-
-# Character: ?
-# Name: QUESTION MARK
-# Code: 3f (0x3f) 
-\u003F
-
-# Character: ;
-# Name: GREEK QUESTION MARK
-# Code: 894 (0x37e) 
-\u037e
-
-# Character: ؟
-# Name: ARABIC QUESTION MARK
-# Code: 1567 (0x61f) 
-\u061f
-
-# Character: 。
-# Name: IDEOGRAPHIC FULL STOP
-# Code: 12290 (0x3002) 
-\u3002
-
-# Character: 。
-# Name: HALFWIDTH IDEOGRAPHIC FULL STOP
-# Code: 65377 (0xff61) 
-\uff61
-
-# Character: ?
-# Name: FULLWIDTH QUESTION MARK
-# Code: 65311 (0xff1f) 
-\uff1f
+[FILTER]
+fl fl
+ff ff
+ffi ffi
+ffl ffl
+# also filter soft hyphen 
+\u00AD
 
-# Character: !
-# Name: FULLWIDTH EXCLAMATION MARK
-# Code: 65281 (0xff01) 
-\uff01
 
-# Character: ।
-# Name: DEVANAGARI DANDA
-# Code: 2404 (0x964) 
-\u0964
-
-# Character: ։
-# Name: ARMENIAN FULL STOP
-# Code: 1417 (0x589) 
-\u0589
-
-# Character: ՞
-# Name: ARMENIAN QUESTION MARK
-# Code: 1374 (0x55e) 
-\u055e
-
-# Character: ።
-# Name: ETHIOPIC FULL STOP
-# Code: 4962 (0x1362) 
-\u1362
-
-# Character: ᙮
-# Name: CANADIAN SYLLABICS FULL STOP
-# Code: 5742 (0x166e) 
-\u166e
-
-# Character: ។
-# Name: KHMER SIGN KHAN
-# Code: 6100 (0x17d4) 
-\u17d4
-
-# Character: ៕
-# Name: KHMER SIGN BARIYOOSAN
-# Code: 6101 (0x17d5) 
-\u17d5
-
-# Character: ᠃
-# Name: MONGOLIAN FULL STOP
-# Code: 6147 (0x1803) 
-\u1803
+[EOSMARKERS]
+%include standard-eos
 
-# Character: ᠉
-# Name: MONGOLIAN MANCHU FULL STOP
-# Code: 6153 (0x1809) 
-\u1809
+[QUOTES]
+%include standard-quotes
diff --git a/config/tokconfig-de b/config/tokconfig-es
similarity index 51%
copy from config/tokconfig-de
copy to config/tokconfig-es
index 450671c..d46ba7d 100644
--- a/config/tokconfig-de
+++ b/config/tokconfig-es
@@ -1,21 +1,25 @@
+[RULE-ORDER]
+WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
+E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
+ABBREVIATION INITIALS INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
+NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
+
 [RULES]
 %include url
 %include e-mail
+%include smiley
 
-#Ex (oud)-studente(s)
-WORD-PARPREFIX-PARSUFFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})
-
-#Ex: (oud)-studente, (on)zin,
+#Ex: (dis)information
 WORD-PARPREFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*
 
-#Ex: könig(in)
+#Ex: understand(s)
 WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})
 
 #Keep dash/underscore connected parts (even if they are in parenthesis)
 WORD-COMPOUND=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)+
 
 #Abbreviations with multiple periods
-ABBREVIATION=\p{L}{1,3}(?:\.\p{L}{1,3})+\.?
+ABBREVIATION=^(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)\Z
 
 #retain initials
 INITIAL=^(?:\p{Lt}|\p{Lu})\.$
@@ -24,8 +28,8 @@ INITIAL=^(?:\p{Lt}|\p{Lu})\.$
 PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 
 #Date
-DATE=\p{N}{1,2}[\.-]\p{Ps}?\p{N}{1,2}[\.-]\p{Ps}?\p{N}{2,4}
-DATE-REVERSE=\p{N}{4}-\p{N}{1,2}[\.-]\p{N}{1,2}\.?
+DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
+DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
 
 NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
 #NUMBER-YEAR=('\p{N}{2})\P{N}
@@ -47,10 +51,12 @@ UNKNOWN=.
 [PREFIXES]
 
 [SUFFIXES]
-'s
+
+[ORDINALS]
+o
+a
 
 [TOKENS]
-'s
 
 [UNITS]
 km
@@ -67,150 +73,17 @@ min
 gb
 mb
 kb
-St
 
 
 [CURRENCY]
 EUR
-DM
-USD
 
 [ABBREVIATIONS]
-Abb
-Abf
-Abk
-Abs
-Abt
-abzgl
-Adr
-am
-amtl
-Anh
-Anl
-Anm
-Aufl
-b
-Bd
-beil
-bes
-Betr
-Bez
-Bhf
-bzgl
-bzw
-ca
-Chr
-dgl
-Dir
-DM
-Dr
-dt
-Dtzd
-ehem
-eigtl
-eindschl
-entspr
-erb
-erw
-Erw
-etc
-ev
-evtl
-exkl
-f
-Fa
-Fam
-Ffm
-Fr
-Frl
-frz
-geb
-Gebr
-gedr
-gegr
-gek
-Ges
-gesch
-gest
-gez
-ggf
-ggfs
-Hbf
-Hg
-hpts
-Hptst
-Hr
-Hrn
-Hrsg
-Ing
-Inh
-inkl
-jew
-Jh
-jhrl
-Kap
-kath
-Kfm
-kfm
-kgl
-Kl
-l
-led
-Mio
-möbl
-Mr
-Mrd
-Msp
-mtl
-MwSt
-MWSt
-näml
-Nr
-o
-Obb
-od
-österr
-Pfd
-Pl
-Prof
-r
-Red
-röm
-röm\.-kath
-S
-s
-Sa
-schles
-schwäb
-schweiz
-So
-sog
-Spvgg
-St
-Str
-StR
-südd
-tägl
-Tel
-Ts
-u
-usw
-Verf
-verh
-verw
-vgl
-vorm
-z
-zur
-zus
-Ztr
-zzgl
+%include es
 
 [EOSMARKERS]
 %include standard-eos
-%include exotic-eos
 
 [QUOTES]
 %include standard-quotes
 %include exotic-quotes
-
diff --git a/config/tokconfig-fr b/config/tokconfig-fr
index d82ca7a..5b57e4c 100644
--- a/config/tokconfig-fr
+++ b/config/tokconfig-fr
@@ -1,6 +1,13 @@
+[RULE-ORDER]
+WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
+E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
+ABBREVIATION INITIALS INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
+NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+
 [RULES]
 %include url
 %include e-mail
+%include smiley
 
 #Ex: (dis)information
 WORD-PARPREFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*
@@ -12,7 +19,7 @@ WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{
 WORD-COMPOUND=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)+
 
 #Abbreviations with multiple periods
-ABBREVIATION=\p{L}{1,3}(?:\.\p{L}{1,3})+\.?
+ABBREVIATION=^(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)\Z
 
 #retain initials
 INITIAL=^(?:\p{Lt}|\p{Lu})\.$
@@ -93,88 +100,17 @@ CAD
 [ABBREVIATIONS]
 
 
+[FILTER]
+fl fl
+ff ff
+ffi ffi
+ffl ffl
+# also filter soft hyphen 
+\u00AD
+
+
 [EOSMARKERS]
-# Character: !
-# Name: EXCLAMATION MARK
-# Code: 33 (0x21) 
-\u0021
-
-# Character: ?
-# Name: QUESTION MARK
-# Code: 3f (0x3f) 
-\u003F
-
-# Character: ;
-# Name: GREEK QUESTION MARK
-# Code: 894 (0x37e) 
-\u037e
-
-# Character: ؟
-# Name: ARABIC QUESTION MARK
-# Code: 1567 (0x61f) 
-\u061f
-
-# Character: 。
-# Name: IDEOGRAPHIC FULL STOP
-# Code: 12290 (0x3002) 
-\u3002
-
-# Character: 。
-# Name: HALFWIDTH IDEOGRAPHIC FULL STOP
-# Code: 65377 (0xff61) 
-\uff61
-
-# Character: ?
-# Name: FULLWIDTH QUESTION MARK
-# Code: 65311 (0xff1f) 
-\uff1f
-
-# Character: !
-# Name: FULLWIDTH EXCLAMATION MARK
-# Code: 65281 (0xff01) 
-\uff01
-
-# Character: ।
-# Name: DEVANAGARI DANDA
-# Code: 2404 (0x964) 
-\u0964
-
-# Character: ։
-# Name: ARMENIAN FULL STOP
-# Code: 1417 (0x589) 
-\u0589
-
-# Character: ՞
-# Name: ARMENIAN QUESTION MARK
-# Code: 1374 (0x55e) 
-\u055e
-
-# Character: ።
-# Name: ETHIOPIC FULL STOP
-# Code: 4962 (0x1362) 
-\u1362
-
-# Character: ᙮
-# Name: CANADIAN SYLLABICS FULL STOP
-# Code: 5742 (0x166e) 
-\u166e
-
-# Character: ។
-# Name: KHMER SIGN KHAN
-# Code: 6100 (0x17d4) 
-\u17d4
-
-# Character: ៕
-# Name: KHMER SIGN BARIYOOSAN
-# Code: 6101 (0x17d5) 
-\u17d5
-
-# Character: ᠃
-# Name: MONGOLIAN FULL STOP
-# Code: 6147 (0x1803) 
-\u1803
-
-# Character: ᠉
-# Name: MONGOLIAN MANCHU FULL STOP
-# Code: 6153 (0x1809) 
-\u1809
+%include standard-eos
+
+[QUOTES]
+%include standard-quotes
diff --git a/config/tokconfig-fy b/config/tokconfig-fy
index 03ddb1d..2981a6c 100644
--- a/config/tokconfig-fy
+++ b/config/tokconfig-fy
@@ -59,16 +59,14 @@ UNKNOWN=.
 
 [PREFIXES]
 
-[ATTACHEDSUFFIXES]
-\['`’]s
-\['`’]t
-\['`’]n
 
 [ORDINALS]
 e
 de
 ste
 
+[ATTACHEDSUFFIXES]
+\['`’]s
 
 [TOKENS]
 \['`’]s
@@ -77,6 +75,7 @@ ste
 \['`’]n
 \['`’]t
 \['`’]d
+\['`’]e
 
 [UNITS]
 km
@@ -304,7 +303,7 @@ kub
 kw
 l
 L
-Lat
+Latherman
 Lem
 lidw
 lit
diff --git a/config/tokconfig-it b/config/tokconfig-it
index 804106a..7d22faf 100644
--- a/config/tokconfig-it
+++ b/config/tokconfig-it
@@ -12,7 +12,8 @@ WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{
 WORD-COMPOUND=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)+
 
 #Abbreviations with multiple periods
-ABBREVIATION=\p{L}{1,3}(?:\.\p{L}{1,3})+\.?
+ABBREVIATION=^(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)\Z
+
 
 #retain initials
 INITIAL=^(?:\p{Lt}|\p{Lu})\.$
diff --git a/config/tokconfig-nl b/config/tokconfig-nl
index fa05122..516299e 100644
--- a/config/tokconfig-nl
+++ b/config/tokconfig-nl
@@ -1,7 +1,9 @@
 [RULE-ORDER]
-WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
-E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
-ABBREVIATION INITIALS INITIAL SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
+WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL QUOTE-COMPOUND
+NUMBER-STRING STRING-NUMBER URL URL-WWW URL-DOMAIN E-MAIL
+WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
+ABBREVIATION INITIALS INITIAL SMILEY REVERSE-SMILEY 
+PUNCTUATION-MULTI DATE DATE-REVERSE
 NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
 # to do PREFIXES (is leeg nu) UNITS (uitgecommentarieerd in de c++ code) 
 
@@ -22,6 +24,9 @@ WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{
 #Keep dash/underscore connected parts (even if they are in parenthesis)
 WORD-COMPOUND=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)+
 
+#Keep quote connected parts (even if they are in parenthesis)
+QUOTE-COMPOUND=[\p{L}\p{N}]+(?:['`’‘]\p{L}+)+
+
 #Abbreviations with multiple periods
 ABBREVIATION=^(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)\Z
 
@@ -31,8 +36,6 @@ INITIALS=(\p{L}(?:\.\p{L})+\.)\p{Lu}\p{L}{3,999}+
 #retain initials
 INITIAL=^(?:\p{Lt}|\p{Lu})\.$
 
-#SMILEY=^(?:>?[:;]['`^]?[-~]*[)}\](\\/\[{Ss\$PpDd]+)$
-
 #Homogeneous punctuation (ellipsis etc)
 PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 
@@ -40,11 +43,17 @@ PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
 DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
 
-NUMBER-YEAR=(['`’]\p{N}{2})(?:\P{N}|\z)
+#numberstring like 20jarige
+NUMBER-STRING=\p{N}+(?:\p{Pd}?)(?:\p{L}+)
+
+#combinations like A50, a1 
+STRING-NUMBER=\p{L}+\p{Pd}?\p{N}+
+
+NUMBER-YEAR=(['`’‘]\p{N}{2})(?:\P{N}|\z)
 #NUMBER-YEAR=('\p{N}{2})\P{N}
 
 #Times
-TIME=\p{N}{1,2}:\p{N}{1,2}(?::\p{N})?(am|pm|AM|PM)?
+TIME=\p{N}{1,2}:\p{N}{1,2}(?::\p{N}{1,2})?(am|pm|AM|PM)?
 
 #retain digits, including those starting with initial period (.22), and negative numbers
 NUMBER=-?(?:[\.,]?\p{N}+)+
@@ -60,9 +69,10 @@ UNKNOWN=.
 [PREFIXES]
 
 [ATTACHEDSUFFIXES]
-\['`’]s
-\['`’]t
-\['`’]n
+\['`’‘]s
+\['`’‘]t
+\['`’‘]n
+\['`’‘]r
 
 [ORDINALS]
 e
@@ -72,11 +82,12 @@ er
 
 
 [TOKENS]
-\['`’]s
-\['`’]k
-\['`’]m
-\['`’]n
-\['`’]t
+\['`’‘]s
+\['`’‘]k
+\['`’‘]m
+\['`’‘]n
+\['`’‘]t
+\['`’‘]ie
 
 [UNITS]
 km
@@ -103,470 +114,7 @@ f
 
 
 [ABBREVIATIONS]
-Aardoliemij
-Adm
-Adriaansz
-Afd
-Am
-Ant
-Anthoniszn
-Ave
-BMCie
-Bel
-Belastinggr
-Bfr
-Bijv
-Bk
-Blvd
-Br
-Bros
-Burg
-CHR
-Ch
-Chr
-Cie
-Co
-Com
-Corneliszn
-Corp
-CvN
-Cy
-Dep
-Dept
-Di
-Do
-Dhr
-Dr
-Drs
-Ed
-Em
-Eng
-Esq
-Eur
-Exc
-Exp
-F
-Fa
-Fam
-Fed
-Fl
-Fr
-Fred
-Gebr
-Gem
-Gen
-Gld
-H
-HH
-Hd
-Herv
-Hoogl
-Hr
-Hub
-Hzn
-Inc
-Ing
-Inl
-Inst
-Int
-Ir
-Isr
-It
-J-P
-Jac
-Jacq
-Jan
-Jhr
-Jkvr
-Joh
-Jr
-Jul
-Jzn
-KLu
-Kcal
-Kon
-Krj
-L
-Lat
-Ltd
-M
-Ma
-Mad
-Mass
-Mej
-Mevr
-Mgr
-Mij
-Min
-Mr
-Mrs
-Ms
-Mus
-Mw
-N
-NH
-NL
-Nd
-Ndl
-Ned
-Nic
-Nov
-O
-Oct
-Olym
-Org
-Oud-Eng
-P
-PE
-Pct
-PepsiCo
-Ph
-Phs
-Pol
-Prof
-Prov
-RED
-Red
-Rijkscomm
-Rom
-SEPT
-Sept
-Sj
-Sp
-Sr
-St
-Stbl
-Stct
-Sted
-TH
-Tel
-Th
-Tijdschr
-Tj
-Uitg
-Univ
-VS
-Ver
-Vic
-Vl
-Vlnr
-Vr
-Vz
-W
-Werkn
-Wo
-Z
-Za
-Zl
-Zn
-a
-aanv
-acad
-acc
-adj
-adm
-adv
-afb
-afd
-afk
-afl
-afz
-al
-alg
-alt
-arr
-art
-asp
-ass
-atm
-aug
-beh
-beheerscomm
-ben
-benod
-betr
-bijv
-bijz
-bl
-blz
-br
-brab
-brandm
-btw
-bur
-bv
-c
-ca
-cal
-cand
-cao
-cap
-cat
-cc
-cf
-chr
-cm
-cod
-com
-commer
-comp
-coop
-cq
-ct
-deb
-dec
-derg
-dgl
-dgs
-dhr
-di
-dipl
-dir
-distr
-div
-do
-don
-dr
-drs
-ds
-dw
-ed
-eerste-luit
-eerw
-eig
-em
-enk
-enz
-etc
-ev
-evt
-ex
-excl
-f
-fa
-feb
-febr
-fec
-fig
-fl
-fol
-fr
-geb
-gebr
-gedipl
-geh
-gem
-gep
-gesch
-get
-gez
-gld
-gr
-gymn
-h
-herv
-hh
-hoogl
-hs
-ib
-ibid
-id
-ill
-imp
-impr
-inc
-incl
-indiv
-inf
-ing
-ink
-inl
-insp
-int
-intr
-inw
-inz
-ir
-it
-j
-jan
-jg
-jhr
-jl
-joh
-jr
-kHz
-kand
-kath
-kcal
-kg
-kl
-km
-l
-lb
-lib
-lic
-ll
-lt
-ltd
-m
-ma
-maj
-max
-med
-medew
-mej
-mevr
-mg
-mgr
-mil
-milj
-mld
-mln
-mm
-mnd
-mr
-mrd
-mrs
-mrt
-ms
-mtr
-muz
-mv
-mw
-n
-ned
-nl
-nom
-nov
-nr
-o
-oa
-ob
-obl
-okt
-olv
-ong
-ongeh
-onz
-opm
-opp
-or
-org
-oud-bevelv
-oud-penn
-oud-secr
-oud-voorz
-oud-vrijw
-oud-vrz
-p
-pCt
-pag
-par
-pct
-pd
-penn
-penningm
-perf
-persc
-pl
-plm
-plv
-pnt
-pr
-praes
-pres
-prk
-proc
-prof
-prot
-prov
-ps
-pt
-r
-re
-reg
-resp
-ret
-rk
-sc
-scholengem
-schr
-scr
-sec
-sept
-seq
-ser
-sin
-sing
-soc
-spr
-sq
-sr
-st
-subs
-subst
-sup
-t
-tab
-td
-tech
-temp
-terugbez
-tg
-tgov
-theel
-tit
-tv
-tw
-v
-vac
-var
-vdt
-verb
-verg
-versch
-vert
-vgl
-vice-voorz
-vice-vrz
-vid
-vlg
-vlgg
-vlnr
-vml
-vnl
-vnlr
-vnw
-voc
-voorl
-voorm
-voorw
-voorz
-vorstverl
-vr
-vrijw
-vrijwil
-vrijwill
-vrz
-vs
-wd
-weled
-weledelgeb
-weledelgestr
-weleerw
-werkg
-wo
-wsch
-z
-za
-zelfst
-zg
-zgn
-zn
-zog
-zw
-zwemb
+%include nl_afk
 
 [FILTER]
 %include ligatures
diff --git a/config/tokconfig-nl-sonarchat b/config/tokconfig-nl-sonarchat
index 3c7bba0..1c292ba 100644
--- a/config/tokconfig-nl-sonarchat
+++ b/config/tokconfig-nl-sonarchat
@@ -1,7 +1,8 @@
 [RULE-ORDER]
-WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
-E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND NICKNAME
-ABBREVIATION INITIALS INITIAL SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
+WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW 
+URL-DOMAIN E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX 
+WORD-COMPOUND NICKNAME ABBREVIATION INITIALS INITIAL SMILEY REVERSE-SMILEY
+PUNCTUATION-MULTI DATE DATE-REVERSE
 NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
 # to do PREFIXES (is leeg nu) UNITS (uitgecommentarieerd in de c++ code) 
 
@@ -105,470 +106,7 @@ f
 
 
 [ABBREVIATIONS]
-Aardoliemij
-Adm
-Adriaansz
-Afd
-Am
-Ant
-Anthoniszn
-Ave
-BMCie
-Bel
-Belastinggr
-Bfr
-Bijv
-Bk
-Blvd
-Br
-Bros
-Burg
-CHR
-Ch
-Chr
-Cie
-Co
-Com
-Corneliszn
-Corp
-CvN
-Cy
-Dep
-Dept
-Di
-Do
-Dhr
-Dr
-Drs
-Ed
-Em
-Eng
-Esq
-Eur
-Exc
-Exp
-F
-Fa
-Fam
-Fed
-Fl
-Fr
-Fred
-Gebr
-Gem
-Gen
-Gld
-H
-HH
-Hd
-Herv
-Hoogl
-Hr
-Hub
-Hzn
-Inc
-Ing
-Inl
-Inst
-Int
-Ir
-Isr
-It
-J-P
-Jac
-Jacq
-Jan
-Jhr
-Jkvr
-Joh
-Jr
-Jul
-Jzn
-KLu
-Kcal
-Kon
-Krj
-L
-Lat
-Ltd
-M
-Ma
-Mad
-Mass
-Mej
-Mevr
-Mgr
-Mij
-Min
-Mr
-Mrs
-Ms
-Mus
-Mw
-N
-NH
-NL
-Nd
-Ndl
-Ned
-Nic
-Nov
-O
-Oct
-Olym
-Org
-Oud-Eng
-P
-PE
-Pct
-PepsiCo
-Ph
-Phs
-Pol
-Prof
-Prov
-RED
-Red
-Rijkscomm
-Rom
-SEPT
-Sept
-Sj
-Sp
-Sr
-St
-Stbl
-Stct
-Sted
-TH
-Tel
-Th
-Tijdschr
-Tj
-Uitg
-Univ
-VS
-Ver
-Vic
-Vl
-Vlnr
-Vr
-Vz
-W
-Werkn
-Wo
-Z
-Za
-Zl
-Zn
-a
-aanv
-acad
-acc
-adj
-adm
-adv
-afb
-afd
-afk
-afl
-afz
-al
-alg
-alt
-arr
-art
-asp
-ass
-atm
-aug
-beh
-beheerscomm
-ben
-benod
-betr
-bijv
-bijz
-bl
-blz
-br
-brab
-brandm
-btw
-bur
-bv
-c
-ca
-cal
-cand
-cao
-cap
-cat
-cc
-cf
-chr
-cm
-cod
-com
-commer
-comp
-coop
-cq
-ct
-deb
-dec
-derg
-dgl
-dgs
-dhr
-di
-dipl
-dir
-distr
-div
-do
-don
-dr
-drs
-ds
-dw
-ed
-eerste-luit
-eerw
-eig
-em
-enk
-enz
-etc
-ev
-evt
-ex
-excl
-f
-fa
-feb
-febr
-fec
-fig
-fl
-fol
-fr
-geb
-gebr
-gedipl
-geh
-gem
-gep
-gesch
-get
-gez
-gld
-gr
-gymn
-h
-herv
-hh
-hoogl
-hs
-ib
-ibid
-id
-ill
-imp
-impr
-inc
-incl
-indiv
-inf
-ing
-ink
-inl
-insp
-int
-intr
-inw
-inz
-ir
-it
-j
-jan
-jg
-jhr
-jl
-joh
-jr
-kHz
-kand
-kath
-kcal
-kg
-kl
-km
-l
-lb
-lib
-lic
-ll
-lt
-ltd
-m
-ma
-maj
-max
-med
-medew
-mej
-mevr
-mg
-mgr
-mil
-milj
-mld
-mln
-mm
-mnd
-mr
-mrd
-mrs
-mrt
-ms
-mtr
-muz
-mv
-mw
-n
-ned
-nl
-nom
-nov
-nr
-o
-oa
-ob
-obl
-okt
-olv
-ong
-ongeh
-onz
-opm
-opp
-or
-org
-oud-bevelv
-oud-penn
-oud-secr
-oud-voorz
-oud-vrijw
-oud-vrz
-p
-pCt
-pag
-par
-pct
-pd
-penn
-penningm
-perf
-persc
-pl
-plm
-plv
-pnt
-pr
-praes
-pres
-prk
-proc
-prof
-prot
-prov
-ps
-pt
-r
-re
-reg
-resp
-ret
-rk
-sc
-scholengem
-schr
-scr
-sec
-sept
-seq
-ser
-sin
-sing
-soc
-spr
-sq
-sr
-st
-subs
-subst
-sup
-t
-tab
-td
-tech
-temp
-terugbez
-tg
-tgov
-theel
-tit
-tv
-tw
-v
-vac
-var
-vdt
-verb
-verg
-versch
-vert
-vgl
-vice-voorz
-vice-vrz
-vid
-vlg
-vlgg
-vlnr
-vml
-vnl
-vnlr
-vnw
-voc
-voorl
-voorm
-voorw
-voorz
-vorstverl
-vr
-vrijw
-vrijwil
-vrijwill
-vrz
-vs
-wd
-weled
-weledelgeb
-weledelgestr
-weleerw
-werkg
-wo
-wsch
-z
-za
-zelfst
-zg
-zgn
-zn
-zog
-zw
-zwemb
+%include nl_afk
 
 [FILTER]
 %include ligatures
diff --git a/config/tokconfig-nl-twitter b/config/tokconfig-nl-twitter
index 1d2fe43..c8339ed 100644
--- a/config/tokconfig-nl-twitter
+++ b/config/tokconfig-nl-twitter
@@ -1,7 +1,8 @@
 [RULE-ORDER]
-WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
-E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
-ABBREVIATION INITIALS INITIAL SMILEY HASHTAG ADDRESSEE PUNCTUATION-MULTI DATE DATE-REVERSE
+WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW 
+URL-DOMAIN E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX 
+WORD-COMPOUND ABBREVIATION INITIALS INITIAL SMILEY REVERSE-SMILEY HASHTAG 
+ADDRESSEE PUNCTUATION-MULTI DATE DATE-REVERSE
 NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
 # to do PREFIXES (is leeg nu) UNITS (uitgecommentarieerd in de c++ code) 
 
@@ -42,11 +43,11 @@ PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
 DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
 
-NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
+NUMBER-YEAR=(['`’]\p{N}{2})(?:\P{N}|\z)
 #NUMBER-YEAR=('\p{N}{2})\P{N}
 
 #Times
-TIME=\p{N}{1,2}:\p{N}{1,2}(?::\p{N})?(?i:am|pm)?
+TIME=\p{N}{1,2}:\p{N}{1,2}(?::\p{N}{1,2})?(am|pm|AM|PM)?
 
 #retain digits, including those starting with initial period (.22), and negative numbers
 NUMBER=-?(?:[\.,]?\p{N}+)+
@@ -63,9 +64,10 @@ UNKNOWN=.
 [PREFIXES]
 
 [ATTACHEDSUFFIXES]
-'s
-'t
-'n
+\['`’]s
+\['`’]t
+\['`’]n
+\['`’]r
 
 [ORDINALS]
 e
@@ -75,11 +77,12 @@ er
 
 
 [TOKENS]
-'s
-'k
-'m
-'n
-'t
+\['`’]s
+\['`’]k
+\['`’]m
+\['`’]n
+\['`’]t
+\['`’]ie
 
 [UNITS]
 km
@@ -104,472 +107,8 @@ hfl
 fl
 f
 
-
 [ABBREVIATIONS]
-Aardoliemij
-Adm
-Adriaansz
-Afd
-Am
-Ant
-Anthoniszn
-Ave
-BMCie
-Bel
-Belastinggr
-Bfr
-Bijv
-Bk
-Blvd
-Br
-Bros
-Burg
-CHR
-Ch
-Chr
-Cie
-Co
-Com
-Corneliszn
-Corp
-CvN
-Cy
-Dep
-Dept
-Di
-Do
-Dhr
-Dr
-Drs
-Ed
-Em
-Eng
-Esq
-Eur
-Exc
-Exp
-F
-Fa
-Fam
-Fed
-Fl
-Fr
-Fred
-Gebr
-Gem
-Gen
-Gld
-H
-HH
-Hd
-Herv
-Hoogl
-Hr
-Hub
-Hzn
-Inc
-Ing
-Inl
-Inst
-Int
-Ir
-Isr
-It
-J-P
-Jac
-Jacq
-Jan
-Jhr
-Jkvr
-Joh
-Jr
-Jul
-Jzn
-KLu
-Kcal
-Kon
-Krj
-L
-Lat
-Ltd
-M
-Ma
-Mad
-Mass
-Mej
-Mevr
-Mgr
-Mij
-Min
-Mr
-Mrs
-Ms
-Mus
-Mw
-N
-NH
-NL
-Nd
-Ndl
-Ned
-Nic
-Nov
-O
-Oct
-Olym
-Org
-Oud-Eng
-P
-PE
-Pct
-PepsiCo
-Ph
-Phs
-Pol
-Prof
-Prov
-RED
-Red
-Rijkscomm
-Rom
-SEPT
-Sept
-Sj
-Sp
-Sr
-St
-Stbl
-Stct
-Sted
-TH
-Tel
-Th
-Tijdschr
-Tj
-Uitg
-Univ
-VS
-Ver
-Vic
-Vl
-Vlnr
-Vr
-Vz
-W
-Werkn
-Wo
-Z
-Za
-Zl
-Zn
-a
-aanv
-acad
-acc
-adj
-adm
-adv
-afb
-afd
-afk
-afl
-afz
-al
-alg
-alt
-arr
-art
-asp
-ass
-atm
-aug
-beh
-beheerscomm
-ben
-benod
-betr
-bijv
-bijz
-bl
-blz
-br
-brab
-brandm
-btw
-bur
-bv
-c
-ca
-cal
-cand
-cao
-cap
-cat
-cc
-cf
-chr
-cm
-cod
-com
-commer
-comp
-coop
-cq
-ct
-deb
-dec
-derg
-dgl
-dgs
-dhr
-di
-dipl
-dir
-distr
-div
-do
-don
-dr
-drs
-ds
-dw
-ed
-eerste-luit
-eerw
-eig
-em
-enk
-enz
-etc
-ev
-evt
-ex
-excl
-f
-fa
-feb
-febr
-fec
-fig
-fl
-fol
-fr
-geb
-gebr
-gedipl
-geh
-gem
-gep
-gesch
-get
-gez
-gld
-gr
-gymn
-h
-herv
-hh
-hoogl
-hs
-ib
-ibid
-id
-ill
-imp
-impr
-inc
-incl
-indiv
-inf
-ing
-ink
-inl
-insp
-int
-intr
-inw
-inz
-ir
-it
-j
-jan
-jg
-jhr
-jl
-joh
-jr
-kHz
-kand
-kath
-kcal
-kg
-kl
-km
-l
-lb
-lib
-lic
-ll
-lt
-ltd
-m
-ma
-maj
-max
-med
-medew
-mej
-mevr
-mg
-mgr
-mil
-milj
-mld
-mln
-mm
-mnd
-mr
-mrd
-mrs
-mrt
-ms
-mtr
-muz
-mv
-mw
-n
-ned
-nl
-nom
-nov
-nr
-o
-oa
-ob
-obl
-okt
-olv
-ong
-ongeh
-onz
-opm
-opp
-or
-org
-oud-bevelv
-oud-penn
-oud-secr
-oud-voorz
-oud-vrijw
-oud-vrz
-p
-pCt
-pag
-par
-pct
-pd
-penn
-penningm
-perf
-persc
-pl
-plm
-plv
-pnt
-pr
-praes
-pres
-prk
-proc
-prof
-prot
-prov
-ps
-pt
-r
-re
-reg
-resp
-ret
-rk
-sc
-scholengem
-schr
-scr
-sec
-sept
-seq
-ser
-sin
-sing
-soc
-spr
-sq
-sr
-st
-subs
-subst
-sup
-t
-tab
-td
-tech
-temp
-terugbez
-tg
-tgov
-theel
-tit
-tv
-tw
-v
-vac
-var
-vdt
-verb
-verg
-versch
-vert
-vgl
-vice-voorz
-vice-vrz
-vid
-vlg
-vlgg
-vlnr
-vml
-vnl
-vnlr
-vnw
-voc
-voorl
-voorm
-voorw
-voorz
-vorstverl
-vr
-vrijw
-vrijwil
-vrijwill
-vrz
-vs
-wd
-weled
-weledelgeb
-weledelgestr
-weleerw
-werkg
-wo
-wsch
-z
-za
-zelfst
-zg
-zgn
-zn
-zog
-zw
-zwemb
+%include nl_afk
 
 [FILTER]
 %include ligatures
diff --git a/configure b/configure
index 5a63758..6a6fe19 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.68 for ucto 0.5.2.
+# Generated by GNU Autoconf 2.68 for ucto 0.5.3.
 #
 # Report bugs to <timbl at uvt.nl>.
 #
@@ -569,8 +569,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ucto'
 PACKAGE_TARNAME='ucto'
-PACKAGE_VERSION='0.5.2'
-PACKAGE_STRING='ucto 0.5.2'
+PACKAGE_VERSION='0.5.3'
+PACKAGE_STRING='ucto 0.5.3'
 PACKAGE_BUGREPORT='timbl at uvt.nl'
 PACKAGE_URL=''
 
@@ -615,6 +615,8 @@ ac_subst_vars='am__EXEEXT_FALSE
 am__EXEEXT_TRUE
 LTLIBOBJS
 LIBOBJS
+ticcutils_LIBS
+ticcutils_CFLAGS
 folia_LIBS
 folia_CFLAGS
 XML2_LIBS
@@ -671,6 +673,7 @@ RANLIB
 am__fastdepCXX_FALSE
 am__fastdepCXX_TRUE
 CXXDEPMODE
+am__nodep
 AMDEPBACKSLASH
 AMDEP_FALSE
 AMDEP_TRUE
@@ -684,7 +687,7 @@ CPPFLAGS
 LDFLAGS
 CXXFLAGS
 CXX
-SYSCONF_PATH
+SYSCONFDIR
 am__untar
 am__tar
 AMTAR
@@ -759,6 +762,7 @@ with_sysroot
 enable_libtool_lock
 with_icu
 with_folia
+with_ticcutils
 '
       ac_precious_vars='build_alias
 host_alias
@@ -779,7 +783,9 @@ PKG_CONFIG_LIBDIR
 XML2_CFLAGS
 XML2_LIBS
 folia_CFLAGS
-folia_LIBS'
+folia_LIBS
+ticcutils_CFLAGS
+ticcutils_LIBS'
 
 
 # Initialize some variables set by options.
@@ -1322,7 +1328,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ucto 0.5.2 to adapt to many kinds of systems.
+\`configure' configures ucto 0.5.3 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1392,7 +1398,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ucto 0.5.2:";;
+     short | recursive ) echo "Configuration of ucto 0.5.3:";;
    esac
   cat <<\_ACEOF
 
@@ -1420,6 +1426,9 @@ Optional Packages:
   --with-folia=DIR       use libfolia installed in <DIR>;
                note that you can install folia in a non-default directory with
                ./configure --prefix=<DIR> in the folia installation directory
+  --with-ticcutils=DIR       use ticcutils installed in <DIR>;
+               note that you can install ticcutils in a non-default directory with
+               ./configure --prefix=<DIR> in the ticcutils installation directory
 
 Some influential environment variables:
   CXX         C++ compiler command
@@ -1443,6 +1452,10 @@ Some influential environment variables:
   folia_CFLAGS
               C compiler flags for folia, overriding pkg-config
   folia_LIBS  linker flags for folia, overriding pkg-config
+  ticcutils_CFLAGS
+              C compiler flags for ticcutils, overriding pkg-config
+  ticcutils_LIBS
+              linker flags for ticcutils, overriding pkg-config
 
 Use these variables to override the choices made by `configure' or to help
 it to find libraries and programs with nonstandard names/locations.
@@ -1510,7 +1523,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ucto configure 0.5.2
+ucto configure 0.5.3
 generated by GNU Autoconf 2.68
 
 Copyright (C) 2010 Free Software Foundation, Inc.
@@ -1906,97 +1919,6 @@ fi
 
 } # ac_fn_cxx_try_link
 
-# ac_fn_cxx_check_header_mongrel LINENO HEADER VAR INCLUDES
-# ---------------------------------------------------------
-# Tests whether HEADER exists, giving a warning if it cannot be compiled using
-# the include files in INCLUDES and setting the cache variable VAR
-# accordingly.
-ac_fn_cxx_check_header_mongrel ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  if eval \${$3+:} false; then :
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
-$as_echo_n "checking for $2... " >&6; }
-if eval \${$3+:} false; then :
-  $as_echo_n "(cached) " >&6
-fi
-eval ac_res=\$$3
-	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
-$as_echo "$ac_res" >&6; }
-else
-  # Is the header compilable?
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 usability" >&5
-$as_echo_n "checking $2 usability... " >&6; }
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-$4
-#include <$2>
-_ACEOF
-if ac_fn_cxx_try_compile "$LINENO"; then :
-  ac_header_compiler=yes
-else
-  ac_header_compiler=no
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_compiler" >&5
-$as_echo "$ac_header_compiler" >&6; }
-
-# Is the header present?
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 presence" >&5
-$as_echo_n "checking $2 presence... " >&6; }
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <$2>
-_ACEOF
-if ac_fn_cxx_try_cpp "$LINENO"; then :
-  ac_header_preproc=yes
-else
-  ac_header_preproc=no
-fi
-rm -f conftest.err conftest.i conftest.$ac_ext
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_preproc" >&5
-$as_echo "$ac_header_preproc" >&6; }
-
-# So?  What about this header?
-case $ac_header_compiler:$ac_header_preproc:$ac_cxx_preproc_warn_flag in #((
-  yes:no: )
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&5
-$as_echo "$as_me: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&2;}
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5
-$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;}
-    ;;
-  no:yes:* )
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: present but cannot be compiled" >&5
-$as_echo "$as_me: WARNING: $2: present but cannot be compiled" >&2;}
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2:     check for missing prerequisite headers?" >&5
-$as_echo "$as_me: WARNING: $2:     check for missing prerequisite headers?" >&2;}
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: see the Autoconf documentation" >&5
-$as_echo "$as_me: WARNING: $2: see the Autoconf documentation" >&2;}
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2:     section \"Present But Cannot Be Compiled\"" >&5
-$as_echo "$as_me: WARNING: $2:     section \"Present But Cannot Be Compiled\"" >&2;}
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5
-$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;}
-( $as_echo "## --------------------------- ##
-## Report this to timbl at uvt.nl ##
-## --------------------------- ##"
-     ) | sed "s/^/$as_me: WARNING:     /" >&2
-    ;;
-esac
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
-$as_echo_n "checking for $2... " >&6; }
-if eval \${$3+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  eval "$3=\$ac_header_compiler"
-fi
-eval ac_res=\$$3
-	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
-$as_echo "$ac_res" >&6; }
-fi
-  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
-
-} # ac_fn_cxx_check_header_mongrel
-
 # ac_fn_cxx_check_type LINENO TYPE VAR INCLUDES
 # ---------------------------------------------
 # Tests whether TYPE exists after having included INCLUDES, setting cache
@@ -2054,7 +1976,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ucto $as_me 0.5.2, which was
+It was created by ucto $as_me 0.5.3, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
 
   $ $0 $@
@@ -2869,7 +2791,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ucto'
- VERSION='0.5.2'
+ VERSION='0.5.3'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -2899,11 +2821,11 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
 
 # We need awk for the "check" target.  The system "awk" is bad on
 # some platforms.
-# Always define AMTAR for backward compatibility.
+# Always define AMTAR for backward compatibility.  Yes, it's still used
+# in the wild :-(  We should find a proper way to deprecate it ...
+AMTAR='$${TAR-tar}'
 
-AMTAR=${AMTAR-"${am_missing_run}tar"}
-
-am__tar='${AMTAR} chof - "$$tardir"'; am__untar='${AMTAR} xf -'
+am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'
 
 
 
@@ -2914,22 +2836,7 @@ am__tar='${AMTAR} chof - "$$tardir"'; am__untar='${AMTAR} xf -'
 ac_config_headers="$ac_config_headers config.h"
 
 
-
-  prefix_NONE=
-  exec_prefix_NONE=
-  test "x$prefix" = xNONE && prefix_NONE=yes && prefix=$ac_default_prefix
-  test "x$exec_prefix" = xNONE && exec_prefix_NONE=yes && exec_prefix=$prefix
-  eval ac_define_dir="\"$sysconfdir\""
-  eval ac_define_dir="\"$ac_define_dir\""
-  SYSCONF_PATH="$ac_define_dir"
-
-
-cat >>confdefs.h <<_ACEOF
-#define SYSCONF_PATH "$ac_define_dir"
-_ACEOF
-
-  test "$prefix_NONE" && prefix=NONE
-  test "$exec_prefix_NONE" && exec_prefix=NONE
+SYSCONFDIR=$sysconfdir
 
 
 if test x"${CXXFLAGS+set}" = xset; then
@@ -3498,6 +3405,7 @@ fi
 if test "x$enable_dependency_tracking" != xno; then
   am_depcomp="$ac_aux_dir/depcomp"
   AMDEPBACKSLASH='\'
+  am__nodep='_no'
 fi
  if test "x$enable_dependency_tracking" != xno; then
   AMDEP_TRUE=
@@ -3522,6 +3430,7 @@ else
   # instance it was reported that on HP-UX the gcc test will end up
   # making a dummy file named `D' -- because `-MD' means `put the output
   # in D'.
+  rm -rf conftest.dir
   mkdir conftest.dir
   # Copy depcomp to subdir because otherwise we won't find it if we're
   # using a relative directory.
@@ -3581,7 +3490,7 @@ else
 	break
       fi
       ;;
-    msvisualcpp | msvcmsys)
+    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
       # This compiler won't grok `-c -o', but also, the minuso test has
       # not run yet.  These depmodes are late enough in the game, and
       # so weak that their functioning should not be impacted.
@@ -3742,8 +3651,8 @@ esac
 
 
 
-macro_version='2.4'
-macro_revision='1.3293'
+macro_version='2.4.2'
+macro_revision='1.3337'
 
 
 
@@ -4452,6 +4361,7 @@ else
   # instance it was reported that on HP-UX the gcc test will end up
   # making a dummy file named `D' -- because `-MD' means `put the output
   # in D'.
+  rm -rf conftest.dir
   mkdir conftest.dir
   # Copy depcomp to subdir because otherwise we won't find it if we're
   # using a relative directory.
@@ -4511,7 +4421,7 @@ else
 	break
       fi
       ;;
-    msvisualcpp | msvcmsys)
+    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
       # This compiler won't grok `-c -o', but also, the minuso test has
       # not run yet.  These depmodes are late enough in the game, and
       # so weak that their functioning should not be impacted.
@@ -15181,19 +15091,6 @@ fi
 
 # Checks for header files.
 
-for ac_header in unistd.h
-do :
-  ac_fn_cxx_check_header_mongrel "$LINENO" "unistd.h" "ac_cv_header_unistd_h" "$ac_includes_default"
-if test "x$ac_cv_header_unistd_h" = xyes; then :
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_UNISTD_H 1
-_ACEOF
-
-fi
-
-done
-
-
 # Checks for typedefs, structures, and compiler characteristics.
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for stdbool.h that conforms to C99" >&5
 $as_echo_n "checking for stdbool.h that conforms to C99... " >&6; }
@@ -15731,12 +15628,12 @@ if test -n "$folia_CFLAGS"; then
     pkg_cv_folia_CFLAGS="$folia_CFLAGS"
  elif test -n "$PKG_CONFIG"; then
     if test -n "$PKG_CONFIG" && \
-    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"folia >= 0.9 \""; } >&5
-  ($PKG_CONFIG --exists --print-errors "folia >= 0.9 ") 2>&5
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"folia >= 0.10 \""; } >&5
+  ($PKG_CONFIG --exists --print-errors "folia >= 0.10 ") 2>&5
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; then
-  pkg_cv_folia_CFLAGS=`$PKG_CONFIG --cflags "folia >= 0.9 " 2>/dev/null`
+  pkg_cv_folia_CFLAGS=`$PKG_CONFIG --cflags "folia >= 0.10 " 2>/dev/null`
 		      test "x$?" != "x0" && pkg_failed=yes
 else
   pkg_failed=yes
@@ -15748,12 +15645,12 @@ if test -n "$folia_LIBS"; then
     pkg_cv_folia_LIBS="$folia_LIBS"
  elif test -n "$PKG_CONFIG"; then
     if test -n "$PKG_CONFIG" && \
-    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"folia >= 0.9 \""; } >&5
-  ($PKG_CONFIG --exists --print-errors "folia >= 0.9 ") 2>&5
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"folia >= 0.10 \""; } >&5
+  ($PKG_CONFIG --exists --print-errors "folia >= 0.10 ") 2>&5
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; then
-  pkg_cv_folia_LIBS=`$PKG_CONFIG --libs "folia >= 0.9 " 2>/dev/null`
+  pkg_cv_folia_LIBS=`$PKG_CONFIG --libs "folia >= 0.10 " 2>/dev/null`
 		      test "x$?" != "x0" && pkg_failed=yes
 else
   pkg_failed=yes
@@ -15774,14 +15671,14 @@ else
         _pkg_short_errors_supported=no
 fi
         if test $_pkg_short_errors_supported = yes; then
-	        folia_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "folia >= 0.9 " 2>&1`
+	        folia_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "folia >= 0.10 " 2>&1`
         else
-	        folia_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "folia >= 0.9 " 2>&1`
+	        folia_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "folia >= 0.10 " 2>&1`
         fi
 	# Put the nasty error message in config.log where it belongs
 	echo "$folia_PKG_ERRORS" >&5
 
-	as_fn_error $? "Package requirements (folia >= 0.9 ) were not met:
+	as_fn_error $? "Package requirements (folia >= 0.10 ) were not met:
 
 $folia_PKG_ERRORS
 
@@ -15817,6 +15714,109 @@ CXXFLAGS="$folia_CFLAGS $CXXFLAGS"
 LIBS="$folia_LIBS $LIBS"
 
 
+# Check whether --with-ticcutils was given.
+if test "${with_ticcutils+set}" = set; then :
+  withval=$with_ticcutils; PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$withval/lib/pkgconfig"
+else
+  PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$prefix/lib/pkgconfig"
+fi
+
+#  AC_MSG_NOTICE( [pkg-config search path: $PKG_CONFIG_PATH] )
+
+pkg_failed=no
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ticcutils" >&5
+$as_echo_n "checking for ticcutils... " >&6; }
+
+if test -n "$ticcutils_CFLAGS"; then
+    pkg_cv_ticcutils_CFLAGS="$ticcutils_CFLAGS"
+ elif test -n "$PKG_CONFIG"; then
+    if test -n "$PKG_CONFIG" && \
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.4 \""; } >&5
+  ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.4 ") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+  pkg_cv_ticcutils_CFLAGS=`$PKG_CONFIG --cflags "ticcutils >= 0.4 " 2>/dev/null`
+		      test "x$?" != "x0" && pkg_failed=yes
+else
+  pkg_failed=yes
+fi
+ else
+    pkg_failed=untried
+fi
+if test -n "$ticcutils_LIBS"; then
+    pkg_cv_ticcutils_LIBS="$ticcutils_LIBS"
+ elif test -n "$PKG_CONFIG"; then
+    if test -n "$PKG_CONFIG" && \
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.4 \""; } >&5
+  ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.4 ") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+  pkg_cv_ticcutils_LIBS=`$PKG_CONFIG --libs "ticcutils >= 0.4 " 2>/dev/null`
+		      test "x$?" != "x0" && pkg_failed=yes
+else
+  pkg_failed=yes
+fi
+ else
+    pkg_failed=untried
+fi
+
+
+
+if test $pkg_failed = yes; then
+   	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+        _pkg_short_errors_supported=yes
+else
+        _pkg_short_errors_supported=no
+fi
+        if test $_pkg_short_errors_supported = yes; then
+	        ticcutils_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "ticcutils >= 0.4 " 2>&1`
+        else
+	        ticcutils_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "ticcutils >= 0.4 " 2>&1`
+        fi
+	# Put the nasty error message in config.log where it belongs
+	echo "$ticcutils_PKG_ERRORS" >&5
+
+	as_fn_error $? "Package requirements (ticcutils >= 0.4 ) were not met:
+
+$ticcutils_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+Alternatively, you may set the environment variables ticcutils_CFLAGS
+and ticcutils_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details." "$LINENO" 5
+elif test $pkg_failed = untried; then
+     	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+	{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "The pkg-config script could not be found or is too old.  Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+Alternatively, you may set the environment variables ticcutils_CFLAGS
+and ticcutils_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.
+
+To get pkg-config, see <http://pkg-config.freedesktop.org/>.
+See \`config.log' for more details" "$LINENO" 5; }
+else
+	ticcutils_CFLAGS=$pkg_cv_ticcutils_CFLAGS
+	ticcutils_LIBS=$pkg_cv_ticcutils_LIBS
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+fi
+CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS"
+LIBS="$LIBS $ticcutils_LIBS"
+
+
 # Checks for library functions.
 
 ac_config_files="$ac_config_files Makefile ucto.pc ucto-icu.pc m4/Makefile config/Makefile docs/Makefile src/Makefile tests/Makefile include/Makefile include/ucto/Makefile"
@@ -16359,7 +16359,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ucto $as_me 0.5.2, which was
+This file was extended by ucto $as_me 0.5.3, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -16425,7 +16425,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ucto config.status 0.5.2
+ucto config.status 0.5.3
 configured by $0, generated by GNU Autoconf 2.68,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index 0ea060c..a302ab6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,16 +1,17 @@
 #                                               -*- Autoconf -*-
 # Process this file with autoconf to produce a configure script.
-# $Id: configure.ac 14356 2012-02-29 11:37:58Z sloot $
+# $Id: configure.ac 15905 2013-04-03 13:03:22Z sloot $
 # $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/configure.ac $
 
 AC_PREREQ(2.59)
-AC_INIT([ucto], [0.5.2], [timbl at uvt.nl])
+AC_INIT([ucto], [0.5.3], [timbl at uvt.nl])
 AM_INIT_AUTOMAKE
 AC_CONFIG_SRCDIR([configure.ac])
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_HEADER([config.h])
 
-AC_DEFINE_DIR([SYSCONF_PATH], sysconfdir, sysconfdir)
+SYSCONFDIR=$sysconfdir
+AC_SUBST([SYSCONFDIR])
 
 if test x"${CXXFLAGS+set}" = xset; then
   # the user set CXXFLAGS; don't override it.
@@ -41,7 +42,6 @@ if test $prefix = "NONE"; then
 fi
 
 # Checks for header files.
-AC_CHECK_HEADERS([unistd.h])
 
 # Checks for typedefs, structures, and compiler characteristics.
 AC_HEADER_STDBOOL
@@ -88,10 +88,21 @@ AC_ARG_WITH(folia,
        [PKG_CONFIG_PATH="$withval/lib/pkgconfig:$PKG_CONFIG_PATH"],
        [PKG_CONFIG_PATH="$prefix/lib/pkgconfig:$PKG_CONFIG_PATH"])
 AC_MSG_NOTICE( [pkg-config search path: $PKG_CONFIG_PATH] )
-PKG_CHECK_MODULES([folia], [folia >= 0.9] )
+PKG_CHECK_MODULES([folia], [folia >= 0.10] )
 CXXFLAGS="$folia_CFLAGS $CXXFLAGS"
 LIBS="$folia_LIBS $LIBS"
 
+AC_ARG_WITH(ticcutils,
+       [  --with-ticcutils=DIR       use ticcutils installed in <DIR>; 
+               note that you can install ticcutils in a non-default directory with
+               ./configure --prefix=<DIR> in the ticcutils installation directory],
+       [PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$withval/lib/pkgconfig"],
+       [PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$prefix/lib/pkgconfig"])
+#  AC_MSG_NOTICE( [pkg-config search path: $PKG_CONFIG_PATH] )
+PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.4] )
+CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS"
+LIBS="$LIBS $ticcutils_LIBS"
+
 
 # Checks for library functions.
 
diff --git a/depcomp b/depcomp
index df8eea7..bd0ac08 100755
--- a/depcomp
+++ b/depcomp
@@ -1,10 +1,10 @@
 #! /bin/sh
 # depcomp - compile a program generating dependencies as side-effects
 
-scriptversion=2009-04-28.21; # UTC
+scriptversion=2011-12-04.11; # UTC
 
-# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2006, 2007, 2009 Free
-# Software Foundation, Inc.
+# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2006, 2007, 2009, 2010,
+# 2011 Free Software Foundation, Inc.
 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -44,7 +44,7 @@ Environment variables:
   object      Object file output by `PROGRAMS ARGS'.
   DEPDIR      directory where to store dependencies.
   depfile     Dependency file to output.
-  tmpdepfile  Temporary file to use when outputing dependencies.
+  tmpdepfile  Temporary file to use when outputting dependencies.
   libtool     Whether libtool is used (yes/no).
 
 Report bugs to <bug-automake at gnu.org>.
@@ -90,10 +90,18 @@ if test "$depmode" = msvcmsys; then
    # This is just like msvisualcpp but w/o cygpath translation.
    # Just convert the backslash-escaped backslashes to single forward
    # slashes to satisfy depend.m4
-   cygpath_u="sed s,\\\\\\\\,/,g"
+   cygpath_u='sed s,\\\\,/,g'
    depmode=msvisualcpp
 fi
 
+if test "$depmode" = msvc7msys; then
+   # This is just like msvc7 but w/o cygpath translation.
+   # Just convert the backslash-escaped backslashes to single forward
+   # slashes to satisfy depend.m4
+   cygpath_u='sed s,\\\\,/,g'
+   depmode=msvc7
+fi
+
 case "$depmode" in
 gcc3)
 ## gcc 3 implements dependency tracking that does exactly what
@@ -158,10 +166,12 @@ gcc)
 ' < "$tmpdepfile" |
 ## Some versions of gcc put a space before the `:'.  On the theory
 ## that the space means something, we add a space to the output as
-## well.
+## well.  hp depmode also adds that space, but also prefixes the VPATH
+## to the object.  Take care to not repeat it in the output.
 ## Some versions of the HPUX 10.20 sed can't process this invocation
 ## correctly.  Breaking it into two sed invocations is a workaround.
-    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+    sed -e 's/^\\$//' -e '/^$/d' -e "s|.*$object$||" -e '/:$/d' \
+      | sed -e 's/$/ :/' >> "$depfile"
   rm -f "$tmpdepfile"
   ;;
 
@@ -405,6 +415,52 @@ tru64)
    rm -f "$tmpdepfile"
    ;;
 
+msvc7)
+  if test "$libtool" = yes; then
+    showIncludes=-Wc,-showIncludes
+  else
+    showIncludes=-showIncludes
+  fi
+  "$@" $showIncludes > "$tmpdepfile"
+  stat=$?
+  grep -v '^Note: including file: ' "$tmpdepfile"
+  if test "$stat" = 0; then :
+  else
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+  rm -f "$depfile"
+  echo "$object : \\" > "$depfile"
+  # The first sed program below extracts the file names and escapes
+  # backslashes for cygpath.  The second sed program outputs the file
+  # name when reading, but also accumulates all include files in the
+  # hold buffer in order to output them again at the end.  This only
+  # works with sed implementations that can handle large buffers.
+  sed < "$tmpdepfile" -n '
+/^Note: including file:  *\(.*\)/ {
+  s//\1/
+  s/\\/\\\\/g
+  p
+}' | $cygpath_u | sort -u | sed -n '
+s/ /\\ /g
+s/\(.*\)/	\1 \\/p
+s/.\(.*\) \\/\1:/
+H
+$ {
+  s/.*/	/
+  G
+  p
+}' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+msvc7msys)
+  # This case exists only to let depend.m4 do its work.  It works by
+  # looking at the text of this script.  This case will never be run,
+  # since it is checked for above.
+  exit 1
+  ;;
+
 #nosideeffect)
   # This comment above is used by automake to tell side-effect
   # dependency tracking mechanisms from slower ones.
@@ -503,7 +559,9 @@ makedepend)
   touch "$tmpdepfile"
   ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@"
   rm -f "$depfile"
-  cat < "$tmpdepfile" > "$depfile"
+  # makedepend may prepend the VPATH from the source file name to the object.
+  # No need to regex-escape $object, excess matching of '.' is harmless.
+  sed "s|^.*\($object *:\)|\1|" "$tmpdepfile" > "$depfile"
   sed '1,2d' "$tmpdepfile" | tr ' ' '
 ' | \
 ## Some versions of the HPUX 10.20 sed can't process this invocation
diff --git a/docs/Makefile.in b/docs/Makefile.in
index 2cc9f4b..5214823 100644
--- a/docs/Makefile.in
+++ b/docs/Makefile.in
@@ -1,9 +1,9 @@
-# Makefile.in generated by automake 1.11.1 from Makefile.am.
+# Makefile.in generated by automake 1.11.3 from Makefile.am.
 # @configure_input@
 
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
-# 2003, 2004, 2005, 2006, 2007, 2008, 2009  Free Software Foundation,
-# Inc.
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
@@ -39,11 +39,11 @@ host_triplet = @host@
 subdir = docs
 DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ac_define_dir.m4 \
-	$(top_srcdir)/m4/ax_icu_check.m4 $(top_srcdir)/m4/libtool.m4 \
-	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
-	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-	$(top_srcdir)/m4/pkg.m4 $(top_srcdir)/configure.ac
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \
+	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
+	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
+	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/m4/pkg.m4 \
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -73,6 +73,12 @@ am__nobase_list = $(am__nobase_strip_setup); \
 am__base_list = \
   sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
   sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
 man1dir = $(mandir)/man1
 am__installdirs = "$(DESTDIR)$(man1dir)"
 NROFF = nroff
@@ -154,7 +160,7 @@ SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
 STRIP = @STRIP@
-SYSCONF_PATH = @SYSCONF_PATH@
+SYSCONFDIR = @SYSCONFDIR@
 VERSION = @VERSION@
 XML2_CFLAGS = @XML2_CFLAGS@
 XML2_LIBS = @XML2_LIBS@
@@ -210,6 +216,8 @@ sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
 sysconfdir = @sysconfdir@
 target_alias = @target_alias@
+ticcutils_CFLAGS = @ticcutils_CFLAGS@
+ticcutils_LIBS = @ticcutils_LIBS@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
@@ -285,9 +293,7 @@ uninstall-man1:
 	files=`{ for i in $$list; do echo "$$i"; done; \
 	} | sed -e 's,.*/,,;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \
 	      -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,'`; \
-	test -z "$$files" || { \
-	  echo " ( cd '$(DESTDIR)$(man1dir)' && rm -f" $$files ")"; \
-	  cd "$(DESTDIR)$(man1dir)" && rm -f $$files; }
+	dir='$(DESTDIR)$(man1dir)'; $(am__uninstall_files_from_dir)
 tags: TAGS
 TAGS:
 
@@ -355,10 +361,15 @@ install-am: all-am
 
 installcheck: installcheck-am
 install-strip:
-	$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	  install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	  `test -z '$(STRIP)' || \
-	    echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
 mostlyclean-generic:
 
 clean-generic:
diff --git a/docs/ucto.1 b/docs/ucto.1
index b88a471..e6c506c 100644
--- a/docs/ucto.1
+++ b/docs/ucto.1
@@ -1,4 +1,4 @@
-.TH ucto 1 "2011 november 28"
+.TH ucto 1 "2013 march 6"
 
 .SH NAME
 ucto - Unicode Tokenizer
@@ -51,12 +51,12 @@ Convert to all uppercase
 
 .BR -n 
 .RS
-Assume one sentence per line on input
+Emit one sentence per line on output
 .RE
 
 .BR -m
 .RS
-Emit one sentence per line on output
+Assume one sentence per line on input
 .RE
 
 .BR --passthru    
@@ -95,16 +95,39 @@ Show version information
 set Verbose mode
 .RE
 
-.B -x
+.B -F
+.RS
+Read a FoLiA XML document, tokenize it, and output the modified doc. (this disables usage of most other options: -nulPQvsS)
+.RE
+
+.BR --textclass " cls"
+.RS
+When tokenizing a FoLiA XML document, search for text nodes of class 'cls'
+.RE
+
+.B -X
+.RS
+Output FoLiA XML. (this disables usage of most other options: -nulPQvsS)
+.RE	
+
+.B --id
 <DocId>
 .RS
-Output FoLiA XML, use the specified Document ID. (this disables usage of most other options: -nulPQvsS)
+Use the specified Document ID for the FoLiA XML
 .RE
 
-.B -F
+.B -x
+<DocId>
+.B (obsolete)
 .RS
-Read a FoLiA XML document, tokenize it, and output the modified doc. (this disables usage of most other options: -nulPQvsS)
+Output FoLiA XML, use the specified Document ID. (this disables usage of most other options: -nulPQvsS)
 
+.B obsolete
+Use 
+.B -X 
+and 
+.B --id
+instead
 .RE
 
 .SH BUGS
diff --git a/include/Makefile.in b/include/Makefile.in
index a49a3fc..556b03b 100644
--- a/include/Makefile.in
+++ b/include/Makefile.in
@@ -1,9 +1,9 @@
-# Makefile.in generated by automake 1.11.1 from Makefile.am.
+# Makefile.in generated by automake 1.11.3 from Makefile.am.
 # @configure_input@
 
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
-# 2003, 2004, 2005, 2006, 2007, 2008, 2009  Free Software Foundation,
-# Inc.
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
@@ -38,11 +38,11 @@ host_triplet = @host@
 subdir = include
 DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ac_define_dir.m4 \
-	$(top_srcdir)/m4/ax_icu_check.m4 $(top_srcdir)/m4/libtool.m4 \
-	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
-	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-	$(top_srcdir)/m4/pkg.m4 $(top_srcdir)/configure.ac
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \
+	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
+	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
+	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/m4/pkg.m4 \
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -168,7 +168,7 @@ SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
 STRIP = @STRIP@
-SYSCONF_PATH = @SYSCONF_PATH@
+SYSCONFDIR = @SYSCONFDIR@
 VERSION = @VERSION@
 XML2_CFLAGS = @XML2_CFLAGS@
 XML2_LIBS = @XML2_LIBS@
@@ -224,6 +224,8 @@ sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
 sysconfdir = @sysconfdir@
 target_alias = @target_alias@
+ticcutils_CFLAGS = @ticcutils_CFLAGS@
+ticcutils_LIBS = @ticcutils_LIBS@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
@@ -476,10 +478,15 @@ install-am: all-am
 
 installcheck: installcheck-recursive
 install-strip:
-	$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	  install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	  `test -z '$(STRIP)' || \
-	    echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
 mostlyclean-generic:
 
 clean-generic:
diff --git a/include/ucto/Makefile.in b/include/ucto/Makefile.in
index 90f77f6..8e03a56 100644
--- a/include/ucto/Makefile.in
+++ b/include/ucto/Makefile.in
@@ -1,9 +1,9 @@
-# Makefile.in generated by automake 1.11.1 from Makefile.am.
+# Makefile.in generated by automake 1.11.3 from Makefile.am.
 # @configure_input@
 
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
-# 2003, 2004, 2005, 2006, 2007, 2008, 2009  Free Software Foundation,
-# Inc.
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
@@ -38,11 +38,11 @@ subdir = include/ucto
 DIST_COMMON = $(pkginclude_HEADERS) $(srcdir)/Makefile.am \
 	$(srcdir)/Makefile.in
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ac_define_dir.m4 \
-	$(top_srcdir)/m4/ax_icu_check.m4 $(top_srcdir)/m4/libtool.m4 \
-	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
-	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-	$(top_srcdir)/m4/pkg.m4 $(top_srcdir)/configure.ac
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \
+	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
+	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
+	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/m4/pkg.m4 \
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -72,6 +72,12 @@ am__nobase_list = $(am__nobase_strip_setup); \
 am__base_list = \
   sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
   sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
 am__installdirs = "$(DESTDIR)$(pkgincludedir)"
 HEADERS = $(pkginclude_HEADERS)
 ETAGS = etags
@@ -153,7 +159,7 @@ SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
 STRIP = @STRIP@
-SYSCONF_PATH = @SYSCONF_PATH@
+SYSCONFDIR = @SYSCONFDIR@
 VERSION = @VERSION@
 XML2_CFLAGS = @XML2_CFLAGS@
 XML2_LIBS = @XML2_LIBS@
@@ -209,6 +215,8 @@ sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
 sysconfdir = @sysconfdir@
 target_alias = @target_alias@
+ticcutils_CFLAGS = @ticcutils_CFLAGS@
+ticcutils_LIBS = @ticcutils_LIBS@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
@@ -269,9 +277,7 @@ uninstall-pkgincludeHEADERS:
 	@$(NORMAL_UNINSTALL)
 	@list='$(pkginclude_HEADERS)'; test -n "$(pkgincludedir)" || list=; \
 	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
-	test -n "$$files" || exit 0; \
-	echo " ( cd '$(DESTDIR)$(pkgincludedir)' && rm -f" $$files ")"; \
-	cd "$(DESTDIR)$(pkgincludedir)" && rm -f $$files
+	dir='$(DESTDIR)$(pkgincludedir)'; $(am__uninstall_files_from_dir)
 
 ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
 	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
@@ -372,10 +378,15 @@ install-am: all-am
 
 installcheck: installcheck-am
 install-strip:
-	$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	  install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	  `test -z '$(STRIP)' || \
-	    echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
 mostlyclean-generic:
 
 clean-generic:
diff --git a/include/ucto/tokenize.h b/include/ucto/tokenize.h
index 5677121..1da44fc 100644
--- a/include/ucto/tokenize.h
+++ b/include/ucto/tokenize.h
@@ -1,7 +1,7 @@
 /*
-  $Id: tokenize.h 14285 2012-02-21 10:18:28Z mvgompel $
+  $Id: tokenize.h 15910 2013-04-03 13:57:51Z sloot $
   $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/include/ucto/tokenize.h $
-  Copyright (c) 2006 - 2012
+  Copyright (c) 2006 - 2013
   Tilburg University
 
   This file is part of Ucto.
@@ -23,8 +23,8 @@
       http://ilk.uvt.nl/frog
 */
 
-#ifndef TOKENIZE_H
-#define TOKENIZE_H
+#ifndef UCTO_TOKENIZE_H
+#define UCTO_TOKENIZE_H
 
 #include <vector>
 #include <map>
@@ -32,7 +32,7 @@
 #include <stdexcept>
 #include "config.h"
 #include "ucto/unicode.h"
-#include "libfolia/document.h"
+#include "ticcutils/LogStream.h"
 
 namespace Tokenizer {
 
@@ -138,82 +138,65 @@ namespace Tokenizer {
     TokenizerClass();
     ~TokenizerClass();
     bool init( const std::string& );
-    void setErrorLog( std::ostream *os ) { theErrLog = os; };
+    void setErrorLog( TiCC::LogStream *os );
 
-    //Tokenize from input stream to FoLiA document
+    // Tokenize from input stream to FoLiA document
     folia::Document tokenize( std::istream& );
     
-    //tokenize folia document
+    // Tokenize a folia document
     bool tokenize(folia::Document& );
-    //..or one element thereof:
-    bool tokenize(folia::FoliaElement *);
-    bool tokenize(folia::FoliaElement *, bool, bool); //more specific variant, will be called by the one above
     
     //Tokenize from input stream to output stream
+    std::vector<Token> tokenizeStream( std::istream& );
     void tokenize( std::istream&, std::ostream& );
     void tokenize( std::istream* in, std::ostream* out){
       // for backward compatability
       return tokenize( *in, *out );};
     
-    //Tokenize a line (a line is NOT a sentence, but an arbitrary line of input)
-    int tokenizeLine( const UnicodeString& );
-    int tokenizeLine( const std::string& );
+    // Tokenize a line (a line is NOT a sentence, but an arbitrary string 
+    //                  of characters, inclusive EOS markers, Newlines etc.)
+    int tokenizeLine( const UnicodeString& ); // Unicode chars
+    int tokenizeLine( const std::string& );   // UTF8 chars
     
     void passthruLine( const std::string&, bool& );    
     
-    bool empty() const { return tokens.empty(); };
-        
     //Processes tokens and initialises the sentence buffer. Returns the amount of sentences found
     int countSentences(bool forceentirebuffer = false); //count the number of sentences (only after detectSentenceBounds) (does some extra validation as well)
     int flushSentences(const int); //Flush n sentences from buffer (does some extra validation as well)
     
-    
-    
-    //Get the begin and end index of the sentence with the specified index
-    bool getSentence( int, int& begin, int& end );
-    
-    //Get the sentence with the specified index as a vector of tokens
-    std::vector<Token*> getSentence( int );    
-    
     //Get the sentence with the specified index as a string (UTF-8 encoded)
-    std::string getSentenceString( unsigned int, const bool = false);
+    std::string getSentenceString( unsigned int );
     
     //Get all sentences as a vector of strings (UTF-8 encoded)
-    std::vector<std::string> getSentences() const;
-            
-    void detectSentenceBounds( const int offset = 0 );
-    void detectQuoteBounds( const int, const UChar);
-    
-    //Does the token buffer terminate with a proper EOS marker?
-    bool terminatesWithEOS( ) const;
-    
+    std::vector<std::string> getSentences();
+
     //Enable verbose mode
     bool setVerbose( bool b=true ) { bool t = verbose; verbose = b; return t; };
-    bool getVerbose() { return verbose; }
+    bool getVerbose() const { return verbose; }
     
     //set debug value
     int setDebug( int d ) { bool dd = tokDebug; tokDebug = d; return dd; };
-    int getDebug() { return tokDebug; }
+    int getDebug() const { return tokDebug; }
     
     //Enable conversion of all output to lowercase
     bool setLowercase( bool b=true ) { bool t = lowercase; lowercase = b; if (b) uppercase = false; return t; };
-    bool getLowercase() { return lowercase; }
+    bool getLowercase() const { return lowercase; }
 
     //Enable passtru mode
     bool setPassThru( bool b=true ) { bool t = passthru; passthru = b; return t; };
-    bool getPassThru() { return passthru; }
+    bool getPassThru() const { return passthru; }
     
     //Enable conversion of all output to uppercase
     bool setUppercase( bool b=true ) { bool t = uppercase; uppercase = b; if (b) lowercase = false; return t; };
-    bool getUppercase() { return uppercase; }
+    bool getUppercase() const { return uppercase; }
 
     //Enable sentence-bound detection
     bool setSentenceDetection( bool b=true ) { bool t = detectBounds; detectBounds = b; return t; }
-    bool getSentenceDetection() { return detectBounds; }
+    bool getSentenceDetection() const { return detectBounds; }
     
     //Enable paragraph detection
     bool setParagraphDetection( bool b=true ) { bool t = detectPar; detectPar = b; return t; }
-    bool getParagraphDetection() { return detectPar; }
+    bool getParagraphDetection() const { return detectPar; }
     
     //Enable quote detection
     bool setQuoteDetection( bool b=true ) { bool t = detectQuotes; detectQuotes = b; return t; }
@@ -234,46 +217,57 @@ namespace Tokenizer {
     std::string getInputEncoding() const { return inputEncoding; };
     
     // set eos marker
-    std::string setEosMarker( const std::string& s = "<utt>") { std::string t = eosmark; eosmark = s; return t; };
-    std::string getEosMarker( ) { return eosmark; }
+    UnicodeString setEosMarker( const std::string& s = "<utt>") { UnicodeString t = eosmark; eosmark =  folia::UTF8ToUnicode(s); return t; };
+    UnicodeString getEosMarker( ) const { return eosmark; }
 
     bool setSentencePerLineOutput( bool b=true ) { bool t = sentenceperlineoutput; sentenceperlineoutput = b; return t; };
-    bool getSentencePerLineOutput() { return sentenceperlineoutput; }
+    bool getSentencePerLineOutput() const { return sentenceperlineoutput; }
     
     bool setSentencePerLineInput( bool b=true ) { bool t = sentenceperlineinput; sentenceperlineinput = b; return t; };
-    bool getSentencePerLineInput() { return sentenceperlineinput; }    
-    
-    std::string getDocID() { return docid; }
-    bool getXMLOutput() { return xmlout; }
-    bool getXMLInput() { return xmlin; }
+    bool getSentencePerLineInput() const { return sentenceperlineinput; }    
     
+    std::string getDocID() const { return docid; }
+    bool getXMLOutput() const { return xmlout; }
+    bool getXMLInput() const { return xmlin; }
+
+    const std::string getTextClass( ) const { return textclass; }
+    const std::string setTextClass( const std::string& cls) {  
+      std::string res = textclass;
+      textclass = cls;
+      return res;
+    }
     
-    bool setXMLOutput(bool b, std::string id) { bool t = xmlout; docid = id; xmlout = b; return t; }
-    bool setXMLInput(bool b) { bool t = xmlin; xmlin = b; return t; }
+    bool setXMLOutput( bool b, const std::string& id) { bool t = xmlout; docid = id; xmlout = b; return t; }
+    bool setXMLInput( bool b ) { bool t = xmlin; xmlin = b; return t; }
     
+    void outputTokens( std::ostream&, const std::vector<Token>& ) const;
+  private:
+    void tokenizeWord( const UnicodeString&, bool);    
+
+    bool detectEos( size_t ) const;
+    void detectSentenceBounds( const int offset = 0 );
+    void detectQuotedSentenceBounds( const int offset = 0 );
+    void detectQuoteBounds( const int );
     //Signal the tokeniser that a paragraph is detected
     void signalParagraph( bool b=true ) { paragraphsignal = b; };
-    
-    
-    void outputTokens( std::ostream&, const size_t, const size_t, const bool = false) const;
-    void outputTokensXML( folia::Document& , const size_t, const size_t, bool&);
-    void outputTokensXML( folia::FoliaElement * root, const size_t begin, const size_t end,  bool& in_paragraph, bool root_is_paragraph=false, bool root_is_sentence=false);
-  private:
-    void tokenizeWord( const UnicodeString&, bool);
-    
-    //Turn buffered tokens into a UnicodeString contai, also outputs types and roles in verbose mode
-    
+        
     bool resolveQuote( int, const UnicodeString& );
-    bool detectEos( UChar );
-
+    bool u_isquote( UChar );
+    std::string checkBOM( const std::string&, std::string& );
     bool readsettings( const std::string& );
     bool readrules( const std::string& );
     bool readfilters( const std::string& );
     bool readquotes( const std::string& );
     bool readeosmarkers( const std::string& );
+    bool readabbreviations( const std::string&, UnicodeString& );
     
-    //Don't use this in normal processing, use flushSentences instead
-    void clear() { tokens.clear(); quotes.clearStack(); };
+    void sortRules( std::vector<Rule *>&, std::vector<UnicodeString>& );
+    void outputTokensDoc( folia::Document&, const std::vector<Token>& ) const;
+    void outputTokensXML( folia::FoliaElement *, const std::vector<Token>& ) const;
+    void tokenizeElement( folia::FoliaElement * );
+    void tokenizeSentenceElement( folia::FoliaElement * );         
+    //return the sentence with the specified index in a Token vector;
+    std::vector<Token> getSentence( int );
 
     Quoting quotes;
     UnicodeFilter filter;
@@ -281,12 +275,10 @@ namespace Tokenizer {
     UnicodeString eosmarkers;
     std::string inputEncoding;
 
-    std::string eosmark;
+    UnicodeString eosmark;
     std::vector<Token> tokens;
-        
     std::vector<Rule *> rules;
-    std::ostream *theErrLog;
-
+    TiCC::LogStream *theErrLog;
     
     //debug flag
     int tokDebug;
@@ -308,28 +300,21 @@ namespace Tokenizer {
     
     //has a paragraph been signaled?
     bool paragraphsignal;
-    //has a sentence been signaled?
-    bool sentencesignal;
     
     //one sentence per line output
     bool sentenceperlineoutput;
     bool sentenceperlineinput;
     
 
-    bool firstoutput;
     bool lowercase;
     bool uppercase;
     bool xmlout;  
     bool xmlin;  
     bool passthru;
-    
-    int parCount;
 
     std::string settingsfilename;
-    
     std::string docid; //document ID (UTF-8), necessary for XML output 
-  private:
-    void sortRules( std::vector<Rule *>&, std::vector<UnicodeString>& );
+    std::string textclass; // class for folia text
   };
 
   template< typename T >
diff --git a/include/ucto/unicode.h b/include/ucto/unicode.h
index 303db90..bdcc52c 100644
--- a/include/ucto/unicode.h
+++ b/include/ucto/unicode.h
@@ -1,7 +1,7 @@
 /*
- $Id: unicode.h 13842 2012-01-02 16:32:58Z sloot $
+ $Id: unicode.h 15571 2013-01-07 14:54:28Z sloot $
  $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/include/ucto/unicode.h $
-  Copyright (c) 1998 - 2012
+  Copyright (c) 1998 - 2013
   ILK  -  Tilburg University
   CNTS -  University of Antwerp
  
@@ -26,8 +26,8 @@
       Timbl at uvt.nl
 */
 
-#ifndef UNICODE_H
-#define UNICODE_H
+#ifndef UCTO_UNICODE_H
+#define UCTO_UNICODE_H
 
 #include <string>
 #include <map>
@@ -61,4 +61,4 @@ namespace Tokenizer {
   
 } // namespace
 
-#endif // UNICODE_H
+#endif // UCTO_UNICODE_H
diff --git a/ltmain.sh b/ltmain.sh
old mode 100755
new mode 100644
index b4a3231..c2852d8
--- a/ltmain.sh
+++ b/ltmain.sh
@@ -1,9 +1,9 @@
 
-# libtool (GNU libtool) 2.4
+# libtool (GNU libtool) 2.4.2
 # Written by Gordon Matzigkeit <gord at gnu.ai.mit.edu>, 1996
 
 # Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005, 2006,
-# 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
+# 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
 # This is free software; see the source for copying conditions.  There is NO
 # warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 
@@ -41,6 +41,7 @@
 #       --quiet, --silent    don't print informational messages
 #       --no-quiet, --no-silent
 #                            print informational messages (default)
+#       --no-warn            don't display warning messages
 #       --tag=TAG            use configuration variables from tag TAG
 #   -v, --verbose            print more informational messages than default
 #       --no-verbose         don't print the extra informational messages
@@ -69,7 +70,7 @@
 #         compiler:		$LTCC
 #         compiler flags:		$LTCFLAGS
 #         linker:		$LD (gnu? $with_gnu_ld)
-#         $progname:	(GNU libtool) 2.4 Debian-2.4-2ubuntu1
+#         $progname:	(GNU libtool) 2.4.2 Debian-2.4.2-1ubuntu1
 #         automake:	$automake_version
 #         autoconf:	$autoconf_version
 #
@@ -79,9 +80,9 @@
 
 PROGRAM=libtool
 PACKAGE=libtool
-VERSION="2.4 Debian-2.4-2ubuntu1"
+VERSION="2.4.2 Debian-2.4.2-1ubuntu1"
 TIMESTAMP=""
-package_revision=1.3293
+package_revision=1.3337
 
 # Be Bourne compatible
 if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
@@ -136,15 +137,10 @@ progpath="$0"
 
 : ${CP="cp -f"}
 test "${ECHO+set}" = set || ECHO=${as_echo-'printf %s\n'}
-: ${EGREP="/bin/grep -E"}
-: ${FGREP="/bin/grep -F"}
-: ${GREP="/bin/grep"}
-: ${LN_S="ln -s"}
 : ${MAKE="make"}
 : ${MKDIR="mkdir"}
 : ${MV="mv -f"}
 : ${RM="rm -f"}
-: ${SED="/bin/sed"}
 : ${SHELL="${CONFIG_SHELL-/bin/sh}"}
 : ${Xsed="$SED -e 1s/^X//"}
 
@@ -387,7 +383,7 @@ case $progpath in
      ;;
   *)
      save_IFS="$IFS"
-     IFS=:
+     IFS=${PATH_SEPARATOR-:}
      for progdir in $PATH; do
        IFS="$save_IFS"
        test -x "$progdir/$progname" && break
@@ -771,8 +767,8 @@ func_help ()
 	s*\$LTCFLAGS*'"$LTCFLAGS"'*
 	s*\$LD*'"$LD"'*
 	s/\$with_gnu_ld/'"$with_gnu_ld"'/
-	s/\$automake_version/'"`(automake --version) 2>/dev/null |$SED 1q`"'/
-	s/\$autoconf_version/'"`(autoconf --version) 2>/dev/null |$SED 1q`"'/
+	s/\$automake_version/'"`(${AUTOMAKE-automake} --version) 2>/dev/null |$SED 1q`"'/
+	s/\$autoconf_version/'"`(${AUTOCONF-autoconf} --version) 2>/dev/null |$SED 1q`"'/
 	p
 	d
      }
@@ -1052,6 +1048,7 @@ opt_finish=false
 opt_help=false
 opt_help_all=false
 opt_silent=:
+opt_warning=:
 opt_verbose=:
 opt_silent=false
 opt_verbose=false
@@ -1120,6 +1117,10 @@ esac
 			opt_silent=false
 func_append preserve_args " $opt"
 			;;
+      --no-warning|--no-warn)
+			opt_warning=false
+func_append preserve_args " $opt"
+			;;
       --no-verbose)
 			opt_verbose=false
 func_append preserve_args " $opt"
@@ -2059,7 +2060,7 @@ func_mode_compile ()
     *.[cCFSifmso] | \
     *.ada | *.adb | *.ads | *.asm | \
     *.c++ | *.cc | *.ii | *.class | *.cpp | *.cxx | \
-    *.[fF][09]? | *.for | *.java | *.obj | *.sx | *.cu | *.cup)
+    *.[fF][09]? | *.for | *.java | *.go | *.obj | *.sx | *.cu | *.cup)
       func_xform "$libobj"
       libobj=$func_xform_result
       ;;
@@ -3201,11 +3202,13 @@ func_mode_install ()
 
       # Set up the ranlib parameters.
       oldlib="$destdir/$name"
+      func_to_tool_file "$oldlib" func_convert_file_msys_to_w32
+      tool_oldlib=$func_to_tool_file_result
 
       func_show_eval "$install_prog \$file \$oldlib" 'exit $?'
 
       if test -n "$stripme" && test -n "$old_striplib"; then
-	func_show_eval "$old_striplib $oldlib" 'exit $?'
+	func_show_eval "$old_striplib $tool_oldlib" 'exit $?'
       fi
 
       # Do each command in the postinstall commands.
@@ -3470,7 +3473,7 @@ static const void *lt_preloaded_setup() {
 	  # linked before any other PIC object.  But we must not use
 	  # pic_flag when linking with -static.  The problem exists in
 	  # FreeBSD 2.2.6 and is fixed in FreeBSD 3.1.
-	  *-*-freebsd2*|*-*-freebsd3.0*|*-*-freebsdelf3.0*)
+	  *-*-freebsd2.*|*-*-freebsd3.0*|*-*-freebsdelf3.0*)
 	    pic_flag_for_symtable=" $pic_flag -DFREEBSD_WORKAROUND" ;;
 	  *-*-hpux*)
 	    pic_flag_for_symtable=" $pic_flag"  ;;
@@ -3982,14 +3985,17 @@ func_exec_program_core ()
 # launches target application with the remaining arguments.
 func_exec_program ()
 {
-  for lt_wr_arg
-  do
-    case \$lt_wr_arg in
-    --lt-*) ;;
-    *) set x \"\$@\" \"\$lt_wr_arg\"; shift;;
-    esac
-    shift
-  done
+  case \" \$* \" in
+  *\\ --lt-*)
+    for lt_wr_arg
+    do
+      case \$lt_wr_arg in
+      --lt-*) ;;
+      *) set x \"\$@\" \"\$lt_wr_arg\"; shift;;
+      esac
+      shift
+    done ;;
+  esac
   func_exec_program_core \${1+\"\$@\"}
 }
 
@@ -5057,9 +5063,15 @@ void lt_dump_script (FILE* f)
 {
 EOF
 	    func_emit_wrapper yes |
-              $SED -e 's/\([\\"]\)/\\\1/g' \
-	           -e 's/^/  fputs ("/' -e 's/$/\\n", f);/'
-
+	      $SED -n -e '
+s/^\(.\{79\}\)\(..*\)/\1\
+\2/
+h
+s/\([\\"]\)/\\\1/g
+s/$/\\n/
+s/\([^\n]*\).*/  fputs ("\1", f);/p
+g
+D'
             cat <<"EOF"
 }
 EOF
@@ -5643,7 +5655,8 @@ func_mode_link ()
 	continue
 	;;
 
-      -mt|-mthreads|-kthread|-Kthread|-pthread|-pthreads|--thread-safe|-threads)
+      -mt|-mthreads|-kthread|-Kthread|-pthread|-pthreads|--thread-safe \
+      |-threads|-fopenmp|-openmp|-mp|-xopenmp|-omp|-qsmp=*)
 	func_append compiler_flags " $arg"
 	func_append compile_command " $arg"
 	func_append finalize_command " $arg"
@@ -6150,7 +6163,8 @@ func_mode_link ()
 	lib=
 	found=no
 	case $deplib in
-	-mt|-mthreads|-kthread|-Kthread|-pthread|-pthreads|--thread-safe|-threads)
+	-mt|-mthreads|-kthread|-Kthread|-pthread|-pthreads|--thread-safe \
+        |-threads|-fopenmp|-openmp|-mp|-xopenmp|-omp|-qsmp=*)
 	  if test "$linkmode,$pass" = "prog,link"; then
 	    compile_deplibs="$deplib $compile_deplibs"
 	    finalize_deplibs="$deplib $finalize_deplibs"
@@ -6834,7 +6848,7 @@ func_mode_link ()
 	         test "$hardcode_direct_absolute" = no; then
 		add="$dir/$linklib"
 	      elif test "$hardcode_minus_L" = yes; then
-		add_dir="-L$dir"
+		add_dir="-L$absdir"
 		# Try looking first in the location we're being installed to.
 		if test -n "$inst_prefix_dir"; then
 		  case $libdir in
@@ -7319,6 +7333,7 @@ func_mode_link ()
 	  # which has an extra 1 added just for fun
 	  #
 	  case $version_type in
+	  # correct linux to gnu/linux during the next big refactor
 	  darwin|linux|osf|windows|none)
 	    func_arith $number_major + $number_minor
 	    current=$func_arith_result
@@ -7438,7 +7453,7 @@ func_mode_link ()
 	  versuffix="$major.$revision"
 	  ;;
 
-	linux)
+	linux) # correct to gnu/linux during the next big refactor
 	  func_arith $current - $age
 	  major=.$func_arith_result
 	  versuffix="$major.$age.$revision"
@@ -8026,6 +8041,11 @@ EOF
 
       # Test again, we may have decided not to build it any more
       if test "$build_libtool_libs" = yes; then
+	# Remove ${wl} instances when linking with ld.
+	# FIXME: should test the right _cmds variable.
+	case $archive_cmds in
+	  *\$LD\ *) wl= ;;
+        esac
 	if test "$hardcode_into_libs" = yes; then
 	  # Hardcode the library paths
 	  hardcode_libdirs=
@@ -8056,7 +8076,7 @@ EOF
 	    elif test -n "$runpath_var"; then
 	      case "$perm_rpath " in
 	      *" $libdir "*) ;;
-	      *) func_apped perm_rpath " $libdir" ;;
+	      *) func_append perm_rpath " $libdir" ;;
 	      esac
 	    fi
 	  done
@@ -8064,11 +8084,7 @@ EOF
 	  if test -n "$hardcode_libdir_separator" &&
 	     test -n "$hardcode_libdirs"; then
 	    libdir="$hardcode_libdirs"
-	    if test -n "$hardcode_libdir_flag_spec_ld"; then
-	      eval dep_rpath=\"$hardcode_libdir_flag_spec_ld\"
-	    else
-	      eval dep_rpath=\"$hardcode_libdir_flag_spec\"
-	    fi
+	    eval "dep_rpath=\"$hardcode_libdir_flag_spec\""
 	  fi
 	  if test -n "$runpath_var" && test -n "$perm_rpath"; then
 	    # We should set the runpath_var.
@@ -9158,6 +9174,8 @@ EOF
 	    esac
 	  done
 	fi
+	func_to_tool_file "$oldlib" func_convert_file_msys_to_w32
+	tool_oldlib=$func_to_tool_file_result
 	eval cmds=\"$old_archive_cmds\"
 
 	func_len " $cmds"
@@ -9267,7 +9285,8 @@ EOF
 	      *.la)
 		func_basename "$deplib"
 		name="$func_basename_result"
-		eval libdir=`${SED} -n -e 's/^libdir=\(.*\)$/\1/p' $deplib`
+		func_resolve_sysroot "$deplib"
+		eval libdir=`${SED} -n -e 's/^libdir=\(.*\)$/\1/p' $func_resolve_sysroot_result`
 		test -z "$libdir" && \
 		  func_fatal_error "\`$deplib' is not a valid libtool archive"
 		func_append newdependency_libs " ${lt_sysroot:+=}$libdir/$name"
diff --git a/m4/Makefile.in b/m4/Makefile.in
index 67e25bd..58e23c8 100644
--- a/m4/Makefile.in
+++ b/m4/Makefile.in
@@ -1,9 +1,9 @@
-# Makefile.in generated by automake 1.11.1 from Makefile.am.
+# Makefile.in generated by automake 1.11.3 from Makefile.am.
 # @configure_input@
 
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
-# 2003, 2004, 2005, 2006, 2007, 2008, 2009  Free Software Foundation,
-# Inc.
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
@@ -39,11 +39,11 @@ host_triplet = @host@
 subdir = m4
 DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ac_define_dir.m4 \
-	$(top_srcdir)/m4/ax_icu_check.m4 $(top_srcdir)/m4/libtool.m4 \
-	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
-	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-	$(top_srcdir)/m4/pkg.m4 $(top_srcdir)/configure.ac
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \
+	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
+	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
+	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/m4/pkg.m4 \
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -129,7 +129,7 @@ SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
 STRIP = @STRIP@
-SYSCONF_PATH = @SYSCONF_PATH@
+SYSCONFDIR = @SYSCONFDIR@
 VERSION = @VERSION@
 XML2_CFLAGS = @XML2_CFLAGS@
 XML2_LIBS = @XML2_LIBS@
@@ -185,6 +185,8 @@ sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
 sysconfdir = @sysconfdir@
 target_alias = @target_alias@
+ticcutils_CFLAGS = @ticcutils_CFLAGS@
+ticcutils_LIBS = @ticcutils_LIBS@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
@@ -279,10 +281,15 @@ install-am: all-am
 
 installcheck: installcheck-am
 install-strip:
-	$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	  install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	  `test -z '$(STRIP)' || \
-	    echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
 mostlyclean-generic:
 
 clean-generic:
diff --git a/m4/ac_define_dir.m4 b/m4/ac_define_dir.m4
deleted file mode 100644
index 8594947..0000000
--- a/m4/ac_define_dir.m4
+++ /dev/null
@@ -1,45 +0,0 @@
-# ===========================================================================
-#             http://autoconf-archive.cryp.to/ac_define_dir.html
-# ===========================================================================
-#
-# SYNOPSIS
-#
-#   AC_DEFINE_DIR(VARNAME, DIR [, DESCRIPTION])
-#
-# DESCRIPTION
-#
-#   This macro sets VARNAME to the expansion of the DIR variable, taking
-#   care of fixing up ${prefix} and such.
-#
-#   VARNAME is then offered as both an output variable and a C preprocessor
-#   symbol.
-#
-#   Example:
-#
-#      AC_DEFINE_DIR([DATADIR], [datadir], [Where data are placed to.])
-#
-# LICENSE
-#
-#   Copyright (c) 2008 Stepan Kasal <kasal at ucw.cz>
-#   Copyright (c) 2008 Andreas Schwab <schwab at suse.de>
-#   Copyright (c) 2008 Guido U. Draheim <guidod at gmx.de>
-#   Copyright (c) 2008 Alexandre Oliva
-#
-#   Copying and distribution of this file, with or without modification, are
-#   permitted in any medium without royalty provided the copyright notice
-#   and this notice are preserved.
-
-AC_DEFUN([AC_DEFINE_DIR], [
-  prefix_NONE=
-  exec_prefix_NONE=
-  test "x$prefix" = xNONE && prefix_NONE=yes && prefix=$ac_default_prefix
-  test "x$exec_prefix" = xNONE && exec_prefix_NONE=yes && exec_prefix=$prefix
-dnl In Autoconf 2.60, ${datadir} refers to ${datarootdir}, which in turn
-dnl refers to ${prefix}.  Thus we have to use `eval' twice.
-  eval ac_define_dir="\"[$]$2\""
-  eval ac_define_dir="\"$ac_define_dir\""
-  AC_SUBST($1, "$ac_define_dir")
-  AC_DEFINE_UNQUOTED($1, "$ac_define_dir", [$3])
-  test "$prefix_NONE" && prefix=NONE
-  test "$exec_prefix_NONE" && exec_prefix=NONE
-])
diff --git a/m4/ltversion.m4 b/m4/ltversion.m4
index 9c7b5d4..07a8602 100644
--- a/m4/ltversion.m4
+++ b/m4/ltversion.m4
@@ -9,15 +9,15 @@
 
 # @configure_input@
 
-# serial 3293 ltversion.m4
+# serial 3337 ltversion.m4
 # This file is part of GNU Libtool
 
-m4_define([LT_PACKAGE_VERSION], [2.4])
-m4_define([LT_PACKAGE_REVISION], [1.3293])
+m4_define([LT_PACKAGE_VERSION], [2.4.2])
+m4_define([LT_PACKAGE_REVISION], [1.3337])
 
 AC_DEFUN([LTVERSION_VERSION],
-[macro_version='2.4'
-macro_revision='1.3293'
+[macro_version='2.4.2'
+macro_revision='1.3337'
 _LT_DECL(, macro_version, 0, [Which release of libtool.m4 was used?])
 _LT_DECL(, macro_revision, 0)
 ])
diff --git a/missing b/missing
index 28055d2..86a8fc3 100755
--- a/missing
+++ b/missing
@@ -1,10 +1,10 @@
 #! /bin/sh
 # Common stub for a few missing GNU programs while installing.
 
-scriptversion=2009-04-28.21; # UTC
+scriptversion=2012-01-06.13; # UTC
 
 # Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006,
-# 2008, 2009 Free Software Foundation, Inc.
+# 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
 # Originally by Fran,cois Pinard <pinard at iro.umontreal.ca>, 1996.
 
 # This program is free software; you can redistribute it and/or modify
@@ -84,7 +84,6 @@ Supported PROGRAM values:
   help2man     touch the output file
   lex          create \`lex.yy.c', if possible, from existing .c
   makeinfo     touch the output file
-  tar          try tar, gnutar, gtar, then tar without non-portable flags
   yacc         create \`y.tab.[ch]', if possible, from existing .[ch]
 
 Version suffixes to PROGRAM as well as the prefixes \`gnu-', \`gnu', and
@@ -122,15 +121,6 @@ case $1 in
     # Not GNU programs, they don't have --version.
     ;;
 
-  tar*)
-    if test -n "$run"; then
-       echo 1>&2 "ERROR: \`tar' requires --run"
-       exit 1
-    elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
-       exit 1
-    fi
-    ;;
-
   *)
     if test -z "$run" && ($1 --version) > /dev/null 2>&1; then
        # We have it, but it failed.
@@ -226,7 +216,7 @@ WARNING: \`$1' $msg.  You should only need it if
          \`Bison' from any GNU archive site."
     rm -f y.tab.c y.tab.h
     if test $# -ne 1; then
-        eval LASTARG="\${$#}"
+        eval LASTARG=\${$#}
 	case $LASTARG in
 	*.y)
 	    SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'`
@@ -256,7 +246,7 @@ WARNING: \`$1' is $msg.  You should only need it if
          \`Flex' from any GNU archive site."
     rm -f lex.yy.c
     if test $# -ne 1; then
-        eval LASTARG="\${$#}"
+        eval LASTARG=\${$#}
 	case $LASTARG in
 	*.l)
 	    SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'`
@@ -318,41 +308,6 @@ WARNING: \`$1' is $msg.  You should only need it if
     touch $file
     ;;
 
-  tar*)
-    shift
-
-    # We have already tried tar in the generic part.
-    # Look for gnutar/gtar before invocation to avoid ugly error
-    # messages.
-    if (gnutar --version > /dev/null 2>&1); then
-       gnutar "$@" && exit 0
-    fi
-    if (gtar --version > /dev/null 2>&1); then
-       gtar "$@" && exit 0
-    fi
-    firstarg="$1"
-    if shift; then
-	case $firstarg in
-	*o*)
-	    firstarg=`echo "$firstarg" | sed s/o//`
-	    tar "$firstarg" "$@" && exit 0
-	    ;;
-	esac
-	case $firstarg in
-	*h*)
-	    firstarg=`echo "$firstarg" | sed s/h//`
-	    tar "$firstarg" "$@" && exit 0
-	    ;;
-	esac
-    fi
-
-    echo 1>&2 "\
-WARNING: I can't seem to be able to run \`tar' with the given arguments.
-         You may want to install GNU tar or Free paxutils, or check the
-         command line arguments."
-    exit 1
-    ;;
-
   *)
     echo 1>&2 "\
 WARNING: \`$1' is needed, and is $msg.
diff --git a/src/Makefile.am b/src/Makefile.am
index 8a2dee2..1225fc4 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,7 +1,8 @@
-# $Id: Makefile.am 13510 2011-11-02 16:22:13Z sloot $
+# $Id: Makefile.am 15925 2013-04-03 15:55:40Z sloot $
 # $URL $
 
 AM_CPPFLAGS = -I at top_srcdir@/include
+AM_CXXFLAGS = -DSYSCONF_PATH=\"$(SYSCONFDIR)\"
 
 bin_PROGRAMS = ucto
 
@@ -10,7 +11,7 @@ LDADD = libucto.la
 ucto_SOURCES = ucto.cxx
 
 lib_LTLIBRARIES = libucto.la
-libucto_la_LDFLAGS = -version-info 1:0:0
+libucto_la_LDFLAGS = -version-info 2:0:0
 
 libucto_la_SOURCES =  tokenize.cxx unicode.cxx
 
diff --git a/src/Makefile.in b/src/Makefile.in
index 7170674..264bcfd 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -1,9 +1,9 @@
-# Makefile.in generated by automake 1.11.1 from Makefile.am.
+# Makefile.in generated by automake 1.11.3 from Makefile.am.
 # @configure_input@
 
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
-# 2003, 2004, 2005, 2006, 2007, 2008, 2009  Free Software Foundation,
-# Inc.
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
@@ -15,7 +15,7 @@
 
 @SET_MAKE@
 
-# $Id: Makefile.am 13510 2011-11-02 16:22:13Z sloot $
+# $Id: Makefile.am 15925 2013-04-03 15:55:40Z sloot $
 # $URL $
 
 
@@ -42,11 +42,11 @@ bin_PROGRAMS = ucto$(EXEEXT)
 subdir = src
 DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ac_define_dir.m4 \
-	$(top_srcdir)/m4/ax_icu_check.m4 $(top_srcdir)/m4/libtool.m4 \
-	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
-	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-	$(top_srcdir)/m4/pkg.m4 $(top_srcdir)/configure.ac
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \
+	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
+	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
+	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/m4/pkg.m4 \
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -74,6 +74,12 @@ am__nobase_list = $(am__nobase_strip_setup); \
 am__base_list = \
   sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
   sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
 am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)"
 LTLIBRARIES = $(lib_LTLIBRARIES)
 libucto_la_LIBADD =
@@ -183,7 +189,7 @@ SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
 STRIP = @STRIP@
-SYSCONF_PATH = @SYSCONF_PATH@
+SYSCONFDIR = @SYSCONFDIR@
 VERSION = @VERSION@
 XML2_CFLAGS = @XML2_CFLAGS@
 XML2_LIBS = @XML2_LIBS@
@@ -239,14 +245,17 @@ sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
 sysconfdir = @sysconfdir@
 target_alias = @target_alias@
+ticcutils_CFLAGS = @ticcutils_CFLAGS@
+ticcutils_LIBS = @ticcutils_LIBS@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 AM_CPPFLAGS = -I at top_srcdir@/include
+AM_CXXFLAGS = -DSYSCONF_PATH=\"$(SYSCONFDIR)\"
 LDADD = libucto.la
 ucto_SOURCES = ucto.cxx
 lib_LTLIBRARIES = libucto.la
-libucto_la_LDFLAGS = -version-info 1:0:0
+libucto_la_LDFLAGS = -version-info 2:0:0
 libucto_la_SOURCES = tokenize.cxx unicode.cxx
 TESTS_ENVIRONMENT = $(SHELL) -x
 TESTS = tst.sh
@@ -317,7 +326,7 @@ clean-libLTLIBRARIES:
 	  echo "rm -f \"$${dir}/so_locations\""; \
 	  rm -f "$${dir}/so_locations"; \
 	done
-libucto.la: $(libucto_la_OBJECTS) $(libucto_la_DEPENDENCIES) 
+libucto.la: $(libucto_la_OBJECTS) $(libucto_la_DEPENDENCIES) $(EXTRA_libucto_la_DEPENDENCIES) 
 	$(libucto_la_LINK) -rpath $(libdir) $(libucto_la_OBJECTS) $(libucto_la_LIBADD) $(LIBS)
 install-binPROGRAMS: $(bin_PROGRAMS)
 	@$(NORMAL_INSTALL)
@@ -362,7 +371,7 @@ clean-binPROGRAMS:
 	list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \
 	echo " rm -f" $$list; \
 	rm -f $$list
-ucto$(EXEEXT): $(ucto_OBJECTS) $(ucto_DEPENDENCIES) 
+ucto$(EXEEXT): $(ucto_OBJECTS) $(ucto_DEPENDENCIES) $(EXTRA_ucto_DEPENDENCIES) 
 	@rm -f ucto$(EXEEXT)
 	$(CXXLINK) $(ucto_OBJECTS) $(ucto_LDADD) $(LIBS)
 
@@ -536,14 +545,15 @@ check-TESTS: $(TESTS)
 	  fi; \
 	  dashes=`echo "$$dashes" | sed s/./=/g`; \
 	  if test "$$failed" -eq 0; then \
-	    echo "$$grn$$dashes"; \
+	    col="$$grn"; \
 	  else \
-	    echo "$$red$$dashes"; \
+	    col="$$red"; \
 	  fi; \
-	  echo "$$banner"; \
-	  test -z "$$skipped" || echo "$$skipped"; \
-	  test -z "$$report" || echo "$$report"; \
-	  echo "$$dashes$$std"; \
+	  echo "$${col}$$dashes$${std}"; \
+	  echo "$${col}$$banner$${std}"; \
+	  test -z "$$skipped" || echo "$${col}$$skipped$${std}"; \
+	  test -z "$$report" || echo "$${col}$$report$${std}"; \
+	  echo "$${col}$$dashes$${std}"; \
 	  test "$$failed" -eq 0; \
 	else :; fi
 
@@ -597,10 +607,15 @@ install-am: all-am
 
 installcheck: installcheck-am
 install-strip:
-	$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	  install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	  `test -z '$(STRIP)' || \
-	    echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
 mostlyclean-generic:
 
 clean-generic:
diff --git a/src/tokenize.cxx b/src/tokenize.cxx
index b64ee07..bc443a5 100644
--- a/src/tokenize.cxx
+++ b/src/tokenize.cxx
@@ -1,7 +1,7 @@
 /*
-  $Id: tokenize.cxx 14336 2012-02-27 10:38:57Z sloot $
+  $Id: tokenize.cxx 15910 2013-04-03 13:57:51Z sloot $
   $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/src/tokenize.cxx $
-  Copyright (c) 2006 - 2012
+  Copyright (c) 2006 - 2013
   Tilburg University
 
   This file is part of Ucto
@@ -20,24 +20,9 @@
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
   For more information and updates, see:
-      http://ilk.uvt.nl/frog
+  http://ilk.uvt.nl/frog
 */
 
-/* ************************************
-* 
-*  Original version by Maarten van Gompel, ILK, Tilburg University
-*     proycon AT anaproy DOT nl
-*
-*     Tilburg University
-*
-*     Licensed under GPLv3
-*
-*     v0.2 - 2010-05-11
-*     v0.3 - 2010-05-19 - Migration from GLIB to ICU, by Ko van der Sloot
-*     v0.4 - 2010-12-10 - promoted into a separate module
-*
-************************************ */
-
 #include <cstring>
 #include <cstdlib>
 #include <iostream>
@@ -45,15 +30,14 @@
 #include <sstream>
 #include "unicode/ustream.h"
 #include "unicode/regex.h"
+#include "unicode/ucnv.h"
 #include "ucto/unicode.h"
 #include "config.h"
 #include "libfolia/document.h"
-#include "libfolia/folia.h"
 #include "ucto/tokenize.h"
 
 using namespace std;
-
-#define Log 
+using namespace TiCC;
 
 namespace Tokenizer {
 
@@ -105,8 +89,7 @@ namespace Tokenizer {
     RegexMatcher *matcher;
     UnicodeRegexMatcher();
     vector<UnicodeString> results;
-  };
-  
+  };  
   
   UnicodeRegexMatcher::UnicodeRegexMatcher( const UnicodeString& pat ){
     failString = "";
@@ -144,14 +127,14 @@ namespace Tokenizer {
     post = "";
     results.clear();
     if ( matcher ){
-      //  *Log(theErrLog) << "start matcher [" << line << "]" << endl;
+      // cerr << "start matcher [" << line << "], pattern = " << Pattern() << endl;
       matcher->reset( line );
       if ( matcher->find() ){
-	// *Log(theErrLog) << "matched " << folia::UnicodeToUTF8(line) << endl;
+	// cerr << "matched " << folia::UnicodeToUTF8(line) << endl;
 	int start = -1;
 	int end = 0;
 	for ( int i=0; i <= matcher->groupCount(); ++i ){
-	  // *Log(theErrLog) << "group " << i << endl;
+	  // cerr << "group " << i << endl;
 	  u_stat = U_ZERO_ERROR;
 	  start = matcher->start( i, u_stat );
 	  if (!U_FAILURE(u_stat)){
@@ -163,19 +146,19 @@ namespace Tokenizer {
 	    break;
 	  if ( start > end ){
 	    pre = UnicodeString( line, end, start );
-	    // *Log(theErrLog) << "found pre " << folia::UnicodeToUTF8(pre) << endl;
+	    // cerr << "found pre " << folia::UnicodeToUTF8(pre) << endl;
 	  }
 	  end = matcher->end( i, u_stat );
 	  if (!U_FAILURE(u_stat)){
 	    results.push_back( UnicodeString( line, start, end - start ) );
-	    // *Log(theErrLog) << "added result " << folia::UnicodeToUTF8( results[results.size()-1] ) << endl;
+	    // cerr << "added result " << folia::UnicodeToUTF8( results[results.size()-1] ) << endl;
 	  }
 	  else
 	    break;
 	}
 	if ( end < line.length() ){
 	  post = UnicodeString( line, end );
-	  // *Log(theErrLog) << "found post " << folia::UnicodeToUTF8(post) << endl;
+	  // cerr << "found post " << folia::UnicodeToUTF8(post) << endl;
 	}
 	return true;
       }
@@ -216,12 +199,9 @@ namespace Tokenizer {
   const UnicodeString type_number = "NUMBER";
   const UnicodeString type_unknown = "UNKNOWN";
 
-  const UnicodeString explicit_eos_marker = "<utt>";
-
   ostream& operator<<( ostream& os, const Quoting& q ){
     for( size_t i=0; i < q.quotes.size(); ++i ){
-      os << folia::UnicodeToUTF8(q.quotes[i].openQuote) 
-	 << "\t" << folia::UnicodeToUTF8( q.quotes[i].closeQuote ) << endl;
+      os << q.quotes[i].openQuote << "\t" << q.quotes[i].closeQuote << endl;
     }
     return os;
   }
@@ -234,7 +214,7 @@ namespace Tokenizer {
       for ( size_t i = 0; i < quotestack.size(); i++) {
 	if (quoteindexstack[i] >= beginindex ) {
 	  new_quotestack.push_back(quotestack[i]);
-	  new_quoteindexstack.push_back(quoteindexstack[i]-beginindex);			
+	  new_quoteindexstack.push_back(quoteindexstack[i]-beginindex);
 	}
       }
       quoteindexstack = new_quoteindexstack;
@@ -287,8 +267,7 @@ namespace Tokenizer {
   
 
   ostream& operator<< (std::ostream& os, const Token& t ){
-    os << folia::UnicodeToUTF8( *t.type ) << " : " << t.role 
-       << ":" << folia::UnicodeToUTF8( t.us );
+    os << *t.type << " : " << t.role  << ":" << t.us;
     return os;
   }
   
@@ -303,23 +282,32 @@ namespace Tokenizer {
 
   ostream& operator<< (std::ostream& os, const Rule& r ){
     if ( r.regexp ){
-      os << folia::UnicodeToUTF8( r.id ) << "=\"" << folia::UnicodeToUTF8( r.regexp->Pattern() ) << "\"";
+      os << r.id << "=\"" << r.regexp->Pattern() << "\"";
     }
     else
-      os << folia::UnicodeToUTF8( r.id ) << "NULL";
+      os << r.id  << "=NULL";
     return os;
   }
 
   TokenizerClass::TokenizerClass():
     inputEncoding( "UTF-8" ), eosmark("<utt>"), 
-    theErrLog(&cerr), 
     tokDebug(0), verbose(false), 
     detectBounds(true), detectQuotes(false), doFilter(true), detectPar(true),
-    paragraphsignal(true),sentencesignal(true),
+    paragraphsignal(true),
     sentenceperlineoutput(false), sentenceperlineinput(false), 
     lowercase(false), uppercase(false), 
-    xmlout(false), passthru(false), parCount(0)
-  {}
+    xmlout(false), passthru(false), textclass("current")
+  { 
+    theErrLog = new TiCC::LogStream(cerr);
+    theErrLog->setstamp( NoStamp );
+  }
+
+  void TokenizerClass::setErrorLog( TiCC::LogStream *os ) { 
+    if ( theErrLog != os ){
+      delete theErrLog;
+    }
+    theErrLog = os; 
+  }
 
   string TokenizerClass::setInputEncoding( const std::string& enc ){
     string old = inputEncoding;
@@ -333,23 +321,17 @@ namespace Tokenizer {
       s.erase( pos );
     }
   }
-
-  folia::Document TokenizerClass::tokenize( istream& IN ) {
-    bool in_paragraph = false; //for XML
+  
+  vector<Token> TokenizerClass::tokenizeStream( istream& IN ) {
+    vector<Token> outputTokens;
     bool done = false;
     bool bos = true;
-    parCount = 0;
-    folia::Document doc( "id='" + docid + "'" );
-    doc.addStyle( "type=\"text/xsl\" href=\"folia.xsl\"" );
-    doc.declare( folia::AnnotationType::TOKEN, settingsfilename, "annotator='ucto', annotatortype='auto'" );
-    folia::FoliaElement *text = new folia::Text( "id='" + docid + ".text'" );
-    doc.append( text );
     string line;      
     do {	    
       done = !getline( IN, line );
       stripCR( line );
       if ( sentenceperlineinput )
-	line += string(" ") + folia::UnicodeToUTF8(explicit_eos_marker);
+	line += string(" ") + folia::UnicodeToUTF8(eosmark);
       int numS;
       if ( (done) || (line.empty()) ){
 	signalParagraph();
@@ -365,215 +347,175 @@ namespace Tokenizer {
       if ( numS > 0 ) { //process sentences 
 	if  (tokDebug > 0) *Log(theErrLog) << "[tokenize] " << numS << " sentence(s) in buffer, processing..." << endl;
 	for (int i = 0; i < numS; i++) {
-	  int begin, end;
-	  if (!getSentence(i, begin, end)) {
-	    if  (tokDebug > 0) *Log(theErrLog) << "[tokenize] ERROR: Sentence index " << i << " is out of range!!" << endl;
-	    throw uRangeError("Sentence index"); //should never happen
-	  }
-	  /* ******* Begin process sentence  ********** */
-	  if (tokDebug >= 1) *Log(theErrLog) << "[tokenize] Outputting sentence " << i << ", begin="<<begin << ",end="<< end << endl;
-	  outputTokensXML( doc, begin, end, in_paragraph );
+	  vector<Token> v = getSentence( i );
+	  outputTokens.insert( outputTokens.end(), v.begin(), v.end() );
 	}
 	//clear processed sentences from buffer
 	if  (tokDebug > 0) *Log(theErrLog) << "[tokenize] flushing " << numS << " sentence(s) from buffer..." << endl;
 	flushSentences(numS);	    
-      } else {
+      } 
+      else {
 	if  (tokDebug > 0) *Log(theErrLog) << "[tokenize] No sentences yet, reading on..." << endl;
       }	
     } while (!done);
-    return doc;
+    return outputTokens;
   }
   
+  folia::Document TokenizerClass::tokenize( istream& IN ) {
+    vector<Token> v = tokenizeStream( IN );
+    folia::Document doc( "id='" + docid + "'" );
+    outputTokensDoc( doc, v );
+    return doc;
+  }
+    
+  void TokenizerClass::tokenize( istream& IN, ostream& OUT) {
+    vector<Token> v = tokenizeStream( IN );
+    if (xmlout) {
+      folia::Document doc( "id='" + docid + "'" );
+      outputTokensDoc( doc, v );
+      OUT << doc << endl;
+    } 
+    else {
+      outputTokens( OUT, v );
+      OUT << endl;
+    }
+  }
   
   bool TokenizerClass::tokenize( folia::Document& doc ) {
-    doc.declare( folia::AnnotationType::TOKEN, settingsfilename, "annotator='ucto', annotatortype='auto'" );
-    bool result = false;
     if (tokDebug >= 2) *Log(theErrLog) << "tokenize doc " << doc << endl;
-
     for ( size_t i = 0; i < doc.doc()->size(); i++) {
       if (tokDebug >= 2) *Log(theErrLog) << "[tokenize] Invoking processing of first-level element " << doc.doc()->index(i)->id() << endl;
-      result = tokenize(doc.doc()->index(i)) || result;
-    }      
-    return result;
+      tokenizeElement( doc.doc()->index(i) );
+    }
+    return true; 
   }
   
-  
-  bool TokenizerClass::tokenize(folia::FoliaElement * element) {
-    if (element->isinstance(folia::Word_t) || element->isinstance(folia::TextContent_t)) return false;
-    if (tokDebug >= 2) *Log(theErrLog) << "[tokenize] Processing FoLiA element " << element->id() << endl;
-    if (element->hastext()) {
-      if (element->isinstance(folia::Paragraph_t)) {
+  void TokenizerClass::tokenizeElement(folia::FoliaElement * element) {
+    if ( element->isinstance(folia::Word_t) 
+	 || element->isinstance(folia::TextContent_t)) 
+      // shortcut
+      return;
+    if (tokDebug >= 2) *Log(theErrLog) << "[tokenizeElement] Processing FoLiA element " << element->id() << endl;
+    if ( element->hastext( textclass ) ) {
+      // We have an element which contains text. That's nice
+      // now we must see wether some 'formatting' is there. ( like Words() or
+      // Sentences() )
+      // If so: assume that the text is tokenized already, and don't spoil that
+      if ( element->isinstance(folia::Paragraph_t) ) {
 	//tokenize paragraph: check for absence of sentences
 	vector<folia::Sentence*> sentences = element->sentences();
-	if (sentences.size() == 0) {
-	  //no sentences yet, good
-	  tokenize(element,true,false);
-	  return true;
-	} 
-      } else if ( (element->isinstance(folia::Sentence_t)) || (element->isinstance(folia::Head_t)) ) {
-	//tokenize sentence: check for absence of words
+	if (sentences.size() > 0) {
+	  // bail out
+	  return;
+	}
+      } 
+      else if ( ( element->isinstance(folia::Sentence_t) ) 
+		|| ( element->isinstance(folia::Head_t) ) ) {
+	//tokenize sentence: check for absence of Word's
 	vector<folia::Word*> words = element->words();
-	if (words.size() == 0) {
-	  tokenize(element,false,true);
-	  return true;
-	} else {
-	  return false;
+	if (words.size() > 0) {
+	  // bail out
+	  return;
 	}
-      } else {
+      }
+      else {
+	// Some other element that contains text. Probably deeper.
+	// look it up. skip all paragraphs and sentences
 	vector<folia::Paragraph*> paragraphs = element->paragraphs();
-	if (paragraphs.size() == 0) {
-	  vector<folia::Sentence*> sentences = element->sentences();
-	  if (sentences.size() == 0) {
-	    vector<folia::Word*> words = element->words();
-	    if (words.size() == 0) {
-	      bool treat_as_paragraph = element->isinstance(folia::Event_t);
-	      tokenize(element,treat_as_paragraph,false);
-	      return true;			
-	    }
-	  }
+	if (paragraphs.size() > 0) {
+	  // already paragraphs, bail out
+	  return;
+	}
+	vector<folia::Sentence*> sentences = element->sentences();
+	if (sentences.size() > 0) {
+	  // already sentences, bail out
+	  return;
+	}
+	vector<folia::Word*> words = element->words();
+	if (words.size() > 0) {
+	  // already words, bail out
+	  return;
 	}
-	return false;
       }
+      // so we have text, in an element without 'formatting' yet, good
+      // lets Tokenize the available text!
+      tokenizeSentenceElement( element );
+      return;
     }
-    //recursion step for other elements
-    if (tokDebug >= 2) *Log(theErrLog) << "[tokenize] Processing children of FoLiA element " << element->id() << endl;
+    //recursion step for textless elements
+    if (tokDebug >= 2) *Log(theErrLog) << "[tokenizeElement] Processing children of FoLiA element " << element->id() << endl;
     for ( size_t i = 0; i < element->size(); i++) {
-      tokenize(element->index(i));
+      tokenizeElement( element->index(i));
     }
-    return false;
+    return;
   }
-  
-  bool TokenizerClass::tokenize(folia::FoliaElement * element, bool root_is_paragraph, bool root_is_sentence) {
-    if (tokDebug >= 1) *Log(theErrLog) << "[tokenize] Processing FoLiA sentence" << endl;
-    UnicodeString line = element->stricttext() + " "  + explicit_eos_marker;
-    tokenizeLine(line);		
-    int numS = countSentences(true); //force buffer to empty
+
+  void TokenizerClass::tokenizeSentenceElement( folia::FoliaElement *element ){
+    folia::Document *doc = element->doc();
+    if ( passthru ){
+      doc->declare( folia::AnnotationType::TOKEN, "passthru", "annotator='ucto', annotatortype='auto', datetime='now()'" );
+    }
+    else {
+      doc->declare( folia::AnnotationType::TOKEN, settingsfilename, "annotator='ucto', annotatortype='auto', datetime='now()'" );
+    }
+    UnicodeString line = element->stricttext( textclass ) + " "  + eosmark;
+    if (tokDebug >= 1) 
+      *Log(theErrLog) << "[tokenizeSentenceElement] Processing sentence:" 
+		      << line << endl;
+    if ( passthru ){
+      bool bos = true;
+      passthruLine( folia::UnicodeToUTF8(line), bos );
+    }
+    else
+      tokenizeLine( line );		
     //ignore EOL data, we have by definition only one sentence:
-    bool in_par = false; //very ugly, I know
+    int numS = countSentences(true); //force buffer to empty
+    vector<Token> outputTokens;
     for (int i = 0; i < numS; i++) {
-      int begin, end;
-      if (!getSentence(i, begin, end)) throw uRangeError("Sentence index"); //should never happen
-      if (tokDebug >= 1) *Log(theErrLog) << "[tokenize] Outputting sentence " << i << ", begin="<<begin << ",end="<< end << endl;
-      outputTokensXML(element,begin,end,in_par,root_is_paragraph,root_is_sentence);
+      vector<Token> v = getSentence( i );
+      outputTokens.insert( outputTokens.end(), v.begin(), v.end() );
     }
+    outputTokensXML( element, outputTokens );
     flushSentences(numS);	
-    if (numS > 0)
-      return true;
-    else
-      return false;
   }
   
-  
-  void TokenizerClass::tokenize( istream& IN, ostream& OUT) {
-    bool firstoutput = true;
-    bool in_paragraph = false; //for XML
-    bool done = false;
-    bool bos = true;
-    folia::Document doc( "id='" + docid + "'" );
-    if ( xmlout ){
-      parCount = 0;
-      doc.addStyle( "type=\"text/xsl\" href=\"folia.xsl\"" );
-      doc.declare( folia::AnnotationType::TOKEN, settingsfilename, "annotator='ucto', annotatortype='auto'" );
-      folia::FoliaElement *text = new folia::Text( "id='" + docid + ".text'" );
-      doc.append( text );
+  void TokenizerClass::outputTokensDoc( folia::Document& doc,
+					const vector<Token>& tv ) const {
+    doc.addStyle( "text/xsl", "folia.xsl" );
+    if ( passthru ){
+      doc.declare( folia::AnnotationType::TOKEN, "passthru", "annotator='ucto', annotatortype='auto', datetime='now()'" );
     }
-    string line;      
-    do {	    
-      done = !getline( IN, line );
-      stripCR( line );
-      if ( sentenceperlineinput )
-	line += string(" ") + folia::UnicodeToUTF8(explicit_eos_marker);
-      int numS;
-      if ( (done) || (line.empty()) ){
-	signalParagraph();
-	numS = countSentences(true); //count full sentences in token buffer, force buffer to empty!
-      } else {
-	if ( passthru )
-	  passthruLine( line, bos );
-	else
-	  tokenizeLine( line ); 
-	numS = countSentences(); //count full sentences in token buffer	    
-      }			
-      if ( numS > 0 ) { //process sentences 
-	if  (tokDebug > 0) *Log(theErrLog) << "[tokenize] " << numS << " sentence(s) in buffer, processing..." << endl;
-	for (int i = 0; i < numS; i++) {
-	  int begin, end;
-	  if (!getSentence(i, begin, end)) {
-	    if  (tokDebug > 0) *Log(theErrLog) << "[tokenize] ERROR: Sentence index " << i << " is out of range!!" << endl;
-	    throw uRangeError("Sentence index"); //should never happen
-	  }
-	  /* ******* Begin process sentence  ********** */
-	  if (tokDebug >= 1) *Log(theErrLog) << "[tokenize] Outputting sentence " << i << ", begin="<<begin << ",end="<< end << endl;
-	  if (xmlout) {
-	    outputTokensXML( doc, begin, end, in_paragraph );
-	  } else {
-	    outputTokens(OUT, begin, end, firstoutput );
-	    firstoutput = false;
-	  }	       
-	}
-	//clear processed sentences from buffer
-	if  (tokDebug > 0) *Log(theErrLog) << "[tokenize] flushing " << numS << " sentence(s) from buffer..." << endl;
-	flushSentences(numS);	    
-      } else {
-	if  (tokDebug > 0) *Log(theErrLog) << "[tokenize] No sentences yet, reading on..." << endl;
-      }	
-    } while (!done);
-    if (xmlout) {
-      OUT << doc << endl;
-    } else {
-      OUT << endl;
+    else {
+      doc.declare( folia::AnnotationType::TOKEN, settingsfilename, 
+		   "annotator='ucto', annotatortype='auto', datetime='now()'");
     }
-  }
-  
-  UnicodeString xmlescape(UnicodeString s_in) {
-    UnicodeString s = s_in;      
-    s = s.findAndReplace("&","&amp;");
-    s = s.findAndReplace("\"","&quot;");
-    s = s.findAndReplace("<","&lt;");
-    s = s.findAndReplace(">","&gt;");
-    return s;
-  }
-  
-  void TokenizerClass::outputTokensXML( folia::Document& doc,
-					const size_t begin, const size_t end, 
-					bool& in_paragraph ) {	
-    
+    folia::FoliaElement *text = new folia::Text( "id='" + docid + ".text'" );
+    doc.append( text );
     folia::FoliaElement *root = doc.doc()->index(0);
-    if (end >= tokens.size()) {
-      throw uRangeError("End index for outputTokensXML exceeds available buffer length" );
-    }
-    
-    outputTokensXML(root,begin,end,in_paragraph);    
+    outputTokensXML(root, tv );
   }
 
-
-
   void TokenizerClass::outputTokensXML( folia::FoliaElement *root,
-					const size_t begin, const size_t end, 
-					bool& in_paragraph, 
-					bool root_is_paragraph,
-					bool root_is_sentence) {
+					const vector<Token>& tv ) const {
     short quotelevel = 0;
     folia::FoliaElement *lastS = 0;
-
-    //static int parCount = 0;    // Isn't this FATAL when multithreading?
+    int parCount = 0;
     if  (tokDebug > 0) *Log(theErrLog) << "[outputTokensXML] parCount =" << parCount << endl;
     
-       
-    if ((!root_is_paragraph) && (!root_is_sentence)) { 
-      if ( in_paragraph ){
-	root = root->rindex(0);
-	if  (tokDebug > 0) *Log(theErrLog) << "[outputTokensXML] root changed to " << root << endl;
-      }
-    }
-    
-    if (root_is_sentence) {
+    bool root_is_sentence = false;
+    bool root_is_paragraph = false;
+    if ( root->isinstance( folia::Sentence_t ) ){
       lastS = root;
+      root_is_sentence = true;
     }
-    
-    for ( size_t i = begin; i <= end; i++) {
-      
-      if (((!root_is_paragraph) && (!root_is_sentence)) && ((tokens[i].role & NEWPARAGRAPH) || (!in_paragraph))) {	    
+    else if ( root->isinstance( folia::Paragraph_t )
+	      || root->isinstance( folia::Event_t ) ){
+      root_is_paragraph = true;
+    }
+
+    bool in_paragraph = false;
+    for ( size_t i=0; i < tv.size(); i++) {      
+      if (((!root_is_paragraph) && (!root_is_sentence)) && ((tv[i].role & NEWPARAGRAPH) || (!in_paragraph))) {	    
 	parCount++;
 	if ( in_paragraph )
 	  root = root->parent();
@@ -586,13 +528,15 @@ namespace Tokenizer {
 	root = p;
 	quotelevel = 0;
       }
-      if (tokens[i].role & ENDQUOTE) {
-	if  (tokDebug > 0) *Log(theErrLog) << "[outputTokensXML] End of quote";
+      if ( tv[i].role & ENDQUOTE) {
+	if  (tokDebug > 0) *Log(theErrLog) << "[outputTokensXML] End of quote" << endl;
 	quotelevel--;
 	root = root->parent();
-	//	*Log(theErrLog) << "ENDQUOTE, terug naar " << root << endl;
+	lastS = root;
+	if  (tokDebug > 0) *Log(theErrLog) << "[outputTokensXML] back to " << root->classname() << endl;
+
       }
-      if ((tokens[i].role & BEGINOFSENTENCE) && (!root_is_sentence)) {
+      if (( tv[i].role & BEGINOFSENTENCE) && (!root_is_sentence)) {
 	if  (tokDebug > 0) *Log(theErrLog) << "[outputTokensXML] Creating sentence in '" << root->id() << "' ()" << endl;
 	folia::KWargs args;
 	args["generate_id"] = root->id();
@@ -600,32 +544,36 @@ namespace Tokenizer {
 	// *Log(theErrLog) << "created " << s << endl;
 	root->append( s );
 	root = s;
-	lastS = s;
+	lastS = root;
       }	
-      if  (tokDebug > 0) *Log(theErrLog) << "[outputTokensXML] Creating word element for " << tokens[i].us << endl;
+      if  (tokDebug > 0) *Log(theErrLog) << "[outputTokensXML] Creating word element for " << tv[i].us << endl;
       folia::KWargs args;
       args["generate_id"] = lastS->id();
-      args["class"] = folia::UnicodeToUTF8( *tokens[i].type );
-      if (tokens[i].role & NOSPACE) {
+      args["class"] = folia::UnicodeToUTF8( *tv[i].type );
+      if ( passthru )
+	args["set"] = "passthru";
+      else
+	args["set"] = settingsfilename;
+      if ( tv[i].role & NOSPACE) {
 	args["space"]= "no";
       }
       folia::FoliaElement *w = new folia::Word( root->doc(), args );
-      w->settext( folia::UnicodeToUTF8( tokens[i].us ) );
-      //      *Log(theErrLog) << "created " << w << " text= " <<  tokens[i].us << endl;
+      w->settext( folia::UnicodeToUTF8( tv[i].us ) );
+      //      *Log(theErrLog) << "created " << w << " text= " <<  tv[i].us << endl;
       root->append( w );
-      if (tokens[i].role & BEGINQUOTE) {
-	if  (tokDebug > 0) *Log(theErrLog) << "[outputTokensXML] Creating quote element";
-	lastS = root;
+      if ( tv[i].role & BEGINQUOTE) {
+	if  (tokDebug > 0) *Log(theErrLog) << "[outputTokensXML] Creating quote element" << endl;
 	folia::FoliaElement *q = new folia::Quote( root->doc(), "generate_id='" + root->id() + "'" );
 	//	*Log(theErrLog) << "created " << q << endl;
 	root->append( q );
 	root = q;
 	quotelevel++;
       }    
-      if ( (tokens[i].role & ENDOFSENTENCE) && (!root_is_sentence) ) {
-	if  (tokDebug > 0) *Log(theErrLog) << "[outputTokensXML] End of sentence";
+      if ( ( tv[i].role & ENDOFSENTENCE) && (!root_is_sentence) ) {
+	if  (tokDebug > 0) *Log(theErrLog) << "[outputTokensXML] End of sentence" << endl;
 	root = root->parent();
-	//	*Log(theErrLog) << "endsentence, terug naar " << root << endl;
+	lastS = root;
+	if  (tokDebug > 0) *Log(theErrLog) << "[outputTokensXML] back to " << root->classname() << endl;
       }
       in_paragraph = true;
     }
@@ -640,85 +588,62 @@ namespace Tokenizer {
     if ( tok & ENDQUOTE) os << "ENDQUOTE ";		
     return os;
   }
-
+  
   void TokenizerClass::outputTokens( ostream& OUT, 
-				     const size_t begin, const size_t end, 
-				     const bool firstoutput ) const {
+				     const vector<Token>& toks ) const { 
     short quotelevel = 0;
-    if (end >= tokens.size()) {
-      throw uRangeError( "End index for outputTokens exceeds available buffer length" );
-    }
-    for ( size_t i = begin; i <= end; i++) {
-      if ((detectPar) && (tokens[i].role & NEWPARAGRAPH) && (!verbose) && (!firstoutput)) {
+    for ( size_t i = 0; i < toks.size(); i++) {
+      if ((detectPar) && ( toks[i].role & NEWPARAGRAPH) && (!verbose) && (i != 0) ) {
 	if (sentenceperlineoutput) {
 	  OUT << endl;
-	} else {
+	} 
+	else {
 	  OUT << endl << endl;
 	}
       }
       if (lowercase) {
-	UnicodeString s = tokens[i].us;
-	OUT << folia::UnicodeToUTF8( s.toLower() );
-      } else if (uppercase) {
-	UnicodeString s = tokens[i].us;
-	OUT << folia::UnicodeToUTF8( s.toUpper() );
-      } else {
-	OUT << folia::UnicodeToUTF8( tokens[i].us );
+	UnicodeString s = toks[i].us;
+	OUT << s.toLower();
+      } 
+      else if (uppercase) {
+	UnicodeString s = toks[i].us;
+	OUT << s.toUpper();
+      } 
+      else {
+	OUT << toks[i].us;
       }      
-      if (tokens[i].role & NEWPARAGRAPH) quotelevel = 0;
-      if (tokens[i].role & BEGINQUOTE) quotelevel++;
+      if ( toks[i].role & NEWPARAGRAPH) quotelevel = 0;
+      if ( toks[i].role & BEGINQUOTE) quotelevel++;
       if (verbose) {
-	OUT << "\t" +  folia::UnicodeToUTF8( *tokens[i].type ) + "\t" 
-	    << tokens[i].role << endl;
+	OUT << "\t" << *toks[i].type << "\t" << toks[i].role << endl;
       }
-      if (tokens[i].role & ENDQUOTE) quotelevel--;
+      if ( toks[i].role & ENDQUOTE) quotelevel--;
       
-      if (quotelevel == 0) {
-	if (tokens[i].role & ENDOFSENTENCE) {
-	  if (verbose) {
+      if ( toks[i].role & ENDOFSENTENCE) {
+	if (verbose) {
+	  OUT << endl;
+	} 
+	else if (quotelevel == 0) {
+	  if (sentenceperlineoutput) {
 	    OUT << endl;
-	  } else {
-	    if (sentenceperlineoutput) {
-	      OUT << endl;
-	    } else {
-	      UnicodeString tmp = folia::UTF8ToUnicode( eosmark );
-	      OUT << " " + tmp;
-	    }
+	  } 
+	  else {
+	    OUT << " " + eosmark;
 	  }
 	}
       }
-      if ( (i <= end) && (!verbose) ) {
-	if (!( (tokens[i].role & ENDOFSENTENCE) && (sentenceperlineoutput) )) {
+      if ( (i < toks.size() ) && (!verbose) ) {
+	if (!( ( toks[i].role & ENDOFSENTENCE) && (sentenceperlineoutput) )) {
+	  OUT << " ";
+	  //FBK: ADD SPACE WITHIN QUOTE CONTEXT IN ANY CASE
+	} 
+	else if ((quotelevel > 0) && (sentenceperlineoutput)) {
 	  OUT << " ";  
 	}
       }
     } 
   }
   
-  vector<string> TokenizerClass::getSentences() const {
-    short quotelevel = 0;
-    vector<string> sentences;
-    const int size = tokens.size();
-    string sentence = "";
-    for (int i = 0; i < size; i++) {
-      if (tokens[i].role & NEWPARAGRAPH) quotelevel = 0;
-      if (tokens[i].role & BEGINQUOTE) quotelevel++;
-      if (tokens[i].role & ENDQUOTE) quotelevel--;	
-      sentence += folia::UnicodeToUTF8(tokens[i].us);
-      if ((tokens[i].role & ENDOFSENTENCE) && (quotelevel == 0)) {
-	sentence += " " + eosmark;
-	sentences.push_back(sentence);
-	sentence = "";
-      } else if (i < size) {
-	sentence += " ";
-      }
-    }
-    if (!sentence.empty()) {
-      sentences.push_back(sentence);
-    }      
-    return sentences;     
-  }
-  
   int TokenizerClass::countSentences(bool forceentirebuffer) {
     //Return the number of *completed* sentences in the token buffer
     
@@ -733,8 +658,8 @@ namespace Tokenizer {
     for (int i = begin; i < size; i++) {
       if (tokDebug >= 5)
 	*Log(theErrLog) << "[countSentences] buffer#" <<i 
-			<< " word=[" << folia::UnicodeToUTF8( tokens[i].us) 
-			<<"] role=" << tokens[i].role 
+			<< " word=[" << tokens[i].us
+			<< "] role=" << tokens[i].role 
 			<< ", quotelevel="<< quotelevel << endl;
       if (tokens[i].role & NEWPARAGRAPH) quotelevel = 0;
       if (tokens[i].role & BEGINQUOTE) quotelevel++;
@@ -744,16 +669,14 @@ namespace Tokenizer {
 	//Change TEMPENDOFSENTENCE to ENDOFSENTENCE and make sure sentences match up sanely
 	tokens[i].role ^= TEMPENDOFSENTENCE;
 	tokens[i].role |= ENDOFSENTENCE;
-	if (!(tokens[begin].role & BEGINOFSENTENCE)) {
-	  tokens[begin].role |= BEGINOFSENTENCE;
-	}
-      }		
+	tokens[begin].role |= BEGINOFSENTENCE;
+      }
       if ((tokens[i].role & ENDOFSENTENCE) && (quotelevel == 0)) {
 	begin = i + 1;
 	count++;
 	if (tokDebug >= 5) 
 	  *Log(theErrLog) << "[countSentences] SENTENCE #" << count << " found" << endl;
-	if ((begin < size) && !(tokens[begin].role & BEGINOFSENTENCE)) {
+	if ((begin < size) ){
 	  tokens[begin].role |= BEGINOFSENTENCE;
 	}
       }
@@ -786,7 +709,8 @@ namespace Tokenizer {
     if (begin == size) {
       tokens.clear();
       quotes.clearStack();
-    } else {
+    } 
+    else {
       tokens.erase (tokens.begin(),tokens.begin()+begin);
       if (!quotes.emptyStack()) {
 	quotes.flushStack( begin );
@@ -799,77 +723,102 @@ namespace Tokenizer {
     return tokens.size();
   }
   
-  bool TokenizerClass::getSentence( int index, int& begin, int& end ) {
+  vector<Token> TokenizerClass::getSentence( int index ) {
+    vector<Token> outToks;
     int count = 0;
     const int size = tokens.size();
     short quotelevel = 0;
-    begin = 0;
-    for (int i = 0; i < size; i++) {
-      if (tokens[i].role & NEWPARAGRAPH) quotelevel = 0;
-      if (tokens[i].role & BEGINQUOTE) quotelevel++;
+    size_t begin = 0;
+    size_t end = 0;
+    for ( int i = 0; i < size; i++) {
+      if (tokens[i].role & NEWPARAGRAPH) quotelevel = 0;      
       if (tokens[i].role & ENDQUOTE) quotelevel--;	
       if ((tokens[i].role & BEGINOFSENTENCE) && (quotelevel == 0)) {
 	begin = i;
       }
+      //FBK: QUOTELEVEL GOES UP BEFORE begin IS UPDATED... RESULTS IN DUPLICATE OUTPUT
+      if (tokens[i].role & BEGINQUOTE) quotelevel++;
+      
       if ((tokens[i].role & ENDOFSENTENCE) && (quotelevel == 0)) {
 	if (count == index) {
 	  end = i;
-	  if (!(tokens[begin].role & BEGINOFSENTENCE)) //sanity check
-	    tokens[begin].role |= BEGINOFSENTENCE;
-	  return true;
+	  tokens[begin].role |= BEGINOFSENTENCE;  //sanity check
+	  if (tokDebug >= 1) 
+	    *Log(theErrLog) << "[tokenize] extracted sentence " << index << ", begin="<<begin << ",end="<< end << endl;
+	  for ( size_t i=begin; i <= end; ++i ){
+	    outToks.push_back( tokens[i] );
+	  }
+	  return outToks;
 	}
 	count++;
       }	
     }  
-    return false;
+    throw uRangeError( "No sentence exists with the specified index: " 
+		       + toString( index ) );
+    return outToks;
   }
   
-  vector<Token*> TokenizerClass::getSentence( int selectsentence) { 
-    //index starts at 0
-    short quotelevel = 0;
-    const int size = tokens.size();
-    vector<Token*> sentence;
-    int count = 0;
-    for (int i = 0; i < size; i++) {
-      if (count == selectsentence) {
-	sentence.push_back(&tokens[i]);
-      }
-      if (tokens[i].role & NEWPARAGRAPH) quotelevel = 0;
-      if (tokens[i].role & BEGINQUOTE) quotelevel++;
-      if (tokens[i].role & ENDQUOTE) quotelevel--;
-      if ((tokens[i].role & ENDOFSENTENCE) && (quotelevel == 0)) {
-	if (selectsentence == count) {
-	  return sentence;
-	} else {
-	  count++;
-	}
-      }
+  string TokenizerClass::getSentenceString( unsigned int i ){
+    vector<Token> v = getSentence( i );
+    if ( !v.empty() ){
+      //This only makes sense in non-verbose mode, force verbose=false
+      stringstream TMPOUT;
+      const bool tv = verbose;
+      verbose = false;
+      outputTokens( TMPOUT, v );
+      verbose = tv;
+      return TMPOUT.str(); 
     }
-    return sentence;
+    return "";
   }
+  
+  vector<string> TokenizerClass::getSentences() {
+    vector<string> sentences;
+    int numS = countSentences(true); //force buffer to empty
+    for (int i = 0; i < numS; i++) {
+      string tmp = getSentenceString( i );
+      sentences.push_back( tmp );
+    }
+    return sentences;
+  }  
 
-  string TokenizerClass::getSentenceString( unsigned int i, 
-					    const bool firstoutput ) {
-    int begin, end;
-    if (!getSentence(i,begin,end)) {
-      throw uRangeError( "No sentence exists with the specified index: " 
-			 + toString( i ) );
+  // FBK: return true if character is a quote.
+  bool TokenizerClass::u_isquote(UChar c) {
+    bool quote = false;
+    if ((c == '"') || ( UnicodeString(c) == "&quot;") || (c == '\'')) {
+      quote = true;
     }
-    
-    //This only makes sense in non-verbose mode, force verbose=false
-    stringstream TMPOUT;
-    const bool t = verbose;
-    verbose = false;
-    outputTokens( TMPOUT, begin,end, firstoutput);
-    verbose = t;
-    return TMPOUT.str(); 
+    else {
+      UnicodeString opening = quotes.lookupOpen( c );
+      if (!opening.isEmpty()) {
+	quote = true;
+      } 
+      else {
+	UnicodeString closing = quotes.lookupClose( c );
+	if (!closing.isEmpty()) {
+	  quote = true;
+	}
+      }
+    }
+    return quote;
   }
   
-  bool TokenizerClass::terminatesWithEOS( ) const {
-    if ( tokens.size() < 1 )
-      return false;
-    else
-      return (tokens[tokens.size() - 1].role & ENDOFSENTENCE);
+  //FBK: USED TO CHECK IF CHARACTER AFTER QUOTE IS AN BOS. 
+  //MOSTLY THE SAME AS ABOVE, EXCEPT WITHOUT CHECK FOR PUNCTUATION
+  //BECAUSE: '"Hoera!", zei de man' MUST NOT BE SPLIT ON ','..
+  bool is_BOS( UChar c ){
+    bool is_bos = false;
+    UBlockCode s = ublock_getCode(c);
+    //test for languages that distinguish case
+    if ( (s == UBLOCK_BASIC_LATIN) || (s == UBLOCK_GREEK) 
+	 || (s == UBLOCK_CYRILLIC) || (s == UBLOCK_GEORGIAN) 
+	 || (s == UBLOCK_ARMENIAN) || (s == UBLOCK_DESERET)) { 
+      if ( u_isupper(c) || u_istitle(c) ) {
+	//next 'word' starts with more punctuation or with uppercase
+	is_bos = true;
+      }        
+    }
+    return is_bos;
   }
   
   bool TokenizerClass::resolveQuote(int endindex, const UnicodeString& open ) {
@@ -890,6 +839,7 @@ namespace Tokenizer {
       int beginsentence = beginindex + 1;
       int expectingend = 0;
       int subquote = 0;
+      int size = tokens.size();
       for (int i = beginsentence; i < endindex; i++) {
 	if (tokens[i].role & BEGINQUOTE) subquote++;
 	
@@ -897,71 +847,114 @@ namespace Tokenizer {
 	  if (tokens[i].role & BEGINOFSENTENCE) expectingend++;
 	  if (tokens[i].role & ENDOFSENTENCE) expectingend--;
 	  
-	  if (tokens[i].role & TEMPENDOFSENTENCE) {			    
+	  if (tokens[i].role & TEMPENDOFSENTENCE) {	
 	    tokens[i].role ^= TEMPENDOFSENTENCE;
 	    tokens[i].role |= ENDOFSENTENCE;
 	    tokens[beginsentence].role |= BEGINOFSENTENCE;
 	    beginsentence = i + 1;
 	  }
+	  // In case of nested quoted sentences, such as:
+	  //    MvD: "Nou, Van het Gouden Been ofzo herinner ik mij als kind: 'Waar is mijn gouden been?'"
+	  // the BEGINOFSENTENCE is only set for the inner quoted sentence 'Waar is mijn gouden been'. However,
+	  // we also need one for the outer sentence.
+	} 
+	else if ( (tokens[i].role & ENDQUOTE) 
+		  && (tokens[i].role & ENDOFSENTENCE)) {
+	  tokens[beginsentence].role |= BEGINOFSENTENCE;
+	  beginsentence = i + 1;
 	}
-	
 	if (tokens[i].role & ENDQUOTE) subquote--;
-	
-	/*  
-	    if (tokens[i].role & BEGINOFSENTENCE) {
-	    if (i - 1 > beginindex)
-	    tokens[i-1].role |= ENDOFSENTENCE;
-	    }
-	    if (tokens[i].role & ENDOFSENTENCE) {
-	    if (i + 1 < endindex)
-	    tokens[i+1].role |= BEGINOFSENTENCE;
-	    }*/
       }
-	
-	  
       if ((expectingend == 0) && (subquote == 0)) {
 	//ok, all good, mark the quote:
 	tokens[beginindex].role |= BEGINQUOTE;
 	tokens[endindex].role |= ENDQUOTE;	   
-      } else if ((expectingend == 1) && (subquote == 0) && !(tokens[endindex - 1].role & ENDOFSENTENCE)) {
+      } 
+      else if ((expectingend == 1) && (subquote == 0) && !(tokens[endindex - 1].role & ENDOFSENTENCE)) {
 	//missing one endofsentence, we can correct, last token in quote token is endofsentence:	    
 	if (tokDebug >= 2) *Log(theErrLog) << "[resolveQuote] Missing endofsentence in quote, fixing... " << expectingend << endl;
 	tokens[endindex - 1].role |= ENDOFSENTENCE;	    
 	//mark the quote
 	tokens[beginindex].role |= BEGINQUOTE;
 	tokens[endindex].role |= ENDQUOTE;	   
-      } else {
+      } 
+      else {
 	if (tokDebug >= 2) *Log(theErrLog) << "[resolveQuote] Quote can not be resolved, unbalanced sentences or subquotes within quote, skipping... (expectingend=" << expectingend << ",subquote=" << subquote << ")" << endl;
 	//something is wrong. Sentences within quote are not balanced, so we won't mark the quote.
       }      
-      
       //remove from stack (ok, granted, stack is a bit of a misnomer here)
       quotes.eraseAtPos( stackindex );
+      //FBK: ENDQUOTES NEED TO BE MARKED AS ENDOFSENTENCE IF THE PREVIOUS TOKEN
+      //WAS AN ENDOFSENTENCE. OTHERWISE THE SENTENCES WILL NOT BE SPLIT.
+      if ((tokens[endindex].role & ENDQUOTE) && (tokens[endindex-1].role & ENDOFSENTENCE)) {
+        //FBK: CHECK FOR EOS AFTER QUOTES
+        if ((endindex+1 == size) || //FBK: endindex EQUALS TOKEN SIZE, MUST BE EOSMARKERS 
+            ((endindex + 1 < size) && (is_BOS(tokens[endindex+1].us[0])))) {
+	  tokens[endindex].role |= ENDOFSENTENCE; 
+	  // FBK: CHECK IF NEXT TOKEN IS A QUOTE AND NEXT TO THE QUOTE A BOS
+        } 
+	else if ( (endindex + 2 < size) 
+		  && (u_isquote(tokens[endindex+1].us[0])) 
+		  && (is_BOS(tokens[endindex+2].us[0]))) {
+	  tokens[endindex].role |= ENDOFSENTENCE;
+	  // If the current token is an ENDQUOTE and the next token is a quote and also the last token,
+	  // the current token is an EOS.
+        } 
+	else if ( (endindex + 2 == size) 
+		  && (u_isquote(tokens[endindex+1].us[0]))) {
+	  tokens[endindex].role |= ENDOFSENTENCE;
+        }
+      }
       return true;
-    } else {
+    } 
+    else {
       return false;
     }    
   }
 
-  bool checkEos( UChar c ){
+  bool TokenizerClass::detectEos( size_t i ) const {
     bool is_eos = false;
-    UBlockCode s = ublock_getCode(c);
-    //test for languages that distinguish case
-    if ((s == UBLOCK_BASIC_LATIN) || (s == UBLOCK_GREEK) ||
-	(s == UBLOCK_CYRILLIC) || (s == UBLOCK_GEORGIAN) ||
-	(s == UBLOCK_ARMENIAN) || (s == UBLOCK_DESERET)) { 
-      if ( u_isupper(c) || u_istitle(c) || u_ispunct(c) ) {
-	//next 'word' starts with more punctuation or with uppercase
+    UChar c = tokens[i].us[0]; 
+    if ( c == '.' ) {	
+      if (i + 1 == tokens.size() ) {	//No next character? 
+	is_eos = true; //Newline after period
+      }
+      else {
+	UChar c = tokens[i+1].us[0]; 
+	UBlockCode s = ublock_getCode(c);
+	//test for languages that distinguish case
+	if ((s == UBLOCK_BASIC_LATIN) || (s == UBLOCK_GREEK) ||
+	    (s == UBLOCK_CYRILLIC) || (s == UBLOCK_GEORGIAN) ||
+	    (s == UBLOCK_ARMENIAN) || (s == UBLOCK_DESERET)) { 
+	  if ( u_isupper(c) || u_istitle(c) || u_ispunct(c) ) {
+	    //next 'word' starts with more punctuation or with uppercase
+	    is_eos = true;
+	  }
+	} 
+	else {
+	  // just normal ASCII punctuation
+	  is_eos = true;
+	}
+      }
+    }
+    else { //no period
+      //Check for other EOS markers
+      if ( !detectQuotes && 
+	   ( c == '\'' || c == '`' || UnicodeString(c) == "’" 
+	     || UnicodeString(c) == "‘" || c == '"' || UnicodeString(c) == "”" 
+	     || UnicodeString(c) == "“" )
+	   && (i + 1 == tokens.size() ) ) {	//No next character? 
+	is_eos = true; //Newline after single quote
+      }
+      else if ( eosmarkers.indexOf( c ) >= 0 ){
 	is_eos = true;
       }
-    } else {
-      // just normal ASCII punctuation
-      is_eos = true;
     }
     return is_eos;
   }
-
-  void TokenizerClass::detectQuoteBounds( const int i, const UChar c ) {
+  
+  void TokenizerClass::detectQuoteBounds( const int i ) {
+    UChar c = tokens[i].us[0]; 
     //Detect Quotation marks
     if ((c == '"') || ( UnicodeString(c) == """) ) {
       if (tokDebug > 1 )
@@ -970,78 +963,114 @@ namespace Tokenizer {
 	if (tokDebug > 1 ) *Log(theErrLog) << "[detectQuoteBounds] Doesn't resolve, so assuming beginquote, pushing to stack for resolution later" << endl;
 	quotes.push( i, c );
       }
-    } else {
+    } 
+    else if ((c == '\'') ) {
+      if (tokDebug > 1 )
+	*Log(theErrLog) << "[detectQuoteBounds] Standard single-quote (ambiguous) found @i="<< i << endl;
+      if (!resolveQuote(i,c)) {
+	if (tokDebug > 1 ) *Log(theErrLog) << "[detectQuoteBounds] Doesn't resolve, so assuming beginquote, pushing to stack for resolution later" << endl;
+	quotes.push( i, c );
+      }
+    } 
+    else {
       UnicodeString close = quotes.lookupOpen( c );
       if ( !close.isEmpty() ){ // we have a opening quote
 	if (tokDebug > 1 ) *Log(theErrLog) << "[detectQuoteBounds] Opening quote found @i="<< i << ", pushing to stack for resultion later..." << endl;	      
 	quotes.push( i, c ); // remember it
-      } else {
+      } 
+      else {
 	UnicodeString open = quotes.lookupClose( c );	
 	if ( !open.isEmpty() ) { // we have a closeing quote
 	  if (tokDebug > 1 ) *Log(theErrLog) << "[detectQuoteBounds] Closing quote found @i="<< i << ", attempting to resolve..." << endl;	      
-	  if (!resolveQuote(i, open )) // resolve the matching opening
-	    if (tokDebug > 1 ) *Log(theErrLog) << "[detectQuoteBounds] Unable to resolve" << endl;	      
-	  //} else if (UnicodeString(c) == "―") {
-	  //TODO: Implement
+	  if (!resolveQuote(i, open )) { // resolve the matching opening
+	    if (tokDebug > 1 ) {
+	      *Log(theErrLog) << "[detectQuoteBounds] Unable to resolve" << endl;	      
+	    } 
+	  }
 	}
-
       }
     }
   }
-  
+
+  bool isClosing( const Token& tok ){
+    if ( tok.us.length() == 1 &&
+	 ( tok.us[0] == ')' || tok.us[0] == '}' 
+	   || tok.us[0] == ']' || tok.us[0] == '>' ) )
+      return true;
+    return false;
+  }
+
   void TokenizerClass::detectSentenceBounds( const int offset ){
     //find sentences
     const int size = tokens.size();    
-          
-    if (sentenceperlineinput) {
-      tokens[offset].role |= BEGINOFSENTENCE;
-      tokens[size - 1].role |= ENDOFSENTENCE;
-    }
-    
     for (int i = offset; i < size; i++) {
-      if ((offset == 0) && (sentencesignal)) {
-	tokens[i].role |= BEGINOFSENTENCE;
-	sentencesignal = false;
-      }
       if (tokDebug > 1 )
 	*Log(theErrLog) << "[detectSentenceBounds] i="<< i << " word=[" 
-			<< folia::UnicodeToUTF8( tokens[i].us ) 
-			<<"] role=" << tokens[i].role << endl;
-      if ( tokens[i].type->startsWith( "PUNCTUATION") ) { //TODO: make comparison more efficient?
-	UChar c = tokens[i].us[0]; 
-	bool is_eos = false;
-	if (c == '.') {	
-	  if (i + 1 == size) {	//No next character? 
-	    is_eos = true; //Newline after period
-	  } else {
-	    // check next token for eos
-	    is_eos = checkEos(tokens[i+1].us[0] );
-	  }
-	} else { //no period
-	  //Check for other EOS markers
-	  if ( eosmarkers.indexOf( c ) >= 0 )
-	    is_eos = true;
+			<< tokens[i].us
+			<< "] type=" << *tokens[i].type
+			<< ", role=" << tokens[i].role << endl;
+      if ( tokens[i].type->startsWith("PUNCTUATION") ) {
+	// we have some kind of punctuation. Does it mark an eos?
+	bool is_eos = detectEos( i );
+	if (is_eos) {
+	  if ((tokDebug > 1 )) 
+	    *Log(theErrLog) << "[detectSentenceBounds] EOS FOUND @i=" << i << endl;
+	  tokens[i].role |= ENDOFSENTENCE;
+	  //if this is the end of the sentence, the next token is the beginning of a new one
+	  if ((i + 1 < size) && !(tokens[i+1].role & BEGINOFSENTENCE))
+	    tokens[i+1].role |= BEGINOFSENTENCE;
+	  //if previous token is EOS and not BOS, it will stop being EOS, as this one will take its place
+	  if ((i > 0) && (tokens[i-1].role & ENDOFSENTENCE) && !(tokens[i-1].role & BEGINOFSENTENCE) ) {
+	    tokens[i-1].role ^= ENDOFSENTENCE; 
+	    if (tokens[i].role & BEGINOFSENTENCE) {
+	      tokens[i].role ^= BEGINOFSENTENCE;
+	    }
+	  }   		
 	}
+	else if ( isClosing(tokens[i] ) ) {
+	  // we have a closing symbol
+	  if ((tokDebug > 1 )) 
+	    *Log(theErrLog) << "[detectSentenceBounds] Close FOUND @i=" << i << endl;
+	  //if previous token is EOS and not BOS, it will stop being EOS, as this one will take its place
+	  if ((i > 0) && (tokens[i-1].role & ENDOFSENTENCE) && !(tokens[i-1].role & BEGINOFSENTENCE) ) {
+	    tokens[i-1].role ^= ENDOFSENTENCE; 
+	    if (tokens[i].role & BEGINOFSENTENCE) {
+	      tokens[i].role ^= BEGINOFSENTENCE;
+	    }
+	  }   		
+	}
+      }
+    }
+  }    
+
+  void TokenizerClass::detectQuotedSentenceBounds( const int offset ){
+    //find sentences
+    const int size = tokens.size();    
+    for (int i = offset; i < size; i++) {
+      if (tokDebug > 1 )
+	*Log(theErrLog) << "[detectQuotedSentenceBounds] i="<< i << " word=[" 
+			<< tokens[i].us
+			<<"] role=" << tokens[i].role << endl;
+      if ( tokens[i].type->startsWith("PUNCTUATION") ) {
+	// we have some kind of punctuation. Does it mark an eos?
+	bool is_eos = detectEos( i );
 	if (is_eos) {
-	  if ((detectQuotes) && (!quotes.emptyStack())) {
-	    
+	  if ( !quotes.emptyStack() ) {
 	    if ((tokDebug > 1 )) 
-	      *Log(theErrLog) << "[detectSentenceBounds] Preliminary EOS FOUND @i=" << i << endl;
-	    
+	      *Log(theErrLog) << "[detectQuotedSentenceBounds] Preliminary EOS FOUND @i=" << i << endl;
 	    //if there are quotes on the stack, we set a temporary EOS marker, to be resolved later when full quote is found.
 	    tokens[i].role |= TEMPENDOFSENTENCE;
 	    //If previous token is also TEMPENDOFSENTENCE, it stops being so in favour of this one
 	    if ((i > 0) && (tokens[i-1].role & TEMPENDOFSENTENCE))
 	      tokens[i-1].role ^= TEMPENDOFSENTENCE;
-	  } else if (!sentenceperlineinput)  { //No quotes on stack (and no one-sentence-per-line input)
-	    sentencesignal = true;
+	  }
+	  else if (!sentenceperlineinput)  { //No quotes on stack (and no one-sentence-per-line input)
 	    if ((tokDebug > 1 )) 
-	      *Log(theErrLog) << "[detectSentenceBounds] EOS FOUND @i=" << i << endl;
+	      *Log(theErrLog) << "[detectQuotedSentenceBounds] EOS FOUND @i=" << i << endl;
 	    tokens[i].role |= ENDOFSENTENCE;
 	    //if this is the end of the sentence, the next token is the beginning of a new one
 	    if ((i + 1 < size) && !(tokens[i+1].role & BEGINOFSENTENCE))
 	      tokens[i+1].role |= BEGINOFSENTENCE;
-	    
 	    //if previous token is EOS and not BOS, it will stop being EOS, as this one will take its place
 	    if ((i > 0) && (tokens[i-1].role & ENDOFSENTENCE) && !(tokens[i-1].role & BEGINOFSENTENCE) ) {
 	      tokens[i-1].role ^= ENDOFSENTENCE; 
@@ -1050,10 +1079,21 @@ namespace Tokenizer {
 	      }
 	    }   		
 	  }	  	  
-	} else if (detectQuotes) {
-	  //check for other bounds
-	  detectQuoteBounds(i,c);
 	}
+	else if ( isClosing(tokens[i] ) ) {
+	  // we have a closing symbol
+	  if ((tokDebug > 1 )) 
+	    *Log(theErrLog) << "[detectSentenceBounds] Close FOUND @i=" << i << endl;
+	  //if previous token is EOS and not BOS, it will stop being EOS, as this one will take its place
+	  if ((i > 0) && (tokens[i-1].role & ENDOFSENTENCE) && !(tokens[i-1].role & BEGINOFSENTENCE) ) {
+	    tokens[i-1].role ^= ENDOFSENTENCE; 
+	    if (tokens[i].role & BEGINOFSENTENCE) {
+	      tokens[i].role ^= BEGINOFSENTENCE;
+	    }
+	  }   		
+	}
+	//check quotes
+	detectQuoteBounds(i);
       }
     }
   }    
@@ -1084,26 +1124,31 @@ namespace Tokenizer {
       
       if ( u_isspace(c)) {
 	if (tokDebug) *Log(theErrLog) << "[passthruLine] word=[" << word << "]" << endl;
-	if ( word == explicit_eos_marker ) {
+	if ( word == eosmark ) {
 	  word = "";
 	  if (!tokens.empty()) 
 	    tokens[tokens.size() - 1].role |= ENDOFSENTENCE;
 	  bos = true;
-	} else {        
+	} 
+	else {        
 	  const UnicodeString *type;
 	  if (alpha && !num && !punct) {
 	    type = &type_word;
-	  } else if (num && !alpha && !punct) {
+	  } 
+	  else if (num && !alpha && !punct) {
 	    type = &type_number;                
-	  } else if (punct && !alpha && !num) {
+	  } 
+	  else if (punct && !alpha && !num) {
 	    type = &type_punctuation;                    
-	  } else {
+	  } 
+	  else {
 	    type = &type_unknown;                
 	  }
 	  if (bos) {
 	    tokens.push_back( Token( type, word , BEGINOFSENTENCE ) );
 	    bos = false;
-	  } else {
+	  } 
+	  else {
 	    tokens.push_back( Token( type, word ) );
 	  }
 	  alpha = false;
@@ -1111,19 +1156,22 @@ namespace Tokenizer {
 	  punct = false;
           word = "";
 	}
-      } else {
+      } 
+      else {
 	if ( u_isalpha(c)) {
 	  alpha = true;
-	} else if (u_ispunct(c)) {
+	} 
+	else if (u_ispunct(c)) {
 	  punct = true;
-	} else if (u_isdigit(c)) {
+	} 
+	else if (u_isdigit(c)) {
 	  num = true;
 	}            
 	word += c;
       }    
     }
     if (word != "") {
-      if ( word == explicit_eos_marker ) {
+      if ( word == eosmark ) {
 	word = "";
 	if (!tokens.empty()) 
 	  tokens[tokens.size() - 1].role |= ENDOFSENTENCE;
@@ -1132,17 +1180,21 @@ namespace Tokenizer {
 	const UnicodeString *type;
 	if (alpha && !num && !punct) {
 	  type = &type_word;
-	} else if (num && !alpha && !punct) {
+	} 
+	else if (num && !alpha && !punct) {
 	  type = &type_number;                
-	} else if (punct && !alpha && !num) {
+	} 
+	else if (punct && !alpha && !num) {
 	  type = &type_punctuation;                    
-	} else {
+	}
+	else {
 	  type = &type_unknown;                
 	}
 	if (bos) {
 	  tokens.push_back( Token( type, word , BEGINOFSENTENCE ) );
 	  bos = false;
-	} else {
+	}
+	else {
 	  tokens.push_back( Token( type, word ) );
 	}
       }   
@@ -1153,8 +1205,23 @@ namespace Tokenizer {
     }
   }
 
+  string TokenizerClass::checkBOM( const string& s, string& enc ){
+    UErrorCode err = U_ZERO_ERROR;
+    int32_t bomLength = 0;
+    const char *encoding = ucnv_detectUnicodeSignature( s.c_str(),s.length(),
+							&bomLength, &err);
+    if ( bomLength ){
+      enc = encoding;
+      if (tokDebug) 
+	*Log(theErrLog) << "Autdetected encoding: " << enc << endl;
+      return s.substr( bomLength );
+    }
+    return s;
+  }
+  
   // string wrapper
-  int TokenizerClass::tokenizeLine( const string& s ){
+  int TokenizerClass::tokenizeLine( const string& in_s ){
+    string s = checkBOM( in_s, inputEncoding );
     UnicodeString uinputstring;
     try {
       uinputstring = UnicodeString( s.c_str(), s.length(), inputEncoding.c_str() );
@@ -1174,108 +1241,126 @@ namespace Tokenizer {
   int TokenizerClass::tokenizeLine( const UnicodeString& originput ){ 
     if (tokDebug) 
       *Log(theErrLog) << "[tokenizeLine] input: line=[" 
-		      << folia::UnicodeToUTF8( originput ) << "]" << endl;
+		      << originput << "]" << endl;
     UnicodeString input = normalizer.normalize( originput );
     if ( doFilter ){
       input = filter.filter( input );
     }
+    if ( input.isBogus() ){ //only tokenize valid input
+      *theErrLog << "ERROR: Invalid UTF-8 in line!:" << input << endl;
+      return 0;
+    }
     if (tokDebug) 
       *Log(theErrLog) << "[tokenizeLine] filtered input: line=[" 
-		      << folia::UnicodeToUTF8( input ) << "]" << endl;
+		      << input << "]" << endl;
     const int begintokencount = tokens.size();    
     if (tokDebug) *Log(theErrLog) << "[tokenizeLine] Tokens still in buffer: " << begintokencount << endl;
 
-    if ( !input.isBogus() ){ //only tokenize valid input
-      bool tokenizeword = false;
-      bool reset = false;
-      //iterate over all characters
-      UnicodeString word;
-
-      for ( int i=0; i < input.length(); ++i ) {
-	UChar c = input[i];
-	if (reset) { //reset values for new word
-	  reset = false;
-	  if (!u_isspace(c)) word = c; else word = "";
-	  tokenizeword = false;
-	} else {
-	  if (!u_isspace(c)) word += c;
-	}
-	if ( u_isspace(c) || i == input.length()-1 ){
-	  if (tokDebug)
-	    *Log(theErrLog) << "[tokenizeLine] space detected, word=[" 
-			    << folia::UnicodeToUTF8( word ) << "]" << endl;
-	  if ( i == input.length()-1 ) {
-	    if ( u_ispunct(c) || u_isdigit(c)) tokenizeword = true; 
-	  } else { // isspace
-	    //word.remove(word.length()-1);
-	  }
-	  int expliciteosfound = -1;
-	  if ( word.length() >= explicit_eos_marker.length() ) {
-	    expliciteosfound = word.lastIndexOf(explicit_eos_marker);		    
-	        
-	    if (expliciteosfound != -1) { //( word == explicit_eos_marker ) {
+    bool tokenizeword = false;
+    bool reset = false;
+    //iterate over all characters
+    UnicodeString word;
+    
+    for ( int i=0; i < input.length(); ++i ) {
+      UChar c = input[i];
+      if (reset) { //reset values for new word
+	reset = false;
+	if (!u_isspace(c)) word = c; else word = "";
+	tokenizeword = false;
+      } 
+      else {
+	if (!u_isspace(c)) word += c;
+      }
+      if ( u_isspace(c) || i == input.length()-1 ){
+	if (tokDebug)
+	  *Log(theErrLog) << "[tokenizeLine] space detected, word=[" 
+			  << word << "]" << endl;
+	if ( i == input.length()-1 ) {
+	  if ( u_ispunct(c) || u_isdigit(c)) tokenizeword = true; 
+	} 
+	int expliciteosfound = -1;
+	if ( word.length() >= eosmark.length() ) {
+	  expliciteosfound = word.lastIndexOf(eosmark);
+	  
+	  if (expliciteosfound != -1) { // word contains eosmark
+	    if (tokDebug >= 2) 
+	      *Log(theErrLog) << "[tokenizeLine] Found explicit EOS marker @"<<expliciteosfound << endl;
+	    int eospos = tokens.size()-1;
+	    if (expliciteosfound > 0) {		    		    
+	      UnicodeString realword;		    
+	      word.extract(0,expliciteosfound,realword);
+	      if (tokDebug >= 2)
+		*Log(theErrLog) << "[tokenizeLine] Prefix before EOS: "
+				<< realword << endl;
+	      tokenizeWord( realword, false );
+	      eospos++;
+	    }
+	    if (expliciteosfound + eosmark.length() < word.length())  {
+	      UnicodeString realword;		    
+	      word.extract(expliciteosfound+eosmark.length(),word.length() - expliciteosfound - eosmark.length(),realword);
 	      if (tokDebug >= 2) 
-		*Log(theErrLog) << "[tokenizeLine] Found explicit EOS marker @"<<expliciteosfound << endl;
-	      if (expliciteosfound > 0) {		    		    
-		UnicodeString realword;		    
-		word.extract(0,explicit_eos_marker.length(),realword);
-		if (tokDebug >= 2)
-		  *Log(theErrLog) << "[tokenizeLine] Prefix before EOS: "
-				  << folia::UnicodeToUTF8( realword ) << endl;
-		tokenizeWord( realword, false );
-	      }
-	      if (expliciteosfound + explicit_eos_marker.length() < word.length())  {
-		UnicodeString realword;		    
-		word.extract(expliciteosfound+explicit_eos_marker.length(),word.length() - expliciteosfound - explicit_eos_marker.length(),realword);
-		if (tokDebug >= 2) 
-		  *Log(theErrLog) << "[tokenizeLine] Prefix after EOS: "
-				  << folia::UnicodeToUTF8( realword ) << endl;
-		tokenizeWord( realword, true );
-	      }
-	      if (!tokens.empty()) {
-		if (tokDebug >= 2) 
-		  *Log(theErrLog) << "[tokenizeLine] Assigned EOS" << endl;
-		tokens[tokens.size() - 1].role |= ENDOFSENTENCE;
-	      }			
+		*Log(theErrLog) << "[tokenizeLine] postfix after EOS: "
+				<< realword << endl;
+	      tokenizeWord( realword, true );
 	    }
-	  }			    
-	  if ((word.length() > 0) && (expliciteosfound == -1)) {	    
-	    if (!tokenizeword) {	      
-	      //single character or nothing tokenisable found, so no need to tokenize anything
-	      if (tokDebug >= 2)
-		*Log(theErrLog) << "[tokenizeLine] Word ok, no need for further tokenisation for: ["
-				<< folia::UnicodeToUTF8( word ) << "]" << endl;;
-	      tokens.push_back( Token( &type_word, word ) );
-	    } else {
-	      if (tokDebug >= 2)
-		*Log(theErrLog) << "[tokenizeLine] Further tokenisation necessary for: [" 
-				<< folia::UnicodeToUTF8( word ) << "]" << endl;
-	      tokenizeWord( word, true );            
-	    } 
+	    if ( !tokens.empty() && eospos >= 0 ) {
+	      if (tokDebug >= 2) 
+		*Log(theErrLog) << "[tokenizeLine] Assigned EOS" << endl;
+	      tokens[eospos].role |= ENDOFSENTENCE;
+	    }			
 	  }
-	  //reset values for new word
-	  reset = true;        
-	} else if ( u_ispunct(c) || u_isdigit(c)) {
-	  if (tokDebug) 
-	    *Log(theErrLog) << "[tokenizeLine] punctuation or digit detected, word=[" 
-			    << folia::UnicodeToUTF8( word ) << "]" << endl;
-	  
-	  //there is punctuation or digits in this word, mark to run through tokeniser
-	  tokenizeword = true; 
+	}			    
+	if ((word.length() > 0) && (expliciteosfound == -1)) {	    
+	  if (!tokenizeword) {	      
+	    //single character or nothing tokenisable found, so no need to tokenize anything
+	    if (tokDebug >= 2)
+	      *Log(theErrLog) << "[tokenizeLine] Word ok, no need for further tokenisation for: ["
+			      << word << "]" << endl;;
+	    tokens.push_back( Token( &type_word, word ) );
+	  } 
+	  else {
+	    if (tokDebug >= 2)
+	      *Log(theErrLog) << "[tokenizeLine] Further tokenisation necessary for: [" 
+			      << word << "]" << endl;
+	    tokenizeWord( word, true );            
+	  } 
 	}
-      }        	
-    } else {
-      //ELSE print error message
-      *theErrLog << "ERROR: Invalid UTF-8 in line!" << endl;
-    }
+	//reset values for new word
+	reset = true;        
+      }
+      else if ( u_ispunct(c) || u_isdigit(c)) {
+	if (tokDebug) 
+	  *Log(theErrLog) << "[tokenizeLine] punctuation or digit detected, word=[" 
+			  << word << "]" << endl;
+	
+	//there is punctuation or digits in this word, mark to run through tokeniser
+	tokenizeword = true; 
+      }
+    }        	
     int numNewTokens = tokens.size() - begintokencount;
     if ( numNewTokens > 0 ){
       if (paragraphsignal) {
 	tokens[begintokencount].role |= NEWPARAGRAPH | BEGINOFSENTENCE;
 	paragraphsignal = false;
       }
-      if ( detectBounds )
-	detectSentenceBounds( begintokencount );  //find sentence boundaries
+      if ( detectBounds ){
+	//find sentence boundaries
+	if (sentenceperlineinput) {
+	  tokens[begintokencount].role |= BEGINOFSENTENCE;
+	  tokens[tokens.size() - 1].role |= ENDOFSENTENCE;
+	  if ( detectQuotes ){
+	    detectQuotedSentenceBounds( begintokencount ); 
+	  }
+	}
+	else {
+	  if ( detectQuotes ){
+	    detectQuotedSentenceBounds( begintokencount ); 
+	  }
+	  else {
+	    detectSentenceBounds( begintokencount );
+	  }
+	}
+      }
     }
     return numNewTokens;
   }
@@ -1306,15 +1391,16 @@ namespace Tokenizer {
   void TokenizerClass::tokenizeWord( const UnicodeString& input, bool space ) {
     if ( tokDebug > 2 )
       *Log(theErrLog) << "   [tokenizeWord] Input: (" << input.length() << ") "
-		      << "word=[" << folia::UnicodeToUTF8( input ) << "]" << endl;
-    if ( input == explicit_eos_marker ) {
+		      << "word=[" << input << "]" << endl;
+    if ( input == eosmark ) {
       if (tokDebug >= 2)
 	*Log(theErrLog) << "   [tokenizeWord] Found explicit EOS marker" << endl;
       if (!tokens.empty()) {
 	if (tokDebug >= 2) 
 	  *Log(theErrLog) << "   [tokenizeWord] Assigned EOS" << endl;
 	tokens[tokens.size() - 1].role |= ENDOFSENTENCE;
-      } else {
+      } 
+      else {
 	*Log(theErrLog) << "[WARNING] Found explicit EOS marker by itself, this will have no effect!" << endl; 
       }
       return;
@@ -1328,39 +1414,47 @@ namespace Tokenizer {
       if ( u_ispunct(c)) {
 	if (  u_charType( c ) == U_CURRENCY_SYMBOL ) {
 	  type = &type_currency;
-	} else {
+	} 
+	else {
 	  type = &type_punctuation;
 	}
-      } else if ( u_isalpha(c)) {
+      } 
+      else if ( u_isalpha(c)) {
 	type = &type_word;
-      } else if ( u_isdigit(c)) {
+      }
+      else if ( u_isdigit(c)) {
 	type = &type_number;
-      } else if ( u_isspace(c)) {
+      }
+      else if ( u_isspace(c)) {
 	return;
-      } else {
+      }
+      else {
 	if (  u_charType( c ) == U_CURRENCY_SYMBOL ) {
 	  type = &type_currency;
-	} else {
+	} 
+	else {
 	  type = &type_unknown;
 	}
       } 
-      tokens.push_back( Token( type, input, space ? NOROLE : NOSPACE ) ); 
-    } else {
+      Token T( type, input, space ? NOROLE : NOSPACE );
+      tokens.push_back( T );
+      if (tokDebug >= 2) 
+	*Log(theErrLog) << "   [tokenizeWord] added token " << T << endl;
+    }
+    else {
       for ( unsigned int i = 0; i < rules.size(); i++) {
 	if ( tokDebug >= 4)
-	  *Log(theErrLog) << "\tTESTING " << folia::UnicodeToUTF8( rules[i]->id )
-			  << endl;
+	  *Log(theErrLog) << "\tTESTING " << rules[i]->id << endl;
 	//Find first matching rule
 	UnicodeString pre, post;
 	vector<UnicodeString> matches;
 	if ( rules[i]->matchAll( input, pre, post, matches ) ){
 	  if ( tokDebug >= 4 )
-	    *Log(theErrLog) << "\tMATCH: " << folia::UnicodeToUTF8( rules[i]->id )
-			    << endl;    
+	    *Log(theErrLog) << "\tMATCH: " << rules[i]->id << endl;    
 	  if ( pre.length() > 0 ){
 	    if ( tokDebug >= 4 ){
 	      *Log(theErrLog) << "\tTOKEN pre-context (" << pre.length() 
-			      << "): [" << folia::UnicodeToUTF8( pre ) << "]" << endl;
+			      << "): [" << pre << "]" << endl;
 	    }
 	    tokenizeWord( pre, false ); //pre-context, no space after
 	  }
@@ -1371,7 +1465,7 @@ namespace Tokenizer {
 	    for ( int m=0; m < max; ++m ){
 	      if ( tokDebug >= 4 )
 		*Log(theErrLog) << "\tTOKEN match[" << m << "] = " 
-				<< folia::UnicodeToUTF8( matches[m] )<< endl; 
+				<< matches[m] << endl; 
 	      if ( post.length() > 0 ) space = false;
 	      tokens.push_back( Token( &rules[i]->id, matches[m], space ? NOROLE : NOSPACE ) );
 	    }
@@ -1381,7 +1475,7 @@ namespace Tokenizer {
 	  if ( post.length() > 0 ){
 	    if ( tokDebug >= 4 ){
 	      *Log(theErrLog) << "\tTOKEN post-context (" << post.length() 
-			      << "): [" << folia::UnicodeToUTF8(post) << "]" << endl;
+			      << "): [" << post << "]" << endl;
 	    }
 	    tokenizeWord( post, space ? NOROLE : NOSPACE );
 	  }
@@ -1528,33 +1622,69 @@ namespace Tokenizer {
     return true;
   }
   
+  bool TokenizerClass::readabbreviations( const string& fname,
+					  UnicodeString& abbrev_pattern ) {
+    if ( tokDebug > 0 )
+      *theErrLog << "%include " << fname << endl;
+    ifstream f(fname.c_str());
+    if ( !f ){
+      return false;
+    }    
+    else {
+      string rawline;
+      while ( getline(f,rawline) ){
+	UnicodeString line = folia::UTF8ToUnicode(rawline);
+	line.trim();
+	if ((line.length() > 0) && (line[0] != '#')) {
+	  if ( tokDebug >= 5 )
+	    *theErrLog << "include line = " << rawline << endl;
+	  if (!abbrev_pattern.isEmpty()) abbrev_pattern += '|';
+	  abbrev_pattern += line;
+	}
+      }
+    }
+    return true;
+  }
+  
   ConfigMode getMode( const UnicodeString& line ) {
     ConfigMode mode = NONE;
     if (line == "[RULES]") {
       mode = RULES;
-    } else if (line == "[RULE-ORDER]") {
+    } 
+    else if (line == "[RULE-ORDER]") {
       mode = RULEORDER;
-    } else if (line == "[ABBREVIATIONS]") {
+    } 
+    else if (line == "[ABBREVIATIONS]") {
       mode = ABBREVIATIONS;
-    } else if (line == "[ATTACHEDPREFIXES]") {
+    }
+    else if (line == "[ATTACHEDPREFIXES]") {
       mode = ATTACHEDPREFIXES;
-    } else if (line == "[ATTACHEDSUFFIXES]") {
+    }
+    else if (line == "[ATTACHEDSUFFIXES]") {
       mode = ATTACHEDSUFFIXES;
-    } else if (line == "[PREFIXES]") {
+    }
+    else if (line == "[PREFIXES]") {
       mode = PREFIXES;
-    } else if (line == "[SUFFIXES]") {
+    }
+    else if (line == "[SUFFIXES]") {
       mode = SUFFIXES;
-    } else if (line == "[TOKENS]") {
+    }
+    else if (line == "[TOKENS]") {
       mode = TOKENS;
-    } else if (line == "[UNITS]") {
+    } 
+    else if (line == "[UNITS]") {
       mode = UNITS;
-    } else if (line == "[ORDINALS]") {
+    }
+    else if (line == "[ORDINALS]") {
       mode = ORDINALS;
-    } else if (line == "[EOSMARKERS]") {
+    }
+    else if (line == "[EOSMARKERS]") {
       mode = EOSMARKERS;
-    } else if (line == "[QUOTES]") {
+    }
+    else if (line == "[QUOTES]") {
       mode = QUOTES;
-    } else if (line == "[FILTER]") {
+    }
+    else if (line == "[FILTER]") {
       mode = FILTER;
     }
     return mode;
@@ -1594,13 +1724,13 @@ namespace Tokenizer {
 	}
 	if ( !found ){
 	  *Log(theErrLog) << "RULE-ORDER specified for undefined RULE '" 
-			  << folia::UnicodeToUTF8( sort[i] ) << "'" << endl;
+			  << sort[i] << "'" << endl;
 	}
       }
       vector<Rule*>::iterator it = rules.begin();
       while ( it != rules.end() ){
-	*Log(theErrLog) << "NU RULE-ORDER specified for RULE '" 
-			<< folia::UnicodeToUTF8((*it)->id) << "'" << endl;
+	*Log(theErrLog) << "No RULE-ORDER specified for RULE '" 
+			<< (*it)->id << "' (put at end)." << endl;
 	result.push_back( *it );
 	++it;
       }
@@ -1676,6 +1806,13 @@ namespace Tokenizer {
 	      throw uConfigError( "%include '" + file + "' failed" );
 	  }
 	    break;
+	  case ABBREVIATIONS:{
+	    string file = rawline.substr( 9 );
+	    file = confdir + file + ".abr";
+	    if ( !readabbreviations( file, abbrev_pattern ) )
+	      throw uConfigError( "%include '" + file + "' failed" );
+	  }
+	    break;
 	  default:
 	    throw uConfigError( string("%include not implemented for this section" ) );
 	  }
@@ -1831,13 +1968,15 @@ namespace Tokenizer {
       // was case insensitive, but seems a bad idea
     }
     if (!token_pattern.isEmpty()){
-      rules.insert(rules.begin(), new Rule("WORD-TOKEN", "^(" + token_pattern + ")(?:\\p{P}*)?$"));
+      //      rules.insert(rules.begin(), new Rule("WORD-TOKEN", "^(" + token_pattern + ")(?:\\p{P}*)?$"));
+      // removed ^ in front. this fixes bug94
+      rules.insert(rules.begin(), new Rule("WORD-TOKEN", "(" + token_pattern + ")(?:\\p{P}*)?$"));
     }    
     if (!withprefix_pattern.isEmpty()){
       rules.insert(rules.begin(), new Rule("WORD-WITHPREFIX", "(?:\\A|[^\\p{Lu}\\.]|[^\\p{Ll}\\.])(?:" + withprefix_pattern + ")\\p{L}+")); 
     }
     if (!withsuffix_pattern.isEmpty()){
-      rules.insert(rules.begin(), new Rule("WORD-WITHSUFFIX", "((?:\\p{Lu}|\\p{Ll})+(?:" + withsuffix_pattern + "))(?:\\Z|\\P{Lu}|\\P{Ll})")); 
+      rules.insert(rules.begin(), new Rule("WORD-WITHSUFFIX", "((?:\\p{Lu}|\\p{Ll})+(?:" + withsuffix_pattern + "))(?:\\Z|\\p{P})")); 
       // NB: (?:\\p{Lu}|\\p{Ll}) is used because of icu bug 8824
       //     see http://bugs.icu-project.org/trac/ticket/8824
       //     normally (?i) could be used in front and (\\p{L}) would do.
@@ -1846,10 +1985,8 @@ namespace Tokenizer {
       rules.insert(rules.begin(), new Rule("PREFIX", "(?:\\A|[^\\p{Lu}\\.]|[^\\p{Ll}\\.])(" + prefix_pattern + ")(\\p{L}+)")); 
     }
     if (!suffix_pattern.isEmpty()){
-      rules.insert(rules.begin(), new Rule("SUFFIX", "(\\p{Lu}|\\p{Ll}+)(" + suffix_pattern + ")(?:\\Z|\\P{L})")); 
+      rules.insert(rules.begin(), new Rule("SUFFIX", "((?:\\p{Lu}|\\p{Ll})+)(" + suffix_pattern + ")(?:\\Z|\\P{L})")); 
     }
-    //rules.insert(rules.begin(), new Rule("EOSMARKER", "(?:.*)?(" + *explicit_eos_marker + ")(?:.*)?")); 
-
     sortRules( rules, rules_order );
     return true;
   }
@@ -1866,7 +2003,7 @@ namespace Tokenizer {
       for ( size_t i=0; i < rules.size(); ++i ){
 	*Log(theErrLog) << "rule " << i << " " << *rules[i] << endl;
       }
-      *Log(theErrLog) << "EOS markers: " << folia::UnicodeToUTF8( eosmarkers ) << endl;
+      *Log(theErrLog) << "EOS markers: " << eosmarkers << endl;
       *Log(theErrLog) << "Quotations: " << quotes << endl;
       *Log(theErrLog) << "Filter: " << filter << endl;
     }
diff --git a/src/ucto.cxx b/src/ucto.cxx
index 2541f82..3234b5b 100644
--- a/src/ucto.cxx
+++ b/src/ucto.cxx
@@ -1,7 +1,7 @@
 /*
-  $Id: ucto.cxx 14472 2012-03-19 10:54:28Z sloot $
+  $Id: ucto.cxx 15910 2013-04-03 13:57:51Z sloot $
   $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/src/ucto.cxx $
-  Copyright (c) 1998 - 2012
+  Copyright (c) 1998 - 2013
   ILK  -  Tilburg University
   CNTS -  University of Antwerp
  
@@ -31,8 +31,8 @@
 #include <string>
 #include <iostream>
 #include <fstream>
+#include "libfolia/document.h"
 #include "ucto/tokenize.h"
-#include "libfolia/folia.h"
 #include <unistd.h>
 
 using namespace std;
@@ -63,6 +63,7 @@ void usage(){
        << "\t-F              - Input file is in FoLiA XML. All untokenised sentences will be tokenised." << endl
        << "\t-X              - Output FoLiA XML, use the Document ID specified with --id=" << endl
        << "\t--id <DocID>    - use the specified Document ID to label the FoLia doc." << endl
+       << "\t--textclass <class> - use the specified class to search text in the the FoLia doc." << endl
        << "\t                  (-x and -F disable usage of most other options: -nulPQVsS)" << endl;
 }
 
@@ -81,6 +82,7 @@ int main( int argc, char *argv[] ){
   bool verbose = false;
   string eosmarker = "<utt>";
   string docid = "untitleddoc";
+  string textclass = "current";
   string normalization = "NFC";
   string inputEncoding = "UTF-8";
   string cfile = "tokconfig-en";
@@ -91,6 +93,7 @@ int main( int argc, char *argv[] ){
 
   static struct option longOpts[] = { { "passthru", 0, 0, 1 },
 				      { "id", 1, 0, 2 },
+				      { "textclass", 1, 0, 3 },
 				      { 0,0,0,0} };
 
   int opt;
@@ -119,12 +122,13 @@ int main( int argc, char *argv[] ){
 	case 'm': sentenceperlineinput = true; break;
 	case 'N': normalization = optarg; break;
 	case 'v': verbose = true; break;
-	case 'V': cout << "Ucto - Unicode Tokenizer - version " << Version() << endl << "(c) ILK 2009 - 2012, Induction of Linguistic Knowledge Research Group, Tilburg University" << endl << "Licensed under the GNU General Public License v3" << endl; 
+	case 'V': cout << "Ucto - Unicode Tokenizer - version " << Version() << endl << "(c) ILK 2009 - 2013, Induction of Linguistic Knowledge Research Group, Tilburg University" << endl << "Licensed under the GNU General Public License v3" << endl; 
 	  cout << "based on [" << folia::VersionName() << "]" << endl;
 	  return EXIT_SUCCESS;
 	case 'x': xmlout = true; docid = optarg; break;
 	case 'X': xmlout = true; break;
 	case 2: docid = optarg; break;
+	case 3: textclass = optarg; break;
 	default: usage(); return EXIT_SUCCESS;
 	}
     }
@@ -211,6 +215,7 @@ int main( int argc, char *argv[] ){
     tokenizer.setNormalization( normalization );
     tokenizer.setInputEncoding( inputEncoding );
     tokenizer.setFiltering(dofiltering);
+    tokenizer.setTextClass(textclass);
     tokenizer.setXMLOutput(xmlout, docid);
     tokenizer.setXMLInput(xmlin);
 
diff --git a/src/unicode.cxx b/src/unicode.cxx
index 0c86a27..d096607 100644
--- a/src/unicode.cxx
+++ b/src/unicode.cxx
@@ -1,7 +1,7 @@
 /*
-  $Id: unicode.cxx 13842 2012-01-02 16:32:58Z sloot $
+  $Id: unicode.cxx 15910 2013-04-03 13:57:51Z sloot $
   $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/src/unicode.cxx $
-  Copyright (c) 1998 - 2012
+  Copyright (c) 1998 - 2013
   ILK  -  Tilburg University
   CNTS -  University of Antwerp
  
diff --git a/tests/Makefile.in b/tests/Makefile.in
index 8bfea14..4f27663 100644
--- a/tests/Makefile.in
+++ b/tests/Makefile.in
@@ -1,9 +1,9 @@
-# Makefile.in generated by automake 1.11.1 from Makefile.am.
+# Makefile.in generated by automake 1.11.3 from Makefile.am.
 # @configure_input@
 
 # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
-# 2003, 2004, 2005, 2006, 2007, 2008, 2009  Free Software Foundation,
-# Inc.
+# 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
+# Foundation, Inc.
 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
@@ -39,11 +39,11 @@ host_triplet = @host@
 subdir = tests
 DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
-am__aclocal_m4_deps = $(top_srcdir)/m4/ac_define_dir.m4 \
-	$(top_srcdir)/m4/ax_icu_check.m4 $(top_srcdir)/m4/libtool.m4 \
-	$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
-	$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
-	$(top_srcdir)/m4/pkg.m4 $(top_srcdir)/configure.ac
+am__aclocal_m4_deps = $(top_srcdir)/m4/ax_icu_check.m4 \
+	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
+	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
+	$(top_srcdir)/m4/lt~obsolete.m4 $(top_srcdir)/m4/pkg.m4 \
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -129,7 +129,7 @@ SED = @SED@
 SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
 STRIP = @STRIP@
-SYSCONF_PATH = @SYSCONF_PATH@
+SYSCONFDIR = @SYSCONFDIR@
 VERSION = @VERSION@
 XML2_CFLAGS = @XML2_CFLAGS@
 XML2_LIBS = @XML2_LIBS@
@@ -185,6 +185,8 @@ sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
 sysconfdir = @sysconfdir@
 target_alias = @target_alias@
+ticcutils_CFLAGS = @ticcutils_CFLAGS@
+ticcutils_LIBS = @ticcutils_LIBS@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
@@ -279,10 +281,15 @@ install-am: all-am
 
 installcheck: installcheck-am
 install-strip:
-	$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
-	  install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
-	  `test -z '$(STRIP)' || \
-	    echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
 mostlyclean-generic:
 
 clean-generic:

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/ucto.git



More information about the debian-science-commits mailing list