[ucto] 01/01: Imported Upstream version 0.8.0
Joost van Baal
joostvb at moszumanska.debian.org
Sun Jul 5 04:54:20 UTC 2015
This is an automated email from the git hooks/post-receive script.
joostvb pushed a commit to annotated tag upstream/0.8.0
in repository ucto.
commit 698c22bbeddfdba10353a50618c363378ab919a9
Author: Joost van Baal-Ilić <joostvb at nusku.mdcc.cx>
Date: Sun Jul 5 06:54:04 2015 +0200
Imported Upstream version 0.8.0
---
ChangeLog | 69 +++++++++++++++++++++++
NEWS | 20 +++++++
config/Makefile.am | 4 +-
config/Makefile.in | 4 +-
config/tokconfig-de | 6 +-
config/tokconfig-en | 6 +-
config/tokconfig-es | 10 ++--
config/tokconfig-fr | 10 ++--
config/tokconfig-fy | 4 +-
config/tokconfig-generic | 10 ++--
config/tokconfig-it | 12 +++-
config/tokconfig-nl | 6 +-
config/tokconfig-nl-twitter | 12 ++--
config/tokconfig-pt | 10 ++--
config/{tokconfig-generic => tokconfig-ru} | 68 +++++++++++++----------
config/tokconfig-sv | 9 +++
configure | 20 +++----
configure.ac | 4 +-
docs/ucto.1 | 89 ++++++++++++++++--------------
include/ucto/tokenize.h | 6 +-
include/ucto/unicode.h | 4 +-
src/tokenize.cxx | 38 ++++++++++++-
src/ucto.cxx | 4 +-
src/unicode.cxx | 4 +-
24 files changed, 303 insertions(+), 126 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 4a11737..9e322f1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,72 @@
+2015-01-29 10:10 sloot
+
+ * [r18006] src/tokenize.cxx, src/ucto.cxx, src/unicode.cxx: 2015
+
+2015-01-23 13:19 mvgompel
+
+ * [r17995] src/tokenize.cxx: allatonce enabled by default for
+ tokenize() to folia doc
+
+2015-01-21 14:51 sloot
+
+ * [r17993] include/ucto/tokenize.h, include/ucto/unicode.h: 2015
+
+2015-01-09 15:15 mvgompel
+
+ * [r17959] include/ucto/tokenize.h, src/tokenize.cxx: added new
+ tokenize(string,string) meta-function for the API
+
+2015-01-07 16:05 mvgompel
+
+ * [r17958] config/tokconfig-nl-twitter: added FRACNUMBER too
+
+2015-01-07 16:04 mvgompel
+
+ * [r17957] config/tokconfig-de, config/tokconfig-en,
+ config/tokconfig-es, config/tokconfig-fr, config/tokconfig-fy,
+ config/tokconfig-generic, config/tokconfig-it,
+ config/tokconfig-nl, config/tokconfig-nl-twitter,
+ config/tokconfig-pt, config/tokconfig-ru, config/tokconfig-sv,
+ config/tokconfig-tr: fixing date rules and adding FRACNUMBER
+
+2015-01-07 15:32 mvgompel
+
+ * [r17956] config/tokconfig-nl-twitter: <Flo> zou je ook nog een
+ variant dd/mm toe kunnen voegen?
+
+2015-01-07 15:22 mvgompel
+
+ * [r17955] config/tokconfig-nl-twitter: handle dates with slashes
+ in twitter: < proycon> doen wij datums niet met hyphens officieel
+ in het Nederlads? <Flo>| heeft twitter maling aan ;)
+
+2014-12-13 22:52 mvgompel
+
+ * [r17934] config/tokconfig-ru: fix
+
+2014-12-13 22:42 mvgompel
+
+ * [r17933] config/Makefile.am, config/tokconfig-ru: added Russian
+
+2014-12-02 15:40 sloot
+
+ * [r17908] docs/ucto.1: fixed hyphens according to
+ /usr/share/doc/groff-base/README.Debian
+
+2014-12-02 15:34 sloot
+
+ * [r17907] docs/ucto.1: fixed hyphens according to
+ /usr/share/doc/groff-base/README.Debian
+
+2014-11-26 17:11 sloot
+
+ * [r17873] src/tokenize.cxx: fixed terrible bug,
+ also some code-cleanup
+
+2014-11-26 16:26 sloot
+
+ * [r17872] configure.ac: bumped version
+
2014-11-26 16:17 sloot
* [r17871] configure.ac, src/tokenize.cxx, tests/testfolia.ok,
diff --git a/NEWS b/NEWS
index 67b2768..be75fe9 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,23 @@
+0.8.0 2015-01-29
+[Ko van der Sloot]
+* next release
+[Maarten van Gompel]
+* added new tokenize(string,string) meta-function for the API
+* allatonce enabled by default for tokenize() to folia doc
+* fixing date rules and adding FRACNUMBER
+* added Russian
+* Adicionei regras para tokenização portuguesa.
+[Antal vd Bosch]
+* added RK to dutch abbrev.
+
+0.7.0 2014-11-26
+[Ko van der Sloot]
+* unofficial release
+* experimental PUNCTUATION filter
+* bug fixes
+[Maarten van Gompel]
+* reduced memory usage
+
0.6.0 2014-09-23
[Ko van der Sloot]
* release
diff --git a/config/Makefile.am b/config/Makefile.am
index 6f17afd..3366550 100644
--- a/config/Makefile.am
+++ b/config/Makefile.am
@@ -1,8 +1,8 @@
-# $Id: Makefile.am 17387 2014-06-25 10:36:49Z mvgompel $
+# $Id: Makefile.am 17933 2014-12-13 22:42:56Z mvgompel $
# $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/config/Makefile.am $
config_DATA = tokconfig-en tokconfig-nl tokconfig-fr tokconfig-it tokconfig-es tokconfig-pt \
- tokconfig-de tokconfig-sv tokconfig-nl-twitter tokconfig-nl-sonarchat tokconfig-fy \
+ tokconfig-de tokconfig-sv tokconfig-nl-twitter tokconfig-nl-sonarchat tokconfig-fy tokconfig-ru \
tokconfig-generic url.rule e-mail.rule smiley.rule \
ligatures.filter standard-quotes.quote \
exotic-quotes.quote standard-eos.eos exotic-eos.eos nl_afk.abr es.abr pt.abr
diff --git a/config/Makefile.in b/config/Makefile.in
index be5b46f..51243d0 100644
--- a/config/Makefile.in
+++ b/config/Makefile.in
@@ -14,7 +14,7 @@
@SET_MAKE@
-# $Id: Makefile.am 17387 2014-06-25 10:36:49Z mvgompel $
+# $Id: Makefile.am 17933 2014-12-13 22:42:56Z mvgompel $
# $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/config/Makefile.am $
VPATH = @srcdir@
@@ -283,7 +283,7 @@ top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
config_DATA = tokconfig-en tokconfig-nl tokconfig-fr tokconfig-it tokconfig-es tokconfig-pt \
- tokconfig-de tokconfig-sv tokconfig-nl-twitter tokconfig-nl-sonarchat tokconfig-fy \
+ tokconfig-de tokconfig-sv tokconfig-nl-twitter tokconfig-nl-sonarchat tokconfig-fy tokconfig-ru \
tokconfig-generic url.rule e-mail.rule smiley.rule \
ligatures.filter standard-quotes.quote \
exotic-quotes.quote standard-eos.eos exotic-eos.eos nl_afk.abr es.abr pt.abr
diff --git a/config/tokconfig-de b/config/tokconfig-de
index 3183ab6..7054c4c 100644
--- a/config/tokconfig-de
+++ b/config/tokconfig-de
@@ -1,8 +1,8 @@
[RULE-ORDER]
TOKEN SUFFIX WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
-ABBREVIATION INITIALS INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+ABBREVIATION INITIALS INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
[RULES]
%include url
@@ -34,6 +34,8 @@ PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
DATE=\p{N}{1,2}[\.-]\p{Ps}?\p{N}{1,2}[\.-]\p{Ps}?\p{N}{2,4}
DATE-REVERSE=\p{N}{4}-\p{N}{1,2}[\.-]\p{N}{1,2}\.?
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
+
NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
#NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/config/tokconfig-en b/config/tokconfig-en
index aa125ea..818db3b 100644
--- a/config/tokconfig-en
+++ b/config/tokconfig-en
@@ -1,8 +1,8 @@
[RULE-ORDER]
SUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
-ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
[RULES]
%include url
@@ -31,6 +31,8 @@ PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
DATE=\p{N}{1,2}/\p{Ps}?\p{N}{1,2}[/]\p{Ps}?\p{N}{2,4}
DATE-REVERSE=\p{N}{4}/\p{N}{1,2}/\p{N}{1,2}
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
+
NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
#NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/config/tokconfig-es b/config/tokconfig-es
index 9b361ed..2558962 100644
--- a/config/tokconfig-es
+++ b/config/tokconfig-es
@@ -1,8 +1,8 @@
[RULE-ORDER]
ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
-ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
[RULES]
%include url
@@ -28,8 +28,10 @@ INITIAL=^(?:\p{Lt}|\p{Lu})\.$
PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
#Date
-DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
-DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
+DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/-]\p{N}{2,4}
+DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2}
+
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
#NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/config/tokconfig-fr b/config/tokconfig-fr
index 53c05e5..82ee6b6 100644
--- a/config/tokconfig-fr
+++ b/config/tokconfig-fr
@@ -1,8 +1,8 @@
[RULE-ORDER]
PREFIX SUFFIX WORD-TOKEN ABBREVIATION-KNOWN URL URL-WWW URL-DOMAIN
E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
-ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR FRACNUMBER TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
[RULES]
%include url
@@ -28,8 +28,10 @@ INITIAL=^(?:\p{Lt}|\p{Lu})\.$
PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
#Date
-DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
-DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
+DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/-]\p{N}{2,4}
+DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2}
+
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
NUMBER-YEAR=(['`’]\p{N}{2})(?:\P{N}|\z)
#NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/config/tokconfig-fy b/config/tokconfig-fy
index 2981a6c..7e1b815 100644
--- a/config/tokconfig-fy
+++ b/config/tokconfig-fy
@@ -2,7 +2,7 @@
WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
ABBREVIATION INITIALS INITIAL SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
# to do PREFIXES (is leeg nu) UNITS (uitgecommentarieerd in de c++ code)
[RULES]
@@ -40,6 +40,8 @@ PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
+
NUMBER-YEAR=(['`’]\p{N}{2})(?:\P{N}|\z)
#NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/config/tokconfig-generic b/config/tokconfig-generic
index 50c5a85..35773a8 100644
--- a/config/tokconfig-generic
+++ b/config/tokconfig-generic
@@ -1,8 +1,8 @@
[RULE-ORDER]
URL URL-WWW URL-DOMAIN
E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
-ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
[RULES]
%include url
@@ -28,8 +28,10 @@ INITIAL=^(?:\p{Lt}|\p{Lu})\.$
PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
#Date
-DATE=\p{N}{1,2}/\p{Ps}?\p{N}{1,2}[/]\p{Ps}?\p{N}{2,4}
-DATE-REVERSE=\p{N}{4}/\p{N}{1,2}/\p{N}{1,2}
+DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/-]\p{N}{2,4}
+DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2}
+
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
#NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/config/tokconfig-it b/config/tokconfig-it
index aefe567..615b758 100644
--- a/config/tokconfig-it
+++ b/config/tokconfig-it
@@ -1,3 +1,9 @@
+[RULE-ORDER]
+ABBREVIATION-KNOWN SUFFIX NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
+E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+
[RULES]
%include url
%include e-mail
@@ -22,8 +28,10 @@ INITIAL=^(?:\p{Lt}|\p{Lu})\.$
PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
#Date
-DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
-DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
+DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/-]\p{N}{2,4}
+DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2}
+
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
#NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/config/tokconfig-nl b/config/tokconfig-nl
index 1e2f6c2..f314c84 100644
--- a/config/tokconfig-nl
+++ b/config/tokconfig-nl
@@ -4,8 +4,8 @@ WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL QUOTE-COMPOUND
NUMBER-STRING STRING-NUMBER URL URL-WWW URL-DOMAIN E-MAIL
WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
ABBREVIATION INITIALS INITIAL SMILEY REVERSE-SMILEY
-PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
# to do PREFIXES (is leeg nu) UNITS (uitgecommentarieerd in de c++ code)
[RULES]
@@ -44,6 +44,8 @@ PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
+
#numberstring like 20jarige
NUMBER-STRING=\p{N}+(?:\p{Pd}?)(?:\p{L}+)
diff --git a/config/tokconfig-nl-twitter b/config/tokconfig-nl-twitter
index d175822..3be6444 100644
--- a/config/tokconfig-nl-twitter
+++ b/config/tokconfig-nl-twitter
@@ -2,8 +2,8 @@
WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW
URL-DOMAIN E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX
WORD-COMPOUND ABBREVIATION INITIALS INITIAL SMILEY REVERSE-SMILEY HASHTAG
-ADDRESSEE PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+ADDRESSEE PUNCTUATION-MULTI DATE-REVERSE DATE FRACTIONORDATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
# to do PREFIXES (is leeg nu) UNITS (uitgecommentarieerd in de c++ code)
[RULES]
@@ -40,8 +40,12 @@ ADDRESSEE=@[\p{L}\p{Mn}\p{N}_\-]+
PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
#Date
-DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
-DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
+DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/\-]\p{N}{2,4}
+DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2}
+
+FRACTIONORDATE=\p{N}{1,2}[/\-]\p{N}{1,2}
+
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
NUMBER-YEAR=(['`’]\p{N}{2})(?:\P{N}|\z)
#NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/config/tokconfig-pt b/config/tokconfig-pt
index 0fcd021..d6060fd 100644
--- a/config/tokconfig-pt
+++ b/config/tokconfig-pt
@@ -1,8 +1,8 @@
[RULE-ORDER]
ABBREVIATION-KNOWN SUFFIX NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
-ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
[RULES]
%include url
@@ -28,8 +28,10 @@ INITIAL=^(?:\p{Lt}|\p{Lu})\.$
PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
#Date
-DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
-DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
+DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/-]\p{N}{2,4}
+DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2}
+
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
#NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/config/tokconfig-generic b/config/tokconfig-ru
similarity index 77%
copy from config/tokconfig-generic
copy to config/tokconfig-ru
index 50c5a85..63c8503 100644
--- a/config/tokconfig-generic
+++ b/config/tokconfig-ru
@@ -1,8 +1,8 @@
[RULE-ORDER]
URL URL-WWW URL-DOMAIN
-E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
-ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+E-MAIL ABBREVIATION-KNOWN WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
[RULES]
%include url
@@ -31,6 +31,8 @@ PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
DATE=\p{N}{1,2}/\p{Ps}?\p{N}{1,2}[/]\p{Ps}?\p{N}{2,4}
DATE-REVERSE=\p{N}{4}/\p{N}{1,2}/\p{N}{1,2}
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
+
NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
#NUMBER-YEAR=('\p{N}{2})\P{N}
@@ -57,46 +59,52 @@ UNKNOWN=.
[TOKENS]
[UNITS]
-km
-m
-cm
-mm
-g
-kg
-C
-l
-s
-sec
-min
-gb
-mb
-kb
+км
+м
+ч
+сек
+мин
+мм
+мт
+см
+МБ
+ГБ
+КБ
[CURRENCY]
-USD
-GBP
-CAD
-NZD
-AUD
-SGD
-HKD
-EUR
[ABBREVIATIONS]
+гл
+лат
+напр
+вкл
+вм
+ок
+приб
+им
+обл
+руб
+ст
+стр
+см
+шт
+тов
+пл
+США
+СЕ
+СССР
+РФ
+пер
[FILTER]
-fl fl
-ff ff
-ffi ffi
-ffl ffl
# also filter soft hyphen
\u00AD
-
[EOSMARKERS]
%include standard-eos
[QUOTES]
%include standard-quotes
+%include exotic-quotes
diff --git a/config/tokconfig-sv b/config/tokconfig-sv
index 7665c31..f76a8f5 100644
--- a/config/tokconfig-sv
+++ b/config/tokconfig-sv
@@ -1,6 +1,13 @@
+[RULE-ORDER]
+SUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
+E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+
[RULES]
%include url
%include e-mail
+%include smiley
#Ex (oud)-studente(s)
WORD-PARPREFIX-PARSUFFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})
@@ -27,6 +34,8 @@ PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
DATE=\p{N}{4}-\p{N}{1,2}[\.-]\p{N}{1,2}\.?
DATE-SHORT=\p{N}{1,2}[-]\p{Ps}?\p{N}{1,2}[-]\p{Ps}?\p{N}{2,4}
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
+
NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
#NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/configure b/configure
index b93a2e9..3539551 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for ucto 0.7.0.
+# Generated by GNU Autoconf 2.69 for ucto 0.8.0.
#
# Report bugs to <timbl at uvt.nl>.
#
@@ -589,8 +589,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='ucto'
PACKAGE_TARNAME='ucto'
-PACKAGE_VERSION='0.7.0'
-PACKAGE_STRING='ucto 0.7.0'
+PACKAGE_VERSION='0.8.0'
+PACKAGE_STRING='ucto 0.8.0'
PACKAGE_BUGREPORT='timbl at uvt.nl'
PACKAGE_URL=''
@@ -1350,7 +1350,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures ucto 0.7.0 to adapt to many kinds of systems.
+\`configure' configures ucto 0.8.0 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1420,7 +1420,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of ucto 0.7.0:";;
+ short | recursive ) echo "Configuration of ucto 0.8.0:";;
esac
cat <<\_ACEOF
@@ -1549,7 +1549,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-ucto configure 0.7.0
+ucto configure 0.8.0
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2078,7 +2078,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by ucto $as_me 0.7.0, which was
+It was created by ucto $as_me 0.8.0, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2941,7 +2941,7 @@ fi
# Define the identity of the package.
PACKAGE='ucto'
- VERSION='0.7.0'
+ VERSION='0.8.0'
cat >>confdefs.h <<_ACEOF
@@ -16570,7 +16570,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by ucto $as_me 0.7.0, which was
+This file was extended by ucto $as_me 0.8.0, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -16636,7 +16636,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-ucto config.status 0.7.0
+ucto config.status 0.8.0
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index 51c4c01..8a1225a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,10 +1,10 @@
# -*- Autoconf -*-
# Process this file with autoconf to produce a configure script.
-# $Id: configure.ac 17872 2014-11-26 16:26:43Z sloot $
+# $Id: configure.ac 18009 2015-01-29 11:43:21Z sloot $
# $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/configure.ac $
AC_PREREQ(2.59)
-AC_INIT([ucto], [0.7.0], [timbl at uvt.nl])
+AC_INIT([ucto], [0.8.0], [timbl at uvt.nl])
AM_INIT_AUTOMAKE
AC_CONFIG_SRCDIR([configure.ac])
AC_CONFIG_MACRO_DIR([m4])
diff --git a/docs/ucto.1 b/docs/ucto.1
index e6c506c..10135a3 100644
--- a/docs/ucto.1
+++ b/docs/ucto.1
@@ -1,132 +1,142 @@
-.TH ucto 1 "2013 march 6"
+.TH ucto 1 "2014 december 2"
.SH NAME
-ucto - Unicode Tokenizer
+ucto \- Unicode Tokenizer
.SH SYNOPSYS
-ucto [[options]] [input-file] [[output-file]]
+ucto [[options]] [input\(hyfile] [[output\(hyfile]]
.SH DESCRIPTION
-.B ucto
-ucto tokenizes text files: it separates words from punctuation, splits
-sentences (and optionally paragraphs), and finds paired quotes.
-Ucto is preconfigured with tokenisation rules for several languages.
+.B ucto
+ucto tokenizes text files: it separates words from punctuation, splits
+sentences (and optionally paragraphs), and finds paired quotes.
+Ucto is preconfigured with tokenisation rules for several languages.
.SH OPTIONS
-.BR -c " configfile"
+.BR \-c " configfile"
.RS
read settings from a file
.RE
-.BR -d " value"
+.BR \-d " value"
.RS
set debug mode to 'value'
.RE
-.BR -e " value"
+.BR \-e " value"
.RS
set input encoding. (default UTF8)
.RE
-.BR -f
+.BR \-N " value"
+.RS
+set UTF8 output normalization. (default NFC)
+.RE
+
+.BR \-f
.RS
disable filtering of special characters
.RE
-.BR -L " language"
+.BR \-L " language"
.RS
Automatically selects a configuration file by language code.
-e.g. 'fr' will select the file tokconfig-fr from the installation directory
+e.g. 'fr' will select the file tokconfig\(hyfr from the installation directory
.RE
-.BR -l
+.BR \-l
.RS
Convert to all lowercase
.RE
-.BR -u
+.BR \-u
.RS
Convert to all uppercase
.RE
-.BR -n
+.BR \-n
.RS
Emit one sentence per line on output
.RE
-.BR -m
+.BR \-m
.RS
Assume one sentence per line on input
.RE
-.BR --passthru
+.BR \-\-passthru
.RS
Don't tokenize, but perform input decoding and simple token role detection
.RE
-.B -P
+.BR \-\-filterpunct
+.RS
+remove most of the punctuation from the output. (not from abreviations!)
+.RE
+
+.B \-P
.RS
Disable Paragraph Detection
.RE
-.B -Q
+.B \-Q
.RS
Enable Quote Detection. (this is experimental and may lead to unexpected results)
.RE
-.B -S
+.B \-S
.RS
Disable Sentence Detection
.RE
-.B -s
+.B \-s
<string>
.RS
-Set End-of-sentence marker. (Default <utt>)
+Set End\(hyof\(hysentence marker. (Default <utt>)
.RE
-.B -V
-.RS
+.B \-V
+.RS
Show version information
.RE
-.B -v
+.B \-v
.RS
set Verbose mode
.RE
-.B -F
+.B \-F
.RS
-Read a FoLiA XML document, tokenize it, and output the modified doc. (this disables usage of most other options: -nulPQvsS)
+Read a FoLiA XML document, tokenize it, and output the modified doc. (this disables usage of most other options: \-nulPQvsS)
.RE
-.BR --textclass " cls"
+.BR \-\-textclass "cls"
.RS
When tokenizing a FoLiA XML document, search for text nodes of class 'cls'
.RE
-.B -X
+.B \-X
.RS
-Output FoLiA XML. (this disables usage of most other options: -nulPQvsS)
-.RE
+Output FoLiA XML. (this disables usage of most other options: \-nulPQvsS)
+.RE
-.B --id
+.B \-\-id
<DocId>
.RS
Use the specified Document ID for the FoLiA XML
.RE
-.B -x
+.B \-x
<DocId>
.B (obsolete)
.RS
-Output FoLiA XML, use the specified Document ID. (this disables usage of most other options: -nulPQvsS)
+Output FoLiA XML, use the specified Document ID. (this disables usage of most other options: \-nulPQvsS)
.B obsolete
-Use
-.B -X
-and
-.B --id
+Use
+.B \-X
+and
+.B \-\-id
instead
.RE
@@ -137,4 +147,3 @@ likely
Maarten van Gompel proycon at anaproy.nl
Ko van der Sloot Timbl at uvt.nl
-
diff --git a/include/ucto/tokenize.h b/include/ucto/tokenize.h
index ae88928..581ff59 100644
--- a/include/ucto/tokenize.h
+++ b/include/ucto/tokenize.h
@@ -1,7 +1,7 @@
/*
- $Id: tokenize.h 17865 2014-11-21 17:04:25Z mvgompel $
+ $Id: tokenize.h 17993 2015-01-21 14:51:26Z sloot $
$URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/include/ucto/tokenize.h $
- Copyright (c) 2006 - 2014
+ Copyright (c) 2006 - 2015
Tilburg University
This file is part of Ucto.
@@ -155,6 +155,8 @@ namespace Tokenizer {
//Tokenize from input stream to a vecto of Tokens
std::vector<Token> tokenizeStream( std::istream&, bool allatonce=true );
+ //Tokenize from input file to output file (support xmlin + xmlout)
+ void tokenize( const std::string&, const std::string& );
//Tokenize from input stream to output stream
void tokenize( std::istream&, std::ostream& );
void tokenize( std::istream* in, std::ostream* out){
diff --git a/include/ucto/unicode.h b/include/ucto/unicode.h
index 8a61cdc..156446d 100644
--- a/include/ucto/unicode.h
+++ b/include/ucto/unicode.h
@@ -1,7 +1,7 @@
/*
- $Id: unicode.h 16823 2014-01-06 10:21:09Z sloot $
+ $Id: unicode.h 17993 2015-01-21 14:51:26Z sloot $
$URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/include/ucto/unicode.h $
- Copyright (c) 1998 - 2014
+ Copyright (c) 1998 - 2015
ILK - Tilburg University
CNTS - University of Antwerp
diff --git a/src/tokenize.cxx b/src/tokenize.cxx
index 077bdcd..01e3dbc 100644
--- a/src/tokenize.cxx
+++ b/src/tokenize.cxx
@@ -1,7 +1,7 @@
/*
- $Id: tokenize.cxx 17873 2014-11-26 17:11:31Z sloot $
+ $Id: tokenize.cxx 18006 2015-01-29 10:10:59Z sloot $
$URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/src/tokenize.cxx $
- Copyright (c) 2006 - 2014
+ Copyright (c) 2006 - 2015
Tilburg University
This file is part of Ucto
@@ -387,7 +387,7 @@ namespace Tokenizer {
int parCount = 0;
vector<Token> buffer;
do {
- vector<Token> v = tokenizeStream( IN , false);
+ vector<Token> v = tokenizeStream( IN , true);
for (vector<Token>::iterator iter = v.begin(); iter != v.end(); iter++) {
if (iter->role & NEWPARAGRAPH) {
//process the buffer
@@ -401,6 +401,38 @@ namespace Tokenizer {
return doc;
}
+ void TokenizerClass::tokenize( const string & ifile, const string & ofile) {
+ ostream *OUT = NULL;
+ if ( ofile.empty() )
+ OUT = &cout;
+ else {
+ OUT = new ofstream( ofile.c_str() );
+ }
+
+ istream *IN = NULL;
+ if (!xmlin) {
+ if ( ifile.empty() )
+ IN = &cin;
+ else {
+ IN = new ifstream( ifile.c_str() );
+ if ( !IN || !IN->good() ){
+ cerr << "Error: problems opening inputfile " << ifile << endl;
+ cerr << "Courageously refusing to start..." << endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ this->tokenize( *IN, *OUT );
+ } else {
+ folia::Document doc;
+ doc.readFromFile(ifile);
+ this->tokenize(doc);
+ *OUT << doc << endl;
+ }
+
+ if ( IN != &cin ) delete IN;
+ if ( OUT != &cout ) delete OUT;
+ }
+
void TokenizerClass::tokenize( istream& IN, ostream& OUT) {
if (xmlout) {
folia::Document doc = tokenize( IN );
diff --git a/src/ucto.cxx b/src/ucto.cxx
index dcc1f1d..d70c0d9 100644
--- a/src/ucto.cxx
+++ b/src/ucto.cxx
@@ -1,7 +1,7 @@
/*
- $Id: ucto.cxx 17839 2014-11-20 16:04:12Z sloot $
+ $Id: ucto.cxx 18006 2015-01-29 10:10:59Z sloot $
$URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/src/ucto.cxx $
- Copyright (c) 1998 - 2014
+ Copyright (c) 1998 - 2015
ILK - Tilburg University
CNTS - University of Antwerp
diff --git a/src/unicode.cxx b/src/unicode.cxx
index 696b951..4299047 100644
--- a/src/unicode.cxx
+++ b/src/unicode.cxx
@@ -1,7 +1,7 @@
/*
- $Id: unicode.cxx 16823 2014-01-06 10:21:09Z sloot $
+ $Id: unicode.cxx 18006 2015-01-29 10:10:59Z sloot $
$URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/src/unicode.cxx $
- Copyright (c) 1998 - 2014
+ Copyright (c) 1998 - 2015
ILK - Tilburg University
CNTS - University of Antwerp
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/ucto.git
More information about the debian-science-commits
mailing list