[ucto] 01/01: Imported Upstream version 0.8.0

Joost van Baal joostvb at moszumanska.debian.org
Sun Jul 5 04:54:20 UTC 2015


This is an automated email from the git hooks/post-receive script.

joostvb pushed a commit to annotated tag upstream/0.8.0
in repository ucto.

commit 698c22bbeddfdba10353a50618c363378ab919a9
Author: Joost van Baal-Ilić <joostvb at nusku.mdcc.cx>
Date:   Sun Jul 5 06:54:04 2015 +0200

    Imported Upstream version 0.8.0
---
 ChangeLog                                  | 69 +++++++++++++++++++++++
 NEWS                                       | 20 +++++++
 config/Makefile.am                         |  4 +-
 config/Makefile.in                         |  4 +-
 config/tokconfig-de                        |  6 +-
 config/tokconfig-en                        |  6 +-
 config/tokconfig-es                        | 10 ++--
 config/tokconfig-fr                        | 10 ++--
 config/tokconfig-fy                        |  4 +-
 config/tokconfig-generic                   | 10 ++--
 config/tokconfig-it                        | 12 +++-
 config/tokconfig-nl                        |  6 +-
 config/tokconfig-nl-twitter                | 12 ++--
 config/tokconfig-pt                        | 10 ++--
 config/{tokconfig-generic => tokconfig-ru} | 68 +++++++++++++----------
 config/tokconfig-sv                        |  9 +++
 configure                                  | 20 +++----
 configure.ac                               |  4 +-
 docs/ucto.1                                | 89 ++++++++++++++++--------------
 include/ucto/tokenize.h                    |  6 +-
 include/ucto/unicode.h                     |  4 +-
 src/tokenize.cxx                           | 38 ++++++++++++-
 src/ucto.cxx                               |  4 +-
 src/unicode.cxx                            |  4 +-
 24 files changed, 303 insertions(+), 126 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 4a11737..9e322f1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,72 @@
+2015-01-29 10:10  sloot
+
+	* [r18006] src/tokenize.cxx, src/ucto.cxx, src/unicode.cxx: 2015
+
+2015-01-23 13:19  mvgompel
+
+	* [r17995] src/tokenize.cxx: allatonce enabled by default for
+	  tokenize() to folia doc
+
+2015-01-21 14:51  sloot
+
+	* [r17993] include/ucto/tokenize.h, include/ucto/unicode.h: 2015
+
+2015-01-09 15:15  mvgompel
+
+	* [r17959] include/ucto/tokenize.h, src/tokenize.cxx: added new
+	  tokenize(string,string) meta-function for the API
+
+2015-01-07 16:05  mvgompel
+
+	* [r17958] config/tokconfig-nl-twitter: added FRACNUMBER too
+
+2015-01-07 16:04  mvgompel
+
+	* [r17957] config/tokconfig-de, config/tokconfig-en,
+	  config/tokconfig-es, config/tokconfig-fr, config/tokconfig-fy,
+	  config/tokconfig-generic, config/tokconfig-it,
+	  config/tokconfig-nl, config/tokconfig-nl-twitter,
+	  config/tokconfig-pt, config/tokconfig-ru, config/tokconfig-sv,
+	  config/tokconfig-tr: fixing date rules and adding FRACNUMBER
+
+2015-01-07 15:32  mvgompel
+
+	* [r17956] config/tokconfig-nl-twitter: <Flo> zou je ook nog een
+	  variant dd/mm toe kunnen voegen?
+
+2015-01-07 15:22  mvgompel
+
+	* [r17955] config/tokconfig-nl-twitter: handle dates with slashes
+	  in twitter: < proycon> doen wij datums niet met hyphens officieel
+	  in het Nederlads? <Flo>| heeft twitter maling aan ;)
+
+2014-12-13 22:52  mvgompel
+
+	* [r17934] config/tokconfig-ru: fix
+
+2014-12-13 22:42  mvgompel
+
+	* [r17933] config/Makefile.am, config/tokconfig-ru: added Russian
+
+2014-12-02 15:40  sloot
+
+	* [r17908] docs/ucto.1: fixed hyphens according to
+	  /usr/share/doc/groff-base/README.Debian
+
+2014-12-02 15:34  sloot
+
+	* [r17907] docs/ucto.1: fixed hyphens according to
+	  /usr/share/doc/groff-base/README.Debian
+
+2014-11-26 17:11  sloot
+
+	* [r17873] src/tokenize.cxx: fixed terrible bug,
+	  also some code-cleanup
+
+2014-11-26 16:26  sloot
+
+	* [r17872] configure.ac: bumped version
+
 2014-11-26 16:17  sloot
 
 	* [r17871] configure.ac, src/tokenize.cxx, tests/testfolia.ok,
diff --git a/NEWS b/NEWS
index 67b2768..be75fe9 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,23 @@
+0.8.0 2015-01-29
+[Ko van der Sloot]
+* next release
+[Maarten van Gompel]
+* added new tokenize(string,string) meta-function for the API
+* allatonce enabled by default for tokenize() to folia doc
+* fixing date rules and adding FRACNUMBER
+* added Russian
+* Adicionei regras para tokenização portuguesa.
+[Antal vd Bosch]
+* added RK to dutch abbrev.
+
+0.7.0 2014-11-26
+[Ko van der Sloot]
+* unofficial release
+* experimental PUNCTUATION filter
+* bug fixes
+[Maarten van Gompel]
+* reduced memory usage
+
 0.6.0 2014-09-23
 [Ko van der Sloot]
 * release
diff --git a/config/Makefile.am b/config/Makefile.am
index 6f17afd..3366550 100644
--- a/config/Makefile.am
+++ b/config/Makefile.am
@@ -1,8 +1,8 @@
-# $Id: Makefile.am 17387 2014-06-25 10:36:49Z mvgompel $
+# $Id: Makefile.am 17933 2014-12-13 22:42:56Z mvgompel $
 # $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/config/Makefile.am $
 
 config_DATA = tokconfig-en tokconfig-nl tokconfig-fr tokconfig-it tokconfig-es tokconfig-pt \
-	tokconfig-de tokconfig-sv tokconfig-nl-twitter tokconfig-nl-sonarchat tokconfig-fy \
+	tokconfig-de tokconfig-sv tokconfig-nl-twitter tokconfig-nl-sonarchat tokconfig-fy tokconfig-ru \
 	tokconfig-generic url.rule e-mail.rule smiley.rule \
 	ligatures.filter standard-quotes.quote \
 	exotic-quotes.quote standard-eos.eos exotic-eos.eos nl_afk.abr es.abr pt.abr
diff --git a/config/Makefile.in b/config/Makefile.in
index be5b46f..51243d0 100644
--- a/config/Makefile.in
+++ b/config/Makefile.in
@@ -14,7 +14,7 @@
 
 @SET_MAKE@
 
-# $Id: Makefile.am 17387 2014-06-25 10:36:49Z mvgompel $
+# $Id: Makefile.am 17933 2014-12-13 22:42:56Z mvgompel $
 # $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/config/Makefile.am $
 
 VPATH = @srcdir@
@@ -283,7 +283,7 @@ top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 config_DATA = tokconfig-en tokconfig-nl tokconfig-fr tokconfig-it tokconfig-es tokconfig-pt \
-	tokconfig-de tokconfig-sv tokconfig-nl-twitter tokconfig-nl-sonarchat tokconfig-fy \
+	tokconfig-de tokconfig-sv tokconfig-nl-twitter tokconfig-nl-sonarchat tokconfig-fy tokconfig-ru \
 	tokconfig-generic url.rule e-mail.rule smiley.rule \
 	ligatures.filter standard-quotes.quote \
 	exotic-quotes.quote standard-eos.eos exotic-eos.eos nl_afk.abr es.abr pt.abr
diff --git a/config/tokconfig-de b/config/tokconfig-de
index 3183ab6..7054c4c 100644
--- a/config/tokconfig-de
+++ b/config/tokconfig-de
@@ -1,8 +1,8 @@
 [RULE-ORDER]
 TOKEN SUFFIX WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
 E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
-ABBREVIATION INITIALS INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+ABBREVIATION INITIALS INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
 
 [RULES]
 %include url
@@ -34,6 +34,8 @@ PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 DATE=\p{N}{1,2}[\.-]\p{Ps}?\p{N}{1,2}[\.-]\p{Ps}?\p{N}{2,4}
 DATE-REVERSE=\p{N}{4}-\p{N}{1,2}[\.-]\p{N}{1,2}\.?
 
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
+
 NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
 #NUMBER-YEAR=('\p{N}{2})\P{N}
 
diff --git a/config/tokconfig-en b/config/tokconfig-en
index aa125ea..818db3b 100644
--- a/config/tokconfig-en
+++ b/config/tokconfig-en
@@ -1,8 +1,8 @@
 [RULE-ORDER]
 SUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
 E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
-ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
 
 [RULES]
 %include url
@@ -31,6 +31,8 @@ PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 DATE=\p{N}{1,2}/\p{Ps}?\p{N}{1,2}[/]\p{Ps}?\p{N}{2,4}
 DATE-REVERSE=\p{N}{4}/\p{N}{1,2}/\p{N}{1,2}
 
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
+
 NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
 #NUMBER-YEAR=('\p{N}{2})\P{N}
 
diff --git a/config/tokconfig-es b/config/tokconfig-es
index 9b361ed..2558962 100644
--- a/config/tokconfig-es
+++ b/config/tokconfig-es
@@ -1,8 +1,8 @@
 [RULE-ORDER]
 ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
 E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
-ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
 
 [RULES]
 %include url
@@ -28,8 +28,10 @@ INITIAL=^(?:\p{Lt}|\p{Lu})\.$
 PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 
 #Date
-DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
-DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
+DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/-]\p{N}{2,4}
+DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2}
+
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
 
 NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
 #NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/config/tokconfig-fr b/config/tokconfig-fr
index 53c05e5..82ee6b6 100644
--- a/config/tokconfig-fr
+++ b/config/tokconfig-fr
@@ -1,8 +1,8 @@
 [RULE-ORDER]
 PREFIX SUFFIX WORD-TOKEN ABBREVIATION-KNOWN URL URL-WWW URL-DOMAIN
 E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
-ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR FRACNUMBER TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
 
 [RULES]
 %include url
@@ -28,8 +28,10 @@ INITIAL=^(?:\p{Lt}|\p{Lu})\.$
 PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 
 #Date
-DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
-DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
+DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/-]\p{N}{2,4}
+DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2}
+
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
 
 NUMBER-YEAR=(['`’]\p{N}{2})(?:\P{N}|\z)
 #NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/config/tokconfig-fy b/config/tokconfig-fy
index 2981a6c..7e1b815 100644
--- a/config/tokconfig-fy
+++ b/config/tokconfig-fy
@@ -2,7 +2,7 @@
 WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
 E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
 ABBREVIATION INITIALS INITIAL SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
 # to do PREFIXES (is leeg nu) UNITS (uitgecommentarieerd in de c++ code) 
 
 [RULES]
@@ -40,6 +40,8 @@ PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
 DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
 
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
+
 NUMBER-YEAR=(['`’]\p{N}{2})(?:\P{N}|\z)
 #NUMBER-YEAR=('\p{N}{2})\P{N}
 
diff --git a/config/tokconfig-generic b/config/tokconfig-generic
index 50c5a85..35773a8 100644
--- a/config/tokconfig-generic
+++ b/config/tokconfig-generic
@@ -1,8 +1,8 @@
 [RULE-ORDER]
 URL URL-WWW URL-DOMAIN
 E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
-ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
 
 [RULES]
 %include url
@@ -28,8 +28,10 @@ INITIAL=^(?:\p{Lt}|\p{Lu})\.$
 PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 
 #Date
-DATE=\p{N}{1,2}/\p{Ps}?\p{N}{1,2}[/]\p{Ps}?\p{N}{2,4}
-DATE-REVERSE=\p{N}{4}/\p{N}{1,2}/\p{N}{1,2}
+DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/-]\p{N}{2,4}
+DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2}
+
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
 
 NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
 #NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/config/tokconfig-it b/config/tokconfig-it
index aefe567..615b758 100644
--- a/config/tokconfig-it
+++ b/config/tokconfig-it
@@ -1,3 +1,9 @@
+[RULE-ORDER]
+ABBREVIATION-KNOWN SUFFIX NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
+E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
+
 [RULES]
 %include url
 %include e-mail
@@ -22,8 +28,10 @@ INITIAL=^(?:\p{Lt}|\p{Lu})\.$
 PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 
 #Date
-DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
-DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
+DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/-]\p{N}{2,4}
+DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2}
+
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
 
 NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
 #NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/config/tokconfig-nl b/config/tokconfig-nl
index 1e2f6c2..f314c84 100644
--- a/config/tokconfig-nl
+++ b/config/tokconfig-nl
@@ -4,8 +4,8 @@ WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL QUOTE-COMPOUND
 NUMBER-STRING STRING-NUMBER URL URL-WWW URL-DOMAIN E-MAIL
 WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
 ABBREVIATION INITIALS INITIAL SMILEY REVERSE-SMILEY 
-PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
+PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
 # to do PREFIXES (is leeg nu) UNITS (uitgecommentarieerd in de c++ code) 
 
 [RULES]
@@ -44,6 +44,8 @@ PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
 DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
 
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
+
 #numberstring like 20jarige
 NUMBER-STRING=\p{N}+(?:\p{Pd}?)(?:\p{L}+)
 
diff --git a/config/tokconfig-nl-twitter b/config/tokconfig-nl-twitter
index d175822..3be6444 100644
--- a/config/tokconfig-nl-twitter
+++ b/config/tokconfig-nl-twitter
@@ -2,8 +2,8 @@
 WORD-WITHSUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW 
 URL-DOMAIN E-MAIL WORD-PARPREFIX-PARSUFFIX WORD-PARPREFIX WORD-PARSUFFIX 
 WORD-COMPOUND ABBREVIATION INITIALS INITIAL SMILEY REVERSE-SMILEY HASHTAG 
-ADDRESSEE PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
+ADDRESSEE PUNCTUATION-MULTI DATE-REVERSE DATE FRACTIONORDATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
 # to do PREFIXES (is leeg nu) UNITS (uitgecommentarieerd in de c++ code) 
 
 [RULES]
@@ -40,8 +40,12 @@ ADDRESSEE=@[\p{L}\p{Mn}\p{N}_\-]+
 PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 
 #Date
-DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
-DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
+DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/\-]\p{N}{2,4}
+DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2}
+
+FRACTIONORDATE=\p{N}{1,2}[/\-]\p{N}{1,2}
+
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
 
 NUMBER-YEAR=(['`’]\p{N}{2})(?:\P{N}|\z)
 #NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/config/tokconfig-pt b/config/tokconfig-pt
index 0fcd021..d6060fd 100644
--- a/config/tokconfig-pt
+++ b/config/tokconfig-pt
@@ -1,8 +1,8 @@
 [RULE-ORDER]
 ABBREVIATION-KNOWN SUFFIX NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
 E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
-ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN 
 
 [RULES]
 %include url
@@ -28,8 +28,10 @@ INITIAL=^(?:\p{Lt}|\p{Lu})\.$
 PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 
 #Date
-DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
-DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}
+DATE=\p{N}{1,2}[/\-]\p{N}{1,2}[/-]\p{N}{2,4}
+DATE-REVERSE=\p{N}{4}[/\-]\p{N}{1,2}[/\-]\p{N}{1,2}
+
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
 
 NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
 #NUMBER-YEAR=('\p{N}{2})\P{N}
diff --git a/config/tokconfig-generic b/config/tokconfig-ru
similarity index 77%
copy from config/tokconfig-generic
copy to config/tokconfig-ru
index 50c5a85..63c8503 100644
--- a/config/tokconfig-generic
+++ b/config/tokconfig-ru
@@ -1,8 +1,8 @@
 [RULE-ORDER]
 URL URL-WWW URL-DOMAIN
-E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
-ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE DATE-REVERSE
-NUMBER-YEAR TIME NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+E-MAIL ABBREVIATION-KNOWN WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
 
 [RULES]
 %include url
@@ -31,6 +31,8 @@ PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 DATE=\p{N}{1,2}/\p{Ps}?\p{N}{1,2}[/]\p{Ps}?\p{N}{2,4}
 DATE-REVERSE=\p{N}{4}/\p{N}{1,2}/\p{N}{1,2}
 
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
+
 NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
 #NUMBER-YEAR=('\p{N}{2})\P{N}
 
@@ -57,46 +59,52 @@ UNKNOWN=.
 [TOKENS]
 
 [UNITS]
-km
-m
-cm
-mm
-g
-kg
-C
-l
-s
-sec
-min
-gb
-mb
-kb
+км
+м
+ч
+сек
+мин
+мм
+мт
+см
+МБ
+ГБ
+КБ
 
 
 [CURRENCY]
-USD
-GBP
-CAD
-NZD
-AUD
-SGD
-HKD
-EUR
 
 [ABBREVIATIONS]
+гл
+лат
+напр
+вкл
+вм
+ок
+приб
+им
+обл
+руб
+ст
+стр
+см
+шт
+тов
+пл
+США
+СЕ
+СССР
+РФ
+пер
 
 
 [FILTER]
-fl fl
-ff ff
-ffi ffi
-ffl ffl
 # also filter soft hyphen 
 \u00AD
 
-
 [EOSMARKERS]
 %include standard-eos
 
 [QUOTES]
 %include standard-quotes
+%include exotic-quotes
diff --git a/config/tokconfig-sv b/config/tokconfig-sv
index 7665c31..f76a8f5 100644
--- a/config/tokconfig-sv
+++ b/config/tokconfig-sv
@@ -1,6 +1,13 @@
+[RULE-ORDER]
+SUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN
+E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND 
+ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE
+NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN
+
 [RULES]
 %include url
 %include e-mail
+%include smiley
 
 #Ex (oud)-studente(s)
 WORD-PARPREFIX-PARSUFFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})
@@ -27,6 +34,8 @@ PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}
 DATE=\p{N}{4}-\p{N}{1,2}[\.-]\p{N}{1,2}\.?
 DATE-SHORT=\p{N}{1,2}[-]\p{Ps}?\p{N}{1,2}[-]\p{Ps}?\p{N}{2,4}
 
+FRACNUMBER=\p{N}+(?:/\p{N}+)+
+
 NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
 #NUMBER-YEAR=('\p{N}{2})\P{N}
 
diff --git a/configure b/configure
index b93a2e9..3539551 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for ucto 0.7.0.
+# Generated by GNU Autoconf 2.69 for ucto 0.8.0.
 #
 # Report bugs to <timbl at uvt.nl>.
 #
@@ -589,8 +589,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ucto'
 PACKAGE_TARNAME='ucto'
-PACKAGE_VERSION='0.7.0'
-PACKAGE_STRING='ucto 0.7.0'
+PACKAGE_VERSION='0.8.0'
+PACKAGE_STRING='ucto 0.8.0'
 PACKAGE_BUGREPORT='timbl at uvt.nl'
 PACKAGE_URL=''
 
@@ -1350,7 +1350,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ucto 0.7.0 to adapt to many kinds of systems.
+\`configure' configures ucto 0.8.0 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1420,7 +1420,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ucto 0.7.0:";;
+     short | recursive ) echo "Configuration of ucto 0.8.0:";;
    esac
   cat <<\_ACEOF
 
@@ -1549,7 +1549,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ucto configure 0.7.0
+ucto configure 0.8.0
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2078,7 +2078,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ucto $as_me 0.7.0, which was
+It was created by ucto $as_me 0.8.0, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2941,7 +2941,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ucto'
- VERSION='0.7.0'
+ VERSION='0.8.0'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -16570,7 +16570,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ucto $as_me 0.7.0, which was
+This file was extended by ucto $as_me 0.8.0, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -16636,7 +16636,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ucto config.status 0.7.0
+ucto config.status 0.8.0
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index 51c4c01..8a1225a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,10 +1,10 @@
 #                                               -*- Autoconf -*-
 # Process this file with autoconf to produce a configure script.
-# $Id: configure.ac 17872 2014-11-26 16:26:43Z sloot $
+# $Id: configure.ac 18009 2015-01-29 11:43:21Z sloot $
 # $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/configure.ac $
 
 AC_PREREQ(2.59)
-AC_INIT([ucto], [0.7.0], [timbl at uvt.nl])
+AC_INIT([ucto], [0.8.0], [timbl at uvt.nl])
 AM_INIT_AUTOMAKE
 AC_CONFIG_SRCDIR([configure.ac])
 AC_CONFIG_MACRO_DIR([m4])
diff --git a/docs/ucto.1 b/docs/ucto.1
index e6c506c..10135a3 100644
--- a/docs/ucto.1
+++ b/docs/ucto.1
@@ -1,132 +1,142 @@
-.TH ucto 1 "2013 march 6"
+.TH ucto 1 "2014 december 2"
 
 .SH NAME
-ucto - Unicode Tokenizer
+ucto \- Unicode Tokenizer
 .SH SYNOPSYS
-ucto [[options]] [input-file] [[output-file]]
+ucto [[options]] [input\(hyfile] [[output\(hyfile]]
 
 .SH DESCRIPTION
-.B ucto 
-ucto tokenizes text files: it separates words from punctuation, splits 
-sentences (and optionally paragraphs), and finds paired quotes. 
-Ucto is preconfigured with tokenisation rules for several languages. 
+.B ucto
+ucto tokenizes text files: it separates words from punctuation, splits
+sentences (and optionally paragraphs), and finds paired quotes.
+Ucto is preconfigured with tokenisation rules for several languages.
 
 .SH OPTIONS
 
-.BR -c " configfile"
+.BR \-c " configfile"
 .RS
 read settings from a file
 .RE
 
-.BR -d " value"
+.BR \-d " value"
 .RS
 set debug mode to 'value'
 .RE
 
-.BR -e " value"
+.BR \-e " value"
 .RS
 set input encoding. (default UTF8)
 .RE
 
-.BR -f
+.BR \-N " value"
+.RS
+set UTF8 output normalization. (default NFC)
+.RE
+
+.BR \-f
 .RS
 disable filtering of special characters
 .RE
 
-.BR -L " language"
+.BR \-L " language"
 .RS
  Automatically selects a configuration file by language code.
-e.g. 'fr' will select the file tokconfig-fr from the installation directory
+e.g. 'fr' will select the file tokconfig\(hyfr from the installation directory
 .RE
 
-.BR -l 
+.BR \-l
 .RS
 Convert to all lowercase
 .RE
 
-.BR -u 
+.BR \-u
 .RS
 Convert to all uppercase
 .RE
 
-.BR -n 
+.BR \-n
 .RS
 Emit one sentence per line on output
 .RE
 
-.BR -m
+.BR \-m
 .RS
 Assume one sentence per line on input
 .RE
 
-.BR --passthru    
+.BR \-\-passthru
 .RS
 Don't tokenize, but perform input decoding and simple token role detection
 .RE
 
-.B -P
+.BR \-\-filterpunct
+.RS
+remove most of the punctuation from the output. (not from abreviations!)
+.RE
+
+.B \-P
 .RS
 Disable Paragraph Detection
 .RE
 
-.B -Q
+.B \-Q
 .RS
 Enable Quote Detection. (this is experimental and may lead to unexpected results)
 .RE
 
-.B -S
+.B \-S
 .RS
 Disable Sentence Detection
 .RE
 
-.B -s
+.B \-s
 <string>
 .RS
-Set End-of-sentence marker. (Default <utt>)
+Set End\(hyof\(hysentence marker. (Default <utt>)
 .RE
 
-.B -V
-.RS 
+.B \-V
+.RS
 Show version information
 .RE
 
-.B -v
+.B \-v
 .RS
 set Verbose mode
 .RE
 
-.B -F
+.B \-F
 .RS
-Read a FoLiA XML document, tokenize it, and output the modified doc. (this disables usage of most other options: -nulPQvsS)
+Read a FoLiA XML document, tokenize it, and output the modified doc. (this disables usage of most other options: \-nulPQvsS)
 .RE
 
-.BR --textclass " cls"
+.BR \-\-textclass "cls"
 .RS
 When tokenizing a FoLiA XML document, search for text nodes of class 'cls'
 .RE
 
-.B -X
+.B \-X
 .RS
-Output FoLiA XML. (this disables usage of most other options: -nulPQvsS)
-.RE	
+Output FoLiA XML. (this disables usage of most other options: \-nulPQvsS)
+.RE
 
-.B --id
+.B \-\-id
 <DocId>
 .RS
 Use the specified Document ID for the FoLiA XML
 .RE
 
-.B -x
+.B \-x
 <DocId>
 .B (obsolete)
 .RS
-Output FoLiA XML, use the specified Document ID. (this disables usage of most other options: -nulPQvsS)
+Output FoLiA XML, use the specified Document ID. (this disables usage of most other options: \-nulPQvsS)
 
 .B obsolete
-Use 
-.B -X 
-and 
-.B --id
+Use
+.B \-X
+and
+.B \-\-id
 instead
 .RE
 
@@ -137,4 +147,3 @@ likely
 Maarten van Gompel proycon at anaproy.nl
 
 Ko van der Sloot Timbl at uvt.nl
-
diff --git a/include/ucto/tokenize.h b/include/ucto/tokenize.h
index ae88928..581ff59 100644
--- a/include/ucto/tokenize.h
+++ b/include/ucto/tokenize.h
@@ -1,7 +1,7 @@
 /*
-  $Id: tokenize.h 17865 2014-11-21 17:04:25Z mvgompel $
+  $Id: tokenize.h 17993 2015-01-21 14:51:26Z sloot $
   $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/include/ucto/tokenize.h $
-  Copyright (c) 2006 - 2014
+  Copyright (c) 2006 - 2015
   Tilburg University
 
   This file is part of Ucto.
@@ -155,6 +155,8 @@ namespace Tokenizer {
     //Tokenize from input stream to a vecto of Tokens
     std::vector<Token> tokenizeStream( std::istream&, bool allatonce=true );
 
+    //Tokenize from input file to output file (support xmlin + xmlout)
+    void tokenize( const std::string&, const std::string& );
     //Tokenize from input stream to output stream
     void tokenize( std::istream&, std::ostream& );
     void tokenize( std::istream* in, std::ostream* out){
diff --git a/include/ucto/unicode.h b/include/ucto/unicode.h
index 8a61cdc..156446d 100644
--- a/include/ucto/unicode.h
+++ b/include/ucto/unicode.h
@@ -1,7 +1,7 @@
 /*
- $Id: unicode.h 16823 2014-01-06 10:21:09Z sloot $
+ $Id: unicode.h 17993 2015-01-21 14:51:26Z sloot $
  $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/include/ucto/unicode.h $
-  Copyright (c) 1998 - 2014
+  Copyright (c) 1998 - 2015
   ILK  -  Tilburg University
   CNTS -  University of Antwerp
  
diff --git a/src/tokenize.cxx b/src/tokenize.cxx
index 077bdcd..01e3dbc 100644
--- a/src/tokenize.cxx
+++ b/src/tokenize.cxx
@@ -1,7 +1,7 @@
 /*
-  $Id: tokenize.cxx 17873 2014-11-26 17:11:31Z sloot $
+  $Id: tokenize.cxx 18006 2015-01-29 10:10:59Z sloot $
   $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/src/tokenize.cxx $
-  Copyright (c) 2006 - 2014
+  Copyright (c) 2006 - 2015
   Tilburg University
 
   This file is part of Ucto
@@ -387,7 +387,7 @@ namespace Tokenizer {
     int parCount = 0;
     vector<Token> buffer;
     do {
-	vector<Token> v = tokenizeStream( IN , false);
+	vector<Token> v = tokenizeStream( IN , true);
 	for (vector<Token>::iterator iter = v.begin(); iter != v.end(); iter++) {
 	    if (iter->role & NEWPARAGRAPH) {
 		//process the buffer
@@ -401,6 +401,38 @@ namespace Tokenizer {
     return doc;
   }
 
+  void TokenizerClass::tokenize( const string & ifile, const string & ofile) {
+    ostream *OUT = NULL;
+    if ( ofile.empty() )
+        OUT = &cout;
+    else {
+        OUT = new ofstream( ofile.c_str() );
+    }
+
+    istream *IN = NULL;
+    if (!xmlin) {
+        if ( ifile.empty() )
+            IN = &cin;
+        else {
+            IN = new ifstream( ifile.c_str() );
+            if ( !IN || !IN->good() ){
+                cerr << "Error: problems opening inputfile " << ifile << endl;
+                cerr << "Courageously refusing to start..."  << endl;
+                exit(EXIT_FAILURE);
+            }
+        }
+      this->tokenize( *IN, *OUT );
+    } else {
+      folia::Document doc;
+      doc.readFromFile(ifile);
+      this->tokenize(doc);
+      *OUT << doc << endl;
+    }
+
+    if ( IN != &cin ) delete IN;
+    if ( OUT != &cout ) delete OUT;    
+  }
+
   void TokenizerClass::tokenize( istream& IN, ostream& OUT) {
     if (xmlout) {
       folia::Document doc = tokenize( IN );
diff --git a/src/ucto.cxx b/src/ucto.cxx
index dcc1f1d..d70c0d9 100644
--- a/src/ucto.cxx
+++ b/src/ucto.cxx
@@ -1,7 +1,7 @@
 /*
-  $Id: ucto.cxx 17839 2014-11-20 16:04:12Z sloot $
+  $Id: ucto.cxx 18006 2015-01-29 10:10:59Z sloot $
   $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/src/ucto.cxx $
-  Copyright (c) 1998 - 2014
+  Copyright (c) 1998 - 2015
   ILK  -  Tilburg University
   CNTS -  University of Antwerp
 
diff --git a/src/unicode.cxx b/src/unicode.cxx
index 696b951..4299047 100644
--- a/src/unicode.cxx
+++ b/src/unicode.cxx
@@ -1,7 +1,7 @@
 /*
-  $Id: unicode.cxx 16823 2014-01-06 10:21:09Z sloot $
+  $Id: unicode.cxx 18006 2015-01-29 10:10:59Z sloot $
   $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/src/unicode.cxx $
-  Copyright (c) 1998 - 2014
+  Copyright (c) 1998 - 2015
   ILK  -  Tilburg University
   CNTS -  University of Antwerp
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/ucto.git



More information about the debian-science-commits mailing list