[ucto] 01/01: Imported Upstream version 0.6.0
Joost van Baal
joostvb at moszumanska.debian.org
Sun Jul 5 04:54:20 UTC 2015
This is an automated email from the git hooks/post-receive script.
joostvb pushed a commit to annotated tag upstream/0.6.0
in repository ucto.
commit 95f44dba86f13495476e3ac404b580bc2f75be20
Author: Joost van Baal-Ilić <joostvb at nusku.mdcc.cx>
Date: Sun Jul 5 06:53:48 2015 +0200
Imported Upstream version 0.6.0
---
ChangeLog | 112 +++++++++++++++++++++++++++++++++++++++++++
NEWS | 10 ++++
config/nl_afk.abr | 1 +
configure | 38 +++++++--------
configure.ac | 6 +--
include/ucto/tokenize.h | 13 +++--
src/tokenize.cxx | 5 +-
src/ucto.cxx | 123 +++++++++++++++++++++++++++---------------------
8 files changed, 226 insertions(+), 82 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 7daad52..509f56c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,115 @@
+2014-09-22 10:16 sloot
+
+ * [r17687] src/ucto.cxx: typo
+
+2014-09-22 10:15 sloot
+
+ * [r17686] src/ucto.cxx: clearer option handling, better checks,
+ and fixed non-working -S option
+ (nobody seems to use it)
+
+2014-09-20 11:50 mvgompel
+
+ * [r17684] src/ucto.cxx: no more nasty segfault for ucto -h
+
+2014-09-18 14:09 sloot
+
+ * [r17682] src/tokenize.cxx: small edit to satisfy clang
+
+2014-09-16 14:28 sloot
+
+ * [r17674] NEWS: updated NEWS
+
+2014-09-15 15:07 sloot
+
+ * [r17662] include/ucto/tokenize.h, src/tokenize.cxx: removed ugly
+ hack
+
+2014-09-15 12:57 sloot
+
+ * [r17654] include/ucto/tokenize.h: added a useful function to the
+ API
+
+2014-09-15 09:13 sloot
+
+ * [r17651] include/ucto/tokenize.h: removed declared, but undefined
+ function
+
+2014-09-13 18:13 mvgompel
+
+ * [r17648] include/ucto/tokenize.h, src/tokenize.cxx: little
+ alternative function working with pointers, needed for
+ python-frog
+
+2014-08-25 09:46 sloot
+
+ * [r17549] configure.ac, src/ucto.cxx: use ticcutils 0.6
+
+2014-08-23 14:39 antalb
+
+ * [r17546] config/nl_afk.abr: added RK
+
+2014-08-21 15:23 sloot
+
+ * [r17544] src/ucto.cxx: switched to TiCC CommandLine stuff
+
+2014-08-14 10:00 sloot
+
+ * [r17513] src/tokenize.cxx: adapt to newer libfolia
+
+2014-08-06 13:41 sloot
+
+ * [r17484] include/ucto/tokenize.h, src/tokenize.cxx: generally
+ switched to UChar32 and Unicode codepoints.
+ removed some dead code too
+
+2014-08-06 13:11 sloot
+
+ * [r17483] src/tokenize.cxx: added EMOTICON recognition. Needed
+ some rework for 2 codepoint long characters
+
+2014-08-06 13:09 sloot
+
+ * [r17482] tests/smileys.nl.tok.V, tests/smileys.nl.txt,
+ tests/testfolia.ok, tests/testfolia2.ok: added Emoticons to the
+ smiley test.
+
+2014-07-15 15:10 sloot
+
+ * [r17453] src/tokenize.cxx: some code moved around
+
+2014-07-15 14:17 sloot
+
+ * [r17451] tests/test.en.tok.V, tests/testfolia.ok,
+ tests/testfolia2.ok: some tests are changed
+
+2014-07-03 14:50 sloot
+
+ * [r17418] src/tokenize.cxx: handle Note like Caption
+
+2014-06-30 08:22 mvgompel
+
+ * [r17393] src/tokenize.cxx: adapted to new libfolia
+
+2014-06-25 10:36 mvgompel
+
+ * [r17387] config/Makefile.am, config/pt.abr, config/tokconfig-pt:
+ Adicionei regras para tokenização portuguesa.
+
+2014-06-17 15:11 sloot
+
+ * [r17357] src/tokenize.cxx: avoid crash on empty inputfile
+
+2014-06-11 14:02 sloot
+
+ * [r17335] src/tst.sh: hmm
+
+2014-06-11 13:44 sloot
+
+ * [r17332] NEWS, configure.ac, include/ucto/tokenize.h, src/tst.sh:
+ NEWS
+ fixed 'make distcheck'
+
2014-05-22 15:45 sloot
* [r17245] COPYING: and reinstated
diff --git a/NEWS b/NEWS
index c3dd379..67b2768 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,15 @@
+0.6.0 2014-09-23
+[Ko van der Sloot]
+* release
+
0.5.5 2014-06-xx
* made getSentence() public
+* adapted to most recent libfolia (0.11 or above)
+* needs libticcutils 0.6 or above
+* uses TiCC::CommandLine
+* detect EMOTICONs
+* generally switched to UChar32 and Unicode codepoints. (avoid length() problems)
+* handle FoLiA Note like Caption
* a lot of bug fixes concerning FoLiA output (<t> nodes, textclass values etc.)
* again some changes around quotes
* improved tokenisation in differeny languages
diff --git a/config/nl_afk.abr b/config/nl_afk.abr
index 2238e3a..063ea6f 100644
--- a/config/nl_afk.abr
+++ b/config/nl_afk.abr
@@ -123,6 +123,7 @@ Prov
RED
Red
Rijkscomm
+RK
Rom
SEPT
Sept
diff --git a/configure b/configure
index 18af1a3..bd55878 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for ucto 0.5.5.
+# Generated by GNU Autoconf 2.69 for ucto 0.6.0.
#
# Report bugs to <timbl at uvt.nl>.
#
@@ -589,8 +589,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='ucto'
PACKAGE_TARNAME='ucto'
-PACKAGE_VERSION='0.5.5'
-PACKAGE_STRING='ucto 0.5.5'
+PACKAGE_VERSION='0.6.0'
+PACKAGE_STRING='ucto 0.6.0'
PACKAGE_BUGREPORT='timbl at uvt.nl'
PACKAGE_URL=''
@@ -1350,7 +1350,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures ucto 0.5.5 to adapt to many kinds of systems.
+\`configure' configures ucto 0.6.0 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1420,7 +1420,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of ucto 0.5.5:";;
+ short | recursive ) echo "Configuration of ucto 0.6.0:";;
esac
cat <<\_ACEOF
@@ -1549,7 +1549,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-ucto configure 0.5.5
+ucto configure 0.6.0
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2078,7 +2078,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by ucto $as_me 0.5.5, which was
+It was created by ucto $as_me 0.6.0, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2941,7 +2941,7 @@ fi
# Define the identity of the package.
PACKAGE='ucto'
- VERSION='0.5.5'
+ VERSION='0.6.0'
cat >>confdefs.h <<_ACEOF
@@ -15911,12 +15911,12 @@ if test -n "$ticcutils_CFLAGS"; then
pkg_cv_ticcutils_CFLAGS="$ticcutils_CFLAGS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.4 \""; } >&5
- ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.4 ") 2>&5
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.6 \""; } >&5
+ ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.6 ") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
- pkg_cv_ticcutils_CFLAGS=`$PKG_CONFIG --cflags "ticcutils >= 0.4 " 2>/dev/null`
+ pkg_cv_ticcutils_CFLAGS=`$PKG_CONFIG --cflags "ticcutils >= 0.6 " 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
@@ -15928,12 +15928,12 @@ if test -n "$ticcutils_LIBS"; then
pkg_cv_ticcutils_LIBS="$ticcutils_LIBS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.4 \""; } >&5
- ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.4 ") 2>&5
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.6 \""; } >&5
+ ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.6 ") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
- pkg_cv_ticcutils_LIBS=`$PKG_CONFIG --libs "ticcutils >= 0.4 " 2>/dev/null`
+ pkg_cv_ticcutils_LIBS=`$PKG_CONFIG --libs "ticcutils >= 0.6 " 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
@@ -15954,14 +15954,14 @@ else
_pkg_short_errors_supported=no
fi
if test $_pkg_short_errors_supported = yes; then
- ticcutils_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "ticcutils >= 0.4 " 2>&1`
+ ticcutils_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "ticcutils >= 0.6 " 2>&1`
else
- ticcutils_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "ticcutils >= 0.4 " 2>&1`
+ ticcutils_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "ticcutils >= 0.6 " 2>&1`
fi
# Put the nasty error message in config.log where it belongs
echo "$ticcutils_PKG_ERRORS" >&5
- as_fn_error $? "Package requirements (ticcutils >= 0.4 ) were not met:
+ as_fn_error $? "Package requirements (ticcutils >= 0.6 ) were not met:
$ticcutils_PKG_ERRORS
@@ -16535,7 +16535,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by ucto $as_me 0.5.5, which was
+This file was extended by ucto $as_me 0.6.0, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -16601,7 +16601,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-ucto config.status 0.5.5
+ucto config.status 0.6.0
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index 4299c5e..e504a6f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,10 +1,10 @@
# -*- Autoconf -*-
# Process this file with autoconf to produce a configure script.
-# $Id: configure.ac 17332 2014-06-11 13:44:28Z sloot $
+# $Id: configure.ac 17549 2014-08-25 09:46:17Z sloot $
# $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/configure.ac $
AC_PREREQ(2.59)
-AC_INIT([ucto], [0.5.5], [timbl at uvt.nl])
+AC_INIT([ucto], [0.6.0], [timbl at uvt.nl])
AM_INIT_AUTOMAKE
AC_CONFIG_SRCDIR([configure.ac])
AC_CONFIG_MACRO_DIR([m4])
@@ -99,7 +99,7 @@ AC_ARG_WITH(ticcutils,
[PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$withval/lib/pkgconfig"],
[PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$prefix/lib/pkgconfig"])
# AC_MSG_NOTICE( [pkg-config search path: $PKG_CONFIG_PATH] )
-PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.4] )
+PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.6] )
CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS"
LIBS="$LIBS $ticcutils_LIBS"
diff --git a/include/ucto/tokenize.h b/include/ucto/tokenize.h
index 3df3824..9d0c0e5 100644
--- a/include/ucto/tokenize.h
+++ b/include/ucto/tokenize.h
@@ -1,5 +1,5 @@
/*
- $Id: tokenize.h 17484 2014-08-06 13:41:23Z sloot $
+ $Id: tokenize.h 17662 2014-09-15 15:07:27Z sloot $
$URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/include/ucto/tokenize.h $
Copyright (c) 2006 - 2014
Tilburg University
@@ -148,7 +148,7 @@ namespace Tokenizer {
// Tokenize from input stream to FoLiA document
folia::Document tokenize( std::istream& );
-
+ //
// Tokenize a folia document
bool tokenize(folia::Document& );
@@ -235,7 +235,6 @@ namespace Tokenizer {
bool setSentencePerLineInput( bool b=true ) { bool t = sentenceperlineinput; sentenceperlineinput = b; return t; };
bool getSentencePerLineInput() const { return sentenceperlineinput; }
- std::string getDocID() const { return docid; }
bool getXMLOutput() const { return xmlout; }
bool getXMLInput() const { return xmlin; }
@@ -258,7 +257,13 @@ namespace Tokenizer {
return res;
}
- bool setXMLOutput( bool b, const std::string& id) { bool t = xmlout; docid = id; xmlout = b; return t; }
+ std::string getDocID() const { return docid; }
+ std::string setDocID( const std::string& id ) {
+ const std::string s = docid; docid = id; return s; }
+ bool setXMLOutput( bool b ) {
+ bool t = xmlout; xmlout = b; return t; }
+ bool setXMLOutput( bool b, const std::string& id ) {
+ setDocID( id ); return setXMLOutput(b); }
bool setXMLInput( bool b ) { bool t = xmlin; xmlin = b; return t; }
void outputTokens( std::ostream&, const std::vector<Token>& ) const;
diff --git a/src/tokenize.cxx b/src/tokenize.cxx
index 188151a..cc1ac75 100644
--- a/src/tokenize.cxx
+++ b/src/tokenize.cxx
@@ -1,5 +1,5 @@
/*
- $Id: tokenize.cxx 17513 2014-08-14 10:00:05Z sloot $
+ $Id: tokenize.cxx 17682 2014-09-18 14:09:27Z sloot $
$URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/src/tokenize.cxx $
Copyright (c) 2006 - 2014
Tilburg University
@@ -484,7 +484,6 @@ namespace Tokenizer {
for ( size_t i = 0; i < element->size(); i++) {
tokenizeElement( element->index(i));
}
- // appendText( element, outputclass );
return;
}
@@ -1014,7 +1013,7 @@ namespace Tokenizer {
quotes.push( i, c );
}
}
- else if ((c == '\'') ) {
+ else if ( c == '\'' ) {
if (tokDebug > 1 )
*Log(theErrLog) << "[detectQuoteBounds] Standard single-quote (ambiguous) found @i="<< i << endl;
if (!resolveQuote(i,c)) {
diff --git a/src/ucto.cxx b/src/ucto.cxx
index ff51f1a..2c67771 100644
--- a/src/ucto.cxx
+++ b/src/ucto.cxx
@@ -1,5 +1,5 @@
/*
- $Id: ucto.cxx 16823 2014-01-06 10:21:09Z sloot $
+ $Id: ucto.cxx 17687 2014-09-22 10:16:56Z sloot $
$URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/src/ucto.cxx $
Copyright (c) 1998 - 2014
ILK - Tilburg University
@@ -27,11 +27,11 @@
*/
#include <cstdlib>
#include <cstring>
-#include <getopt.h>
#include <string>
#include <iostream>
#include <fstream>
#include "libfolia/document.h"
+#include "ticcutils/CommandLine.h"
#include "ucto/tokenize.h"
#include <unistd.h>
@@ -93,59 +93,79 @@ int main( int argc, char *argv[] ){
string ofile;
string c_file;
string L_file;
-
- static struct option longOpts[] = { { "passthru", 0, 0, 1 },
- { "id", 1, 0, 2 },
- { "textclass", 1, 0, 3 },
- { "inputclass", 1, 0, 4 },
- { "outputclass", 1, 0, 5 },
- { 0,0,0,0} };
-
- int opt;
- int longOpt;
bool passThru = false;
+
try {
- while ((opt = getopt_long( argc, argv,
- "d:e:fhlPQunmN:vVSL:c:s:x:FX",
- longOpts, &longOpt )) != -1) {
- switch (opt)
- {
- case 1: passThru = true; break;
- case 'd': debug = stringTo<int>(optarg); break;
- case 'e': inputEncoding = optarg; break;
- case 'f': dofiltering = false; break;
- case 'F': xmlin = true; break;
- case 'P': paragraphdetection = false; break;
- case 'Q': quotedetection = true; break;
- case 'c': c_file = optarg; break;
- case 's': eosmarker = optarg; break;
- case 'S': splitsentences = false; break;
- case 'L': L_file = string("tokconfig-") + string(optarg); break;
- case 'u': touppercase = true; break;
- case 'l': tolowercase = true; break;
- case 'n': sentenceperlineoutput = true; break;
- case 'm': sentenceperlineinput = true; break;
- case 'N': normalization = optarg; break;
- case 'v': verbose = true; break;
- case 'V': cout << "Ucto - Unicode Tokenizer - version " << Version() << endl << "(c) ILK 2009 - 2014, Induction of Linguistic Knowledge Research Group, Tilburg University" << endl << "Licensed under the GNU General Public License v3" << endl;
- cout << "based on [" << folia::VersionName() << "]" << endl;
- return EXIT_SUCCESS;
- case 'x': xmlout = true; docid = optarg; break;
- case 'X': xmlout = true; break;
- case 2: docid = optarg; break;
- case 3: inputclass = optarg;
- cerr << "--textclass is deprecated!. use --inputclass instead!" << endl;
- break;
- case 4: inputclass = optarg; break;
- case 5: outputclass = optarg; break;
- default: usage(); return EXIT_SUCCESS;
- }
+ TiCC::CL_Options Opts( "d:e:fhlPQunmN:vVSL:c:s:x:FX",
+ "passthru,textclass:,inputclass:,outputclass:,id:");
+ Opts.init(argc, argv );
+ if ( Opts.extract( 'h' ) ){
+ usage();
+ return EXIT_SUCCESS;
+ }
+ if ( Opts.extract( 'V' ) ){
+ cout << "Ucto - Unicode Tokenizer - version " << Version() << endl
+ << "(c) ILK 2009 - 2014, Induction of Linguistic Knowledge Research Group, Tilburg University" << endl
+ << "Licensed under the GNU General Public License v3" << endl;
+ cout << "based on [" << folia::VersionName() << "]" << endl;
+ return EXIT_SUCCESS;
+ }
+ Opts.extract('e', inputEncoding );
+ dofiltering = !Opts.extract( 'f' );
+ paragraphdetection = !Opts.extract( 'P' );
+ splitsentences = !Opts.extract( 'S' );
+ xmlin = Opts.extract( 'F' );
+ quotedetection = Opts.extract( 'Q' );
+ Opts.extract( 'c', c_file );
+ Opts.extract( 's', eosmarker );
+ touppercase = Opts.extract( 'u' );
+ tolowercase = Opts.extract( 'l' );
+ sentenceperlineoutput = Opts.extract( 'n' );
+ sentenceperlineinput = Opts.extract( 'm' );
+ Opts.extract( 'N', normalization );
+ verbose = Opts.extract( 'v' );
+ if ( Opts.extract( 'x', docid ) ){
+ xmlout = true;
+ if ( Opts.is_present( 'X' ) ){
+ throw TiCC::OptionError( "conflicting options -x and -X" );
+ }
+ if ( Opts.is_present( "id" ) ){
+ throw TiCC::OptionError( "conflicting options -x and --id" );
+ }
+ }
+ else {
+ xmlout = Opts.extract( 'X' );
+ Opts.extract( "id", docid );
+ }
+ passThru = Opts.extract( "passthru" );
+ Opts.extract( "textclass", inputclass );
+ Opts.extract( "inputclass", inputclass );
+ Opts.extract( "outputclass", outputclass );
+ string value;
+ if ( Opts.extract('d', value ) ){
+ if ( !TiCC::stringTo(value,debug) ){
+ throw TiCC::OptionError( "invalid value for -d: " + value );
+ }
+ }
+ if ( Opts.extract('L', value ) ){
+ L_file = string("tokconfig-") + string(value);
+ }
+ if ( !Opts.empty() ){
+ string tomany = Opts.toString();
+ throw TiCC::OptionError( "unhandled option(s): " + tomany );
+ }
+ vector<string> files = Opts.getMassOpts();
+ if ( files.size() > 0 ){
+ ifile = files[0];
+ }
+ if ( files.size() > 1 ){
+ ofile = files[1];
}
}
- catch ( exception &e ){
- cerr << "Error in option '" << char(opt) << "' : " << e.what() << endl << endl;
+ catch( const TiCC::OptionError& e ){
+ cerr << "ucto: " << e.what() << endl;
usage();
- return EXIT_FAILURE;
+ exit(EXIT_FAILURE);
}
if ( !passThru ){
@@ -165,9 +185,6 @@ int main( int argc, char *argv[] ){
}
}
- if (optind < argc) ifile = argv[optind++];
- if (optind < argc) ofile = argv[optind++];
-
if ((!ifile.empty()) && (ifile == ofile)) {
cerr << "Error: Output file equals input file! Courageously refusing to start..." << endl;
return EXIT_FAILURE;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/ucto.git
More information about the debian-science-commits
mailing list