[apertium] 01/02: Imported Upstream version 3.4.2~r68466
Tino Didriksen
tinodidriksen-guest at moszumanska.debian.org
Sat Jun 4 19:11:49 UTC 2016
This is an automated email from the git hooks/post-receive script.
tinodidriksen-guest pushed a commit to branch master
in repository apertium.
commit 31a02c7becb12f7c0323dba2247baa69c28b35b6
Author: Tino Didriksen <tino at didriksen.cc>
Date: Sat Jun 4 19:10:44 2016 +0000
Imported Upstream version 3.4.2~r68466
---
CMakeLists.txt | 4 +-
COPYING | 41 +-
COPYING.hunalign | 20 +-
NEWS | 27 +-
apertium.m4 | 14 +-
apertium.pc.in | 2 +-
apertium/Makefile.am | 283 ++++--
apertium/a.cc | 50 +
apertium/a.h | 37 +
apertium/align.cc | 56 ++
apertium/align.h | 35 +
apertium/analysis.cc | 55 ++
apertium/analysis.h | 37 +
apertium/apertium-createmodes.awk | 37 +-
apertium/apertium-desmediawiki.1 | 6 +-
apertium/apertium-gen-modes.in | 85 ++
apertium/apertium-header.sh | 6 +-
apertium/apertium-multiple-translations.cc | 4 +-
apertium/apertium-prelatex.l | 2 +-
apertium/apertium-transfer.1 | 3 +
apertium/apertium-unformat.1 | 2 +-
apertium/apertium.1 | 2 +-
apertium/apertium_filter_ambiguity.cc | 4 +-
apertium/apertium_gen_wlist_lextor_translation.cc | 8 +-
apertium/apertium_interchunk.cc | 4 +-
apertium/apertium_lextor.cc | 8 +-
apertium/apertium_lextor_eval.cc | 8 +-
apertium/apertium_postchunk.cc | 4 +-
apertium/apertium_pretransfer.cc | 4 +-
apertium/apertium_re.cc | 7 +-
apertium/apertium_re.h | 4 +-
apertium/apertium_tagger.cc | 1082 ++++++++++++++++++++-
apertium/apertium_tagger.h | 105 ++
apertium/apertium_tagger_apply_new_rules.cc | 14 +-
apertium/apertium_tagger_readwords.cc | 8 +-
apertium/apertium_tmxbuild.cc | 4 +-
apertium/apertium_transfer.cc | 15 +-
apertium/basic_5_3_1_tagger.cc | 20 +
apertium/basic_5_3_1_tagger.h | 32 +
apertium/basic_5_3_2_tagger.cc | 20 +
apertium/basic_5_3_2_tagger.h | 33 +
apertium/basic_5_3_3_tagger.h | 35 +
apertium/basic_exception_type.cc | 20 +
apertium/basic_exception_type.h | 29 +
apertium/basic_stream_tagger.cc | 125 +++
apertium/basic_stream_tagger.h | 56 ++
apertium/basic_stream_tagger_trainer.cc | 59 ++
apertium/basic_stream_tagger_trainer.h | 41 +
apertium/basic_tagger.cc | 48 +
apertium/basic_tagger.h | 60 ++
apertium/collection.cc | 4 +-
apertium/collection.h | 4 +-
apertium/constant_manager.cc | 4 +-
apertium/constant_manager.h | 4 +-
apertium/constructor_eq_delete.h | 32 +
apertium/deformat.xsl | 4 +-
apertium/deserialiser.h | 255 +++++
apertium/endian_double_util.cc | 4 +-
apertium/endian_double_util.h | 4 +-
apertium/err_exception.h | 23 +
apertium/exception.h | 95 ++
apertium/exception_type.cc | 32 +
apertium/exception_type.h | 38 +
apertium/file_tagger.cc | 42 +
apertium/file_tagger.h | 52 +
apertium/format.dtd | 4 +-
apertium/format.rnc | 111 +++
apertium/format.rng | 303 ++++++
apertium/hmm.cc | 356 +++----
apertium/hmm.h | 53 +-
apertium/i.cc | 50 +
apertium/i.h | 38 +
apertium/interchunk.cc | 62 +-
apertium/interchunk.dtd | 4 +-
apertium/interchunk.h | 8 +-
apertium/interchunk.rnc | 353 +++++++
apertium/{transfer.rng => interchunk.rng} | 157 +--
apertium/interchunk_word.cc | 4 +-
apertium/interchunk_word.h | 4 +-
apertium/latex_accentsmap.cc | 4 +-
apertium/latex_accentsmap.h | 4 +-
apertium/lemma.cc | 55 ++
apertium/lemma.h | 36 +
apertium/lexchoice.xsl | 4 +-
apertium/lexchoicebil.xsl | 4 +-
apertium/lexical_unit.h | 32 +
apertium/lextor.cc | 24 +-
apertium/lextor.h | 4 +-
apertium/lextor_data.cc | 4 +-
apertium/lextor_data.h | 4 +-
apertium/lextor_eval.cc | 4 +-
apertium/lextor_eval.h | 4 +-
apertium/lextor_word.cc | 4 +-
apertium/lextor_word.h | 4 +-
apertium/linebreak.cc | 94 ++
apertium/linebreak.h | 36 +
apertium/lswpost.cc | 297 ++----
apertium/lswpost.h | 49 +-
apertium/modes-header.sh | 32 -
apertium/modes.dtd | 21 +-
apertium/modes.rnc | 30 +
apertium/modes.rng | 19 +-
apertium/modes2bash.xsl | 81 +-
apertium/modes2debugmodes.xsl | 162 +++
apertium/morpheme.cc | 57 ++
apertium/morpheme.h | 35 +
apertium/morpho_stream.cc | 4 +-
apertium/morpho_stream.h | 4 +-
apertium/new2old.xsl | 4 +-
apertium/optional.h | 123 +++
apertium/postchunk.cc | 61 +-
apertium/postchunk.dtd | 4 +-
apertium/postchunk.h | 8 +-
apertium/postchunk.rnc | 348 +++++++
apertium/{transfer.rng => postchunk.rng} | 205 ++--
apertium/reformat.xsl | 4 +-
apertium/serialiser.h | 284 ++++++
apertium/stream.cc | 774 +++++++++++++++
apertium/stream.h | 69 ++
apertium/stream_5_3_1_tagger.cc | 68 ++
apertium/stream_5_3_1_tagger.h | 53 +
apertium/stream_5_3_1_tagger_trainer.cc | 51 +
apertium/stream_5_3_1_tagger_trainer.h | 41 +
apertium/stream_5_3_2_tagger.cc | 104 ++
apertium/stream_5_3_2_tagger.h | 55 ++
apertium/stream_5_3_2_tagger_trainer.cc | 56 ++
apertium/stream_5_3_2_tagger_trainer.h | 38 +
apertium/stream_5_3_3_tagger.cc | 223 +++++
apertium/stream_5_3_3_tagger.h | 62 ++
apertium/stream_5_3_3_tagger_trainer.cc | 92 ++
apertium/stream_5_3_3_tagger_trainer.h | 39 +
apertium/streamed_type.h | 32 +
apertium/string_utils.cc | 4 +-
apertium/string_utils.h | 4 +-
apertium/tag.cc | 34 +
apertium/tag.h | 31 +
apertium/tagger.cc | 763 ---------------
apertium/tagger.dtd | 4 +-
apertium/tagger.h | 83 --
apertium/tagger.rnc | 122 +++
apertium/tagger.rng | 310 ++++++
apertium/tagger_data.cc | 4 +-
apertium/tagger_data.h | 4 +-
apertium/tagger_data_hmm.cc | 6 +-
apertium/tagger_data_hmm.h | 4 +-
apertium/tagger_data_lsw.cc | 4 +-
apertium/tagger_data_lsw.h | 4 +-
apertium/tagger_utils.cc | 104 +-
apertium/tagger_utils.h | 29 +-
apertium/tagger_word.cc | 8 +-
apertium/tagger_word.h | 4 +-
apertium/tmx_align_parameters.h | 3 +-
apertium/tmx_alignment.cc | 2 +-
apertium/tmx_builder.cc | 16 +-
apertium/tmx_builder.h | 4 +-
apertium/tmx_translate.cc | 4 +-
apertium/transfer.cc | 467 +++++----
apertium/transfer.dtd | 16 +-
apertium/transfer.h | 24 +-
apertium/transfer.rnc | 407 ++++++++
apertium/transfer.rng | 56 +-
apertium/transfer_data.cc | 4 +-
apertium/transfer_data.h | 4 +-
apertium/transfer_instr.cc | 4 +-
apertium/transfer_instr.h | 11 +-
apertium/transfer_mult.cc | 33 +-
apertium/transfer_mult.h | 7 +-
apertium/transfer_token.cc | 7 +-
apertium/transfer_token.h | 4 +-
apertium/transfer_word.cc | 7 +-
apertium/transfer_word.h | 4 +-
apertium/transfer_word_list.cc | 4 +-
apertium/transfer_word_list.h | 4 +-
apertium/transferpp.cc | 4 +-
apertium/trx_reader.cc | 29 +-
apertium/trx_reader.h | 7 +-
apertium/tsx_reader.cc | 8 +-
apertium/tsx_reader.h | 12 +-
apertium/ttag.h | 4 +-
apertium/unlocked_cstdio.h | 10 +-
apertium/utf_converter.cc | 4 +-
apertium/utf_converter.h | 4 +-
apertium/wchar_t_exception.h | 53 +
apertium/wchar_t_exception_type.cc | 90 ++
apertium/wchar_t_exception_type.h | 45 +
configure.ac | 77 +-
186 files changed, 9106 insertions(+), 2538 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e65295e..4e93b25 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,9 +62,9 @@ FIND_PACKAGE (LibPcre REQUIRED)
SET (PACKAGE_BUGREPORT sortiz@users.sourceforge.net)
SET (PACKAGE_NAME apertium/apertium.h)
-SET (PACKAGE_STRING apertium/apertium.h 3.4.0)
+SET (PACKAGE_STRING apertium/apertium.h 3.4.2)
SET (PACKAGE_TARNAME lttoolbox-lttoolbox-h)
-SET (PACKAGE_VERSION 3.4.0)
+SET (PACKAGE_VERSION 3.4.2)
MACRO (BOOL_CHECK_SYMBOL_EXISTS SYMBOL HEADER VAR)
CHECK_SYMBOL_EXISTS (${SYMBOL} ${HEADER} ${VAR})
diff --git a/COPYING b/COPYING
index 623b625..d159169 100644
--- a/COPYING
+++ b/COPYING
@@ -1,12 +1,12 @@
- GNU GENERAL PUBLIC LICENSE
- Version 2, June 1991
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
- Copyright (C) 1989, 1991 Free Software Foundation, Inc.
- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
- Preamble
+ Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
@@ -15,7 +15,7 @@ software--to make sure the software is free for all its users. This
General Public License applies to most of the Free Software
Foundation's software and to any other program whose authors commit to
using it. (Some other Free Software Foundation software is covered by
-the GNU Library General Public License instead.) You can apply it to
+the GNU Lesser General Public License instead.) You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
@@ -55,8 +55,8 @@ patent must be licensed for everyone's free use or not licensed at all.
The precise terms and conditions for copying, distribution and
modification follow.
-
- GNU GENERAL PUBLIC LICENSE
+
+ GNU GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License applies to any program or other work which contains
@@ -110,7 +110,7 @@ above, provided that you also meet all of these conditions:
License. (Exception: if the Program itself is interactive but
does not normally print such an announcement, your work based on
the Program is not required to print an announcement.)
-
+
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Program,
and can be reasonably considered independent and separate works in
@@ -168,7 +168,7 @@ access to copy from a designated place, then offering equivalent
access to copy the source code from the same place counts as
distribution of the source code, even though third parties are not
compelled to copy the source along with the object code.
-
+
4. You may not copy, modify, sublicense, or distribute the Program
except as expressly provided under this License. Any attempt
otherwise to copy, modify, sublicense or distribute the Program is
@@ -225,7 +225,7 @@ impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
-
+
8. If the distribution and/or use of the Program is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Program under this License
@@ -255,7 +255,7 @@ make exceptions for this. Our decision will be guided by the two goals
of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.
- NO WARRANTY
+ NO WARRANTY
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
@@ -277,9 +277,9 @@ YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.
- END OF TERMS AND CONDITIONS
-
- How to Apply These Terms to Your New Programs
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
@@ -303,17 +303,16 @@ the "copyright" line and a pointer to where the full notice is found.
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
Also add information on how to contact you by electronic and paper mail.
If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:
- Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision version 69, Copyright (C) year name of author
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
@@ -336,5 +335,5 @@ necessary. Here is a sample; alter the names:
This General Public License does not permit incorporating your program into
proprietary programs. If your program is a subroutine library, you may
consider it more useful to permit linking proprietary applications with the
-library. If this is what you want to do, use the GNU Library General
+library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License.
diff --git a/COPYING.hunalign b/COPYING.hunalign
index b1e3f5a..4362b49 100644
--- a/COPYING.hunalign
+++ b/COPYING.hunalign
@@ -1,8 +1,8 @@
- GNU LESSER GENERAL PUBLIC LICENSE
- Version 2.1, February 1999
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 2.1, February 1999
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
- 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
@@ -10,7 +10,7 @@
as the successor of the GNU Library Public License, version 2, hence
the version number 2.1.]
- Preamble
+ Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
@@ -112,7 +112,7 @@ modification follow. Pay close attention to the difference between a
former contains code derived from the library, whereas the latter must
be combined with the library in order to run.
- GNU LESSER GENERAL PUBLIC LICENSE
+ GNU LESSER GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License Agreement applies to any software library or other
@@ -146,7 +146,7 @@ such a program is covered only if its contents constitute a work based
on the Library (independent of the use of the Library in a tool for
writing it). Whether that is true depends on what the Library does
and what the program that uses the Library does.
-
+
1. You may copy and distribute verbatim copies of the Library's
complete source code as you receive it, in any medium, provided that
you conspicuously and appropriately publish on each copy an
@@ -432,7 +432,7 @@ decision will be guided by the two goals of preserving the free status
of all derivatives of our free software and of promoting the sharing
and reuse of software generally.
- NO WARRANTY
+ NO WARRANTY
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
@@ -455,7 +455,7 @@ FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGES.
- END OF TERMS AND CONDITIONS
+ END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Libraries
@@ -485,7 +485,7 @@ convey the exclusion of warranty; and each file should have at least the
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Also add information on how to contact you by electronic and paper mail.
@@ -500,5 +500,3 @@ necessary. Here is a sample; alter the names:
Ty Coon, President of Vice
That's all there is to it!
-
-
diff --git a/NEWS b/NEWS
index fd2d8ae..5edac6a 100644
--- a/NEWS
+++ b/NEWS
@@ -6,12 +6,37 @@
SVN
---
+Version 3.4.2, 2016-05-15 (-r68437)
+---------------------------------
+
+* some bugfixes to apertium-tagger, e.g.
+ https://sourceforge.net/p/apertium/tickets/94/
+
+* bugfixes to modes: now accept dirs with spaces, and allow installing apertium
+ itself and language data to different prefixes, as well as auto-generating
+ debug modes
+
* fix a crash when apertium-tagger is compiled with with clang
* new option -n to deformatters turns off dot-insertion
+ http://sourceforge.net/p/apertium/tickets/68
+
+* new transfer instruction <reject-current-rule shifting="yes|no"/>;
+ see transfer.dtd for details (not implemented for
+ interchunk/postchunk)
+
+* apertium-transfer-tools-generalisation-dev branch merged
+
+* apertium-tagger: supervised training and tagging for unigram models
+ based on http://coltekin.net/cagri/papers/trmorph-tools.pdf
+
+* fix some off-by-one/out-of-bounds segfaults in transfer
+ https://sourceforge.net/p/apertium/tickets/89/
+* various distribution-related fixes, static analysis fixes,
+ documentation
-Version 3.4, 2015-03-17 (-r59200)
+Version 3.4.0, 2015-03-17 (-r59200)
---------------------------------
* transfer files now work even if they were compiled with a different
diff --git a/apertium.m4 b/apertium.m4
index 6fa79b2..5bfc562 100644
--- a/apertium.m4
+++ b/apertium.m4
@@ -1,7 +1,7 @@
# apertium.m4 - Macros to locate and utilise apertium libraries -*- Autoconf -*-
-# serial 1 (apertium-3.4.0)
+# serial 1 (apertium-3.4.2)
#
-# Copyright (C) 2013--2015 Universitat d'Alacant / Universidad de Alicante
+# Copyright (C) 2013--2016 Universitat d'Alacant / Universidad de Alicante
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
@@ -14,9 +14,7 @@
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
-# 02111-1307, USA.
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
# AP_CHECK_LING([ID], [MONOLINGUAL_PACKAGE])
@@ -106,7 +104,7 @@ modes/%.mode: modes.xml
apertium_modesdir=\$(prefix)/share/apertium/modes/
install-modes:
mv modes modes.bak
- apertium-gen-modes modes.xml \$(BASENAME)
+ apertium-gen-modes -f modes.xml \$(prefix)/share/apertium/\$(BASENAME)
rm -rf modes
mv modes.bak modes
test -d \$(DESTDIR)\$(apertium_modesdir) || mkdir \$(DESTDIR)\$(apertium_modesdir)
@@ -149,8 +147,8 @@ langs:
hfst-concatenate -1 .deps/\@S|@*.autobil.upper -2 .deps/\@S|@*.any-nonplus.hfst -o .deps/\@S|@*.autobil.nonplussed # bidix [^+]*
echo ' %+ ' | hfst-regexp2fst > .deps/\@S|@*.single-plus.hfst # +
hfst-concatenate -1 .deps/\@S|@*.single-plus.hfst -2 .deps/\@S|@*.autobil.nonplussed -o .deps/\@S|@*.autobil.postplus # + bidix [^+]*
- hfst-repeat -f0 -t1 -i .deps/\@S|@*.autobil.postplus -o .deps/\@S|@*.autobil.postplus.0,1 # (+ bidix [^+]*){0,1} -- gives at most one +
- hfst-concatenate -1 .deps/\@S|@*.autobil.nonplussed -2 .deps/\@S|@*.autobil.postplus.0,1 -o \@S|@@ # bidix [^+]* (+ bidix [^+]*){0,1}
+ hfst-repeat -f0 -t3 -i .deps/\@S|@*.autobil.postplus -o .deps/\@S|@*.autobil.postplus.0,3 # (+ bidix [^+]*){0,3} -- gives at most three +
+ hfst-concatenate -1 .deps/\@S|@*.autobil.nonplussed -2 .deps/\@S|@*.autobil.postplus.0,3 -o \@S|@@ # bidix [^+]* (+ bidix [^+]*){0,3}
EOF
diff --git a/apertium.pc.in b/apertium.pc.in
index c60ec75..4812b86 100644
--- a/apertium.pc.in
+++ b/apertium.pc.in
@@ -4,7 +4,7 @@ libdir=@libdir@
includedir=@includedir@
Name: apertium
-Description: lttoolbox-based translation modules generator
+Description: rule-based machine translation system
Version: @VERSION@
Libs: -L${libdir} -l@GENERIC_LIBRARY_NAME@@GENERIC_MAJOR_VERSION@ @APERTIUM_LIBS@
Cflags: -I${includedir}/@GENERIC_LIBRARY_NAME@-@GENERIC_API_VERSION@ -I${libdir}/@GENERIC_LIBRARY_NAME@-@GENERIC_API_VERSION@/include @APERTIUM_CFLAGS@
diff --git a/apertium/Makefile.am b/apertium/Makefile.am
index 6fda448..939826a 100644
--- a/apertium/Makefile.am
+++ b/apertium/Makefile.am
@@ -1,29 +1,155 @@
-h_sources = collection.h constant_manager.h hmm.h lswpost.h interchunk.h \
- interchunk_word.h lextor_data.h lextor_eval.h lextor.h \
- lextor_word.h morpho_stream.h postchunk.h string_utils.h \
- tagger_data.h tagger_data_hmm.h tagger_data_lsw.h tagger.h tagger_utils.h tagger_word.h \
- transfer_data.h transfer.h transfer_instr.h transfer_token.h \
- transfer_word.h transfer_word_list.h trx_reader.h tsx_reader.h \
- ttag.h utf_converter.h apertium_re.h unlocked_cstdio.h \
- endian_double_util.h transfer_mult.h tmx_builder.h \
- tmx_alignment.h tmx_arguments_parser.h tmx_book_to_matrix.h \
- tmx_dictionary.h tmx_dic_tree.h tmx_quasi_diagonal.h \
- tmx_serialize_impl.h tmx_strings_and_streams.h \
- tmx_trail_postprocessors.h tmx_translate.h tmx_words.h \
- tmx_align_parameters.h tmx_aligner_tool.h latex_accentsmap.h
-
-cc_sources = collection.cc constant_manager.cc hmm.cc lswpost.cc interchunk.cc \
- interchunk_word.cc lextor.cc lextor_data.cc lextor_eval.cc \
- lextor_word.cc morpho_stream.cc postchunk.cc string_utils.cc \
- tagger.cc tagger_data.cc tagger_data_hmm.cc tagger_data_lsw.cc tagger_utils.cc tagger_word.cc \
- transfer.cc transfer_data.cc transfer_instr.cc \
- transfer_token.cc transfer_word.cc transfer_word_list.cc \
- trx_reader.cc tsx_reader.cc utf_converter.cc apertium_re.cc \
- endian_double_util.cc transfer_mult.cc tmx_builder.cc \
- tmx_aligner_tool.cc tmx_alignment.cc tmx_arguments_parser.cc \
- tmx_book_to_matrix.cc tmx_dictionary.cc \
- tmx_strings_and_streams.cc tmx_trail_postprocessors.cc \
- tmx_translate.cc latex_accentsmap.cc
+AUTOMAKE_OPTIONS = subdir-objects
+
+h_sources = a.h \
+ align.h \
+ analysis.h \
+ apertium_re.h \
+ apertium_tagger.h \
+ basic_5_3_1_tagger.h \
+ basic_5_3_2_tagger.h \
+ basic_5_3_3_tagger.h \
+ basic_exception_type.h \
+ basic_stream_tagger.h \
+ basic_stream_tagger_trainer.h \
+ basic_tagger.h \
+ collection.h \
+ constant_manager.h \
+ constructor_eq_delete.h \
+ deserialiser.h \
+ endian_double_util.h \
+ err_exception.h \
+ exception.h \
+ exception_type.h \
+ file_tagger.h \
+ hmm.h \
+ i.h \
+ interchunk.h \
+ interchunk_word.h \
+ latex_accentsmap.h \
+ lemma.h \
+ lexical_unit.h \
+ linebreak.h \
+ lswpost.h \
+ morpheme.h \
+ morpho_stream.h \
+ optional.h \
+ postchunk.h \
+ serialiser.h \
+ stream.h \
+ stream_5_3_1_tagger.h \
+ stream_5_3_2_tagger.h \
+ stream_5_3_3_tagger.h \
+ stream_5_3_1_tagger_trainer.h \
+ stream_5_3_2_tagger_trainer.h \
+ stream_5_3_3_tagger_trainer.h \
+ streamed_type.h \
+ string_utils.h \
+ tag.h \
+ tagger_data.h \
+ tagger_data_hmm.h \
+ tagger_data_lsw.h \
+ tagger_utils.h \
+ tagger_word.h \
+ tmx_aligner_tool.h \
+ tmx_alignment.h \
+ tmx_align_parameters.h \
+ tmx_arguments_parser.h \
+ tmx_book_to_matrix.h \
+ tmx_builder.h \
+ tmx_dictionary.h \
+ tmx_dic_tree.h \
+ tmx_quasi_diagonal.h \
+ tmx_serialize_impl.h \
+ tmx_strings_and_streams.h \
+ tmx_trail_postprocessors.h \
+ tmx_translate.h \
+ tmx_words.h \
+ transfer_data.h \
+ transfer.h \
+ transfer_instr.h \
+ transfer_mult.h \
+ transfer_token.h \
+ transfer_word.h \
+ transfer_word_list.h \
+ trx_reader.h \
+ tsx_reader.h \
+ ttag.h \
+ unlocked_cstdio.h \
+ utf_converter.h \
+ wchar_t_exception.h \
+ wchar_t_exception_type.h
+
+#DEPR.:
+# lextor_data.h
+# lextor_eval.h
+# lextor.h
+# lextor_word.h
+
+cc_sources = a.cc \
+ align.cc \
+ analysis.cc \
+ apertium_re.cc \
+ basic_5_3_1_tagger.cc \
+ basic_5_3_2_tagger.cc \
+ basic_exception_type.cc \
+ basic_stream_tagger.cc \
+ basic_stream_tagger_trainer.cc \
+ basic_tagger.cc \
+ collection.cc \
+ constant_manager.cc \
+ endian_double_util.cc \
+ exception_type.cc \
+ file_tagger.cc \
+ hmm.cc \
+ i.cc \
+ interchunk.cc \
+ interchunk_word.cc \
+ latex_accentsmap.cc \
+ lemma.cc \
+ linebreak.cc \
+ lswpost.cc \
+ morpheme.cc \
+ morpho_stream.cc \
+ postchunk.cc \
+ stream.cc \
+ stream_5_3_1_tagger.cc \
+ stream_5_3_2_tagger.cc \
+ stream_5_3_3_tagger.cc \
+ stream_5_3_1_tagger_trainer.cc \
+ stream_5_3_2_tagger_trainer.cc \
+ stream_5_3_3_tagger_trainer.cc \
+ string_utils.cc \
+ tag.cc \
+ tagger_data.cc \
+ tagger_data_hmm.cc \
+ tagger_data_lsw.cc \
+ tagger_utils.cc \
+ tagger_word.cc \
+ tmx_aligner_tool.cc \
+ tmx_alignment.cc \
+ tmx_arguments_parser.cc \
+ tmx_book_to_matrix.cc \
+ tmx_builder.cc \
+ tmx_dictionary.cc \
+ tmx_strings_and_streams.cc \
+ tmx_trail_postprocessors.cc \
+ tmx_translate.cc \
+ transfer.cc \
+ transfer_data.cc \
+ transfer_instr.cc \
+ transfer_mult.cc \
+ transfer_token.cc \
+ transfer_word.cc \
+ transfer_word_list.cc \
+ trx_reader.cc \
+ tsx_reader.cc \
+ utf_converter.cc \
+ wchar_t_exception_type.cc
+#DEPR.:
+# lextor.cc
+# lextor_data.cc
+# lextor_eval.cc
+# lextor_word.cc
library_includedir = $(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME)
library_include_HEADERS = $(h_sources)
@@ -46,30 +172,41 @@ lib_LTLIBRARIES = libapertium3.la
libapertium3_la_SOURCES = $(h_sources) $(cc_sources)
libapertium3_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION) -release $(GENERIC_RELEASE)
-bin_PROGRAMS = apertium-pretransfer apertium-destxt apertium-retxt \
- apertium-deshtml apertium-rehtml \
- apertium-rehtml-noent \
- apertium-desxpresstag apertium-rexpresstag \
- apertium-desodt apertium-reodt \
- apertium-desrtf apertium-rertf \
- apertium-deswxml apertium-rewxml \
- apertium-deslatex apertium-relatex \
- apertium-desxlsx apertium-rexlsx \
- apertium-despptx apertium-repptx \
- apertium-desmediawiki apertium-remediawiki \
- apertium-prelatex apertium-postlatex \
- apertium-postlatex-raw \
- apertium-tagger \
- apertium-preprocess-transfer apertium-transfer \
- apertium-filter-ambiguity \
- apertium-interchunk apertium-postchunk \
- apertium-multiple-translations \
- apertium-tagger-apply-new-rules \
- apertium-tagger-readwords \
- apertium-tmxbuild
- ###pruebas-lextor apertium-lextor-search apertium-lextor-eval
- ###apertium-gen-wlist-lextor-translation
- ###apertium-lextor
+bin_PROGRAMS = apertium-deshtml \
+ apertium-deslatex \
+ apertium-desmediawiki \
+ apertium-desodt \
+ apertium-despptx \
+ apertium-desrtf \
+ apertium-destxt \
+ apertium-deswxml \
+ apertium-desxlsx \
+ apertium-desxpresstag \
+ apertium-filter-ambiguity \
+ apertium-interchunk \
+ apertium-multiple-translations \
+ apertium-postchunk \
+ apertium-postlatex \
+ apertium-postlatex-raw \
+ apertium-prelatex \
+ apertium-preprocess-transfer \
+ apertium-pretransfer \
+ apertium-rehtml \
+ apertium-rehtml-noent \
+ apertium-relatex \
+ apertium-remediawiki \
+ apertium-reodt \
+ apertium-repptx \
+ apertium-rertf \
+ apertium-retxt \
+ apertium-rewxml \
+ apertium-rexlsx \
+ apertium-rexpresstag \
+ apertium-tagger \
+ apertium-tagger-apply-new-rules \
+ apertium-tagger-readwords \
+ apertium-tmxbuild \
+ apertium-transfer
bin_SCRIPTS = $(GENERATEDSCRIPTS)
@@ -81,8 +218,10 @@ apertiumlib = $(prefix)/lib
apertiumsysconf = $(prefix)/etc/apertium
apertium_DATA = deformat.xsl reformat.xsl new2old.xsl lexchoice.xsl \
- lexchoicebil.xsl tagger.dtd interchunk.dtd format.dtd \
- transfer.dtd postchunk.dtd modes.dtd modes2bash.xsl \
+ lexchoicebil.xsl \
+ tagger.dtd interchunk.dtd format.dtd transfer.dtd postchunk.dtd modes.dtd \
+ tagger.rnc interchunk.rnc format.rnc transfer.rnc postchunk.rnc modes.rnc \
+ modes2bash.xsl modes2debugmodes.xsl \
apertium-createmodes.awk
apertium_pretransfer_SOURCES = apertium_pretransfer.cc
@@ -372,16 +511,12 @@ apertium-gen-reformat: Makefile.am gen-header.sh
@echo "rm /tmp/\$$\$$.reformat.l /tmp/\$$\$$.lex.cc" >> $@
@chmod a+x $@
-apertium-gen-modes: Makefile.am modes-header.sh
- @echo "Creating apertium-gen-modes script"
+apertium-gen-modes: apertium-gen-modes.in
@echo "#!$(BASH)" > $@
- @echo "APERTIUMDIR="$(apertiumdir) >> $@
- @cat modes-header.sh >> $@
- @echo "$(XMLLINT) --dtdvalid $(apertiumdir)/modes.dtd --noout \$$FILE1 && \\" >> $@
- @if [ `basename $(XSLTPROC)` == xsltproc ]; \
- then echo "$(XSLTPROC) --stringparam prefix $(prefix)/bin --stringparam dataprefix \$$FULLDIRNAME $(apertiumdir)/modes2bash.xsl \$$FILE1 | awk -f $(apertiumdir)/apertium-createmodes.awk PARAM=\$$FULLDIRNAME"; \
- else echo "$(XSLTPROC) $(apertiumdir)/modes2bash.xsl \$$FILE1 \\\$$prefix=$(prefix)/bin \\\$$dataprefix=\$$FULLDIRNAME| awk -f $(apertiumdir)/apertium-createmodes.awk PARAM=\$$FULLDIRNAME"; \
- fi >> $@
+ @echo "APERTIUMDIR=$(apertiumdir)" >> $@
+ @echo "XMLLINT=$(XMLLINT)" >> $@
+ @echo "XSLTPROC=$(XSLTPROC)" >> $@
+ cat $< >> $@
@chmod a+x $@
apertium-utils-fixlatex: Makefile.am utils-fixlatex-header.sh
@@ -478,10 +613,8 @@ man_MANS=apertium.1 apertium-deshtml.1 apertium-desrtf.1 apertium-destxt.1 \
apertium-desxlsx.1 apertium-rexlsx.1 \
apertium-despptx.1 apertium-repptx.1 \
apertium-desmediawiki.1 apertium-remediawiki.1 \
- apertium-filter-ambiguity.1 apertium-gen-deformat.1 apertium-gen-lextorbil.1 \
- apertium-gen-lextormono.1 apertium-gen-reformat.1 apertium-gen-stopwords-lextor.1 \
- apertium-gen-wlist-lextor.1 apertium-gen-wlist-lextor-translation.1 \
- apertium-lextor.1 apertium-preprocess-corpus-lextor.1 \
+ apertium-filter-ambiguity.1 apertium-gen-deformat.1 \
+ apertium-gen-reformat.1 \
apertium-preprocess-transfer.1 apertium-pretransfer.1 apertium-rehtml.1 \
apertium-rertf.1 apertium-retxt.1 apertium-tagger.1 apertium-transfer.1 \
apertium-validate-dictionary.1 apertium-validate-tagger.1 \
@@ -490,11 +623,16 @@ man_MANS=apertium.1 apertium-deshtml.1 apertium-desrtf.1 apertium-destxt.1 \
apertium-validate-postchunk.1 apertium-validate-modes.1 apertium-tagger-apply-new-rules.1 \
apertium-validate-acx.1 apertium-multiple-translations.1 \
apertium-unformat.1
- ###apertium-lextor-eval.1
+#DEPR.:
+# apertium-lextor-eval.1
+# apertium-gen-lextorbil.1
+# apertium-gen-lextormono.1 apertium-gen-stopwords-lextor.1
+# apertium-gen-wlist-lextor.1 apertium-gen-wlist-lextor-translation.1
+# apertium-lextor.1 apertium-preprocess-corpus-lextor.1
EXTRA_DIST = gen-header.sh deformat-header.sh \
reformat.xsl deformat.xsl new2old.xsl lexchoice.xsl lexchoicebil.xsl \
- tagger.dtd transfer.dtd format.dtd txt-format.xml \
+ txt-format.xml \
html-format.xml odt-format.xml rtf-format.xml wxml-format.xml latex-format.xml\
html-noent-format.xml \
xlsx-format.xml pptx-format.xml mediawiki-format.xml trans-header.sh \
@@ -502,7 +640,12 @@ EXTRA_DIST = gen-header.sh deformat-header.sh \
apertium-header.sh apertium-unformat-header.sh $(man_MANS) \
xpresstag-format.xml \
validate-header.sh transformdic-header.sh transformdicbil-header.sh \
- gen-wlist-lextor-header.sh gen-stopwords-lextor.sh preprocess-corpus-lextor.sh \
- interchunk.dtd postchunk.dtd modes.dtd \
+ tagger.dtd interchunk.dtd format.dtd transfer.dtd postchunk.dtd modes.dtd \
+ tagger.rnc interchunk.rnc format.rnc transfer.rnc postchunk.rnc modes.rnc \
utils-fixlatex-header.sh \
- modes-header.sh apertium-createmodes.awk modes2bash.xsl ###trans-lextor-header.sh
+ apertium-gen-modes.in apertium-createmodes.awk modes2bash.xsl modes2debugmodes.xsl
+#DEPR.:
+# trans-lextor-header.sh
+# gen-wlist-lextor-header.sh
+# gen-stopwords-lextor.sh
+# preprocess-corpus-lextor.sh
diff --git a/apertium/a.cc b/apertium/a.cc
new file mode 100644
index 0000000..e97e0b0
--- /dev/null
+++ b/apertium/a.cc
@@ -0,0 +1,50 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "a.h"
+
+#include "analysis.h"
+#include "exception.h"
+
+namespace Apertium {
+bool operator==(const a &a_, const a &b_) {
+ return a_.TheTags == b_.TheTags && a_.TheMorphemes == b_.TheMorphemes;
+}
+
+bool operator<(const a &a_, const a &b_) {
+ if (a_.TheTags == b_.TheTags)
+ return a_.TheMorphemes < b_.TheMorphemes;
+
+ return a_.TheTags < b_.TheTags;
+}
+
+a::a() : TheTags(), TheMorphemes() {}
+
+a::a(const Analysis &Analysis_) : TheTags(), TheMorphemes() {
+ if (Analysis_.TheMorphemes.empty())
+ throw Exception::Analysis::TheMorphemes_empty("can't convert const "
+ "Analysis & comprising empty "
+ "Morpheme std::vector to a");
+
+ if (Analysis_.TheMorphemes.front().TheTags.empty())
+ throw Exception::Morpheme::TheTags_empty("can't convert const Analysis & "
+ "comprising Morpheme comprising "
+ "empty Tag std::vector to a");
+
+ TheTags = Analysis_.TheMorphemes.front().TheTags;
+ TheMorphemes = std::vector<Morpheme>(Analysis_.TheMorphemes.begin() + 1,
+ Analysis_.TheMorphemes.end());
+}
+}
diff --git a/apertium/a.h b/apertium/a.h
new file mode 100644
index 0000000..e38b429
--- /dev/null
+++ b/apertium/a.h
@@ -0,0 +1,37 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef A_H
+#define A_H
+
+#include "analysis.h"
+#include "morpheme.h"
+#include "tag.h"
+
+#include <vector>
+
+namespace Apertium {
+class a {
+public:
+ friend bool operator==(const a &a_, const a &b_);
+ friend bool operator<(const a &a_, const a &b_);
+ a();
+ a(const Analysis &Analysis_);
+ std::vector<Tag> TheTags;
+ std::vector<Morpheme> TheMorphemes;
+};
+}
+
+#endif // A_H
diff --git a/apertium/align.cc b/apertium/align.cc
new file mode 100644
index 0000000..4d82680
--- /dev/null
+++ b/apertium/align.cc
@@ -0,0 +1,56 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "align.h"
+
+#include "linebreak.h"
+
+#include <iomanip>
+#include <ios>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace Apertium {
+void align::align_(
+ const std::vector<std::pair<std::string, std::string> > &string_) {
+ const std::streamsize width_ = col(string_) + 2;
+
+ for (std::vector<std::pair<std::string, std::string> >::const_iterator i_ =
+ string_.begin();
+ i_ != string_.end(); ++i_) {
+ std::cerr << " " << std::setw(width_) << std::left << i_->first
+ << std::setw(0)
+ << linebreak::linebreak_(i_->second, width_ + 2, width_ + 4)
+ << '\n';
+ }
+}
+
+std::string::size_type
+align::col(const std::vector<std::pair<std::string, std::string> > &string_) {
+ std::string::size_type col_ = 0;
+
+ for (std::vector<std::pair<std::string, std::string> >::const_iterator i_ =
+ string_.begin();
+ i_ != string_.end(); ++i_) {
+ if (i_->first.size() > col_)
+ col_ = i_->first.size();
+ }
+
+ return col_;
+}
+}
diff --git a/apertium/align.h b/apertium/align.h
new file mode 100644
index 0000000..0e314b6
--- /dev/null
+++ b/apertium/align.h
@@ -0,0 +1,35 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef ALIGN_H
+#define ALIGN_H
+
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace Apertium {
+class align {
+public:
+ static void
+ align_(const std::vector<std::pair<std::string, std::string> > &string_);
+
+private:
+ static std::string::size_type
+ col(const std::vector<std::pair<std::string, std::string> > &string_);
+};
+}
+
+#endif // ALIGN_H
diff --git a/apertium/analysis.cc b/apertium/analysis.cc
new file mode 100644
index 0000000..61cb55e
--- /dev/null
+++ b/apertium/analysis.cc
@@ -0,0 +1,55 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "analysis.h"
+
+#include "exception.h"
+#include "morpheme.h"
+
+#include <string>
+#include <vector>
+
+namespace Apertium {
+std::wostream &operator<<(std::wostream &Stream_, const Analysis &Analysis_) {
+ Stream_ << static_cast<std::wstring>(Analysis_);
+ return Stream_;
+}
+
+bool operator==(const Analysis &a, const Analysis &b) {
+ return a.TheMorphemes == b.TheMorphemes;
+}
+
+bool operator<(const Analysis &a, const Analysis &b) {
+ return a.TheMorphemes < b.TheMorphemes;
+}
+
+Analysis::operator std::wstring() const {
+ if (TheMorphemes.empty())
+ throw Exception::Analysis::TheMorphemes_empty(
+ "can't convert Analysis comprising empty Morpheme std::vector to "
+ "std::wstring");
+
+ std::vector<Morpheme>::const_iterator Morpheme_ = TheMorphemes.begin();
+ std::wstring wstring_ = *Morpheme_;
+ ++Morpheme_;
+
+ // Call .end() each iteration to save memory.
+ for (; Morpheme_ != TheMorphemes.end(); ++Morpheme_) {
+ wstring_ += L"+" + static_cast<std::wstring>(*Morpheme_);
+ }
+
+ return wstring_;
+}
+}
diff --git a/apertium/analysis.h b/apertium/analysis.h
new file mode 100644
index 0000000..8f57893
--- /dev/null
+++ b/apertium/analysis.h
@@ -0,0 +1,37 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef ANALYSIS_H
+#define ANALYSIS_H
+
+#include "morpheme.h"
+
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace Apertium {
+class Analysis {
+public:
+ friend std::wostream &operator<<(std::wostream &Stream_,
+ const Analysis &Analysis_);
+ friend bool operator==(const Analysis &a, const Analysis &b);
+ friend bool operator<(const Analysis &a, const Analysis &b);
+ operator std::wstring() const;
+ std::vector<Morpheme> TheMorphemes;
+};
+}
+
+#endif // ANALYSIS_H
diff --git a/apertium/apertium-createmodes.awk b/apertium/apertium-createmodes.awk
index aa944f0..a18b3b5 100755
--- a/apertium/apertium-createmodes.awk
+++ b/apertium/apertium-createmodes.awk
@@ -1,20 +1,25 @@
+#!/usr/bin/awk -f
-{
- if($0 ~ /----.+----/)
- {
- HEAD = substr($0, 5, length($0)-8);
- split(HEAD, ARR, ":");
- NAME = substr(ARR[1], 5, length(ARR[1]));
+# Parse output from modes2bash.xsl
+
+BEGIN {
+ FS="^ *# *"
+}
+
+NF==2 && /\.mode$/ {
+ filename = $2
+ if(seen[filename]) {
+ print "apertium-createmodes.awk: "filename" seen twice" > "/dev/stderr"
+ filename = 0
}
- else if(HEAD != 0)
- {
- myfilename = NAME ".mode";
- if(ARR[3] == "yes")
- {
- myfilename = "../" myfilename;
- }
- # fool code because a bug in mawk
- printf $0 "\n" >> myfilename;
- close(myfilename);
+ else {
+ print "" > filename
+ seen[filename] = 1
}
+ next
+}
+
+filename {
+ print $0 >> filename
+ close(filename)
}
diff --git a/apertium/apertium-desmediawiki.1 b/apertium/apertium-desmediawiki.1
index ee7b380..7954471 100644
--- a/apertium/apertium-desmediawiki.1
+++ b/apertium/apertium-desmediawiki.1
@@ -1,4 +1,4 @@
-.TH apertium-deswikimedia 1 2009-08-30 "" ""
+.TH apertium-desmediawiki 1 2009-08-30 "" ""
.SH NAME
apertium-desmediawiki \- This application is part of (
.B apertium
@@ -11,7 +11,7 @@ This tool is part of the apertium open-source machine translation toolbox: \fBht
.PP
.SH DESCRIPTION
.BR apertium-desmediawiki
-is a processor for wikimedia XML dumps (i.e., those produced using
+is a processor for mediawiki XML dumps (i.e., those produced using
Special:Export. Data should be passed through this
processor before being piped to lt-proc. The program takes input
in the form of a text file and produces output suitable for
@@ -37,7 +37,7 @@ echo "gener" | apertium-destxt | lt-proc ca-es.automorf.bin
Complicated links - [[page|alternative text]], [[link]]s, etc. are not
supported.
.PP
-The wikimedia parser has special support for mixing apostrophes and
+The mediawiki parser has special support for mixing apostrophes and
apostrophes as formatting. This is not supported either.
.SH AUTHOR
Copyright (c) 2005, 2006 Universitat d'Alacant / Universidad de Alicante.
diff --git a/apertium/apertium-gen-modes.in b/apertium/apertium-gen-modes.in
new file mode 100644
index 0000000..1358a00
--- /dev/null
+++ b/apertium/apertium-gen-modes.in
@@ -0,0 +1,85 @@
+#!/bin/bash
+# Makefile.am prepends APERTIUMDIR, XMLLINT, XSLTPROC and the right shebang
+
+show_help () {
+ cat <<EOF
+USAGE: $(basename "$0") modes.xml
+ $(basename "$0") modes.xml BASENAME
+ $(basename "$0") -f modes.xml INSTALLDIR
+
+Creates all modes under the 'modes/' subdirectory of the directory of
+modes.xml, and further creates copies of installable modes in the same
+directory as modes.xml.
+
+If only modes.xml is given, all files refer only to datafiles under
+the same directory as modes.xml.
+
+If only modes.xml and BASENAME are given, installable modes will refer
+to datafiles in ${APERTIUMDIR}/\${BASENAME}.
+
+If -f is given, the second non-option argument INSTALLDIR is the full
+path to where installed data files for installable modes are.
+
+
+If a mode has attribute gendebug="yes", the script will also
+auto-generate debug modes (e.g. -morph, -tagger, -chunker).
+
+Use option -v to show the actual commands this script runs.
+EOF
+ exit 1
+}
+
+verbose=false
+fullpath=false
+OPTIND=1
+while getopts "hHfv" opt; do
+ case "$opt" in
+ h|H)
+ (show_help) # run in a subshell: show_help itself ends with 'exit 1'
+ exit 0
+ ;;
+ v) verbose=true
+ ;;
+ f) fullpath=true
+ ;;
+ '?')
+ show_help >&2
+ exit 1
+ ;;
+ esac
+done
+shift $((OPTIND-1))
+
+xmlfile="$1"
+if [[ ! -e "${xmlfile}" ]]; then
+ echo "ERROR: '${xmlfile}' file not found" >&2
+ exit 1
+fi
+xmldir=$(cd "$(dirname "${xmlfile}")"; pwd)
+
+case $# in
+ 1) installdir="${xmldir}";;
+ 2) if ${fullpath}; then
+ installdir="$2"
+ else
+ installdir="${APERTIUMDIR}/$2"
+ fi
+ ;;
+ *) show_help >&2
+ exit 1
+ ;;
+esac
+
+$verbose && set -x
+set -o pipefail # introduced in bash 3; available in OSX>=10.5; should be safe
+
+[[ -d "${xmldir}"/modes ]] || mkdir "${xmldir}"/modes
+
+"${XMLLINT}" --dtdvalid "${APERTIUMDIR}"/modes.dtd --noout "${xmlfile}" || exit $?
+
+"${XSLTPROC}" "${APERTIUMDIR}"/modes2debugmodes.xsl "${xmlfile}" \
+ | "${XSLTPROC}" --stringparam devdir "${xmldir}" \
+ --stringparam installdir "${installdir}" \
+ "${APERTIUMDIR}"/modes2bash.xsl \
+ - \
+ | awk -f "${APERTIUMDIR}"/apertium-createmodes.awk
diff --git a/apertium/apertium-header.sh b/apertium/apertium-header.sh
index 91975c7..bfef765 100644
--- a/apertium/apertium-header.sh
+++ b/apertium/apertium-header.sh
@@ -13,9 +13,7 @@
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
-# 02111-1307, USA.
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
message ()
@@ -490,7 +488,7 @@ case "$FORMAT" in
else OPTION="-g";
fi
;;
- txt|rtf|html|xpresstag)
+ txt|rtf|html|xpresstag|mediawiki)
if [ "$UWORDS" = "no" ]; then OPTION="-n";
else OPTION="-g";
fi;
diff --git a/apertium/apertium-multiple-translations.cc b/apertium/apertium-multiple-translations.cc
index 2349064..2bd084c 100644
--- a/apertium/apertium-multiple-translations.cc
+++ b/apertium/apertium-multiple-translations.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/transfer_mult.h>
#include <lttoolbox/lt_locale.h>
diff --git a/apertium/apertium-prelatex.l b/apertium/apertium-prelatex.l
index 237541e..03a3d66 100644
--- a/apertium/apertium-prelatex.l
+++ b/apertium/apertium-prelatex.l
@@ -214,7 +214,7 @@ wstring convertir(string const &multibyte, int const length)
\\usepackage\[[^\]]*\] {
wstring ws = convertir(yytext+12,yyleng-13);
fputws((wstring(L"<usepackage/><PARAM>")+ws+wstring(L"</PARAM>")).c_str(), yyout);
- if(ws.find(L"ngerman") >= 0)
+ if(ws.find(L"ngerman") != wstring::npos)
ngermanbabel = true;
}
diff --git a/apertium/apertium-transfer.1 b/apertium/apertium-transfer.1
index 6d4692c..754da49 100644
--- a/apertium/apertium-transfer.1
+++ b/apertium/apertium-transfer.1
@@ -63,6 +63,9 @@ case-sensitiveness while accessing bilingual dictionary
.B -t
trace mode: show rule numbers and matched content
.PP
+.B -T
+extended trace mode, for use with apertium-transfer-tools
+.PP
.B -z
null-flushing output on
.PP
diff --git a/apertium/apertium-unformat.1 b/apertium/apertium-unformat.1
index 873f38b..68cf84b 100644
--- a/apertium/apertium-unformat.1
+++ b/apertium/apertium-unformat.1
@@ -8,7 +8,7 @@ This tool is part of the apertium machine translation
architecture: \fBhttp://apertium.sf.net\fR.
.SH SYNOPSIS
.B apertium-unformat
-[-f format] [infile [outfile]]
+[\-f format] [infile [outfile]]
.SH DESCRIPTION
.BR apertium
is the application that extract unformatted text from documents.
diff --git a/apertium/apertium.1 b/apertium/apertium.1
index b703450..fae0ff2 100644
--- a/apertium/apertium.1
+++ b/apertium/apertium.1
@@ -53,7 +53,7 @@ source text.
.SH OPTIONS
.PP
.B -d datadir
-The directory holding the linguistic data. By default it will used the
+The directory holding the linguistic data. By default it will use the
expected installation path.
.PP
.B language-pair
diff --git a/apertium/apertium_filter_ambiguity.cc b/apertium/apertium_filter_ambiguity.cc
index 167a5d7..915f764 100644
--- a/apertium/apertium_filter_ambiguity.cc
+++ b/apertium/apertium_filter_ambiguity.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/tsx_reader.h>
#include <lttoolbox/compression.h>
diff --git a/apertium/apertium_gen_wlist_lextor_translation.cc b/apertium/apertium_gen_wlist_lextor_translation.cc
index 49b2c81..d32d756 100644
--- a/apertium/apertium_gen_wlist_lextor_translation.cc
+++ b/apertium/apertium_gen_wlist_lextor_translation.cc
@@ -14,9 +14,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <iostream>
@@ -103,9 +101,7 @@ int main(int argc, char* argv[]) {
<<L" General Public License for more details.\n"
<<L"\n"
<<L" You should have received a copy of the GNU General Public License\n"
- <<L" along with this program; if not, write to the Free Software\n"
- <<L" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA\n"
- <<L" 02111-1307, USA.\n";
+ <<L" along with this program; if not, see <http://www.gnu.org/licenses/>.\n";
exit(EXIT_SUCCESS);
break;
default:
diff --git a/apertium/apertium_interchunk.cc b/apertium/apertium_interchunk.cc
index dd589fe..a358449 100644
--- a/apertium/apertium_interchunk.cc
+++ b/apertium/apertium_interchunk.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/interchunk.h>
#include <lttoolbox/lt_locale.h>
diff --git a/apertium/apertium_lextor.cc b/apertium/apertium_lextor.cc
index 9247ba9..bb40af4 100644
--- a/apertium/apertium_lextor.cc
+++ b/apertium/apertium_lextor.cc
@@ -14,9 +14,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <iostream>
@@ -217,9 +215,7 @@ int main(int argc, char* argv[]) {
<<L" General Public License for more details.\n"
<<L"\n"
<<L" You should have received a copy of the GNU General Public License\n"
- <<L" along with this program; if not, write to the Free Software\n"
- <<L" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA\n"
- <<L" 02111-1307, USA.\n";
+ <<L" along with this program; if not, see <http://www.gnu.org/licenses/>.\n";
exit(EXIT_SUCCESS);
break;
default:
diff --git a/apertium/apertium_lextor_eval.cc b/apertium/apertium_lextor_eval.cc
index 62a249b..f1e9dfa 100644
--- a/apertium/apertium_lextor_eval.cc
+++ b/apertium/apertium_lextor_eval.cc
@@ -14,9 +14,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <iostream>
@@ -165,9 +163,7 @@ int main(int argc, char* argv[]) {
<<" General Public License for more details.\n"
<<"\n"
<<" You should have received a copy of the GNU General Public License\n"
- <<" along with this program; if not, write to the Free Software\n"
- <<" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA\n"
- <<" 02111-1307, USA.\n";
+ <<" along with this program; if not, see <http://www.gnu.org/licenses/>.\n";
exit(EXIT_SUCCESS);
break;
default:
diff --git a/apertium/apertium_postchunk.cc b/apertium/apertium_postchunk.cc
index e3cc36f..f313305 100644
--- a/apertium/apertium_postchunk.cc
+++ b/apertium/apertium_postchunk.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/postchunk.h>
#include <lttoolbox/lt_locale.h>
diff --git a/apertium/apertium_pretransfer.cc b/apertium/apertium_pretransfer.cc
index c459673..b425ed7 100644
--- a/apertium/apertium_pretransfer.cc
+++ b/apertium/apertium_pretransfer.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <cstdio>
#include <cstdlib>
diff --git a/apertium/apertium_re.cc b/apertium/apertium_re.cc
index 7db9473..cd18a28 100644
--- a/apertium/apertium_re.cc
+++ b/apertium/apertium_re.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/apertium_re.h>
#include <lttoolbox/compression.h>
@@ -25,7 +23,8 @@
using namespace Apertium;
using namespace std;
-ApertiumRE::ApertiumRE()
+ApertiumRE::ApertiumRE() :
+re(0)
{
empty = true;
}
diff --git a/apertium/apertium_re.h b/apertium/apertium_re.h
index ee44fab..a8ed899 100644
--- a/apertium/apertium_re.h
+++ b/apertium/apertium_re.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _APERTIUM_RE_
diff --git a/apertium/apertium_tagger.cc b/apertium/apertium_tagger.cc
index 78ef8d0..be9bf4e 100644
--- a/apertium/apertium_tagger.cc
+++ b/apertium/apertium_tagger.cc
@@ -1,40 +1,1058 @@
-/*
- * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
- */
-
-#include <apertium/tagger.h>
-#include <apertium/tsx_reader.h>
-#include <lttoolbox/match_exe.h>
-#include <lttoolbox/match_state.h>
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "apertium_tagger.h"
+
+#include "apertium_config.h"
+
+#include "align.h"
+#include "basic_exception_type.h"
+#include "basic_stream_tagger.h"
+#include "basic_stream_tagger_trainer.h"
+#include "basic_tagger.h"
+#include "exception.h"
+#include "file_tagger.h"
+#include "err_exception.h"
+#include <apertium/hmm.cc>
+#include "linebreak.h"
+#include <apertium/lswpost.h>
+#include "stream_5_3_1_tagger.h"
+#include "stream_5_3_1_tagger_trainer.h"
+#include "stream_5_3_2_tagger.h"
+#include "stream_5_3_2_tagger_trainer.h"
+#include "stream_5_3_3_tagger.h"
+#include "stream_5_3_3_tagger_trainer.h"
+#include <apertium/tagger_word.h>
+
#include <lttoolbox/lt_locale.h>
-#include <iostream>
+#include <cerrno>
+#include <cstdio>
#include <cstdlib>
-#include <apertium/string_utils.h>
+#include <cstring>
+#include <fstream>
+#include <getopt.h>
+#include <iomanip>
+#include <ios>
+#include <iostream>
+#include <locale>
+#include <sstream>
+#include <string>
+#include <unistd.h>
+
+#ifdef _MSC_VER
+#include <fcntl.h>
+#include <io.h>
+#endif // _MSC_VER
+
+namespace Apertium {
+apertium_tagger::apertium_tagger(int &argc, char **&argv)
+ : argc(argc), argv(argv), The_val(),
+
+#if HAVE_GETOPT_LONG
+
+ The_indexptr(), FunctionTypeTypeOption_indexptr(),
+ FunctionTypeOption_indexptr(),
+
+#else
+
+ FunctionTypeTypeOption_val(), FunctionTypeOption_val(),
+
+#endif // HAVE_GETOPT_LONG
+
+ TheFunctionTypeType(), TheUnigramType(), TheFunctionType(),
+ TheFunctionTypeOptionArgument(0), TheFlags() {
+ try {
+ while (true) {
+ The_val =
+
+#if HAVE_GETOPT_LONG
+
+ getopt_long(argc, argv, "dfgmpr:s:t:u:wz", longopts, &The_indexptr);
+
+#else
+
+ getopt(argc, argv, "dfgmpr:s:t:u:wz");
+
+#endif // HAVE_GETOPT_LONG
+
+ if (The_val == -1)
+ break;
+
+#if HAVE_GETOPT_LONG
+
+ set_indexptr();
+
+#endif // HAVE_GETOPT_LONG
+
+ switch (The_val) {
+ case 'd':
+ flagOptionCase(&basic_Tagger::Flags::getDebug,
+ &basic_Tagger::Flags::setDebug);
+ break;
+ case 'f':
+ flagOptionCase(&basic_Tagger::Flags::getFirst,
+ &basic_Tagger::Flags::setFirst);
+ break;
+ case 'm':
+ flagOptionCase(&basic_Tagger::Flags::getMark,
+ &basic_Tagger::Flags::setMark);
+ break;
+ case 'p':
+ flagOptionCase(&basic_Tagger::Flags::getShowSuperficial,
+ &basic_Tagger::Flags::setShowSuperficial);
+ break;
+ case 'z':
+ flagOptionCase(&basic_Tagger::Flags::getNullFlush,
+ &basic_Tagger::Flags::setNullFlush);
+ break;
+ case 'u':
+ functionTypeTypeOptionCase(Unigram);
+
+ if (std::strcmp(optarg, "1") == 0) { // whole-string match: "10" is invalid
+ TheUnigramType = Stream_5_3_1;
+ break;
+ }
+
+ if (std::strcmp(optarg, "2") == 0) {
+ TheUnigramType = Stream_5_3_2;
+ break;
+ }
+
+ if (std::strcmp(optarg, "3") == 0) {
+ TheUnigramType = Stream_5_3_3;
+ break;
+ }
+
+ {
+ std::stringstream what_;
+ what_ << "invalid argument '" << optarg << "' for '--unigram'\n"
+"Valid arguments are:\n"
+" - '1'\n"
+" - '2'\n"
+" - '3'";
+ throw Exception::apertium_tagger::InvalidArgument(what_);
+ }
+ break;
+ case 'w':
+ functionTypeTypeOptionCase(SlidingWindow);
+ break;
+ case 'g':
+ functionTypeOptionCase(Tagger);
+ break;
+ case 'r':
+ functionTypeOptionCase(Retrain);
+ getIterationsArgument();
+ break;
+ case 's':
+ functionTypeOptionCase(Supervised);
+ getIterationsArgument();
+ break;
+ case 't':
+ functionTypeOptionCase(Train);
+ getIterationsArgument();
+ break;
+ case 'h':
+ help();
+ return;
+ default:
+ throw err_Exception();
+ }
+ }
+
+ if (!TheFunctionType) {
+ help();
+ return;
+ }
+
+ switch (*TheFunctionType) {
+ case Tagger:
+ if (!TheFunctionTypeType) {
+ HMM HiddenMarkovModelTagger_;
+ g_FILE_Tagger(HiddenMarkovModelTagger_);
+ break;
+ }
+
+ switch (*TheFunctionTypeType) {
+ case Unigram: {
+ switch (*TheUnigramType) {
+ case Stream_5_3_1: {
+ Stream_5_3_1_Tagger Stream_5_3_1_Tagger_(TheFlags);
+ g_StreamTagger(Stream_5_3_1_Tagger_);
+ } break;
+ case Stream_5_3_2: {
+ Stream_5_3_2_Tagger Stream_5_3_2_Tagger_(TheFlags);
+ g_StreamTagger(Stream_5_3_2_Tagger_);
+ } break;
+ case Stream_5_3_3: {
+ Stream_5_3_3_Tagger Stream_5_3_3_Tagger_(TheFlags);
+ g_StreamTagger(Stream_5_3_3_Tagger_);
+ } break;
+ default:
+ std::abort();
+ }
+ } break;
+ case SlidingWindow: {
+ LSWPoST SlidingWindowTagger_;
+ g_FILE_Tagger(SlidingWindowTagger_);
+ } break;
+ default:
+ std::abort();
+ }
-using namespace Apertium;
-using namespace std;
+ break;
+ case Retrain:
+ if (!TheFunctionTypeType) {
+ HMM HiddenMarkovModelTagger_;
+ r_FILE_Tagger(HiddenMarkovModelTagger_);
+ break;
+ }
+
+ switch (*TheFunctionTypeType) {
+ case Unigram: {
+ std::stringstream what_;
+ what_ << "invalid option -- 'u'";
+ throw Exception::apertium_tagger::InvalidOption(what_);
+ }
+ case SlidingWindow: {
+ LSWPoST SlidingWindowTagger_;
+ r_FILE_Tagger(SlidingWindowTagger_);
+ } break;
+ default:
+ std::abort();
+ }
+
+ break;
+ case Supervised:
+ if (!TheFunctionTypeType) {
+ HMM HiddenMarkovModelTagger_;
+ s_FILE_Tagger(HiddenMarkovModelTagger_);
+ break;
+ }
+
+ switch (*TheFunctionTypeType) {
+ case Unigram: {
+ switch (*TheUnigramType) {
+ case Stream_5_3_1: {
+ Stream_5_3_1_TaggerTrainer Stream_5_3_1_TaggerTrainer_(TheFlags);
+ s_StreamTaggerTrainer(Stream_5_3_1_TaggerTrainer_);
+ } break;
+ case Stream_5_3_2: {
+ Stream_5_3_2_TaggerTrainer Stream_5_3_2_TaggerTrainer_(TheFlags);
+ s_StreamTaggerTrainer(Stream_5_3_2_TaggerTrainer_);
+ } break;
+ case Stream_5_3_3: {
+ Stream_5_3_3_TaggerTrainer Stream_5_3_3_TaggerTrainer_(TheFlags);
+ s_StreamTaggerTrainer(Stream_5_3_3_TaggerTrainer_);
+ } break;
+ default:
+ std::abort();
+ }
+ } break;
+ case SlidingWindow: {
+ std::stringstream what_;
+ what_ << "invalid option -- 'w'";
+ throw Exception::apertium_tagger::InvalidOption(what_);
+ }
+ default:
+ std::abort();
+ }
+
+ break;
+ case Train:
+ if (!TheFunctionTypeType) {
+ HMM HiddenMarkovModelTagger_;
+ t_FILE_Tagger(HiddenMarkovModelTagger_);
+ break;
+ }
+
+ switch (*TheFunctionTypeType) {
+ case Unigram: {
+ std::stringstream what_;
+ what_ << "invalid option -- 'u'";
+ throw Exception::apertium_tagger::InvalidOption(what_);
+ }
+ case SlidingWindow: {
+ LSWPoST SlidingWindowTagger_;
+ t_FILE_Tagger(SlidingWindowTagger_);
+ } break;
+ default:
+ std::abort();
+ }
+
+ break;
+ default:
+ std::abort();
+ }
+ } catch (const basic_ExceptionType &basic_ExceptionType_) {
+ std::cerr << "apertium-tagger: " << basic_ExceptionType_.what() << '\n';
+ throw err_Exception();
+ }
+}
+
+// Writes the usage synopsis followed by the option table to std::cerr.
+//
+// NAME() and ARGUMENT() build the left-hand column of the option table: with
+// HAVE_GETOPT_LONG they expand to ", --long-name" and "=ARGUMENT"; without it
+// NAME() expands to nothing and ARGUMENT() to " ARGUMENT". Neither macro is
+// #undef'd here, so both stay defined for the rest of the translation unit.
+//
+// Each group of (option, description) pairs is handed to align::align_(),
+// which is defined elsewhere -- presumably it prints the pairs in aligned
+// columns; confirm against align.cc.
+void apertium_tagger::help() {
+
+#if HAVE_GETOPT_LONG
+#define NAME(NAME_) ", --" #NAME_
+#define ARGUMENT(ARGUMENT_) "=" #ARGUMENT_
+#else
+#define NAME(NAME_)
+#define ARGUMENT(ARGUMENT_) " " #ARGUMENT_
+#endif // HAVE_GETOPT_LONG
+
+ // One synopsis per mode: -g, -r, -s (file-based), -s 0 -u (unigram), -t.
+ std::cerr <<
+"Usage: apertium-tagger [OPTION]... -g SERIALISED_BASIC_TAGGER \\\n"
+" [INPUT \\\n"
+" [OUTPUT]]\n"
+"\n"
+" or: apertium-tagger [OPTION]... -r ITERATIONS \\\n"
+" CORPUS \\\n"
+" SERIALISED_BASIC_TAGGER\n"
+"\n"
+" or: apertium-tagger [OPTION]... -s ITERATIONS \\\n"
+" DICTIONARY \\\n"
+" CORPUS \\\n"
+" TAGGER_SPECIFICATION \\\n"
+" SERIALISED_BASIC_TAGGER \\\n"
+" TAGGED_CORPUS \\\n"
+" UNTAGGED_CORPUS\n"
+"\n"
+" or: apertium-tagger [OPTION]... -s 0 \\\n"
+" -u MODEL \\\n"
+" SERIALISED_BASIC_TAGGER \\\n"
+" TAGGED_CORPUS\n"
+"\n"
+" or: apertium-tagger [OPTION]... -t ITERATIONS \\\n"
+" DICTIONARY \\\n"
+" CORPUS \\\n"
+" TAGGER_SPECIFICATION \\\n"
+" SERIALISED_BASIC_TAGGER\n"
+"\n"
+"\n"
+"Mandatory arguments to long options are mandatory for short options too.\n"
+"\n";
+
+ // Flags that modify -g.
+ std::vector<std::pair<std::string, std::string> > options_description_;
+ options_description_.push_back(std::make_pair("-d" NAME(debug), "with -g, print error messages about the input"));
+ options_description_.push_back(std::make_pair("-f" NAME(first), "with -g, reorder each lexical unit's analyses so that the chosen one is first"));
+ options_description_.push_back(std::make_pair("-m" NAME(mark), "with -g, mark disambiguated lexical units"));
+ options_description_.push_back(std::make_pair("-p" NAME(show-superficial), "with -g, output each lexical unit's surface form"));
+ options_description_.push_back(std::make_pair("-z" NAME(null-flush), "with -g, flush the output after getting each null character"));
+ align::align_(options_description_);
+ std::cerr << '\n';
+ // Algorithm selection: unigram model.
+ options_description_.clear();
+ options_description_.push_back(std::make_pair("-u" NAME(unigram) ARGUMENT(MODEL), "use unigram algorithm MODEL from <http://coltekin.net/cagri/papers/trmorph-tools.pdf>"));
+ align::align_(options_description_);
+ std::cerr << '\n';
+ // Algorithm selection: Light Sliding Window.
+ options_description_.clear();
+ options_description_.push_back(std::make_pair("-w" NAME(sliding-window), "use the Light Sliding Window algorithm"));
+ align::align_(options_description_);
+ std::cerr << '\n';
+ // Mode: tagging.
+ options_description_.clear();
+ options_description_.push_back(std::make_pair("-g" NAME(tagger), "disambiguate the input"));
+ align::align_(options_description_);
+ std::cerr << '\n';
+ // Modes: the three training variants, each taking ITERATIONS.
+ options_description_.clear();
+ options_description_.push_back(std::make_pair("-r" NAME(retrain) ARGUMENT(ITERATIONS), "with -u: exit;\notherwise: retrain the tagger with ITERATIONS unsupervised iterations"));
+ options_description_.push_back(std::make_pair("-s" NAME(supervised) ARGUMENT(ITERATIONS), "with -u: train the tagger with a hand-tagged corpus;\nwith -w: exit;\notherwise: initialise the tagger with a hand-tagged corpus and retrain it with ITERATIONS unsupervised iterations"));
+ options_description_.push_back(std::make_pair("-t" NAME(train) ARGUMENT(ITERATIONS), "with -u: exit;\notherwise: train the tagger with ITERATIONS unsupervised iterations"));
+ align::align_(options_description_);
+ std::cerr << '\n';
+ // Help itself.
+ options_description_.clear();
+ options_description_.push_back(std::make_pair("-h" NAME(help), "display this help and exit"));
+ align::align_(options_description_);
+}
+
+#if HAVE_GETOPT_LONG
+
+// Spells the longopts entry at index indexptr_ as "--name".
+std::string apertium_tagger::option_string(const int &indexptr_) {
+  const struct option &Entry_ = longopts[indexptr_];
+  return option_string(Entry_);
+}
+
+// Spells option_ as "--" followed by its name.
+std::string apertium_tagger::option_string(const struct option &option_) {
+  std::string Spelling_("--");
+  Spelling_ += option_.name;
+  return Spelling_;
+}
+
+#else
+
+// Spells the short option whose character code is val_ as "-c".
+std::string apertium_tagger::option_string(const int &val_) {
+  std::string Spelling_(1, '-');
+  Spelling_ += static_cast<char>(val_);
+  return Spelling_;
+}
+
+#endif // HAVE_GETOPT_LONG
+
+// Selects the process locale, depending on the compiler and platform:
+//   - clang (any platform): install the environment locale, std::locale(""),
+//     as the global C++ locale;
+//   - other compilers on Apple: call LtLocale::tryToSetLocale() instead;
+//   - other compilers elsewhere: same as the clang case.
+void apertium_tagger::locale_global_() {
+
+#if defined __clang__
+
+ std::locale::global(std::locale(""));
+
+#else
+#if defined __APPLE__
-int main(int argc, char *argv[])
-{
LtLocale::tryToSetLocale();
- Tagger t;
- t.main(argc, argv);
- return EXIT_SUCCESS;
+#else
+
+ std::locale::global(std::locale(""));
+
+#endif // defined __APPLE__
+#endif // defined __clang__
+}
+
+#if HAVE_GETOPT_LONG
+
+// Long-option table (struct option from <getopt.h>), terminated by an
+// all-zero entry. Each entry's val is the matching short option character;
+// set_indexptr() searches the table by val and option_string() reads name.
+const struct option apertium_tagger::longopts[] = {
+    {"help", no_argument, 0, 'h'},
+    {"debug", no_argument, 0, 'd'},
+    {"first", no_argument, 0, 'f'},
+    {"mark", no_argument, 0, 'm'},
+    {"show-superficial", no_argument, 0, 'p'},
+    {"null-flush", no_argument, 0, 'z'},
+    // -u takes a MODEL argument (help() advertises "--unigram=MODEL"), so the
+    // long spelling has to accept one as well.
+    {"unigram", required_argument, 0, 'u'},
+    {"sliding-window", no_argument, 0, 'w'},
+    {"tagger", no_argument, 0, 'g'},
+    {"retrain", required_argument, 0, 'r'},
+    {"supervised", required_argument, 0, 's'},
+    {"train", required_argument, 0, 't'},
+    {0, 0, 0, 0}};
+
+#endif // HAVE_GETOPT_LONG
+
+#if HAVE_GETOPT_LONG
+
+// Points The_indexptr at the longopts entry whose val equals The_val.
+// Leaves The_indexptr alone when it already matches or when no entry matches.
+void apertium_tagger::set_indexptr() {
+  if (longopts[The_indexptr].val == The_val)
+    return;
+
+  int Candidate_ = 0;
+
+  // The table ends at the entry whose val is 0.
+  while (longopts[Candidate_].val != 0) {
+    if (longopts[Candidate_].val == The_val) {
+      The_indexptr = Candidate_;
+      return;
+    }
+
+    ++Candidate_;
+  }
+}
+
+#endif // HAVE_GETOPT_LONG
+
+// Switches on one boolean flag of TheFlags through its getter/setter pair.
+// Throws UnexpectedFlagOption when the getter says the flag is already on.
+void apertium_tagger::flagOptionCase(
+    bool (basic_Tagger::Flags::*GetFlag)() const,
+    void (basic_Tagger::Flags::*SetFlag)(const bool &)) {
+  const bool AlreadySet_ = (TheFlags.*GetFlag)();
+
+  if (!AlreadySet_) {
+    (TheFlags.*SetFlag)(true);
+    return;
+  }
+
+  // A repeated flag is reported as following itself.
+  std::stringstream what_;
+  what_ << "unexpected '" << option_string() << "' following '"
+        << option_string() << '\'';
+  throw Exception::apertium_tagger::UnexpectedFlagOption(what_);
+}
+
+// Spells the option currently being processed: looked up by longopts index
+// when HAVE_GETOPT_LONG is set, by option character otherwise.
+std::string apertium_tagger::option_string() {
+  return option_string(
+#if HAVE_GETOPT_LONG
+      The_indexptr
+#else
+      The_val
+#endif // HAVE_GETOPT_LONG
+      );
+}
+
+// Records FunctionTypeType_ in TheFunctionTypeType and remembers which option
+// selected it. Throws UnexpectedFunctionTypeTypeOption, naming the current
+// and the earlier option, when a selection was already recorded.
+void apertium_tagger::functionTypeTypeOptionCase(
+    const FunctionTypeType &FunctionTypeType_) {
+
+#if HAVE_GETOPT_LONG
+
+  Optional<int> &PreviousOption_ = FunctionTypeTypeOption_indexptr;
+  int &CurrentOption_ = The_indexptr;
+
+#else
+
+  Optional<int> &PreviousOption_ = FunctionTypeTypeOption_val;
+  int &CurrentOption_ = The_val;
+
+#endif // HAVE_GETOPT_LONG
+
+  if (PreviousOption_) {
+    std::stringstream what_;
+    what_ << "unexpected '" << option_string() << "' following '"
+          << option_string(*PreviousOption_) << '\'';
+    throw Exception::apertium_tagger::UnexpectedFunctionTypeTypeOption(what_);
+  }
+
+  TheFunctionTypeType = FunctionTypeType_;
+  PreviousOption_ = CurrentOption_;
+}
+
+// Records FunctionType_ in TheFunctionType and remembers which option
+// selected it. Throws UnexpectedFunctionTypeOption, naming the current and
+// the earlier option, when a selection was already recorded.
+void
+apertium_tagger::functionTypeOptionCase(const FunctionType &FunctionType_) {
+
+#if HAVE_GETOPT_LONG
+
+  Optional<int> &PreviousOption_ = FunctionTypeOption_indexptr;
+  int &CurrentOption_ = The_indexptr;
+
+#else
+
+  // "Optiona" is how the member is spelled in the class declaration.
+  Optional<int> &PreviousOption_ = FunctionTypeOptiona_val;
+  int &CurrentOption_ = The_val;
+
+#endif // HAVE_GETOPT_LONG
+
+  if (PreviousOption_) {
+    std::stringstream what_;
+    what_ << "unexpected '" << option_string() << "' following '"
+          << option_string(*PreviousOption_) << '\'';
+    throw Exception::apertium_tagger::UnexpectedFunctionTypeOption(what_);
+  }
+
+  TheFunctionType = FunctionType_;
+  PreviousOption_ = CurrentOption_;
+}
+
+// Parses optarg as the iteration count and stores it in
+// TheFunctionTypeOptionArgument. An ExceptionType raised by the conversion is
+// replaced by InvalidArgument naming the argument and the current option.
+void apertium_tagger::getIterationsArgument() {
+  try {
+    const unsigned long Iterations_ = optarg_unsigned_long();
+    TheFunctionTypeOptionArgument = Iterations_;
+  } catch (const ExceptionType &) {
+    std::stringstream what_;
+    what_ << "invalid argument '" << optarg << "' for '" << option_string()
+          << '\'';
+    throw Exception::apertium_tagger::InvalidArgument(what_);
+  }
+}
+
+// Converts optarg to unsigned long, base 10.
+//
+// Throws (all in Exception::apertium_tagger):
+//   str_end_not_eq_NULL  when anything is left over after the number,
+//   optarg_eq_NULL       when optarg is the empty string,
+//   ERANGE_              when the value is negative or too large.
+unsigned long apertium_tagger::optarg_unsigned_long() const {
+  char *str_end;
+  errno = 0;
+  unsigned long N_0 = std::strtoul(optarg, &str_end, 10);
+
+  if (*str_end != '\0') {
+    std::stringstream what_;
+    what_ << "can't convert char *optarg \"" << optarg << "\" to unsigned long";
+    throw Exception::apertium_tagger::str_end_not_eq_NULL(what_);
+  }
+
+  if (*optarg == '\0') {
+    std::stringstream what_;
+    what_ << "can't convert char *optarg of size 1 \"\" to unsigned long";
+    throw Exception::apertium_tagger::optarg_eq_NULL(what_);
+  }
+
+  // std::strtoul accepts a leading '-' and returns the negated value wrapped
+  // into unsigned long without setting errno, so "-5" would otherwise become
+  // a huge iteration count. The whole string was consumed as one number
+  // (checked above), so a '-' anywhere in it can only be that sign.
+  bool Negative_ = false;
+
+  for (const char *Character_ = optarg; *Character_ != '\0'; ++Character_) {
+    if (*Character_ == '-') {
+      Negative_ = true;
+      break;
+    }
+  }
+
+  if (Negative_ || errno == ERANGE) {
+    std::stringstream what_;
+    what_ << "can't convert char *optarg \"" << optarg
+          << "\" to unsigned long, not in unsigned long range";
+    throw Exception::apertium_tagger::ERANGE_(what_);
+  }
+
+  return N_0;
+}
+
+// Tagging with a stream tagger.
+//
+// File arguments after the options: SERIALISED_BASIC_TAGGER [INPUT [OUTPUT]].
+// The model is deserialised from the first file; input is wrapped in a Stream
+// built from TheFlags alone when INPUT is absent, and output goes to
+// std::wcout when OUTPUT is absent.
+//
+// Throws UnexpectedFileArgumentCount, ifstream_fail, deserialise,
+// wifstream_fail or wofstream_fail (all in Exception::apertium_tagger).
+void apertium_tagger::g_StreamTagger(basic_StreamTagger &StreamTagger_) {
+  locale_global_();
+
+  const int FileArgumentCount_ = argc - optind;
+
+  if (FileArgumentCount_ < 1 || FileArgumentCount_ >= 4) {
+    std::stringstream what_;
+    what_ << "expected 1, 2, or 3 file arguments, got " << FileArgumentCount_;
+    throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_);
+  }
+
+  std::ifstream SerialisedAnalysisFrequencies(argv[optind]);
+
+  if (SerialisedAnalysisFrequencies.fail()) {
+    std::stringstream what_;
+    what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind]
+          << "\"";
+    throw Exception::apertium_tagger::ifstream_fail(what_);
+  }
+
+  try {
+    StreamTagger_.deserialise(SerialisedAnalysisFrequencies);
+  } catch (const basic_ExceptionType &) {
+    // Whatever went wrong is reported uniformly as a deserialise failure.
+    std::stringstream what_;
+    what_ << "can't deserialise SERIALISED_BASIC_TAGGER file \"" << argv[optind]
+          << "\"";
+    throw Exception::apertium_tagger::deserialise(what_);
+  }
+
+  // Only the model was given.
+  if (FileArgumentCount_ == 1) {
+    Stream Input(TheFlags);
+    StreamTagger_.tag(Input, std::wcout);
+    return;
+  }
+
+  std::wifstream Input_stream(argv[optind + 1]);
+
+  if (Input_stream.fail()) {
+    std::stringstream what_;
+    what_ << "can't open INPUT file \"" << argv[optind + 1] << "\"";
+    throw Exception::apertium_tagger::wifstream_fail(what_);
+  }
+
+  // Model and INPUT were given: write to std::wcout.
+  if (FileArgumentCount_ == 2) {
+    Stream Input(TheFlags, Input_stream, argv[optind + 1]);
+    StreamTagger_.tag(Input, std::wcout);
+    return;
+  }
+
+  std::wofstream Output_stream(argv[optind + 2]);
+
+  if (Output_stream.fail()) {
+    std::stringstream what_;
+    what_ << "can't open OUTPUT file \"" << argv[optind + 2] << "\"";
+    throw Exception::apertium_tagger::wofstream_fail(what_);
+  }
+
+  Stream Input(TheFlags, Input_stream, argv[optind + 1]);
+  StreamTagger_.tag(Input, Output_stream);
+}
+
+// Supervised training of a stream tagger.
+//
+// File arguments after the options: SERIALISED_BASIC_TAGGER TAGGED_CORPUS.
+// The corpus is read and trained on first; the trained model is then written
+// to SERIALISED_BASIC_TAGGER.
+//
+// Throws InvalidArgument unless the iteration count is 0,
+// UnexpectedFileArgumentCount unless exactly 2 file arguments remain, and
+// wifstream_fail / ofstream_fail when a file cannot be opened (all in
+// Exception::apertium_tagger).
+void apertium_tagger::s_StreamTaggerTrainer(
+    basic_StreamTaggerTrainer &StreamTaggerTrainer_) {
+  locale_global_();
+
+  if (TheFunctionTypeOptionArgument != 0) {
+    std::stringstream what_;
+    what_ << "invalid argument '" << TheFunctionTypeOptionArgument
+          << "' for '--supervised'";
+    throw Exception::apertium_tagger::InvalidArgument(what_);
+  }
+
+  if (argc - optind < 2 || !(argc - optind < 3)) {
+    std::stringstream what_;
+    what_ << "expected 2 file arguments, got " << argc - optind;
+    throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_);
+  }
+
+  std::wifstream TaggedCorpus_stream(argv[optind + 1]);
+
+  if (TaggedCorpus_stream.fail()) {
+    std::stringstream what_;
+    what_ << "can't open TAGGED_CORPUS file \"" << argv[optind + 1] << "\"";
+    throw Exception::apertium_tagger::wifstream_fail(what_);
+  }
+
+  // Name the Stream after the file it actually reads (TAGGED_CORPUS), as
+  // g_StreamTagger() does for INPUT; the model file name was passed here
+  // before. NOTE(review): presumably Stream uses the name in its
+  // diagnostics -- confirm against stream.h.
+  Stream TaggedCorpus(TheFlags, TaggedCorpus_stream, argv[optind + 1]);
+  StreamTaggerTrainer_.train(TaggedCorpus);
+
+  std::ofstream Serialised_basic_Tagger(argv[optind]);
+
+  if (Serialised_basic_Tagger.fail()) {
+    std::stringstream what_;
+    what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind]
+          << "\"";
+    throw Exception::apertium_tagger::ofstream_fail(what_);
+  }
+
+  StreamTaggerTrainer_.serialise(Serialised_basic_Tagger);
+}
+
+// Tagging with a FILE*-based tagger.
+//
+// File arguments after the options: SERIALISED_BASIC_TAGGER [INPUT [OUTPUT]].
+// The model is deserialised from the first file, the -d/-m/-p/-z flags are
+// applied, and the input is tagged; stdin and stdout stand in for a missing
+// INPUT and OUTPUT.
+//
+// Throws UnexpectedFileArgumentCount, fopen or fclose (all in
+// Exception::apertium_tagger).
+void apertium_tagger::g_FILE_Tagger(FILE_Tagger &FILE_Tagger_) {
+  LtLocale::tryToSetLocale();
+
+  if (argc - optind < 1 || !(argc - optind < 4)) {
+    std::stringstream what_;
+    what_ << "expected 1, 2, or 3 file arguments, got " << argc - optind;
+    throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_);
+  }
+
+  FILE *Serialised_FILE_Tagger = std::fopen(argv[optind], "rb");
+
+  if (Serialised_FILE_Tagger == NULL) {
+    std::stringstream what_;
+    what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind]
+          << "\" for reading in binary mode";
+    throw Exception::apertium_tagger::fopen(what_);
+  }
+
+  FILE_Tagger_.deserialise(Serialised_FILE_Tagger);
+
+  if (std::fclose(Serialised_FILE_Tagger) != 0) {
+    std::stringstream what_;
+    what_ << "can't close SERIALISED_BASIC_TAGGER file \"" << argv[optind]
+          << "\"";
+    throw Exception::apertium_tagger::fclose(what_);
+  }
+
+  FILE_Tagger_.set_debug(TheFlags.getDebug());
+  TaggerWord::generate_marks = TheFlags.getMark();
+  FILE_Tagger_.set_show_sf(TheFlags.getShowSuperficial());
+  FILE_Tagger_.setNullFlush(TheFlags.getNullFlush());
+
+  if (argc - optind < 2)
+    FILE_Tagger_.tagger(stdin, stdout, TheFlags.getFirst());
+  else {
+    FILE *Input = std::fopen(argv[optind + 1], "r");
+
+    if (Input == NULL) {
+      std::stringstream what_;
+      what_ << "can't open INPUT file \"" << argv[optind + 1]
+            << "\" for reading";
+      throw Exception::apertium_tagger::fopen(what_);
+    }
+
+#ifdef _MSC_VER
+
+    _setmode(_fileno(Input), _O_U8TEXT);
+
+#endif // _MSC_VER
+
+    if (argc - optind < 3)
+      FILE_Tagger_.tagger(Input, stdout, TheFlags.getFirst());
+    else {
+      FILE *Output = std::fopen(argv[optind + 2], "w");
+
+      if (Output == NULL) {
+        // INPUT is already open; release it before bailing out.
+        std::fclose(Input);
+        std::stringstream what_;
+        what_ << "can't open OUTPUT file \"" << argv[optind + 2]
+              << "\" for writing";
+        throw Exception::apertium_tagger::fopen(what_);
+      }
+
+#ifdef _MSC_VER
+
+      _setmode(_fileno(Output), _O_U8TEXT);
+
+#endif // _MSC_VER
+
+      FILE_Tagger_.tagger(Input, Output, TheFlags.getFirst());
+
+      if (std::fclose(Output) != 0) {
+        // Same here: don't leave INPUT open when reporting the failure.
+        std::fclose(Input);
+        std::stringstream what_;
+        what_ << "can't close OUTPUT file \"" << argv[optind + 2] << "\"";
+        throw Exception::apertium_tagger::fclose(what_);
+      }
+    }
+
+    if (std::fclose(Input) != 0) {
+      std::stringstream what_;
+      what_ << "can't close INPUT file \"" << argv[optind + 1] << "\"";
+      throw Exception::apertium_tagger::fclose(what_);
+    }
+  }
+}
+
+// Retraining with a FILE*-based tagger.
+//
+// File arguments after the options: CORPUS SERIALISED_BASIC_TAGGER.
+// The existing model is read from SERIALISED_BASIC_TAGGER, trained on CORPUS
+// for TheFunctionTypeOptionArgument iterations, and written back to the same
+// SERIALISED_BASIC_TAGGER path.
+//
+// Throws UnexpectedFileArgumentCount, fopen or fclose (all in
+// Exception::apertium_tagger).
+void apertium_tagger::r_FILE_Tagger(FILE_Tagger &FILE_Tagger_) {
+ LtLocale::tryToSetLocale();
+
+ if (argc - optind < 2 || !(argc - optind < 3)) {
+ std::stringstream what_;
+ what_ << "expected 2 file arguments, got " << argc - optind;
+ throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_);
+ }
+
+ // Load the model that is going to be retrained.
+ FILE *Serialised_FILE_Tagger = std::fopen(argv[optind + 1], "rb");
+
+ if (Serialised_FILE_Tagger == NULL) {
+ std::stringstream what_;
+ what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind + 1]
+ << "\" for reading in binary mode";
+ throw Exception::apertium_tagger::fopen(what_);
+ }
+
+ FILE_Tagger_.deserialise(Serialised_FILE_Tagger);
+
+ if (std::fclose(Serialised_FILE_Tagger) != 0) {
+ std::stringstream what_;
+ what_ << "can't close SERIALISED_BASIC_TAGGER file \"" << argv[optind + 1]
+ << "\"";
+ throw Exception::apertium_tagger::fclose(what_);
+ }
+
+ FILE_Tagger_.set_debug(TheFlags.getDebug());
+ TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags());
+
+ FILE *Corpus = std::fopen(argv[optind], "r");
+
+ if (Corpus == NULL) {
+ std::stringstream what_;
+ what_ << "can't open CORPUS file \"" << argv[optind] << "\" for reading";
+ throw Exception::apertium_tagger::fopen(what_);
+ }
+
+#ifdef _MSC_VER
+
+ _setmode(_fileno(Corpus), _O_U8TEXT);
+
+#endif // _MSC_VER
+
+ FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument);
+
+ if (std::fclose(Corpus) != 0) {
+ std::stringstream what_;
+ what_ << "can't close CORPUS file \"" << argv[optind] << "\"";
+ throw Exception::apertium_tagger::fclose(what_);
+ }
+
+ // Reopen the model file, this time for writing, and store the result.
+ Serialised_FILE_Tagger = std::fopen(argv[optind + 1], "wb");
+
+ if (Serialised_FILE_Tagger == NULL) {
+ std::stringstream what_;
+ what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind + 1]
+ << "\" for writing in binary mode";
+ throw Exception::apertium_tagger::fopen(what_);
+ }
+
+ FILE_Tagger_.serialise(Serialised_FILE_Tagger);
+
+ if (std::fclose(Serialised_FILE_Tagger) != 0) {
+ std::stringstream what_;
+ what_ << "can't close SERIALISED_BASIC_TAGGER file \"" << argv[optind + 1]
+ << "\"";
+ throw Exception::apertium_tagger::fclose(what_);
+ }
+}
+
+// Supervised training with a FILE*-based tagger.
+//
+// File arguments after the options, in order:
+//   DICTIONARY CORPUS TAGGER_SPECIFICATION SERIALISED_BASIC_TAGGER
+//   TAGGED_CORPUS UNTAGGED_CORPUS
+// Steps: deserialise(TAGGER_SPECIFICATION path), read DICTIONARY, initialise
+// probabilities from TAGGED_CORPUS and UNTAGGED_CORPUS, train on CORPUS for
+// TheFunctionTypeOptionArgument iterations, then serialise to
+// SERIALISED_BASIC_TAGGER.
+//
+// Throws UnexpectedFileArgumentCount, fopen or fclose (all in
+// Exception::apertium_tagger).
+void apertium_tagger::s_FILE_Tagger(FILE_Tagger &FILE_Tagger_) {
+  LtLocale::tryToSetLocale();
+
+  if (argc - optind < 6 || !(argc - optind < 7)) {
+    std::stringstream what_;
+    what_ << "expected 6 file arguments, got " << argc - optind;
+    throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_);
+  }
+
+  FILE_Tagger_.deserialise(argv[optind + 2]);
+  FILE_Tagger_.set_debug(TheFlags.getDebug());
+  TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags());
+
+  FILE *Dictionary = std::fopen(argv[optind], "r");
+
+  if (Dictionary == NULL) {
+    std::stringstream what_;
+    what_ << "can't open DICTIONARY file \"" << argv[optind]
+          << "\" for reading";
+    throw Exception::apertium_tagger::fopen(what_);
+  }
+
+  FILE_Tagger_.read_dictionary(Dictionary);
+
+  if (std::fclose(Dictionary) != 0) {
+    std::stringstream what_;
+    what_ << "can't close DICTIONARY file \"" << argv[optind] << "\"";
+    throw Exception::apertium_tagger::fclose(what_);
+  }
+
+  FILE *TaggedCorpus = std::fopen(argv[optind + 4], "r");
+
+  if (TaggedCorpus == NULL) {
+    std::stringstream what_;
+    what_ << "can't open TAGGED_CORPUS file \"" << argv[optind + 4]
+          << "\" for reading";
+    throw Exception::apertium_tagger::fopen(what_);
+  }
+
+  FILE *UntaggedCorpus = std::fopen(argv[optind + 5], "r");
+
+  if (UntaggedCorpus == NULL) {
+    // TAGGED_CORPUS is already open; release it before bailing out.
+    std::fclose(TaggedCorpus);
+    std::stringstream what_;
+    what_ << "can't open UNTAGGED_CORPUS file \"" << argv[optind + 5]
+          << "\" for reading";
+    throw Exception::apertium_tagger::fopen(what_);
+  }
+
+#ifdef _MSC_VER
+
+  _setmode(_fileno(TaggedCorpus), _O_U8TEXT);
+  _setmode(_fileno(UntaggedCorpus), _O_U8TEXT);
+
+#endif // _MSC_VER
+
+  FILE_Tagger_.init_probabilities_from_tagged_text_(TaggedCorpus,
+                                                    UntaggedCorpus);
+
+  if (std::fclose(TaggedCorpus) != 0) {
+    // Don't leave UNTAGGED_CORPUS open when reporting the failure.
+    std::fclose(UntaggedCorpus);
+    std::stringstream what_;
+    what_ << "can't close TAGGED_CORPUS file \"" << argv[optind + 4] << "\"";
+    throw Exception::apertium_tagger::fclose(what_);
+  }
+
+  if (std::fclose(UntaggedCorpus) != 0) {
+    std::stringstream what_;
+    what_ << "can't close UNTAGGED_CORPUS file \"" << argv[optind + 5] << "\"";
+    throw Exception::apertium_tagger::fclose(what_);
+  }
+
+  FILE *Corpus = std::fopen(argv[optind + 1], "r");
+
+  if (Corpus == NULL) {
+    std::stringstream what_;
+    what_ << "can't open CORPUS file \"" << argv[optind + 1]
+          << "\" for reading";
+    throw Exception::apertium_tagger::fopen(what_);
+  }
+
+#ifdef _MSC_VER
+
+  _setmode(_fileno(Corpus), _O_U8TEXT);
+
+#endif // _MSC_VER
+
+  FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument);
+
+  if (std::fclose(Corpus) != 0) {
+    std::stringstream what_;
+    what_ << "can't close CORPUS file \"" << argv[optind + 1] << "\"";
+    throw Exception::apertium_tagger::fclose(what_);
+  }
+
+  // The output file is the one the usage text calls SERIALISED_BASIC_TAGGER;
+  // the messages below use that name so they match help() and
+  // r_FILE_Tagger().
+  FILE *Stream_ = std::fopen(argv[optind + 3], "wb");
+
+  if (Stream_ == NULL) {
+    std::stringstream what_;
+    what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind + 3]
+          << "\" for writing in binary mode";
+    throw Exception::apertium_tagger::fopen(what_);
+  }
+
+  FILE_Tagger_.serialise(Stream_);
+
+  if (std::fclose(Stream_) != 0) {
+    std::stringstream what_;
+    what_ << "can't close SERIALISED_BASIC_TAGGER file \"" << argv[optind + 3]
+          << "\"";
+    throw Exception::apertium_tagger::fclose(what_);
+  }
+}
+
+// Unsupervised training with a FILE*-based tagger.
+//
+// File arguments after the options, in order:
+//   DICTIONARY CORPUS TAGGER_SPECIFICATION SERIALISED_BASIC_TAGGER
+// Steps: deserialise(TAGGER_SPECIFICATION path), read DICTIONARY, call
+// init_probabilities_kupiec_() and then train() on CORPUS for
+// TheFunctionTypeOptionArgument iterations, then serialise to
+// SERIALISED_BASIC_TAGGER.
+//
+// Throws UnexpectedFileArgumentCount, fopen or fclose (all in
+// Exception::apertium_tagger).
+void apertium_tagger::t_FILE_Tagger(FILE_Tagger &FILE_Tagger_) {
+  LtLocale::tryToSetLocale();
+
+  if (argc - optind < 4 || !(argc - optind < 5)) {
+    std::stringstream what_;
+    what_ << "expected 4 file arguments, got " << argc - optind;
+    throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_);
+  }
+
+  FILE_Tagger_.deserialise(argv[optind + 2]);
+  FILE_Tagger_.set_debug(TheFlags.getDebug());
+  TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags());
+
+  FILE *Dictionary = std::fopen(argv[optind], "r");
+
+  if (Dictionary == NULL) {
+    std::stringstream what_;
+    what_ << "can't open DICTIONARY file \"" << argv[optind]
+          << "\" for reading";
+    throw Exception::apertium_tagger::fopen(what_);
+  }
+
+  FILE_Tagger_.read_dictionary(Dictionary);
+
+  if (std::fclose(Dictionary) != 0) {
+    std::stringstream what_;
+    what_ << "can't close DICTIONARY file \"" << argv[optind] << "\"";
+    throw Exception::apertium_tagger::fclose(what_);
+  }
+
+  FILE *Corpus = std::fopen(argv[optind + 1], "r");
+
+  if (Corpus == NULL) {
+    std::stringstream what_;
+    what_ << "can't open CORPUS file \"" << argv[optind + 1]
+          << "\" for reading";
+    throw Exception::apertium_tagger::fopen(what_);
+  }
+
+#ifdef _MSC_VER
+
+  _setmode(_fileno(Corpus), _O_U8TEXT);
+
+#endif // _MSC_VER
+
+  // Both calls receive the same FILE*; NOTE(review): any repositioning of the
+  // stream between them would have to happen inside FILE_Tagger -- confirm.
+  FILE_Tagger_.init_probabilities_kupiec_(Corpus);
+  FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument);
+
+  if (std::fclose(Corpus) != 0) {
+    std::stringstream what_;
+    what_ << "can't close CORPUS file \"" << argv[optind + 1] << "\"";
+    throw Exception::apertium_tagger::fclose(what_);
+  }
+
+  // The output file is the one the usage text calls SERIALISED_BASIC_TAGGER;
+  // the messages below use that name so they match help() and
+  // r_FILE_Tagger().
+  FILE *Stream_ = std::fopen(argv[optind + 3], "wb");
+
+  if (Stream_ == NULL) {
+    std::stringstream what_;
+    what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind + 3]
+          << "\" for writing in binary mode";
+    throw Exception::apertium_tagger::fopen(what_);
+  }
+
+  FILE_Tagger_.serialise(Stream_);
+
+  if (std::fclose(Stream_) != 0) {
+    std::stringstream what_;
+    what_ << "can't close SERIALISED_BASIC_TAGGER file \"" << argv[optind + 3]
+          << "\"";
+    throw Exception::apertium_tagger::fclose(what_);
+  }
+}
+}
+
+// Program entry point. All the work happens in the apertium_tagger
+// constructor, invoked here on a temporary.
+//
+// err_Exception: print a hint pointing at --help and exit with status 1.
+// Anything else is rethrown unchanged.
+int main(int argc, char **argv) {
+ try {
+ apertium_tagger(argc, argv);
+ } catch (const err_Exception &err_Exception_) {
+ std::cerr << "Try 'apertium-tagger --help' for more information.\n";
+ return 1;
+ } catch (...) {
+ throw;
+ }
}
diff --git a/apertium/apertium_tagger.h b/apertium/apertium_tagger.h
new file mode 100644
index 0000000..e4e9656
--- /dev/null
+++ b/apertium/apertium_tagger.h
@@ -0,0 +1,105 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef APERTIUM_TAGGER_H
+#define APERTIUM_TAGGER_H
+
+#include "apertium_config.h"
+
+#include "basic_stream_tagger.h"
+#include "basic_stream_tagger_trainer.h"
+#include "basic_tagger.h"
+#include "constructor_eq_delete.h"
+#include "file_tagger.h"
+#include "optional.h"
+
+#include <getopt.h>
+#include <string>
+
+namespace Apertium {
+// Command-line driver of the apertium-tagger program. The constructor is the
+// only public member; the .cc file shows it dispatching on TheFunctionType
+// and TheFunctionTypeType to one of the g_/r_/s_/t_ member functions below.
+// NOTE(review): constructor_eq_delete is defined elsewhere; presumably it
+// disables copying -- confirm.
+class apertium_tagger : private constructor_eq_delete {
+public:
+ apertium_tagger(int &argc, char **&argv);
+
+private:
+ // Algorithm family chosen with -u (Unigram) or -w (SlidingWindow).
+ enum FunctionTypeType { Unigram, SlidingWindow };
+ // Which unigram model -u MODEL selected.
+ enum UnigramType { Stream_5_3_1, Stream_5_3_2, Stream_5_3_3 };
+ // Mode of operation: -g, -r, -s, -t respectively.
+ enum FunctionType { Tagger, Retrain, Supervised, Train };
+ // Prints usage and the option table to std::cerr.
+ static void help();
+
+#if HAVE_GETOPT_LONG
+
+ // "--name" for a longopts index / entry.
+ static std::string option_string(const int &indexptr_);
+ static std::string option_string(const struct option &option_);
+
+#else
+
+ // "-c" for a short option character.
+ static std::string option_string(const int &val_);
+
+#endif // HAVE_GETOPT_LONG
+
+ // Sets the process locale (compiler/platform dependent).
+ static void locale_global_();
+
+#if HAVE_GETOPT_LONG
+
+ // Long-option table, terminated by an all-zero entry.
+ static const struct option longopts[];
+
+#endif // HAVE_GETOPT_LONG
+
+#if HAVE_GETOPT_LONG
+
+ // Makes The_indexptr refer to the longopts entry matching The_val.
+ void set_indexptr();
+
+#endif // HAVE_GETOPT_LONG
+
+ // Sets one flag in TheFlags; throws if it was already set.
+ void flagOptionCase(bool (basic_Tagger::Flags::*GetFlag)() const,
+ void (basic_Tagger::Flags::*SetFlag)(const bool &));
+ // Spelling of the option currently being processed.
+ std::string option_string();
+ // Record the algorithm family / the mode; throw if one was already recorded.
+ void functionTypeTypeOptionCase(const FunctionTypeType &FunctionTypeType_);
+ void functionTypeOptionCase(const FunctionType &FunctionType_);
+ // Parse optarg into TheFunctionTypeOptionArgument.
+ void getIterationsArgument();
+ unsigned long optarg_unsigned_long() const;
+ // One entry point per (mode, tagger kind) combination; the prefix letter is
+ // the mode option (-g, -r, -s, -t).
+ void g_StreamTagger(basic_StreamTagger &StreamTagger_);
+ void s_StreamTaggerTrainer(basic_StreamTaggerTrainer &StreamTaggerTrainer_);
+ void g_FILE_Tagger(FILE_Tagger &FILE_Tagger_);
+ void r_FILE_Tagger(FILE_Tagger &FILE_Tagger_);
+ void s_FILE_Tagger(FILE_Tagger &FILE_Tagger_);
+ void t_FILE_Tagger(FILE_Tagger &FILE_Tagger_);
+ // References to the arguments passed to the constructor.
+ int &argc;
+ char **&argv;
+ // Character code of the option currently being processed.
+ int The_val;
+
+#if HAVE_GETOPT_LONG
+
+ // longopts index of the current option, and of the options that selected
+ // the algorithm family and the mode (empty until one is seen).
+ int The_indexptr;
+ Optional<int> FunctionTypeTypeOption_indexptr;
+ Optional<int> FunctionTypeOption_indexptr;
+
+#else
+
+ // Option characters that selected the algorithm family and the mode.
+ // "FunctionTypeOptiona_val" is spelled this way in the .cc file as well.
+ Optional<int> FunctionTypeTypeOption_val;
+ Optional<int> FunctionTypeOptiona_val;
+
+#endif // HAVE_GETOPT_LONG
+
+ Optional<FunctionTypeType> TheFunctionTypeType;
+ Optional<UnigramType> TheUnigramType;
+ Optional<FunctionType> TheFunctionType;
+ // ITERATIONS argument of -r/-s/-t.
+ unsigned long TheFunctionTypeOptionArgument;
+ // Boolean flags; the getters used in the .cc file are getDebug, getFirst,
+ // getMark, getShowSuperficial and getNullFlush.
+ basic_Tagger::Flags TheFlags;
+};
+}
+
+#endif // APERTIUM_TAGGER_H
diff --git a/apertium/apertium_tagger_apply_new_rules.cc b/apertium/apertium_tagger_apply_new_rules.cc
index 80c2373..7b85305 100644
--- a/apertium/apertium_tagger_apply_new_rules.cc
+++ b/apertium/apertium_tagger_apply_new_rules.cc
@@ -13,9 +13,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <fstream>
@@ -52,9 +50,9 @@ void help(char *name) {
cerr<<"ARGUMENTS: \n"
<<" --filein|-i: To specify the file with the HMM parameter to process\n\n"
- <<" --fileout|-o: To specify the file to which the HMM will be writen\n\n"
+ <<" --fileout|-o: To specify the file to which the HMM will be written\n\n"
<<" --tsxfile|-x: File containing the rules to apply\n\n"
- <<"NOTE: Parameters are read from and writen to the files provided\n";
+ <<"NOTE: Parameters are read from and written to the files provided\n";
}
int main(int argc, char* argv[]) {
@@ -115,9 +113,7 @@ int main(int argc, char* argv[]) {
<<" General Public License for more details.\n"
<<"\n"
<<" You should have received a copy of the GNU General Public License\n"
- <<" along with this program; if not, write to the Free Software\n"
- <<" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA\n"
- <<" 02111-1307, USA.\n";
+ <<" along with this program; if not, see <http://www.gnu.org/licenses/>.\n";
exit(EXIT_SUCCESS);
break;
default:
@@ -171,7 +167,7 @@ int main(int argc, char* argv[]) {
fout=fopen(fileout.c_str(), "wb");
check_file(fout, fileout);
cerr<<"Writing apertium-tagger data to file '"<<fileout<<"' ... "<<flush;
- tagger_data_hmm.write(fout);
+ hmm.serialise(fout);
fclose(fout);
cerr<<"done.\n";
}
diff --git a/apertium/apertium_tagger_readwords.cc b/apertium/apertium_tagger_readwords.cc
index 8fd47f9..a985b63 100644
--- a/apertium/apertium_tagger_readwords.cc
+++ b/apertium/apertium_tagger_readwords.cc
@@ -13,9 +13,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/*
@@ -163,9 +161,7 @@ int main(int argc, char* argv[]) {
<<" General Public License for more details.\n"
<<"\n"
<<" You should have received a copy of the GNU General Public License\n"
- <<" along with this program; if not, write to the Free Software\n"
- <<" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA\n"
- <<" 02111-1307, USA.\n";
+ <<" along with this program; if not, see <http://www.gnu.org/licenses/>.\n";
exit(EXIT_SUCCESS);
break;
default:
diff --git a/apertium/apertium_tmxbuild.cc b/apertium/apertium_tmxbuild.cc
index f988f6c..f404882 100644
--- a/apertium/apertium_tmxbuild.cc
+++ b/apertium/apertium_tmxbuild.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <cstdlib>
#include <getopt.h>
diff --git a/apertium/apertium_transfer.cc b/apertium/apertium_transfer.cc
index 5ee19ee..a097d34 100644
--- a/apertium/apertium_transfer.cc
+++ b/apertium/apertium_transfer.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/transfer.h>
#include <lttoolbox/lt_locale.h>
@@ -53,6 +51,7 @@ void message(char *progname)
cerr << " -x bindix extended mode with user dictionary" << endl;
cerr << " -c case-sensitiveness while accessing bilingual dictionary" << endl;
cerr << " -t trace (show rule numbers and patterns matched)" << endl;
+ cerr << " -T trace, for apertium-transfer-tools (also sets -t)" << endl;
cerr << " -z null-flushing output on '\0'" << endl;
cerr << " -h shows this message" << endl;
@@ -116,13 +115,14 @@ int main(int argc, char *argv[])
{"case-sensitive", no_argument, 0, 'c'},
{"null-flush", no_argument, 0, 'z'},
{"trace", no_argument, 0, 't'},
+ {"trace_att", no_argument, 0, 'T'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}
};
- int c=getopt_long(argc, argv, "nbx:czth", long_options, &option_index);
+ int c=getopt_long(argc, argv, "nbx:cztTh", long_options, &option_index);
#else
- int c=getopt(argc, argv, "nbx:czth");
+ int c=getopt(argc, argv, "nbx:cztTh");
#endif
if (c==-1)
break;
@@ -150,6 +150,11 @@ int main(int argc, char *argv[])
t.setTrace(true);
break;
+ case 'T':
+ t.setTrace(true);
+ t.setTraceATT(true);
+ break;
+
case 'z':
t.setNullFlush(true);
break;
diff --git a/apertium/basic_5_3_1_tagger.cc b/apertium/basic_5_3_1_tagger.cc
new file mode 100644
index 0000000..73c2cfb
--- /dev/null
+++ b/apertium/basic_5_3_1_tagger.cc
@@ -0,0 +1,20 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "basic_5_3_1_tagger.h"
+
+namespace Apertium {
+// Default constructor: Model starts out empty.
+basic_5_3_1_Tagger::basic_5_3_1_Tagger() : Model() {}
+}
diff --git a/apertium/basic_5_3_1_tagger.h b/apertium/basic_5_3_1_tagger.h
new file mode 100644
index 0000000..d79cdef
--- /dev/null
+++ b/apertium/basic_5_3_1_tagger.h
@@ -0,0 +1,32 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef BASIC_5_3_1_TAGGER_H
+#define BASIC_5_3_1_TAGGER_H
+
+#include "analysis.h"
+
+#include <cstddef>
+#include <map>
+
+namespace Apertium {
+class basic_5_3_1_Tagger { // Holds a table keyed by Analysis; everything is protected, so it is only usable as a base class.
+protected:
+ basic_5_3_1_Tagger(); // Protected: only derived classes can construct it.
+ std::map<Analysis, std::size_t> Model; // Analysis -> std::size_t (presumably an occurrence count -- confirm in the trainer).
+};
+} // namespace Apertium
+
+#endif // BASIC_5_3_1_TAGGER_H
diff --git a/apertium/basic_5_3_2_tagger.cc b/apertium/basic_5_3_2_tagger.cc
new file mode 100644
index 0000000..dd94766
--- /dev/null
+++ b/apertium/basic_5_3_2_tagger.cc
@@ -0,0 +1,20 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "basic_5_3_2_tagger.h"
+
+namespace Apertium {
+basic_5_3_2_Tagger::basic_5_3_2_Tagger() {} // No initialiser list: members are default-constructed.
+} // namespace Apertium
diff --git a/apertium/basic_5_3_2_tagger.h b/apertium/basic_5_3_2_tagger.h
new file mode 100644
index 0000000..94a3653
--- /dev/null
+++ b/apertium/basic_5_3_2_tagger.h
@@ -0,0 +1,33 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef BASIC_5_3_2_TAGGER_H
+#define BASIC_5_3_2_TAGGER_H
+
+#include "a.h"
+#include "lemma.h"
+
+#include <cstddef>
+#include <map>
+
+namespace Apertium {
+class basic_5_3_2_Tagger { // Holds a two-level table; everything is protected, so it is only usable as a base class.
+protected:
+ basic_5_3_2_Tagger(); // Protected: only derived classes can construct it.
+ std::map<a, std::map<Lemma, std::size_t> > Model; // a -> Lemma -> std::size_t (presumably counts -- confirm in the trainer).
+};
+} // namespace Apertium
+
+#endif // BASIC_5_3_2_TAGGER_H
diff --git a/apertium/basic_5_3_3_tagger.h b/apertium/basic_5_3_3_tagger.h
new file mode 100644
index 0000000..055fd91
--- /dev/null
+++ b/apertium/basic_5_3_3_tagger.h
@@ -0,0 +1,35 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef BASIC_5_3_3_TAGGER_H
+#define BASIC_5_3_3_TAGGER_H
+
+#include "i.h"
+#include "lemma.h"
+
+#include <cstddef>
+#include <map>
+#include <utility>
+
+namespace Apertium {
+class basic_5_3_3_Tagger { // Declares no constructor of its own; the implicit one default-constructs Model.
+protected:
+ std::pair<std::map<i, std::map<Lemma, std::size_t> >, // Model.first: i -> Lemma -> std::size_t
+ std::pair<std::map<i, std::map<Lemma, std::size_t> >, // Model.second.first: i -> Lemma -> std::size_t
+ std::map<Lemma, std::map<i, std::size_t> > > > Model; // Model.second.second: Lemma -> i -> std::size_t
+};
+} // namespace Apertium
+
+#endif // BASIC_5_3_3_TAGGER_H
diff --git a/apertium/basic_exception_type.cc b/apertium/basic_exception_type.cc
new file mode 100644
index 0000000..a6e24cd
--- /dev/null
+++ b/apertium/basic_exception_type.cc
@@ -0,0 +1,20 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "basic_exception_type.h"
+
+namespace Apertium {
+basic_ExceptionType::~basic_ExceptionType() throw() {} // A body is still required: derived destructors call it even though it is declared pure virtual.
+} // namespace Apertium
diff --git a/apertium/basic_exception_type.h b/apertium/basic_exception_type.h
new file mode 100644
index 0000000..a02cf95
--- /dev/null
+++ b/apertium/basic_exception_type.h
@@ -0,0 +1,29 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef BASIC_EXCEPTION_TYPE_H
+#define BASIC_EXCEPTION_TYPE_H
+
+#include <exception>
+
+namespace Apertium {
+class basic_ExceptionType : public std::exception { // Abstract exception base derived from std::exception.
+public:
+ virtual ~basic_ExceptionType() throw() = 0; // Pure virtual, but defined in basic_exception_type.cc.
+ virtual const char *what() const throw() = 0; // Pure virtual: each concrete exception supplies its own message.
+};
+} // namespace Apertium
+
+#endif // BASIC_EXCEPTION_TYPE_H
diff --git a/apertium/basic_stream_tagger.cc b/apertium/basic_stream_tagger.cc
new file mode 100644
index 0000000..bc72281
--- /dev/null
+++ b/apertium/basic_stream_tagger.cc
@@ -0,0 +1,125 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "basic_stream_tagger.h"
+
+#include "apertium_config.h"
+
+#include "basic_tagger.h"
+#include "lexical_unit.h"
+#include "stream.h"
+#include "streamed_type.h"
+
+#include <ostream>
+
+#if ENABLE_DEBUG
+
+#include <iomanip>
+#include <iostream>
+#include <limits>
+
+#endif // ENABLE_DEBUG
+
+namespace Apertium {
+basic_StreamTagger::~basic_StreamTagger() {} // Out-of-line definition of the virtual destructor; the body does nothing.
+
+void basic_StreamTagger::tag(Stream &Input, std::wostream &Output) const {
+  // Echo each item's text, tag its lexical unit if it has one, and stop at
+  // the first item without a lexical unit for which Input.flush_() is false.
+  for (;;) {
+    StreamedType Token = Input.get();
+    Output << Token.TheString;
+
+    if (Token.TheLexicalUnit) {
+#if ENABLE_DEBUG
+      std::wcerr << L"\n\n";
+#endif // ENABLE_DEBUG
+      tag(*Token.TheLexicalUnit, Output);
+
+      if (Input.flush_())
+        Output << std::flush;
+
+      continue;
+    }
+
+    if (!Input.flush_())
+      break;
+
+    Output << std::flush;
+  }
+}
+
+basic_StreamTagger::basic_StreamTagger(const basic_Tagger::Flags &Flags_) // Only forwards the flags to the basic_Tagger base.
+ : basic_Tagger(Flags_) {}
+
+void basic_StreamTagger::tag(const LexicalUnit &LexicalUnit_,
+                             std::wostream &Output) const {
+  // Writes one "^...$" unit: the highest-scoring analysis first and, when
+  // the First flag is set, every other analysis after it.
+  typedef std::vector<Analysis>::const_iterator Iterator;
+#if ENABLE_DEBUG
+
+  // Trace each candidate's score on std::wcerr before choosing.
+  for (Iterator Traced = LexicalUnit_.TheAnalyses.begin();
+       Traced != LexicalUnit_.TheAnalyses.end(); ++Traced) {
+    std::wcerr << L"score(\"" << *Traced << L"\") ==\n "
+               << score_DEBUG(*Traced) << L" ==\n " << std::fixed
+               << std::setprecision(std::numeric_limits<long double>::digits10)
+               << score(*Traced) << L"\n";
+  }
+
+#endif // ENABLE_DEBUG
+
+  const bool ShowSurface = TheFlags.getShowSuperficial();
+  Output << L"^";
+
+  // No analyses: write the surface form behind "*" and stop.
+  if (LexicalUnit_.TheAnalyses.empty()) {
+    if (ShowSurface)
+      Output << LexicalUnit_.TheSurfaceForm << L"/";
+
+    Output << L"*" << LexicalUnit_.TheSurfaceForm << L"$";
+    return;
+  }
+
+  // "=" is written when marking is on and there is not exactly one analysis.
+  if (TheFlags.getMark() && LexicalUnit_.TheAnalyses.size() != 1)
+    Output << L"=";
+
+  if (ShowSurface)
+    Output << LexicalUnit_.TheSurfaceForm << L"/";
+
+  // Strict '>' keeps the earliest analysis when scores tie.
+  Iterator Best = LexicalUnit_.TheAnalyses.begin();
+  Iterator Candidate = Best;
+
+  while (++Candidate != LexicalUnit_.TheAnalyses.end()) {
+    if (score(*Candidate) > score(*Best))
+      Best = Candidate;
+  }
+
+  Output << *Best;
+
+  if (TheFlags.getFirst()) {
+    for (Iterator Other = LexicalUnit_.TheAnalyses.begin();
+         Other != LexicalUnit_.TheAnalyses.end(); ++Other) {
+      if (Other != Best)
+        Output << L"/" << *Other;
+    }
+  }
+
+  Output << L"$";
+}
+}
diff --git a/apertium/basic_stream_tagger.h b/apertium/basic_stream_tagger.h
new file mode 100644
index 0000000..57eeb91
--- /dev/null
+++ b/apertium/basic_stream_tagger.h
@@ -0,0 +1,56 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef BASIC_STREAM_TAGGER_H
+#define BASIC_STREAM_TAGGER_H
+
+#include "apertium_config.h"
+
+#include "basic_tagger.h"
+#include "lexical_unit.h"
+#include "stream.h"
+
+#include <istream>
+#include <ostream>
+
+#if ENABLE_DEBUG
+
+#include <string>
+
+#endif // ENABLE_DEBUG
+
+namespace Apertium {
+class basic_StreamTagger : protected basic_Tagger { // Abstract tagger: derived classes supply deserialise() and score().
+public:
+ virtual ~basic_StreamTagger();
+ virtual void deserialise(std::istream &Serialised_basic_Tagger) = 0; // Pure virtual; implemented by derived taggers.
+ void tag(Stream &Input, std::wostream &Output) const; // Echoes Input to Output, replacing each lexical unit with its tagged form.
+
+protected:
+ basic_StreamTagger(const Flags &Flags_);
+ virtual long double score(const Analysis &Analysis_) const = 0; // tag() outputs the analysis with the greatest score first.
+
+#if ENABLE_DEBUG
+
+ virtual std::wstring score_DEBUG(const Analysis &Analysis_) const = 0; // Printed next to score() on std::wcerr in debug builds.
+
+#endif // ENABLE_DEBUG
+
+private:
+ void tag(const LexicalUnit &LexicalUnit_, std::wostream &Output) const; // Writes a single "^...$" unit.
+};
+} // namespace Apertium
+
+#endif // BASIC_STREAM_TAGGER_H
diff --git a/apertium/basic_stream_tagger_trainer.cc b/apertium/basic_stream_tagger_trainer.cc
new file mode 100644
index 0000000..294d918
--- /dev/null
+++ b/apertium/basic_stream_tagger_trainer.cc
@@ -0,0 +1,59 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "basic_stream_tagger_trainer.h"
+
+#include "analysis.h"
+#include "basic_tagger.h"
+#include "exception.h"
+#include "stream.h"
+#include "streamed_type.h"
+
+namespace Apertium {
+basic_StreamTaggerTrainer::~basic_StreamTaggerTrainer() {} // Out-of-line definition of the virtual destructor; the body does nothing.
+
+// Feeds every analysis of every lexical unit in the corpus to
+// train_Analysis(), stopping at the first item without a lexical unit.
+void basic_StreamTaggerTrainer::train(Stream &TaggedCorpus) {
+  for (;;) {
+    StreamedType Token = TaggedCorpus.get();
+
+    if (!Token.TheLexicalUnit)
+      return;
+
+    const std::size_t AnalysisCount = Token.TheLexicalUnit->TheAnalyses.size();
+
+    if (AnalysisCount == 0)
+      throw Exception::LexicalUnit::TheAnalyses_empty(
+          "can't train LexicalUnit comprising empty Analysis std::vector");
+
+    // Rescale so the coefficient divides evenly among the analyses.
+    if (OccurrenceCoefficient % AnalysisCount != 0) {
+      OccurrenceCoefficient *= AnalysisCount;
+      multiplyModel(AnalysisCount);
+    }
+
+    std::vector<Analysis>::const_iterator Current =
+        Token.TheLexicalUnit->TheAnalyses.begin();
+
+    for (; Current != Token.TheLexicalUnit->TheAnalyses.end(); ++Current)
+      train_Analysis(*Current, OccurrenceCoefficient / AnalysisCount);
+  }
+}
+
+basic_StreamTaggerTrainer::basic_StreamTaggerTrainer( // Forwards the flags to basic_Tagger and starts OccurrenceCoefficient at 1.
+ const basic_Tagger::Flags &Flags_)
+ : basic_Tagger(Flags_), OccurrenceCoefficient(1) {}
+}
diff --git a/apertium/basic_stream_tagger_trainer.h b/apertium/basic_stream_tagger_trainer.h
new file mode 100644
index 0000000..db6b25f
--- /dev/null
+++ b/apertium/basic_stream_tagger_trainer.h
@@ -0,0 +1,41 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef BASIC_STREAM_TAGGER_TRAINER_H
+#define BASIC_STREAM_TAGGER_TRAINER_H
+
+#include "basic_tagger.h"
+#include "stream.h"
+
+#include <ostream>
+
+namespace Apertium {
+class basic_StreamTaggerTrainer : protected basic_Tagger { // Abstract trainer: derived classes supply the three pure virtuals below.
+public:
+ virtual ~basic_StreamTaggerTrainer();
+ void train(Stream &TaggedCorpus); // Calls train_Analysis() for every analysis of every lexical unit read.
+ virtual void serialise(std::ostream &Serialised_basic_Tagger) const = 0; // Pure virtual; implemented by derived trainers.
+
+protected:
+ basic_StreamTaggerTrainer(const Flags &Flags_); // Sets OccurrenceCoefficient to 1.
+ virtual void train_Analysis(const Analysis &Analysis_,
+ const std::size_t &Coefficient_) = 0; // train() passes OccurrenceCoefficient divided by the unit's analysis count.
+ virtual void
+ multiplyModel(const std::size_t &OccurrenceCoefficientMultiplier) = 0; // Called by train() right after it multiplies OccurrenceCoefficient by the same factor.
+ std::size_t OccurrenceCoefficient; // Only ever multiplied in train(); NOTE(review): no overflow check is visible.
+};
+} // namespace Apertium
+
+#endif // BASIC_STREAM_TAGGER_TRAINER_H
diff --git a/apertium/basic_tagger.cc b/apertium/basic_tagger.cc
new file mode 100644
index 0000000..361a24f
--- /dev/null
+++ b/apertium/basic_tagger.cc
@@ -0,0 +1,48 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "basic_tagger.h"
+
+namespace Apertium {
+basic_Tagger::Flags::Flags() // Every flag starts out false.
+ : Debug(false), First(false), Mark(false), ShowSuperficial(false),
+ NullFlush(false) {}
+
+bool basic_Tagger::Flags::getDebug() const { return Debug; } // Plain accessor, like every get*/set* below.
+
+void basic_Tagger::Flags::setDebug(const bool &Debug_) { Debug = Debug_; }
+
+bool basic_Tagger::Flags::getFirst() const { return First; }
+
+void basic_Tagger::Flags::setFirst(const bool &First_) { First = First_; }
+
+bool basic_Tagger::Flags::getMark() const { return Mark; }
+
+void basic_Tagger::Flags::setMark(const bool &Mark_) { Mark = Mark_; }
+
+bool basic_Tagger::Flags::getShowSuperficial() const { return ShowSuperficial; }
+
+void basic_Tagger::Flags::setShowSuperficial(const bool &ShowSuperficial_) {
+ ShowSuperficial = ShowSuperficial_;
+}
+
+bool basic_Tagger::Flags::getNullFlush() const { return NullFlush; }
+
+void basic_Tagger::Flags::setNullFlush(const bool &NullFlush_) {
+ NullFlush = NullFlush_;
+}
+
+basic_Tagger::basic_Tagger(const Flags &Flags_) : TheFlags(Flags_) {} // Stores a copy of the caller's flags.
+} // namespace Apertium
diff --git a/apertium/basic_tagger.h b/apertium/basic_tagger.h
new file mode 100644
index 0000000..925bcd1
--- /dev/null
+++ b/apertium/basic_tagger.h
@@ -0,0 +1,60 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef BASIC_TAGGER_H
+#define BASIC_TAGGER_H
+
+namespace Apertium {
+class basic_Tagger { // Base class whose only state is a copy of the option flags.
+public:
+ class Flags { // Five independent boolean options, each with a getter and a setter.
+ public:
+ Flags(); // Initialises every flag to false (see basic_tagger.cc).
+ bool getDebug() const;
+ void setDebug(const bool &Debug_);
+ bool getFirst() const;
+ void setFirst(const bool &First_);
+ bool getMark() const;
+ void setMark(const bool &Mark_);
+ bool getShowSuperficial() const;
+ void setShowSuperficial(const bool &ShowSuperficial_);
+ bool getNullFlush() const;
+ void setNullFlush(const bool &NullFlush_);
+ static bool (Flags::*GetDebug)() const; // Static pointers to member functions. NOTE(review): basic_tagger.cc does not define them -- confirm where they are defined.
+ static void (Flags::*SetDebug)(const bool &);
+ static bool (Flags::*GetFirst)() const;
+ static void (Flags::*SetFirst)(const bool &);
+ static bool (Flags::*GetMark)() const;
+ static void (Flags::*SetMark)(const bool &);
+ static bool (Flags::*GetShowSuperficial)() const;
+ static void (Flags::*SetShowSuperficial)(const bool &);
+ static bool (Flags::*GetNullFlush)() const;
+ static void (Flags::*SetNullFlush)(const bool &);
+
+ private:
+ bool Debug : 1; // Each flag is a one-bit bit-field.
+ bool First : 1;
+ bool Mark : 1;
+ bool ShowSuperficial : 1;
+ bool NullFlush : 1;
+ };
+
+protected:
+ basic_Tagger(const Flags &Flags_); // Protected: only derived classes can construct it.
+ Flags TheFlags; // Copy of the flags passed to the constructor.
+};
+} // namespace Apertium
+
+#endif // BASIC_TAGGER_H
diff --git a/apertium/collection.cc b/apertium/collection.cc
index a902745..313447b 100644
--- a/apertium/collection.cc
+++ b/apertium/collection.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <lttoolbox/compression.h>
#include <apertium/collection.h>
diff --git a/apertium/collection.h b/apertium/collection.h
index b07b7a3..eb0035f 100644
--- a/apertium/collection.h
+++ b/apertium/collection.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __COLLECTION_H
#define __COLLECTION_H
diff --git a/apertium/constant_manager.cc b/apertium/constant_manager.cc
index a7f07c4..66ff19e 100644
--- a/apertium/constant_manager.cc
+++ b/apertium/constant_manager.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/constant_manager.h>
#include <lttoolbox/compression.h>
diff --git a/apertium/constant_manager.h b/apertium/constant_manager.h
index 00ad49a..d264ab3 100644
--- a/apertium/constant_manager.h
+++ b/apertium/constant_manager.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _CONSTANTMANAGER_
#define _CONSTANTMANAGER_
diff --git a/apertium/constructor_eq_delete.h b/apertium/constructor_eq_delete.h
new file mode 100644
index 0000000..a9c671d
--- /dev/null
+++ b/apertium/constructor_eq_delete.h
@@ -0,0 +1,32 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef CONSTRUCTOR_EQ_DELETE_H
+#define CONSTRUCTOR_EQ_DELETE_H
+
+namespace Apertium {
+class constructor_eq_delete { // Base class that makes derived types non-copyable (pre-C++11 idiom).
+protected:
+ constructor_eq_delete() {}
+ ~constructor_eq_delete() {} // Protected and non-virtual: objects cannot be destroyed through this base type.
+
+private:
+ constructor_eq_delete(const constructor_eq_delete &constructor_eq_delete_); // Private and not defined in this header, so copying is rejected.
+ constructor_eq_delete &
+ operator=(constructor_eq_delete constructor_eq_delete_); // Likewise for assignment; note the parameter is taken by value.
+};
+} // namespace Apertium
+
+#endif // CONSTRUCTOR_EQ_DELETE_H
diff --git a/apertium/deformat.xsl b/apertium/deformat.xsl
index e7c4626..938af7a 100644
--- a/apertium/deformat.xsl
+++ b/apertium/deformat.xsl
@@ -13,9 +13,7 @@
General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA.
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
-->
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
diff --git a/apertium/deserialiser.h b/apertium/deserialiser.h
new file mode 100644
index 0000000..5bfa242
--- /dev/null
+++ b/apertium/deserialiser.h
@@ -0,0 +1,255 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef DESERIALISER_H
+#define DESERIALISER_H
+
+#include "a.h"
+#include "analysis.h"
+#include "basic_exception_type.h"
+#include "exception.h"
+#include "i.h"
+#include "lemma.h"
+#include "morpheme.h"
+#include "tag.h"
+
+#include <cstddef>
+#include <istream>
+#include <limits>
+#include <map>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace Apertium {
+template <typename DeserialisedType> class Deserialiser; // Primary template is declared only; each supported type gets a specialisation below.
+
+template <> class Deserialiser<a> {
+public:
+ inline static a deserialise(std::istream &Stream_);
+};
+
+template <> class Deserialiser<Analysis> {
+public:
+ inline static Analysis deserialise(std::istream &Stream_);
+};
+
+template <> class Deserialiser<i> {
+public:
+ inline static i deserialise(std::istream &Stream_);
+};
+
+template <> class Deserialiser<Lemma> {
+public:
+ inline static Lemma deserialise(std::istream &Stream_);
+};
+
+template <> class Deserialiser<Morpheme> {
+public:
+ inline static Morpheme deserialise(std::istream &Stream_);
+};
+
+template <> class Deserialiser<Tag> {
+public:
+ inline static Tag deserialise(std::istream &Stream_);
+};
+
+template <typename value_type>
+class Deserialiser<std::basic_string<value_type> > { // Partial specialisation: element count followed by the characters.
+public:
+ inline static std::basic_string<value_type>
+ deserialise(std::istream &Stream_);
+};
+
+template <typename key_type, typename mapped_type>
+class Deserialiser<std::map<key_type, mapped_type> > { // Partial specialisation: entry count followed by (key, value) pairs.
+public:
+ inline static std::map<key_type, mapped_type>
+ deserialise(std::istream &Stream_);
+};
+
+template <typename first_type, typename second_type>
+class Deserialiser<std::pair<first_type, second_type> > { // Partial specialisation: first, then second.
+public:
+ inline static std::pair<first_type, second_type>
+ deserialise(std::istream &Stream_);
+};
+
+template <> class Deserialiser<std::size_t> { // Length byte followed by that many value bytes.
+public:
+ inline static std::size_t deserialise(std::istream &Stream_);
+};
+
+template <typename value_type> class Deserialiser<std::vector<value_type> > { // Partial specialisation: element count followed by the elements.
+public:
+ inline static std::vector<value_type> deserialise(std::istream &Stream_);
+};
+
+template <> class Deserialiser<wchar_t> { // Same byte layout as the std::size_t specialisation.
+public:
+ inline static wchar_t deserialise(std::istream &Stream_);
+};
+
+a Deserialiser<a>::deserialise(std::istream &Stream_) {
+ a StreamedType_; // Filled in stream order: TheTags first, then TheMorphemes.
+ StreamedType_.TheTags = Deserialiser<std::vector<Tag> >::deserialise(Stream_);
+ StreamedType_.TheMorphemes =
+ Deserialiser<std::vector<Morpheme> >::deserialise(Stream_);
+ return StreamedType_;
+}
+
+Analysis Deserialiser<Analysis>::deserialise(std::istream &Stream_) {
+ Analysis SerialisedType_; // Only TheMorphemes is read from the stream.
+ SerialisedType_.TheMorphemes =
+ Deserialiser<std::vector<Morpheme> >::deserialise(Stream_);
+ return SerialisedType_;
+}
+
+i Deserialiser<i>::deserialise(std::istream &Stream_) {
+ i StreamedType_; // Only TheTags is read from the stream.
+ StreamedType_.TheTags = Deserialiser<std::vector<Tag> >::deserialise(Stream_);
+ return StreamedType_;
+}
+
+Lemma Deserialiser<Lemma>::deserialise(std::istream &Stream_) {
+ Lemma StreamedType_; // Only TheLemma (a std::wstring) is read from the stream.
+ StreamedType_.TheLemma = Deserialiser<std::wstring>::deserialise(Stream_);
+ return StreamedType_;
+}
+
+Morpheme Deserialiser<Morpheme>::deserialise(std::istream &Stream_) {
+ Morpheme SerialisedType_; // Filled in stream order: TheLemma first, then TheTags.
+ SerialisedType_.TheLemma = Deserialiser<std::wstring>::deserialise(Stream_);
+ SerialisedType_.TheTags =
+ Deserialiser<std::vector<Tag> >::deserialise(Stream_);
+ return SerialisedType_;
+}
+
+Tag Deserialiser<Tag>::deserialise(std::istream &Stream_) {
+ Tag SerialisedType_; // Only TheTag (a std::wstring) is read from the stream.
+ SerialisedType_.TheTag = Deserialiser<std::wstring>::deserialise(Stream_);
+ return SerialisedType_;
+}
+
+// Reads an element count, then appends that many deserialised characters in
+// the order they appear in the stream.
+template <typename value_type>
+std::basic_string<value_type>
+Deserialiser<std::basic_string<value_type> >::deserialise(
+    std::istream &Stream_) {
+  const std::size_t Length = Deserialiser<std::size_t>::deserialise(Stream_);
+  std::basic_string<value_type> Text;
+
+  for (std::size_t Index = 0; Index != Length; ++Index)
+    Text.push_back(Deserialiser<value_type>::deserialise(Stream_));
+
+  return Text;
+}
+
+// Reads an entry count, then inserts that many deserialised (key, value)
+// pairs; std::map::insert keeps the first entry of any repeated key.
+template <typename key_type, typename mapped_type>
+std::map<key_type, mapped_type>
+Deserialiser<std::map<key_type, mapped_type> >::deserialise(
+    std::istream &Stream_) {
+  typedef std::pair<key_type, mapped_type> Entry;
+  const std::size_t EntryCount = Deserialiser<std::size_t>::deserialise(Stream_);
+  std::map<key_type, mapped_type> Table;
+
+  for (std::size_t Index = 0; Index != EntryCount; ++Index)
+    Table.insert(Deserialiser<Entry>::deserialise(Stream_));
+
+  return Table;
+}
+
+template <typename first_type, typename second_type>
+std::pair<first_type, second_type>
+Deserialiser<std::pair<first_type, second_type> >::deserialise(
+ std::istream &Stream_) {
+ std::pair<first_type, second_type> SerialisedType_; // Default-constructed, then both members are assigned.
+ SerialisedType_.first = Deserialiser<first_type>::deserialise(Stream_); // Read order is fixed: first ...
+ SerialisedType_.second = Deserialiser<second_type>::deserialise(Stream_); // ... then second.
+ return SerialisedType_;
+}
+
+// Reads one length byte, then that many value bytes (most significant first).
+// basic_ExceptionType failures are rethrown as Exception::Deserialiser::size_t_.
+std::size_t Deserialiser<std::size_t>::deserialise(std::istream &Stream_) {
+  try {
+    std::size_t SerialisedType_ = 0;
+    unsigned char SerialisedTypeSize = Stream_.get();
+    if (!Stream_)
+      throw Exception::Deserialiser::not_Stream_good("can't deserialise size");
+    // A longer value would shift by >= the width of std::size_t (undefined).
+    if (SerialisedTypeSize > sizeof(std::size_t))
+      throw Exception::Deserialiser::not_Stream_good("size byte too large");
+
+    while (SerialisedTypeSize != 0) {
+      SerialisedType_ += static_cast<std::size_t>(Stream_.get())
+          << std::numeric_limits<unsigned char>::digits * --SerialisedTypeSize;
+      if (!Stream_)
+        throw Exception::Deserialiser::not_Stream_good("can't deserialise byte");
+    }
+    return SerialisedType_;
+  } catch (const basic_ExceptionType &basic_ExceptionType_) {
+    std::stringstream what_;
+    what_ << "can't deserialise std::size_t: " << basic_ExceptionType_.what();
+    throw Exception::Deserialiser::size_t_(what_);
+  }
+}
+
+// Reads an element count, then appends that many deserialised elements in
+// the order they appear in the stream.
+template <typename value_type>
+std::vector<value_type>
+Deserialiser<std::vector<value_type> >::deserialise(std::istream &Stream_) {
+  const std::size_t Length = Deserialiser<std::size_t>::deserialise(Stream_);
+  std::vector<value_type> Elements;
+
+  for (std::size_t Index = 0; Index != Length; ++Index)
+    Elements.push_back(Deserialiser<value_type>::deserialise(Stream_));
+
+  return Elements;
+}
+
+// Reads one length byte, then that many value bytes (most significant first),
+// and narrows the result to wchar_t; failures are rethrown as wchar_t_.
+wchar_t Deserialiser<wchar_t>::deserialise(std::istream &Stream_) {
+  try {
+    // wchar_t takes no signed/unsigned specifier, so accumulate in std::size_t.
+    std::size_t SerialisedType_ = 0;
+    unsigned char SerialisedTypeSize = Stream_.get();
+    if (!Stream_)
+      throw Exception::Deserialiser::not_Stream_good("can't deserialise size");
+    // A longer value would shift by >= the width of std::size_t (undefined).
+    if (SerialisedTypeSize > sizeof(std::size_t))
+      throw Exception::Deserialiser::not_Stream_good("size byte too large");
+    while (SerialisedTypeSize != 0) {
+      SerialisedType_ += static_cast<std::size_t>(Stream_.get())
+          << std::numeric_limits<unsigned char>::digits * --SerialisedTypeSize;
+      if (!Stream_)
+        throw Exception::Deserialiser::not_Stream_good("can't deserialise byte");
+    }
+    return static_cast<wchar_t>(SerialisedType_);
+  } catch (const basic_ExceptionType &basic_ExceptionType_) {
+    std::stringstream what_;
+    what_ << "can't deserialise wchar_t: " << basic_ExceptionType_.what();
+    throw Exception::Deserialiser::wchar_t_(what_);
+  }
+}
+}
+
+#endif // DESERIALISER_H
diff --git a/apertium/endian_double_util.cc b/apertium/endian_double_util.cc
index cf37475..d9e2d99 100644
--- a/apertium/endian_double_util.cc
+++ b/apertium/endian_double_util.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <cstdio>
diff --git a/apertium/endian_double_util.h b/apertium/endian_double_util.h
index 4fc7933..1a5f4c6 100644
--- a/apertium/endian_double_util.h
+++ b/apertium/endian_double_util.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _ENDIANDOUBLEUTIL_
#define _ENDIANDOUBLEUTIL_
diff --git a/apertium/err_exception.h b/apertium/err_exception.h
new file mode 100644
index 0000000..cf3522f
--- /dev/null
+++ b/apertium/err_exception.h
@@ -0,0 +1,23 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef ERR_EXCEPTION_H
+#define ERR_EXCEPTION_H
+
+namespace Apertium {
+class err_Exception {}; // empty type: no members, no base class, carries no message
+}
+
+#endif // ERR_EXCEPTION_H
diff --git a/apertium/exception.h b/apertium/exception.h
new file mode 100644
index 0000000..0836bb2
--- /dev/null
+++ b/apertium/exception.h
@@ -0,0 +1,95 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EXCEPTION_APERTIUM_TAGGER_H
+#define EXCEPTION_APERTIUM_TAGGER_H
+
+#include "exception_type.h"
+
+#include <sstream>
+
+namespace Apertium {
+namespace Exception {
+// EXCEPTION(X) defines class X : ::Apertium::ExceptionType, forwarding a C string, std::string or std::stringstream message.
+#define EXCEPTION(EXCEPTION_TYPE) \
+ class EXCEPTION_TYPE : public ::Apertium::ExceptionType { \
+ public: \
+ EXCEPTION_TYPE(const char *const what_) : ExceptionType(what_) {} \
+ EXCEPTION_TYPE(const std::string &what_) : ExceptionType(what_) {} \
+ EXCEPTION_TYPE(const std::stringstream &what_) : ExceptionType(what_) {} \
+ ~EXCEPTION_TYPE() throw() {} \
+ };
+// Concrete exception classes follow, grouped into nested namespaces.
+namespace Analysis {
+EXCEPTION(TheMorphemes_empty)
+}
+
+namespace apertium_tagger {
+EXCEPTION(deserialise)
+EXCEPTION(fclose)
+EXCEPTION(fopen)
+EXCEPTION(ifstream_fail)
+EXCEPTION(ofstream_fail)
+EXCEPTION(optarg_eq_NULL)
+EXCEPTION(str_end_not_eq_NULL)
+EXCEPTION(wifstream_fail)
+EXCEPTION(wofstream_fail)
+EXCEPTION(ERANGE_)
+EXCEPTION(InvalidArgument)
+EXCEPTION(InvalidOption)
+EXCEPTION(UnexpectedFileArgumentCount)
+EXCEPTION(UnexpectedFlagOption)
+EXCEPTION(UnexpectedFunctionTypeOption)
+EXCEPTION(UnexpectedFunctionTypeTypeOption)
+}
+
+namespace Deserialiser {
+EXCEPTION(size_t_)
+EXCEPTION(not_Stream_good)
+EXCEPTION(wchar_t_)
+}
+
+namespace LexicalUnit {
+EXCEPTION(TheAnalyses_empty)
+}
+
+namespace Morpheme {
+EXCEPTION(TheLemma_empty)
+EXCEPTION(TheTags_empty)
+}
+
+namespace Optional {
+EXCEPTION(TheOptionalTypePointer_null)
+}
+
+namespace Serialiser {
+EXCEPTION(not_Stream_good)
+EXCEPTION(size_t_)
+EXCEPTION(wchar_t_)
+}
+
+namespace Tag {
+EXCEPTION(TheTags_empty)
+}
+
+namespace wchar_t_ExceptionType {
+EXCEPTION(EILSEQ_)
+}
+// The helper macro is undefined here, so it is not visible after this header.
+#undef EXCEPTION
+}
+}
+
+#endif // EXCEPTION_APERTIUM_TAGGER_H
diff --git a/apertium/exception_type.cc b/apertium/exception_type.cc
new file mode 100644
index 0000000..b02bb0e
--- /dev/null
+++ b/apertium/exception_type.cc
@@ -0,0 +1,32 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "exception_type.h"
+
+#include <sstream>
+#include <string>
+
+namespace Apertium {
+ExceptionType::ExceptionType(const char *const what_) : what_(what_) {} // copies the C string into the member
+
+ExceptionType::ExceptionType(const std::string &what_) : what_(what_) {} // copies the string into the member
+
+ExceptionType::ExceptionType(const std::stringstream &what_)
+ : what_(what_.str()) {} // stores the text accumulated in the stream
+// The destructor is declared pure virtual in exception_type.h; this is its (empty) definition.
+ExceptionType::~ExceptionType() throw() {}
+// Returns a pointer to the stored message's character data.
+const char *ExceptionType::what() const throw() { return what_.c_str(); }
+}
diff --git a/apertium/exception_type.h b/apertium/exception_type.h
new file mode 100644
index 0000000..6f16e01
--- /dev/null
+++ b/apertium/exception_type.h
@@ -0,0 +1,38 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EXCEPTION_TYPE_H
+#define EXCEPTION_TYPE_H
+
+#include "basic_exception_type.h"
+
+#include <sstream>
+#include <string>
+
+namespace Apertium {
+class ExceptionType : public basic_ExceptionType { // exception base that stores a message string
+public:
+ ExceptionType(const char *const what_); // message given as a C string
+ ExceptionType(const std::string &what_); // message given as a std::string
+ ExceptionType(const std::stringstream &what_); // message taken from the stream's contents
+ virtual ~ExceptionType() throw() = 0; // pure virtual, so this class cannot be instantiated directly
+ const char *what() const throw(); // accessor for the stored message
+
+protected:
+ const std::string what_; // the message; const, so it is fixed at construction
+};
+}
+
+#endif // EXCEPTION_TYPE_H
diff --git a/apertium/file_tagger.cc b/apertium/file_tagger.cc
new file mode 100644
index 0000000..7ee6a10
--- /dev/null
+++ b/apertium/file_tagger.cc
@@ -0,0 +1,42 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "file_tagger.h"
+
+#include <apertium/tsx_reader.h>
+
+#include <cstdio>
+
+namespace Apertium {
+FILE_Tagger::FILE_Tagger() : debug(false), show_sf(false), null_flush(false) {} // all three flags start off
+
+FILE_Tagger::~FILE_Tagger() {}
+// The three setters below each copy their argument into the matching flag member.
+void FILE_Tagger::set_debug(const bool &Debug) { debug = Debug; }
+
+void FILE_Tagger::set_show_sf(const bool &ShowSuperficial) {
+ show_sf = ShowSuperficial;
+}
+
+void FILE_Tagger::setNullFlush(const bool &NullFlush) {
+ null_flush = NullFlush;
+}
+// Reads the named file with a TSXReader and hands the resulting TaggerData to the TaggerData overload.
+void FILE_Tagger::deserialise(char *const TaggerSpecificationFilename) {
+ TSXReader TaggerSpecificationReader_;
+ TaggerSpecificationReader_.read(TaggerSpecificationFilename);
+ deserialise(TaggerSpecificationReader_.getTaggerData());
+}
+}
diff --git a/apertium/file_tagger.h b/apertium/file_tagger.h
new file mode 100644
index 0000000..75f0e9f
--- /dev/null
+++ b/apertium/file_tagger.h
@@ -0,0 +1,52 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef FILE_TAGGER_H
+#define FILE_TAGGER_H
+
+#include <apertium/tagger_data.h>
+
+#include <cstdio>
+#include <string>
+#include <vector>
+
+namespace Apertium {
+class FILE_Tagger { // abstract tagger interface over FILE* streams; most members are pure virtual
+public:
+ FILE_Tagger(); // sets debug, show_sf and null_flush to false (see file_tagger.cc)
+ virtual ~FILE_Tagger();
+ virtual void deserialise(FILE *Serialised_FILE_Tagger) = 0; // implemented by subclasses
+ void set_debug(const bool &Debug); // stores Debug in debug
+ void set_show_sf(const bool &ShowSuperficial); // stores ShowSuperficial in show_sf
+ void setNullFlush(const bool &NullFlush); // stores NullFlush in null_flush
+ virtual void tagger(FILE *Input, FILE *Output, const bool &First = false) = 0;
+ virtual std::vector<std::wstring> &getArrayTags() = 0;
+ virtual void train(FILE *Corpus, unsigned long Count) = 0;
+ virtual void serialise(FILE *Stream_) = 0;
+ void deserialise(char *const TaggerSpecificationFilename); // non-virtual: reads the file via TSXReader, then calls the TaggerData overload
+ virtual void read_dictionary(FILE *Dictionary) = 0;
+ virtual void init_probabilities_from_tagged_text_(FILE *TaggedCorpus,
+ FILE *Corpus) = 0;
+ virtual void init_probabilities_kupiec_(FILE *Corpus) = 0;
+
+protected:
+ virtual void deserialise(const TaggerData &Deserialised_FILE_Tagger) = 0; // target of the filename overload above
+ bool debug; // written by set_debug()
+ bool show_sf; // written by set_show_sf()
+ bool null_flush; // written by setNullFlush()
+};
+}
+
+#endif // FILE_TAGGER_H
diff --git a/apertium/format.dtd b/apertium/format.dtd
index dc7ed60..80b1f90 100644
--- a/apertium/format.dtd
+++ b/apertium/format.dtd
@@ -12,9 +12,7 @@
General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA.
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
DTD for the format specification files
Sergio Ortiz 2005.05.13
diff --git a/apertium/format.rnc b/apertium/format.rnc
new file mode 100644
index 0000000..ec654ba
--- /dev/null
+++ b/apertium/format.rnc
@@ -0,0 +1,111 @@
+# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+# DTD for the format specification files
+# Sergio Ortiz 2005.05.13
+
+format = element format { attlist.format, options, rules }
+attlist.format &= attribute name { text }
+# 'format' is the root element containing the whole format specification
+# file. The attribute 'name' specifies the name of the format
+options =
+ element options {
+ attlist.options,
+ largeblocks,
+ input,
+ output,
+ tag-name,
+ escape-chars,
+ space-chars,
+ case-sensitive
+ }
+attlist.options &= empty
+# General options of the format
+largeblocks = element largeblocks { attlist.largeblocks, empty }
+attlist.largeblocks &= attribute size { text }
+# The attribute size is used to define the maximal size in bytes of
+# inline format blocks
+input = element input { attlist.input, empty }
+attlist.input &= attribute zip-path { text }?
+attlist.input &= attribute encoding { text }
+# Reserved for future extensions
+output = element output { attlist.output, empty }
+attlist.output &= attribute zip-path { text }?
+attlist.output &= attribute encoding { text }
+# Reserved for future extensions
+tag-name = element tag-name { attlist.tag-name, empty }
+attlist.tag-name &= attribute regexp { text }
+# The attribute regexp defines (with a _flex_ regular expression) how
+# to take a tag name from a whole tag. '\'
+escape-chars = element escape-chars { attlist.escape-chars, empty }
+attlist.escape-chars &= attribute regexp { text }
+# The attribute regexp defines (with a _flex_ regular expression) the
+# set of characters to be escaped with a preceding backslash '\'
+space-chars = element space-chars { attlist.space-chars, empty }
+attlist.space-chars &= attribute regexp { text }
+# Define the space characters (in regexp) with a _flex_ regular
+# expression
+case-sensitive =
+ element case-sensitive { attlist.case-sensitive, empty }
+attlist.case-sensitive &= attribute value { "yes" | "no" }
+# The attribute 'value' is set to 'yes' if the case is relevant in the
+# specification of the format. Otherwise it is set to 'no'
+rules =
+ element rules { attlist.rules, (format-rule | replacement-rule)+ }
+attlist.rules &= empty
+# Group the rules of processing format and the rules of substitute
+# expressions by characters that are part of the text
+format-rule =
+ element format-rule {
+ attlist.format-rule,
+ (tag | (begin, end))
+ }
+attlist.format-rule &=
+ attribute type { "comment" | "empty" | "open" | "close" }?
+attlist.format-rule &= attribute eos { "yes" | "no" }?
+attlist.format-rule &= attribute priority { text }
+# Format rule parent element. It may include a 'tag' element or
+# a pair of elements 'begin', 'end'. In the first case, this element is
+# considered to be part of the format. In the second case, the begin and
+# the end elements are considered to enclose the format. The attribute
+# 'eos' (end of sentence) is set to 'yes' if that rule defines a dot in
+# the text being processed (it is 'no' by default). The attribute 'priority'
+# marks the order of precedence of the rule
+tag = element tag { attlist.tag, empty }
+attlist.tag &= attribute regexp { text }
+# Define an element that is part of the format by the pattern specified
+# as a value for the regexp attribute
+begin = element begin { attlist.begin, empty }
+attlist.begin &= attribute regexp { text }
+# The attribute 'regexp' is the regular expression that detects the
+# beginning delimiter of a block of format
+end = element end { attlist.end, empty }
+attlist.end &= attribute regexp { text }
+# The attribute 'regexp' is the regular expression that detects the
+# ending delimiter of a block of format
+replacement-rule =
+ element replacement-rule { attlist.replacement-rule, replace+ }
+attlist.replacement-rule &= attribute regexp { text }
+# Root element for a replacement rule. The attribute 'regexp' is the
+# general expression to detect the elements to replace
+replace = element replace { attlist.replace, empty }
+attlist.replace &= attribute source { text }
+attlist.replace &= attribute target { text }
+attlist.replace &= attribute prefer { "yes" | "no" }?
+start = format
+# Replacement rule. The 'source' is a string of one or more characters.
+# The 'target' MUST be a single character. The 'prefer' attribute, when
+# set to 'yes' defines the preferred reverse translation of the
+# replacement.
diff --git a/apertium/format.rng b/apertium/format.rng
new file mode 100644
index 0000000..7934076
--- /dev/null
+++ b/apertium/format.rng
@@ -0,0 +1,303 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+ DTD for the format specification files
+ Sergio Ortiz 2005.05.13
+-->
+<grammar xmlns="http://relaxng.org/ns/structure/1.0">
+ <define name="format">
+ <element name="format">
+ <ref name="attlist.format"/>
+ <ref name="options"/>
+ <ref name="rules"/>
+ </element>
+ </define>
+ <define name="attlist.format" combine="interleave">
+ <attribute name="name"/>
+ </define>
+ <!--
+ 'format' is the root element containing the whole format specification
+ file. The attribute 'name' specifies the name of the format
+ -->
+ <define name="options">
+ <element name="options">
+ <ref name="attlist.options"/>
+ <ref name="largeblocks"/>
+ <ref name="input"/>
+ <ref name="output"/>
+ <ref name="tag-name"/>
+ <ref name="escape-chars"/>
+ <ref name="space-chars"/>
+ <ref name="case-sensitive"/>
+ </element>
+ </define>
+ <define name="attlist.options" combine="interleave">
+ <empty/>
+ </define>
+ <!-- General options of the format -->
+ <define name="largeblocks">
+ <element name="largeblocks">
+ <ref name="attlist.largeblocks"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.largeblocks" combine="interleave">
+ <attribute name="size"/>
+ </define>
+ <!--
+ The attribute size is used to define the maximal size in bytes of
+ inline format blocks
+ -->
+ <define name="input">
+ <element name="input">
+ <ref name="attlist.input"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.input" combine="interleave">
+ <optional>
+ <attribute name="zip-path"/>
+ </optional>
+ </define>
+ <define name="attlist.input" combine="interleave">
+ <attribute name="encoding"/>
+ </define>
+ <!-- Reserved for future extensions -->
+ <define name="output">
+ <element name="output">
+ <ref name="attlist.output"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.output" combine="interleave">
+ <optional>
+ <attribute name="zip-path"/>
+ </optional>
+ </define>
+ <define name="attlist.output" combine="interleave">
+ <attribute name="encoding"/>
+ </define>
+ <!-- Reserved for future extensions -->
+ <define name="tag-name">
+ <element name="tag-name">
+ <ref name="attlist.tag-name"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.tag-name" combine="interleave">
+ <attribute name="regexp"/>
+ </define>
+ <!--
+ The attribute regexp defines (with a _flex_ regular expression) how
+ to take a tag name from a whole tag. '\'
+ -->
+ <define name="escape-chars">
+ <element name="escape-chars">
+ <ref name="attlist.escape-chars"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.escape-chars" combine="interleave">
+ <attribute name="regexp"/>
+ </define>
+ <!--
+ The attribute regexp defines (with a _flex_ regular expression) the
+ set of characters to be escaped with a preceding backslash '\'
+ -->
+ <define name="space-chars">
+ <element name="space-chars">
+ <ref name="attlist.space-chars"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.space-chars" combine="interleave">
+ <attribute name="regexp"/>
+ </define>
+ <!--
+ Define the space characters (in regexp) with a _flex_ regular
+ expression
+ -->
+ <define name="case-sensitive">
+ <element name="case-sensitive">
+ <ref name="attlist.case-sensitive"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.case-sensitive" combine="interleave">
+ <attribute name="value">
+ <choice>
+ <value>yes</value>
+ <value>no</value>
+ </choice>
+ </attribute>
+ </define>
+ <!--
+ The attribute 'value' is set to 'yes' if the case is relevant in the
+ specification of the format. Otherwise it is set to 'no'
+ -->
+ <define name="rules">
+ <element name="rules">
+ <ref name="attlist.rules"/>
+ <oneOrMore>
+ <choice>
+ <ref name="format-rule"/>
+ <ref name="replacement-rule"/>
+ </choice>
+ </oneOrMore>
+ </element>
+ </define>
+ <define name="attlist.rules" combine="interleave">
+ <empty/>
+ </define>
+ <!--
+ Group the rules of processing format and the rules of substitute
+ expressions by characters that are part of the text
+ -->
+ <define name="format-rule">
+ <element name="format-rule">
+ <ref name="attlist.format-rule"/>
+ <choice>
+ <ref name="tag"/>
+ <group>
+ <ref name="begin"/>
+ <ref name="end"/>
+ </group>
+ </choice>
+ </element>
+ </define>
+ <define name="attlist.format-rule" combine="interleave">
+ <optional>
+ <attribute name="type">
+ <choice>
+ <value>comment</value>
+ <value>empty</value>
+ <value>open</value>
+ <value>close</value>
+ </choice>
+ </attribute>
+ </optional>
+ </define>
+ <define name="attlist.format-rule" combine="interleave">
+ <optional>
+ <attribute name="eos">
+ <choice>
+ <value>yes</value>
+ <value>no</value>
+ </choice>
+ </attribute>
+ </optional>
+ </define>
+ <define name="attlist.format-rule" combine="interleave">
+ <attribute name="priority"/>
+ </define>
+ <!--
+ Format rule parent element. It may include a 'tag' element or
+ a pair of elements 'begin', 'end'. In the first case, this element is
+ considered to be part of the format. In the second case, the begin and
+ the end elements are considered to enclose the format. The attribute
+ 'eos' (end of sentence) is set to 'yes' if that rule defines a dot in
+ the text being processed (it is 'no' by default). The attribute 'priority'
+ marks the order of precedence of the rule
+ -->
+ <define name="tag">
+ <element name="tag">
+ <ref name="attlist.tag"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.tag" combine="interleave">
+ <attribute name="regexp"/>
+ </define>
+ <!--
+ Define an element that is part of the format by the pattern specified
+ as a value for the regexp attribute
+ -->
+ <define name="begin">
+ <element name="begin">
+ <ref name="attlist.begin"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.begin" combine="interleave">
+ <attribute name="regexp"/>
+ </define>
+ <!--
+ The attribute 'regexp' is the regular expression that detects the
+ beginning delimiter of a block of format
+ -->
+ <define name="end">
+ <element name="end">
+ <ref name="attlist.end"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.end" combine="interleave">
+ <attribute name="regexp"/>
+ </define>
+ <!--
+ The attribute 'regexp' is the regular expression that detects the
+ ending delimiter of a block of format
+ -->
+ <define name="replacement-rule">
+ <element name="replacement-rule">
+ <ref name="attlist.replacement-rule"/>
+ <oneOrMore>
+ <ref name="replace"/>
+ </oneOrMore>
+ </element>
+ </define>
+ <define name="attlist.replacement-rule" combine="interleave">
+ <attribute name="regexp"/>
+ </define>
+ <!--
+ Root element for a replacement rule. The attribute 'regexp' is the
+ general expression to detect the elements to replace
+ -->
+ <define name="replace">
+ <element name="replace">
+ <ref name="attlist.replace"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.replace" combine="interleave">
+ <attribute name="source"/>
+ </define>
+ <define name="attlist.replace" combine="interleave">
+ <attribute name="target"/>
+ </define>
+ <define name="attlist.replace" combine="interleave">
+ <optional>
+ <attribute name="prefer">
+ <choice>
+ <value>yes</value>
+ <value>no</value>
+ </choice>
+ </attribute>
+ </optional>
+ </define>
+ <start>
+ <choice>
+ <ref name="format"/>
+ </choice>
+ </start>
+</grammar>
+<!--
+ Replacement rule. The 'source' is a string of one or more characters.
+ The 'target' MUST be a single character. The 'prefer' attribute, when
+ set to 'yes' defines the preferred reverse translation of the
+ replacement.
+-->
diff --git a/apertium/hmm.cc b/apertium/hmm.cc
index 09a37da..5a6d1d6 100644
--- a/apertium/hmm.cc
+++ b/apertium/hmm.cc
@@ -13,9 +13,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/*
* First order hidden Markov model (HMM) implementation (source)
@@ -47,20 +45,54 @@
using namespace Apertium;
using namespace tagger_utils;
-HMM::HMM(TaggerDataHMM *tdhmm)
-{
- this->tdhmm = tdhmm;
+void HMM::deserialise(FILE *Serialised_FILE_Tagger) {
+ tdhmm.read(Serialised_FILE_Tagger); // load the tagger data from the stream
+ eos = (tdhmm.getTagIndex())[L"TAG_SENT"]; // refresh eos from the freshly loaded tag index
+}
+
+std::vector<std::wstring> &HMM::getArrayTags() {
+ return tdhmm.getArrayTags(); // forwards the reference returned by the tagger data
+}
+
+void HMM::serialise(FILE *Stream_) { tdhmm.write(Stream_); } // delegates to TaggerDataHMM::write
+
+void HMM::deserialise(const TaggerData &Deserialised_FILE_Tagger) {
+ tdhmm = TaggerDataHMM(Deserialised_FILE_Tagger); // replace the held data with one built from the argument
+ eos = (tdhmm.getTagIndex())[L"TAG_SENT"]; // refresh eos from the new tag index
+}
+
+void HMM::init_probabilities_from_tagged_text_(FILE *TaggedCorpus,
+ FILE *UntaggedCorpus) {
+ init_probabilities_from_tagged_text(TaggedCorpus, UntaggedCorpus); // the un-suffixed method does the estimation
+ apply_rules(); // then the forbid/enforce rules are applied
+}
+
+void HMM::init_probabilities_kupiec_(FILE *Corpus) {
+ init_probabilities_kupiec(Corpus); // the un-suffixed method does the estimation
+ apply_rules(); // then the forbid/enforce rules are applied
+}
+
+void HMM::train(FILE *Corpus, unsigned long Count) {
+  while (Count-- > 0) { // one training pass per requested iteration
+    std::fseek(Corpus, 0, SEEK_SET); // each pass starts at the beginning of the corpus
+    train(Corpus);
+  }
- debug=false;
- show_sf=false;
- null_flush = false;
- eos = (tdhmm->getTagIndex())[L"TAG_SENT"];
+  apply_rules(); // rules are applied once, after the last pass
}
-HMM::~HMM()
+HMM::HMM() {} // no tagger data yet; HMM::deserialise() loads it and sets eos
+// By-value constructor: `this->` is needed because the parameter has the same name as the member.
+HMM::HMM(TaggerDataHMM tdhmm)
{
+  this->tdhmm = tdhmm; // was `tdhmm = tdhmm;`, which assigned the parameter to itself
+  eos = (this->tdhmm.getTagIndex())[L"TAG_SENT"];
}
+HMM::HMM(TaggerDataHMM *tdhmm) : tdhmm(*tdhmm) { eos = (this->tdhmm.getTagIndex())[L"TAG_SENT"]; } // set eos as the other constructor does
+
+HMM::~HMM() {}
+
void
HMM::init()
{
@@ -72,18 +104,6 @@ HMM::set_eos(TTag t)
eos = t;
}
-void
-HMM::set_debug(bool d)
-{
- debug = d;
-}
-
-void
-HMM::set_show_sf(bool sf)
-{
- show_sf = sf;
-}
-
void
HMM::read_ambiguity_classes(FILE *in)
{
@@ -104,19 +124,19 @@ HMM::read_ambiguity_classes(FILE *in)
if(ambiguity_class.size() != 0)
{
- tdhmm->getOutput().add(ambiguity_class);
+ tdhmm.getOutput().add(ambiguity_class);
}
}
- tdhmm->setProbabilities(tdhmm->getTagIndex().size(), tdhmm->getOutput().size());
+ tdhmm.setProbabilities(tdhmm.getTagIndex().size(), tdhmm.getOutput().size());
}
void
HMM::write_ambiguity_classes(FILE *out)
{
- for(int i=0, limit = tdhmm->getOutput().size(); i != limit; i++)
+ for(int i=0, limit = tdhmm.getOutput().size(); i != limit; i++)
{
- set<TTag> const &ac = (tdhmm->getOutput())[i];
+ set<TTag> const &ac = (tdhmm.getOutput())[i];
Compression::multibyte_write(ac.size(), out);
for(set<TTag>::const_iterator it = ac.begin(), limit2 = ac.end();
it != limit2; it++)
@@ -129,20 +149,20 @@ HMM::write_ambiguity_classes(FILE *out)
void
HMM::read_probabilities(FILE *in)
{
- tdhmm->read(in);
+ tdhmm.read(in);
}
void
HMM::write_probabilities(FILE *out)
{
- tdhmm->write(out);
+ tdhmm.write(out);
}
void
HMM::init_probabilities_kupiec (FILE *is)
{
- int N = tdhmm->getN();
- int M = tdhmm->getM();
+ int N = tdhmm.getN();
+ int M = tdhmm.getM();
int i=0, j=0, k=0, k1=0, k2=0, nw=0;
#ifdef __GNUC__
double classes_ocurrences[M]; //M = Number of ambiguity classes
@@ -156,9 +176,9 @@ HMM::init_probabilities_kupiec (FILE *is)
vector <vector <double> > tags_pair_estimate(N, vector<double>(N, 0));
#endif
- Collection &output = tdhmm->getOutput();
+ Collection &output = tdhmm.getOutput();
- MorphoStream lexmorfo(is, true, tdhmm);
+ MorphoStream lexmorfo(is, true, &tdhmm);
TaggerWord *word=NULL;
@@ -182,16 +202,11 @@ HMM::init_probabilities_kupiec (FILE *is)
tags=word->get_tags();
if (tags.size()==0) { //This is an unknown word
- tags = tdhmm->getOpenClass();
+ tags = tdhmm.getOpenClass();
+ }
+ else {
+ require_ambiguity_class(tdhmm, tags, *word);
}
- else if (output.has_not(tags)) {
- wstring errors;
- errors = L"A new ambiguity class was found. I cannot continue.\n";
- errors+= L"Word '"+word->get_superficial_form()+L"' not found in the dictionary.\n";
- errors+= L"New ambiguity class: "+word->get_string_tags()+L"\n";
- errors+= L"Take a look at the dictionary and at the training corpus. Then, retrain.";
- fatal_error(errors);
- }
k2=output[tags];
@@ -246,9 +261,9 @@ HMM::init_probabilities_kupiec (FILE *is)
for(j=0; j<N; j++) {
if (sum>0)
- (tdhmm->getA())[i][j] = tags_pair_estimate[i][j]/sum;
+ (tdhmm.getA())[i][j] = tags_pair_estimate[i][j]/sum;
else {
- (tdhmm->getA())[i][j] = 0;
+ (tdhmm.getA())[i][j] = 0;
}
}
}
@@ -258,9 +273,9 @@ HMM::init_probabilities_kupiec (FILE *is)
for(k=0; k<M; k++) {
if (output[k].find(i)!=output[k].end()) {
if (tags_estimate[i]>0)
- (tdhmm->getB())[i][k] = (classes_ocurrences[k]/output[k].size())/tags_estimate[i];
+ (tdhmm.getB())[i][k] = (classes_ocurrences[k]/output[k].size())/tags_estimate[i];
else
- (tdhmm->getB())[i][k] = 0;
+ (tdhmm.getB())[i][k] = 0;
}
}
}
@@ -270,8 +285,8 @@ HMM::init_probabilities_kupiec (FILE *is)
void
HMM::init_probabilities_from_tagged_text(FILE *ftagged, FILE *funtagged) {
int i, j, k, nw=0;
- int N = tdhmm->getN();
- int M = tdhmm->getM();
+ int N = tdhmm.getN();
+ int M = tdhmm.getM();
#ifdef __GNUC__
double tags_pair[N][N];
double emission[N][M];
@@ -281,11 +296,11 @@ HMM::init_probabilities_from_tagged_text(FILE *ftagged, FILE *funtagged) {
#endif
- MorphoStream stream_tagged(ftagged, true, tdhmm);
- MorphoStream stream_untagged(funtagged, true, tdhmm);
+ MorphoStream stream_tagged(ftagged, true, &tdhmm);
+ MorphoStream stream_untagged(funtagged, true, &tdhmm);
TaggerWord *word_tagged=NULL, *word_untagged=NULL;
- Collection &output = tdhmm->getOutput();
+ Collection &output = tdhmm.getOutput();
set<TTag> tags;
@@ -344,17 +359,10 @@ HMM::init_probabilities_from_tagged_text(FILE *ftagged, FILE *funtagged) {
if (word_untagged->get_tags().size()==0) { // Unknown word
- tags = tdhmm->getOpenClass();
+ tags = tdhmm.getOpenClass();
}
- else if (output.has_not(word_untagged->get_tags())) { //We are training, there is no problem
- wstring errors;
- errors = L"A new ambiguity class was found. I cannot continue.\n";
- errors+= L"Word '"+word_untagged->get_superficial_form()+L"' not found in the dictionary.\n";
- errors+= L"New ambiguity class: "+word_untagged->get_string_tags()+L"\n";
- errors+= L"Take a look at the dictionary, then retrain.";
- fatal_error(errors);
- }
else {
+ require_ambiguity_class(tdhmm, word_untagged->get_tags(), *word_untagged);
tags = word_untagged->get_tags();
}
@@ -375,7 +383,7 @@ HMM::init_probabilities_from_tagged_text(FILE *ftagged, FILE *funtagged) {
for(j=0; j<N; j++)
sum += tags_pair[i][j]+1.0;
for(j=0; j<N; j++)
- (tdhmm->getA())[i][j] = (tags_pair[i][j]+1.0)/sum;
+ (tdhmm.getA())[i][j] = (tags_pair[i][j]+1.0)/sum;
}
@@ -391,7 +399,7 @@ HMM::init_probabilities_from_tagged_text(FILE *ftagged, FILE *funtagged) {
}
for(k=0; k<M; k++) {
if (output[k].find(i)!=output[k].end())
- (tdhmm->getB())[i][k] = (emission[i][k]+(((double)1.0)/((double)nclasses_appear)))/(times_appear+((double)1.0));
+ (tdhmm.getB())[i][k] = (emission[i][k]+(((double)1.0)/((double)nclasses_appear)))/(times_appear+((double)1.0));
}
}
@@ -401,14 +409,14 @@ HMM::init_probabilities_from_tagged_text(FILE *ftagged, FILE *funtagged) {
void
HMM::apply_rules()
{
- vector<TForbidRule> &forbid_rules = tdhmm->getForbidRules();
- vector<TEnforceAfterRule> &enforce_rules = tdhmm->getEnforceRules();
- int N = tdhmm->getN();
+ vector<TForbidRule> &forbid_rules = tdhmm.getForbidRules();
+ vector<TEnforceAfterRule> &enforce_rules = tdhmm.getEnforceRules();
+ int N = tdhmm.getN();
int i, j, j2;
bool found;
for(i=0; i<(int) forbid_rules.size(); i++) {
- (tdhmm->getA())[forbid_rules[i].tagi][forbid_rules[i].tagj] = ZERO;
+ (tdhmm.getA())[forbid_rules[i].tagi][forbid_rules[i].tagj] = ZERO;
}
for(i=0; i<(int) enforce_rules.size(); i++) {
@@ -421,7 +429,7 @@ HMM::apply_rules()
}
}
if (!found)
- (tdhmm->getA())[enforce_rules[i].tagi][j] = ZERO;
+ (tdhmm.getA())[enforce_rules[i].tagi][j] = ZERO;
}
}
@@ -429,67 +437,30 @@ HMM::apply_rules()
for(i=0; i<N; i++) {
double sum=0;
for(j=0; j<N; j++)
- sum += (tdhmm->getA())[i][j];
+ sum += (tdhmm.getA())[i][j];
for(j=0; j<N; j++) {
if (sum>0)
- (tdhmm->getA())[i][j] = (tdhmm->getA())[i][j]/sum;
+ (tdhmm.getA())[i][j] = (tdhmm.getA())[i][j]/sum;
else
- (tdhmm->getA())[i][j] = 0;
+ (tdhmm.getA())[i][j] = 0;
}
}
}
void
-HMM::read_dictionary (FILE *fdic) {
- int i, k, nw=0;
- TaggerWord *word=NULL;
- set <TTag> tags;
- Collection &output = tdhmm->getOutput();
-
- MorphoStream morpho_stream(fdic, true, tdhmm);
-
- // In the input dictionary there must be all punctuation marks, including the end-of-sentece mark
-
- word = morpho_stream.get_next_word();
-
- while (word) {
- if (++nw%10000==0) wcerr<<L'.'<<flush;
-
- tags = word->get_tags();
-
- if (tags.size()>0)
- k = output[tags];
+HMM::read_dictionary(FILE *fdic) { // Reads the dictionary from fdic into tdhmm, then sizes the model from it.
+ tagger_utils::read_dictionary(fdic, tdhmm); // dictionary reading is delegated to the tagger_utils helper
+ int N = (tdhmm.getTagIndex()).size(); // N: tag-index size, reported below as "states"
+ int M = (tdhmm.getOutput()).size(); // M: size of the output collection, reported below as "ambiguity classes"
+ wcerr << N << L" states and " << M <<L" ambiguity classes\n";
- delete word;
- word = morpho_stream.get_next_word();
- }
- wcerr<<L"\n";
-
- // OPEN AMBIGUITY CLASS
- // It contains all tags that are not closed.
- // Unknown words are assigned the open ambiguity class
- k=output[tdhmm->getOpenClass()];
-
- int N = (tdhmm->getTagIndex()).size();
-
- // Create ambiguity class holding one single tag for each tag.
- // If not created yet
- for(i = 0; i != N; i++) {
- set<TTag> amb_class;
- amb_class.insert(i);
- k=output[amb_class];
- }
-
- int M = output.size();
-
- wcerr<< N <<L" states and "<< M <<L" ambiguity classes\n";
- tdhmm->setProbabilities(N, M);
+ tdhmm.setProbabilities(N, M); // hands both sizes to the model data; NOTE(review): presumably (re)sizes the A and B matrices - confirm
}
void
HMM::filter_ambiguity_classes(FILE *in, FILE *out) {
set<set<TTag> > ambiguity_classes;
- MorphoStream morpho_stream(in, true, tdhmm);
+ MorphoStream morpho_stream(in, true, &tdhmm);
TaggerWord *word = morpho_stream.get_next_word();
@@ -520,13 +491,13 @@ HMM::train (FILE *ftxt) {
map < int, map <int, double> >::iterator it;
double prob, loli;
vector < set<TTag> > pending;
- Collection &output = tdhmm->getOutput();
+ Collection &output = tdhmm.getOutput();
int ndesconocidas=0;
// alpha => forward probabilities
// beta => backward probabilities
- MorphoStream morpho_stream(ftxt, true, tdhmm);
+ MorphoStream morpho_stream(ftxt, true, &tdhmm);
loli = 0;
tag = eos;
@@ -553,18 +524,11 @@ HMM::train (FILE *ftxt) {
tags = word->get_tags();
if (tags.size()==0) { // This is an unknown word
- tags = tdhmm->getOpenClass();
+ tags = tdhmm.getOpenClass();
ndesconocidas++;
}
- if (output.has_not(tags)) {
- wstring errors;
- errors = L"A new ambiguity class was found. I cannot continue.\n";
- errors+= L"Word '"+word->get_superficial_form()+L"' not found in the dictionary.\n";
- errors+= L"New ambiguity class: "+word->get_string_tags()+L"\n";
- errors+= L"Take a look at the dictionary, then retrain.";
- fatal_error(errors);
- }
+ require_ambiguity_class(tdhmm, tags, *word);
k = output[tags];
len = pending.size();
@@ -579,7 +543,7 @@ HMM::train (FILE *ftxt) {
//cerr<<"alpha["<<len-1<<"]["<<j<<"]="<<alpha[len-1][j]<<"\n";
//cerr<<"a["<<j<<"]["<<i<<"]="<<a[j][i]<<"\n";
//cerr<<"b["<<i<<"]["<<k<<"]="<<b[i][k]<<"\n";
- alpha[len][i] += alpha[len-1][j]*(tdhmm->getA())[j][i]*(tdhmm->getB())[i][k];
+ alpha[len][i] += alpha[len-1][j]*(tdhmm.getA())[j][i]*(tdhmm.getB())[i][k];
}
if (alpha[len][i]==0)
alpha[len][i]=DBL_MIN;
@@ -608,8 +572,8 @@ HMM::train (FILE *ftxt) {
i=*itag;
for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) {
j = *jtag;
- beta[1-t%2][j] += (tdhmm->getA())[j][i]*(tdhmm->getB())[i][k]*beta[t%2][i];
- xsi[j][i] += alpha[len-t-1][j]*(tdhmm->getA())[j][i]*(tdhmm->getB())[i][k]*beta[t%2][i]/prob;
+ beta[1-t%2][j] += (tdhmm.getA())[j][i]*(tdhmm.getB())[i][k]*beta[t%2][i];
+ xsi[j][i] += alpha[len-t-1][j]*(tdhmm.getA())[j][i]*(tdhmm.getB())[i][k]*beta[t%2][i]/prob;
}
double previous_value = gamma[i];
@@ -646,19 +610,19 @@ HMM::train (FILE *ftxt) {
word = morpho_stream.get_next_word();
}
- if ((pending.size()>1) || ((tag!=eos)&&(tag != (tdhmm->getTagIndex())[L"TAG_kEOF"])))
+ if ((pending.size()>1) || ((tag!=eos)&&(tag != (tdhmm.getTagIndex())[L"TAG_kEOF"])))
wcerr<<L"Warning: Thee las tag is not the end-of-sentence-tag\n";
- int N = tdhmm->getN();
- int M = tdhmm->getM();
+ int N = tdhmm.getN();
+ int M = tdhmm.getM();
//Clean previous values
for(i=0; i<N; i++) {
for(j=0; j<N; j++)
- (tdhmm->getA())[i][j]=ZERO;
+ (tdhmm.getA())[i][j]=ZERO;
for(k=0; k<M; k++)
- (tdhmm->getB())[i][k]=ZERO;
+ (tdhmm.getB())[i][k]=ZERO;
}
// new parameters
@@ -672,20 +636,20 @@ HMM::train (FILE *ftxt) {
gamma[i]=DBL_MIN;
}
- (tdhmm->getA())[i][j] = xsi[i][j]/gamma[i];
+ (tdhmm.getA())[i][j] = xsi[i][j]/gamma[i];
- if (isnan((tdhmm->getA())[i][j])) {
+ if (isnan((tdhmm.getA())[i][j])) {
wcerr<<L"NAN\n";
- wcerr <<L"Error: BW - NAN(1) a["<<i<<L"]["<<j<<L"]="<<(tdhmm->getA())[i][j]<<L"\txsi["<<i<<L"]["<<j<<L"]="<<xsi[i][j]<<L"\tgamma["<<i<<L"]="<<gamma[i]<<L"\n";
+ wcerr <<L"Error: BW - NAN(1) a["<<i<<L"]["<<j<<L"]="<<(tdhmm.getA())[i][j]<<L"\txsi["<<i<<L"]["<<j<<L"]="<<xsi[i][j]<<L"\tgamma["<<i<<L"]="<<gamma[i]<<L"\n";
exit(1);
}
- if (isinf((tdhmm->getA())[i][j])) {
+ if (isinf((tdhmm.getA())[i][j])) {
wcerr<<L"INF\n";
- wcerr <<L"Error: BW - INF(1) a["<<i<<L"]["<<j<<L"]="<<(tdhmm->getA())[i][j]<<L"\txsi["<<i<<L"]["<<j<<L"]="<<xsi[i][j]<<L"\tgamma["<<i<<L"]="<<gamma[i]<<L"\n";
+ wcerr <<L"Error: BW - INF(1) a["<<i<<L"]["<<j<<L"]="<<(tdhmm.getA())[i][j]<<L"\txsi["<<i<<L"]["<<j<<L"]="<<xsi[i][j]<<L"\tgamma["<<i<<L"]="<<gamma[i]<<L"\n";
exit(1);
}
- if ((tdhmm->getA())[i][j]==0) {
- //cerr <<"Error: BW - ZERO(1) a["<<i<<"]["<<j<<"]="<<(tdhmm->getA())[i][j]<<"\txsi["<<i<<"]["<<j<<"]="<<xsi[i][j]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
+ if ((tdhmm.getA())[i][j]==0) {
+ //cerr <<"Error: BW - ZERO(1) a["<<i<<"]["<<j<<"]="<<(tdhmm.getA())[i][j]<<"\txsi["<<i<<"]["<<j<<"]="<<xsi[i][j]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
// exit(1);
}
}
@@ -697,18 +661,18 @@ HMM::train (FILE *ftxt) {
for (kt=phi[i].begin(); kt!=phi[i].end(); kt++) {
k = kt->first;
if (phi[i][k]>0) {
- (tdhmm->getB())[i][k] = phi[i][k]/gamma[i];
+ (tdhmm.getB())[i][k] = phi[i][k]/gamma[i];
- if (isnan((tdhmm->getB())[i][k])) {
- wcerr<<L"Error: BW - NAN(2) b["<<i<<L"]["<<k<<L"]="<<(tdhmm->getB())[i][k]<<L"\tphi["<<i<<L"]["<<k<<L"]="<<phi[i][k]<<L"\tgamma["<<i<<L"]="<<gamma[i]<<L"\n";
+ if (isnan((tdhmm.getB())[i][k])) {
+ wcerr<<L"Error: BW - NAN(2) b["<<i<<L"]["<<k<<L"]="<<(tdhmm.getB())[i][k]<<L"\tphi["<<i<<L"]["<<k<<L"]="<<phi[i][k]<<L"\tgamma["<<i<<L"]="<<gamma[i]<<L"\n";
exit(1);
}
- if (isinf((tdhmm->getB())[i][k])) {
- wcerr<<L"Error: BW - INF(2) b["<<i<<L"]["<<k<<L"]="<<(tdhmm->getB())[i][k]<<L"\tphi["<<i<<L"]["<<k<<L"]="<<phi[i][k]<<L"\tgamma["<<i<<L"]="<<gamma[i]<<L"\n";
+ if (isinf((tdhmm.getB())[i][k])) {
+ wcerr<<L"Error: BW - INF(2) b["<<i<<L"]["<<k<<L"]="<<(tdhmm.getB())[i][k]<<L"\tphi["<<i<<L"]["<<k<<L"]="<<phi[i][k]<<L"\tgamma["<<i<<L"]="<<gamma[i]<<L"\n";
exit(1);
}
- if ((tdhmm->getB())[i][k]==0) {
- //cerr <<"Error: BW - ZERO(2) b["<<i<<"]["<<k<<"]="<<(tdhmm->getB())[i][k]<<"\tphi["<<i<<"]["<<k<<"]="<<phi[i][k]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
+ if ((tdhmm.getB())[i][k]==0) {
+ //cerr <<"Error: BW - ZERO(2) b["<<i<<"]["<<k<<"]="<<(tdhmm.getB())[i][k]<<"\tphi["<<i<<"]["<<k<<"]="<<phi[i][k]<<"\tgamma["<<i<<"]="<<gamma[i]<<"\n";
// exit(1);
}
}
@@ -720,20 +684,20 @@ HMM::train (FILE *ftxt) {
for(i=0; i<N; i++) {
double sum=0;
for(j=0; j<N; j++)
- sum+=(tdhmm->getA())[i][j];
+ sum+=(tdhmm.getA())[i][j];
for(j=0; j<N; j++)
- (tdhmm->getA())[i][j]=(tdhmm->getA())[i][j]/sum;
+ (tdhmm.getA())[i][j]=(tdhmm.getA())[i][j]/sum;
}
for(i=0; i<N; i++) {
double sum=0;
for(k=0; k<M; k++) {
if(output[k].find(i)!=output[k].end())
- sum+=(tdhmm->getB())[i][k];
+ sum+=(tdhmm.getB())[i][k];
}
for(k=0; k<M; k++) {
if(output[k].find(i)!=output[k].end())
- (tdhmm->getB())[i][k]=(tdhmm->getB())[i][k]/sum;
+ (tdhmm.getB())[i][k]=(tdhmm.getB())[i][k]/sum;
}
}
@@ -741,7 +705,7 @@ HMM::train (FILE *ftxt) {
}
void
-HMM::tagger(FILE *in, FILE *out, bool show_all_good_first) {
+HMM::tagger(FILE *Input, FILE *Output, const bool &First) {
int i, j, k, nw;
TaggerWord *word=NULL;
TTag tag;
@@ -750,7 +714,7 @@ HMM::tagger(FILE *in, FILE *out, bool show_all_good_first) {
set <TTag>::iterator itag, jtag;
double prob, loli, x;
- int N = tdhmm->getN();
+ int N = tdhmm.getN();
#ifdef __GNUC__
double alpha[2][N];
vector<TTag> best[2][N];
@@ -762,10 +726,10 @@ HMM::tagger(FILE *in, FILE *out, bool show_all_good_first) {
vector <TaggerWord> wpend;
int nwpend;
- MorphoStream morpho_stream(in, debug, tdhmm);
+ MorphoStream morpho_stream(Input, debug, &tdhmm);
morpho_stream.setNullFlush(null_flush);
- Collection &output = tdhmm->getOutput();
+ Collection &output = tdhmm.getOutput();
loli = nw = 0;
@@ -784,19 +748,9 @@ HMM::tagger(FILE *in, FILE *out, bool show_all_good_first) {
tags = word->get_tags();
if (tags.size()==0) // This is an unknown word
- tags = tdhmm->getOpenClass();
+ tags = tdhmm.getOpenClass();
- if (output.has_not(tags)) { // Encontrada una clase de ambigüedad desconocida hasta el momento
- if (debug) {
- wstring errors;
- errors = L"A new ambiguity class was found. \n";
- errors+= L"Retraining the tagger is necessary so as to take it into account.\n";
- errors+= L"Word '"+word->get_superficial_form()+L"'.\n";
- errors+= L"New ambiguity class: "+word->get_string_tags()+L"\n";
- wcerr<<L"Error: "<<errors;
- }
- tags = find_similar_ambiguity_class(tags);
- }
+ tags = require_similar_ambiguity_class(tdhmm, tags, *word, debug);
k = output[tags]; //Ambiguity class the word belongs to
@@ -813,7 +767,7 @@ HMM::tagger(FILE *in, FILE *out, bool show_all_good_first) {
i=*itag;
for (jtag=pretags.begin(); jtag!=pretags.end(); jtag++) { //For all tags from the previous word
j=*jtag;
- x = alpha[1-nwpend%2][j]*(tdhmm->getA())[j][i]*(tdhmm->getB())[i][k];
+ x = alpha[1-nwpend%2][j]*(tdhmm.getA())[j][i]*(tdhmm.getB())[i][k];
if (alpha[nwpend%2][i]<=x) {
if (nwpend>1)
best[nwpend%2][i] = best[1-nwpend%2][j];
@@ -836,14 +790,14 @@ HMM::tagger(FILE *in, FILE *out, bool show_all_good_first) {
wcerr<<L"Problem with word '"<<word->get_superficial_form()<<L"' "<<word->get_string_tags()<<L"\n";
}
for (unsigned t=0; t<best[nwpend%2][tag].size(); t++) {
- if (show_all_good_first) {
- wstring const &micad = wpend[t].get_all_chosen_tag_first(best[nwpend%2][tag][t], (tdhmm->getTagIndex())[L"TAG_kEOF"]);
- fputws_unlocked(micad.c_str(), out);
+ if (First) {
+ wstring const &micad = wpend[t].get_all_chosen_tag_first(best[nwpend%2][tag][t], (tdhmm.getTagIndex())[L"TAG_kEOF"]);
+ fputws_unlocked(micad.c_str(), Output);
} else {
- // print out
+ // print Output
wpend[t].set_show_sf(show_sf);
- wstring const &micad = wpend[t].get_lexical_form(best[nwpend%2][tag][t], (tdhmm->getTagIndex())[L"TAG_kEOF"]);
- fputws_unlocked(micad.c_str(), out);
+ wstring const &micad = wpend[t].get_lexical_form(best[nwpend%2][tag][t], (tdhmm.getTagIndex())[L"TAG_kEOF"]);
+ fputws_unlocked(micad.c_str(), Output);
}
}
@@ -858,13 +812,13 @@ HMM::tagger(FILE *in, FILE *out, bool show_all_good_first) {
{
if(null_flush)
{
- fputwc_unlocked(L'\0', out);
+ fputwc_unlocked(L'\0', Output);
tags.clear();
tags.insert(eos);
alpha[0][eos] = 1;
}
- fflush(out);
+ fflush(Output);
morpho_stream.setEndOfFile(false);
}
word = morpho_stream.get_next_word();
@@ -884,9 +838,9 @@ HMM::print_A() {
int i,j;
cout<<"TRANSITION MATRIX (A)\n------------------------------\n";
- for(i=0; i != tdhmm->getN(); i++)
- for(j=0; j != tdhmm->getN(); j++) {
- cout<<"A["<<i<<"]["<<j<<"] = "<<(tdhmm->getA())[i][j]<<"\n";
+ for(i=0; i != tdhmm.getN(); i++)
+ for(j=0; j != tdhmm.getN(); j++) {
+ cout<<"A["<<i<<"]["<<j<<"] = "<<(tdhmm.getA())[i][j]<<"\n";
}
}
@@ -895,11 +849,11 @@ HMM::print_B() {
int i,k;
cout<<"EMISSION MATRIX (B)\n-------------------------------\n";
- for(i=0; i != tdhmm->getN(); i++)
- for(k=0; k != tdhmm->getM(); k++) {
- Collection &output = tdhmm->getOutput();
+ for(i=0; i != tdhmm.getN(); i++)
+ for(k=0; k != tdhmm.getM(); k++) {
+ Collection &output = tdhmm.getOutput();
if(output[k].find(i)!=output[k].end())
- cout<<"B["<<i<<"]["<<k<<"] = "<<(tdhmm->getB())[i][k]<<"\n";
+ cout<<"B["<<i<<"]["<<k<<"] = "<<(tdhmm.getB())[i][k]<<"\n";
}
}
@@ -907,8 +861,8 @@ void HMM::print_ambiguity_classes() {
set<TTag> ambiguity_class;
set<TTag>::iterator itag;
cout<<"AMBIGUITY CLASSES\n-------------------------------\n";
- for(int i=0; i != tdhmm->getM(); i++) {
- ambiguity_class = (tdhmm->getOutput())[i];
+ for(int i=0; i != tdhmm.getM(); i++) {
+ ambiguity_class = (tdhmm.getOutput())[i];
cout <<i<<": ";
for (itag=ambiguity_class.begin(); itag!=ambiguity_class.end(); itag++) {
cout << *itag <<" ";
@@ -916,35 +870,3 @@ void HMM::print_ambiguity_classes() {
cout << "\n";
}
}
-
-set<TTag>
-HMM::find_similar_ambiguity_class(set<TTag> c) {
- int size_ret = -1;
- set<TTag> ret=tdhmm->getOpenClass(); //Se devolver� si no encontramos ninguna clase mejor
- bool skeep_class;
- Collection &output = tdhmm->getOutput();
-
- for(int k=0; k<output.size(); k++) {
- if ((((int)output[k].size())>((int)size_ret)) && (((int)output[k].size())<((int)c.size()))) {
- skeep_class=false;
- // Test if output[k] is a subset of class
- for(set<TTag>::const_iterator it=output[k].begin(); it!=output[k].end(); it++) {
- if (c.find(*it)==c.end()) {
- skeep_class=true; //output[k] is not a subset of class
- break;
- }
- }
- if (!skeep_class) {
- size_ret = output[k].size();
- ret = output[k];
- }
- }
- }
- return ret;
-}
-
-void
-HMM::setNullFlush(bool nf)
-{
- null_flush = nf;
-}
diff --git a/apertium/hmm.h b/apertium/hmm.h
index cc139ce..4a394f9 100644
--- a/apertium/hmm.h
+++ b/apertium/hmm.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/**
* First order hidden Markov model (HMM) implementation (header)
@@ -25,6 +23,8 @@
#ifndef __HMM_H
#define __HMM_H
+#include "file_tagger.h"
+
#include <cstdio>
#include <fstream>
#include <math.h>
@@ -49,13 +49,10 @@ using namespace std;
/** HMM
* first-order hidden Markov Model
*/
-class HMM {
+class HMM : public Apertium::FILE_Tagger {
private:
- TaggerDataHMM *tdhmm;
+ TaggerDataHMM tdhmm;
TTag eos; // end-of-sentence tag
- bool debug; //If true, print error messages when tagging input text
- bool show_sf; //If true, print superficial forms when tagging input text
- bool null_flush; // If true, flush on '\0'
/** It allocs memory for the transition (a) and the emission (b) matrices.
* Before calling this method the number of ambiguity classes must be known.
@@ -63,21 +60,21 @@ private:
* @see: read_ambiguity_classes, read_dictionary
*/
void init();
-
- /** This method returns a known ambiguity class that is a subset of
- * the one received as a parameter. This is useful when a new
- * ambiguity class is found because of changes in the morphological
- * dictionary used by the MT system.
- * @param c set of tags (ambiguity class)
- * @return a known ambiguity class
- */
- set<TTag> find_similar_ambiguity_class(set<TTag> c);
-
public:
+ void deserialise(FILE *Serialised_FILE_Tagger);
+ std::vector<std::wstring> &getArrayTags();
+ void train(FILE *Corpus, unsigned long Count);
+ void serialise(FILE *Stream_);
+ void deserialise(const TaggerData &Deserialised_FILE_Tagger);
+ void init_probabilities_from_tagged_text_(FILE *TaggedCorpus,
+ FILE *UntaggedCorpus);
+ void init_probabilities_kupiec_(FILE *Corpus);
+ HMM();
+ HMM(TaggerDataHMM *tdhmm);
/** Constructor
*/
- HMM(TaggerDataHMM *tdhmm);
+ HMM(TaggerDataHMM tdhmm);
/** Destructor
*/
@@ -87,21 +84,6 @@ public:
* @param t the end-of-sentence tag
*/
void set_eos(TTag t);
-
- /** Used to set the debug flag
- *
- */
- void set_debug(bool d);
-
- /** Used to set the show superficial forms flag
- *
- */
- void set_show_sf(bool sf);
-
- /**
- * Used to set the null_flush flag
- */
- void setNullFlush(bool nf);
/** It reads the ambiguity classes from the stream received as
* input
@@ -162,9 +144,8 @@ public:
* @param in the input stream with the untagged text to tag
* @param out the output stream with the tagged text
*/
- void tagger (FILE *in, FILE *out, bool show_all_good_first=false);
+ void tagger(FILE *Input, FILE *Output, const bool &First = false);
-
/** Prints the A matrix.
*/
void print_A();
diff --git a/apertium/i.cc b/apertium/i.cc
new file mode 100644
index 0000000..d0e909e
--- /dev/null
+++ b/apertium/i.cc
@@ -0,0 +1,50 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "i.h"
+
+#include "analysis.h"
+#include "exception.h"
+#include "morpheme.h"
+
+namespace Apertium { // Definitions for class i: comparison operators and constructors.
+bool operator==(const i &a_, const i &b_) { return a_.TheTags == b_.TheTags; }
+// Like the equality above, the ordering below looks only at the TheTags vectors.
+bool operator<(const i &a_, const i &b_) { return a_.TheTags < b_.TheTags; }
+// Default constructor: TheTags is left default-constructed.
+i::i() {}
+// Builds an i from the tags of the first Morpheme in Analysis_; throws if Analysis_ has no Morpheme.
+i::i(const Analysis &Analysis_) : TheTags() {
+ if (Analysis_.TheMorphemes.empty())
+ throw Exception::Analysis::TheMorphemes_empty("can't convert const "
+ "Analysis & comprising empty "
+ "Morpheme std::vector to i");
+ // Only the first Morpheme is inspected; it must carry at least one Tag, otherwise throw.
+ if (Analysis_.TheMorphemes.front().TheTags.empty())
+ throw Exception::Morpheme::TheTags_empty("can't convert const Analysis & "
+ "comprising Morpheme comprising "
+ "empty Tag std::vector to i");
+ // Both checks passed: copy the first Morpheme's tag sequence.
+ TheTags = Analysis_.TheMorphemes.front().TheTags;
+}
+// Builds an i from Morpheme_'s own tags; throws Exception::Morpheme::TheTags_empty if it has none.
+i::i(const Morpheme &Morpheme_) : TheTags() {
+ if (Morpheme_.TheTags.empty())
+ throw Exception::Morpheme::TheTags_empty(
+ "can't convert const Morpheme & comprising empty Tag std::vector to i");
+ // Non-empty tag sequence: copy it.
+ TheTags = Morpheme_.TheTags;
+}
+}
diff --git a/apertium/i.h b/apertium/i.h
new file mode 100644
index 0000000..67b6dac
--- /dev/null
+++ b/apertium/i.h
@@ -0,0 +1,38 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef I_H
+#define I_H
+
+#include "analysis.h"
+#include "morpheme.h"
+#include "tag.h"
+
+#include <vector>
+
+namespace Apertium {
+class i { // Holds a sequence of Tag values (TheTags); instances compare and order by that sequence.
+ friend bool operator==(const i &a_, const i &b_); // defined in i.cc: equality of the two TheTags vectors
+ friend bool operator<(const i &a_, const i &b_); // defined in i.cc: operator< of the two TheTags vectors
+ // The constructors below are not marked explicit, so they also act as implicit conversions.
+public:
+ i(); // leaves TheTags default-constructed
+ i(const Analysis &Analysis_); // takes the tags of Analysis_'s first Morpheme; throws if there is none or it has no tags
+ i(const Morpheme &Morpheme_); // takes Morpheme_'s tags; throws if they are empty
+ std::vector<Tag> TheTags; // the stored tag sequence (public data member)
+};
+}
+
+#endif // I_H
diff --git a/apertium/interchunk.cc b/apertium/interchunk.cc
index 7147270..039829a 100644
--- a/apertium/interchunk.cc
+++ b/apertium/interchunk.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/interchunk.h>
#include <apertium/trx_reader.h>
@@ -34,18 +32,11 @@ using namespace Apertium;
using namespace std;
void
-Interchunk::copy(Interchunk const &o)
-{
-}
-
-void
Interchunk::destroy()
{
- if(me)
- {
- delete me;
- me = NULL;
- }
+ delete me;
+ me = NULL;
+
if(doc)
{
xmlFreeDoc(doc);
@@ -53,7 +44,15 @@ Interchunk::destroy()
}
}
-Interchunk::Interchunk()
+Interchunk::Interchunk() :
+word(0),
+blank(0),
+lword(0),
+lblank(0),
+output(0),
+any_char(0),
+any_tag(0),
+nwords(0)
{
me = NULL;
doc = NULL;
@@ -71,22 +70,6 @@ Interchunk::~Interchunk()
destroy();
}
-Interchunk::Interchunk(Interchunk const &o)
-{
- copy(o);
-}
-
-Interchunk &
-Interchunk::operator =(Interchunk const &o)
-{
- if(this != &o)
- {
- destroy();
- copy(o);
- }
- return *this;
-}
-
void
Interchunk::readData(FILE *in)
{
@@ -241,6 +224,11 @@ Interchunk::checkIndex(xmlNode *element, int index, int limit)
string
Interchunk::evalString(xmlNode *element)
{
+ if (element == 0)
+ {
+ throw "Interchunk::evalString() was passed a NULL element";
+ }
+
map<xmlNode *, TransferInstr>::iterator it;
it = evalStringCache.find(element);
if(it != evalStringCache.end())
@@ -630,6 +618,8 @@ Interchunk::processCallMacro(xmlNode *localroot)
}
}
+ // ToDo: Is it at all valid if npar <= 0 ?
+
InterchunkWord **myword = NULL;
if(npar > 0)
{
@@ -644,7 +634,7 @@ Interchunk::processCallMacro(xmlNode *localroot)
int idx = 0;
int lastpos = 0;
- for(xmlNode *i = localroot->children; i != NULL; i = i->next)
+ for(xmlNode *i = localroot->children; npar && i != NULL; i = i->next)
{
if(i->type == XML_ELEMENT_NODE)
{
@@ -675,14 +665,8 @@ Interchunk::processCallMacro(xmlNode *localroot)
swap(myblank, blank);
swap(npar, lword);
- if(myword)
- {
- delete[] myword;
- }
- if(myblank)
- {
- delete[] myblank;
- }
+ delete[] myword;
+ delete[] myblank;
}
void
diff --git a/apertium/interchunk.dtd b/apertium/interchunk.dtd
index 70083ac..9c9721b 100644
--- a/apertium/interchunk.dtd
+++ b/apertium/interchunk.dtd
@@ -13,9 +13,7 @@
General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA.
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
Draft of DTD for the structural transfer rule files
diff --git a/apertium/interchunk.h b/apertium/interchunk.h
index eea20ed..aa3f839 100644
--- a/apertium/interchunk.h
+++ b/apertium/interchunk.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _INTERCHUNK_
#define _INTERCHUNK_
@@ -76,7 +74,6 @@ private:
bool trace;
string emptyblank;
- void copy(Interchunk const &o);
void destroy();
void readData(FILE *input);
void readInterchunk(string const &input);
@@ -124,15 +121,12 @@ private:
public:
Interchunk();
~Interchunk();
- Interchunk(Interchunk const &o);
- Interchunk & operator =(Interchunk const &o);
void read(string const &transferfile, string const &datafile);
void interchunk(FILE *in, FILE *out);
bool getNullFlush(void);
void setNullFlush(bool null_flush);
void setTrace(bool trace);
-
};
#endif
diff --git a/apertium/interchunk.rnc b/apertium/interchunk.rnc
new file mode 100644
index 0000000..a4da35a
--- /dev/null
+++ b/apertium/interchunk.rnc
@@ -0,0 +1,353 @@
+# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+# Draft of DTD for the structural transfer rule files
+#
+# Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada,
+# 2005.07.29.
+
+condition =
+ and
+ | or
+ | not
+ | equal
+ | begins-with
+ | begins-with-list
+ | ends-with
+ | ends-with-list
+ | contains-substring
+ | in
+container = var | clip
+sentence = let | out | choose | modify-case | call-macro | append
+value =
+ b
+ | clip
+ | lit
+ | lit-tag
+ | var
+ | get-case-from
+ | case-of
+ | concat
+ | chunk
+stringvalue = clip | lit | var | get-case-from | case-of
+interchunk =
+ element interchunk {
+ attlist.interchunk,
+ section-def-cats,
+ section-def-attrs,
+ section-def-vars,
+ section-def-lists?,
+ section-def-macros?,
+ section-rules
+ }
+attlist.interchunk &= empty
+# 'interchunk' is the root element containing the whole structural
+# interchunk rule file.
+section-def-cats =
+ element section-def-cats { attlist.section-def-cats, def-cat+ }
+attlist.section-def-cats &= empty
+# The 'def-cats' section defines the categories used to build the
+# patterns used in rules
+def-cat = element def-cat { attlist.def-cat, cat-item+ }
+attlist.def-cat &=
+ attribute n { xsd:ID },
+ attribute c { text }?
+# Each 'def-cat' defines one category in terms of a list of
+# category items and has a unique name 'n', which is mandatory
+cat-item = element cat-item { attlist.cat-item, empty }
+attlist.cat-item &=
+ attribute lemma { text }?,
+ attribute tags { text },
+ attribute c { text }?
+# Each 'cat-item' (category item) represents a set of lexical forms
+# and has a mandatory attribute 'tags' whose value is a sequence of
+# dot-separated tag names; this sequence is a subsequence of the
+# tag sequence defining each possible lexical form. For example,
+# tags="n.f" would match all lexical forms containing this tag
+# sequence, such as "^casa<n><f><pl>$".
+#
+# In addition, an optional attribute, "lemma", may be used to
+# define lexical forms having a particular substring in their lemma
+section-def-attrs =
+ element section-def-attrs { attlist.section-def-attrs, def-attr+ }
+attlist.section-def-attrs &= empty
+# The 'def-attrs' section defines the attributes that will be
+# identified in matched lexical forms
+def-attr = element def-attr { attlist.def-attr, attr-item+ }
+attlist.def-attr &=
+ attribute n { xsd:ID },
+ attribute c { text }?
+# Each def-attr defines one attribute in terms of a list of
+# attribute items and has a mandatory unique name n
+attr-item = element attr-item { attlist.attr-item, empty }
+attlist.attr-item &=
+ attribute tags { text }?,
+ attribute c { text }?
+# Each 'attr-item' specifies a subsequence of the tags in
+# that lexical form (attribute 'tags')
+section-def-vars =
+ element section-def-vars { attlist.section-def-vars, def-var+ }
+attlist.section-def-vars &= empty
+# The 'def-vars' section defines the global variables
+# that will be used to transfer information between rules
+def-var = element def-var { attlist.def-var, empty }
+attlist.def-var &=
+ attribute n { xsd:ID },
+ attribute v { text }?,
+ attribute c { text }?
+# The definition of a global variable has a mandatory unique name 'n' that
+# will be used to refer to it. An initial value can also be specified
+# by means of the 'v' attribute. The default initial value is the
+# empty string.
+section-def-lists =
+ element section-def-lists { attlist.section-def-lists, def-list+ }
+attlist.section-def-lists &= empty
+# Element 'section-def-lists' encloses a set of list definitions
+def-list = element def-list { attlist.def-list, list-item+ }
+attlist.def-list &=
+ attribute n { xsd:ID },
+ attribute c { text }?
+# The 'def-list' element defines a named list to search with the 'in'
+# element. Attribute 'n' sets the name of the list
+list-item = element list-item { attlist.list-item, empty }
+attlist.list-item &=
+ attribute v { text },
+ attribute c { text }?
+# Attribute 'v' of 'list-item' element contains the value to be added to
+# the list being defined
+section-def-macros =
+ element section-def-macros { attlist.section-def-macros, def-macro+ }
+attlist.section-def-macros &= empty
+#
+# The 'def-macros' section defines macros containing portions of
+# code frequently used in the action part of rules
+#
+def-macro = element def-macro { attlist.def-macro, sentence+ }
+attlist.def-macro &= attribute n { xsd:ID }
+attlist.def-macro &=
+ attribute npar { text },
+ attribute c { text }?
+# Macro definition:
+#
+# A macro has a mandatory name (the value of 'n'), a number of parameters
+# (the value of 'npar') and a body containing arguments and statements.
+section-rules = element section-rules { attlist.section-rules, rule+ }
+attlist.section-rules &= empty
+# The rules section contains a sequence of one or more rules
+rule = element rule { attlist.rule, pattern, action }
+attlist.rule &= attribute comment { text }?
+# Each rule has a pattern and an action
+# * attribute 'comment' allows a comment to be added about the purpose of
+# the rule being defined
+pattern = element pattern { attlist.pattern, pattern-item+ }
+attlist.pattern &= empty
+# The pattern is specified in terms of pattern items, each one
+# representing a lexical form in the matched pattern
+pattern-item = element pattern-item { attlist.pattern-item, empty }
+attlist.pattern-item &= attribute n { xsd:IDREF }
+# Each attribute to be activated is referred to by its name in the def-cats section
+action = element action { attlist.action, sentence* }
+attlist.action &= attribute c { text }?
+# Encloses the procedural part of a rule
+choose = element choose { attlist.choose, when+, otherwise? }
+attlist.choose &= attribute c { text }?
+# The choose statement is a selection statement (similar to a case
+# statement) composed of one or more tested cases and an optional
+# otherwise
+when = element when { attlist.when, test, sentence* }
+attlist.when &= attribute c { text }?
+# Each tested case is a block of zero or more statements
+otherwise = element otherwise { attlist.otherwise, sentence+ }
+attlist.otherwise &= attribute c { text }?
+# The otherwise case is also a block of one or more statements
+test = element test { attlist.test, condition }
+attlist.test &= attribute c { text }?
+# The test in a tested case may be a conjunction, a disjunction, or
+# a negation of simpler tests, as well as a simple equality test
+and = element and { attlist.and, condition, condition+ }
+attlist.and &= empty
+# Each conjunction test contains two or more simpler tests
+or = element or { attlist.or, condition, condition+ }
+attlist.or &= empty
+# Each disjunction test contains two or more simpler tests
+not = element not { attlist.not, condition }
+attlist.not &= empty
+# The negation of a simpler test is a test itself
+equal = element equal { attlist.equal, value, value }
+attlist.equal &= attribute caseless { "no" | "yes" }?
+# The simplest test is an equality test. The right part and the
+# left part of the equality may both be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+begins-with = element begins-with { attlist.begins-with, value, value }
+attlist.begins-with &= attribute caseless { "no" | "yes" }?
+# Tests if the left part contains the right part at the beginning.
+# Both parts of the test may both be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+ends-with = element ends-with { attlist.ends-with, value, value }
+attlist.ends-with &= attribute caseless { "no" | "yes" }?
+# Tests if the left part contains the right part at the end.
+# Both parts of the test may both be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+begins-with-list =
+ element begins-with-list { attlist.begins-with-list, value, \list }
+attlist.begins-with-list &= attribute caseless { "no" | "yes" }?
+# Tests if the left part contains the right part at the beginning.
+# The first part of the test may be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. The second part
+# must always be a list. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+ends-with-list =
+ element ends-with-list { attlist.ends-with-list, value, \list }
+attlist.ends-with-list &= attribute caseless { "no" | "yes" }?
+# Tests if the left part contains the right part at the end.
+# The first part of the test may be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. The second part
+# must always be a list. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+contains-substring =
+ element contains-substring {
+ attlist.contains-substring, value, value
+ }
+attlist.contains-substring &= attribute caseless { "no" | "yes" }?
+# Tests if the left part contains the right part.
+# Both parts of the test may both be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+in = element in { attlist.in, value, \list }
+attlist.in &= attribute caseless { "no" | "yes" }?
+# 'in' performs a search of a value in a list. If 'caseless' is set to yes,
+# this search is performed without attending to the case
+\list = element list { attlist.list, empty }
+attlist.list &= attribute n { xsd:IDREF }
+# 'list' refers, by the name in attribute 'n', to a list defined earlier in
+# the 'section-def-lists' section
+let = element let { attlist.let, container, value }
+attlist.let &= empty
+# An assignment statement ('let') assigns the value of a clip (see
+# below), a literal string ('lit'), a literal tag('lit-tag') or the
+# value of a global variable ('var') to either a global variable ('var')
+# or a clip
+append = element append { attlist.append, value+ }
+attlist.append &= attribute n { xsd:IDREF }
+# This instruction appends the value of a clip (see
+# below), a literal string ('lit'), a literal tag('lit-tag') or the
+# value of a global variable ('var') to either a global variable ('var')
+# or a clip, identified by the "n" attribute
+out = element out { attlist.out, (b | chunk | var)+ }
+attlist.out &= attribute c { text }?
+# 'out' is an output statement; it may output blanks or chunks
+modify-case =
+ element modify-case { attlist.modify-case, container, stringvalue }
+attlist.modify-case &= empty
+# The first argument of 'modify-case' copies the case of the second
+# argument.
+call-macro = element call-macro { attlist.call-macro, with-param* }
+attlist.call-macro &= attribute n { xsd:IDREF }
+# A macro may be called anywhere by name with zero or more
+# arguments
+with-param = element with-param { attlist.with-param, empty }
+attlist.with-param &= attribute pos { text }
+# The attribute pos in each argument is used to refer to a lexical
+# form in the current rule. For example, if a 2-parameter macro
+# has been defined to perform noun-adjective agreement operations,
+# it may be used with arguments 1 and 2 in a noun-adjective rule,
+# with arguments 2, 3 and 1 in a determiner-noun-adjective rule, with
+# arguments 1 and 3 in a noun-adverb-adjective rule, and with
+# arguments 2 and 1 in an adjective-noun rule
+clip = element clip { attlist.clip, empty }
+attlist.clip &=
+ attribute pos { text },
+ attribute part { text },
+ attribute c { text }?
+# A 'clip' is a substring of a source-language or target-language
+# lexical form, extracted according to an attribute:
+#
+# * 'pos' is an index (1, 2, 3...) used to select a lexical form
+# inside the rule;
+#
+# * the value of 'part' is the name of an attribute defined in
+# def-attrs, but may take also the values 'lem' (referring to
+# the lemma of the lexical form), 'lemh' (lemma head), 'lemq'
+# (lemma queue) and 'whole' (referring to the whole lexical form).
+#
+lit = element lit { attlist.lit, empty }
+attlist.lit &= attribute v { text }
+# A literal string value: the value of the literal is the value of
+# the 'v' attribute
+lit-tag = element lit-tag { attlist.lit-tag, empty }
+attlist.lit-tag &= attribute v { text }
+# A literal tag value: the value of the tag is the value of
+# the 'v' attribute
+var = element var { attlist.var, empty }
+attlist.var &= attribute n { xsd:IDREF }
+# Each 'var' is a variable identifier: the attribute n is the name
+# of the variable. When it is in an 'out', a 'test', or the right
+# part of a 'let', it represents the value of the variable; when in
+# the left part of a 'let' it represents the reference of the
+# variable.
+get-case-from =
+ element get-case-from { attlist.get-case-from, (clip | lit | var) }
+attlist.get-case-from &= attribute pos { text }
+# Warning: all the comments that involve get-case-from
+# still need to be updated
+case-of = element case-of { attlist.case-of, empty }
+attlist.case-of &=
+ attribute pos { text },
+ attribute part { text }
+# A 'case-of' is a value representing the case of a "clip". This value
+# will be "aa" (all lowercase), "Aa" (first uppercase) or "AA"
+# (all uppercase).
+#
+# * 'pos' is an index (1, 2, 3...) used to select a lexical form
+# inside the rule;
+#
+# * the value of 'part' is the name of an attribute defined in
+# def-attrs, but may take also the values 'lem' (referring to
+# the lemma of the lexical form), 'lemh' (lemma head), 'lemq'
+# (lemma queue) and 'whole' (referring to the whole lexical form).
+concat = element concat { attlist.concat, value+ }
+attlist.concat &= empty
+# Concatenates a sequence of values
+chunk = element chunk { attlist.chunk, value+ }
+attlist.chunk &= empty
+# Encloses a chunk
+pseudolemma = element pseudolemma { attlist.pseudolemma, value }
+attlist.pseudolemma &= empty
+b = element b { attlist.b, empty }
+attlist.b &= attribute pos { text }?
+start = interchunk | pseudolemma
+# 'b' is a [super]blanks item, indexed by pos; for example, a 'b'
+# with pos="2" refers to the [super]blanks (including format data
+# encapsulated by the de-formatter) between lexical form 2 and
+# lexical form 3. Managing [super]blanks explicitly allows for the
+# correct placement of format when the result of structural
+# transfer has more or less lexical items than the original or has
+# been reordered in some way. If attribute "pos" is not specified, then
+# a single blank (ASCII 32) is generated.
diff --git a/apertium/transfer.rng b/apertium/interchunk.rng
similarity index 88%
copy from apertium/transfer.rng
copy to apertium/interchunk.rng
index b78f94d..571fb0e 100644
--- a/apertium/transfer.rng
+++ b/apertium/interchunk.rng
@@ -1,9 +1,24 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<!--
- Draft of DTD for the structural transfer rule files
+ Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
- Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada,
- 2005.07.29.
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+ Draft of DTD for the structural transfer rule files
+
+ Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada,
+ 2005.07.29.
-->
<grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
<define name="condition">
@@ -46,6 +61,7 @@
<ref name="get-case-from"/>
<ref name="case-of"/>
<ref name="concat"/>
+ <ref name="chunk"/>
</choice>
</define>
<define name="stringvalue">
@@ -57,9 +73,9 @@
<ref name="case-of"/>
</choice>
</define>
- <define name="transfer">
- <element name="transfer">
- <ref name="attlist.transfer"/>
+ <define name="interchunk">
+ <element name="interchunk">
+ <ref name="attlist.interchunk"/>
<ref name="section-def-cats"/>
<ref name="section-def-attrs"/>
<ref name="section-def-vars"/>
@@ -72,21 +88,12 @@
<ref name="section-rules"/>
</element>
</define>
- <define name="attlist.transfer" combine="interleave">
- <optional>
- <attribute name="default">
- <choice>
- <value>lu</value>
- <value>chunk</value>
- </choice>
- </attribute>
- </optional>
+ <define name="attlist.interchunk" combine="interleave">
+ <empty/>
</define>
<!--
- 'transfer' is the root element containing the whole structural
- transfer rule file. Attribute 'default' specifies if
- unmatched words have to be written as lexical units ("lu", this is
- the default value) or as chunks ("chunk").
+ 'interchunk' is the root element containing the whole structural
+ interchunk rule file.
-->
<define name="section-def-cats">
<element name="section-def-cats">
@@ -729,8 +736,6 @@
<ref name="attlist.out"/>
<oneOrMore>
<choice>
- <ref name="mlu"/>
- <ref name="lu"/>
<ref name="b"/>
<ref name="chunk"/>
<ref name="var"/>
@@ -743,11 +748,7 @@
<attribute name="c"/>
</optional>
</define>
- <!--
- 'out' is an output statement; it may output any sequence of
- clips, literal strings, literal tags, variables, and whitespace items
- (see below)
- -->
+ <!-- 'out' is an output statement; it may output blanks or chunks -->
<define name="modify-case">
<element name="modify-case">
<ref name="attlist.modify-case"/>
@@ -805,20 +806,8 @@
</define>
<define name="attlist.clip" combine="interleave">
<attribute name="pos"/>
- <attribute name="side">
- <choice>
- <value>sl</value>
- <value>tl</value>
- </choice>
- </attribute>
<attribute name="part"/>
<optional>
- <attribute name="queue"/>
- </optional>
- <optional>
- <attribute name="link-to"/>
- </optional>
- <optional>
<attribute name="c"/>
</optional>
</define>
@@ -829,21 +818,11 @@
* 'pos' is an index (1, 2, 3...) used to select a lexical form
inside the rule;
- * 'side' is used to select a source-language ('sl') or a
- target-language ('tl') clip
-
* the value of 'part' is the name of an attribute defined in
def-attrs, but may take also the values 'lem' (referring to
the lemma of the lexical form), 'lemh' (lemma head), 'lemq'
(lemma queue) and 'whole' (referring to the whole lexical form).
- * the value of 'queue' may be 'no' or 'yes'. 'yes' is assumed by
- default.
-
- * 'link-to' causes the other attributes to be ignored in clip evaluation
- when using 'clip' as a right hand side element (as value), and
- returns its value. When using as a left hand side (as reference),
- the value of the 'as' attribute is ignored.
-->
<define name="lit">
<element name="lit">
@@ -914,12 +893,6 @@
</define>
<define name="attlist.case-of" combine="interleave">
<attribute name="pos"/>
- <attribute name="side">
- <choice>
- <value>sl</value>
- <value>tl</value>
- </choice>
- </attribute>
<attribute name="part"/>
</define>
<!--
@@ -930,9 +903,6 @@
* 'pos' is an index (1, 2, 3...) used to select a lexical form
inside the rule;
- * 'side' is used to select a source-language ('sl') or a
- target-language ('tl') clip
-
* the value of 'part' is the name of an attribute defined in
def-attrs, but may take also the values 'lem' (referring to
the lemma of the lexical form), 'lemh' (lemma head), 'lemq'
@@ -950,83 +920,25 @@
<empty/>
</define>
<!-- Concatenates a sequence of values -->
- <define name="mlu">
- <element name="mlu">
- <ref name="attlist.mlu"/>
- <oneOrMore>
- <ref name="lu"/>
- </oneOrMore>
- </element>
- </define>
- <define name="attlist.mlu" combine="interleave">
- <empty/>
- </define>
- <!-- Encloses a multiword -->
- <define name="lu">
- <element name="lu">
- <ref name="attlist.lu"/>
- <oneOrMore>
- <ref name="value"/>
- </oneOrMore>
- </element>
- </define>
- <define name="attlist.lu" combine="interleave">
- <empty/>
- </define>
- <!-- Encloses a word inside an 'out' element. -->
<define name="chunk">
<element name="chunk">
<ref name="attlist.chunk"/>
- <ref name="tags"/>
<oneOrMore>
- <choice>
- <ref name="mlu"/>
- <ref name="lu"/>
- <ref name="b"/>
- <ref name="var"/>
- </choice>
+ <ref name="value"/>
</oneOrMore>
</element>
</define>
<define name="attlist.chunk" combine="interleave">
- <optional>
- <attribute name="name"/>
- </optional>
- <optional>
- <attribute name="namefrom"/>
- </optional>
- <optional>
- <attribute name="case"/>
- </optional>
- <optional>
- <attribute name="c"/>
- </optional>
- </define>
- <!--
- Encloses a chunk inside an 'out' element.
- * 'name' the pseudolemma of the chunk.
- * 'namefrom' get the name from a variable.
- * 'case' the variable to get the uppercase/lowercase policy
- to apply it to the chunk name
- -->
- <define name="tags">
- <element name="tags">
- <ref name="attlist.tags"/>
- <oneOrMore>
- <ref name="tag"/>
- </oneOrMore>
- </element>
- </define>
- <define name="attlist.tags" combine="interleave">
<empty/>
</define>
- <define name="tag">
- <element name="tag">
- <ref name="attlist.tag"/>
+ <!-- Encloses a chunk -->
+ <define name="pseudolemma">
+ <element name="pseudolemma">
+ <ref name="attlist.pseudolemma"/>
<ref name="value"/>
</element>
</define>
- <define name="attlist.tag" combine="interleave">
+ <define name="attlist.pseudolemma" combine="interleave">
<empty/>
</define>
<define name="b">
@@ -1042,7 +954,8 @@
</define>
<start>
<choice>
- <ref name="transfer"/>
+ <ref name="interchunk"/>
+ <ref name="pseudolemma"/>
</choice>
</start>
</grammar>
diff --git a/apertium/interchunk_word.cc b/apertium/interchunk_word.cc
index e14fb62..6b5155d 100644
--- a/apertium/interchunk_word.cc
+++ b/apertium/interchunk_word.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
-* 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/interchunk_word.h>
diff --git a/apertium/interchunk_word.h b/apertium/interchunk_word.h
index c3aa23f..bb42922 100644
--- a/apertium/interchunk_word.h
+++ b/apertium/interchunk_word.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _INTERCHUNKWORD_
diff --git a/apertium/latex_accentsmap.cc b/apertium/latex_accentsmap.cc
index f35596e..966cc3a 100644
--- a/apertium/latex_accentsmap.cc
+++ b/apertium/latex_accentsmap.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/latex_accentsmap.h>
diff --git a/apertium/latex_accentsmap.h b/apertium/latex_accentsmap.h
index cbc9c97..6c04b69 100644
--- a/apertium/latex_accentsmap.h
+++ b/apertium/latex_accentsmap.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <map>
diff --git a/apertium/lemma.cc b/apertium/lemma.cc
new file mode 100644
index 0000000..bedd263
--- /dev/null
+++ b/apertium/lemma.cc
@@ -0,0 +1,55 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "lemma.h"
+
+#include "analysis.h"
+#include "exception.h"
+#include "morpheme.h"
+
+namespace Apertium {
+bool operator==(const Lemma &a_, const Lemma &b_) {
+ return a_.TheLemma == b_.TheLemma;
+}
+
+bool operator<(const Lemma &a_, const Lemma &b_) {
+ return a_.TheLemma < b_.TheLemma;
+}
+
+Lemma::Lemma() : TheLemma() {}
+
+Lemma::Lemma(const Analysis &Analysis_) : TheLemma() {
+ if (Analysis_.TheMorphemes.empty())
+ throw Exception::Analysis::TheMorphemes_empty(
+ "can't convert const Analysis & comprising empty Morpheme std::vector "
+ "to Lemma");
+
+ if (Analysis_.TheMorphemes.front().TheLemma.empty())
+ throw Exception::Morpheme::TheLemma_empty(
+ "can't convert const Analysis & comprising Morpheme comprising empty "
+ "Lemma std::wstring to Lemma");
+
+ TheLemma = Analysis_.TheMorphemes.front().TheLemma;
+}
+
+Lemma::Lemma(const Morpheme &Morpheme_) : TheLemma() {
+ if (Morpheme_.TheLemma.empty())
+ throw Exception::Morpheme::TheLemma_empty("can't convert const Morpheme & "
+ "comprising empty Lemma "
+ "std::wstring to Lemma");
+
+ TheLemma = Morpheme_.TheLemma;
+}
+}
diff --git a/apertium/lemma.h b/apertium/lemma.h
new file mode 100644
index 0000000..70da7e0
--- /dev/null
+++ b/apertium/lemma.h
@@ -0,0 +1,36 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef LEMMA_H
+#define LEMMA_H
+
+#include "analysis.h"
+#include "morpheme.h"
+
+#include <string>
+
+namespace Apertium {
+class Lemma {
+public:
+ friend bool operator==(const Lemma &a_, const Lemma &b_);
+ friend bool operator<(const Lemma &a_, const Lemma &b_);
+ Lemma();
+ Lemma(const Analysis &Analysis_);
+ Lemma(const Morpheme &Morpheme_);
+ std::wstring TheLemma;
+};
+}
+
+#endif // LEMMA_H
diff --git a/apertium/lexchoice.xsl b/apertium/lexchoice.xsl
index 57ec7b3..4344731 100644
--- a/apertium/lexchoice.xsl
+++ b/apertium/lexchoice.xsl
@@ -13,9 +13,7 @@
General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA.
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" encoding="ISO-8859-1"/>
diff --git a/apertium/lexchoicebil.xsl b/apertium/lexchoicebil.xsl
index 300b86f..0965730 100644
--- a/apertium/lexchoicebil.xsl
+++ b/apertium/lexchoicebil.xsl
@@ -13,9 +13,7 @@
General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA.
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" encoding="ISO-8859-1"/>
diff --git a/apertium/lexical_unit.h b/apertium/lexical_unit.h
new file mode 100644
index 0000000..3c615fe
--- /dev/null
+++ b/apertium/lexical_unit.h
@@ -0,0 +1,32 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef TAGGING_EXPRESSION_H
+#define TAGGING_EXPRESSION_H
+
+#include "analysis.h"
+
+#include <string>
+#include <vector>
+
+namespace Apertium {
+class LexicalUnit {
+public:
+ std::wstring TheSurfaceForm;
+ std::vector<Analysis> TheAnalyses;
+};
+}
+
+#endif // LEXICAL_UNIT_H
diff --git a/apertium/lextor.cc b/apertium/lextor.cc
index e2c6de5..527682e 100644
--- a/apertium/lextor.cc
+++ b/apertium/lextor.cc
@@ -13,9 +13,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/lextor.h>
@@ -34,12 +32,16 @@ using namespace Apertium;
bool LexTor::debug;
double LexTor::angleth;
-LexTor::LexTor() {
+LexTor::LexTor() :
+fstpbil(0)
+{
lextor_data=NULL;
tlmodel=NULL;
}
-LexTor::LexTor(const LexTor& lt) {
+LexTor::LexTor(const LexTor& lt) :
+fstpbil(0)
+{
lextor_data=lt.lextor_data;
tlmodel=lt.tlmodel;
}
@@ -335,11 +337,7 @@ LexTor::trainlch(wistream& is, int left, int right, LexTorData& tlwordmodel,
//The counts of the TL co-occurrence model are transferred to the SL. If the SL word is ambiguous
//it will have more than one translation into TL, so we need to normalize using the frequency of words
//in the TL
-#ifdef __GNUC__
- double translation_weighs[translation_buffer[i].size()];
-#else
vector <double> translation_weighs(translation_buffer[i].size());
-#endif
double sum=0.0;
if (translation_buffer[i].size()>1) {
for(int j=0; j<(int)translation_buffer[i].size(); j++) {
@@ -681,11 +679,7 @@ LexTor::estimate_winner_lch(deque<LexTorWord>& window, int word_index, double we
int
LexTor::estimate_winner_lch_voting(deque<LexTorWord>& window, int word_index, double weigth_exponent) {
-#ifdef __GNUC__
- double lexchoices_count[window[word_index].n_lexical_choices()];
-#else
vector <double> lexchoices_count(window[word_index].n_lexical_choices());
-#endif
if (debug) {
wcerr<<L"WINDOW: ";
@@ -924,11 +918,7 @@ LexTor::estimate_winner_lch_votingtl(deque<LexTorWord>& window, int word_index,
//If the SL word is ambiguous it will have more than one
//translation into TL, so we need to normalize using the
//frequency of words in the TL
-#ifdef __GNUC__
- double translation_weighs[translation_window[k].size()];
-#else
vector <double> translation_weighs(translation_window[k].size());
-#endif
double sum=0.0;
if (translation_window[k].size()>1) {
for(unsigned j=0; j<translation_window[k].size(); j++) {
diff --git a/apertium/lextor.h b/apertium/lextor.h
index 5abde35..58e7ec4 100644
--- a/apertium/lextor.h
+++ b/apertium/lextor.h
@@ -13,9 +13,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __LEXTOR_H
#define __LEXTOR_H
diff --git a/apertium/lextor_data.cc b/apertium/lextor_data.cc
index 065c14a..6269a5a 100644
--- a/apertium/lextor_data.cc
+++ b/apertium/lextor_data.cc
@@ -13,9 +13,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/lextor_data.h>
diff --git a/apertium/lextor_data.h b/apertium/lextor_data.h
index e20c565..370f844 100644
--- a/apertium/lextor_data.h
+++ b/apertium/lextor_data.h
@@ -13,9 +13,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __LEXTORDATA_H
#define __LEXTORDATA_H
diff --git a/apertium/lextor_eval.cc b/apertium/lextor_eval.cc
index d59e78f..6390a62 100644
--- a/apertium/lextor_eval.cc
+++ b/apertium/lextor_eval.cc
@@ -13,9 +13,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <iostream>
diff --git a/apertium/lextor_eval.h b/apertium/lextor_eval.h
index 14dfcfa..79f0118 100644
--- a/apertium/lextor_eval.h
+++ b/apertium/lextor_eval.h
@@ -13,9 +13,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __LEXTOR_EVAL_H
diff --git a/apertium/lextor_word.cc b/apertium/lextor_word.cc
index 95a8fde..618945f 100644
--- a/apertium/lextor_word.cc
+++ b/apertium/lextor_word.cc
@@ -13,9 +13,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/lextor_word.h>
diff --git a/apertium/lextor_word.h b/apertium/lextor_word.h
index 2195469..19cd22f 100644
--- a/apertium/lextor_word.h
+++ b/apertium/lextor_word.h
@@ -13,9 +13,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __LEXTORWORD_H
#define __LEXTORWORD_H
diff --git a/apertium/linebreak.cc b/apertium/linebreak.cc
new file mode 100644
index 0000000..0fe5510
--- /dev/null
+++ b/apertium/linebreak.cc
@@ -0,0 +1,94 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "linebreak.h"
+
+#include <string>
+
+namespace Apertium {
+// Wraps string_ and returns the wrapped copy.
+//
+//   string_     text to wrap (taken by value, edited in place, returned)
+//   col         column the first character of string_ is assumed to be at
+//   wrapmargin  number of spaces inserted after every line break
+//
+// The text is scanned left to right while col tracks the current column.
+// Below column 79 characters are simply counted; an embedded '\n' gets
+// wrapmargin spaces inserted after it, and a '\n' that is the very last
+// character is dropped. Once col reaches 79 the line is broken: the run of
+// spaces at the cursor (or, when the cursor is inside a word, the run of
+// spaces in front of that word) is replaced by '\n' plus the margin.
+//
+// NOTE(review): a single token that does not fit in the remaining width has
+// no space to break at; the backward search then appears to re-break at the
+// same spot on every pass. Confirm callers never pass such input.
+std::string linebreak::linebreak_(std::string string_,
+                                  std::string::size_type col,
+                                  const std::string::size_type &wrapmargin) {
+  // Column at which the current line is considered full.
+  const std::string::size_type full_col = 79;
+  std::string::size_type i_ = 0;
+
+  while (true) {
+    if (i_ == string_.size())
+      return string_;
+
+    if (col < full_col) {
+      if (string_.at(i_) == '\n') {
+        if (i_ + 1 == string_.size()) {
+          // A trailing newline is removed rather than followed by a margin.
+          string_.erase(i_, 1);
+          return string_;
+        }
+
+        string_.insert(i_ + 1, wrapmargin, ' ');
+        col = wrapmargin;
+        // Step over the '\n' itself as well as the margin just inserted.
+        // Advancing by wrapmargin alone left the cursor on the newline when
+        // wrapmargin == 0 (the loop never ended) and made col run one
+        // column ahead otherwise.
+        i_ += 1 /* '\n' */ + wrapmargin /* inserted margin */;
+        continue;
+      }
+
+      ++col;
+      ++i_;
+      continue;
+    }
+
+    if (string_.at(i_) == ' ') {
+      // The line is full and the cursor sits in a run of spaces: widen
+      // [i_, j_) to cover the whole run.
+      std::string::size_type j_ = i_ + 1;
+
+      for (; i_ != 0; --i_) {
+        if (string_.at(i_ - 1) != ' ')
+          break;
+      }
+
+      for (;; ++j_) {
+        if (j_ == string_.size()) {
+          // The run reaches the end of the text: drop it and finish.
+          string_.erase(i_, j_ - i_);
+          return string_;
+        }
+
+        if (string_.at(j_) != ' ')
+          break;
+      }
+
+      linebreak_(string_, col, wrapmargin, i_, j_);
+      continue;
+    }
+
+    // The line is full and the cursor is inside a word: j_ walks back to the
+    // first character of that word, i_ to the start of the spaces before it.
+    std::string::size_type j_ = i_;
+
+    for (; j_ != 0; --j_) {
+      if (string_.at(j_ - 1) == ' ')
+        break;
+    }
+
+    for (i_ = j_; i_ != 0; --i_) {
+      if (string_.at(i_ - 1) != ' ')
+        break;
+    }
+
+    linebreak_(string_, col, wrapmargin, i_, j_);
+  }
+}
+
+// Replaces the characters in [i_, j_) of string_ with a newline followed by
+// wrapmargin spaces, sets col to wrapmargin and moves i_ to the first
+// character after the inserted break.
+void linebreak::linebreak_(std::string &string_, std::string::size_type &col,
+                           const std::string::size_type &wrapmargin,
+                           std::string::size_type &i_,
+                           const std::string::size_type &j_) {
+  // '\n' plus the continuation margin.
+  std::string break_(1, '\n');
+  break_.append(wrapmargin, ' ');
+
+  string_.replace(i_, j_ - i_, break_);
+  col = wrapmargin;
+  i_ += break_.size(); // 1 for '\n' plus wrapmargin spaces
+}
+}
diff --git a/apertium/linebreak.h b/apertium/linebreak.h
new file mode 100644
index 0000000..c5e1d46
--- /dev/null
+++ b/apertium/linebreak.h
@@ -0,0 +1,36 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef LINEBREAK_H
+#define LINEBREAK_H
+
+#include <string>
+
+namespace Apertium {
+// Groups the static text-wrapping helpers implemented in linebreak.cc.
+class linebreak {
+public:
+ // Returns a copy of string_ with line breaks inserted once the running
+ // column, which starts at col, reaches the wrap column; every break is
+ // followed by wrapmargin spaces.
+ static std::string linebreak_(std::string string_,
+ std::string::size_type col,
+ const std::string::size_type &wrapmargin);
+
+private:
+ // Replaces [i_, j_) of string_ with '\n' plus wrapmargin spaces, then
+ // updates col and i_ to describe the position just after that break.
+ static void linebreak_(std::string &string_, std::string::size_type &col,
+ const std::string::size_type &wrapmargin,
+ std::string::size_type &i_,
+ const std::string::size_type &j_);
+};
+}
+
+#endif // LINEBREAK_H
diff --git a/apertium/lswpost.cc b/apertium/lswpost.cc
index 231e6f0..7883d93 100644
--- a/apertium/lswpost.cc
+++ b/apertium/lswpost.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/**
* Light Sliding-Window Part of Speech Tagger (LSWPoST) implementation (source)
@@ -43,84 +41,88 @@
#include <vector>
#include <algorithm>
#include <apertium/string_utils.h>
+#include <cstdlib>
using namespace std;
using namespace Apertium;
using namespace tagger_utils;
-LSWPoST::LSWPoST(TaggerDataLSW *t) {
- this->tdlsw = t;
- debug=false;
- show_sf=false;
- null_flush = false;
- eos = (tdlsw->getTagIndex())[L"TAG_SENT"];
+void LSWPoST::deserialise(FILE *Serialised_FILE_Tagger) {
+ tdlsw.read(Serialised_FILE_Tagger);
+ eos = (tdlsw.getTagIndex())[L"TAG_SENT"];
}
-LSWPoST::~LSWPoST() {
+std::vector<std::wstring> &LSWPoST::getArrayTags() {
+ return tdlsw.getArrayTags();
}
-void
-LSWPoST::set_eos(TTag t) {
- eos = t;
-}
+void LSWPoST::serialise(FILE *Stream_) { tdlsw.write(Stream_); }
-void
-LSWPoST::set_debug(bool d) {
- debug = d;
-}
+void LSWPoST::deserialise(const TaggerData &Deserialised_FILE_Tagger) {
+ tdlsw = TaggerDataLSW(Deserialised_FILE_Tagger);
+ eos = (tdlsw.getTagIndex())[L"TAG_SENT"];
+}
-void
-LSWPoST::set_show_sf(bool sf) {
- show_sf = sf;
+void LSWPoST::init_probabilities_from_tagged_text_(FILE *TaggedCorpus,
+ FILE *UntaggedCorpus) {
+ std::abort();
}
-
-void
-LSWPoST::setNullFlush(bool nf) {
- null_flush = nf;
+
+void LSWPoST::init_probabilities_kupiec_(FILE *Corpus) {
+ init_probabilities(Corpus);
+}
+
+// Runs Count training passes over Corpus. The stream is rewound to its
+// beginning before every pass; the result of std::fseek is not checked.
+void LSWPoST::train(FILE *Corpus, unsigned long Count) {
+ for (; Count > 0; --Count) {
+ std::fseek(Corpus, 0, SEEK_SET);
+ train(Corpus);
+ }
 }
+LSWPoST::LSWPoST() {}
+
+LSWPoST::LSWPoST(TaggerDataLSW t) {
+ tdlsw = t;
+ eos = (tdlsw.getTagIndex())[L"TAG_SENT"];
+}
+
+LSWPoST::~LSWPoST() {}
+
+// Copies the tagger data pointed to by tdlsw and caches the end-of-sentence
+// tag, as the by-value constructor does, so that eos is initialised before
+// init_probabilities(), train() or tagger() read it.
+LSWPoST::LSWPoST(TaggerDataLSW *tdlsw) : tdlsw(*tdlsw) {
+  // The parameter shadows the member, hence the explicit this->.
+  eos = (this->tdlsw.getTagIndex())[L"TAG_SENT"];
+}
+
+void
+LSWPoST::set_eos(TTag t) {
+ eos = t;
+}
+
void
LSWPoST::init_probabilities(FILE *ftxt) {
- int N = tdlsw->getN();
+ int N = tdlsw.getN();
int nw = 0;
TaggerWord *word = NULL;
set<TTag> tags_left, tags_mid, tags_right;
set<TTag>::iterator iter_left, iter_mid, iter_right;
vector<vector<vector<double> > > para_matrix(N, vector<vector<double> >(N, vector<double>(N, 0)));
- Collection &output = tdlsw->getOutput();
- MorphoStream morpho_stream(ftxt, true, tdlsw);
+ MorphoStream morpho_stream(ftxt, true, &tdlsw);
int num_valid_seq = 0;
word = new TaggerWord(); // word for tags left
- word->add_tag(eos, L"sent", tdlsw->getPreferRules());
+ word->add_tag(eos, L"sent", tdlsw.getPreferRules());
tags_left = word->get_tags(); // tags left
if (tags_left.size()==0) { //This is an unknown word
- tags_left = tdlsw->getOpenClass();
- }
- if (output.has_not(tags_left)) {
- wstring errors;
- errors = L"A new ambiguity class was found. I cannot continue.\n";
- errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n";
- errors += L"New ambiguity class: " + word->get_string_tags() + L"\n";
- errors += L"Take a look at the dictionary and at the training corpus. Then, retrain.";
- fatal_error(errors);
+ tags_left = tdlsw.getOpenClass();
}
+
+ require_ambiguity_class(tdlsw, tags_left, *word);
++nw;
delete word;
word = morpho_stream.get_next_word(); // word for tags mid
tags_mid = word->get_tags(); // tags mid
if (tags_mid.size()==0) { //This is an unknown word
- tags_mid = tdlsw->getOpenClass();
- }
- if (output.has_not(tags_mid)) {
- wstring errors;
- errors = L"A new ambiguity class was found. I cannot continue.\n";
- errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n";
- errors += L"New ambiguity class: " + word->get_string_tags() + L"\n";
- errors += L"Take a look at the dictionary and at the training corpus. Then, retrain.";
- fatal_error(errors);
+ tags_mid = tdlsw.getOpenClass();
}
+ require_ambiguity_class(tdlsw, tags_mid, *word);
++nw;
delete word;
if (morpho_stream.getEndOfFile()) {
@@ -137,16 +139,9 @@ LSWPoST::init_probabilities(FILE *ftxt) {
tags_right = word->get_tags(); // tags right
if (tags_right.size()==0) { //This is an unknown word
- tags_right = tdlsw->getOpenClass();
- }
- if (output.has_not(tags_right)) {
- wstring errors;
- errors = L"A new ambiguity class was found. I cannot continue.\n";
- errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n";
- errors += L"New ambiguity class: " + word->get_string_tags() + L"\n";
- errors += L"Take a look at the dictionary and at the training corpus. Then, retrain.";
- fatal_error(errors);
+ tags_right = tdlsw.getOpenClass();
}
+ require_ambiguity_class(tdlsw, tags_right, *word);
num_valid_seq = tags_left.size() * tags_mid.size() * tags_right.size();
for (iter_left = tags_left.begin(); iter_left != tags_left.end(); ++iter_left) {
@@ -180,7 +175,7 @@ LSWPoST::init_probabilities(FILE *ftxt) {
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
for (int k = 0; k < N; ++k) {
- tdlsw->getD()[i][j][k] = para_matrix[i][j][k];
+ tdlsw.getD()[i][j][k] = para_matrix[i][j][k];
}
}
}
@@ -190,8 +185,8 @@ LSWPoST::init_probabilities(FILE *ftxt) {
bool LSWPoST::is_valid_seq(TTag left, TTag mid, TTag right) {
- vector<TForbidRule> &forbid_rules = tdlsw->getForbidRules();
- vector<TEnforceAfterRule> &enforce_rules = tdlsw->getEnforceRules();
+ vector<TForbidRule> &forbid_rules = tdlsw.getForbidRules();
+ vector<TEnforceAfterRule> &enforce_rules = tdlsw.getEnforceRules();
for (size_t r = 0; r < forbid_rules.size(); ++r) {
if ((left == forbid_rules[r].tagi && mid == forbid_rules[r].tagj)
@@ -231,93 +226,41 @@ bool LSWPoST::is_valid_seq(TTag left, TTag mid, TTag right) {
void
LSWPoST::read_dictionary(FILE *fdic) {
- int i, k, nw = 0;
- TaggerWord *word = NULL;
- set<TTag> tags;
- Collection &output = tdlsw->getOutput();
-
- MorphoStream morpho_stream(fdic, true, tdlsw);
-
- // In the input dictionary there must be all punctuation marks, including the end-of-sentece mark
-
- word = morpho_stream.get_next_word();
-
- while (word) {
- if (++nw % 10000 == 0)
- wcerr << L'.' << flush;
-
- tags = word->get_tags();
-
- if (tags.size() > 0)
- k = output[tags];
-
- delete word;
- word = morpho_stream.get_next_word();
- }
- wcerr << L"\n";
-
- // OPEN AMBIGUITY CLASS
- // It contains all tags that are not closed.
- // Unknown words are assigned the open ambiguity class
- k = output[tdlsw->getOpenClass()];
-
- int N = (tdlsw->getTagIndex()).size();
-
- // Create ambiguity class holding one single tag for each tag.
- // If not created yet
- for (i = 0; i != N; i++) {
- set < TTag > amb_class;
- amb_class.insert(i);
- k = output[amb_class];
- }
-
- wcerr << N << L" states\n";
+ tagger_utils::read_dictionary(fdic, tdlsw);
+ int N = (tdlsw.getTagIndex()).size();
+ int M = (tdlsw.getOutput()).size();
+ wcerr << N << L" states and " << M <<L" ambiguity classes\n";
// set up the probability matrix of tdlsw, the pointer to the TaggerDataLSW object
- tdlsw->setProbabilities(N);
+ tdlsw.setProbabilities(N);
}
void
LSWPoST::train(FILE *ftxt) {
- int N = tdlsw->getN();
+ int N = tdlsw.getN();
int nw = 0;
TaggerWord *word = NULL;
set<TTag> tags_left, tags_mid, tags_right;
set<TTag>::iterator iter_left, iter_mid, iter_right;
vector<vector<vector<double> > > para_matrix_new(N, vector<vector<double> >(N, vector<double>(N, 0)));
- Collection &output = tdlsw->getOutput();
- MorphoStream morpho_stream(ftxt, true, tdlsw);
+ MorphoStream morpho_stream(ftxt, true, &tdlsw);
word = new TaggerWord(); // word for tags left
- word->add_tag(eos, L"sent", tdlsw->getPreferRules());
+ word->add_tag(eos, L"sent", tdlsw.getPreferRules());
tags_left = word->get_tags(); // tags left
if (tags_left.size()==0) { //This is an unknown word
- tags_left = tdlsw->getOpenClass();
- }
- if (output.has_not(tags_left)) {
- wstring errors;
- errors = L"A new ambiguity class was found. I cannot continue.\n";
- errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n";
- errors += L"New ambiguity class: " + word->get_string_tags() + L"\n";
- errors += L"Take a look at the dictionary and at the training corpus. Then, retrain.";
- fatal_error(errors);
+ tags_left = tdlsw.getOpenClass();
}
+ require_ambiguity_class(tdlsw, tags_left, *word);
++nw;
delete word;
word = morpho_stream.get_next_word(); // word for tags mid
tags_mid = word->get_tags(); // tags mid
if (tags_mid.size()==0) { //This is an unknown word
- tags_mid = tdlsw->getOpenClass();
- }
- if (output.has_not(tags_mid)) {
- wstring errors;
- errors = L"A new ambiguity class was found. I cannot continue.\n";
- errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n";
- errors += L"New ambiguity class: " + word->get_string_tags() + L"\n";
- errors += L"Take a look at the dictionary and at the training corpus. Then, retrain.";
- fatal_error(errors);
+ tags_mid = tdlsw.getOpenClass();
}
+ require_ambiguity_class(tdlsw, tags_mid, *word);
++nw;
delete word;
if (morpho_stream.getEndOfFile()) {
@@ -333,23 +276,16 @@ LSWPoST::train(FILE *ftxt) {
tags_right = word->get_tags(); // tags right
if (tags_right.size()==0) { //This is an unknown word
- tags_right = tdlsw->getOpenClass();
- }
- if (output.has_not(tags_right)) {
- wstring errors;
- errors = L"A new ambiguity class was found. I cannot continue.\n";
- errors += L"Word '" + word->get_superficial_form() + L"' not found in the dictionary.\n";
- errors += L"New ambiguity class: " + word->get_string_tags() + L"\n";
- errors += L"Take a look at the dictionary and at the training corpus. Then, retrain.";
- fatal_error(errors);
+ tags_right = tdlsw.getOpenClass();
}
+ require_ambiguity_class(tdlsw, tags_right, *word);
double normalization = 0;
for (iter_left = tags_left.begin(); iter_left != tags_left.end(); ++iter_left) {
for (iter_mid = tags_mid.begin(); iter_mid != tags_mid.end(); ++iter_mid) {
for (iter_right = tags_right.begin(); iter_right != tags_right.end(); ++iter_right) {
- normalization += tdlsw->getD()[*iter_left][*iter_mid][*iter_right];
+ normalization += tdlsw.getD()[*iter_left][*iter_mid][*iter_right];
}
}
}
@@ -359,7 +295,7 @@ LSWPoST::train(FILE *ftxt) {
for (iter_right = tags_right.begin(); iter_right != tags_right.end(); ++iter_right) {
if (normalization > ZERO) {
para_matrix_new[*iter_left][*iter_mid][*iter_right] +=
- tdlsw->getD()[*iter_left][*iter_mid][*iter_right] / normalization;
+ tdlsw.getD()[*iter_left][*iter_mid][*iter_right] / normalization;
}
}
}
@@ -374,7 +310,7 @@ LSWPoST::train(FILE *ftxt) {
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
for (int k = 0; k < N; ++k) {
- tdlsw->getD()[i][j][k] = para_matrix_new[i][j][k];
+ tdlsw.getD()[i][j][k] = para_matrix_new[i][j][k];
}
}
}
@@ -383,55 +319,37 @@ LSWPoST::train(FILE *ftxt) {
void
LSWPoST::print_para_matrix() {
wcout << L"para matrix D\n----------------------------\n";
- for (int i = 0; i < tdlsw->getN(); ++i) {
- for (int j = 0; j < tdlsw->getN(); ++j) {
- for (int k = 0; k < tdlsw->getN(); ++k) {
+ for (int i = 0; i < tdlsw.getN(); ++i) {
+ for (int j = 0; j < tdlsw.getN(); ++j) {
+ for (int k = 0; k < tdlsw.getN(); ++k) {
wcout << L"D[" << i << L"][" << j << L"][" << k << L"] = "
- << tdlsw->getD()[i][j][k] << "\n";
+ << tdlsw.getD()[i][j][k] << "\n";
}
}
}
}
void
-LSWPoST::tagger(FILE *in, FILE *out, bool show_all_good_first) {
+LSWPoST::tagger(FILE *Input, FILE *Output, const bool &First) {
TaggerWord *word_left = NULL, *word_mid = NULL, *word_right = NULL;
set<TTag> tags_left, tags_mid, tags_right;
set<TTag>::iterator iter_left, iter_mid, iter_right;
- MorphoStream morpho_stream(in, debug, tdlsw);
+ MorphoStream morpho_stream(Input, debug, &tdlsw);
morpho_stream.setNullFlush(null_flush);
- Collection &output = tdlsw->getOutput();
word_left = new TaggerWord(); // word left
- word_left->add_tag(eos, L"sent", tdlsw->getPreferRules());
+ word_left->add_tag(eos, L"sent", tdlsw.getPreferRules());
word_left->set_show_sf(show_sf);
tags_left = word_left->get_tags(); // tags left
- if (output.has_not(tags_left)) {
- if (debug) {
- wstring errors;
- errors = L"A new ambiguity class was found. I cannot continue.\n";
- errors += L"Word '" + word_left->get_superficial_form() + L"' not found in the dictionary.\n";
- errors += L"New ambiguity class: " + word_left->get_string_tags() + L"\n";
- errors += L"Take a look at the dictionary and at the training corpus. Then, retrain.";
- fatal_error(errors);
- }
- tags_left = find_similar_ambiguity_class(tags_left);
- }
+
+ tags_left = require_similar_ambiguity_class(tdlsw, tags_left, *word_left, debug);
word_mid = morpho_stream.get_next_word(); // word mid
word_mid->set_show_sf(show_sf);
tags_mid = word_mid->get_tags(); // tags mid
- if (output.has_not(tags_mid)) {
- if (debug) {
- wstring errors;
- errors = L"A new ambiguity class was found. I cannot continue.\n";
- errors += L"Word '" + word_mid->get_superficial_form() + L"' not found in the dictionary.\n";
- errors += L"New ambiguity class: " + word_mid->get_string_tags() + L"\n";
- errors += L"Take a look at the dictionary and at the training corpus. Then, retrain.";
- fatal_error(errors);
- }
- tags_mid = find_similar_ambiguity_class(tags_mid);
- }
+
+ tags_mid = require_similar_ambiguity_class(tdlsw, tags_mid, *word_mid, debug);
+
if (morpho_stream.getEndOfFile()) {
delete word_left;
delete word_mid;
@@ -445,17 +363,7 @@ LSWPoST::tagger(FILE *in, FILE *out, bool show_all_good_first) {
while (word_right) {
tags_right = word_right->get_tags();
- if (output.has_not(tags_right)) {
- if (debug) {
- wstring errors;
- errors = L"A new ambiguity class was found. \n";
- errors+= L"Retraining the tagger is necessary so as to take it into account.\n";
- errors+= L"Word '"+word_right->get_superficial_form()+L"'.\n";
- errors+= L"New ambiguity class: "+word_right->get_string_tags()+L"\n";
- fatal_error(errors);
- }
- tags_right = find_similar_ambiguity_class(tags_right);
- }
+ tags_right = require_similar_ambiguity_class(tdlsw, tags_right, *word_right, debug);
double max = -1;
TTag tag_max = *tags_mid.begin();
@@ -463,7 +371,7 @@ LSWPoST::tagger(FILE *in, FILE *out, bool show_all_good_first) {
double n = 0;
for (iter_left = tags_left.begin(); iter_left != tags_left.end(); ++iter_left) {
for (iter_right = tags_right.begin(); iter_right != tags_right.end(); ++iter_right) {
- n += tdlsw->getD()[*iter_left][*iter_mid][*iter_right];
+ n += tdlsw.getD()[*iter_left][*iter_mid][*iter_right];
}
}
if (n > max) {
@@ -472,13 +380,13 @@ LSWPoST::tagger(FILE *in, FILE *out, bool show_all_good_first) {
}
}
- micad = word_mid->get_lexical_form(tag_max, (tdlsw->getTagIndex())[L"TAG_kEOF"]);
- fputws_unlocked(micad.c_str(), out);
+ micad = word_mid->get_lexical_form(tag_max, (tdlsw.getTagIndex())[L"TAG_kEOF"]);
+ fputws_unlocked(micad.c_str(), Output);
if (morpho_stream.getEndOfFile()) {
if (null_flush) {
- fputwc_unlocked(L'\0', out);
+ fputwc_unlocked(L'\0', Output);
}
- fflush(out);
+ fflush(Output);
morpho_stream.setEndOfFile(false);
}
@@ -495,30 +403,3 @@ LSWPoST::tagger(FILE *in, FILE *out, bool show_all_good_first) {
delete word_left;
delete word_mid;
}
-
-set<TTag>
-LSWPoST::find_similar_ambiguity_class(set<TTag> c) {
- int size_ret = -1;
- set<TTag> ret=tdlsw->getOpenClass(); // return open-class as default, if no better is found.
- bool skip_class;
- Collection &output = tdlsw->getOutput();
-
- for(int k=0; k<output.size(); k++) {
- if ((((int)output[k].size())>((int)size_ret)) && (((int)output[k].size())<((int)c.size()))) {
- skip_class=false;
- // Test if output[k] is a subset of class
- for(set<TTag>::const_iterator it=output[k].begin(); it!=output[k].end(); it++) {
- if (c.find(*it)==c.end()) {
- skip_class=true; //output[k] is not a subset of class
- break;
- }
- }
- if (!skip_class) {
- size_ret = output[k].size();
- ret = output[k];
- }
- }
- }
- return ret;
-}
-
diff --git a/apertium/lswpost.h b/apertium/lswpost.h
index f4e99e1..14f452f 100644
--- a/apertium/lswpost.h
+++ b/apertium/lswpost.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/**
* Light Sliding-Window Part of Speech Tagger (LSWPoST) implementation (header)
@@ -25,6 +23,8 @@
#ifndef __LSWPOST_H
#define __LSWPOST_H
+#include "file_tagger.h"
+
#include <cstdio>
#include <fstream>
#include <math.h>
@@ -48,18 +48,26 @@
/** LSWPoST
* Light Sliding-Window Part of Speech Tagger
*/
-class LSWPoST {
+class LSWPoST : public Apertium::FILE_Tagger {
private:
- TaggerDataLSW * tdlsw;
+ TaggerDataLSW tdlsw;
TTag eos; // end-of-sentence tag
- bool debug; //If true, print error messages when tagging input text
- bool show_sf; //If true, print superficial forms when tagging input text
- bool null_flush; //If true, flush on '\0'
public:
+ void deserialise(FILE *Serialised_FILE_Tagger);
+ std::vector<std::wstring> &getArrayTags();
+ void train(FILE *Corpus, unsigned long Count);
+ void serialise(FILE *Stream_);
+ void deserialise(const TaggerData &Deserialised_FILE_Tagger);
+ void init_probabilities_from_tagged_text_(FILE *TaggedCorpus,
+ FILE *UntaggedCorpus);
+ void init_probabilities_kupiec_(FILE *Corpus);
+ LSWPoST();
+ LSWPoST(TaggerDataLSW *tdlsw);
+
/** Constructor
*/
- LSWPoST(TaggerDataLSW *t);
+ LSWPoST(TaggerDataLSW t);
/** Destructor
*/
@@ -69,18 +77,6 @@ public:
* @param t the end-of-sentence tag
*/
void set_eos(TTag t);
-
- /** Used to set the debug flag
- */
- void set_debug(bool d);
-
- /** Used to set the show superficial forms flag
- */
- void set_show_sf(bool sf);
-
- /** Used to set the null_flush flag
- */
- void setNullFlush(bool nf);
/** It reads the expanded dictionary received as a parameter and calculates
* the set of ambiguity classes that the tagger will manage.
@@ -110,15 +106,6 @@ public:
/** Do the tagging
*/
- void tagger(FILE *in, FILE *out, bool show_all_good_first);
-
- /** This method returns a known ambiguity class that is a subset of
- * the one received as a parameter. This is useful when a new
- * ambiguity class is found because of changes in the morphological
- * dictionary used by the MT system.
- * @param c set of tags (ambiguity class)
- * @return a known ambiguity class
- */
- set<TTag> find_similar_ambiguity_class(set<TTag> c);
+ void tagger(FILE *Input, FILE *Output, const bool &First = false);
};
#endif
diff --git a/apertium/modes-header.sh b/apertium/modes-header.sh
deleted file mode 100644
index 44c0793..0000000
--- a/apertium/modes-header.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-
-if [ $# -lt 1 ]
-then echo "USAGE: $(basename $0) <modes file> ";
- exit 1;
-fi
-
-FLEXOPTS=""
-FILE1=$1;
-
-if [ ! -e $1 ]
-then echo "ERROR: '$1' file not found";
- exit 1;
-fi
-
-DIRNAME=$(dirname $1);
-FULLDIRNAME=$(cd $DIRNAME; pwd);
-
-rm -Rf *.mode
-
-if [ ! -d $FULLDIRNAME/modes ]
-then mkdir $FULLDIRNAME/modes
-else rm -Rf $FULLDIRNAME/modes && mkdir $FULLDIRNAME/modes
-fi
-
-FILE1=$FULLDIRNAME/$(basename $1)
-cd $FULLDIRNAME/modes
-
-if [ $# -eq 2 ]; then
- PREFIX=$2;
- FULLDIRNAME=$APERTIUMDIR"/"$PREFIX;
-fi
-
diff --git a/apertium/modes.dtd b/apertium/modes.dtd
index 4d91930..8f9d92c 100644
--- a/apertium/modes.dtd
+++ b/apertium/modes.dtd
@@ -1,30 +1,29 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-<!--
- Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright (C) 2005-2016 Universitat d'Alacant / Universidad de Alicante
+
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
-
+
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
-
+
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA.
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
- DTD for the modes.xml file
--->
+ DTD for the modes.xml file
+-->
<!ELEMENT modes (mode+)>
<!ELEMENT mode (pipeline)>
<!ATTLIST mode name ID #REQUIRED>
<!ATTLIST mode install CDATA #IMPLIED>
+<!ATTLIST mode gendebug CDATA #IMPLIED>
<!ELEMENT pipeline (program+)>
diff --git a/apertium/modes.rnc b/apertium/modes.rnc
new file mode 100644
index 0000000..88f42fb
--- /dev/null
+++ b/apertium/modes.rnc
@@ -0,0 +1,30 @@
+# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+# RELAX NG (compact syntax) schema for the modes.xml file
+
+modes = element modes { attlist.modes, mode+ }
+attlist.modes &= empty
+# A mode wraps exactly one pipeline. Its attributes mirror modes.dtd:
+# a required ID "name" plus the optional "install" and "gendebug" flags.
+mode = element mode { attlist.mode, pipeline }
+attlist.mode &= attribute name { xsd:ID }
+attlist.mode &= attribute install { text }?
+attlist.mode &= attribute gendebug { text }?
+pipeline = element pipeline { attlist.pipeline, program+ }
+attlist.pipeline &= empty
+program = element program { attlist.program, file* }
+attlist.program &= attribute name { text }
+attlist.program &= attribute prefix { text }?
+file = element file { attlist.file, empty }
+attlist.file &= attribute name { text }
+start = modes
diff --git a/apertium/modes.rng b/apertium/modes.rng
index 62695c2..d3ec5fa 100644
--- a/apertium/modes.rng
+++ b/apertium/modes.rng
@@ -1,5 +1,22 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
-<!-- DTD for the modes.xml file -->
+<!--
+ Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+ DTD for the modes.xml file
+-->
<grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
<define name="modes">
<element name="modes">
diff --git a/apertium/modes2bash.xsl b/apertium/modes2bash.xsl
index f03decc..0017281 100644
--- a/apertium/modes2bash.xsl
+++ b/apertium/modes2bash.xsl
@@ -1,6 +1,6 @@
-<?xml version="1.0" encoding="ISO-8859-1"?><!-- -*- nxml -*- -->
+<?xml version="1.0" encoding="UTF-8"?><!-- -*- nxml -*- -->
<!--
- Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+ Copyright (C) 2005-2014 Universitat d'Alacant / Universidad de Alicante
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
@@ -13,56 +13,83 @@
General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA.
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
-<xsl:output method="text" encoding="ISO-8859-1" indent="no"/>
+<xsl:output method="text" encoding="UTF-8" indent="no"/>
-<xsl:param name="prefix"/>
-<xsl:param name="dataprefix"/>
+<xsl:param name="installdir"/>
+<xsl:param name="devdir"/>
<xsl:template match="modes">
- <xsl:for-each select="./mode">
- ----<xsl:value-of select="./@name"/>:<xsl:choose><xsl:when test="@install = 'yes'">install:yes</xsl:when><xsl:otherwise>install:no</xsl:otherwise></xsl:choose>----
- <xsl:apply-templates/>
- </xsl:for-each>
+ <xsl:apply-templates/>
</xsl:template>
+
+<!-- Print filenames first, then the contents of each file. This gets
+ parsed by apertium-createmodes.awk which splits it into the
+ specific files. -->
<xsl:template match="mode">
- <xsl:apply-templates/>
+ <xsl:choose>
+ <xsl:when test="@install = 'yes'">
+ <xsl:text>
+# </xsl:text>
+ <xsl:value-of select="./@name"/>
+ <xsl:text>.mode
+ </xsl:text>
+ <xsl:apply-templates>
+ <xsl:with-param name="dir"><xsl:value-of select="$installdir"/></xsl:with-param>
+ </xsl:apply-templates>
+ </xsl:when>
+ </xsl:choose>
+ <xsl:variable name="dir">
+ <xsl:text>'</xsl:text>
+ <xsl:value-of select="$devdir"/>
+ <xsl:text>'</xsl:text>
+ </xsl:variable>
+ <xsl:text>
+# modes/</xsl:text>
+ <xsl:value-of select="./@name"/>
+ <xsl:text>.mode
+ </xsl:text>
+ <xsl:apply-templates>
+ <xsl:with-param name="dir"><xsl:value-of select="$devdir"/></xsl:with-param>
+ </xsl:apply-templates>
</xsl:template>
+
<xsl:template match="pipeline">
+ <xsl:param name="dir" />
<xsl:for-each select="./*">
<xsl:if test="not(position()=1)">
- <xsl:value-of select="string('|')"/>
+ <xsl:text>| </xsl:text>
</xsl:if>
- <xsl:apply-templates select="."/>
+ <xsl:apply-templates select=".">
+ <xsl:with-param name="dir"><xsl:value-of select="$dir"/></xsl:with-param>
+ </xsl:apply-templates>
</xsl:for-each>
</xsl:template>
<xsl:template match="program">
- <xsl:choose>
- <xsl:when test="@prefix">
- <xsl:value-of select="@prefix"/>
- <xsl:value-of select="string('/')"/>
- </xsl:when>
- </xsl:choose>
+ <xsl:param name="dir" />
<xsl:value-of select="./@name"/>
- <xsl:for-each select="./*">
- <xsl:value-of select="string(' ')"/>
- <xsl:apply-templates select="."/>
+ <xsl:for-each select="./*">
+ <xsl:text> </xsl:text>
+ <xsl:apply-templates select=".">
+ <xsl:with-param name="dir"><xsl:value-of select="$dir"/></xsl:with-param>
+ </xsl:apply-templates>
</xsl:for-each>
</xsl:template>
<xsl:template match="file">
- <xsl:value-of select="$dataprefix"/>
- <xsl:value-of select="string('/')"/>
+ <xsl:param name="dir" />
+ <xsl:text>'</xsl:text>
+ <xsl:value-of select="$dir" />
+ <xsl:text>/</xsl:text>
<xsl:value-of select="./@name"/>
- <xsl:value-of select="string(' ')"/>
+ <xsl:text>' </xsl:text>
</xsl:template>
+
</xsl:stylesheet>
diff --git a/apertium/modes2debugmodes.xsl b/apertium/modes2debugmodes.xsl
new file mode 100644
index 0000000..256b4f8
--- /dev/null
+++ b/apertium/modes2debugmodes.xsl
@@ -0,0 +1,162 @@
+<?xml version="1.0" encoding="UTF-8"?> <!-- -*- nxml -*- -->
+<!--
+Copyright (C) 2016 Universitat d'Alacant / Universidad de Alicante
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, see <http://www.gnu.org/licenses/>.
+-->
+<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:output method="xml" encoding="UTF-8"/>
+
+ <xsl:template name="replaceString">
+ <xsl:param name="haystack"/>
+ <xsl:param name="needle"/>
+ <xsl:param name="replacement"/>
+ <xsl:choose>
+ <xsl:when test="contains($haystack, $needle)">
+ <xsl:value-of select="substring-before($haystack, $needle)"/>
+ <xsl:value-of select="$replacement"/>
+ <xsl:call-template name="replaceString">
+ <xsl:with-param name="haystack"
+ select="substring-after($haystack, $needle)"/>
+ <xsl:with-param name="needle" select="$needle"/>
+ <xsl:with-param name="replacement" select="$replacement"/>
+ </xsl:call-template>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:value-of select="$haystack"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+ <xsl:template match="mode[not(@gendebug='yes')]">
+ <!-- Output these unchanged; don't apply templates here -->
+ <xsl:copy-of select="."/>
+ </xsl:template>
+
+ <xsl:template match="mode[@gendebug='yes']">
+ <xsl:comment>
+ <xsl:text> **************** </xsl:text>
+ <xsl:value-of select="./@name"/>
+ <xsl:text>: **************** </xsl:text>
+ </xsl:comment>
+ <xsl:copy-of select="."/>
+ <xsl:apply-templates select="./pipeline/program"/>
+ </xsl:template>
+
+ <xsl:template name="debugSuffix">
+ <xsl:param name="progname"/>
+ <xsl:variable name="p" select="normalize-space($progname)"/>
+ <!-- TODO: We also need to know what names have been used already
+ to make them unique! Might be easier to do uniquifying
+ outside XSLT -->
+ <xsl:choose>
+ <xsl:when test="starts-with($p, 'cg-proc')">
+ <xsl:text>-disam</xsl:text>
+ </xsl:when>
+ <xsl:when test="starts-with($p, 'apertium-tagger')">
+ <xsl:text>-tagger</xsl:text>
+ </xsl:when>
+ <xsl:when test="starts-with($p, 'apertium-pretransfer')">
+ <xsl:text>-pretransfer</xsl:text>
+ </xsl:when>
+ <xsl:when test="starts-with($p, 'lrx-proc')">
+ <xsl:text>-lex</xsl:text>
+ </xsl:when>
+ <xsl:when test="starts-with($p, 'apertium-transfer')">
+ <xsl:text>-chunker</xsl:text>
+ </xsl:when>
+ <xsl:when test="starts-with($p, 'apertium-interchunk')">
+ <xsl:text>-interchunk</xsl:text>
+ </xsl:when>
+ <xsl:when test="starts-with($p, 'apertium-postchunk')">
+ <xsl:text>-postchunk</xsl:text>
+ </xsl:when>
+ <xsl:when test="contains($p, '$1')">
+ <xsl:text>-dgen</xsl:text>
+ </xsl:when>
+ <xsl:when test="starts-with($p, 'lt-proc') and contains($p, '-b')">
+ <xsl:text>-biltrans</xsl:text>
+ </xsl:when>
+ <xsl:when test="starts-with($p, 'lt-proc') and contains($p, '-p')">
+ <xsl:text>-pgen</xsl:text>
+ </xsl:when>
+ <xsl:when test="starts-with($p, 'lt-proc')">
+ <xsl:text>-morph</xsl:text>
+ </xsl:when>
+ <xsl:when test="starts-with($p, 'hfst-proc')">
+ <xsl:text>-morph</xsl:text>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:text>-NAMEME</xsl:text>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+ <xsl:template name="traceOpt">
+ <xsl:param name="progname"/>
+ <xsl:variable name="p" select="normalize-space($progname)"/>
+ <xsl:choose>
+ <xsl:when test="starts-with($p, 'cg-proc')">
+ <xsl:text> -t</xsl:text>
+ </xsl:when>
+ <xsl:when test="starts-with($p, 'lrx-proc')">
+ <xsl:text> -t</xsl:text>
+ </xsl:when>
+ <xsl:when test="starts-with($p, 'apertium-transfer')">
+ <xsl:text> -t</xsl:text>
+ </xsl:when>
+ <xsl:when test="starts-with($p, 'apertium-interchunk')">
+ <xsl:text> -t</xsl:text>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:text></xsl:text>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+ <xsl:template match="program">
+ <mode install="no">
+ <xsl:attribute name="name">
+ <xsl:value-of select="../../@name"/>
+ <xsl:call-template name="debugSuffix">
+ <xsl:with-param name="progname" select="./@name"/>
+ </xsl:call-template>
+ </xsl:attribute>
+ <pipeline>
+ <xsl:copy-of select="./preceding-sibling::*"/>
+ <program>
+ <xsl:attribute name="name">
+ <xsl:call-template name="replaceString">
+ <xsl:with-param name="haystack" select="./@name"/>
+ <xsl:with-param name="needle" select="'$1'"/>
+ <xsl:with-param name="replacement" select="'-d'"/>
+ </xsl:call-template>
+ <xsl:call-template name="traceOpt">
+ <xsl:with-param name="progname" select="./@name"/>
+ </xsl:call-template>
+ </xsl:attribute>
+ <xsl:copy-of select="./*"/>
+ </program>
+ </pipeline>
+ </mode>
+ </xsl:template>
+
+ <!-- catch-all -->
+ <xsl:template match="@* | node()">
+ <xsl:copy>
+ <xsl:apply-templates select="@* | node()"/>
+ </xsl:copy>
+ </xsl:template>
+
+</xsl:stylesheet>
diff --git a/apertium/morpheme.cc b/apertium/morpheme.cc
new file mode 100644
index 0000000..1344c56
--- /dev/null
+++ b/apertium/morpheme.cc
@@ -0,0 +1,57 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "morpheme.h"
+
+#include "exception.h"
+#include "tag.h"
+
+#include <string>
+#include <vector>
+
+namespace Apertium {
+bool operator==(const Morpheme &a, const Morpheme &b) {
+ return a.TheLemma == b.TheLemma && a.TheTags == b.TheTags;
+}
+
+bool operator<(const Morpheme &a, const Morpheme &b) {
+ if (a.TheLemma != b.TheLemma)
+ return a.TheLemma < b.TheLemma;
+
+ return a.TheTags < b.TheTags;
+}
+
+Morpheme::operator std::wstring() const {
+ if (TheTags.empty())
+ throw Exception::Morpheme::TheTags_empty("can't convert Morpheme "
+ "comprising empty Tag std::vector "
+ "to std::wstring");
+
+ if (TheLemma.empty())
+ throw Exception::Morpheme::TheLemma_empty("can't convert Morpheme "
+ "comprising empty TheLemma "
+ "std::wstring to std::wstring");
+
+ std::wstring wstring_ = TheLemma;
+
+ for (std::vector<Tag>::const_iterator Tag_ = TheTags.begin();
+ // Call .end() each iteration to save memory.
+ Tag_ != TheTags.end(); ++Tag_) {
+ wstring_ += static_cast<std::wstring>(*Tag_);
+ }
+
+ return wstring_;
+}
+}
diff --git a/apertium/morpheme.h b/apertium/morpheme.h
new file mode 100644
index 0000000..8329941
--- /dev/null
+++ b/apertium/morpheme.h
@@ -0,0 +1,35 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef MORPHEME_H
+#define MORPHEME_H
+
+#include "tag.h"
+
+#include <string>
+#include <vector>
+
+namespace Apertium {
+class Morpheme {
+public:
+ friend bool operator==(const Morpheme &a, const Morpheme &b);
+ friend bool operator<(const Morpheme &a, const Morpheme &b);
+ operator std::wstring() const;
+ std::wstring TheLemma;
+ std::vector<Tag> TheTags;
+};
+}
+
+#endif // MORPHEME_H
diff --git a/apertium/morpho_stream.cc b/apertium/morpho_stream.cc
index 843d8af..5ae441b 100644
--- a/apertium/morpho_stream.cc
+++ b/apertium/morpho_stream.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/**
* Word class and MorphoStream class definitions
diff --git a/apertium/morpho_stream.h b/apertium/morpho_stream.h
index 2f499e5..5cd85c0 100644
--- a/apertium/morpho_stream.h
+++ b/apertium/morpho_stream.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/**
* Word class and MorphoStream class definitions
diff --git a/apertium/new2old.xsl b/apertium/new2old.xsl
index 1174230..cf5f9a2 100644
--- a/apertium/new2old.xsl
+++ b/apertium/new2old.xsl
@@ -13,9 +13,7 @@
General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA.
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
-->
<!--
==========================================================================
diff --git a/apertium/optional.h b/apertium/optional.h
new file mode 100644
index 0000000..9ca4fac
--- /dev/null
+++ b/apertium/optional.h
@@ -0,0 +1,123 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef OPTIONAL_H
+#define OPTIONAL_H
+
+#include "exception.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <exception>
+#include <new>
+
+namespace Apertium {
+template <typename OptionalType> class Optional;
+
+template <typename OptionalType>
+void swap(Optional<OptionalType> &A, Optional<OptionalType> &B);
+
+template <typename OptionalType> class Optional {
+public:
+ friend void swap<OptionalType>(Optional &A, Optional &B);
+ Optional();
+ Optional(const OptionalType &OptionalType_);
+ Optional(const Optional &Optional_);
+ Optional &operator=(Optional Optional_);
+ ~Optional();
+ const OptionalType &operator*() const;
+ OptionalType &operator*();
+ const OptionalType *operator->() const;
+ OptionalType *operator->();
+ operator bool() const;
+
+private:
+ OptionalType *TheOptionalTypePointer;
+};
+
+template <typename OptionalType>
+void swap(Optional<OptionalType> &A, Optional<OptionalType> &B) {
+ using std::swap;
+ swap(A.TheOptionalTypePointer, B.TheOptionalTypePointer);
+}
+
+template <typename OptionalType>
+Optional<OptionalType>::Optional()
+ : TheOptionalTypePointer(NULL) {}
+
+template <typename OptionalType>
+Optional<OptionalType>::Optional(const OptionalType &OptionalType_)
+ : TheOptionalTypePointer(new OptionalType(OptionalType_)) {}
+
+template <typename OptionalType>
+Optional<OptionalType>::Optional(const Optional &Optional_) {
+ if (Optional_.TheOptionalTypePointer == NULL) {
+ TheOptionalTypePointer = NULL;
+ return;
+ }
+
+ TheOptionalTypePointer =
+ new OptionalType(*(Optional_.TheOptionalTypePointer));
+}
+
+template <typename OptionalType>
+Optional<OptionalType> &Optional<OptionalType>::operator=(Optional Optional_) {
+ swap(*this, Optional_);
+ return *this;
+}
+
+template <typename OptionalType> Optional<OptionalType>::~Optional() {
+ if (TheOptionalTypePointer == NULL)
+ return;
+
+ delete TheOptionalTypePointer;
+}
+
+template <typename OptionalType>
+const OptionalType &Optional<OptionalType>::operator*() const {
+ if (TheOptionalTypePointer == NULL)
+ throw Exception::Optional::TheOptionalTypePointer_null(
+ "can't dereference Optional comprising null OptionalType pointer");
+
+ return *TheOptionalTypePointer;
+}
+
+template <typename OptionalType>
+OptionalType &Optional<OptionalType>::operator*() {
+ return const_cast<OptionalType &>(
+ static_cast<const Optional &>(*this).operator*());
+}
+
+template <typename OptionalType>
+const OptionalType *Optional<OptionalType>::operator->() const {
+ if (TheOptionalTypePointer == NULL)
+ throw Exception::Optional::TheOptionalTypePointer_null(
+ "can't dereference Optional comprising null OptionalType pointer");
+
+ return TheOptionalTypePointer;
+}
+
+template <typename OptionalType>
+OptionalType *Optional<OptionalType>::operator->() {
+ return const_cast<OptionalType *>(
+ static_cast<const Optional &>(*this).operator->());
+}
+
+template <typename OptionalType> Optional<OptionalType>::operator bool() const {
+ return TheOptionalTypePointer != NULL;
+}
+}
+
+#endif
diff --git a/apertium/postchunk.cc b/apertium/postchunk.cc
index f0d91b6..39ff7c9 100644
--- a/apertium/postchunk.cc
+++ b/apertium/postchunk.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/postchunk.h>
#include <apertium/trx_reader.h>
@@ -34,11 +32,6 @@ using namespace Apertium;
using namespace std;
void
-Postchunk::copy(Postchunk const &o)
-{
-}
-
-void
Postchunk::destroy()
{
if(me)
@@ -53,7 +46,15 @@ Postchunk::destroy()
}
}
-Postchunk::Postchunk()
+Postchunk::Postchunk() :
+word(0),
+blank(0),
+lword(0),
+lblank(0),
+output(0),
+any_char(0),
+any_tag(0),
+nwords(0)
{
me = NULL;
doc = NULL;
@@ -69,22 +70,6 @@ Postchunk::~Postchunk()
destroy();
}
-Postchunk::Postchunk(Postchunk const &o)
-{
- copy(o);
-}
-
-Postchunk &
-Postchunk::operator =(Postchunk const &o)
-{
- if(this != &o)
- {
- destroy();
- copy(o);
- }
- return *this;
-}
-
void
Postchunk::readData(FILE *in)
{
@@ -743,6 +728,11 @@ Postchunk::processCallMacro(xmlNode *localroot)
break;
}
}
+
+ if (npar <= 0)
+ {
+ throw "Postchunk::processCallMacro() assumes npar > 0, but got npar <= 0";
+ }
InterchunkWord **myword = NULL;
if(npar > 0)
@@ -764,6 +754,9 @@ Postchunk::processCallMacro(xmlNode *localroot)
if(i->type == XML_ELEMENT_NODE)
{
int pos = atoi((const char *) i->properties->children->content);
+ if(!checkIndex(localroot, pos, lword)) {
+ pos=1; // for a rule to match, there has to be at least one word, so should be safe
+ }
myword[idx] = word[pos];
if(blank)
{
@@ -791,14 +784,8 @@ Postchunk::processCallMacro(xmlNode *localroot)
swap(myblank, blank);
swap(npar, lword);
- if(myword)
- {
- delete[] myword;
- }
- if(myblank)
- {
- delete[] myblank;
- }
+ delete[] myword;
+ delete[] myblank;
}
void
@@ -1654,7 +1641,7 @@ Postchunk::applyRule()
splitWordsAndBlanks(chunk, tmpword, tmpblank);
word = new InterchunkWord *[tmpword.size()+1];
- lword = tmpword.size()+1;
+ lword = tmpword.size();
word[0] = new InterchunkWord(UtfConverter::toUtf8(wordzero(chunk)));
for(unsigned int i = 1, limit = tmpword.size()+1; i != limit; i++)
@@ -1664,7 +1651,7 @@ Postchunk::applyRule()
if(limit != 2)
{
blank = new string *[limit - 2];
- lblank = limit - 2;
+ lblank = limit - 3;
}
else
{
@@ -1689,7 +1676,7 @@ Postchunk::applyRule()
{
delete word[i];
}
- delete word;
+ delete[] word;
}
if(blank)
{
@@ -1697,7 +1684,7 @@ Postchunk::applyRule()
{
delete blank[i];
}
- delete blank;
+ delete[] blank;
}
word = NULL;
blank = NULL;
diff --git a/apertium/postchunk.dtd b/apertium/postchunk.dtd
index 6288a6a..08f1bcc 100644
--- a/apertium/postchunk.dtd
+++ b/apertium/postchunk.dtd
@@ -13,9 +13,7 @@
General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA.
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
Draft of DTD for the structural transfer rule files
diff --git a/apertium/postchunk.h b/apertium/postchunk.h
index a2c094a..adb76c8 100644
--- a/apertium/postchunk.h
+++ b/apertium/postchunk.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _POSTCHUNK_
#define _POSTCHUNK_
@@ -74,7 +72,6 @@ private:
bool null_flush;
bool internal_null_flush;
- void copy(Postchunk const &o);
void destroy();
void readData(FILE *input);
void readPostchunk(string const &input);
@@ -131,14 +128,11 @@ private:
public:
Postchunk();
~Postchunk();
- Postchunk(Postchunk const &o);
- Postchunk & operator =(Postchunk const &o);
void read(string const &transferfile, string const &datafile);
void postchunk(FILE *in, FILE *out);
bool getNullFlush(void);
void setNullFlush(bool null_flush);
-
};
#endif
diff --git a/apertium/postchunk.rnc b/apertium/postchunk.rnc
new file mode 100644
index 0000000..9518704
--- /dev/null
+++ b/apertium/postchunk.rnc
@@ -0,0 +1,348 @@
+# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+# Draft of DTD for the structural transfer rule files
+#
+# Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada,
+# 2005.07.29.
+
+condition =
+ and
+ | or
+ | not
+ | equal
+ | begins-with
+ | begins-with-list
+ | ends-with
+ | ends-with-list
+ | contains-substring
+ | in
+container = var | clip
+sentence = let | out | choose | modify-case | call-macro | append
+value =
+ b
+ | clip
+ | lit
+ | lit-tag
+ | var
+ | get-case-from
+ | case-of
+ | concat
+ | lu-count
+ | lu
+ | mlu
+stringvalue = clip | lit | var | get-case-from | case-of | lu-count
+postchunk =
+ element postchunk {
+ attlist.postchunk,
+ section-def-cats,
+ section-def-attrs,
+ section-def-vars,
+ section-def-lists?,
+ section-def-macros?,
+ section-rules
+ }
+attlist.postchunk &= empty
+# 'postchunk' is the root element containing the whole structural
+# postchunk rule file.
+section-def-cats =
+ element section-def-cats { attlist.section-def-cats, def-cat+ }
+attlist.section-def-cats &= empty
+# The 'def-cats' section defines the categories used to build the
+# patterns used in rules
+def-cat = element def-cat { attlist.def-cat, cat-item+ }
+attlist.def-cat &=
+ attribute n { xsd:ID },
+ attribute c { text }?
+# Each 'def-cat' defines one category in terms of a list of
+# category items and has a unique name 'n', which is mandatory
+cat-item = element cat-item { attlist.cat-item, empty }
+attlist.cat-item &= attribute name { text }
+# In addition, a required attribute, "name", is used to specify
+# which chunk name is detected by this cat-item
+section-def-attrs =
+ element section-def-attrs { attlist.section-def-attrs, def-attr+ }
+attlist.section-def-attrs &= empty
+# The 'def-attrs' section defines the attributes that will be
+# identified in matched lexical forms
+def-attr = element def-attr { attlist.def-attr, attr-item+ }
+attlist.def-attr &=
+ attribute n { xsd:ID },
+ attribute c { text }?
+# Each def-attr defines one attribute in terms of a list of
+# attribute items and has a mandatory unique name n
+attr-item = element attr-item { attlist.attr-item, empty }
+attlist.attr-item &=
+ attribute tags { text }?,
+ attribute c { text }?
+# Each 'attr-item' specifies a subsequence of the tags in
+# that lexical form (attribute 'tags')
+section-def-vars =
+ element section-def-vars { attlist.section-def-vars, def-var+ }
+attlist.section-def-vars &= empty
+# The 'def-vars' section defines the global variables
+# that will be used to transfer information between rules
+def-var = element def-var { attlist.def-var, empty }
+attlist.def-var &=
+ attribute n { xsd:ID },
+ attribute v { text }?,
+ attribute c { text }?
+# The definition of a global variable has a mandatory unique name 'n' that
+# will be used to refer to it. An initial value can also be specified
+# by means of the 'v' attribute. The default initial value is the
+# empty string.
+section-def-lists =
+ element section-def-lists { attlist.section-def-lists, def-list+ }
+attlist.section-def-lists &= empty
+# Element 'section-def-lists' encloses a set of list definitions
+def-list = element def-list { attlist.def-list, list-item+ }
+attlist.def-list &=
+ attribute n { xsd:ID },
+ attribute c { text }?
+# The 'def-list' element defines a named list to search with the 'in'
+# element. Attribute 'n' sets the name of the list
+list-item = element list-item { attlist.list-item, empty }
+attlist.list-item &=
+ attribute v { text },
+ attribute c { text }?
+# Attribute 'v' of 'list-item' element contains the value to be added to
+# the list being defined
+section-def-macros =
+ element section-def-macros { attlist.section-def-macros, def-macro+ }
+attlist.section-def-macros &= empty
+#
+# The 'def-macros' section defines macros containing portions of
+# code frequently used in the action part of rules
+#
+def-macro = element def-macro { attlist.def-macro, sentence+ }
+attlist.def-macro &= attribute n { xsd:ID }
+attlist.def-macro &=
+ attribute npar { text },
+ attribute c { text }?
+# Macro definition:
+#
+# A macro has a mandatory name (the value of 'n'), a number of parameters
+# (the value of 'npar') and a body containing arguments and statements.
+section-rules = element section-rules { attlist.section-rules, rule+ }
+attlist.section-rules &= empty
+# The rules section contains a sequence of one or more rules
+rule = element rule { attlist.rule, pattern, action }
+attlist.rule &= attribute comment { text }?
+# Each rule has a pattern and an action
+# * Attribute 'comment' allows a comment to be included with the rule
+pattern = element pattern { attlist.pattern, pattern-item }
+attlist.pattern &= empty
+# The pattern is specified in terms of pattern items, each one
+# representing a lexical form in the matched pattern
+pattern-item = element pattern-item { attlist.pattern-item, empty }
+attlist.pattern-item &= attribute n { xsd:IDREF }
+# Each attribute to be activated is referred to by its name in the def-cats section
+action = element action { attlist.action, sentence* }
+attlist.action &= attribute c { text }?
+# Encloses the procedural part of a rule
+choose = element choose { attlist.choose, when+, otherwise? }
+attlist.choose &= attribute c { text }?
+# The choose statement is a selection statement (similar to a case
+# statement) composed of one or more tested cases and an optional
+# otherwise
+when = element when { attlist.when, test, sentence* }
+attlist.when &= attribute c { text }?
+# Each tested case is a block of zero or more statements
+otherwise = element otherwise { attlist.otherwise, sentence+ }
+attlist.otherwise &= attribute c { text }?
+# The otherwise case is also a block of one or more statements
+test = element test { attlist.test, condition }
+attlist.test &= attribute c { text }?
+# The test in a tested case may be a conjunction, a disjunction, or
+# a negation of simpler tests, as well as a simple equality test
+and = element and { attlist.and, condition, condition+ }
+attlist.and &= empty
+# Each conjunction test contains two or more simpler tests
+or = element or { attlist.or, condition, condition+ }
+attlist.or &= empty
+# Each disjunction test contains two or more simpler tests
+not = element not { attlist.not, condition }
+attlist.not &= empty
+# The negation of a simpler test is a test itself
+equal = element equal { attlist.equal, value, value }
+attlist.equal &= attribute caseless { "no" | "yes" }?
+# The simplest test is an equality test. The right part and the
+# left part of the equality may both be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+begins-with = element begins-with { attlist.begins-with, value, value }
+attlist.begins-with &= attribute caseless { "no" | "yes" }?
+# Tests if the left part contains the right part at the beginning.
+# Both parts of the test may both be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+ends-with = element ends-with { attlist.ends-with, value, value }
+attlist.ends-with &= attribute caseless { "no" | "yes" }?
+# Tests if the left part contains the right part at the end.
+# Both parts of the test may both be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+begins-with-list =
+ element begins-with-list { attlist.begins-with-list, value, \list }
+attlist.begins-with-list &= attribute caseless { "no" | "yes" }?
+# Tests if the left part contains the right part at the beginning.
+# The first part of the test may be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. The second part
+# must always be a list. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+ends-with-list =
+ element ends-with-list { attlist.ends-with-list, value, \list }
+attlist.ends-with-list &= attribute caseless { "no" | "yes" }?
+# Tests if the left part contains the right part at the end.
+# The first part of the test may be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. The second part
+# must always be a list. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+contains-substring =
+ element contains-substring {
+ attlist.contains-substring, value, value
+ }
+attlist.contains-substring &= attribute caseless { "no" | "yes" }?
+# Tests if the left part contains the right part.
+# Both parts of the test may both be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+in = element in { attlist.in, value, \list }
+attlist.in &= attribute caseless { "no" | "yes" }?
+# 'in' performs a search of a value in a list. If 'caseless' is set to yes,
+# this search is performed without attending to the case
+\list = element list { attlist.list, empty }
+attlist.list &= attribute n { xsd:IDREF }
+# 'list' refers, by the name in attribute 'n', to a list defined earlier in
+# the 'section-def-list' section
+let = element let { attlist.let, container, value }
+attlist.let &= empty
+# An assignment statement ('let') assigns the value of a clip (see
+# below), a literal string ('lit'), a literal tag('lit-tag') or the
+# value of a global variable ('var') to either a global variable ('var')
+# or a clip
+append = element append { attlist.append, value+ }
+attlist.append &= attribute n { xsd:IDREF }
+# This instruction appends the value of a clip (see
+# below), a literal string ('lit'), a literal tag('lit-tag') or the
+# value of a global variable ('var') to either a global variable ('var')
+# or a clip, identified by the "n" attribute
+out = element out { attlist.out, (b | lu | mlu | var)+ }
+attlist.out &= attribute c { text }?
+# 'out' is an output statement; it may output blanks or chunks
+modify-case =
+ element modify-case { attlist.modify-case, container, stringvalue }
+attlist.modify-case &= empty
+# The first argument of 'modify-case' copies the case of the second
+# argument.
+call-macro = element call-macro { attlist.call-macro, with-param* }
+attlist.call-macro &= attribute n { xsd:IDREF }
+# A macro may be called anywhere by name with one or more
+# arguments
+with-param = element with-param { attlist.with-param, empty }
+attlist.with-param &= attribute pos { text }
+# The attribute pos in each argument is used to refer to a lexical
+# form in the current rule. For example, if a 2-parameter macro
+# has been defined to perform noun-adjective agreement operations,
+# it may be used with arguments 1 and 2 in a noun-adjective rule,
+# with arguments 2, 3 and 1 in a determiner-noun-adjective rule, with
+# arguments 1 and 3 in a noun-adverb-adjective rule, and with
+# arguments 2 and 1 in an adjective-noun rule
+clip = element clip { attlist.clip, empty }
+attlist.clip &=
+ attribute pos { text },
+ attribute part { text },
+ attribute c { text }?
+# A 'clip' is a substring of a source-language or target-language
+# lexical form, extracted according to an attribute:
+#
+# * 'pos' is an index (1, 2, 3...) used to select a lexical form
+# inside the rule;
+#
+# * the value of 'part' is the name of an attribute defined in
+# def-attrs, but may take also the values 'lem' (referring to
+# the lemma of the lexical form), 'lemh' (lemma head), 'lemq'
+# (lemma queue) and 'whole' (referring to the whole lexical form).
+#
+lit = element lit { attlist.lit, empty }
+attlist.lit &= attribute v { text }
+# A literal string value: the value of the literal is the value of
+# the 'v' attribute
+lit-tag = element lit-tag { attlist.lit-tag, empty }
+attlist.lit-tag &= attribute v { text }
+# A literal tag value: the value of the tag is the value of
+# the 'v' attribute
+var = element var { attlist.var, empty }
+attlist.var &= attribute n { xsd:IDREF }
+# Each 'var' is a variable identifier: the attribute n is the name
+# of the variable. When it is in an 'out', a 'test', or the right
+# part of a 'let', it represents the value of the variable; when in
+# the left part of a 'let' it represents the reference of the
+# variable.
+get-case-from =
+ element get-case-from { attlist.get-case-from, (clip | lit | var) }
+attlist.get-case-from &= attribute pos { text }
+# Warning: all comments that involve get-case-from still need to be
+# updated
+case-of = element case-of { attlist.case-of, empty }
+attlist.case-of &=
+ attribute pos { text },
+ attribute part { text }
+# A 'case-of' is a value representing the case of a "clip". This value
+# will be "aa" (all lowercase), "Aa" (first uppercase) or "AA"
+# (all uppercase).
+#
+# * 'pos' is an index (1, 2, 3...) used to select a lexical form
+# inside the rule;
+#
+# * the value of 'part' is the name of an attribute defined in
+# def-attrs, but may take also the values 'lem' (referring to
+# the lemma of the lexical form), 'lemh' (lemma head), 'lemq'
+# (lemma queue) and 'whole' (referring to the whole lexical form).
+concat = element concat { attlist.concat, value+ }
+attlist.concat &= empty
+# Concatenates a sequence of values
+mlu = element mlu { attlist.mlu, lu+ }
+attlist.mlu &= empty
+# Encloses a multiword
+lu = element lu { attlist.lu, value+ }
+attlist.lu &= empty
+# Encloses a word
+b = element b { attlist.b, empty }
+attlist.b &= attribute pos { text }?
+# 'b' is a [super]blanks item, indexed by pos; for example, a 'b'
+# with pos="2" refers to the [super]blanks (including format data
+# encapsulated by the de-formatter) between lexical form 2 and
+# lexical form 3. Managing [super]blanks explicitly allows for the
+# correct placement of format when the result of structural
+# transfer has more or less lexical items than the original or has
+# been reordered in some way. If attribute "pos" is not specified, then
+# a single blank (ASCII 32) is generated.
+lu-count = element lu-count { attlist.lu-count, empty }
+attlist.lu-count &= empty
+start = postchunk
+# Number of lexical units (words inside the chunk) in the rule
diff --git a/apertium/transfer.rng b/apertium/postchunk.rng
similarity index 84%
copy from apertium/transfer.rng
copy to apertium/postchunk.rng
index b78f94d..9cbc1af 100644
--- a/apertium/transfer.rng
+++ b/apertium/postchunk.rng
@@ -1,9 +1,24 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<!--
- Draft of DTD for the structural transfer rule files
+ Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
- Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada,
- 2005.07.29.
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+ Draft of DTD for the structural transfer rule files
+
+ Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada,
+ 2005.07.29.
-->
<grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
<define name="condition">
@@ -46,6 +61,9 @@
<ref name="get-case-from"/>
<ref name="case-of"/>
<ref name="concat"/>
+ <ref name="lu-count"/>
+ <ref name="lu"/>
+ <ref name="mlu"/>
</choice>
</define>
<define name="stringvalue">
@@ -55,11 +73,12 @@
<ref name="var"/>
<ref name="get-case-from"/>
<ref name="case-of"/>
+ <ref name="lu-count"/>
</choice>
</define>
- <define name="transfer">
- <element name="transfer">
- <ref name="attlist.transfer"/>
+ <define name="postchunk">
+ <element name="postchunk">
+ <ref name="attlist.postchunk"/>
<ref name="section-def-cats"/>
<ref name="section-def-attrs"/>
<ref name="section-def-vars"/>
@@ -72,21 +91,12 @@
<ref name="section-rules"/>
</element>
</define>
- <define name="attlist.transfer" combine="interleave">
- <optional>
- <attribute name="default">
- <choice>
- <value>lu</value>
- <value>chunk</value>
- </choice>
- </attribute>
- </optional>
+ <define name="attlist.postchunk" combine="interleave">
+ <empty/>
</define>
<!--
- 'transfer' is the root element containing the whole structural
- transfer rule file. Attribute 'default' specifies if
- unmatched words have to be written as lexical units ("lu", this is
- the default value) or as chunks ("chunk").
+ 'postchunk' is the root element containing the whole structural
+ postchunk rule file.
-->
<define name="section-def-cats">
<element name="section-def-cats">
@@ -130,24 +140,11 @@
</element>
</define>
<define name="attlist.cat-item" combine="interleave">
- <optional>
- <attribute name="lemma"/>
- </optional>
- <attribute name="tags"/>
- <optional>
- <attribute name="c"/>
- </optional>
+ <attribute name="name"/>
</define>
<!--
- Each 'cat-item' (category item) represents a set of lexical forms
- and has a mandatory attribute 'tags' whose value is a sequence of
- dot-separated tag names; this sequence is a subsequence of the
- tag sequence defining each possible lexical form. For example,
- tags="n.f" would match all lexical forms containing this tag
- sequence, such as "^casa<n><f><pl>$".
-
- In addition, an optional attribute, "lemma", may be used to
- define lexical forms having a particular substring in their lemma
+ In addition, a required attribute, "name", is used to specify
+ which chunk name is detected by this cat-item
-->
<define name="section-def-attrs">
<element name="section-def-attrs">
@@ -356,15 +353,12 @@
</define>
<!--
Each rule has a pattern and an action
- * attribute 'comment' allows to put in comments about the purpose of
- the rule being defined
+ * Attribute 'comment' allows to include a comment with the rule
-->
<define name="pattern">
<element name="pattern">
<ref name="attlist.pattern"/>
- <oneOrMore>
- <ref name="pattern-item"/>
- </oneOrMore>
+ <ref name="pattern-item"/>
</element>
</define>
<define name="attlist.pattern" combine="interleave">
@@ -729,10 +723,9 @@
<ref name="attlist.out"/>
<oneOrMore>
<choice>
- <ref name="mlu"/>
- <ref name="lu"/>
<ref name="b"/>
- <ref name="chunk"/>
+ <ref name="lu"/>
+ <ref name="mlu"/>
<ref name="var"/>
</choice>
</oneOrMore>
@@ -743,11 +736,7 @@
<attribute name="c"/>
</optional>
</define>
- <!--
- 'out' is an output statement; it may output any sequence of
- clips, literal strings, literal tags, variables, and whitespace items
- (see below)
- -->
+ <!-- 'out' is an output statement; it may output blanks or chunks -->
<define name="modify-case">
<element name="modify-case">
<ref name="attlist.modify-case"/>
@@ -805,20 +794,8 @@
</define>
<define name="attlist.clip" combine="interleave">
<attribute name="pos"/>
- <attribute name="side">
- <choice>
- <value>sl</value>
- <value>tl</value>
- </choice>
- </attribute>
<attribute name="part"/>
<optional>
- <attribute name="queue"/>
- </optional>
- <optional>
- <attribute name="link-to"/>
- </optional>
- <optional>
<attribute name="c"/>
</optional>
</define>
@@ -829,21 +806,11 @@
* 'pos' is an index (1, 2, 3...) used to select a lexical form
inside the rule;
- * 'side' is used to select a source-language ('sl') or a
- target-language ('tl') clip
-
* the value of 'part' is the name of an attribute defined in
def-attrs, but may take also the values 'lem' (referring to
the lemma of the lexical form), 'lemh' (lemma head), 'lemq'
(lemma queue) and 'whole' (referring to the whole lexical form).
- * the value of 'queue' may be 'no' or 'yes'. 'yes' is assumed by
- default.
-
- * 'link-to' causes the other attributes to be ignored in clip evaluation
- when using 'clip' as a right hand side element (as value), and
- returns its value. When using as a left hand side (as reference),
- the value of the 'as' attribute is ignored.
-->
<define name="lit">
<element name="lit">
@@ -914,12 +881,6 @@
</define>
<define name="attlist.case-of" combine="interleave">
<attribute name="pos"/>
- <attribute name="side">
- <choice>
- <value>sl</value>
- <value>tl</value>
- </choice>
- </attribute>
<attribute name="part"/>
</define>
<!--
@@ -930,9 +891,6 @@
* 'pos' is an index (1, 2, 3...) used to select a lexical form
inside the rule;
- * 'side' is used to select a source-language ('sl') or a
- target-language ('tl') clip
-
* the value of 'part' is the name of an attribute defined in
def-attrs, but may take also the values 'lem' (referring to
the lemma of the lexical form), 'lemh' (lemma head), 'lemq'
@@ -973,62 +931,7 @@
<define name="attlist.lu" combine="interleave">
<empty/>
</define>
- <!-- Encloses a word inside an 'out' element. -->
- <define name="chunk">
- <element name="chunk">
- <ref name="attlist.chunk"/>
- <ref name="tags"/>
- <oneOrMore>
- <choice>
- <ref name="mlu"/>
- <ref name="lu"/>
- <ref name="b"/>
- <ref name="var"/>
- </choice>
- </oneOrMore>
- </element>
- </define>
- <define name="attlist.chunk" combine="interleave">
- <optional>
- <attribute name="name"/>
- </optional>
- <optional>
- <attribute name="namefrom"/>
- </optional>
- <optional>
- <attribute name="case"/>
- </optional>
- <optional>
- <attribute name="c"/>
- </optional>
- </define>
- <!--
- Encloses a chunk inside an 'out' element.
- * 'name' the pseudolemma of the chunk.
- * 'namefrom' get the name from a variable.
- * 'case' the variable to get the uppercase/lowercase policy
- to apply it to the chunk name
- -->
- <define name="tags">
- <element name="tags">
- <ref name="attlist.tags"/>
- <oneOrMore>
- <ref name="tag"/>
- </oneOrMore>
- </element>
- </define>
- <define name="attlist.tags" combine="interleave">
- <empty/>
- </define>
- <define name="tag">
- <element name="tag">
- <ref name="attlist.tag"/>
- <ref name="value"/>
- </element>
- </define>
- <define name="attlist.tag" combine="interleave">
- <empty/>
- </define>
+ <!-- Encloses a word -->
<define name="b">
<element name="b">
<ref name="attlist.b"/>
@@ -1040,19 +943,29 @@
<attribute name="pos"/>
</optional>
</define>
+ <!--
+ 'b' is a [super]blanks item, indexed by pos; for example, a 'b'
+ with pos="2" refers to the [super]blanks (including format data
+ encapsulated by the de-formatter) between lexical form 2 and
+ lexical form 3. Managing [super]blanks explicitly allows for the
+ correct placement of format when the result of structural
+ transfer has more or less lexical items than the original or has
+ been reordered in some way. If attribute "pos" is not specified, then
+ a single blank (ASCII 32) is generated.
+ -->
+ <define name="lu-count">
+ <element name="lu-count">
+ <ref name="attlist.lu-count"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.lu-count" combine="interleave">
+ <empty/>
+ </define>
<start>
<choice>
- <ref name="transfer"/>
+ <ref name="postchunk"/>
</choice>
</start>
</grammar>
-<!--
- 'b' is a [super]blanks item, indexed by pos; for example, a 'b'
- with pos="2" refers to the [super]blanks (including format data
- encapsulated by the de-formatter) between lexical form 2 and
- lexical form 3. Managing [super]blanks explicitly allows for the
- correct placement of format when the result of structural
- transfer has more or less lexical items than the original or has
- been reordered in some way. If attribute "pos" is not specified, then
- a single blank (ASCII 32) is generated.
--->
+<!-- Number of lexical units (words inside the chunk) in the rule -->
diff --git a/apertium/reformat.xsl b/apertium/reformat.xsl
index 5378652..d59f889 100644
--- a/apertium/reformat.xsl
+++ b/apertium/reformat.xsl
@@ -13,9 +13,7 @@
General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA.
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
-->
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
diff --git a/apertium/serialiser.h b/apertium/serialiser.h
new file mode 100644
index 0000000..f79e78c
--- /dev/null
+++ b/apertium/serialiser.h
@@ -0,0 +1,284 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef SERIALISER_H
+#define SERIALISER_H
+
+#include "a.h"
+#include "basic_exception_type.h"
+#include "analysis.h"
+#include "exception.h"
+#include "i.h"
+#include "lemma.h"
+#include "morpheme.h"
+#include "tag.h"
+
+#include <cstddef>
+#include <ios>
+#include <limits>
+#include <map>
+#include <ostream>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace Apertium {
+template <typename SerialisedType>
+static unsigned char compressedSize(const SerialisedType &SerialisedType_) {
+ unsigned char compressedSize_ = 0;
+
+ for (; static_cast<unsigned char>(SerialisedType_ >>
+ std::numeric_limits<unsigned char>::digits *
+ compressedSize_) != 0;
+ ++compressedSize_) {
+ }
+
+ return compressedSize_;
+}
+
+template <typename SerialisedType> class Serialiser;
+
+template <> class Serialiser<a> {
+public:
+ inline static void serialise(const a &StreamedType_, std::ostream &Stream_);
+};
+
+template <> class Serialiser<Analysis> {
+public:
+ inline static void serialise(const Analysis &SerialisedType_,
+ std::ostream &Stream_);
+};
+
+template <> class Serialiser<i> {
+public:
+ inline static void serialise(const i &StreamedType_, std::ostream &Stream_);
+};
+
+template <> class Serialiser<Lemma> {
+public:
+ inline static void serialise(const Lemma &StreamedType_,
+ std::ostream &Stream_);
+};
+
+template <> class Serialiser<Morpheme> {
+public:
+ inline static void serialise(const Morpheme &SerialisedType_,
+ std::ostream &Stream_);
+};
+
+template <> class Serialiser<Tag> {
+public:
+ inline static void serialise(const Tag &SerialisedType_,
+ std::ostream &Stream_);
+};
+
+template <typename value_type>
+class Serialiser<std::basic_string<value_type> > {
+public:
+ inline static void
+ serialise(const std::basic_string<value_type> &SerialisedType_,
+ std::ostream &Stream_);
+};
+
+template <typename key_type, typename mapped_type>
+class Serialiser<std::map<key_type, mapped_type> > {
+public:
+ inline static void
+ serialise(const std::map<key_type, mapped_type> &SerialisedType_,
+ std::ostream &Stream_);
+};
+
+template <typename first_type, typename second_type>
+class Serialiser<std::pair<first_type, second_type> > {
+public:
+ inline static void
+ serialise(const std::pair<first_type, second_type> &SerialisedType_,
+ std::ostream &Stream_);
+};
+
+template <> class Serialiser<std::size_t> {
+public:
+ inline static void serialise(const std::size_t &SerialisedType_,
+ std::ostream &Stream_);
+};
+
+template <typename value_type> class Serialiser<std::vector<value_type> > {
+public:
+ inline static void serialise(const std::vector<value_type> &SerialisedType_,
+ std::ostream &Stream_);
+};
+
+template <> class Serialiser<wchar_t> {
+public:
+ inline static void serialise(const wchar_t &SerialisedType_,
+ std::ostream &Stream_);
+};
+
+void Serialiser<a>::serialise(const a &StreamedType_, std::ostream &Stream_) {
+ Serialiser<std::vector<Tag> >::serialise(StreamedType_.TheTags, Stream_);
+ Serialiser<std::vector<Morpheme> >::serialise(StreamedType_.TheMorphemes,
+ Stream_);
+}
+
+void Serialiser<Analysis>::serialise(const Analysis &SerialisedType_,
+ std::ostream &Stream_) {
+ Serialiser<std::vector<Morpheme> >::serialise(SerialisedType_.TheMorphemes,
+ Stream_);
+}
+
+void Serialiser<i>::serialise(const i &StreamedType_, std::ostream &Stream_) {
+ Serialiser<std::vector<Tag> >::serialise(StreamedType_.TheTags, Stream_);
+}
+
+void Serialiser<Lemma>::serialise(const Lemma &StreamedType_,
+ std::ostream &Stream_) {
+ Serialiser<std::wstring>::serialise(StreamedType_.TheLemma, Stream_);
+}
+
+void Serialiser<Morpheme>::serialise(const Morpheme &SerialisedType_,
+ std::ostream &Stream_) {
+ Serialiser<std::wstring>::serialise(SerialisedType_.TheLemma, Stream_);
+ Serialiser<std::vector<Tag> >::serialise(SerialisedType_.TheTags, Stream_);
+}
+
+void Serialiser<Tag>::serialise(const Tag &SerialisedType_,
+ std::ostream &Stream_) {
+ Serialiser<std::wstring>::serialise(SerialisedType_.TheTag, Stream_);
+}
+
+template <typename value_type>
+void Serialiser<std::basic_string<value_type> >::serialise(
+ const std::basic_string<value_type> &SerialisedType_,
+ std::ostream &Stream_) {
+ Serialiser<std::size_t>::serialise(SerialisedType_.size(), Stream_);
+
+ for (typename std::basic_string<value_type>::const_iterator
+ SerialisedType_iterator = SerialisedType_.begin();
+ // Call .end() each iteration to save memory.
+ SerialisedType_iterator != SerialisedType_.end();
+ ++SerialisedType_iterator) {
+ Serialiser<value_type>::serialise(*SerialisedType_iterator, Stream_);
+ }
+}
+
+template <typename key_type, typename mapped_type>
+void Serialiser<std::map<key_type, mapped_type> >::serialise(
+ const std::map<key_type, mapped_type> &SerialisedType_,
+ std::ostream &Stream_) {
+ Serialiser<std::size_t>::serialise(SerialisedType_.size(), Stream_);
+
+ for (typename std::map<key_type, mapped_type>::const_iterator
+ SerialisedType_iterator = SerialisedType_.begin();
+ // Call .end() each iteration to save memory.
+ SerialisedType_iterator != SerialisedType_.end();
+ ++SerialisedType_iterator) {
+ Serialiser<std::pair<key_type, mapped_type> >::serialise(
+ *SerialisedType_iterator, Stream_);
+ }
+}
+
+template <typename first_type, typename second_type>
+void Serialiser<std::pair<first_type, second_type> >::serialise(
+ const std::pair<first_type, second_type> &SerialisedType_,
+ std::ostream &Stream_) {
+ Serialiser<first_type>::serialise(SerialisedType_.first, Stream_);
+ Serialiser<second_type>::serialise(SerialisedType_.second, Stream_);
+}
+
+void Serialiser<std::size_t>::serialise(const std::size_t &SerialisedType_,
+ std::ostream &Stream_) {
+ try {
+ Stream_.put(compressedSize(SerialisedType_));
+
+ if (!Stream_) {
+ std::stringstream what_;
+ what_ << "can't serialise size " << std::hex
+ << /* [1] */ +compressedSize(SerialisedType_) << std::dec;
+ throw Exception::Serialiser::not_Stream_good(what_);
+ }
+
+ for (unsigned char CompressedSize = compressedSize(SerialisedType_);
+ CompressedSize != 0; Stream_.put(static_cast<unsigned char>(
+ SerialisedType_ >>
+ std::numeric_limits<unsigned char>::digits * --CompressedSize))) {
+ if (!Stream_) {
+ std::stringstream what_;
+ what_ << "can't serialise byte " << std::hex
+ << /* [1] */ +static_cast<unsigned char>(
+ SerialisedType_ >>
+ std::numeric_limits<unsigned char>::digits *
+ CompressedSize) << std::dec;
+ throw Exception::Serialiser::not_Stream_good(what_);
+ }
+ }
+ } catch (const basic_ExceptionType &basic_ExceptionType_) {
+ std::stringstream what_;
+ what_ << "can't serialise const std::size_t & : "
+ << basic_ExceptionType_.what();
+ throw Exception::Serialiser::size_t_(what_);
+ }
+}
+
+template <typename value_type>
+void Serialiser<std::vector<value_type> >::serialise(
+ const std::vector<value_type> &SerialisedType_, std::ostream &Stream_) {
+ Serialiser<std::size_t>::serialise(SerialisedType_.size(), Stream_);
+
+ for (typename std::vector<value_type>::const_iterator value_type_ =
+ SerialisedType_.begin();
+ // Call .end() each iteration to save memory.
+ value_type_ != SerialisedType_.end(); ++value_type_) {
+ Serialiser<value_type>::serialise(*value_type_, Stream_);
+ }
+}
+
+void Serialiser<wchar_t>::serialise(const wchar_t &SerialisedType_,
+ std::ostream &Stream_) {
+ try {
+ Stream_.put(compressedSize(SerialisedType_));
+
+ if (!Stream_) {
+ std::stringstream what_;
+ what_ << "can't serialise size " << std::hex
+ << /* [1] */ +compressedSize(SerialisedType_);
+ throw Exception::Serialiser::not_Stream_good(what_);
+ }
+
+ for (unsigned char CompressedSize = compressedSize(SerialisedType_);
+ CompressedSize != 0; Stream_.put(static_cast<unsigned char>(
+ static_cast<unsigned wchar_t>(SerialisedType_) >>
+ std::numeric_limits<unsigned char>::digits * --CompressedSize))) {
+ if (!Stream_) {
+ std::stringstream what_;
+ what_ << "can't serialise byte " << std::hex
+ << /* [1] */ +(static_cast<unsigned wchar_t>(SerialisedType_) >>
+ std::numeric_limits<unsigned char>::digits *
+ CompressedSize);
+ throw Exception::Serialiser::not_Stream_good(what_);
+ }
+ }
+ } catch (const basic_ExceptionType &basic_ExceptionType_) {
+ std::stringstream what_;
+ what_ << "can't serialise const wchar_t & : "
+ << basic_ExceptionType_.what();
+ throw Exception::Serialiser::wchar_t_(what_);
+ }
+}
+}
+
+// [1] operator+ promotes its operand to a printable integral type.
+
+#endif // SERIALISER_H
diff --git a/apertium/stream.cc b/apertium/stream.cc
new file mode 100644
index 0000000..926e62a
--- /dev/null
+++ b/apertium/stream.cc
@@ -0,0 +1,774 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "stream.h"
+
+#include "analysis.h"
+#include "basic_tagger.h"
+#include "streamed_type.h"
+#include "wchar_t_exception.h"
+
+#include <climits>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <istream>
+#include <ostream>
+#include <sstream>
+#include <string>
+
+namespace Apertium {
+Stream::Stream(const basic_Tagger::Flags &Flags_)
+ : TheCharacterStream(std::wcin), TheFilename(), TheLineNumber(1), TheLine(),
+ TheFlags(Flags_), private_flush_(false), ThePreviousCase() {}
+
+Stream::Stream(const basic_Tagger::Flags &Flags_,
+ std::wifstream &CharacterStream_, const char *const Filename_)
+ : TheCharacterStream(CharacterStream_), TheFilename(Filename_),
+ TheLineNumber(1), TheLine(), TheFlags(Flags_), private_flush_(false),
+ ThePreviousCase() {}
+
+Stream::Stream(const basic_Tagger::Flags &Flags_,
+ std::wifstream &CharacterStream_, const std::string &Filename_)
+ : TheCharacterStream(CharacterStream_), TheFilename(Filename_),
+ TheLineNumber(1), TheLine(), TheFlags(Flags_), private_flush_(false),
+ ThePreviousCase() {}
+
+Stream::Stream(const basic_Tagger::Flags &Flags_,
+ std::wifstream &CharacterStream_,
+ const std::stringstream &Filename_)
+ : TheCharacterStream(CharacterStream_), TheFilename(Filename_.str()),
+ TheLineNumber(1), TheLine(), TheFlags(Flags_), private_flush_(false),
+ ThePreviousCase() {}
+
+StreamedType Stream::get() {
+ StreamedType TheStreamedType;
+ std::wstring Lemma;
+ private_flush_ = false;
+
+ if (!is_eof_throw_if_not_TheCharacterStream_good()) {
+ while (true) {
+ const wchar_t Character_ = TheCharacterStream.get();
+
+ if (is_eof_throw_if_not_TheCharacterStream_good(TheStreamedType, Lemma,
+ Character_))
+ break;
+
+ TheLine.push_back(Character_);
+
+ switch (Character_) {
+ case L'\\': // <\> 92, Hex 5c, Octal 134
+ case_0x5c(TheStreamedType, Lemma, Character_);
+ continue;
+ case L'[':
+ if (ThePreviousCase) {
+ switch (ThePreviousCase->ThePreviousCase) {
+ case L']':
+ case L'$':
+ break;
+ default:
+ std::wstringstream Message;
+ Message << L"unexpected '" << Character_ << L"' following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '[' expected to follow ']' or '$'";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+ }
+
+ push_back_Character(TheStreamedType, Lemma, Character_);
+ ThePreviousCase = PreviousCaseType(Character_);
+ continue;
+ case L']':
+ if (!ThePreviousCase) {
+ std::wstringstream Message;
+ Message << L"unexpected '" << Character_
+ << L"', ']' expected to follow '['";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ switch (ThePreviousCase->ThePreviousCase) {
+ case L'[':
+ push_back_Character(TheStreamedType, Lemma, Character_);
+ ThePreviousCase = PreviousCaseType(Character_);
+ continue;
+ default:
+ std::wstringstream Message;
+ Message << L"unexpected '" << Character_ << L"' following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', ']' expected to follow '['";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ std::abort();
+ case L'^':
+ if (ThePreviousCase) {
+ switch (ThePreviousCase->ThePreviousCase) {
+ case L'[':
+ push_back_Character(TheStreamedType, Lemma, Character_);
+ continue;
+ case L']':
+ case L'$':
+ break;
+ default:
+ std::wstringstream Message;
+ Message << L"unexpected '" << Character_ << L"' following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '^' expected to follow '[', ']', or '$'";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+ }
+
+ TheStreamedType.TheLexicalUnit = LexicalUnit();
+ ThePreviousCase = PreviousCaseType(Character_);
+ continue;
+ case L'/':
+ if (!ThePreviousCase) {
+ std::wstringstream Message;
+ Message
+ << L"unexpected '" << Character_
+ << L"', '/' expected to follow '[', to follow '>' immediately, "
+ L"or to follow '^' or '#' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ switch (ThePreviousCase->ThePreviousCase) {
+ case L'[':
+ push_back_Character(TheStreamedType, Lemma, Character_);
+ continue;
+ case L'^':
+ if (ThePreviousCase->isPreviousCharacter) {
+ std::wstringstream Message;
+ Message << L"unexpected '" << Character_
+ << L"' immediately following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '/' expected to follow '[', to follow '>' "
+ L"immediately, or to follow '^' or '#' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ ThePreviousCase = PreviousCaseType(Character_);
+
+ {
+ const wchar_t Character_ = TheCharacterStream.get();
+
+ if (is_eof_throw_if_not_TheCharacterStream_good(
+ TheStreamedType, Lemma, Character_)) {
+ std::wstringstream Message;
+ Message << L"unexpected end-of-file following '"
+ << ThePreviousCase->ThePreviousCase
+ << "', end-of-file expected to follow ']' or '$'";
+ throw wchar_t_Exception::Stream::UnexpectedEndOfFile(
+ Message_what(Message));
+ }
+
+ TheLine.push_back(Character_);
+
+ switch (Character_) {
+ case L'\\':
+ TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis());
+ TheStreamedType.TheLexicalUnit->TheAnalyses.back()
+ .TheMorphemes.push_back(Morpheme());
+ case_0x5c(TheStreamedType, Lemma, Character_);
+ continue;
+ case L'*':
+ ThePreviousCase = PreviousCaseType(Character_);
+ continue;
+ case L'\n': {
+ std::wstringstream Message;
+ Message << L"unexpected newline following '"
+ << ThePreviousCase->ThePreviousCase
+ << "', newline expected to follow '[', ']', or '$'";
+ throw wchar_t_Exception::Stream::UnexpectedCharacter(
+ Message_what(Message));
+ };
+ case L'[':
+ case L']':
+ case L'^':
+ case L'#':
+ case L'<':
+ case L'>':
+ case L'+':
+ case L'$': {
+ std::wstringstream Message;
+ Message << L"unexpected '" << Character_
+ << L"' immediately following '"
+ << ThePreviousCase->ThePreviousCase << L"', expected '*'";
+ throw wchar_t_Exception::Stream::UnexpectedPreviousCase(
+ Message_what(Message));
+ }
+ default:
+ TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis());
+ TheStreamedType.TheLexicalUnit->TheAnalyses.back()
+ .TheMorphemes.push_back(Morpheme());
+ push_back_Character(TheStreamedType, Lemma, Character_);
+ continue;
+ }
+ }
+
+ continue;
+ case L'>':
+ if (!ThePreviousCase->isPreviousCharacter) {
+ std::wstringstream Message;
+ Message << L"unexpected '" << Character_
+ << L"' not immediately following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '/' expected to follow '[', to follow '>' "
+ L"immediately, or to follow '^' or '#' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ break;
+ case L'#':
+ if (ThePreviousCase->isPreviousCharacter) {
+ std::wstringstream Message;
+ Message << L"unexpected '" << Character_
+ << L"' immediately following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '/' expected to follow '[', to follow '>' "
+ L"immediately, or to follow '^' or '#' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ break;
+ default:
+ std::wstringstream Message;
+ Message << L"unexpected '" << Character_ << L"' following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '/' expected to follow '[', to follow '>' "
+ L"immediately, or to follow '^' or '#' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ TheStreamedType.TheLexicalUnit->TheAnalyses.push_back(Analysis());
+ TheStreamedType.TheLexicalUnit->TheAnalyses.back()
+ .TheMorphemes.push_back(Morpheme());
+ ThePreviousCase = PreviousCaseType(Character_);
+ continue;
+ case L'*':
+ if (ThePreviousCase) {
+ switch (ThePreviousCase->ThePreviousCase) {
+ case L'[':
+ case L']':
+ case L'$':
+ break;
+ default:
+ std::wstringstream Message;
+ Message
+ << L"unexpected '" << Character_ << L"' following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '*' expected to follow '[', ']', or '$' or to follow "
+ L"'/' immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+ }
+
+ push_back_Character(TheStreamedType, Lemma, Character_);
+ continue;
+ case L'<':
+ if (!ThePreviousCase) {
+ std::wstringstream Message;
+ Message
+ << L"unexpected '" << Character_
+ << L"', '<' expected to follow '[', to follow '>' immediately, "
+ L"or to follow '#', '/' or '+' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ switch (ThePreviousCase->ThePreviousCase) {
+ case L'[':
+ push_back_Character(TheStreamedType, Lemma, Character_);
+ continue;
+ case L'/':
+ case L'#':
+ case L'+':
+ if (ThePreviousCase->isPreviousCharacter) {
+ std::wstringstream Message;
+ Message
+ << L"unexpected '" << Character_ << L"' immediately following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '<' expected to follow '[', to follow '>' immediately, "
+ L"or to follow '#', '/' or '+' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ break;
+ case L'>':
+ if (!ThePreviousCase->isPreviousCharacter) {
+ std::wstringstream Message;
+ Message
+ << L"unexpected '" << Character_
+ << L"' not immediately following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '<' expected to follow '[', to follow '>' immediately, "
+ L"or to follow '#', '/' or '+' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ break;
+ default:
+ std::wstringstream Message;
+ Message
+ << L"unexpected '" << Character_ << L"' following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '<' expected to follow '[', to follow '>' immediately, "
+ L"or to follow '#', '/' or '+' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ TheStreamedType.TheLexicalUnit->TheAnalyses.back()
+ .TheMorphemes.back()
+ .TheTags.push_back(Tag());
+ ThePreviousCase = PreviousCaseType(Character_);
+ continue;
+ case L'>':
+ if (!ThePreviousCase) {
+ std::wstringstream Message;
+ Message << L"unexpected '" << Character_
+ << L"', '>' expected to "
+ L"follow '[' or to follow "
+ L"'<' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ switch (ThePreviousCase->ThePreviousCase) {
+ case L'[':
+ push_back_Character(TheStreamedType, Lemma, Character_);
+ continue;
+ case L'<':
+ if (ThePreviousCase->isPreviousCharacter) {
+ std::wstringstream Message;
+ Message << L"unexpected '" << Character_
+ << L"' immediately following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '>' expected to "
+ L"follow '[' or to follow "
+ L"'<' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ ThePreviousCase = PreviousCaseType(Character_);
+ continue;
+ default:
+ std::wstringstream Message;
+ Message << L"unexpected '" << Character_ << L"' following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '>' expected to "
+ L"follow '[' or to follow "
+ L"'<' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ std::abort();
+ case L'#':
+ if (ThePreviousCase) {
+ switch (ThePreviousCase->ThePreviousCase) {
+ case L'[':
+ case L']':
+ case L'$':
+ push_back_Character(TheStreamedType, Lemma, Character_);
+ continue;
+ case L'/':
+ if (ThePreviousCase->isPreviousCharacter) {
+ std::wstringstream Message;
+ Message
+ << L"unexpected '" << Character_
+ << L"' immediately following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '#' expected to follow '[', ']', or '$', to follow "
+ L"'>' immediately, or to follow '/' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ break;
+ case L'>':
+ if (!ThePreviousCase->isPreviousCharacter) {
+ std::wstringstream Message;
+ Message
+ << L"unexpected '" << Character_
+ << L"' not immediately following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '#' expected to follow '[', ']', or '$', to follow "
+ L"'>' immediately, or to follow '/' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ break;
+ default:
+ std::wstringstream Message;
+ Message << L"unexpected '" << Character_ << L"' following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '#' expected to follow '[', ']', or '$', to follow "
+ L"'>' immediately, or to follow '/' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ ThePreviousCase = PreviousCaseType(Character_);
+ continue;
+ }
+
+ push_back_Character(TheStreamedType, Lemma, Character_);
+ continue;
+ case L'+':
+ if (ThePreviousCase) {
+ switch (ThePreviousCase->ThePreviousCase) {
+ case L'[':
+ case L']':
+ case L'$':
+ push_back_Character(TheStreamedType, Lemma, Character_);
+ continue;
+ case L'>':
+ if (!ThePreviousCase->isPreviousCharacter) {
+ std::wstringstream Message;
+ Message
+ << L"unexpected '" << Character_
+ << L"' not immediately following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '+' expected to follow '[', ']', or '$', to follow "
+ L"'>' "
+ L"immediately, or to follow '#' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ break;
+ case L'#':
+ if (ThePreviousCase->isPreviousCharacter) {
+ std::wstringstream Message;
+ Message
+ << L"unexpected '" << Character_
+ << L"' immediately following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '+' expected to follow '[', ']', or '$', to follow "
+ L"'>' "
+ L"immediately, or to follow '#' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ break;
+ default: {
+ std::wstringstream Message;
+ Message << L"unexpected '" << Character_ << L"' following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '+' expected to follow '[', ']', or '$', to follow "
+ L"'>' immediately, or to follow '#' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+ }
+
+ TheStreamedType.TheLexicalUnit->TheAnalyses.back()
+ .TheMorphemes.push_back(Morpheme());
+ ThePreviousCase = PreviousCaseType(Character_);
+ continue;
+ }
+
+ push_back_Character(TheStreamedType, Lemma, Character_);
+ continue;
+ case L'$':
+ if (!ThePreviousCase) {
+ std::wstringstream Message;
+ Message
+ << L"unexpected '" << Character_
+ << L"', '$' expected to follow '[', to follow '>' immediately, "
+ L"or to follow '*' or '#' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ switch (ThePreviousCase->ThePreviousCase) {
+ case L'[':
+ push_back_Character(TheStreamedType, Lemma, Character_);
+ continue;
+ case L'*':
+ if (ThePreviousCase->isPreviousCharacter) {
+ std::wstringstream Message;
+ Message
+ << L"unexpected '" << Character_ << L"' immediately following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '$' expected to follow '[', to follow '>' immediately, "
+ L"or to follow '*' or '#' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ if (TheFlags.getDebug()) {
+ if (Lemma != TheStreamedType.TheLexicalUnit->TheSurfaceForm)
+ std::wcerr << L"unexpected lemma \"" << Lemma
+ << L"\", expected \""
+ << TheStreamedType.TheLexicalUnit->TheSurfaceForm
+ << L"\"\n";
+ }
+
+ ThePreviousCase = PreviousCaseType(Character_);
+ return TheStreamedType;
+ case L'>':
+ if (!ThePreviousCase->isPreviousCharacter) {
+ std::wstringstream Message;
+ Message
+ << L"unexpected '" << Character_
+ << L"' not immediately following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '$' expected to follow '[', to follow '>' immediately, "
+ L"or to follow '*' or '#' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ break;
+ case L'#':
+ if (ThePreviousCase->isPreviousCharacter) {
+ std::wstringstream Message;
+ Message
+ << L"unexpected '" << Character_ << L"' immediately following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '$' expected to follow '[', to follow '>' immediately, "
+ L"or to follow '*' or '#' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ break;
+ default:
+ std::wstringstream Message;
+ Message
+ << L"unexpected '" << Character_ << L"' following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', '$' expected to follow '[', to follow '>' immediately, "
+ L"or to follow '*' or '#' not immediately";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+
+ ThePreviousCase = PreviousCaseType(Character_);
+ return TheStreamedType;
+ case L'\n':
+ if (ThePreviousCase) {
+ switch (ThePreviousCase->ThePreviousCase) {
+ case L'[':
+ case L']':
+ case L'$':
+ break;
+ default:
+ std::wstringstream Message;
+ Message << L"unexpected newline following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', newline expected to follow '[', ']', or '$'";
+ throw wchar_t_Exception::Stream::UnexpectedCase(
+ Message_what(Message));
+ }
+ }
+
+ push_back_Character(TheStreamedType, Lemma, Character_);
+ ++TheLineNumber;
+ TheLine.clear();
+ continue;
+ default:
+ push_back_Character(TheStreamedType, Lemma, Character_);
+ continue;
+ }
+
+ std::abort();
+ }
+ }
+
+ if (ThePreviousCase) {
+ switch (ThePreviousCase->ThePreviousCase) {
+ case L']':
+ case L'$':
+ break;
+ default:
+ std::wstringstream Message;
+ Message << L"unexpected end-of-file following '"
+ << ThePreviousCase->ThePreviousCase
+ << L"', end-of-file expected to follow ']' "
+ L"or '$'";
+ throw wchar_t_Exception::Stream::UnexpectedEndOfFile(
+ Message_what(Message));
+ }
+ }
+
+ return TheStreamedType;
+}
+
+bool Stream::flush_() const { return private_flush_; } // private_flush_ is set by isTheCharacterStream_eof when a null-flush L'\0' is read
+
+Stream::PreviousCaseType::PreviousCaseType(const wchar_t &PreviousCase_)
+ : ThePreviousCase(PreviousCase_), isPreviousCharacter(true) {} // flag starts true; push_back_Character later clears it
+
+// Returns true when TheCharacterStream is at end-of-file and false when it is
+// readable; any other failure throws TheCharacterStream_not_good.
+bool Stream::is_eof_throw_if_not_TheCharacterStream_good() const {
+  const bool AtEndOfFile = TheCharacterStream.eof();
+
+  if (!AtEndOfFile && !TheCharacterStream) {
+    std::wstringstream Description;
+    Description << L"can't get const wchar_t: TheCharacterStream not good";
+    throw wchar_t_Exception::Stream::TheCharacterStream_not_good(
+        Message_what(Description));
+  }
+  return AtEndOfFile;
+}
+
+// Builds "[file: ]line:column: message", then TheLine and a caret under its last
+// character. An empty TheLine gets no padding (size() - 1 would wrap around).
+std::wstring Stream::Message_what(const std::wstringstream &Message) const {
+  std::wstringstream what_;
+  if (TheFilename)
+    what_ << std::wstring(TheFilename->begin(), TheFilename->end()) << L": ";
+  const std::size_t Padding = TheLine.empty() ? 0 : TheLine.size() - 1;
+  what_ << TheLineNumber << L":" << TheLine.size() << L": " << Message.str()
+        << L'\n' << TheLine << L'\n' << std::wstring(Padding, L' ') << L'^';
+  return what_.str();
+}
+
+// Returns true when isTheCharacterStream_eof reports end of input; otherwise
+// throws TheCharacterStream_not_good if the stream failed, else returns false.
+bool
+Stream::is_eof_throw_if_not_TheCharacterStream_good(StreamedType &StreamedType_,
+                                                    std::wstring &Lemma,
+                                                    const wchar_t &Character_) {
+  const bool AtEndOfInput =
+      isTheCharacterStream_eof(StreamedType_, Lemma, Character_);
+  if (!AtEndOfInput && !TheCharacterStream) {
+    std::wstringstream Description;
+    Description << L"can't get const wchar_t: TheCharacterStream not good";
+    throw wchar_t_Exception::Stream::TheCharacterStream_not_good(
+        Message_what(Description));
+  }
+  return AtEndOfInput;
+}
+
+// True at real end-of-file, or when the null-flush flag is on and Character_
+// is L'\0' (the NUL is still stored and private_flush_ is set).
+bool Stream::isTheCharacterStream_eof(StreamedType &StreamedType_,
+                                      std::wstring &Lemma,
+                                      const wchar_t &Character_) {
+  if (TheCharacterStream.eof())
+    return true;
+
+  const bool isNullFlush = TheFlags.getNullFlush() && Character_ == L'\0';
+  if (!isNullFlush)
+    return false;
+
+  push_back_Character(StreamedType_, Lemma, Character_);
+  private_flush_ = true;
+  return true;
+}
+
+// Appends Character_ to the field selected by the most recent reserved
+// character (ThePreviousCase), then clears isPreviousCharacter to record that
+// the reserved character is no longer the immediately preceding one. With no
+// previous case at all, Character_ goes to StreamedType_.TheString. Throws
+// when the previous case is '>' or is not one of the characters handled below.
+void Stream::push_back_Character(StreamedType &StreamedType_,
+                                 std::wstring &Lemma,
+                                 const wchar_t &Character_) {
+  if (!ThePreviousCase) {
+    StreamedType_.TheString += Character_;
+    return;
+  }
+
+  switch (ThePreviousCase->ThePreviousCase) {
+  // After '[', ']' or '$': appended to TheString.
+  case L'[':
+  case L']':
+  case L'$':
+    StreamedType_.TheString += Character_;
+    break;
+  // After '^': appended to the lexical unit's surface form.
+  case L'^':
+    StreamedType_.TheLexicalUnit->TheSurfaceForm += Character_;
+    break;
+  // After '/', '#' or '+': appended to the lemma of the newest morpheme.
+  case L'/':
+  case L'#':
+  case L'+':
+    StreamedType_.TheLexicalUnit->TheAnalyses.back()
+        .TheMorphemes.back()
+        .TheLemma.push_back(Character_);
+    break;
+  // After '*': appended to the caller-supplied Lemma buffer.
+  case L'*':
+    Lemma += Character_;
+    break;
+  // After '<': appended to the text of the newest tag.
+  case L'<':
+    StreamedType_.TheLexicalUnit->TheAnalyses.back()
+        .TheMorphemes.back()
+        .TheTags.back()
+        .TheTag += Character_;
+    break;
+  // After '>': nothing may be appended through this function.
+  case L'>': {
+    std::wstringstream Description;
+    Description << L"unexpected '" << Character_ << L"' immediately following '"
+                << ThePreviousCase->ThePreviousCase << L"'";
+    throw wchar_t_Exception::Stream::UnexpectedCharacter(
+        Message_what(Description));
+  }
+  default: {
+    std::wstringstream Description;
+    Description << L"unexpected previous reserved or special character '"
+                << ThePreviousCase->ThePreviousCase << L"'";
+    throw wchar_t_Exception::Stream::UnexpectedPreviousCase(
+        Message_what(Description));
+  }
+  }
+
+  ThePreviousCase->isPreviousCharacter = false;
+}
+
+// Stores Character_, then reads one more character from TheCharacterStream,
+// records it in TheLine and stores it through push_back_Character as well.
+// Throws UnexpectedEndOfFile if the input ends before that second character.
+void Stream::case_0x5c(StreamedType &StreamedType_, std::wstring &Lemma,
+                       const wchar_t &Character_) {
+  push_back_Character(StreamedType_, Lemma, Character_);
+
+  const wchar_t Escaped = TheCharacterStream.get();
+  if (is_eof_throw_if_not_TheCharacterStream_good(StreamedType_, Lemma,
+                                                  Escaped)) {
+    std::wstringstream Description;
+    Description << L"unexpected end-of-file following '\\', end-of-file "
+                   L"expected to follow ']' or '$'";
+    throw wchar_t_Exception::Stream::UnexpectedEndOfFile(
+        Message_what(Description));
+  }
+
+  TheLine.push_back(Escaped);
+  push_back_Character(StreamedType_, Lemma, Escaped);
+}
+}
diff --git a/apertium/stream.h b/apertium/stream.h
new file mode 100644
index 0000000..dda0e7c
--- /dev/null
+++ b/apertium/stream.h
@@ -0,0 +1,69 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef STREAM_H
+#define STREAM_H
+
+#include "basic_tagger.h"
+#include "optional.h"
+#include "streamed_type.h"
+
+#include <cstddef>
+#include <istream>
+#include <sstream>
+#include <string>
+
+namespace Apertium {
+class Stream { // parses StreamedType values out of a wide-character stream, tracking line text and number for error messages
+public:
+ Stream(const basic_Tagger::Flags &Flags_);
+ Stream(const basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_,
+ const char *const Filename_);
+ Stream(const basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_,
+ const std::string &Filename_);
+ Stream(const basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_,
+ const std::stringstream &Filename_);
+ StreamedType get(); // returns the StreamedType read so far on reaching '$' or the end of input; throws on malformed input
+ bool flush_() const; // returns private_flush_
+
+private:
+ class PreviousCaseType { // the last reserved character seen, plus whether it is still the immediately preceding character
+ public:
+ PreviousCaseType(const wchar_t &PreviousCase_);
+ wchar_t ThePreviousCase;
+ bool isPreviousCharacter : 1; // constructed true; cleared by push_back_Character
+ };
+ bool is_eof_throw_if_not_TheCharacterStream_good() const; // true at end-of-file; throws if the stream failed otherwise
+ std::wstring Message_what(const std::wstringstream &Message) const; // adds filename (if set), line number, column, TheLine and a caret to Message
+ bool is_eof_throw_if_not_TheCharacterStream_good(StreamedType &StreamedType_,
+ std::wstring &Lemma,
+ const wchar_t &Character_); // same check, but end of input is decided by isTheCharacterStream_eof
+ bool isTheCharacterStream_eof(StreamedType &StreamedType_,
+ std::wstring &Lemma, const wchar_t &Character_); // true at end-of-file or on a null-flush L'\0'
+ void push_back_Character(StreamedType &StreamedType_, std::wstring &Lemma,
+ const wchar_t &Character_); // appends Character_ to the field selected by ThePreviousCase
+ void case_0x5c(StreamedType &StreamedType_, std::wstring &Lemma,
+ const wchar_t &Character_); // stores Character_ and the character that follows it in the stream
+ std::wistream &TheCharacterStream;
+ Optional<std::string> TheFilename; // when set, prefixed to messages built by Message_what
+ std::size_t TheLineNumber; // incremented by get() on each L'\n'
+ std::wstring TheLine; // characters read on the current line; cleared by get() on L'\n'
+ const basic_Tagger::Flags &TheFlags; // getNullFlush() and getDebug() are consulted in stream.cc
+ bool private_flush_ : 1; // set to true when a null-flush L'\0' is read
+ Optional<PreviousCaseType> ThePreviousCase; // unset until a reserved character has been recorded
+};
+}
+
+#endif // STREAM_H
diff --git a/apertium/stream_5_3_1_tagger.cc b/apertium/stream_5_3_1_tagger.cc
new file mode 100644
index 0000000..93eb34d
--- /dev/null
+++ b/apertium/stream_5_3_1_tagger.cc
@@ -0,0 +1,68 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "stream_5_3_1_tagger.h"
+
+#include "apertium_config.h"
+
+#include "analysis.h"
+#include "deserialiser.h"
+#include "lexical_unit.h"
+#include "stream.h"
+#include "streamed_type.h"
+
+#include <cstddef>
+#include <istream>
+#include <map>
+#include <ostream>
+
+#if ENABLE_DEBUG
+
+#include <sstream>
+#include <string>
+
+#endif // ENABLE_DEBUG
+
+namespace Apertium {
+Stream_5_3_1_Tagger::Stream_5_3_1_Tagger(const Flags &Flags_)
+ : basic_5_3_1_Tagger(), basic_StreamTagger(Flags_) {} // Flags_ is forwarded to basic_StreamTagger; nothing else is set up here
+
+void Stream_5_3_1_Tagger::deserialise(std::istream &Serialised_basic_Tagger) {
+ Model = Deserialiser<std::map<Analysis, std::size_t> >::deserialise( // replaces Model with the map read from the stream
+ Serialised_basic_Tagger);
+}
+
+long double Stream_5_3_1_Tagger::score(const Analysis &Analysis_) const {
+ return tokenCount_T(Analysis_); // the score is exactly the value tokenCount_T returns
+}
+
+// Returns 1 + Model's count for Analysis_, or 1 when Analysis_ is absent.
+long double Stream_5_3_1_Tagger::tokenCount_T(const Analysis &Analysis_) const {
+  const std::map<Analysis, std::size_t>::const_iterator Count_ =
+      Model.find(Analysis_); // single lookup instead of two
+  return Count_ == Model.end() ? 1 : 1 + Count_->second;
+}
+
+#if ENABLE_DEBUG
+
+std::wstring Stream_5_3_1_Tagger::score_DEBUG(const Analysis &Analysis_) const {
+ std::wstringstream score_DEBUG_;
+ score_DEBUG_ << tokenCount_T(Analysis_); // the same value score() returns, rendered as text
+ return score_DEBUG_.str();
+}
+
+#endif // ENABLE_DEBUG
+
+}
diff --git a/apertium/stream_5_3_1_tagger.h b/apertium/stream_5_3_1_tagger.h
new file mode 100644
index 0000000..aab04c8
--- /dev/null
+++ b/apertium/stream_5_3_1_tagger.h
@@ -0,0 +1,53 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef STREAM_5_3_1_TAGGER_H
+#define STREAM_5_3_1_TAGGER_H
+
+#include "apertium_config.h"
+
+#include "analysis.h"
+#include "basic_5_3_1_tagger.h"
+#include "basic_stream_tagger.h"
+
+#include <istream>
+
+#if ENABLE_DEBUG
+
+#include <string>
+
+#endif // ENABLE_DEBUG
+
+namespace Apertium {
+class Stream_5_3_1_Tagger : private basic_5_3_1_Tagger,
+ public basic_StreamTagger {
+public:
+ Stream_5_3_1_Tagger(const Flags &Flags_);
+ void deserialise(std::istream &Serialised_basic_Tagger); // loads Model from a serialised map<Analysis, size_t>
+
+private:
+ long double score(const Analysis &Analysis_) const; // returns tokenCount_T(Analysis_)
+ long double tokenCount_T(const Analysis &Analysis_) const; // 1 + Model's count for Analysis_ (1 if absent)
+
+#if ENABLE_DEBUG
+
+ std::wstring score_DEBUG(const Analysis &Analysis_) const; // textual form of tokenCount_T(Analysis_)
+
+#endif // ENABLE_DEBUG
+
+};
+}
+
+#endif // STREAM_5_3_1_TAGGER_H
diff --git a/apertium/stream_5_3_1_tagger_trainer.cc b/apertium/stream_5_3_1_tagger_trainer.cc
new file mode 100644
index 0000000..bfdba92
--- /dev/null
+++ b/apertium/stream_5_3_1_tagger_trainer.cc
@@ -0,0 +1,51 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "stream_5_3_1_tagger_trainer.h"
+
+#include "analysis.h"
+#include "basic_tagger.h"
+#include "serialiser.h"
+
+#include <cstddef>
+#include <map>
+#include <ostream>
+#include <utility>
+
+namespace Apertium {
+Stream_5_3_1_TaggerTrainer::Stream_5_3_1_TaggerTrainer(
+ const basic_Tagger::Flags &Flags_)
+ : basic_5_3_1_Tagger(), basic_StreamTaggerTrainer(Flags_) {} // Flags_ is forwarded to basic_StreamTaggerTrainer; nothing else is set up here
+
+void Stream_5_3_1_TaggerTrainer::serialise(
+ std::ostream &Serialised_basic_Tagger) const {
+ Serialiser<std::map<Analysis, std::size_t> >::serialise( // writes Model to the stream via Serialiser
+ Model, Serialised_basic_Tagger);
+}
+
+// Adds Coefficient_ to the count of Analysis_, starting from 0 if unseen.
+void Stream_5_3_1_TaggerTrainer::train_Analysis(
+    const Analysis &Analysis_, const std::size_t &Coefficient_) {
+  Model[Analysis_] += Coefficient_;
+}
+
+// Multiplies every count stored in Model by OccurrenceCoefficientMultiplier.
+void Stream_5_3_1_TaggerTrainer::multiplyModel(
+    const std::size_t &OccurrenceCoefficientMultiplier) {
+  typedef std::map<Analysis, std::size_t>::iterator CountIterator;
+  for (CountIterator Count_ = Model.begin(); Count_ != Model.end(); ++Count_)
+    Count_->second *= OccurrenceCoefficientMultiplier;
+}
+}
diff --git a/apertium/stream_5_3_1_tagger_trainer.h b/apertium/stream_5_3_1_tagger_trainer.h
new file mode 100644
index 0000000..486f7be
--- /dev/null
+++ b/apertium/stream_5_3_1_tagger_trainer.h
@@ -0,0 +1,41 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef STREAM_5_3_1_TAGGER_TRAINER_H
+#define STREAM_5_3_1_TAGGER_TRAINER_H
+
+#include "basic_5_3_1_tagger.h"
+#include "basic_stream_tagger_trainer.h"
+
+#include "analysis.h"
+#include "stream.h"
+
+#include <ostream>
+
+namespace Apertium {
+class Stream_5_3_1_TaggerTrainer : private basic_5_3_1_Tagger,
+ public basic_StreamTaggerTrainer {
+public:
+ Stream_5_3_1_TaggerTrainer(const Flags &Flags_);
+ void serialise(std::ostream &Serialised_basic_Tagger) const; // writes Model as a serialised map<Analysis, size_t>
+
+private:
+ void train_Analysis(const Analysis &Analysis_,
+ const std::size_t &Coefficient_); // adds Coefficient_ to Model's count for Analysis_
+ void multiplyModel(const std::size_t &OccurrenceCoefficientMultiplier); // multiplies every count in Model
+};
+}
+
+#endif // STREAM_5_3_1_TAGGER_TRAINER_H
diff --git a/apertium/stream_5_3_2_tagger.cc b/apertium/stream_5_3_2_tagger.cc
new file mode 100644
index 0000000..e7bf2a0
--- /dev/null
+++ b/apertium/stream_5_3_2_tagger.cc
@@ -0,0 +1,104 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "stream_5_3_2_tagger.h"
+
+#include "apertium_config.h"
+
+#include "a.h"
+#include "analysis.h"
+#include "deserialiser.h"
+#include "lemma.h"
+
+#include <cstddef>
+#include <istream>
+#include <map>
+
+#if ENABLE_DEBUG
+
+#include <sstream>
+#include <string>
+
+#endif // ENABLE_DEBUG
+
+namespace Apertium {
+Stream_5_3_2_Tagger::Stream_5_3_2_Tagger(const Flags &Flags_)
+ : basic_5_3_2_Tagger(), basic_StreamTagger(Flags_) {} // Flags_ is forwarded to basic_StreamTagger; nothing else is set up here
+
+void Stream_5_3_2_Tagger::deserialise(std::istream &Serialised_basic_Tagger) {
+ Model = // replaced wholesale by the nested map read from the stream
+ Deserialiser<std::map<a, std::map<Lemma, std::size_t> > >::deserialise(
+ Serialised_basic_Tagger);
+}
+
+long double Stream_5_3_2_Tagger::score(const Analysis &Analysis_) const {
+ return (tokenCount_r_a(Analysis_) * tokenCount_a(Analysis_)) / // numerator: tokenCount_r_a * tokenCount_a
+ (tokenCount_a(Analysis_) + typeCount_a(Analysis_)); // denominator: tokenCount_a + typeCount_a (each helper returns at least 1)
+}
+
+// Returns 1 + the count stored for Analysis_'s Lemma under its a key, or 1
+// when either key is absent from Model. Each map is searched only once.
+long double
+Stream_5_3_2_Tagger::tokenCount_r_a(const Analysis &Analysis_) const {
+  typedef std::map<Lemma, std::size_t> LemmaCounts;
+  const std::map<a, LemmaCounts>::const_iterator a_ = Model.find(a(Analysis_));
+  if (a_ == Model.end())
+    return 1;
+  const LemmaCounts::const_iterator Lemma_ = a_->second.find(Lemma(Analysis_));
+  return Lemma_ == a_->second.end() ? 1 : 1 + Lemma_->second;
+}
+
+// Returns 1 + the sum of all Lemma counts stored under Analysis_'s a key, or
+// 1 when that key is absent. The key is looked up once, not per iteration.
+long double Stream_5_3_2_Tagger::tokenCount_a(const Analysis &Analysis_) const {
+  typedef std::map<Lemma, std::size_t> LemmaCounts;
+  const std::map<a, LemmaCounts>::const_iterator a_ = Model.find(a(Analysis_));
+  if (a_ == Model.end())
+    return 1;
+
+  long double tokenCount_a_ = 1;
+  for (LemmaCounts::const_iterator Lemma_ = a_->second.begin();
+       Lemma_ != a_->second.end(); ++Lemma_)
+    tokenCount_a_ += Lemma_->second;
+  return tokenCount_a_;
+}
+
+// Returns the number of distinct Lemmas stored under Analysis_'s a key, plus
+// 1 if Analysis_'s own Lemma is not among them; 1 when the key is absent.
+long double Stream_5_3_2_Tagger::typeCount_a(const Analysis &Analysis_) const {
+  const std::map<a, std::map<Lemma, std::size_t> >::const_iterator a_ =
+      Model.find(a(Analysis_));
+  if (a_ == Model.end())
+    return 1;
+  const bool isUnseen = a_->second.find(Lemma(Analysis_)) == a_->second.end();
+  return (isUnseen ? 1 : 0) + a_->second.size();
+}
+
+#if ENABLE_DEBUG
+
+std::wstring Stream_5_3_2_Tagger::score_DEBUG(const Analysis &Analysis_) const {
+ std::wstringstream score_DEBUG_;
+// Prints the operands of score() in the shape of its formula, without evaluating the quotient.
+ score_DEBUG_ << L"(" << tokenCount_r_a(Analysis_) << L" * "
+ << tokenCount_a(Analysis_) << L") /\n ("
+ << tokenCount_a(Analysis_) << L" + " << typeCount_a(Analysis_)
+ << L")";
+
+ return score_DEBUG_.str();
+}
+
+#endif // ENABLE_DEBUG
+
+}
diff --git a/apertium/stream_5_3_2_tagger.h b/apertium/stream_5_3_2_tagger.h
new file mode 100644
index 0000000..397d729
--- /dev/null
+++ b/apertium/stream_5_3_2_tagger.h
@@ -0,0 +1,55 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef STREAM_5_3_2_TAGGER_H
+#define STREAM_5_3_2_TAGGER_H
+
+#include "apertium_config.h"
+
+#include "analysis.h"
+#include "basic_5_3_2_tagger.h"
+#include "basic_stream_tagger.h"
+
+#include <istream>
+
+#if ENABLE_DEBUG
+
+#include <string>
+
+#endif // ENABLE_DEBUG
+
+namespace Apertium {
+class Stream_5_3_2_Tagger : private basic_5_3_2_Tagger,
+ public basic_StreamTagger {
+public:
+ Stream_5_3_2_Tagger(const Flags &Flags_);
+ void deserialise(std::istream &Serialised_basic_Tagger); // loads Model from a serialised map<a, map<Lemma, size_t>>
+
+private:
+ long double score(const Analysis &Analysis_) const; // tokenCount_r_a * tokenCount_a / (tokenCount_a + typeCount_a)
+ long double tokenCount_r_a(const Analysis &Analysis_) const; // 1 + count of Analysis_'s Lemma under its a key (1 if absent)
+ long double tokenCount_a(const Analysis &Analysis_) const; // 1 + sum of all Lemma counts under Analysis_'s a key
+ long double typeCount_a(const Analysis &Analysis_) const; // number of Lemmas under the a key, +1 if Analysis_'s Lemma is not among them
+
+#if ENABLE_DEBUG
+
+ std::wstring score_DEBUG(const Analysis &Analysis_) const; // the operands of score() laid out as text
+
+#endif // ENABLE_DEBUG
+
+};
+}
+
+#endif // STREAM_5_3_2_TAGGER_H
diff --git a/apertium/stream_5_3_2_tagger_trainer.cc b/apertium/stream_5_3_2_tagger_trainer.cc
new file mode 100644
index 0000000..fcc2e70
--- /dev/null
+++ b/apertium/stream_5_3_2_tagger_trainer.cc
@@ -0,0 +1,56 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "stream_5_3_2_tagger_trainer.h"
+
+#include "a.h"
+#include "analysis.h"
+#include "lemma.h"
+#include "serialiser.h"
+
+#include <map>
+#include <ostream>
+#include <utility>
+
+namespace Apertium {
+Stream_5_3_2_TaggerTrainer::Stream_5_3_2_TaggerTrainer(const Flags &Flags_)
+ : basic_StreamTaggerTrainer(Flags_) {} // Flags_ goes to the stream-trainer base only; the constructor body is empty
+
+void Stream_5_3_2_TaggerTrainer::serialise(
+    std::ostream &Serialised_basic_Tagger) const {
+  typedef std::map<a, std::map<Lemma, std::size_t> > ModelType; // what is written
+  Serialiser<ModelType>::serialise(Model, Serialised_basic_Tagger);
+}
+
+void
+Stream_5_3_2_TaggerTrainer::train_Analysis(const Analysis &Analysis_,
+                                           const std::size_t &Coefficient_) {
+  // Find or create the row for this analysis' `a`, then the Lemma counter
+  // inside it (new counters start at 0), and add Coefficient_ to it.
+  std::map<Lemma, std::size_t> &Row = Model[static_cast<a>(Analysis_)];
+  Row[static_cast<Lemma>(Analysis_)] += Coefficient_;
+}
+
+void Stream_5_3_2_TaggerTrainer::multiplyModel(
+    const std::size_t &OccurrenceCoefficientMultiplier) {
+  // Scale every stored counter in Model by the same factor.
+  typedef std::map<Lemma, std::size_t> Row;
+  typedef std::map<a, Row> Table;
+  for (Table::iterator Outer = Model.begin(); Outer != Model.end(); ++Outer)
+    for (Row::iterator Cell = Outer->second.begin();
+         Cell != Outer->second.end(); ++Cell)
+      Cell->second *= OccurrenceCoefficientMultiplier;
+}
+}
diff --git a/apertium/stream_5_3_2_tagger_trainer.h b/apertium/stream_5_3_2_tagger_trainer.h
new file mode 100644
index 0000000..04c2a07
--- /dev/null
+++ b/apertium/stream_5_3_2_tagger_trainer.h
@@ -0,0 +1,38 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef STREAM_5_3_2_TAGGER_TRAINER_H
+#define STREAM_5_3_2_TAGGER_TRAINER_H
+
+#include "basic_5_3_2_tagger.h"
+#include "basic_stream_tagger_trainer.h"
+
+#include <ostream>
+
+namespace Apertium {
+class Stream_5_3_2_TaggerTrainer : private basic_5_3_2_Tagger,
+ public basic_StreamTaggerTrainer {
+public:
+ Stream_5_3_2_TaggerTrainer(const Flags &Flags_);
+ void serialise(std::ostream &Serialised_basic_Tagger) const; // writes Model to the stream through Serialiser
+
+private:
+ void train_Analysis(const Analysis &Analysis_, // adds Coefficient_ to the counter keyed by the analysis' a, then its Lemma
+ const std::size_t &Coefficient_);
+ void multiplyModel(const std::size_t &OccurrenceCoefficientMultiplier); // multiplies every counter in Model by the argument
+};
+} // namespace Apertium
+
+#endif // STREAM_5_3_2_TAGGER_TRAINER_H
diff --git a/apertium/stream_5_3_3_tagger.cc b/apertium/stream_5_3_3_tagger.cc
new file mode 100644
index 0000000..f9e9359
--- /dev/null
+++ b/apertium/stream_5_3_3_tagger.cc
@@ -0,0 +1,223 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "stream_5_3_3_tagger.h"
+
+#include "apertium_config.h"
+
+#include "analysis.h"
+#include "deserialiser.h"
+#include "i.h"
+#include "lemma.h"
+#include "morpheme.h"
+
+#include <vector>
+
+#if ENABLE_DEBUG
+
+#include <sstream>
+#include <string>
+
+#endif // ENABLE_DEBUG
+
+namespace Apertium {
+Stream_5_3_3_Tagger::Stream_5_3_3_Tagger(const Flags &Flags_)
+ : basic_StreamTagger(Flags_) {} // Flags_ goes to the stream-tagger base only; the constructor body is empty
+
+void Stream_5_3_3_Tagger::deserialise(std::istream &Serialised_basic_Tagger) {
+  // Model is three count tables nested as pair<A, pair<B, C>>.
+  typedef std::map<i, std::map<Lemma, std::size_t> > ByI;
+  typedef std::map<Lemma, std::map<i, std::size_t> > ByLemma;
+  typedef std::pair<ByI, std::pair<ByI, ByLemma> > ModelType;
+  Model = Deserialiser<ModelType>::deserialise(Serialised_basic_Tagger);
+}
+
+long double Stream_5_3_3_Tagger::score(const Analysis &Analysis_) const {
+  // Numerator and denominator are built up separately and divided once.
+  long double Numerator = tokenCount_r_i(Analysis_) * tokenCount_i(Analysis_);
+  long double Denominator = tokenCount_i(Analysis_) + typeCount_i(Analysis_);
+  // Each morpheme after the first contributes factors with its predecessor.
+  typedef std::vector<Morpheme>::const_iterator MorphemeIt;
+  for (MorphemeIt Cur = Analysis_.TheMorphemes.begin() + 1;
+       Cur != Analysis_.TheMorphemes.end(); ++Cur) {
+    const i Prev_i(*(Cur - 1)), Cur_i(*Cur);
+    const Lemma Cur_d(*Cur);
+    Numerator *= tokenCount_d_i_Morpheme(Cur_d, Prev_i) *
+                 tokenCount_i_d_Morpheme(Cur_i, Cur_d);
+    Denominator *=
+        (tokenCount_i_Morpheme(Prev_i) + typeCount_i_Morpheme(Prev_i, Cur_d)) *
+        (tokenCount_d_Morpheme(Cur_d) + typeCount_d_Morpheme(Cur_d, Cur_i));
+  }
+  return Numerator / Denominator;
+}
+
+long double
+Stream_5_3_3_Tagger::tokenCount_r_i(const Analysis &Analysis_) const {
+  // 1 + the count stored under (i, Lemma) of the analysis in Model.first;
+  // a missing row or a missing entry contributes nothing, so the floor is 1.
+  std::map<i, std::map<Lemma, std::size_t> >::const_iterator Row =
+      Model.first.find(i(Analysis_));
+  if (Row == Model.first.end())
+    return 1;
+  std::map<Lemma, std::size_t>::const_iterator Cell =
+      Row->second.find(Lemma(Analysis_));
+  return Cell == Row->second.end() ? 1 : 1 + Cell->second;
+}
+
+long double Stream_5_3_3_Tagger::tokenCount_i(const Analysis &Analysis_) const {
+  // 1 + the sum of every count stored under i(Analysis_) in Model.first, or
+  // just 1 when that i has no row. The row is located once, up front.
+  std::map<i, std::map<Lemma, std::size_t> >::const_iterator Row =
+      Model.first.find(i(Analysis_));
+  if (Row == Model.first.end())
+    return 1;
+
+  long double tokenCount_i_ = 1;
+  for (std::map<Lemma, std::size_t>::const_iterator Lemma_ =
+           Row->second.begin(); Lemma_ != Row->second.end(); ++Lemma_)
+    tokenCount_i_ += Lemma_->second;
+  return tokenCount_i_;
+}
+
+long double Stream_5_3_3_Tagger::typeCount_i(const Analysis &Analysis_) const {
+  // Number of distinct Lemmas recorded under this i in Model.first, plus one
+  // when the analysis' Lemma is not among them; 1 when the i is unrecorded.
+  std::map<i, std::map<Lemma, std::size_t> >::const_iterator Row =
+      Model.first.find(i(Analysis_));
+  if (Row == Model.first.end())
+    return 1;
+  const bool Unseen = Row->second.find(Lemma(Analysis_)) == Row->second.end();
+  return (Unseen ? 1 : 0) + Row->second.size();
+}
+
+long double Stream_5_3_3_Tagger::tokenCount_d_i_Morpheme(const Lemma &Lemma_,
+                                                         const i &i_) const {
+  // 1 + the count stored under (i_, Lemma_) in Model.second.first; a missing
+  // row or a missing entry contributes nothing.
+  std::map<i, std::map<Lemma, std::size_t> >::const_iterator Row =
+      Model.second.first.find(i_);
+  if (Row == Model.second.first.end())
+    return 1;
+  std::map<Lemma, std::size_t>::const_iterator Cell = Row->second.find(Lemma_);
+  return Cell == Row->second.end() ? 1 : 1 + Cell->second;
+}
+
+long double
+Stream_5_3_3_Tagger::tokenCount_i_d_Morpheme(const i &i_,
+                                             const Lemma &Lemma_) const {
+  // 1 + the count stored under (Lemma_, i_) in Model.second.second; a
+  // missing row or a missing entry contributes nothing.
+  std::map<Lemma, std::map<i, std::size_t> >::const_iterator Row =
+      Model.second.second.find(Lemma_);
+  if (Row == Model.second.second.end())
+    return 1;
+  std::map<i, std::size_t>::const_iterator Cell = Row->second.find(i_);
+  return Cell == Row->second.end() ? 1 : 1 + Cell->second;
+}
+
+long double Stream_5_3_3_Tagger::tokenCount_i_Morpheme(const i &i_) const {
+  // 1 + the sum of every count stored under i_ in Model.second.first, or
+  // just 1 when i_ has no row. The row is located once, up front.
+  std::map<i, std::map<Lemma, std::size_t> >::const_iterator Row =
+      Model.second.first.find(i_);
+  if (Row == Model.second.first.end())
+    return 1;
+
+  long double tokenCount_i_Morpheme_ = 1;
+  for (std::map<Lemma, std::size_t>::const_iterator Lemma_ =
+           Row->second.begin(); Lemma_ != Row->second.end(); ++Lemma_)
+    tokenCount_i_Morpheme_ += Lemma_->second;
+  return tokenCount_i_Morpheme_;
+}
+
+long double
+Stream_5_3_3_Tagger::typeCount_i_Morpheme(const i &i_,
+                                          const Lemma &Lemma_) const {
+  // Number of distinct Lemmas recorded under i_ in Model.second.first, plus
+  // one when Lemma_ is not among them; 1 when i_ itself is unrecorded.
+  std::map<i, std::map<Lemma, std::size_t> >::const_iterator Row =
+      Model.second.first.find(i_);
+  if (Row == Model.second.first.end())
+    return 1;
+  const bool Unseen = Row->second.find(Lemma_) == Row->second.end();
+  return (Unseen ? 1 : 0) + Row->second.size();
+}
+
+long double
+Stream_5_3_3_Tagger::tokenCount_d_Morpheme(const Lemma &Lemma_) const {
+  // 1 + the sum of every count stored under Lemma_ in Model.second.second,
+  // or just 1 when Lemma_ has no row. The row is located once, up front.
+  std::map<Lemma, std::map<i, std::size_t> >::const_iterator Row =
+      Model.second.second.find(Lemma_);
+  if (Row == Model.second.second.end())
+    return 1;
+
+  long double tokenCount_d_Morpheme_ = 1;
+  for (std::map<i, std::size_t>::const_iterator i_ = Row->second.begin();
+       i_ != Row->second.end(); ++i_)
+    tokenCount_d_Morpheme_ += i_->second;
+  return tokenCount_d_Morpheme_;
+}
+
+long double Stream_5_3_3_Tagger::typeCount_d_Morpheme(const Lemma &Lemma_,
+                                                      const i &i_) const {
+  // Number of distinct i values recorded under Lemma_ in Model.second.second,
+  // plus one when i_ is not among them; 1 when Lemma_ itself is unrecorded.
+  std::map<Lemma, std::map<i, std::size_t> >::const_iterator Row =
+      Model.second.second.find(Lemma_);
+  if (Row == Model.second.second.end())
+    return 1;
+  const bool Unseen = Row->second.find(i_) == Row->second.end();
+  return (Unseen ? 1 : 0) + Row->second.size();
+}
+
+#if ENABLE_DEBUG
+
+std::wstring Stream_5_3_3_Tagger::score_DEBUG(const Analysis &Analysis_) const {
+  // Spells out, as text, the factors score() multiplies: the numerator in
+  // parentheses, then " /", a line break, and the denominator in brackets.
+  typedef std::vector<Morpheme>::const_iterator It;
+  const It Last = Analysis_.TheMorphemes.end();
+  std::wstringstream Out;
+
+  // Numerator.
+  Out << L"(" << tokenCount_r_i(Analysis_);
+  Out << L" * " << tokenCount_i(Analysis_);
+  for (It Cur = Analysis_.TheMorphemes.begin() + 1; Cur != Last; ++Cur) {
+    Out << L" * ";
+    Out << tokenCount_d_i_Morpheme(Lemma(*Cur), i(*(Cur - 1)));
+    Out << L" * ";
+    Out << tokenCount_i_d_Morpheme(i(*Cur), Lemma(*Cur));
+  }
+
+  // Denominator.
+  Out << L") /\n [(" << tokenCount_i(Analysis_);
+  Out << L" + " << typeCount_i(Analysis_) << L")";
+  for (It Cur = Analysis_.TheMorphemes.begin() + 1; Cur != Last; ++Cur) {
+    Out << L" * (" << tokenCount_i_Morpheme(i(*(Cur - 1)));
+    Out << L" + ";
+    Out << typeCount_i_Morpheme(i(*(Cur - 1)), Lemma(*Cur));
+    Out << L") * (" << tokenCount_d_Morpheme(Lemma(*Cur));
+    Out << L" + ";
+    Out << typeCount_d_Morpheme(Lemma(*Cur), i(*Cur));
+    Out << L")";
+  }
+
+  Out << L"]";
+  return Out.str();
+}
+
+#endif // ENABLE_DEBUG
+}
diff --git a/apertium/stream_5_3_3_tagger.h b/apertium/stream_5_3_3_tagger.h
new file mode 100644
index 0000000..9b7c5ca
--- /dev/null
+++ b/apertium/stream_5_3_3_tagger.h
@@ -0,0 +1,62 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef STREAM_5_3_3_TAGGER_H
+#define STREAM_5_3_3_TAGGER_H
+
+#include "apertium_config.h"
+
+#include "analysis.h"
+#include "basic_5_3_3_tagger.h"
+#include "basic_stream_tagger.h"
+#include "i.h"
+#include "lemma.h"
+
+#include <istream>
+
+#if ENABLE_DEBUG
+
+#include <string>
+
+#endif // ENABLE_DEBUG
+
+namespace Apertium {
+class Stream_5_3_3_Tagger : private basic_5_3_3_Tagger,
+ public basic_StreamTagger {
+public:
+ Stream_5_3_3_Tagger(const Flags &Flags_);
+ void deserialise(std::istream &Serialised_basic_Tagger); // replaces Model with the nested pair of three count tables read from the stream
+
+private:
+ long double score(const Analysis &Analysis_) const; // product of the tokenCount_* terms divided by the product of (tokenCount + typeCount) sums
+ long double tokenCount_r_i(const Analysis &Analysis_) const; // 1 + count under (i, Lemma) in Model.first; absent entries add nothing
+ long double tokenCount_i(const Analysis &Analysis_) const; // 1 + sum of all counts under i in Model.first
+ long double typeCount_i(const Analysis &Analysis_) const; // entries under i in Model.first, +1 if the Lemma is absent; 1 if i is absent
+ long double tokenCount_d_i_Morpheme(const Lemma &Lemma_, const i &i_) const; // 1 + count under (i_, Lemma_) in Model.second.first
+ long double tokenCount_i_d_Morpheme(const i &i_, const Lemma &Lemma_) const; // 1 + count under (Lemma_, i_) in Model.second.second
+ long double tokenCount_i_Morpheme(const i &i_) const; // 1 + sum of all counts under i_ in Model.second.first
+ long double typeCount_i_Morpheme(const i &i_, const Lemma &Lemma_) const; // entries under i_ in Model.second.first, +1 if Lemma_ is absent; 1 if i_ is absent
+ long double tokenCount_d_Morpheme(const Lemma &Lemma_) const; // 1 + sum of all counts under Lemma_ in Model.second.second
+ long double typeCount_d_Morpheme(const Lemma &Lemma_, const i &i_) const; // entries under Lemma_ in Model.second.second, +1 if i_ is absent; 1 if Lemma_ is absent
+
+#if ENABLE_DEBUG
+
+ std::wstring score_DEBUG(const Analysis &Analysis_) const; // text rendering of the score() formula; declared only when ENABLE_DEBUG is non-zero
+
+#endif // ENABLE_DEBUG
+};
+} // namespace Apertium
+
+#endif // STREAM_5_3_3_TAGGER_H
diff --git a/apertium/stream_5_3_3_tagger_trainer.cc b/apertium/stream_5_3_3_tagger_trainer.cc
new file mode 100644
index 0000000..24b21fa
--- /dev/null
+++ b/apertium/stream_5_3_3_tagger_trainer.cc
@@ -0,0 +1,92 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "analysis.h"
+#include "i.h"
+#include "lemma.h"
+#include "serialiser.h"
+#include "stream_5_3_3_tagger_trainer.h"
+
+#include <cstddef>
+#include <map>
+#include <ostream>
+#include <utility>
+#include <vector>
+
+namespace Apertium {
+Stream_5_3_3_TaggerTrainer::Stream_5_3_3_TaggerTrainer(const Flags &Flags_)
+ : basic_StreamTaggerTrainer(Flags_) {} // Flags_ goes to the stream-trainer base only; the constructor body is empty
+
+void Stream_5_3_3_TaggerTrainer::serialise(
+    std::ostream &Serialised_basic_Tagger) const {
+  // All three count tables are written as one nested pair.
+  typedef std::map<i, std::map<Lemma, std::size_t> > ByI;
+  typedef std::map<Lemma, std::map<i, std::size_t> > ByLemma;
+  typedef std::pair<ByI, std::pair<ByI, ByLemma> > ModelType;
+  Serialiser<ModelType>::serialise(Model, Serialised_basic_Tagger);
+}
+
+void
+Stream_5_3_3_TaggerTrainer::train_Analysis(const Analysis &Analysis_,
+                                           const std::size_t &Coefficient_) {
+  // Whole-analysis table: the (i, Lemma) counter of the analysis grows.
+  std::map<Lemma, std::size_t> &AnalysisRow = Model.first[i(Analysis_)];
+  AnalysisRow[Lemma(Analysis_)] += Coefficient_;
+
+  // Morpheme tables, updated for every morpheme after the first:
+  //   second.first  is keyed by the previous morpheme's i, then this Lemma;
+  //   second.second is keyed by this morpheme's Lemma, then its own i.
+  typedef std::vector<Morpheme>::const_iterator MorphemeIt;
+  for (MorphemeIt Cur = Analysis_.TheMorphemes.begin() + 1;
+       Cur != Analysis_.TheMorphemes.end(); ++Cur) {
+    std::map<Lemma, std::size_t> &ByPrevious =
+        Model.second.first[i(*(Cur - 1))];
+    ByPrevious[Lemma(*Cur)] += Coefficient_;
+
+    std::map<i, std::size_t> &ByLemma = Model.second.second[Lemma(*Cur)];
+    ByLemma[i(*Cur)] += Coefficient_;
+  }
+}
+
+void Stream_5_3_3_TaggerTrainer::multiplyModel(
+    const std::size_t &OccurrenceCoefficientMultiplier) {
+  // Every counter in each of the three tables is scaled by the same factor.
+  typedef std::map<Lemma, std::size_t> LemmaCounts;
+  typedef std::map<i, std::size_t> ICounts;
+  typedef std::map<i, LemmaCounts> ByI;
+  typedef std::map<Lemma, ICounts> ByLemma;
+
+  // Model.first: keyed by i, then Lemma.
+  for (ByI::iterator Row = Model.first.begin(); Row != Model.first.end();
+       ++Row)
+    for (LemmaCounts::iterator Cell = Row->second.begin();
+         Cell != Row->second.end(); ++Cell)
+      Cell->second *= OccurrenceCoefficientMultiplier;
+
+  // Model.second.first: keyed by i, then Lemma.
+  for (ByI::iterator Row = Model.second.first.begin();
+       Row != Model.second.first.end(); ++Row)
+    for (LemmaCounts::iterator Cell = Row->second.begin();
+         Cell != Row->second.end(); ++Cell)
+      Cell->second *= OccurrenceCoefficientMultiplier;
+
+  // Model.second.second: keyed by Lemma, then i.
+  for (ByLemma::iterator Row = Model.second.second.begin();
+       Row != Model.second.second.end(); ++Row)
+    for (ICounts::iterator Cell = Row->second.begin();
+         Cell != Row->second.end(); ++Cell)
+      Cell->second *= OccurrenceCoefficientMultiplier;
+}
+}
diff --git a/apertium/stream_5_3_3_tagger_trainer.h b/apertium/stream_5_3_3_tagger_trainer.h
new file mode 100644
index 0000000..7e2e5d6
--- /dev/null
+++ b/apertium/stream_5_3_3_tagger_trainer.h
@@ -0,0 +1,39 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef STREAM_5_3_3_TAGGER_TRAINER_H
+#define STREAM_5_3_3_TAGGER_TRAINER_H
+
+#include "analysis.h"
+#include "basic_5_3_3_tagger.h"
+#include "basic_stream_tagger_trainer.h"
+
+#include <ostream>
+
+namespace Apertium {
+class Stream_5_3_3_TaggerTrainer : private basic_5_3_3_Tagger,
+ public basic_StreamTaggerTrainer {
+public:
+ Stream_5_3_3_TaggerTrainer(const Flags &Flags_);
+ void serialise(std::ostream &Serialised_basic_Tagger) const; // writes Model (a nested pair of three count tables) through Serialiser
+
+private:
+ void train_Analysis(const Analysis &Analysis_, // adds Coefficient_ to one counter in Model.first and, per morpheme after the first, one in each Model.second table
+ const std::size_t &Coefficient_);
+ void multiplyModel(const std::size_t &OccurrenceCoefficientMultiplier); // multiplies every counter in all three tables by the argument
+};
+} // namespace Apertium
+
+#endif // STREAM_5_3_3_TAGGER_TRAINER_H
diff --git a/apertium/streamed_type.h b/apertium/streamed_type.h
new file mode 100644
index 0000000..7e9bad5
--- /dev/null
+++ b/apertium/streamed_type.h
@@ -0,0 +1,32 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef STREAMED_TYPE_H
+#define STREAMED_TYPE_H
+
+#include "lexical_unit.h"
+#include "optional.h"
+
+#include <string>
+
+namespace Apertium {
+class StreamedType { // plain data holder: two public members, no member functions declared
+public:
+ std::wstring TheString; // NOTE(review): presumably the stream text that accompanies the unit — confirm against the code that fills it
+ Optional<LexicalUnit> TheLexicalUnit; // wrapped in Optional, so a StreamedType need not carry a LexicalUnit
+};
+} // namespace Apertium
+
+#endif // STREAMED_TYPE_H
diff --git a/apertium/string_utils.cc b/apertium/string_utils.cc
index 5c8761a..2832e03 100644
--- a/apertium/string_utils.cc
+++ b/apertium/string_utils.cc
@@ -13,9 +13,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/string_utils.h>
diff --git a/apertium/string_utils.h b/apertium/string_utils.h
index 7b05525..310197e 100644
--- a/apertium/string_utils.h
+++ b/apertium/string_utils.h
@@ -13,9 +13,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __STRINGUTILS_H_
#define __STRINGUTILS_H_
diff --git a/apertium/tag.cc b/apertium/tag.cc
new file mode 100644
index 0000000..54f3558
--- /dev/null
+++ b/apertium/tag.cc
@@ -0,0 +1,34 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "tag.h"
+
+#include "exception.h"
+
+#include <string>
+
+namespace Apertium {
+bool operator==(const Tag &a, const Tag &b) { return a.TheTag == b.TheTag; } // Tags are equal exactly when their TheTag strings are equal
+
+bool operator<(const Tag &a, const Tag &b) { return a.TheTag < b.TheTag; } // ordering is std::wstring's operator< applied to TheTag
+
+Tag::operator std::wstring() const {
+  // Renders as "<" TheTag ">"; an empty TheTag is rejected with an exception.
+  if (!TheTag.empty())
+    return L"<" + TheTag + L">";
+  throw Exception::Tag::TheTags_empty("can't convert Tag comprising empty "
+                                      "TheTag std::wstring to std::wstring");
+}
+}
diff --git a/apertium/tag.h b/apertium/tag.h
new file mode 100644
index 0000000..9b6945c
--- /dev/null
+++ b/apertium/tag.h
@@ -0,0 +1,31 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef TAG_H
+#define TAG_H
+
+#include <string>
+
+namespace Apertium {
+class Tag {
+public:
+ friend bool operator==(const Tag &a, const Tag &b); // compares TheTag
+ friend bool operator<(const Tag &a, const Tag &b); // orders by TheTag
+ operator std::wstring() const; // yields L"<" + TheTag + L">"; throws when TheTag is empty
+ std::wstring TheTag; // the tag text without the surrounding angle brackets the conversion adds
+};
+} // namespace Apertium
+
+#endif // TAG_H
diff --git a/apertium/tagger.cc b/apertium/tagger.cc
deleted file mode 100644
index 630bfc5..0000000
--- a/apertium/tagger.cc
+++ /dev/null
@@ -1,763 +0,0 @@
-/*
- * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
- */
-/** PoS tagger main program.
- *
- * @author Felipe Sánchez-Martínez - fsanchez at dlsi.ua.es
- */
-
-#include <apertium/tagger.h>
-
-#include <apertium/hmm.h>
-#include <apertium/lswpost.h>
-#include <apertium/tagger_utils.h>
-#include <apertium/tsx_reader.h>
-#include <apertium/tagger_word.h>
-
-#include <cstdio>
-#include <fstream>
-#include <string>
-#include <libgen.h>
-#include <locale.h>
-
-#include <sys/types.h>
-#include <unistd.h>
-#include <getopt.h>
-#include <apertium/string_utils.h>
-#ifdef _MSC_VER
-#include <io.h>
-#include <fcntl.h>
-#endif
-
-using namespace Apertium;
-using namespace std;
-
-void
-Tagger::setShowSF(bool val) {
- showSF = val;
-}
-
-bool
-Tagger::getShowSF() {
- return showSF;
-}
-
-int
-Tagger::getMode(int argc, char *argv[]) {
- int mode=UNKNOWN_MODE;
-
- int c;
-
- is_sw = false;
-
-#if HAVE_GETOPT_LONG
- int option_index=0;
-#endif
-
- while (true) {
-#if HAVE_GETOPT_LONG
- static struct option long_options[] = {
- {"sliding-window", no_argument, 0, 'w'},
- {"train", required_argument, 0, 't'},
- {"supervised", required_argument, 0, 's'},
- {"retrain", required_argument, 0, 'r'},
- {"tagger", no_argument, 0, 'g'},
- {"show-superficial", no_argument, 0, 'p'},
- {"eval", no_argument, 0, 'e'},
- {"first", no_argument, 0, 'f'},
- {"help", no_argument, 0, 'h'},
- {"debug", no_argument, 0, 'd'},
- {"mark", no_argument, 0, 'm'},
- {"null-flush", no_argument, 0, 'z'},
- {"help", no_argument, 0, 'h'},
- {0, 0, 0, 0}
- };
-
- c=getopt_long(argc, argv, "wmdt:s:r:gpefhz",long_options, &option_index);
-#else
- c=getopt(argc, argv, "wmdt:s:r:gpefhz");
-#endif
- if (c==-1)
- break;
-
- switch (c) {
- case 'w':
- is_sw = true;
- if (mode == TRAIN_HMM_UNSUPERVISED_MODE) {
- mode = TRAIN_LSW_UNSUPERVISED_MODE;
- } else if (mode == TRAIN_HMM_SUPERVISED_MODE) {
- mode = TRAIN_LSW_SUPERVISED_MODE;
- } else if (mode == RETRAIN_HMM_UNSUPERVISED_MODE) {
- mode = RETRAIN_LSW_UNSUPERVISED_MODE;
- } else if (mode == TAGGER_HMM_MODE) {
- mode = TAGGER_LSW_MODE;
- } else if (mode == TAGGER_HMM_EVAL_MODE) {
- mode = TAGGER_LSW_EVAL_MODE;
- } else if (mode == TAGGER_HMM_FIRST_MODE) {
- mode = TAGGER_LSW_FIRST_MODE;
- }
- break;
- case 'm':
- TaggerWord::generate_marks = true;
- break;
-
- case 'd':
- debug=true;
- break;
-
- case 't': //Training
- if(!isNumber(optarg)) {
- wcerr <<L"Error: mandatory --train argument <n> must be zero or a positive integer\n";
- help();
- }
- else {
- nit = atoi(optarg); //Number of iterations
- }
- if(mode==UNKNOWN_MODE) {
- if (is_sw) {
- mode = TRAIN_LSW_UNSUPERVISED_MODE;
- }
- else {
- mode = TRAIN_HMM_UNSUPERVISED_MODE;
- }
- }
- else {
- wcerr<<L"Error: --train <n> argument cannot be mixed with --retrain or --tagger arguments\n";
- help();
- }
- break;
-
- case 's':
- if(!isNumber(optarg)) {
- wcerr<<L"Error: mandatory --supervised argument <n> must be zero or a positive integer\n";
- help();
- }
- else {
- nit = atoi(optarg); //Number of iterations
- }
-
- if(mode==UNKNOWN_MODE) {
- if (is_sw) {
- mode = TRAIN_LSW_SUPERVISED_MODE;
- }
- else {
- mode = TRAIN_HMM_SUPERVISED_MODE;
- }
- }
- else {
- wcerr<<L"Error: --supervised optional argument should only appear after --train <n> argument\n";
- help();
- }
- break;
-
- case 'p':
- setShowSF(true);
- break;
-
- case 'r':
- if(!isNumber(optarg)) {
- wcerr<<L"Error: mandatory --train argument <n> must be zero or a positive integer\n";
- help();
- }
- else {
- nit = atoi(optarg); //Number of iterations
- }
- if(mode==UNKNOWN_MODE) {
- if (is_sw) {
- mode = RETRAIN_LSW_UNSUPERVISED_MODE;
- }
- else {
- mode = RETRAIN_HMM_UNSUPERVISED_MODE;
- }
- }
- else {
- wcerr<<L"Error: --retrain argument cannot be mixed with --train or --tagger arguments\n";
- help();
- }
- break;
-
- case 'g':
- if(mode==UNKNOWN_MODE) {
- if (is_sw) {
- mode = TAGGER_LSW_MODE;
- }
- else {
- mode = TAGGER_HMM_MODE;
- }
- }
- else {
- wcerr<<L"Error: --tagger argument cannot be mixed with --train or --retrain arguments\n";
- help();
- }
- break;
-
- case 'e':
- if(mode==TAGGER_HMM_MODE) {
- mode = TAGGER_HMM_EVAL_MODE;
- }
- else if (mode == TAGGER_LSW_MODE) {
- mode = TAGGER_LSW_EVAL_MODE;
- }
- else {
- wcerr<<L"Error: --eval optional argument should only appear after --tagger argument\n";
- help();
- }
- break;
-
- case 'f':
- if(mode==TAGGER_HMM_MODE) {
- mode = TAGGER_HMM_FIRST_MODE;
- }
- else if (mode == TAGGER_LSW_MODE) {
- mode = TAGGER_LSW_FIRST_MODE;
- }
- else {
- wcerr<<L"Error: --first optional argument should only appear after --tagger argument\n";
- help();
- }
- break;
-
- case 'z':
- null_flush = true;
- break;
-
- case 'h':
- help();
- break;
-
- default:
- //wcerr<<L"Error: getopt() returned the char code '"<<c<<L"'\n";
- help();
- break;
- }
- }
-
- if(mode==UNKNOWN_MODE) {
- wcerr<<L"Error: Arguments missing\n";
- help();
- }
-
- switch(argc-optind) {
- case 6:
- if(mode != TRAIN_HMM_SUPERVISED_MODE
- && mode != TRAIN_LSW_SUPERVISED_MODE) {
- help();
- }
- break;
-
- case 4:
- if(mode != TRAIN_HMM_UNSUPERVISED_MODE
- && mode != TRAIN_LSW_UNSUPERVISED_MODE) {
- help();
- }
- break;
- case 3:
- if (mode != TAGGER_HMM_MODE
- && mode != TAGGER_HMM_FIRST_MODE
- && mode != TAGGER_LSW_MODE
- && mode != TAGGER_LSW_FIRST_MODE) {
- help();
- }
- break;
-
- case 2:
- if(mode != RETRAIN_HMM_UNSUPERVISED_MODE
- && mode != TAGGER_HMM_MODE
- && mode != RETRAIN_LSW_UNSUPERVISED_MODE
- && mode != TAGGER_LSW_MODE) {
- help();
- }
- break;
-
- case 1:
- if (mode != TAGGER_HMM_MODE
- && mode != TAGGER_HMM_FIRST_MODE
- && mode != TAGGER_LSW_MODE
- && mode != TAGGER_LSW_FIRST_MODE) {
- help();
- }
- break;
-
- default:
- help();
- break;
- }
-
- for(int i = optind; i != argc; i++) {
- filenames.push_back(argv[i]);
- }
-
- return mode;
-}
-
-Tagger::Tagger() {
- debug = false;
- showSF = false;
- null_flush = false;
-}
-
-void
-Tagger::main(int argc, char *argv[]) {
- name = argv[0];
- int mode = getMode(argc, argv);
-
- switch(mode) {
- case TRAIN_HMM_UNSUPERVISED_MODE:
- trainHMM();
- break;
-
- case TRAIN_LSW_UNSUPERVISED_MODE:
- trainLSW();
- break;
-
- case TRAIN_HMM_SUPERVISED_MODE:
- trainHMMSupervised();
- break;
-
- case TRAIN_LSW_SUPERVISED_MODE:
- trainLSWSupervised();
- break;
-
- case RETRAIN_HMM_UNSUPERVISED_MODE:
- retrainHMM();
- break;
-
- case RETRAIN_LSW_UNSUPERVISED_MODE:
- retrainLSW();
- break;
-
- case TAGGER_HMM_MODE:
- taggerHMM();
- break;
-
- case TAGGER_LSW_MODE:
- taggerLSW();
- break;
-
- case TAGGER_HMM_FIRST_MODE:
- taggerHMM(true);
- break;
-
- case TAGGER_LSW_FIRST_MODE:
- taggerLSW(true);
- break;
-
- default:
- wcerr<<L"Error: Unknown working mode mode\n";
- help();
- break;
- }
-}
-
-void
-Tagger::taggerHMM(bool mode_first) {
- FILE *ftdata = fopen(filenames[0].c_str(), "rb");
- if (!ftdata) {
- filerror(filenames[0]);
- }
-
- TaggerDataHMM tdhmm;
- tdhmm.read(ftdata);
- fclose(ftdata);
-
- HMM hmm(&tdhmm);
-
- hmm.set_show_sf(showSF);
- hmm.setNullFlush(null_flush);
-
- if(filenames.size() == 1) {
- hmm.tagger(stdin, stdout, mode_first);
- }
- else {
- FILE *finput = fopen(filenames[1].c_str(), "r");
- if (!finput) {
- filerror(filenames[1]);
- }
-#ifdef _MSC_VER
- _setmode(_fileno(finput), _O_U8TEXT);
-#endif
- if(filenames.size() == 2) {
- hmm.tagger(finput, stdout, mode_first);
- }
- else {
- FILE *foutput = fopen(filenames[2].c_str(), "w");
- if (!foutput) {
- filerror(filenames[2]);
- }
-#ifdef _MSC_VER
- _setmode(_fileno(foutput), _O_U8TEXT);
-#endif
-
- hmm.tagger(finput, foutput, mode_first);
- fclose(foutput);
- }
- fclose(finput);
- }
-}
-
-void
-Tagger::taggerLSW(bool mode_first) {
- FILE *ftdata = fopen(filenames[0].c_str(), "rb");
- if (!ftdata) {
- filerror(filenames[0]);
- }
-
- TaggerDataLSW tdlsw;
- tdlsw.read(ftdata);
- fclose(ftdata);
-
- LSWPoST lswpost(&tdlsw);
- lswpost.set_show_sf(showSF);
- lswpost.setNullFlush(null_flush);
-
- if(filenames.size() == 1) {
- lswpost.tagger(stdin, stdout, mode_first);
- }
- else {
- FILE *finput = fopen(filenames[1].c_str(), "r");
- if (!finput) {
- filerror(filenames[1]);
- }
-#ifdef _MSC_VER
- _setmode(_fileno(finput), _O_U8TEXT);
-#endif
- if(filenames.size() == 2) {
- lswpost.tagger(finput, stdout, mode_first);
- }
- else {
- FILE *foutput = fopen(filenames[2].c_str(), "w");
- if (!foutput) {
- filerror(filenames[2]);
- }
-#ifdef _MSC_VER
- _setmode(_fileno(foutput), _O_U8TEXT);
-#endif
-
- lswpost.tagger(finput, foutput, mode_first);
- fclose(foutput);
- }
- fclose(finput);
- }
-}
-
-void
-Tagger::filerror(string const &filename) {
- cerr << "Error: cannot open file '" << filenames[0] << "'\n\n";
- help();
-}
-
-void
-Tagger::trainHMM() {
- TSXReader treader;
- treader.read(filenames[2]);
- TaggerDataHMM tdhmm(treader.getTaggerData());
- HMM hmm(&tdhmm);
- hmm.set_debug(debug);
- hmm.set_eos((tdhmm.getTagIndex())[L"TAG_SENT"]);
- TaggerWord::setArrayTags(tdhmm.getArrayTags());
-
- wcerr << L"Calculating ambiguity classes...\n";
- FILE *fdic = fopen(filenames[0].c_str(), "r");
- if(fdic) {
- hmm.read_dictionary(fdic);
- }
- else {
- filerror(filenames[0]);
- }
- wcerr << L"Kupiec's initialization of transition and emission probabilities...\n";
- FILE *fcrp = fopen(filenames[1].c_str(), "r");
- if(fcrp) {
-#ifdef _MSC_VER
- _setmode(_fileno(fcrp), _O_U8TEXT);
-#endif
- hmm.init_probabilities_kupiec(fcrp);
- }
- else {
- filerror(filenames[1]);
- }
-
- wcerr << L"Applying forbid and enforce rules...\n";
- hmm.apply_rules();
-
- wcerr << L"Training (Baum-Welch)...\n";
- for(int i=0; i != nit; i++) {
- fseek(fcrp, 0, SEEK_SET);
- hmm.train(fcrp);
- }
- wcerr << L"Applying forbid and enforce rules...\n";
- hmm.apply_rules();
-
- fclose(fdic);
- fclose(fcrp);
-
- FILE *ftdata = fopen(filenames[3].c_str(), "wb");
- if(!ftdata) {
- filerror(filenames[3]);
- }
- tdhmm.write(ftdata);
- fclose(ftdata);
-}
-
-void
-Tagger::trainLSW() {
- TSXReader treader;
- treader.read(filenames[2]);
- TaggerDataLSW tdlsw(treader.getTaggerData());
- LSWPoST lswpost(&tdlsw);
- lswpost.set_debug(debug);
- lswpost.set_eos(tdlsw.getTagIndex()[L"TAG_SENT"]);
- TaggerWord::setArrayTags(tdlsw.getArrayTags());
-
- wcerr << L"Calculating ambiguity classes...\n";
- FILE *fdic = fopen(filenames[0].c_str(), "r");
- if(fdic) {
- lswpost.read_dictionary(fdic);
- }
- else {
- filerror(filenames[0]);
- }
- wcerr << L"Average initialization of Light Sliding-Window probabilities, with forbid and enforce rules...\n";
- FILE *fcrp = fopen(filenames[1].c_str(), "r");
- if(fcrp) {
-#ifdef _MSC_VER
- _setmode(_fileno(fcrp), _O_U8TEXT);
-#endif
- lswpost.init_probabilities(fcrp);
- }
- else {
- filerror(filenames[1]);
- }
-
- wcerr << L"Training (Light Sliding-Window, Unsupervised)...\n";
- for(int i=0; i != nit; i++) {
- fseek(fcrp, 0, SEEK_SET);
- lswpost.train(fcrp);
- wcout << L"iteration " << (i + 1) << " done." << endl;
- }
-
- fclose(fdic);
- fclose(fcrp);
-
- FILE *ftdata = fopen(filenames[3].c_str(), "wb");
- if(!ftdata) {
- filerror(filenames[3]);
- }
- tdlsw.write(ftdata);
- fclose(ftdata);
-}
-
-void
-Tagger::trainHMMSupervised() {
- TSXReader treader;
- treader.read(filenames[2]);
- TaggerDataHMM tdhmm(treader.getTaggerData());
- HMM hmm(&tdhmm);
- hmm.set_debug(debug);
- hmm.set_eos(tdhmm.getTagIndex()[L"TAG_SENT"]);
- TaggerWord::setArrayTags(tdhmm.getArrayTags());
-
- wcerr << L"Calculating ambiguity classes...\n";
- FILE *fdic = fopen(filenames[0].c_str(), "r");
- if(fdic) {
- hmm.read_dictionary(fdic);
- }
- else {
- filerror(filenames[0]);
- }
- wcerr << L"Kupiec's initialization of transition and emission probabilities...\n";
- FILE *ftagged = fopen(filenames[4].c_str(), "r");
- FILE *funtagged = fopen(filenames[5].c_str(), "r");
- if(ftagged && funtagged) {
-#ifdef _MSC_VER
- _setmode(_fileno(ftagged), _O_U8TEXT);
- _setmode(_fileno(funtagged), _O_U8TEXT);
-#endif
- wcerr << L"Initializing transition and emission probabilities from a hand-tagged corpus...\n";
- hmm.init_probabilities_from_tagged_text(ftagged, funtagged);
- }
- else {
- filerror(filenames[4]+ "' or '" + filenames[5]);
- }
- fclose(ftagged);
- fclose(funtagged);
-
- wcerr << L"Applying forbid and enforce rules...\n";
- hmm.apply_rules();
-
- wcerr << L"Training (Baum-Welch)...\n";
- FILE *fcrp = fopen(filenames[1].c_str(), "r");
- if(fcrp) {
-#ifdef _MSC_VER
- _setmode(_fileno(fcrp), _O_U8TEXT);
-#endif
- for(int i=0; i != nit; i++) {
- fseek(fcrp, 0, SEEK_SET);
- hmm.train(fcrp);
- }
- wcerr << L"Applying forbid and enforce rules...\n";
- hmm.apply_rules();
- }
- else {
- filerror(filenames[1]);
- }
-
- fclose(fdic);
- fclose(fcrp);
-
- FILE *ftdata = fopen(filenames[3].c_str(), "wb");
- if(!ftdata) {
- filerror(filenames[3]);
- }
- tdhmm.write(ftdata);
- fclose(ftdata);
-}
-
-void
-Tagger::trainLSWSupervised() {
-}
-
-void
-Tagger::retrainHMM() {
- TaggerDataHMM tdhmm;
- FILE *ftdata = fopen(filenames[1].c_str(), "rb");
- if(!ftdata) {
- filerror(filenames[1]);
- }
- tdhmm.read(ftdata);
- fclose(ftdata);
-
- HMM hmm(&tdhmm);
- hmm.set_debug(debug);
- hmm.set_eos((tdhmm.getTagIndex())[L"TAG_SENT"]);
- TaggerWord::setArrayTags(tdhmm.getArrayTags());
-
- FILE *fcrp = fopen(filenames[0].c_str(), "r");
- if(!fcrp) {
- filerror(filenames[0]);
- }
-#ifdef _MSC_VER
- _setmode(_fileno(fcrp), _O_U8TEXT);
-#endif
- wcerr << L"Training (Baum-Welch)...\n";
- for(int i=0; i != nit; i++) {
- fseek(fcrp, 0, SEEK_SET);
- hmm.train(fcrp);
- }
- wcerr << L"Applying forbid and enforce rules...\n";
- hmm.apply_rules();
- fclose(fcrp);
-
- ftdata = fopen(filenames[1].c_str(), "wb");
- if(!ftdata) {
- filerror(filenames[1]);
- }
- tdhmm.write(ftdata);
- fclose(ftdata);
-}
-
-void
-Tagger::retrainLSW() {
- TaggerDataLSW tdlsw;
- FILE *ftdata = fopen(filenames[1].c_str(), "rb");
- if(!ftdata) {
- filerror(filenames[1]);
- }
- tdlsw.read(ftdata);
- fclose(ftdata);
-
- LSWPoST lswpost(&tdlsw);
- lswpost.set_debug(debug);
- lswpost.set_eos((tdlsw.getTagIndex())[L"TAG_SENT"]);
- TaggerWord::setArrayTags(tdlsw.getArrayTags());
-
- FILE *fcrp = fopen(filenames[0].c_str(), "r");
- if(!fcrp) {
- filerror(filenames[0]);
- }
-#ifdef _MSC_VER
- _setmode(_fileno(fcrp), _O_U8TEXT);
-#endif
- wcerr << L"Training (Light Sliding-Window, Unsupervised)...\n";
- for(int i=0; i != nit; i++) {
- fseek(fcrp, 0, SEEK_SET);
- lswpost.train(fcrp);
- wcout << L"iteration " << (i + 1) << " done." << endl;
- }
- fclose(fcrp);
-
- ftdata = fopen(filenames[1].c_str(), "wb");
- if(!ftdata) {
- filerror(filenames[1]);
- }
- tdlsw.write(ftdata);
- fclose(ftdata);
-}
-
-void
-Tagger::help() {
- ostream &out = cerr;
- char* localname=new char[name.size()+1];
- strcpy(localname, name.c_str());
- out << basename(localname) << ": HMM/LSW part-of-speech tagging and training program" << endl;
- out << "GENERIC USAGE: " << basename(localname) << "[-d] <OPTION>=[PARAM] [FILES]" << endl;
- out << "USAGE: " << basename(localname) << "[-d] [-w] -t=n DIC CRP TSX TAGGER_DATA" << endl;
- out << " " << basename(localname) << "[-d] [-w] -s=n DIC CRP TSX TAGGER_DATA HTAG UNTAG" << endl;
- out << " " << basename(localname) << "[-d] [-w] -r=n CRP TAGGER_DATA" << endl;
- out << " " << basename(localname) << "[-d] [-w] -g [-f] TAGGER_DATA [INPUT [OUTPUT]]" << endl;
- out << endl;
- out << "Where OPTIONS are:" << endl;
- out << " -w, --sliding-window: use the Sliding-Window training and tagging algorithm," << endl;
- out << " or if not specified, use the HMM algorithm by default" << endl;
- out << " -t, --train=n: performs n iterations of training (unsupervised)" << endl;
- out << " -s, --supervised=n: initializes parameters against a hand-tagged text (supervised)," << endl;
- out << " and trains it with n iterations" << endl;
- out << " -r, --retrain=n: retrains the model with n additional iterations (unsupervised)" << endl;
- out << " -g, --tagger: tags input text" << endl;
- out << " -p, --show-superficial: show superficial forms in the output stream" << endl;
- out << " -f, --first: used in conjuntion with -g (--tagger) makes the tagger"<< endl;
- out << " give all lexical forms of each word, with the chosen" << endl;
- out << " one in the first place (after the lemma)"<<endl;
- out << " -d, --debug: print error mesages when tagging input text" << endl;
- out << " -m, --mark: generate marks of solved ambiguities" << endl;
- out << " -z, --null-flush: flush output stream when reading '\\0' characters" <<endl;
- out << endl;
- out << "And FILES are:" << endl;
- out << " DIC: full expanded dictionary file" << endl;
- out << " CRP: training text corpus file" << endl;
- out << " TSX: tagger specification file, in XML format" << endl;
- out << " TAGGER_DATA: tagger data file, built in the training and used while" << endl;
- out << " tagging" << endl;
- out << " HTAG: hand-tagged text corpus" << endl;
- out << " UNTAG: untagged text corpus, morphological analysis of HTAG" << endl;
- out << " corpus to use both jointly with -s option" << endl;
- out << " INPUT: input file, stdin by default" << endl;
- out << " OUTPUT: output file, stdout by default" << endl;
- delete[] localname;
- exit(EXIT_FAILURE);
-}
-
-bool
-Tagger::isNumber(const char *str) {
- for(unsigned int i = 0, limit = strlen(str); i != limit; i++) {
- if(!isdigit(str[i])) {
- return false;
- }
- }
-
- return true;
-}
diff --git a/apertium/tagger.dtd b/apertium/tagger.dtd
index 7bdb72b..db5025f 100644
--- a/apertium/tagger.dtd
+++ b/apertium/tagger.dtd
@@ -14,9 +14,7 @@
General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA.
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
DTD for the tagset and the rules to enforce the state to state
transition probabilities used by the part-of-speech tagger.
diff --git a/apertium/tagger.h b/apertium/tagger.h
deleted file mode 100644
index 3d2a95d..0000000
--- a/apertium/tagger.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
- */
-#ifndef __TAGGER_
-#define __TAGGER_
-
-#include <cstdio>
-#include <fstream>
-#include <map>
-#include <set>
-
-#include <apertium/constant_manager.h>
-#include <apertium/tagger_data_hmm.h>
-#include <apertium/tagger_data_lsw.h>
-#include <apertium/ttag.h>
-
-using namespace std;
-
-class Tagger
-{
-private:
-
- enum Mode{UNKNOWN_MODE,
- TRAIN_HMM_UNSUPERVISED_MODE,
- TRAIN_HMM_SUPERVISED_MODE,
- RETRAIN_HMM_UNSUPERVISED_MODE,
- RETRAIN_HMM_SUPERVISED_MODE,
- TAGGER_HMM_MODE,
- TAGGER_HMM_EVAL_MODE,
- TAGGER_HMM_FIRST_MODE,
- TRAIN_LSW_UNSUPERVISED_MODE,
- TRAIN_LSW_SUPERVISED_MODE,
- RETRAIN_LSW_UNSUPERVISED_MODE,
- RETRAIN_LSW_SUPERVISED_MODE,
- TAGGER_LSW_MODE,
- TAGGER_LSW_EVAL_MODE,
- TAGGER_LSW_FIRST_MODE};
-
- vector<string> filenames;
- int nit;
- string name;
- bool debug;
-
- bool showSF; // show superficial forms
- bool null_flush; // flush on '\0'
- bool is_sw; // use Sliding-Window algorithm, other than HMM
-
- void setShowSF(bool val);
- bool getShowSF();
-
- int getMode(int argc, char *argv[]);
- void taggerHMM(bool model_first=false);
- void taggerLSW(bool model_first=false);
- void trainHMM();
- void trainLSW();
- void retrainHMM();
- void retrainLSW();
- void trainHMMSupervised();
- void trainLSWSupervised();
- void help();
- void filerror(string const &filename);
- bool isNumber(const char *str);
-public:
- Tagger();
- void main(int argc, char *argv[]);
-};
-
-#endif
diff --git a/apertium/tagger.rnc b/apertium/tagger.rnc
new file mode 100644
index 0000000..f14288c
--- /dev/null
+++ b/apertium/tagger.rnc
@@ -0,0 +1,122 @@
+# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+# DTD for the tagset and the rules to enforce the state to state
+# transition probabilities used by the part-of-speech tagger.
+# 2005.07.29.
+
+tagger =
+ element tagger {
+ attlist.tagger,
+ tagset,
+ forbid?,
+ enforce-rules?,
+ preferences?,
+ discard-on-ambiguity?
+ }
+attlist.tagger &= attribute name { text }
+# 'tagger' is the root element containing the whole tagset for a given
+# language specified through the mandatory attribute 'name'
+tagset = element tagset { attlist.tagset, def-label+, def-mult* }
+attlist.tagset &= empty
+# The 'tagset' section defines the correspondence between simple
+# or multiple morphological categories defining a lexical form and the coarser
+# ones with which the part-of-speech tagger works
+def-label = element def-label { attlist.def-label, tags-item+ }
+attlist.def-label &=
+ attribute name { text },
+ attribute c { text }?,
+ attribute closed { text }?
+# Each 'def-label' defines one coarse tag in terms of a list of fine tags
+# and has a mandatory unique name. The optional attribute 'closed="true"' may be used
+# to specify if the defined fine tags belong to a closed list.
+# c is for comments and is ignored
+tags-item = element tags-item { attlist.tags-item, empty }
+attlist.tags-item &=
+ attribute tags { text },
+ attribute lemma { text }?
+# Each 'tags-item' may be a dot-separated subsequence of the morphological tags
+# corresponding to a coarse tag optionally in association with a given lemma
+def-mult = element def-mult { attlist.def-mult, sequence+ }
+attlist.def-mult &=
+ attribute name { text },
+ attribute c { text }?,
+ attribute closed { text }?
+# Each 'def-mult' defines one coarse tag in terms of a sequence of coarse
+# tags previously defined as 'def-labels' or a sequence of fine tags. A mandatory
+# name is required for each 'def-mult' which may also have an optional attribute
+# 'closed="true"' if it belongs to a closed list
+# c is for comments and is ignored
+sequence =
+ element sequence { attlist.sequence, (tags-item | label-item)+ }
+attlist.sequence &= empty
+# Element 'sequence' encloses a set of tags or labels which defines
+# a unit with more than one label
+label-item = element label-item { attlist.label-item, empty }
+attlist.label-item &=
+ attribute label { text },
+ attribute c { text }?
+# Each 'label' of the 'label-item' corresponds to a coarse tag previously
+# defined as a 'def-label' by a name.
+# c is for comments and is ignored
+forbid = element forbid { attlist.forbid, label-sequence+ }
+attlist.forbid &= empty
+# Element 'forbid' contains sequences of morphological categories that are not
+# allowed in a given language
+label-sequence =
+ element label-sequence { attlist.label-sequence, label-item+ }
+attlist.label-sequence &= attribute c { text }?
+# Each 'label-sequence' is restricted to two 'label-items'
+# c is for comments and is ignored
+enforce-rules =
+ element enforce-rules { attlist.enforce-rules, enforce-after+ }
+attlist.enforce-rules &= empty
+# Element 'enforce-rules' defines sets of coarse tags that must follow specified ones
+enforce-after =
+ element enforce-after { attlist.enforce-after, label-set }
+attlist.enforce-after &=
+ attribute label { text },
+ attribute c { text }?
+# Each 'enforce-after' encloses the set of coarse tags ('label-set') that must follow
+# the one defined in 'label', as a mandatory attribute
+# c is for comments and is ignored
+label-set = element label-set { attlist.label-set, label-item+ }
+attlist.label-set &= attribute c { text }?
+# The set of 'label-items' enforced after a 'label' are enclosed inside element 'label-set'
+# c is for comments and is ignored
+preferences = element preferences { attlist.preferences, prefer+ }
+attlist.preferences &= empty
+# Element 'preferences' allows to decide amongst two or more fine tag sequences
+# which are grouped in the same coarse tag.
+prefer = element prefer { attlist.prefer, empty }
+attlist.prefer &=
+ attribute tags { text },
+ attribute c { text }?
+# Each 'prefer' element has a mandatory attribute 'tags' made of a sequence of fine tags
+# c is for comments and is ignored
+discard-on-ambiguity =
+ element discard-on-ambiguity {
+ attlist.discard-on-ambiguity, discard+
+ }
+attlist.discard-on-ambiguity &= empty
+# List of label-item or tags-item to be discarded when an ambiguity
+# occurs inside a word
+discard = element discard { attlist.discard, empty }
+attlist.discard &=
+ attribute tags { text },
+ attribute c { text }?
+start = tagger
+# Each 'discard' element has a mandatory attribute 'tags' made of a sequence of fine tags
+# c is for comments and is ignored
diff --git a/apertium/tagger.rng b/apertium/tagger.rng
new file mode 100644
index 0000000..a30f3b9
--- /dev/null
+++ b/apertium/tagger.rng
@@ -0,0 +1,310 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<!--
+ Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+ DTD for the tagset and the rules to enforce the state to state
+ transition probabilities used by the part-of-speech tagger.
+ 2005.07.29.
+-->
+<grammar xmlns="http://relaxng.org/ns/structure/1.0">
+ <define name="tagger">
+ <element name="tagger">
+ <ref name="attlist.tagger"/>
+ <ref name="tagset"/>
+ <optional>
+ <ref name="forbid"/>
+ </optional>
+ <optional>
+ <ref name="enforce-rules"/>
+ </optional>
+ <optional>
+ <ref name="preferences"/>
+ </optional>
+ <optional>
+ <ref name="discard-on-ambiguity"/>
+ </optional>
+ </element>
+ </define>
+ <define name="attlist.tagger" combine="interleave">
+ <attribute name="name"/>
+ </define>
+ <!--
+ 'tagger' is the root element containing the whole tagset for a given
+ language specified through the mandatory attribute 'name'
+ -->
+ <define name="tagset">
+ <element name="tagset">
+ <ref name="attlist.tagset"/>
+ <oneOrMore>
+ <ref name="def-label"/>
+ </oneOrMore>
+ <zeroOrMore>
+ <ref name="def-mult"/>
+ </zeroOrMore>
+ </element>
+ </define>
+ <define name="attlist.tagset" combine="interleave">
+ <empty/>
+ </define>
+ <!--
+ The 'tagset' section defines the correspondence between simple
+ or multiple morphological categories defining a lexical form and the coarser
+ ones with which the part-of-speech tagger works
+ -->
+ <define name="def-label">
+ <element name="def-label">
+ <ref name="attlist.def-label"/>
+ <oneOrMore>
+ <ref name="tags-item"/>
+ </oneOrMore>
+ </element>
+ </define>
+ <define name="attlist.def-label" combine="interleave">
+ <attribute name="name"/>
+ <optional>
+ <attribute name="c"/>
+ </optional>
+ <optional>
+ <attribute name="closed"/>
+ </optional>
+ </define>
+ <!--
+ Each 'def-label' defines one coarse tag in terms of a list of fine tags
+ and has a mandatory unique name. The optional attribute 'closed="true"' may be used
+ to specify if the defined fine tags belong to a closed list.
+ c is for comments and is ignored
+ -->
+ <define name="tags-item">
+ <element name="tags-item">
+ <ref name="attlist.tags-item"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.tags-item" combine="interleave">
+ <attribute name="tags"/>
+ <optional>
+ <attribute name="lemma"/>
+ </optional>
+ </define>
+ <!--
+ Each 'tags-item' may be a dot-separated subsequence of the morphological tags
+ corresponding to a coarse tag optionally in association with a given lemma
+ -->
+ <define name="def-mult">
+ <element name="def-mult">
+ <ref name="attlist.def-mult"/>
+ <oneOrMore>
+ <ref name="sequence"/>
+ </oneOrMore>
+ </element>
+ </define>
+ <define name="attlist.def-mult" combine="interleave">
+ <attribute name="name"/>
+ <optional>
+ <attribute name="c"/>
+ </optional>
+ <optional>
+ <attribute name="closed"/>
+ </optional>
+ </define>
+ <!--
+ Each 'def-mult' defines one coarse tag in terms of a sequence of coarse
+ tags previously defined as 'def-labels' or a sequence of fine tags. A mandatory
+ name is required for each 'def-mult' which may also have an optional attribute
+ 'closed="true"' if it belongs to a closed list
+ c is for comments and is ignored
+ -->
+ <define name="sequence">
+ <element name="sequence">
+ <ref name="attlist.sequence"/>
+ <oneOrMore>
+ <choice>
+ <ref name="tags-item"/>
+ <ref name="label-item"/>
+ </choice>
+ </oneOrMore>
+ </element>
+ </define>
+ <define name="attlist.sequence" combine="interleave">
+ <empty/>
+ </define>
+ <!--
+ Element 'sequence' encloses a set of tags or labels which defines
+ a unit with more than one label
+ -->
+ <define name="label-item">
+ <element name="label-item">
+ <ref name="attlist.label-item"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.label-item" combine="interleave">
+ <attribute name="label"/>
+ <optional>
+ <attribute name="c"/>
+ </optional>
+ </define>
+ <!--
+ Each 'label' of the 'label-item' corresponds to a coarse tag previously
+ defined as a 'def-label' by a name.
+ c is for comments and is ignored
+ -->
+ <define name="forbid">
+ <element name="forbid">
+ <ref name="attlist.forbid"/>
+ <oneOrMore>
+ <ref name="label-sequence"/>
+ </oneOrMore>
+ </element>
+ </define>
+ <define name="attlist.forbid" combine="interleave">
+ <empty/>
+ </define>
+ <!--
+ Element 'forbid' contains sequences of morphological categories that are not
+ allowed in a given language
+ -->
+ <define name="label-sequence">
+ <element name="label-sequence">
+ <ref name="attlist.label-sequence"/>
+ <oneOrMore>
+ <ref name="label-item"/>
+ </oneOrMore>
+ </element>
+ </define>
+ <define name="attlist.label-sequence" combine="interleave">
+ <optional>
+ <attribute name="c"/>
+ </optional>
+ </define>
+ <!--
+ Each 'label-sequence' is restricted to two 'label-items'
+ c is for comments and is ignored
+ -->
+ <define name="enforce-rules">
+ <element name="enforce-rules">
+ <ref name="attlist.enforce-rules"/>
+ <oneOrMore>
+ <ref name="enforce-after"/>
+ </oneOrMore>
+ </element>
+ </define>
+ <define name="attlist.enforce-rules" combine="interleave">
+ <empty/>
+ </define>
+ <!-- Element 'enforce-rules' defines sets of coarse tags that must follow specified ones -->
+ <define name="enforce-after">
+ <element name="enforce-after">
+ <ref name="attlist.enforce-after"/>
+ <ref name="label-set"/>
+ </element>
+ </define>
+ <define name="attlist.enforce-after" combine="interleave">
+ <attribute name="label"/>
+ <optional>
+ <attribute name="c"/>
+ </optional>
+ </define>
+ <!--
+ Each 'enforce-after' encloses the set of coarse tags ('label-set') that must follow
+ the one defined in 'label', as a mandatory attribute
+ c is for comments and is ignored
+ -->
+ <define name="label-set">
+ <element name="label-set">
+ <ref name="attlist.label-set"/>
+ <oneOrMore>
+ <ref name="label-item"/>
+ </oneOrMore>
+ </element>
+ </define>
+ <define name="attlist.label-set" combine="interleave">
+ <optional>
+ <attribute name="c"/>
+ </optional>
+ </define>
+ <!--
+ The set of 'label-items' enforced after a 'label' are enclosed inside element 'label-set'
+ c is for comments and is ignored
+ -->
+ <define name="preferences">
+ <element name="preferences">
+ <ref name="attlist.preferences"/>
+ <oneOrMore>
+ <ref name="prefer"/>
+ </oneOrMore>
+ </element>
+ </define>
+ <define name="attlist.preferences" combine="interleave">
+ <empty/>
+ </define>
+ <!--
+ Element 'preferences' allows to decide amongst two or more fine tag sequences
+ which are grouped in the same coarse tag.
+ -->
+ <define name="prefer">
+ <element name="prefer">
+ <ref name="attlist.prefer"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.prefer" combine="interleave">
+ <attribute name="tags"/>
+ <optional>
+ <attribute name="c"/>
+ </optional>
+ </define>
+ <!--
+ Each 'prefer' element has a mandatory attribute 'tags' made of a sequence of fine tags
+ c is for comments and is ignored
+ -->
+ <define name="discard-on-ambiguity">
+ <element name="discard-on-ambiguity">
+ <ref name="attlist.discard-on-ambiguity"/>
+ <oneOrMore>
+ <ref name="discard"/>
+ </oneOrMore>
+ </element>
+ </define>
+ <define name="attlist.discard-on-ambiguity" combine="interleave">
+ <empty/>
+ </define>
+ <!--
+ List of label-item or tags-item to be discarded when an ambiguity
+ occurs inside a word
+ -->
+ <define name="discard">
+ <element name="discard">
+ <ref name="attlist.discard"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.discard" combine="interleave">
+ <attribute name="tags"/>
+ <optional>
+ <attribute name="c"/>
+ </optional>
+ </define>
+ <start>
+ <choice>
+ <ref name="tagger"/>
+ </choice>
+ </start>
+</grammar>
+<!--
+ Each 'discard' element has a mandatory attribute 'tags' made of a sequence of fine tags
+ c is for comments and is ignored
+-->
diff --git a/apertium/tagger_data.cc b/apertium/tagger_data.cc
index 2a1f7b2..b55d9fb 100644
--- a/apertium/tagger_data.cc
+++ b/apertium/tagger_data.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/tagger_data.h>
#include <lttoolbox/compression.h>
diff --git a/apertium/tagger_data.h b/apertium/tagger_data.h
index aff8303..31f8b0c 100644
--- a/apertium/tagger_data.h
+++ b/apertium/tagger_data.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TAGGERDATA_
#define _TAGGERDATA_
diff --git a/apertium/tagger_data_hmm.cc b/apertium/tagger_data_hmm.cc
index 70fd29c..8c5d984 100644
--- a/apertium/tagger_data_hmm.cc
+++ b/apertium/tagger_data_hmm.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/hmm.h>
#include <apertium/tagger_data_hmm.h>
@@ -115,7 +113,7 @@ TaggerDataHMM::setProbabilities(int const myN, int const myM,
a[i] = new double[N];
if(myA != NULL)
{
- for(int j = 0; j != N; j++)
+ for(int j = 0; j != N; j++) // ToDo: N should be M? Check use of N and M in this function
{
a[i][j] = myA[i][j];
}
diff --git a/apertium/tagger_data_hmm.h b/apertium/tagger_data_hmm.h
index b2820cf..ecd5177 100644
--- a/apertium/tagger_data_hmm.h
+++ b/apertium/tagger_data_hmm.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TAGGERDATAHMM_
#define _TAGGERDATAHMM_
diff --git a/apertium/tagger_data_lsw.cc b/apertium/tagger_data_lsw.cc
index 76b2e6c..507c9f2 100644
--- a/apertium/tagger_data_lsw.cc
+++ b/apertium/tagger_data_lsw.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/lswpost.h>
#include <apertium/tagger_data_lsw.h>
diff --git a/apertium/tagger_data_lsw.h b/apertium/tagger_data_lsw.h
index e081b9d..a615e93 100644
--- a/apertium/tagger_data_lsw.h
+++ b/apertium/tagger_data_lsw.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TAGGERDATALSW_
#define _TAGGERDATALSW_
diff --git a/apertium/tagger_utils.cc b/apertium/tagger_utils.cc
index 125871f..31bc954 100644
--- a/apertium/tagger_utils.cc
+++ b/apertium/tagger_utils.cc
@@ -12,11 +12,11 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
+
#include <apertium/tagger_utils.h>
+#include <apertium/morpho_stream.h>
#include <stdio.h>
#include <apertium/string_utils.h>
@@ -121,7 +121,103 @@ wstring tagger_utils::trim(wstring s)
return s;
}
-
+
+void
+tagger_utils::read_dictionary(FILE *fdic, TaggerData &td) { // Looks up the tag set of every word read from fdic in td's output collection
+ int i, k, nw = 0; // k only receives lookup results and is never read; nw counts words for the progress dots
+ TaggerWord *word = NULL;
+ set <TTag> tags;
+ Collection &output = td.getOutput();
+
+ MorphoStream morpho_stream(fdic, true, &td);
+
+ // In the input dictionary there must be all punctuation marks, including the end-of-sentence mark
+
+ word = morpho_stream.get_next_word();
+
+ while (word) {
+ if (++nw % 10000 == 0) // progress indicator: one dot on wcerr per 10000 words
+ wcerr << L'.' << flush;
+
+ tags = word->get_tags();
+
+ if (tags.size() > 0)
+ k = output[tags]; // presumably operator[] registers a tag set not seen before -- TODO confirm in Collection
+
+ delete word; // each word returned by get_next_word() is freed here before fetching the next one
+ word = morpho_stream.get_next_word();
+ }
+ wcerr << L"\n";
+
+ // OPEN AMBIGUITY CLASS
+ // It contains all tags that are not closed.
+ // Unknown words are assigned the open ambiguity class
+ k = output[td.getOpenClass()];
+
+ // Create ambiguity class holding one single tag for each tag.
+ // If not created yet
+ int N = (td.getTagIndex()).size();
+ for(i = 0; i != N; i++) {
+ set<TTag> amb_class;
+ amb_class.insert(i); // single-element class containing only tag i
+ k = output[amb_class];
+ }
+}
+
+set<TTag>
+tagger_utils::find_similar_ambiguity_class(TaggerData &td, set<TTag> &c) { // Returns the largest class in td's output that is a subset of c and smaller than c
+ int size_ret = -1; // size of the best candidate found so far; -1 means none yet
+ set<TTag> ret = td.getOpenClass(); // return open-class as default, if no better is found.
+ bool skip_class;
+ Collection &output = td.getOutput();
+
+ for(int k=0; k<output.size(); k++) {
+ if ((((int)output[k].size())>((int)size_ret)) && (((int)output[k].size())<((int)c.size()))) { // strictly larger than the best so far, strictly smaller than c
+ skip_class = false;
+ // Test if output[k] is a subset of c
+ for(set<TTag>::const_iterator it=output[k].begin(); it!=output[k].end(); it++) {
+ if (c.find(*it)==c.end()) {
+ skip_class = true; // output[k] is not a subset of c
+ break;
+ }
+ }
+ if (!skip_class) {
+ size_ret = output[k].size(); // equal-sized later candidates fail the '>' test above, so the first one wins
+ ret = output[k];
+ }
+ }
+ }
+ return ret;
+}
+
+void
+tagger_utils::require_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word) { // Reports a fatal error when tags is not in td's output collection
+ if (td.getOutput().has_not(tags)) {
+ wstring errors; // message assembled from the word's superficial form and its tag string
+ errors = L"A new ambiguity class was found. I cannot continue.\n";
+ errors+= L"Word '" + word.get_superficial_form() + L"' not found in the dictionary.\n";
+ errors+= L"New ambiguity class: " + word.get_string_tags() + L"\n";
+ errors+= L"Take a look at the dictionary, then retrain.";
+ fatal_error(errors); // presumably terminates the program -- TODO confirm in fatal_error()
+ }
+}
+
+set<TTag>
+tagger_utils::require_similar_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word, bool debug) { // Returns tags if td's output has it, else a similar class
+ if (td.getOutput().has_not(tags)) {
+ if (debug) { // the diagnostic is only printed in debug mode; the fallback below happens either way
+ wstring errors;
+ errors = L"A new ambiguity class was found. \n";
+ errors += L"Retraining the tagger is necessary so as to take it into account.\n";
+ errors += L"Word '" + word.get_superficial_form() + L"'.\n";
+ errors += L"New ambiguity class: " + word.get_string_tags() + L"\n";
+ wcerr << L"Error: " << errors; // written to wcerr only; execution continues
+ }
+ return find_similar_ambiguity_class(td, tags); // fall back to a subset class, or the open class if none is found
+ }
+ return tags; // tags is already present in td's output: hand it back unchanged
+}
+
template <class T>
ostream& operator<< (ostream& os, const map <int, T> & f){
typename map <int, T>::const_iterator it;
diff --git a/apertium/tagger_utils.h b/apertium/tagger_utils.h
index 72daab8..c9bbda3 100644
--- a/apertium/tagger_utils.h
+++ b/apertium/tagger_utils.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __TAGGERUTILS_H
#define __TAGGERUTILS_H
@@ -27,6 +25,8 @@
#include <vector>
#include <apertium/ttag.h>
#include <cstdlib>
+#include <apertium/tagger_data.h>
+#include <apertium/tagger_word.h>
using namespace std;
@@ -68,6 +68,29 @@ void clear_array_vector(vector<TTag> v[], int l);
*/
int nguiones_fs(wstring const &cadena);
+/** Reads the expanded dictionary received as a parameter and stores the
+ * resulting ambiguity classes, which the tagger will manage, in the tagger data.
+ * @param fdic the input stream with the expanded dictionary to read
+ * @param td the tagger data instance to mutate
+ */
+void read_dictionary(FILE *fdic, TaggerData &td);
+
+/** This method returns a known ambiguity class that is a subset of
+* the one received as a parameter. This is useful when a new
+* ambiguity class is found because of changes in the morphological
+* dictionary used by the MT system.
+* @param c set of tags (ambiguity class)
+* @return a known ambiguity class
+*/
+set<TTag> find_similar_ambiguity_class(TaggerData &td, set<TTag> &c);
+
+/** Dies with an error message if the tags aren't in the tagger data */
+void require_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word);
+
+/** As with find_similar_ambiguity_class, but returns tags if it's already fine
+ * & prints a warning if debug */
+set<TTag> require_similar_ambiguity_class(TaggerData &td, set<TTag> &tags, TaggerWord &word, bool debug);
+
wstring trim(wstring s);
};
diff --git a/apertium/tagger_word.cc b/apertium/tagger_word.cc
index 9b363d4..882a84c 100644
--- a/apertium/tagger_word.cc
+++ b/apertium/tagger_word.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/tagger_word.h>
#include <apertium/utf_converter.h>
@@ -32,7 +30,9 @@ bool TaggerWord::show_ignored_string=true;
map<wstring, ApertiumRE, Ltstr> TaggerWord::patterns;
-TaggerWord::TaggerWord(bool prev_plus_cut){
+TaggerWord::TaggerWord(bool prev_plus_cut) :
+show_sf(false)
+{
ignored_string = L"";
plus_cut=false;
previous_plus_cut=prev_plus_cut;
diff --git a/apertium/tagger_word.h b/apertium/tagger_word.h
index 768b20a..49ac0b6 100644
--- a/apertium/tagger_word.h
+++ b/apertium/tagger_word.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __TAGGERWORD_H
#define __TAGGERWORD_H
diff --git a/apertium/tmx_align_parameters.h b/apertium/tmx_align_parameters.h
index 3366f48..411314b 100644
--- a/apertium/tmx_align_parameters.h
+++ b/apertium/tmx_align_parameters.h
@@ -41,7 +41,8 @@ AlignParameters() : justSentenceIds(true),
qualityThreshold(-100000),
postprocessTrailQualityThreshold(-1),
postprocessTrailStartAndEndQualityThreshold(-1),
- postprocessTrailByTopologyQualityThreshold(-1)
+ postprocessTrailByTopologyQualityThreshold(-1),
+ utfCharCountingMode(false)
{}
diff --git a/apertium/tmx_alignment.cc b/apertium/tmx_alignment.cc
index 4463169..6318cac 100644
--- a/apertium/tmx_alignment.cc
+++ b/apertium/tmx_alignment.cc
@@ -434,7 +434,7 @@ void setBox( AlignMatrix& m, int huPos, int enPos, int radius, int insideOfRadiu
{
if ( (x>=0) && (x<m.size()) && (y>=0) && (y<m.otherSize()) )
{
- m.cell(x,y) = insideOfRadiusValue ;
+ m.cell(x,y) = insideOfRadiusValue ; // ToDo: Should this be (y,x) instead? Function has args y,x not x,y. Fix here or function
}
}
}
diff --git a/apertium/tmx_builder.cc b/apertium/tmx_builder.cc
index b8d211c..8bde013 100644
--- a/apertium/tmx_builder.cc
+++ b/apertium/tmx_builder.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/tmx_builder.h>
#include <apertium/utf_converter.h>
@@ -43,7 +41,8 @@
using namespace Apertium;
using namespace std;
-TMXBuilder::TMXBuilder(wstring const &l1, wstring const &l2)
+TMXBuilder::TMXBuilder(wstring const &l1, wstring const &l2):
+low_limit(0)
{
lang1 = l1;
lang2 = l2;
@@ -160,8 +159,8 @@ TMXBuilder::compatible(FILE *f1, FILE *f2, bool lazy)
bool
TMXBuilder::check(string const &file1, string const &file2, bool lazy)
{
- FILE *f1 = fopen(file1.c_str(), "r");
- FILE *f2 = fopen(file2.c_str(), "r");
+ FILE *f1 = fopen(file1.c_str(), "rb");
+ FILE *f2 = fopen(file2.c_str(), "rb");
if(!f1 && !f2)
{
wcerr << L"Error: Cannot access files '" << UtfConverter::fromUtf8(file1);
@@ -173,6 +172,7 @@ TMXBuilder::check(string const &file1, string const &file2, bool lazy)
wcerr << L"Error: Cannot access file '";
wcerr << UtfConverter::fromUtf8(file2);
wcerr << "'" << endl;
+ fclose(f2);
return false;
}
else if(!f2)
@@ -180,6 +180,8 @@ TMXBuilder::check(string const &file1, string const &file2, bool lazy)
wcerr << L"Error: Cannot access file '";
wcerr << UtfConverter::fromUtf8(file2);
wcerr << "'" << endl;
+ fclose(f1);
+ return false;
}
bool retval = compatible(f1, f2, lazy);
@@ -589,7 +591,7 @@ TMXBuilder::outputTU(FILE *f1, FILE *f2, FILE *output)
partes[1] = L"";
conta = 0;
}
- if(conta != 2)
+ if(conta < 2)
{
partes[conta] += val;
}
diff --git a/apertium/tmx_builder.h b/apertium/tmx_builder.h
index 4ca1c08..6d31203 100644
--- a/apertium/tmx_builder.h
+++ b/apertium/tmx_builder.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TMXBUILDER_
#define _TMXBUILDER_
diff --git a/apertium/tmx_translate.cc b/apertium/tmx_translate.cc
index 3669a9a..327ed03 100644
--- a/apertium/tmx_translate.cc
+++ b/apertium/tmx_translate.cc
@@ -175,12 +175,12 @@ void trivialTranslate(
{
bool logging = false;
- std::ofstream* translateLogsPtr;
+ std::ofstream* translateLogsPtr = 0;
if (logging)
{
translateLogsPtr = new std::ofstream( "/dev/null", std::ios::app );
}
- std::ostream& logs = *translateLogsPtr ; // std::cout;
+ std::ostream& logs = translateLogsPtr ? *translateLogsPtr : std::cout ;
translatedSentence.id = sentence.id;
Phrase& words = translatedSentence.words;
diff --git a/apertium/transfer.cc b/apertium/transfer.cc
index 5b33920..1dc4c61 100644
--- a/apertium/transfer.cc
+++ b/apertium/transfer.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/transfer.h>
#include <apertium/trx_reader.h>
@@ -33,11 +31,6 @@ using namespace Apertium;
using namespace std;
void
-Transfer::copy(Transfer const &o)
-{
-}
-
-void
Transfer::destroy()
{
if(me)
@@ -49,10 +42,18 @@ Transfer::destroy()
{
xmlFreeDoc(doc);
doc = NULL;
- }
+ }
}
-Transfer::Transfer()
+Transfer::Transfer() :
+word(0),
+blank(0),
+lword(0),
+lblank(0),
+output(0),
+any_char(0),
+any_tag(0),
+nwords(0)
{
me = NULL;
doc = NULL;
@@ -65,6 +66,7 @@ Transfer::Transfer()
null_flush = false;
internal_null_flush = false;
trace = false;
+ trace_att = false;
emptyblank = "";
}
@@ -73,23 +75,7 @@ Transfer::~Transfer()
destroy();
}
-Transfer::Transfer(Transfer const &o)
-{
- copy(o);
-}
-
-Transfer &
-Transfer::operator =(Transfer const &o)
-{
- if(this != &o)
- {
- destroy();
- copy(o);
- }
- return *this;
-}
-
-void
+void
Transfer::readData(FILE *in)
{
alphabet.read(in);
@@ -98,18 +84,18 @@ Transfer::readData(FILE *in)
Transducer t;
t.read(in, alphabet.size());
-
- map<int, int> finals;
-
+
+ map<int, int> finals;
+
// finals
for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++)
{
int key = Compression::multibyte_read(in);
finals[key] = Compression::multibyte_read(in);
- }
-
+ }
+
me = new MatchExe(t, finals);
-
+
// attr_items
bool recompile_attrs = Compression::string_read(in) != string(pcre_version());
for(int i = 0, limit = Compression::multibyte_read(in); i != limit; i++)
@@ -146,7 +132,7 @@ Transfer::readData(FILE *in)
wstring const cad_v = Compression::wstring_read(in);
lists[cad_k].insert(UtfConverter::toUtf8(cad_v));
listslow[cad_k].insert(UtfConverter::toUtf8(StringUtils::tolower(cad_v)));
- }
+ }
}
}
@@ -184,7 +170,7 @@ Transfer::read(string const &transferfile, string const &datafile,
string const &fstfile)
{
readTransfer(transferfile);
-
+
// datafile
FILE *in = fopen(datafile.c_str(), "rb");
if(!in)
@@ -194,7 +180,7 @@ Transfer::read(string const &transferfile, string const &datafile,
}
readData(in);
fclose(in);
-
+
if(fstfile != "")
{
readBil(fstfile);
@@ -205,15 +191,15 @@ void
Transfer::readTransfer(string const &in)
{
doc = xmlReadFile(in.c_str(), NULL, 0);
-
+
if(doc == NULL)
{
cerr << "Error: Could not parse file '" << in << "'." << endl;
exit(EXIT_FAILURE);
}
-
+
root_element = xmlDocGetRootElement(doc);
-
+
// search for root element attributes
for(xmlAttr *i = root_element->properties; i != NULL; i = i->next)
{
@@ -227,9 +213,9 @@ Transfer::readTransfer(string const &in)
{
defaultAttrs = lu; // default value for 'default'
}
- }
+ }
}
-
+
// search for macros & rules
for(xmlNode *i = root_element->children; i != NULL; i = i->next)
{
@@ -243,7 +229,7 @@ Transfer::readTransfer(string const &in)
{
collectRules(i);
}
- }
+ }
}
}
@@ -268,11 +254,11 @@ Transfer::collectRules(xmlNode *localroot)
void
Transfer::collectMacros(xmlNode *localroot)
-{
+{
for(xmlNode *i = localroot->children; i != NULL; i = i->next)
{
if(i->type == XML_ELEMENT_NODE)
- {
+ {
macro_map.push_back(i);
}
}
@@ -290,7 +276,7 @@ Transfer::checkIndex(xmlNode *element, int index, int limit)
}
-string
+string
Transfer::evalString(xmlNode *element)
{
map<xmlNode *, TransferInstr>::iterator it;
@@ -313,42 +299,42 @@ Transfer::evalString(xmlNode *element)
return word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition());
}
break;
-
+
case ti_linkto_sl:
if(checkIndex(element, ti.getPos(), lword))
{
if(word[ti.getPos()]->source(attr_items[ti.getContent()], ti.getCondition()) != "")
{
return "<" + string((char *) ti.getPointer()) + ">";
- }
+ }
else
{
return "";
}
}
break;
-
+
case ti_linkto_tl:
if(checkIndex(element, ti.getPos(), lword))
{
if(word[ti.getPos()]->target(attr_items[ti.getContent()], ti.getCondition()) != "")
{
return "<" + string((char *) ti.getPointer()) + ">";
- }
+ }
else
{
return "";
}
}
break;
-
+
case ti_var:
return variables[ti.getContent()];
case ti_lit_tag:
case ti_lit:
return ti.getContent();
-
+
case ti_b:
if(checkIndex(element, ti.getPos(), lblank))
{
@@ -359,7 +345,7 @@ Transfer::evalString(xmlNode *element)
return " ";
}
break;
-
+
case ti_get_case_from:
if(checkIndex(element, ti.getPos(), lword))
{
@@ -367,21 +353,21 @@ Transfer::evalString(xmlNode *element)
evalString((xmlNode *) ti.getPointer()));
}
break;
-
+
case ti_case_of_sl:
if(checkIndex(element, ti.getPos(), lword))
{
return caseOf(word[ti.getPos()]->source(attr_items[ti.getContent()]));
}
break;
-
+
case ti_case_of_tl:
if(checkIndex(element, ti.getPos(), lword))
{
return caseOf(word[ti.getPos()]->target(attr_items[ti.getContent()]));
}
break;
-
+
default:
return "";
}
@@ -430,8 +416,8 @@ Transfer::evalString(xmlNode *element)
else
{
evalStringCache[element] = TransferInstr(ti_linkto_tl, (const char *) part, pos, (void *) as, queue);
- }
- }
+ }
+ }
else if(!xmlStrcmp(side, (const xmlChar *) "sl"))
{
evalStringCache[element] = TransferInstr(ti_clip_sl, (const char *) part, pos, NULL, queue);
@@ -443,8 +429,8 @@ Transfer::evalString(xmlNode *element)
}
else if(!xmlStrcmp(element->name, (const xmlChar *) "lit-tag"))
{
- evalStringCache[element] = TransferInstr(ti_lit_tag,
- tags((const char *) element->properties->children->content), 0);
+ evalStringCache[element] = TransferInstr(ti_lit_tag,
+ tags((const char *) element->properties->children->content), 0);
}
else if(!xmlStrcmp(element->name, (const xmlChar *) "lit"))
{
@@ -501,7 +487,7 @@ Transfer::evalString(xmlNode *element)
pos = atoi((const char *) i->children->content) - 1;
}
}
-
+
if(!xmlStrcmp(side, (const xmlChar *) "sl"))
{
evalStringCache[element] = TransferInstr(ti_case_of_sl, (const char *) part, pos);
@@ -509,10 +495,10 @@ Transfer::evalString(xmlNode *element)
else
{
evalStringCache[element] = TransferInstr(ti_case_of_tl, (const char *) part, pos);
- }
+ }
}
else if(!xmlStrcmp(element->name, (const xmlChar *) "concat"))
- {
+ {
string value;
for(xmlNode *i = element->children; i != NULL; i = i->next)
{
@@ -533,7 +519,7 @@ Transfer::evalString(xmlNode *element)
myword.append(evalString(i));
}
}
-
+
if(myword != "")
{
return "^"+myword+"$";
@@ -546,15 +532,15 @@ Transfer::evalString(xmlNode *element)
else if(!xmlStrcmp(element->name, (const xmlChar *) "mlu"))
{
string value;
-
+
bool first_time = true;
-
+
for(xmlNode *i = element->children; i != NULL; i = i->next)
{
if(i->type == XML_ELEMENT_NODE)
{
string myword;
-
+
for(xmlNode *j = i->children; j != NULL; j = j->next)
{
if(j->type == XML_ELEMENT_NODE)
@@ -562,7 +548,7 @@ Transfer::evalString(xmlNode *element)
myword.append(evalString(j));
}
}
-
+
if(!first_time)
{
if(myword != "" && myword[0] != '#') //'+#' problem
@@ -577,7 +563,7 @@ Transfer::evalString(xmlNode *element)
first_time = false;
}
}
-
+
value.append(myword);
}
}
@@ -646,7 +632,7 @@ Transfer::processOut(xmlNode *localroot)
myword.append(evalString(k));
}
}
-
+
if(!first_time)
{
if(myword != "" && myword[0] != '#') //'+#' problem
@@ -668,11 +654,11 @@ Transfer::processOut(xmlNode *localroot)
}
else // 'b'
{
- fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(),
+ fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(),
output);
}
}
- else
+ else
{
if(!xmlStrcmp(i->name, (const xmlChar *) "chunk"))
{
@@ -681,7 +667,7 @@ Transfer::processOut(xmlNode *localroot)
else // 'b'
{
fputws_unlocked(UtfConverter::fromUtf8(evalString(i)).c_str(), output);
- }
+ }
}
}
}
@@ -693,8 +679,8 @@ Transfer::processChunk(xmlNode *localroot)
string name, namefrom;
string caseofchunk = "aa";
string result;
-
-
+
+
for(xmlAttr *i = localroot->properties; i != NULL; i = i->next)
{
if(!xmlStrcmp(i->name, (const xmlChar *) "name"))
@@ -744,7 +730,7 @@ Transfer::processChunk(xmlNode *localroot)
exit(EXIT_FAILURE);
}
}
-
+
for(xmlNode *i = localroot->children; i != NULL; i = i->next)
{
if(i->type == XML_ELEMENT_NODE)
@@ -765,7 +751,7 @@ Transfer::processChunk(xmlNode *localroot)
}
}
if(myword != "")
- {
+ {
result.append("^");
result.append(myword);
result.append("$");
@@ -787,12 +773,12 @@ Transfer::processChunk(xmlNode *localroot)
mylocalword.append(evalString(k));
}
}
-
+
if(!first_time)
{
if(mylocalword != "" && mylocalword[0] != '#') // '+#' problem
{
- myword += '+';
+ myword += '+';
}
}
else
@@ -842,12 +828,13 @@ Transfer::processTags(xmlNode *localroot)
return result;
}
-void
+int
Transfer::processInstruction(xmlNode *localroot)
{
+ int words_to_consume = -1;
if(!xmlStrcmp(localroot->name, (const xmlChar *) "choose"))
{
- processChoose(localroot);
+ words_to_consume = processChoose(localroot);
}
else if(!xmlStrcmp(localroot->name, (const xmlChar *) "let"))
{
@@ -869,6 +856,33 @@ Transfer::processInstruction(xmlNode *localroot)
{
processModifyCase(localroot);
}
+ else if(!xmlStrcmp(localroot->name, (const xmlChar *) "reject-current-rule"))
+ {
+ words_to_consume = processRejectCurrentRule(localroot);
+ }
+ return words_to_consume;
+}
+
+// Handles a <reject-current-rule> instruction.
+// Returns the value the caller uses as the number of words to consume once the
+// rule is abandoned: 0 when the "shifting" attribute is "no", 1 otherwise
+// (attribute absent, empty, or holding any other value).
+int
+Transfer::processRejectCurrentRule(xmlNode *localroot)
+{
+ string value;
+ for(xmlAttr *i = localroot->properties; i != NULL; i = i->next)
+ {
+ if(!xmlStrcmp(i->name, (const xmlChar *) "shifting"))
+ {
+ // An attribute with an empty value may carry no child text node, and a
+ // null char pointer must never be assigned to a std::string, so guard
+ // both dereferences and keep the default when nothing is there.
+ if(i->children != NULL && i->children->content != NULL)
+ {
+ value = (const char *) i->children->content;
+ }
+ break;
+ }
+ }
+
+ return (value == "no") ? 0 : 1;
+}
void
@@ -901,15 +915,15 @@ Transfer::processLet(xmlNode *localroot)
case ti_var:
variables[ti.getContent()] = evalString(rightSide);
return;
-
+
case ti_clip_sl:
word[ti.getPos()]->setSource(attr_items[ti.getContent()], evalString(rightSide), ti.getCondition());
return;
-
+
case ti_clip_tl:
word[ti.getPos()]->setTarget(attr_items[ti.getContent()], evalString(rightSide), ti.getCondition());
- return;
-
+ return;
+
default:
return;
}
@@ -950,9 +964,9 @@ Transfer::processLet(xmlNode *localroot)
else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to"))
{
as = i->children->content;
- }
+ }
}
-
+
if(!xmlStrcmp(side, (const xmlChar *) "tl"))
{
word[pos]->setTarget(attr_items[(const char *) part], evalString(rightSide), queue);
@@ -962,7 +976,7 @@ Transfer::processLet(xmlNode *localroot)
{
word[pos]->setSource(attr_items[(const char *) part], evalString(rightSide), queue);
evalStringCache[leftSide] = TransferInstr(ti_clip_sl, (const char *) part, pos, NULL, queue);
- }
+ }
}
}
@@ -974,7 +988,7 @@ Transfer::processAppend(xmlNode *localroot)
{
if(!xmlStrcmp(i->name, (const xmlChar *) "n"))
{
- name = (char *) i->children->content;
+ name = (char *) i->children->content;
break;
}
}
@@ -1035,21 +1049,22 @@ Transfer::processModifyCase(xmlNode *localroot)
{
queue = false;
}
- }
+ }
else if(!xmlStrcmp(i->name, (const xmlChar *) "link-to"))
{
as = i->children->content;
+ (void)as; // ToDo, remove "as" and the whole else?
}
}
if(!xmlStrcmp(side, (const xmlChar *) "sl"))
{
- string const result = copycase(evalString(rightSide),
+ string const result = copycase(evalString(rightSide),
word[pos]->source(attr_items[(const char *) part], queue));
word[pos]->setSource(attr_items[(const char *) part], result);
}
else
{
- string const result = copycase(evalString(rightSide),
+ string const result = copycase(evalString(rightSide),
word[pos]->target(attr_items[(const char *) part], queue));
word[pos]->setTarget(attr_items[(const char *) part], result);
}
@@ -1077,11 +1092,13 @@ Transfer::processCallMacro(xmlNode *localroot)
break;
}
}
+
+ // ToDo: Is it at all valid if npar <= 0 ?
TransferWord **myword = NULL;
if(npar > 0)
{
- myword = new TransferWord *[npar];
+ myword = new TransferWord *[npar];
}
string **myblank = NULL;
if(npar > 0)
@@ -1092,7 +1109,7 @@ Transfer::processCallMacro(xmlNode *localroot)
int idx = 0;
int lastpos = 0;
- for(xmlNode *i = localroot->children; i != NULL; i = i->next)
+ for(xmlNode *i = localroot->children; npar && i != NULL; i = i->next)
{
if(i->type == XML_ELEMENT_NODE)
{
@@ -1110,7 +1127,7 @@ Transfer::processCallMacro(xmlNode *localroot)
swap(myword, word);
swap(myblank, blank);
swap(npar, lword);
-
+
for(xmlNode *i = macro->children; i != NULL; i = i->next)
{
if(i->type == XML_ELEMENT_NODE)
@@ -1123,19 +1140,14 @@ Transfer::processCallMacro(xmlNode *localroot)
swap(myblank, blank);
swap(npar, lword);
- if(myword)
- {
- delete[] myword;
- }
- if(myblank)
- {
- delete[] myblank;
- }
+ delete[] myword;
+ delete[] myblank;
}
-void
+int
Transfer::processChoose(xmlNode *localroot)
{
+ int words_to_consume = -1;
for(xmlNode *i = localroot->children; i != NULL; i = i->next)
{
if(i->type == XML_ELEMENT_NODE)
@@ -1143,7 +1155,7 @@ Transfer::processChoose(xmlNode *localroot)
if(!xmlStrcmp(i->name, (const xmlChar *) "when"))
{
bool picked_option = false;
-
+
for(xmlNode *j = i->children; j != NULL; j = j->next)
{
if(j->type == XML_ELEMENT_NODE)
@@ -1161,14 +1173,18 @@ Transfer::processChoose(xmlNode *localroot)
}
else
{
- processInstruction(j);
+ words_to_consume = processInstruction(j);
+ if(words_to_consume != -1)
+ {
+ return words_to_consume;
+ }
}
}
}
if(picked_option)
{
- return;
- }
+ return words_to_consume;
+ }
}
else if(!xmlStrcmp(i->name, (const xmlChar *) "otherwise"))
{
@@ -1176,12 +1192,17 @@ Transfer::processChoose(xmlNode *localroot)
{
if(j->type == XML_ELEMENT_NODE)
{
- processInstruction(j);
- }
- }
+ words_to_consume = processInstruction(j);
+ if(words_to_consume != -1)
+ {
+ return words_to_consume;
+ }
+ }
+ }
}
}
}
+ return words_to_consume;
}
bool
@@ -1222,7 +1243,7 @@ Transfer::processLogical(xmlNode *localroot)
else if(!xmlStrcmp(localroot->name, (const xmlChar *) "not"))
{
return processNot(localroot);
- }
+ }
else if(!xmlStrcmp(localroot->name, (const xmlChar *) "in"))
{
return processIn(localroot);
@@ -1240,15 +1261,15 @@ Transfer::processIn(xmlNode *localroot)
for(xmlNode *i = localroot->children; i != NULL; i = i->next)
{
if(i->type == XML_ELEMENT_NODE)
- {
+ {
if(value == NULL)
{
- value = i;
+ value = i;
}
else
{
- idlist = i->properties->children->content;
- break;
+ idlist = i->properties->children->content;
+ break;
}
}
}
@@ -1257,7 +1278,7 @@ Transfer::processIn(xmlNode *localroot)
if(localroot->properties != NULL)
{
- if(!xmlStrcmp(localroot->properties->children->content,
+ if(!xmlStrcmp(localroot->properties->children->content,
(const xmlChar *) "yes"))
{
set<string, Ltstr> &myset = listslow[(const char *) idlist];
@@ -1292,7 +1313,7 @@ Transfer::processTest(xmlNode *localroot)
{
return processLogical(i);
}
- }
+ }
return false;
}
@@ -1372,7 +1393,7 @@ Transfer::processEqual(xmlNode *localroot)
return tolower(evalString(first)) == tolower(evalString(second));
}
else
- {
+ {
return evalString(first) == evalString(second);
}
}
@@ -1382,7 +1403,7 @@ bool
Transfer::beginsWith(string const &s1, string const &s2) const
{
int const limit = s2.size(), constraint = s1.size();
-
+
if(constraint < limit)
{
return false;
@@ -1402,7 +1423,7 @@ bool
Transfer::endsWith(string const &s1, string const &s2) const
{
int const limit = s2.size(), constraint = s1.size();
-
+
if(constraint < limit)
{
return false;
@@ -1522,7 +1543,7 @@ Transfer::processBeginsWithList(xmlNode *localroot)
string needle = evalString(first);
set<string, Ltstr>::iterator it, limit;
- if(localroot->properties == NULL ||
+ if(localroot->properties == NULL ||
xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes"))
{
it = lists[(const char *) idlist].begin();
@@ -1534,7 +1555,7 @@ Transfer::processBeginsWithList(xmlNode *localroot)
it = listslow[(const char *) idlist].begin();
limit = listslow[(const char *) idlist].end();
}
-
+
for(; it != limit; it++)
{
if(beginsWith(needle, *it))
@@ -1571,7 +1592,7 @@ Transfer::processEndsWithList(xmlNode *localroot)
string needle = evalString(first);
set<string, Ltstr>::iterator it, limit;
- if(localroot->properties == NULL ||
+ if(localroot->properties == NULL ||
xmlStrcmp(localroot->properties->children->content, (const xmlChar *) "yes"))
{
it = lists[(const char *) idlist].begin();
@@ -1583,7 +1604,7 @@ Transfer::processEndsWithList(xmlNode *localroot)
it = listslow[(const char *) idlist].begin();
limit = listslow[(const char *) idlist].end();
}
-
+
for(; it != limit; it++)
{
if(endsWith(needle, *it))
@@ -1654,16 +1675,16 @@ Transfer::copycase(string const &source_word, string const &target_word)
{
result = StringUtils::toupper(t_word);
}
-
+
if(firstupper)
{
result[0] = towupper(result[0]);
}
-
+
return UtfConverter::toUtf8(result);
}
-string
+string
Transfer::caseOf(string const &str)
{
wstring const s = UtfConverter::fromUtf8(str);
@@ -1722,23 +1743,32 @@ Transfer::tags(string const &str) const
result += str[i];
}
}
-
+
result += '>';
return result;
}
+// Executes the element children of localroot in document order.
+// Returns -1 if no instruction asked to change how many input words are
+// consumed; otherwise returns the first value other than -1 reported by
+// processInstruction, and the remaining instructions are not executed.
-void
+int
Transfer::processRule(xmlNode *localroot)
{
+ int instruction_return, words_to_consume = -1;
// localroot is suposed to be an 'action' tag
for(xmlNode *i = localroot->children; i != NULL; i = i->next)
{
if(i->type == XML_ELEMENT_NODE)
{
- processInstruction(i);
+ instruction_return = processInstruction(i);
+ // When an instruction which modifies the number of words to be consumed
+ // from the input is found, execution of the rule is stopped
+ if(instruction_return != -1)
+ {
+ words_to_consume = instruction_return;
+ break;
+ }
}
}
+ return words_to_consume;
}
TransferToken &
@@ -1758,7 +1788,7 @@ Transfer::readToken(FILE *in)
return input_buffer.add(TransferToken(content, tt_eof));
}
if(val == '\\')
- {
+ {
content += L'\\';
content += (wchar_t) fgetwc_unlocked(in);
}
@@ -1822,11 +1852,17 @@ Transfer::setTrace(bool trace)
}
void
+Transfer::setTraceATT(bool trace)
+{
+ // trace_att gates the extra matching-state diagnostics that transfer()
+ // writes to cerr/wcerr on every iteration of its main loop.
+ this->trace_att = trace;
+}
+
+void
Transfer::transfer_wrapper_null_flush(FILE *in, FILE *out)
{
null_flush = false;
internal_null_flush = true;
-
+
while(!feof(in))
{
transfer(in, out);
@@ -1840,7 +1876,7 @@ Transfer::transfer_wrapper_null_flush(FILE *in, FILE *out)
internal_null_flush = false;
null_flush = true;
-}
+}
void
Transfer::transfer(FILE *in, FILE *out)
@@ -1849,28 +1885,107 @@ Transfer::transfer(FILE *in, FILE *out)
{
transfer_wrapper_null_flush(in, out);
}
-
+
int last = 0;
+ int prev_last = 0;
+ int lastrule_id = -1;
+ set<int> banned_rules;
output = out;
ms.init(me->getInitial());
-
+
while(true)
{
+ if(trace_att)
+ {
+ cerr << "Loop start " << endl;
+ cerr << "ms.size: " << ms.size() << endl;
+
+ cerr << "tmpword.size(): " << tmpword.size() << endl;
+ for (unsigned int ind = 0; ind < tmpword.size(); ind++)
+ {
+ if(ind != 0)
+ {
+ wcerr << L" ";
+ }
+ wcerr << *tmpword[ind];
+ }
+ wcerr << endl;
+
+ cerr << "tmpblank.size(): " << tmpblank.size() << endl;
+ for (unsigned int ind = 0; ind < tmpblank.size(); ind++)
+ {
+ wcerr << L"'";
+ wcerr << *tmpblank[ind];
+ wcerr << L"' ";
+ }
+ wcerr << endl;
+
+ cerr << "last: " << last << endl;
+ cerr << "prev_last: " << prev_last << endl << endl;
+ }
+
if(ms.size() == 0)
{
if(lastrule != NULL)
{
- applyRule();
- input_buffer.setPos(last);
+ int num_words_to_consume = applyRule();
+
+ if(trace_att)
+ {
+ cerr << "num_words_to_consume: " << num_words_to_consume << endl;
+ }
+
+ //Consume all the words from the input which matched the rule.
+ //This piece of code is executed unless the rule contains a "reject-current-rule" instruction
+ if(num_words_to_consume < 0)
+ {
+ banned_rules.clear();
+ input_buffer.setPos(last);
+ }
+ else if(num_words_to_consume > 0)
+ {
+ banned_rules.clear();
+ if(prev_last >= input_buffer.getSize())
+ {
+ input_buffer.setPos(0);
+ }
+ else
+ {
+ input_buffer.setPos(prev_last+1);
+ }
+ int num_consumed_words = 0;
+ while(num_consumed_words < num_words_to_consume)
+ {
+ TransferToken& local_tt = input_buffer.next();
+ if (local_tt.getType() == tt_word)
+ {
+ num_consumed_words++;
+ }
+ }
+ }
+ else
+ {
+ //Add rule to banned rules
+ banned_rules.insert(lastrule_id);
+ input_buffer.setPos(prev_last);
+ input_buffer.next();
+ last = input_buffer.getPos();
+ }
+ lastrule_id = -1;
}
else
{
- if(tmpword.size() != 0)
- {
- pair<wstring, int> tr;
- if(useBilingual && preBilingual == false)
- {
+ if(tmpword.size() != 0)
+ {
+ if(trace_att)
+ {
+ cerr << "printing tmpword[0]" <<endl;
+ }
+
+ pair<wstring, int> tr;
+ if(useBilingual && preBilingual == false)
+ {
if(isExtended && (*tmpword[0])[0] == L'*')
{
tr = extended.biltransWithQueue((*tmpword[0]).substr(1), false);
@@ -1893,7 +2008,7 @@ Transfer::transfer(FILE *in, FILE *out)
wstring sl;
wstring tl;
int seenSlash = 0;
- for(wstring::const_iterator it = tmpword[0]->begin(); it != tmpword[0]->end(); it++)
+ for(wstring::const_iterator it = tmpword[0]->begin(); it != tmpword[0]->end(); it++)
{
if(*it == L'\\')
{
@@ -1911,7 +2026,7 @@ Transfer::transfer(FILE *in, FILE *out)
}
continue;
}
- else if(*it == L'/')
+ else if(*it == L'/')
{
seenSlash++;
continue;
@@ -1929,15 +2044,15 @@ Transfer::transfer(FILE *in, FILE *out)
break;
}
}
- //tmpword[0]->assign(sl);
+ //tmpword[0]->assign(sl);
tr = pair<wstring, int>(tl, false);
- //wcerr << L"pb: " << *tmpword[0] << L" :: " << sl << L" >> " << tl << endl ;
+ //wcerr << L"pb: " << *tmpword[0] << L" :: " << sl << L" >> " << tl << endl ;
}
else
{
tr = pair<wstring, int>(*tmpword[0], 0);
}
-
+
if(tr.first.size() != 0)
{
if(defaultAttrs == lu)
@@ -1953,32 +2068,40 @@ Transfer::transfer(FILE *in, FILE *out)
fputws_unlocked(L"^unknown<unknown>{^", output);
}
else
- {
+ {
fputws_unlocked(L"^default<default>{^", output);
- }
+ }
fputws_unlocked(tr.first.c_str(), output);
fputws_unlocked(L"$}$", output);
}
}
+ banned_rules.clear();
tmpword.clear();
input_buffer.setPos(last);
- input_buffer.next();
+ input_buffer.next();
+ prev_last = last;
last = input_buffer.getPos();
ms.init(me->getInitial());
}
else if(tmpblank.size() != 0)
{
- fputws_unlocked(tmpblank[0]->c_str(), output);
- tmpblank.clear();
- last = input_buffer.getPos();
- ms.init(me->getInitial());
+ if(trace_att)
+ {
+ cerr << "printing tmpblank[0]" <<endl;
+ }
+ fputws_unlocked(tmpblank[0]->c_str(), output);
+ tmpblank.clear();
+ prev_last = last;
+ last = input_buffer.getPos();
+ ms.init(me->getInitial());
}
}
}
- int val = ms.classifyFinals(me->getFinals());
+ int val = ms.classifyFinals(me->getFinals(), banned_rules);
if(val != -1)
{
- lastrule = rule_map[val-1];
+ lastrule = rule_map[val-1];
+ lastrule_id = val;
last = input_buffer.getPos();
if(trace)
@@ -1997,7 +2120,7 @@ Transfer::transfer(FILE *in, FILE *out)
}
TransferToken ¤t = readToken(in);
-
+
switch(current.getType())
{
case tt_word:
@@ -2030,12 +2153,13 @@ Transfer::transfer(FILE *in, FILE *out)
}
}
-void
+int
Transfer::applyRule()
{
+ int words_to_consume;
unsigned int limit = tmpword.size();
//wcerr << L"applyRule: " << tmpword.size() << endl;
-
+
for(unsigned int i = 0; i != limit; i++)
{
if(i == 0)
@@ -2057,7 +2181,7 @@ Transfer::applyRule()
{
blank[i-1] = new string(UtfConverter::toUtf8(*tmpblank[i-1]));
}
-
+
pair<wstring, int> tr;
if(useBilingual && preBilingual == false)
{
@@ -2069,7 +2193,7 @@ Transfer::applyRule()
wstring sl;
wstring tl;
int seenSlash = 0;
- for(wstring::const_iterator it = tmpword[i]->begin(); it != tmpword[i]->end(); it++)
+ for(wstring::const_iterator it = tmpword[i]->begin(); it != tmpword[i]->end(); it++)
{
if(*it == L'\\')
{
@@ -2088,7 +2212,7 @@ Transfer::applyRule()
continue;
}
- if(*it == L'/')
+ if(*it == L'/')
{
seenSlash++;
continue;
@@ -2105,8 +2229,8 @@ Transfer::applyRule()
{
break;
}
- }
- //tmpword[i]->assign(sl);
+ }
+ //tmpword[i]->assign(sl);
tr = pair<wstring, int>(tl, false);
}
else
@@ -2114,11 +2238,11 @@ Transfer::applyRule()
tr = pair<wstring, int>(*tmpword[i], false);
}
- word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]),
+ word[i] = new TransferWord(UtfConverter::toUtf8(*tmpword[i]),
UtfConverter::toUtf8(tr.first), tr.second);
}
- processRule(lastrule);
+ words_to_consume = processRule(lastrule);
lastrule = NULL;
if(word)
@@ -2142,6 +2266,7 @@ Transfer::applyRule()
tmpword.clear();
tmpblank.clear();
ms.init(me->getInitial());
+ return words_to_consume;
}
/* HERE */
@@ -2181,7 +2306,7 @@ Transfer::applyWord(wstring const &word_str)
}
}
break;
-
+
default:
ms.step(towlower(word_str[i]), any_char);
break;
@@ -2190,8 +2315,8 @@ Transfer::applyWord(wstring const &word_str)
ms.step(L'$');
}
-void
-Transfer::setPreBilingual(bool value)
+void
+Transfer::setPreBilingual(bool value)
{
preBilingual = value;
}
@@ -2202,8 +2327,8 @@ Transfer::getPreBilingual(void) const
return preBilingual;
}
-void
-Transfer::setUseBilingual(bool value)
+void
+Transfer::setUseBilingual(bool value)
{
useBilingual = value;
}
diff --git a/apertium/transfer.dtd b/apertium/transfer.dtd
index 2295199..b41acc2 100644
--- a/apertium/transfer.dtd
+++ b/apertium/transfer.dtd
@@ -13,9 +13,7 @@
General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA.
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
Draft of DTD for the structural transfer rule files
@@ -25,7 +23,7 @@
<!ENTITY % condition "(and|or|not|equal|begins-with|begins-with-list|ends-with|ends-with-list|contains-substring|in)">
<!ENTITY % container "(var|clip)">
-<!ENTITY % sentence "(let|out|choose|modify-case|call-macro|append)">
+<!ENTITY % sentence "(let|out|choose|modify-case|call-macro|append|reject-current-rule)">
<!ENTITY % value "(b|clip|lit|lit-tag|var|get-case-from|case-of|concat|lu|mlu|chunk)">
<!ENTITY % stringvalue "(clip|lit|var|get-case-from|case-of)">
@@ -451,6 +449,16 @@ get-case-from -->
<!ELEMENT lu (%value;)+>
<!-- Encloses a word inside an 'out' element. -->
+<!ELEMENT reject-current-rule EMPTY>
+<!ATTLIST reject-current-rule shifting (yes|no) #IMPLIED>
+<!--
+ This instruction cancels the execution of the rule being processed.
+ If "shifting" is set to "yes" or is not specified, the matching process
+ consumes exactly one word at the input. If "shifting" is set to "no",
+ the rule is marked so that it is not considered in the current matching
+ until the input buffer advances by at least one word.
+-->
+
<!ELEMENT chunk (tags,(mlu|lu|b|var)+)>
<!ATTLIST chunk name CDATA #IMPLIED
namefrom CDATA #IMPLIED
diff --git a/apertium/transfer.h b/apertium/transfer.h
index 61927c0..4a42eae 100644
--- a/apertium/transfer.h
+++ b/apertium/transfer.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TRANSFER_
#define _TRANSFER_
@@ -42,7 +40,7 @@ using namespace std;
class Transfer
{
private:
-
+
Alphabet alphabet;
MatchExe *me;
MatchState ms;
@@ -71,20 +69,20 @@ private:
xmlNode *lastrule;
unsigned int nwords;
-
+
map<xmlNode *, TransferInstr> evalStringCache;
enum OutputType{lu,chunk};
-
+
OutputType defaultAttrs;
bool preBilingual;
bool useBilingual;
bool null_flush;
bool internal_null_flush;
bool trace;
+ bool trace_att;
string emptyblank;
- void copy(Transfer const &o);
void destroy();
void readData(FILE *input);
void readBil(string const &filename);
@@ -96,6 +94,7 @@ private:
void processLet(xmlNode *localroot);
void processAppend(xmlNode *localroot);
+ int processRejectCurrentRule(xmlNode *localroot);
void processOut(xmlNode *localroot);
void processCallMacro(xmlNode *localroot);
void processModifyCase(xmlNode *localroot);
@@ -111,10 +110,10 @@ private:
bool processContainsSubstring(xmlNode *localroot);
bool processNot(xmlNode *localroot);
bool processIn(xmlNode *localroot);
- void processRule(xmlNode *localroot);
+ int processRule(xmlNode *localroot);
string evalString(xmlNode *localroot);
- void processInstruction(xmlNode *localroot);
- void processChoose(xmlNode *localroot);
+ int processInstruction(xmlNode *localroot);
+ int processChoose(xmlNode *localroot);
string processChunk(xmlNode *localroot);
string processTags(xmlNode *localroot);
@@ -126,15 +125,13 @@ private:
wstring readBlank(FILE *in);
wstring readUntil(FILE *in, int const symbol) const;
void applyWord(wstring const &word_str);
- void applyRule();
+ int applyRule();
TransferToken & readToken(FILE *in);
bool checkIndex(xmlNode *element, int index, int limit);
void transfer_wrapper_null_flush(FILE *in, FILE *out);
public:
Transfer();
~Transfer();
- Transfer(Transfer const &o);
- Transfer & operator =(Transfer const &o);
void read(string const &transferfile, string const &datafile,
string const &fstfile = "");
@@ -148,6 +145,7 @@ public:
bool getNullFlush(void);
void setNullFlush(bool null_flush);
void setTrace(bool trace);
+ void setTraceATT(bool trace);
};
#endif
diff --git a/apertium/transfer.rnc b/apertium/transfer.rnc
new file mode 100644
index 0000000..cb6755b
--- /dev/null
+++ b/apertium/transfer.rnc
@@ -0,0 +1,407 @@
+# Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+# Draft of DTD for the structural transfer rule files
+#
+# Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada,
+# 2005.07.29.
+
+condition =
+ and
+ | or
+ | not
+ | equal
+ | begins-with
+ | begins-with-list
+ | ends-with
+ | ends-with-list
+ | contains-substring
+ | in
+container = var | clip
+sentence =
+ let
+ | out
+ | choose
+ | modify-case
+ | call-macro
+ | append
+ | reject-current-rule
+value =
+ b
+ | clip
+ | lit
+ | lit-tag
+ | var
+ | get-case-from
+ | case-of
+ | concat
+ | lu
+ | mlu
+ | chunk
+stringvalue = clip | lit | var | get-case-from | case-of
+transfer =
+ element transfer {
+ attlist.transfer,
+ section-def-cats,
+ section-def-attrs?,
+ section-def-vars?,
+ section-def-lists?,
+ section-def-macros?,
+ section-rules
+ }
+attlist.transfer &= attribute default { "lu" | "chunk" }?
+# 'transfer' is the root element containing the whole structural
+# transfer rule file. Attribute 'default' specifies if
+# unmatched words have to be written as lexical units ("lu", this is
+# the default value) or as chunks ("chunk").
+section-def-cats =
+ element section-def-cats { attlist.section-def-cats, def-cat+ }
+attlist.section-def-cats &= empty
+# The 'def-cats' section defines the categories used to build the
+# patterns used in rules
+def-cat = element def-cat { attlist.def-cat, cat-item+ }
+attlist.def-cat &=
+ attribute n { xsd:ID },
+ attribute c { text }?
+# Each 'def-cat' defines one category in terms of a list of
+# category items and has a unique name 'n', which is mandatory
+cat-item = element cat-item { attlist.cat-item, empty }
+attlist.cat-item &=
+ attribute lemma { text }?,
+ attribute tags { text },
+ attribute c { text }?
+# Each 'cat-item' (category item) represents a set of lexical forms
+# and has a mandatory attribute 'tags' whose value is a sequence of
+# dot-separated tag names; this sequence is a subsequence of the
+# tag sequence defining each possible lexical form. For example,
+# tags="n.f" would match all lexical forms containing this tag
+# sequence, such as "^casa<n><f><pl>$".
+#
+# In addition, an optional attribute, "lemma", may be used to
+# define lexical forms having a particular substring in their lemma
+section-def-attrs =
+ element section-def-attrs { attlist.section-def-attrs, def-attr+ }
+attlist.section-def-attrs &= empty
+# The 'def-attrs' section defines the attributes that will be
+# identified in matched lexical forms
+def-attr = element def-attr { attlist.def-attr, attr-item+ }
+attlist.def-attr &=
+ attribute n { xsd:ID },
+ attribute c { text }?
+# Each def-attr defines one attribute in terms of a list of
+# attribute items and has a mandatory unique name n
+attr-item = element attr-item { attlist.attr-item, empty }
+attlist.attr-item &=
+ attribute tags { text }?,
+ attribute c { text }?
+# Each 'attr-item' specifies a subsequence of the tags in
+# that lexical form (attribute 'tags')
+section-def-vars =
+ element section-def-vars { attlist.section-def-vars, def-var+ }
+attlist.section-def-vars &= empty
+# The 'def-vars' section defines the global variables
+# that will be used to transfer information between rules
+def-var = element def-var { attlist.def-var, empty }
+attlist.def-var &=
+ attribute n { xsd:ID },
+ attribute v { text }?,
+ attribute c { text }?
+# The definition of a global variable has a mandatory unique name 'n' that
+# will be used to refer to it. An initial value can also be specified
+# by means of the 'v' attribute. The default initial value is the
+# empty string.
+section-def-lists =
+ element section-def-lists { attlist.section-def-lists, def-list+ }
+attlist.section-def-lists &= empty
+# Element 'section-def-lists' encloses a set of list definitions
+def-list = element def-list { attlist.def-list, list-item+ }
+attlist.def-list &=
+ attribute n { xsd:ID },
+ attribute c { text }?
+# The 'def-list' element defines a named list to search with the 'in'
+# element. Attribute 'n' sets the name of the list
+list-item = element list-item { attlist.list-item, empty }
+attlist.list-item &=
+ attribute v { text },
+ attribute c { text }?
+# Attribute 'v' of 'list-item' element contains the value to be added to
+# the list being defined
+section-def-macros =
+ element section-def-macros { attlist.section-def-macros, def-macro+ }
+attlist.section-def-macros &= empty
+#
+# The 'def-macros' section defines macros containing portions of
+# code frequently used in the action part of rules
+#
+def-macro = element def-macro { attlist.def-macro, sentence+ }
+attlist.def-macro &= attribute n { xsd:ID }
+attlist.def-macro &=
+ attribute npar { text },
+ attribute c { text }?
+# Macro definition:
+#
+# A macro has a mandatory name (the value of 'n'), a number of parameters
+# (the value of 'npar') and a body containing arguments and statements.
+section-rules = element section-rules { attlist.section-rules, rule+ }
+attlist.section-rules &= empty
+# The rules section contains a sequence of one or more rules
+rule = element rule { attlist.rule, pattern, action }
+attlist.rule &= attribute comment { text }?
+# Each rule has a pattern and an action
+# * attribute 'comment' allows to put in comments about the purpose of
+# the rule being defined
+pattern = element pattern { attlist.pattern, pattern-item+ }
+attlist.pattern &= empty
+# The pattern is specified in terms of pattern items, each one
+# representing a lexical form in the matched pattern
+pattern-item = element pattern-item { attlist.pattern-item, empty }
+attlist.pattern-item &= attribute n { xsd:IDREF }
+# Each attribute to be activated is referred to by its name in the def-cats section
+action = element action { attlist.action, sentence* }
+attlist.action &= attribute c { text }?
+# Encloses the procedural part of a rule
+choose = element choose { attlist.choose, when+, otherwise? }
+attlist.choose &= attribute c { text }?
+# The choose statement is a selection statement (similar to a case
+# statement) composed of one or more tested cases and an optional
+# otherwise
+when = element when { attlist.when, test, sentence* }
+attlist.when &= attribute c { text }?
+# Each tested case is a block of zero or more statements
+otherwise = element otherwise { attlist.otherwise, sentence+ }
+attlist.otherwise &= attribute c { text }?
+# The otherwise case is also a block of one or more statements
+test = element test { attlist.test, condition }
+attlist.test &= attribute c { text }?
+# The test in a tested case may be a conjunction, a disjunction, or
+# a negation of simpler tests, as well as a simple equality test
+and = element and { attlist.and, condition, condition+ }
+attlist.and &= empty
+# Each conjunction test contains two or more simpler tests
+or = element or { attlist.or, condition, condition+ }
+attlist.or &= empty
+# Each disjunction test contains two or more simpler tests
+not = element not { attlist.not, condition }
+attlist.not &= empty
+# The negation of a simpler test is a test itself
+equal = element equal { attlist.equal, value, value }
+attlist.equal &= attribute caseless { "no" | "yes" }?
+# The simplest test is an equality test. The right part and the
+# left part of the equality may both be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+begins-with = element begins-with { attlist.begins-with, value, value }
+attlist.begins-with &= attribute caseless { "no" | "yes" }?
+# Tests if the left part contains the right part at the beginning.
+# Both parts of the test may both be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+ends-with = element ends-with { attlist.ends-with, value, value }
+attlist.ends-with &= attribute caseless { "no" | "yes" }?
+# Tests if the left part contains the right part at the end.
+# Both parts of the test may both be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+begins-with-list =
+ element begins-with-list { attlist.begins-with-list, value, \list }
+attlist.begins-with-list &= attribute caseless { "no" | "yes" }?
+# Tests if the left part contains the right part at the beginning.
+# The first part of the test may be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. The second part
+# must always be a list. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+ends-with-list =
+ element ends-with-list { attlist.ends-with-list, value, \list }
+attlist.ends-with-list &= attribute caseless { "no" | "yes" }?
+# Tests if the left part contains the right part at the end.
+# The first part of the test may be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. The second part
+# must always be a list. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+contains-substring =
+ element contains-substring {
+ attlist.contains-substring, value, value
+ }
+attlist.contains-substring &= attribute caseless { "no" | "yes" }?
+# Tests if the left part contains the right part.
+# Both parts of the test may both be a clip (see below), a
+# literal string ('lit'), a literal tag ('lit-tag') or the value of
+# a variable ('var') defined in the def-vars section. When the attribute
+# 'caseless' is set to 'yes', the comparison is made without attending
+# to the case.
+in = element in { attlist.in, value, \list }
+attlist.in &= attribute caseless { "no" | "yes" }?
+# 'in' performs a search of a value in a list. If 'caseless' is set to yes,
+# this search is performed without attending to the case
+\list = element list { attlist.list, empty }
+attlist.list &= attribute n { xsd:IDREF }
+# 'list' refers, by the name in attribute 'n', to a list defined earlier in
+# the 'section-def-lists' section
+let = element let { attlist.let, container, value }
+attlist.let &= empty
+# An assignment statement ('let') assigns the value of a clip (see
+# below), a literal string ('lit'), a literal tag('lit-tag') or the
+# value of a global variable ('var') to either a global variable ('var')
+# or a clip
+append = element append { attlist.append, value+ }
+attlist.append &= attribute n { xsd:IDREF }
+# This instruction appends the value of a clip (see
+# below), a literal string ('lit'), a literal tag('lit-tag') or the
+# value of a global variable ('var') to either a global variable ('var')
+# or a clip, identified by the "n" attribute
+out = element out { attlist.out, (mlu | lu | b | chunk | var)+ }
+attlist.out &= attribute c { text }?
+# 'out' is an output statement; it may output any sequence of
+# clips, literal strings, literal tags, variables, and whitespace items
+# (see below)
+modify-case =
+ element modify-case { attlist.modify-case, container, stringvalue }
+attlist.modify-case &= empty
+# The first argument of 'modify-case' copies the case of the second
+# argument.
+call-macro = element call-macro { attlist.call-macro, with-param* }
+attlist.call-macro &= attribute n { xsd:IDREF }
+# A macro may be called anywhere by name with one or more
+# arguments
+with-param = element with-param { attlist.with-param, empty }
+attlist.with-param &= attribute pos { text }
+# The attribute pos in each argument is used to refer to a lexical
+# form in the current rule. For example, if a 2-parameter macro
+# has been defined to perform noun-adjective agreement operations,
+# it may be used with arguments 1 and 2 in a noun-adjective rule,
+# with arguments 2, 3 and 1 in a determiner-noun-adjective rule, with
+# arguments 1 and 3 in a noun-adverb-adjective rule, and with
+# arguments 2 and 1 in an adjective-noun rule
+clip = element clip { attlist.clip, empty }
+attlist.clip &=
+ attribute pos { text },
+ attribute side { "sl" | "tl" },
+ attribute part { text },
+ attribute queue { text }?,
+ attribute link-to { text }?,
+ attribute c { text }?
+# A 'clip' is a substring of a source-language or target-language
+# lexical form, extracted according to an attribute:
+#
+# * 'pos' is an index (1, 2, 3...) used to select a lexical form
+# inside the rule;
+#
+# * 'side' is used to select a source-language ('sl') or a
+# target-language ('tl') clip
+#
+# * the value of 'part' is the name of an attribute defined in
+# def-attrs, but may take also the values 'lem' (referring to
+# the lemma of the lexical form), 'lemh' (lemma head), 'lemq'
+# (lemma queue) and 'whole' (referring to the whole lexical form).
+#
+# * the value of 'queue' may be 'no' or 'yes'. 'yes' is assumed by
+# default.
+#
+# * 'link-to' causes the other attributes to be ignored in clip evaluation
+# when using 'clip' as a right hand side element (as value), and
+# returns its value. When using as a left hand side (as reference),
+# the value of the 'as' attribute is ignored.
+lit = element lit { attlist.lit, empty }
+attlist.lit &= attribute v { text }
+# A literal string value: the value of the literal is the value of
+# the 'v' attribute
+lit-tag = element lit-tag { attlist.lit-tag, empty }
+attlist.lit-tag &= attribute v { text }
+# A literal string value: the value of the literal is the value of
+# the 'v' attribute
+var = element var { attlist.var, empty }
+attlist.var &= attribute n { xsd:IDREF }
+# Each 'var' is a variable identifier: the attribute n is the name
+# of the variable. When it is in an 'out', a 'test', or the right
+# part of a 'let', it represents the value of the variable; when in
+# the left part of a 'let' it represents the reference of the
+# variable.
+get-case-from =
+ element get-case-from { attlist.get-case-from, (clip | lit | var) }
+attlist.get-case-from &= attribute pos { text }
+# Atención, falta modificar todos los comentarios donde intervenga
+# get-case-from
+case-of = element case-of { attlist.case-of, empty }
+attlist.case-of &=
+ attribute pos { text },
+ attribute side { "sl" | "tl" },
+ attribute part { text }
+# A 'case-of' is a value representing the case of a "clip". This value
+# will be "aa" (all lowercase), "Aa" (first uppercase) and "AA",
+# (all uppercase).
+#
+# * 'pos' is an index (1, 2, 3...) used to select a lexical form
+# inside the rule;
+#
+# * 'side' is used to select a source-language ('sl') or a
+# target-language ('tl') clip
+#
+# * the value of 'part' is the name of an attribute defined in
+# def-attrs, but may take also the values 'lem' (referring to
+# the lemma of the lexical form), 'lemh' (lemma head), 'lemq'
+# (lemma queue) and 'whole' (referring to the whole lexical form).
+concat = element concat { attlist.concat, value+ }
+attlist.concat &= empty
+# Concatenates a sequence of values
+mlu = element mlu { attlist.mlu, lu+ }
+attlist.mlu &= empty
+# Encloses a multiword
+lu = element lu { attlist.lu, value+ }
+attlist.lu &= empty
+# Encloses a word inside an 'out' element.
+reject-current-rule =
+ element reject-current-rule { attlist.reject-current-rule, empty }
+attlist.reject-current-rule &= attribute shifting { "yes" | "no" }?
+# This instruction cancels the execution of the rule being processed.
+# If "shifting" is set to "yes" or is not specified, the matching process
+# consumes exactly one word at the input. If "shifting" is set to "no",
+# the rule is marked so that it is not considered in the current matching
+# until the input buffer advances by at least one word.
+chunk = element chunk { attlist.chunk, tags, (mlu | lu | b | var)+ }
+attlist.chunk &=
+ attribute name { text }?,
+ attribute namefrom { text }?,
+ attribute case { text }?,
+ attribute c { text }?
+# Encloses a chunk inside an 'out' element.
+# * 'name' the pseudolemma of the chunk.
+# * 'namefrom' get the name from a variable.
+# * 'case' the variable to get the uppercase/lowercase policy
+# to apply it to the chunk name
+tags = element tags { attlist.tags, tag+ }
+attlist.tags &= empty
+tag = element tag { attlist.tag, value }
+attlist.tag &= empty
+b = element b { attlist.b, empty }
+attlist.b &= attribute pos { text }?
+start = transfer
+# 'b' is a [super]blanks item, indexed by pos; for example, a 'b'
+# with pos="2" refers to the [super]blanks (including format data
+# encapsulated by the de-formatter) between lexical form 2 and
+# lexical form 3. Managing [super]blanks explicitly allows for the
+# correct placement of format when the result of structural
+# transfer has more or less lexical items than the original or has
+# been reordered in some way. If attribute "pos" is not specified, then
+# a single blank (ASCII 32) is generated.
diff --git a/apertium/transfer.rng b/apertium/transfer.rng
index b78f94d..73bd756 100644
--- a/apertium/transfer.rng
+++ b/apertium/transfer.rng
@@ -1,9 +1,24 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<!--
- Draft of DTD for the structural transfer rule files
+ Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
- Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada,
- 2005.07.29.
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+ Draft of DTD for the structural transfer rule files
+
+ Sergio Ortiz, Gema Ramírez-Sánchez, Mireia Ginestí, Mikel L. Forcada,
+ 2005.07.29.
-->
<grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
<define name="condition">
@@ -34,6 +49,7 @@
<ref name="modify-case"/>
<ref name="call-macro"/>
<ref name="append"/>
+ <ref name="reject-current-rule"/>
</choice>
</define>
<define name="value">
@@ -46,6 +62,9 @@
<ref name="get-case-from"/>
<ref name="case-of"/>
<ref name="concat"/>
+ <ref name="lu"/>
+ <ref name="mlu"/>
+ <ref name="chunk"/>
</choice>
</define>
<define name="stringvalue">
@@ -61,8 +80,12 @@
<element name="transfer">
<ref name="attlist.transfer"/>
<ref name="section-def-cats"/>
- <ref name="section-def-attrs"/>
- <ref name="section-def-vars"/>
+ <optional>
+ <ref name="section-def-attrs"/>
+ </optional>
+ <optional>
+ <ref name="section-def-vars"/>
+ </optional>
<optional>
<ref name="section-def-lists"/>
</optional>
@@ -974,6 +997,29 @@
<empty/>
</define>
<!-- Encloses a word inside an 'out' element. -->
+ <define name="reject-current-rule">
+ <element name="reject-current-rule">
+ <ref name="attlist.reject-current-rule"/>
+ <empty/>
+ </element>
+ </define>
+ <define name="attlist.reject-current-rule" combine="interleave">
+ <optional>
+ <attribute name="shifting">
+ <choice>
+ <value>yes</value>
+ <value>no</value>
+ </choice>
+ </attribute>
+ </optional>
+ </define>
+ <!--
+ This instruction cancels the execution of the rule being processed.
+ If "shifting" is set to "yes" or is not specified, the matching process
+ consumes exactly one word at the input. If "shifting" is set to "no",
+ the rule is marked so that it is not considered in the current matching
+ until the input buffer advances by at least one word.
+ -->
<define name="chunk">
<element name="chunk">
<ref name="attlist.chunk"/>
diff --git a/apertium/transfer_data.cc b/apertium/transfer_data.cc
index c1312ad..5515620 100644
--- a/apertium/transfer_data.cc
+++ b/apertium/transfer_data.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/transfer_data.h>
diff --git a/apertium/transfer_data.h b/apertium/transfer_data.h
index 91d5223..d39d009 100644
--- a/apertium/transfer_data.h
+++ b/apertium/transfer_data.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TRANSFERDATA_
#define _TRANSFERDATA_
diff --git a/apertium/transfer_instr.cc b/apertium/transfer_instr.cc
index 4569a7c..efc936f 100644
--- a/apertium/transfer_instr.cc
+++ b/apertium/transfer_instr.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/transfer_instr.h>
#include <apertium/string_utils.h>
diff --git a/apertium/transfer_instr.h b/apertium/transfer_instr.h
index 5d03a81..d80a264 100644
--- a/apertium/transfer_instr.h
+++ b/apertium/transfer_instr.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TRANSFERINSTR_
#define _TRANSFERINSTR_
@@ -51,7 +49,12 @@ private:
void copy(TransferInstr const &o);
void destroy();
public:
- TransferInstr(){};
+ TransferInstr() :
+ type(ti_clip_sl),
+ pos(0),
+ pointer(0),
+ condition(false)
+ {}
TransferInstr(TransferInstrType t, string const &c, int const p,
void *ptr=NULL, bool cond = true);
~TransferInstr();
diff --git a/apertium/transfer_mult.cc b/apertium/transfer_mult.cc
index 1ac73bb..0fd5347 100644
--- a/apertium/transfer_mult.cc
+++ b/apertium/transfer_mult.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/transfer_mult.h>
#include <apertium/trx_reader.h>
@@ -30,11 +28,6 @@
using namespace std;
void
-TransferMult::copy(TransferMult const &o)
-{
-}
-
-void
TransferMult::destroy()
{
if(me)
@@ -44,7 +37,13 @@ TransferMult::destroy()
}
}
-TransferMult::TransferMult()
+TransferMult::TransferMult() :
+word(0),
+blank(0),
+output(0),
+any_char(0),
+any_tag(0),
+nwords(0)
{
me = NULL;
isRule = false;
@@ -57,22 +56,6 @@ TransferMult::~TransferMult()
destroy();
}
-TransferMult::TransferMult(TransferMult const &o)
-{
- copy(o);
-}
-
-TransferMult &
-TransferMult::operator =(TransferMult const &o)
-{
- if(this != &o)
- {
- destroy();
- copy(o);
- }
- return *this;
-}
-
string
TransferMult::tolower(string const &str) const
{
diff --git a/apertium/transfer_mult.h b/apertium/transfer_mult.h
index b2cb239..6e83f4b 100644
--- a/apertium/transfer_mult.h
+++ b/apertium/transfer_mult.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TRANSFER_MULT_
#define _TRANSFER_MULT_
@@ -67,7 +65,6 @@ private:
OutputType defaultAttrs;
- void copy(TransferMult const &o);
void destroy();
void readData(FILE *input);
void readBil(string const &filename);
@@ -93,8 +90,6 @@ private:
public:
TransferMult();
~TransferMult();
- TransferMult(TransferMult const &o);
- TransferMult & operator =(TransferMult const &o);
void read(string const &datafile, string const &fstfile);
void transfer(FILE *in, FILE *out);
diff --git a/apertium/transfer_token.cc b/apertium/transfer_token.cc
index c1b4f91..f120cd4 100644
--- a/apertium/transfer_token.cc
+++ b/apertium/transfer_token.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/transfer_token.h>
#include <apertium/string_utils.h>
@@ -33,7 +31,8 @@ TransferToken::destroy()
{
}
-TransferToken::TransferToken()
+TransferToken::TransferToken() :
+type(tt_eof)
{
}
diff --git a/apertium/transfer_token.h b/apertium/transfer_token.h
index b06e045..67788f6 100644
--- a/apertium/transfer_token.h
+++ b/apertium/transfer_token.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TRANSFERTOKEN_
#define _TRANSFERTOKEN_
diff --git a/apertium/transfer_word.cc b/apertium/transfer_word.cc
index ebaaa25..6241dda 100644
--- a/apertium/transfer_word.cc
+++ b/apertium/transfer_word.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/transfer_word.h>
@@ -35,7 +33,8 @@ TransferWord::destroy()
{
}
-TransferWord::TransferWord()
+TransferWord::TransferWord() :
+queue_length(0)
{
}
diff --git a/apertium/transfer_word.h b/apertium/transfer_word.h
index ab47998..7dcbf90 100644
--- a/apertium/transfer_word.h
+++ b/apertium/transfer_word.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TRANSFERWORD_
diff --git a/apertium/transfer_word_list.cc b/apertium/transfer_word_list.cc
index 6c17a26..4ee7218 100644
--- a/apertium/transfer_word_list.cc
+++ b/apertium/transfer_word_list.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/transfer_word_list.h>
#include <apertium/string_utils.h>
diff --git a/apertium/transfer_word_list.h b/apertium/transfer_word_list.h
index d5b0897..46b00ce 100644
--- a/apertium/transfer_word_list.h
+++ b/apertium/transfer_word_list.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TRANSFERWORDLIST_
#define _TRANSFERWORDLIST_
diff --git a/apertium/transferpp.cc b/apertium/transferpp.cc
index dae4642..8afce0f 100644
--- a/apertium/transferpp.cc
+++ b/apertium/transferpp.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/trx_reader.h>
#include <lttoolbox/lt_locale.h>
diff --git a/apertium/trx_reader.cc b/apertium/trx_reader.cc
index e93c2d8..6f9347f 100644
--- a/apertium/trx_reader.cc
+++ b/apertium/trx_reader.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/trx_reader.h>
#include <lttoolbox/xml_parse_util.h>
@@ -32,17 +30,14 @@ wstring const
TRXReader::ANY_CHAR = L"<ANY_CHAR>";
void
-TRXReader::copy(TRXReader const &o)
-{
-}
-
-void
TRXReader::destroy()
{
xmlFreeTextReader(reader);
}
-TRXReader::TRXReader()
+TRXReader::TRXReader() :
+reader(0),
+type(0)
{
td.getAlphabet().includeSymbol(ANY_TAG);
td.getAlphabet().includeSymbol(ANY_CHAR);
@@ -53,11 +48,6 @@ TRXReader::~TRXReader()
destroy();
}
-TRXReader::TRXReader(TRXReader const &o)
-{
- copy(o);
-}
-
void
TRXReader::step()
{
@@ -70,17 +60,6 @@ TRXReader::step()
type = xmlTextReaderNodeType(reader);
}
-TRXReader &
-TRXReader::operator =(TRXReader const &o)
-{
- if(this != &o)
- {
- destroy();
- copy(o);
- }
- return *this;
-}
-
wstring
TRXReader::attrib(wstring const &name)
{
diff --git a/apertium/trx_reader.h b/apertium/trx_reader.h
index bbe08ea..fdf350f 100644
--- a/apertium/trx_reader.h
+++ b/apertium/trx_reader.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TRXREADER_
#define _TRXREADER_
@@ -48,7 +46,6 @@ private:
wstring attrib(wstring const &name);
void parseError(wstring const &message);
- void copy(TRXReader const &o);
void destroy();
void clearTagIndex();
@@ -78,8 +75,6 @@ public:
TRXReader();
~TRXReader();
- TRXReader(TRXReader const &o);
- TRXReader & operator =(TRXReader const &o);
void read(string const &filename);
void write(string const &filename);
diff --git a/apertium/tsx_reader.cc b/apertium/tsx_reader.cc
index 878df46..8b2e57b 100644
--- a/apertium/tsx_reader.cc
+++ b/apertium/tsx_reader.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/tsx_reader.h>
#include <lttoolbox/xml_parse_util.h>
@@ -36,7 +34,9 @@ TSXReader::destroy()
{
}
-TSXReader::TSXReader()
+TSXReader::TSXReader() :
+reader(0),
+type(0)
{
open_class = &(tdata.getOpenClass());
forbid_rules = &(tdata.getForbidRules());
diff --git a/apertium/tsx_reader.h b/apertium/tsx_reader.h
index fe3102c..ee0ee6d 100644
--- a/apertium/tsx_reader.h
+++ b/apertium/tsx_reader.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TSXREADER_
#define _TSXREADER_
@@ -64,7 +62,6 @@ private:
void procLabelSequence();
void procEnforce();
void procPreferences();
- void copy(TSXReader const &o);
void destroy();
void clearTagIndex();
@@ -72,11 +69,14 @@ private:
public:
TSXReader();
~TSXReader();
- TSXReader(TSXReader const &o);
- TSXReader & operator =(TSXReader const &o);
void read(string const &filename);
TaggerData & getTaggerData();
+
+private:
+ void copy(TSXReader const &o);
+ TSXReader(TSXReader const &o);
+ TSXReader & operator =(TSXReader const &o);
};
#endif
diff --git a/apertium/ttag.h b/apertium/ttag.h
index bffea9b..d78aa49 100644
--- a/apertium/ttag.h
+++ b/apertium/ttag.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TTAG_
#define _TTAG_
diff --git a/apertium/unlocked_cstdio.h b/apertium/unlocked_cstdio.h
index 49b915f..a85b88a 100644
--- a/apertium/unlocked_cstdio.h
+++ b/apertium/unlocked_cstdio.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _APERTIUM_UNLOCKED_CSTDIO_
@@ -54,4 +52,10 @@
#define fputws_unlocked fputws
#endif
+#if !HAVE_MBTOWC
+#include <cwchar>
+inline int wctomb(char *s, wchar_t wc) { return wcrtomb(s,wc,NULL); }
+inline int mbtowc(wchar_t *pwc, const char *s, size_t n) { return mbrtowc(pwc, s, n, NULL); }
+#endif
+
#endif
diff --git a/apertium/utf_converter.cc b/apertium/utf_converter.cc
index a537e7e..1987ecb 100644
--- a/apertium/utf_converter.cc
+++ b/apertium/utf_converter.cc
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <apertium/utf_converter.h>
#include <iostream>
diff --git a/apertium/utf_converter.h b/apertium/utf_converter.h
index bafa8d0..202012e 100644
--- a/apertium/utf_converter.h
+++ b/apertium/utf_converter.h
@@ -12,9 +12,7 @@
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _UTFCONVERTER_
#define _UTFCONVERTER_
diff --git a/apertium/wchar_t_exception.h b/apertium/wchar_t_exception.h
new file mode 100644
index 0000000..f936937
--- /dev/null
+++ b/apertium/wchar_t_exception.h
@@ -0,0 +1,53 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef WCHAR_T_EXCEPTION_H
+#define WCHAR_T_EXCEPTION_H
+
+#include "wchar_t_exception_type.h"
+
+#include <sstream>
+#include <string>
+
+namespace Apertium {
+namespace wchar_t_Exception {
+
+#define WCHAR_T_EXCEPTION(WCHAR_T_EXCEPTION_TYPE) \
+ class WCHAR_T_EXCEPTION_TYPE : public ::Apertium::wchar_t_ExceptionType { \
+ public: \
+ WCHAR_T_EXCEPTION_TYPE(const wchar_t *wchar_t_what_) \
+ : wchar_t_ExceptionType(wchar_t_what_) {} \
+ WCHAR_T_EXCEPTION_TYPE(const std::wstring &wchar_t_what_) \
+ : wchar_t_ExceptionType(wchar_t_what_) {} \
+ WCHAR_T_EXCEPTION_TYPE(const std::wstringstream &wchar_t_what_) \
+ : wchar_t_ExceptionType(wchar_t_what_) {} \
+ ~WCHAR_T_EXCEPTION_TYPE() throw() {} \
+ };
+
+namespace Stream {
+WCHAR_T_EXCEPTION(TheCharacterStream_not_good)
+WCHAR_T_EXCEPTION(UnexpectedAnalysis)
+WCHAR_T_EXCEPTION(UnexpectedCase)
+WCHAR_T_EXCEPTION(UnexpectedCharacter)
+WCHAR_T_EXCEPTION(UnexpectedEndOfFile)
+WCHAR_T_EXCEPTION(UnexpectedLemma)
+WCHAR_T_EXCEPTION(UnexpectedPreviousCase)
+}
+
+#undef WCHAR_T_EXCEPTION
+}
+}
+
+#endif // WCHAR_T_EXCEPTION_H
diff --git a/apertium/wchar_t_exception_type.cc b/apertium/wchar_t_exception_type.cc
new file mode 100644
index 0000000..a1f5bdd
--- /dev/null
+++ b/apertium/wchar_t_exception_type.cc
@@ -0,0 +1,90 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#include "wchar_t_exception_type.h"
+
+#include "exception.h"
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdlib>
+#include <cstring>
+#include <cwchar>
+#include <sstream>
+#include <string>
+
+namespace Apertium {
+void swap(wchar_t_ExceptionType &a, wchar_t_ExceptionType &b) {
+ using std::swap;
+
+ swap(a.what_, b.what_);
+}
+
+wchar_t_ExceptionType::wchar_t_ExceptionType(const wchar_t *wchar_t_what_)
+ : what_(new char[size(wchar_t_what_)]) {
+ constructor(wchar_t_what_);
+}
+
+wchar_t_ExceptionType::wchar_t_ExceptionType(const std::wstring &wchar_t_what_)
+ : what_(new char[size(wchar_t_what_.c_str())]) {
+ constructor(wchar_t_what_.c_str());
+}
+
+wchar_t_ExceptionType::wchar_t_ExceptionType(
+ const std::wstringstream &wchar_t_what_)
+ : what_(new char[size(wchar_t_what_.str().c_str())]) {
+ constructor(wchar_t_what_.str().c_str());
+}
+
+wchar_t_ExceptionType::wchar_t_ExceptionType(
+ const wchar_t_ExceptionType &wchar_t_ExceptionType_)
+ : what_(new char[std::strlen(wchar_t_ExceptionType_.what_) + 1]) {
+ std::strcpy(what_, wchar_t_ExceptionType_.what_);
+}
+
+wchar_t_ExceptionType &wchar_t_ExceptionType::
+operator=(wchar_t_ExceptionType wchar_t_ExceptionType_) {
+ swap(*this, wchar_t_ExceptionType_);
+ return *this;
+}
+
+wchar_t_ExceptionType::~wchar_t_ExceptionType() throw() { delete[] what_; }
+
+const char *wchar_t_ExceptionType::what() const throw() { return what_; }
+
+std::size_t wchar_t_ExceptionType::size(const wchar_t *wchar_t_what_) {
+ std::mbstate_t ps = {0};
+ errno = 0;
+ std::size_t size_ = std::wcsrtombs(NULL, &wchar_t_what_, 0, &ps);
+
+ if (errno == EILSEQ)
+ throw Exception::wchar_t_ExceptionType::EILSEQ_(
+ "can't convert const wchar_t *wchar_t_what_ to char * : unexpected "
+ "wide character");
+
+ return size_ + 1;
+}
+
+void wchar_t_ExceptionType::constructor(const wchar_t *wchar_t_what_) {
+ std::mbstate_t ps = {0};
+ errno = 0;
+ std::wcsrtombs(what_, &wchar_t_what_, size(wchar_t_what_), &ps);
+
+ if (errno == EILSEQ)
+ throw Exception::wchar_t_ExceptionType::EILSEQ_(
+ "can't convert const wchar_t *const wchar_t_what_ to char *what_: "
+ "unexpected wide character");
+}
+}
diff --git a/apertium/wchar_t_exception_type.h b/apertium/wchar_t_exception_type.h
new file mode 100644
index 0000000..14b68d5
--- /dev/null
+++ b/apertium/wchar_t_exception_type.h
@@ -0,0 +1,45 @@
+// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+#ifndef WCHAR_T_EXCEPTION_TYPE_H
+#define WCHAR_T_EXCEPTION_TYPE_H
+
+#include "basic_exception_type.h"
+
+#include <cstddef>
+#include <sstream>
+#include <string>
+
+namespace Apertium {
+class wchar_t_ExceptionType : public basic_ExceptionType {
+public:
+ friend void swap(wchar_t_ExceptionType &a, wchar_t_ExceptionType &b);
+ wchar_t_ExceptionType(const wchar_t *wchar_t_what_);
+ wchar_t_ExceptionType(const std::wstring &wchar_t_what_);
+ wchar_t_ExceptionType(const std::wstringstream &wchar_t_what_);
+ wchar_t_ExceptionType(const wchar_t_ExceptionType &wchar_t_ExceptionType_);
+ wchar_t_ExceptionType &
+ operator=(wchar_t_ExceptionType wchar_t_ExceptionType_);
+ virtual ~wchar_t_ExceptionType() throw();
+ const char *what() const throw();
+
+private:
+ static std::size_t size(const wchar_t *wchar_t_what_);
+ void constructor(const wchar_t *wchar_t_what_);
+ char *what_;
+};
+}
+
+#endif // WCHAR_T_EXCEPTION_TYPE_H
diff --git a/configure.ac b/configure.ac
index 1e0b278..190872d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3,12 +3,12 @@
AC_PREREQ(2.52)
-m4_define([required_lttoolbox_version], [3.3.1])
+m4_define([required_lttoolbox_version], [3.3.3])
m4_define([required_libxml_version], [2.6.17])
m4_define([required_libpcre_version], [6.4])
#m4_define([required_pkg_config_version], [0.15])
-AC_INIT([apertium], [3.4.0], [sortiz at users.sourceforge.net])
+AC_INIT([apertium], [3.4.2], [sortiz at users.sourceforge.net])
AC_CONFIG_HEADER([apertium/apertium_config.h])
AC_CANONICAL_SYSTEM
@@ -50,80 +50,19 @@ AC_SUBST(GENERIC_VERSION)
VERSION=$GENERIC_VERSION
-AM_INIT_AUTOMAKE($PACKAGE, $VERSION, no-define)
-
-# Checks for programs.
-
-AC_MSG_CHECKING([Compilation architecture: PPC, i686, x86_64, Other])
-if test x$(which arch) = x
-then ARCH=$($(which uname) -m)
-else ARCH=$($(which arch))
-fi
-
-case "${target_os}" in
- mingw*)
- MinGW=yes
- ;;
- cygwin*)
- Cygwin=yes
- ;;
-esac
-
-if test x$ARCH = xppc
-then
- AC_MSG_RESULT([PowerPC])
- CFLAGS="$CFLAGS -Wall -ansi -fomit-frame-pointer"
- CXXFLAGS="$CXXFLAGS -Wall -ansi -fomit-frame-pointer"
-else
- if test x$MinGW = xyes
- then
- AC_MSG_RESULT([MinGW])
- CFLAGS="$CFLAGS -Wall -march=i686 -O3 -DMINGW -fomit-frame-pointer \
- -funroll-loops -I/include -L/lib"
- CXXFLAGS="$CXXFLAGS -Wall -march=i686 -O3 \
- -fomit-frame-pointer -funroll-loops -I/include -L/lib"
- CPPFLAGS="$CPPFLAGS -I/include"
- else
- if test x$Cygwin = xyes
- then
- AC_MSG_RESULT([Cygwin])
- CFLAGS="$CFLAGS -Wall -march=i686 -O3 -fomit-frame-pointer \
- -funroll-loops"
- CXXFLAGS="$CXXFLAGS -Wall -march=i686 -O3 \
- -fomit-frame-pointer -funroll-loops"
- CPPFLAGS="$CPPFLAGS"
- else
- if test x$ARCH = xi686
- then
- AC_MSG_RESULT([i686])
- CFLAGS="$CFLAGS -Wall -ansi -march=i686 -O3 -fomit-frame-pointer -funroll-loops"
- CXXFLAGS="$CXXFLAGS -Wall -ansi -march=i686 -O3 \
- -fomit-frame-pointer -funroll-loops"
- else
- if test x$ARCH = xx86_64
- then
- AC_MSG_RESULT([x86_64])
- CFLAGS="-Wall -ansi -O3 -mtune=nocona -fomit-frame-pointer -funroll-loops $CFLAGS"
- CXXFLAGS="-Wall -ansi -O3 -mtune=nocona \
- -fomit-frame-pointer -funroll-loops $CXXFLAGS"
- else
- AC_MSG_RESULT([Other])
- CFLAGS="-Wall -ansi -O3 $CFLAGS"
- CXXFLAGS="-Wall -ansi -O3 $CXXFLAGS"
- fi
- fi
- fi
- fi
-fi
+AM_INIT_AUTOMAKE(no-define)
AC_PROG_CXX
AC_PROG_LIBTOOL
AM_SANITY_CHECK
AC_LANG_CPLUSPLUS
+CFLAGS="-Wall -Wextra $CFLAGS"
+CXXFLAGS="-Wall -Wextra $CXXFLAGS"
+
AC_ARG_ENABLE(debug,
[ --enable-debug Enable "-g -Wall" compiler options],
- [CXXFLAGS="-g -Wall"; CFLAGS="-g -Wall"])
+ [CXXFLAGS="-g -Wall"; CFLAGS="-g -Wall"; AC_DEFINE([ENABLE_DEBUG], [1], [ENABLE_DEBUG])])
AC_ARG_ENABLE(profile,
[ --enable-profile Enable "-pg -g -Wall" compiler options],
@@ -242,7 +181,7 @@ AC_FUNC_ERROR_AT_LINE
AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, getopt_long, fgetwc_unlocked, fputwc_unlocked, fgetws_unlocked, fputws_unlocked])
-AC_CHECK_FUNCS([setlocale strdup getopt_long snprintf])
+AC_CHECK_FUNCS([setlocale strdup getopt_long snprintf mbtowc])
AM_CONDITIONAL([WINDOWS], [test x$version_type = xwindows])
AS_IF([test x$version_type = xwindows], [AC_DEFINE(HAVE_GETOPT_LONG,0)], [])
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/apertium.git
More information about the debian-science-commits
mailing list