[hfst-ospell] 01/02: Imported Upstream version 0.3.1~r4598
Tino Didriksen
tinodidriksen-guest at moszumanska.debian.org
Tue Jan 12 10:56:06 UTC 2016
This is an automated email from the git hooks/post-receive script.
tinodidriksen-guest pushed a commit to branch master
in repository hfst-ospell.
commit 09122755d7aa8d3071d152280147c70d18c074e9
Author: Tino Didriksen <tino at didriksen.cc>
Date: Tue Jan 12 10:55:02 2016 +0000
Imported Upstream version 0.3.1~r4598
---
AUTHORS | 11 +
COPYING | 201 +++++
ChangeLog | 1082 ++++++++++++++++++++++++
Doxyfile | 1902 +++++++++++++++++++++++++++++++++++++++++++
INSTALL | 234 ++++++
Makefile.am | 215 +++++
NEWS | 70 ++
README | 101 +++
ZHfstOspeller.cc | 472 +++++++++++
ZHfstOspeller.h | 204 +++++
ZHfstOspellerXmlMetadata.cc | 1023 +++++++++++++++++++++++
ZHfstOspellerXmlMetadata.h | 170 ++++
acceptor.basic.txt | 7 +
analyse-spell.sh | 11 +
analyser.default.txt | 12 +
authors.xml | 7 +
autogen.sh | 1491 +++++++++++++++++++++++++++++++++
bad-errormodel.sh | 11 +
basic-edit1.sh | 11 +
basic-zhfst.sh | 11 +
basic_test.xml | 34 +
configure.ac | 179 ++++
doc/index.html | 34 +
edit2-small.png | Bin 0 -> 21654 bytes
empty-descriptions.sh | 11 +
empty-locale.sh | 11 +
empty-titles.sh | 11 +
empty-zhfst.sh | 12 +
empty_descriptions.xml | 24 +
empty_locale.xml | 34 +
empty_titles.xml | 33 +
errmodel.basic.txt | 5 +
errmodel.edit1.txt | 728 +++++++++++++++++
errmodel.extrachars.txt | 5 +
hfst-ol.cc | 861 ++++++++++++++++++++
hfst-ol.h | 453 +++++++++++
hfst-ospell-office.1 | 19 +
hfst-ospell.1 | 58 ++
hfstospell.pc.in | 10 +
m4/ax_check_compile_flag.m4 | 74 ++
main-cicling.cc | 210 +++++
main-fsmnlp-2012.cc | 440 ++++++++++
main-ispell.cc | 455 +++++++++++
main-lrec2013.cc | 518 ++++++++++++
main-norvig.cc | 263 ++++++
main-survey.cc | 523 ++++++++++++
main.cc | 610 ++++++++++++++
no-errormodel.sh | 11 +
no_errmodel.xml | 26 +
office.cpp | 360 ++++++++
ol-exceptions.h | 84 ++
ospell.cc | 1174 ++++++++++++++++++++++++++
ospell.h | 445 ++++++++++
test.strings | 5 +
test/editdist.py | 457 +++++++++++
trailing-spaces.sh | 11 +
trailing_spaces.xml | 36 +
windows-Makefile.am | 278 +++++++
windows-configure.ac | 143 ++++
59 files changed, 15891 insertions(+)
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..89b61a3
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,11 @@
+Authors of HFST ospell
+----------------------
+
+This lists authors relevant for copyright issues. See also THANKS.
+
+Listing is alphabetically sorted.
+
+Sam Hardwick
+Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+Sjur Nørstebo Moshagen
+
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/ChangeLog b/ChangeLog
new file mode 100644
index 0000000..4f1cca1
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1,1082 @@
+2014-04-10 08:18 eaxelson
+
+ * main.cc: Now printing and reading to/from the console work in
+ hfst-ospell if it is compiled with -DWINDOWS.
+
+2014-03-28 08:15 eaxelson
+
+ * Makefile.am, hfst-ospell.1: Added a rule for man page creation in
+ Makefile.am.
+
+2014-03-14 07:28 moshagen
+
+ * Makefile.am, empty-zhfst.sh: Added a test for an empty zhfst
+ file. It should fail, but presently hfst-ospell behaves as if all
+ is ok, just that it isn't able to correct anything.
+
+2014-03-14 07:24 moshagen
+
+ * bad-errormodel.sh, basic-legacy.sh, basic-zhfst-fallback.sh,
+ basic-zhfst.sh, empty-descriptions.sh, empty-locale.sh,
+ empty-titles.sh, trailing-spaces.sh: Corrected return value to
+ what it should be for a SKIP result, which I assume was intended.
+
+2014-03-11 17:15 moshagen
+
+ * Makefile.am: Added files to the distro.
+
+2014-03-11 06:14 moshagen
+
+ * Makefile.am, bad-errormodel.sh, errmodel.extrachars.txt: Added
+ test to check whether error models with extra chars are correctly
+ handled. The test passes with revision 3793 of ospell.cc, and
+ fails with revision 3729.
+
+2014-03-09 14:24 hardwick
+
+ * ospell.cc: Silently ignore non-lexicon symbols in the error
+ source, translating them to NO_SYMBOL on the output side.
+
+2014-02-13 21:50 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ospell.cc, ospell.h: revert
+
+2014-02-13 19:46 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ospell.cc, ospell.h: What happens if I do this?
+
+2014-02-13 19:17 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, hfst-ol.cc, hfst-ol.h, ospell.cc, ospell.h: Moving
+ implementations?
+
+2014-02-13 17:59 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am: Install man pages
+
+2014-02-13 17:37 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * hfst-ospell.1: A man page for Debian's lintian
+
+2014-02-05 05:55 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspeller.cc: The libarchive deprecation warning will
+ probably go away with the different function name here too
+
+2014-02-05 05:33 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ospell.cc, ospell.h: Move some of the code away from headers
+
+2014-01-27 12:26 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspeller.cc: some old changes on work desktop?
+
+2013-12-23 05:52 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspellerXmlMetadata.cc: Ensure other potentially empty tags
+ for libxmlpp
+
+2013-12-23 05:22 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * configure.ac: Bumped version for tinyxml2 check to get the
+ length-aware Parse, by Børre Gaup
+
+2013-12-22 07:01 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspeller.cc, ZHfstOspellerXmlMetadata.cc, ospell.cc:
+ unarchive to mem and tinyxml2 xml length fixes from Harri
+ Pitkänen
+
+2013-12-20 08:57 moshagen
+
+ * ZHfstOspeller.cc, ZHfstOspeller.h: Reverted the patch by Børre to
+ clean the xml off of random byte noise after the end of the xml
+ document. The issue was caused by zip compression
+ incompatibilities between different platforms, and the solution
+ (for now) is to just skip compression altogether, using 'zip -Z
+ store' instead when building the zhfst files. This creates a zip
+ file readable everywhere (but without compression).
+
+2013-12-18 13:03 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, trailing-spaces.sh, trailing_spaces.xml: Test for
+ trailing spaces in xml structure
+
+2013-12-16 15:20 moshagen
+
+ * ZHfstOspeller.cc, ZHfstOspeller.h: Patch by Børre Gaup to clean
+ corrupted xml data returned from Libarchive when reading xml data
+ from a zhfst file stored in RAM:
+
+ libarchive seems to add one char from the following file in the
+ stream per whitespace char it (erroneously) removes from the xml
+ data around newlines, since the returned xml file is truncated
+ compared to the original file length reported when stored in the
+ archive. Although the xml info set is exactly the same, the extra
+ chars after the final closing element cause TinyXML2 to crash.
+
+ This change removes everything after the final '>', which should
+ clean the string representation of the xml document properly in
+ all cases but one: when the semi-random noise contains a '>'.
+ This situation is presently not handled.
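+
+ As a minimal sketch of that cleanup (hypothetical helper name, not
+ the actual patch): keep everything up to and including the final
+ '>'.
+
+ #include <string>
+
+ std::string strip_trailing_noise(const std::string& xml)
+ {
+     // drop the semi-random bytes after the final closing element
+     std::string::size_type pos = xml.rfind('>');
+     return pos == std::string::npos ? xml : xml.substr(0, pos + 1);
+ }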
+
+2013-12-14 13:11 moshagen
+
+ * Makefile.am: Whitespace changes to make the file easier on the
+ eyes; added comments. Added tests for empty locale and empty title
+ elements.
+
+2013-12-14 13:10 moshagen
+
+ * empty-locale.sh, empty-titles.sh, empty_locale.xml,
+ empty_titles.xml: Data and shell scripts to test empty titles and
+ empty locale.
+
+2013-12-14 13:09 moshagen
+
+ * ZHfstOspellerXmlMetadata.cc: Check for empty locale node.
+
+2013-12-13 23:50 moshagen
+
+ * ., Makefile.am: Clean and ignore index.xml.
+
+2013-12-13 09:37 moshagen
+
+ * ZHfstOspellerXmlMetadata.cc: One more test for empty data.
+
+2013-12-13 09:37 moshagen
+
+ * main.cc: Added newline before error cause.
+
+2013-12-13 01:10 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, ZHfstOspellerXmlMetadata.cc, configure.ac, main.cc:
+ Empty descriptions will throw (there might be others left)
+
+2013-12-13 00:48 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, basic-zhfst.sh, basic_test.xml,
+ empty-descriptions.sh, empty_descriptions.xml, index.xml:
+ index.xml with empty descriptions
+
+2013-12-13 00:36 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am: Test.strings shall not be deleted when cleaning
+
+2013-12-11 14:12 moshagen
+
+ * configure.ac: Added check for the availability of pkg-config -
+ without it configuration will fail in subtle but bad ways.
+
+2013-11-20 01:36 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ChangeLog, NEWS: Document for next release candidate
+
+2013-11-19 22:58 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspellerXmlMetadata.cc: Missing model attribute parsing for
+ errm
+
+2013-11-19 22:27 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspellerXmlMetadata.cc, ZHfstOspellerXmlMetadata.h: Use
+ Elements instead of Nodes and other such fixes
+
+2013-11-19 22:26 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, configure.ac: Use automake conditionals to avoid
+ pkg-config linking to libraries that
+ are not in configure's code paths
+
+2013-11-19 21:07 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * configure.ac: update configuration for that
+
+2013-11-19 20:52 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspellerXmlMetadata.cc: Add tinyxml2 versions of XML parsing
+
+2013-11-14 02:06 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, configure.ac: conference demos as configure option
+
+2013-10-08 11:21 moshagen
+
+ * .: Ignore generated files.
+
+2013-10-08 07:55 spectre360
+
+ * main-lrec2013.cc: stats -> status
+
+2013-10-02 16:03 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * main-lrec2013.cc: Allow to end correct correction
+
+2013-09-27 16:25 hardwick
+
+ * ospell.cc, ospell.h: Merge lookup() from devbranch
+
+2013-09-24 14:52 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, main-ispell.cc: This should be useful
+
+2013-09-24 11:25 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * main-lrec2013.cc: Fix the output tsv a bit more
+
+2013-09-23 15:56 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, configure.ac, main-lrec2013.cc: Slight update for
+ new measurements
+
+2013-07-04 05:02 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspellerXmlMetadata.cc, ZHfstOspellerXmlMetadata.h: Few
+ starts
+
+2013-07-04 02:29 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * configure.ac: Allow selection of xml backend
+
+2013-05-28 14:34 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * configure.ac: Switch <3 into >3 because it looks nicer.
+
+2013-05-24 15:32 moshagen
+
+ * ospell.cc, ospell.h: strndup() fixed by Tommi.
+
+2013-05-24 11:44 hardwick
+
+ * ZHfstOspeller.cc, ZHfstOspellerXmlMetadata.cc, ospell.h: Pending
+ proper understanding of why strndup is getting defined several
+ times over, let's always use hfst_strndup() instead.
+
+2013-05-24 11:29 hardwick
+
+ * ospell.h: Add include guard to custom strndup so it doesn't get
+ compiled more than once
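+
+ A minimal sketch of that pattern, assuming a hypothetical guard
+ macro name around the hfst_strndup() mentioned above:
+
+ #ifndef HFST_OSPELL_STRNDUP_H
+ #define HFST_OSPELL_STRNDUP_H
+ #include <cstdlib>
+ #include <cstring>
+
+ inline char* hfst_strndup(const char* s, size_t n)
+ {
+     // copy at most n bytes of s into a fresh NUL-terminated buffer
+     size_t len = 0;
+     while (len < n && s[len] != '\0') { ++len; }
+     char* dup = static_cast<char*>(malloc(len + 1));
+     if (dup != NULL) {
+         memcpy(dup, s, len);
+         dup[len] = '\0';
+     }
+     return dup;
+ }
+ #endif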
+
+2013-05-24 08:58 hardwick
+
+ * ZHfstOspeller.cc, ospell.h: Move our custom strndup to ospell.h
+ so it will be seen by
+ all compilation units that need it
+
+2013-05-21 12:33 moshagen
+
+ * Makefile.am, NEWS, configure.ac, hfstospell.pc.in: Renamed the
+ hfstospell package and variables to what they used to be, in case
+ that can help solve a build issue with libvoikko+hfst.
+
+2013-04-27 11:36 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ChangeLog: Wrap and load
+
+2013-04-27 11:32 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am: versioninfo 3:1:1
+
+2013-04-26 17:58 moshagen
+
+ * NEWS: Preparing for 0.2.3 release.
+
+2013-04-26 14:34 moshagen
+
+ * configure.ac, hfstospell.pc.in: Changed version number to 0.2.3,
+ and renamed the package and tool name to 'hfst-ospell', since
+ that is what the command-line tool is actually called, and it is
+ consistent with the rest of the hfst tool names.
+
+2013-04-26 12:44 hardwick
+
+ * ospell.cc: Should fix #176
+ Flag state from the stack back was getting clobbered in some
+ situations.
+ Now we restore flag state after modifying it for new nodes.
+
+2013-03-22 07:49 moshagen
+
+ * .: More ignore patterns to ignore files generated during 'make
+ check'.
+
+2013-03-22 06:21 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am: Fix dist stuff, wrap.
+
+2013-03-20 10:13 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, ZHfstOspeller.cc, configure.ac: Use pkg-config
+ instead of autoconf to check libarchive
+
+2013-03-18 12:43 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ChangeLog, Makefile.am, NEWS, configure.ac: Prepare files for
+ 0.2.2
+
+2013-03-18 12:37 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspellerXmlMetadata.cc, main.cc: Fix few leaks
+
+2013-03-18 10:26 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspeller.cc, hfst-ol.cc, hfst-ol.h, ospell.h: Fixes to
+ [#165] by Harri Pitkänen
+
+2013-02-04 13:54 hardwick
+
+ * ospell.cc, ospell.h: Made some changes to depth-first searching
+ to prevent just-added
+ nodes getting removed. This appears fo fix a very long-standing
+ and serious bug dating from r2763.
+
+2013-01-16 12:46 hardwick
+
+ * hfst-ol.cc, hfst-ol.h: When data is in memory, allocate & copy it
+ in ospell rather than
+ expect caller to keep track of it (which it usually doesn't)
+
+2013-01-08 06:21 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspeller.cc, ZHfstOspeller.h: Add access to metadata
+
+2012-12-08 10:32 hardwick
+
+ * hfst-ol.h: When initialised from memory, don't assume
+ responsibility for
+ freeing it
+
+2012-12-04 08:48 hardwick
+
+ * hfst-ol.cc, hfst-ol.h, ospell.cc, ospell.h: Somewhat experimental
+ (no checking) way of saving memory by ignoring alignment
+ issues (4-5 x memory savings)
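+
+ For illustration only (hypothetical helper; the entry gives no
+ code): reading a field straight out of the packed, possibly
+ unaligned transducer bytes with memcpy avoids keeping an aligned,
+ padded copy of every table entry.
+
+ #include <cstring>
+
+ float read_packed_weight(const char* raw)
+ {
+     float w;
+     memcpy(&w, raw, sizeof(w)); // byte copy tolerates misalignment
+     return w;
+ }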
+
+2012-10-05 07:48 hardwick
+
+ * ospell.h: Remove leftover agenda variable
+
+2012-10-05 05:18 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, main-survey.cc: Add stuff for survey article
+
+2012-10-03 12:19 hardwick
+
+ * main.cc, ospell.h: Revert to breadth-first searching pending
+ bugfix
+
+2012-09-25 09:20 hardwick
+
+ * ospell.cc: Search depth-first by preference for hope
+
+2012-09-20 16:30 hardwick
+
+ * ospell.cc: Forgot some important checks; we actually want to
+ limit the results
+
+2012-09-20 16:27 hardwick
+
+ * ospell.cc, ospell.h: Enforce an n-best limit for continuing the
+ search, just for breadth-first for now
+
+2012-09-04 19:11 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * main.cc: Don’t just break on empty lines
+
+2012-08-21 17:59 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspeller.cc: Switch order of errmodels and acceptors in
+ legacy read
+
+2012-08-21 17:50 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspeller.cc: set spelling caps on legacy read
+
+2012-08-21 17:50 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, basic-zhfst-fallback.sh: Test fallback
+
+2012-08-15 08:48 moshagen
+
+ * basic-legacy.sh, basic-legacy.sh.in, basic-zhfst.sh,
+ basic-zhfst.sh.in, configure.ac: The VPATH approach to the test
+ shell scripts was wrong. Now everything is working as it should.
+
+2012-08-15 04:57 moshagen
+
+ * basic-legacy.sh.in, basic-zhfst.sh.in: Proper linebreaks.
+
+2012-08-15 04:56 moshagen
+
+ * ., basic-legacy.sh, basic-legacy.sh.in, basic-zhfst.sh,
+ basic-zhfst.sh.in, configure.ac: Enabling VPATH building for
+ 'make check'.
+
+2012-08-15 04:15 moshagen
+
+ * ., configure.ac: Whitespace change only in configure.ac. More
+ generated files to ignore.
+
+2012-08-15 03:57 moshagen
+
+ * Makefile.am: Added some whitespace to ease readability. Replaced
+ pattern rules with suffix rules to make automake happy.
+
+2012-08-14 20:19 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspellerXmlMetadata.cc: Fix compilation without xml,
+ as suggested by Harri Pitkänen on libvoikko-devel
+
+2012-08-14 18:06 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspeller.cc, main.cc: do not close unopened files, handle
+ legacy fallback reading errors
+
+2012-08-14 17:25 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * main.cc: print details of xml parsing errors
+
+2012-08-14 16:13 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, acceptor.default.txt, basic-legacy.sh,
+ basic-zhfst.sh, configure.ac, errmodel.default.txt, index.xml,
+ test.strings: Add the very basic test suite
+
+2012-08-13 20:23 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspeller.cc, configure.ac: Throw an exception when trying
+ to read a zip file that doesn't exist.
+
+2012-08-13 20:02 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * configure.ac: Optionalise the libraries
+
+2012-07-22 01:52 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * configure.ac: Update for automake-1.12 and AM_PROG_AR
+
+2012-07-06 19:03 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * main.cc: kill everyone if windows linebreaks
+
+2012-07-05 18:13 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * AUTHORS, ChangeLog, NEWS, README, authors.xml, configure.ac,
+ main.cc: Fix documentation and set for 0.2.1 release
+
+2012-07-04 15:38 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspeller.cc, ZHfstOspeller.h: free more things and stuff
+
+2012-07-04 15:37 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, ZHfstOspellerXmlMetadata.cc,
+ ZHfstOspellerXmlMetadata.h: Move Xml metadata parsing and storing
+ into own file and class
+
+2012-07-04 10:28 hardwick
+
+ * hfst-ol.h: Increment *raw when reading bools
+
+2012-07-04 10:23 hardwick
+
+ * hfst-ol.cc, hfst-ol.h: Added utility function for iterating c
+ strings in raw memory,
+ use it in every branch of symbol reading
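+
+ A sketch of such a helper (hypothetical name): advance a raw-memory
+ cursor past one NUL-terminated string.
+
+ void skip_c_string(char** raw)
+ {
+     while (**raw != '\0') { ++*raw; } // walk to the terminator
+     ++*raw;                           // and step past it
+ }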
+
+2012-07-04 07:21 hardwick
+
+ * hfst-ol.cc: Fixed problems having to do with reading strings from
+ a transducer in raw
+ memory.
+
+2012-07-03 21:04 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspeller.cc, configure.ac: fix the tmpdir'd version again
+
+2012-07-03 19:42 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, ZHfstOspeller.cc, ZHfstOspeller.h, configure.ac,
+ hfst-ol.cc: Version that extracts zhfst to memory iff it fits on
+ one throw
+
+2012-07-02 22:19 hardwick
+
+ * hfst-ol.cc, hfst-ol.h, ospell.h: Added raw memory constructors
+ and table readers for slurping in transducers
+ from char *
+
+2012-07-02 19:49 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * main-cicling.cc, main-fsmnlp-2012.cc: Missing files lol
+
+2012-04-25 16:36 hardwick
+
+ * test/editdist.py: ...and in that case the final identity states
+ also are greater by one.
+
+2012-04-25 16:33 hardwick
+
+ * test/editdist.py: When avoiding initial edits, there needs to be
+ an extra initial state before we do any edits. So add the value of
+ options.no_initial to the state range loop.
+
+2012-04-18 14:53 eaxelson
+
+ * Makefile.am: Commented out hfst-ospell-cicling from Makefile.
+
+2012-04-18 14:48 hardwick
+
+ * Makefile.am: Comment out missing cicling target
+
+2012-04-18 11:24 hardwick
+
+ * main-fsmnlp-2012.cc: "else if" instead of incorrect "if" in
+ option handling
+
+2012-04-17 13:23 hardwick
+
+ * test/editdist.py: --no-string-initial should have transitions to
+ state 1, not 0
+
+2012-03-31 14:06 hardwick
+
+ * main-fsmnlp-2012.cc: Don't exit on empty lines
+
+2012-03-31 13:12 hardwick
+
+ * main-fsmnlp-2012.cc: Don't print unhelpful warnings
+
+2012-03-31 08:55 hardwick
+
+ * test/editdist.py: Silly default for minimum edit
+
+2012-03-19 08:26 hardwick
+
+ * main-fsmnlp-2012.cc: fallback_spell()
+
+2012-03-19 07:40 hardwick
+
+ * test/editdist.py: --no-string-initial-correction
+
+2012-03-05 08:58 hardwick
+
+ * test/editdist.py: Corrected eliminations (hopefully), added
+ --minimum-edit option
+
+2012-02-24 05:02 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, NEWS, main-fsmnlp-2012.cc: Profiled version for
+ fsmnlp-2012 measurements
+
+2011-11-08 11:18 hardwick
+
+ * test/editdist.py: Don't skip state numbers when certain options
+ are turned off
+ (and don't print debugging lines)
+
+2011-11-08 11:16 hardwick
+
+ * test/editdist.py: Corrections to redundancy elimination
+
+2011-11-01 13:55 hardwick
+
+ * test/editdist.py: redundancy elimination was being performed one
+ state too late (too little)
+
+2011-10-25 10:40 hardwick
+
+ * test/editdist.py: Add option to disable redundancy elimination
+
+2011-10-25 10:35 hardwick
+
+ * test/editdist.py: Initial support for redundancy elimination
+ (refuse to do insertion after deletion or deletion after
+ insertion)
+
+2011-10-11 19:30 hardwick
+
+ * test/editdist.py: Bugfix: identity transitions were being
+ forgotten in the last edit state
+
+2011-10-07 10:28 hardwick
+
+ * hfst-ol.cc, test/editdist.py: Enforce @_UNKNOWN_SYMBOL_@ instead
+ of @?@, which users didn't know about
+
+2011-10-03 14:23 moshagen
+
+ * test/editdist.py: Fixed one remaining UTF-8 stderr printing bug.
+
+2011-09-28 08:52 hardwick
+
+ * test/editdist.py: Fixed bug with the newline character not being
+ stripped from excluded symbols
+
+2011-09-28 08:42 hardwick
+
+ * test/editdist.py: Lines that start with ## are comments
+
+2011-09-28 08:24 hardwick
+
+ * test/editdist.py: Updated help message & added exclusion of
+ symbols by prepending a ~
+
+2011-09-28 07:44 hardwick
+
+ * test/editdist.py: Wrap stderr with a utf-8 codec so we can print
+ non-ascii symbols when verbose
+
+2011-09-28 07:32 hardwick
+
+ * test/editdist.py: Write alphabet with weights when verbose
+
+2011-09-28 07:24 hardwick
+
+ * test/editdist.py: Order of preference of alphabet definition is
+ now configfile - commandline - transducer. If the configfile gives
+ a weight after a tab for symbols, they are used additively for all
+ edits involving those symbols.
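+
+ A hypothetical sketch of that format (symbols and weights made up),
+ together with the ## comment and ~ exclusion syntax described in
+ the entries above:
+
+ ## symbol, then a tab, then an additive per-edit weight
+ a	1.0
+ b	2.5
+ ## exclude a symbol by prepending a ~
+ ~c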
+
+2011-09-28 06:42 hardwick
+
+ * test/editdist.py: Some clarifying comments
+
+2011-09-28 06:36 hardwick
+
+ * test/editdist.py: Rescue identities from being considered
+ substitutions
+
+2011-09-28 05:50 hardwick
+
+ * test/editdist.py: UTF-8-decode the user-specified transitions
+ from a conf file
+ (so easy to forget one of these...)
+
+2011-09-13 22:03 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * hfstospell.pc.in: use same includedir in pc as makefile
+
+2011-09-13 20:54 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspeller.h: don't declare strndup in public headers
+
+2011-09-06 07:46 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ol-exceptions.h: make ol exceptions in hfst_ol namespace, provide
+ std::exception-style what()
+
+2011-09-02 06:12 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * main.cc: Add verbose, quiet
+
+2011-09-01 21:41 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspeller.cc, ZHfstOspeller.h, ospell.h: * parse all of the
+ metadata if possible
+ * use C++ structs for metadata
+
+2011-09-01 21:36 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspeller.cc: * Use temporary filenames from tmpnam
+ * Do not delete Transducers in data structures since it will
+ segfault all enchant-based applications in the dtor
+
+2011-08-31 08:56 hardwick
+
+ * Makefile.am, hfst-ol.cc, hfst-ol.h, main.cc, ol-exceptions.h:
+ libhfst-style exception macros and some more informative messages
+
+2011-08-31 08:26 hardwick
+
+ * README: Document dependencies
+
+2011-08-17 11:14 moshagen
+
+ * .: Ignore generated files.
+
+2011-08-17 08:20 moshagen
+
+ * m4: Ignore generated files.
+
+2011-08-17 02:50 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ZHfstOspeller.cc, ZHfstOspeller.h, configure.ac: Mac OS X fixes:
+ * strndup
+ * libarchive installed without headers
+
+2011-08-08 02:49 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile.am, ZHfstOspeller.cc, ZHfstOspeller.h, configure.ac, m4,
+ main.cc: Preliminary zhfst support
+
+2011-07-21 01:05 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * main-norvig.cc: Example from:
+ http://norvig.com/spell-correct.html and similar tests
+
+2011-07-20 11:22 hardwick
+
+ * hfst-ol.h: The test for final weighted transitions involves
+ target_index == 1 instead of weight == INFINITE_WEIGHT
+
+ (I wish I remembered why this was changed)
+
+2011-07-20 07:44 hardwick
+
+ * hfst-ol.h: Fixed bug involving bytewise casting of longs to
+ floats (I misunderstood what static_cast really does, I guess).
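+
+ The pitfall, as a minimal sketch (hypothetical function):
+ static_cast converts the numeric value; reinterpreting the raw
+ bytes takes a memcpy.
+
+ #include <cstring>
+ #include <stdint.h>
+
+ float bits_to_float(uint32_t bits)
+ {
+     // static_cast<float>(bits) would convert the value numerically;
+     // memcpy copies the bit pattern into the float unchanged
+     float f;
+     memcpy(&f, &bits, sizeof(f));
+     return f;
+ }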
+
+2011-05-24 10:19 hardwick
+
+ * hfst-ol.cc: fread returns element count, not byte count
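+
+ In other words (a generic sketch, not the actual call site), a
+ completeness check has to compare against the element count:
+
+ #include <cstdio>
+
+ bool read_all(FILE* f, void* buf, size_t size, size_t nmemb)
+ {
+     // fread returns the number of elements read, so a full read
+     // means the result equals nmemb, not size * nmemb
+     return fread(buf, size, nmemb, f) == nmemb;
+ }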
+
+2011-05-09 19:22 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * configure.ac, main.cc: duplicate definitions
+
+2011-05-09 19:04 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * hfst-ol.cc, main.cc: fix msvc problems
+
+2011-04-25 14:04 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * hfst-ol.cc: check fread return value as advised by gcc
+
+2011-04-20 13:29 hardwick
+
+ * ospell.cc: Removed unnecessary test
+
+2011-04-20 13:12 hardwick
+
+ * hfst-ol.cc, hfst-ol.h, main.cc, ospell.cc, ospell.h: Understand
+ hfst3 headers, don't demand weightedness at header-reading stage
+
+2011-03-03 10:45 moshagen
+
+ * .: Ignore (autotools-)generated files.
+
+2011-02-22 01:20 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * configure.ac: use hfstospell library name for compatibility or
+ whatever
+
+2011-02-19 14:19 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile, configure.ac, hfst-ol.cc, hfst-ol.h, ospell.cc: MSVC
+ fixes:
+ * include <string> when using string
+ * use boolean operators instead of aliases?
+
+2011-02-03 01:07 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile, Makefile.am, configure.ac, hfstospell.pc.in: add
+ pkgconfig stuff
+
+2011-02-03 00:36 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * AUTHORS, ChangeLog, INSTALL, Makefile, Makefile.am, NEWS,
+ autogen.sh, configure.ac: autoconfiscate :-)
+
+2010-11-29 00:18 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile, README, hfst-ol.cc, hfst-ol.h, ospell.cc, ospell.h: Add
+ licences everywhere for release
+
+2010-11-07 00:29 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile: make directories that do not exist
+
+2010-11-07 00:24 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile: Install to destdir
+
+2010-11-02 19:35 moshagen
+
+ * ., test/editdist.py: Ignore compiled libraries.
+
+2010-11-02 18:55 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile: fix missing dash in mac dylib magic
+
+2010-11-02 18:44 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * ospell.cc: Silently ignore if empty labels are missing
+
+2010-11-02 15:45 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile: Make dynamic or shared libraries
+
+2010-10-12 14:59 hardwick
+
+ * test/editdist.py: Added some fancy rule autodetection
+
+2010-10-12 14:02 hardwick
+
+ * test/editdist.py: Fixes to input format handling
+
+2010-10-12 10:22 hardwick
+
+ * test/editdist.py: New input file syntax
+
+2010-09-07 14:22 hardwick
+
+ * Makefile, hfst-ol.h, ospell.cc: More speed improvements
+
+2010-09-07 12:55 hardwick
+
+ * Makefile, ospell.cc: Various optimizations
+
+2010-09-07 09:06 hardwick
+
+ * hfst-ol.cc, hfst-ol.h, ospell.cc, ospell.h: Critical bugfix,
+ output now believed to be correct
+
+2010-08-30 19:34 hardwick
+
+ * test/editdist.py: Diagnostics and info about expected transducer
+ model in test script
+
+2010-08-12 20:26 hardwick
+
+ * test/editdist.py: More helpful help message for test script
+
+2010-08-12 10:52 moshagen
+
+ * test/editdist.py: Should be executable.
+
+2010-08-12 01:15 hardwick
+
+ * test/editdist.py: Support for OTHER symbol in test script
+
+2010-08-11 22:31 hardwick
+
+ * Makefile, main.cc: Added profiler flag to debug compilation
+ target and made demo utility exit
+ on empty lines
+
+2010-08-11 22:03 hardwick
+
+ * ospell.h: Trivial cosmetic changes
+
+2010-08-11 21:57 hardwick
+
+ * hfst-ol.cc, hfst-ol.h: More header cleanup
+
+2010-08-11 21:49 hardwick
+
+ * hfst-ol.cc, hfst-ol.h: Renamed variable
+
+2010-08-11 21:47 hardwick
+
+ * hfst-ol.cc, hfst-ol.h, ospell.h: Misc. code cleanup and memory
+ savings
+
+2010-08-11 20:43 hardwick
+
+ * hfst-ol.h: Free memory holding transducer data after parsing
+
+2010-08-11 19:11 hardwick
+
+ * ospell.cc, ospell.h: Some more const sprinkling
+
+2010-08-11 19:04 hardwick
+
+ * hfst-ol.h, main.cc, ospell.h: Misc. nonfunctional cleanup
+
+2010-08-11 18:15 hardwick
+
+ * test/editdist.py: Hack to make the test script handle unicode
+
+2010-08-11 18:10 hardwick
+
+ * test/editdist.py: Added character swaps to edit distance script.
+ You have to enable them with the -s flag - they generate A(A-1)*D
+ new states and twice that many transitions, where A is the size of
+ the alphabet and D is the edit distance. Pretty expensive. Is there
+ a better way?
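+
+ (For example, with alphabet size A = 26 and edit distance D = 2,
+ that is 26 * 25 * 2 = 1300 new states and 2600 transitions.)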
+
+2010-08-11 17:37 hardwick
+
+ * test/editdist.py: Improvements to editdist.py - see
+ test/editdist.py --help
+
+2010-08-11 16:52 hardwick
+
+ * test/editdist.py: Put 1.0 weights on the test generator script
+
+2010-08-11 16:44 hardwick
+
+ * test/editdist.py: Minor enhancement to test script
+
+2010-08-10 12:27 hardwick
+
+ * main.cc, ospell.cc, ospell.h: Added helpful runtime error for
+ alphabet translation problems, updated
+ demo utility to make use of it
+
+2010-08-10 09:54 hardwick
+
+ * hfst-ol.cc, hfst-ol.h: Better checking of read operations, added
+ relevant exceptions
+
+2010-08-09 22:25 hardwick
+
+ * README: Made example formatting in README more consistent - I may
+ have broken
+ Tommi's commit a bit, but I think it's ok now...
+
+2010-08-09 22:21 hardwick
+
+ * README, main.cc: Minor improvement to demo
+
+2010-08-09 22:02 Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+
+ * Makefile, README: Static lib and fixes to examples in readme
+
+2010-08-09 22:00 hardwick
+
+ * ospell.h: Added comment
+
+2010-08-09 20:44 hardwick
+
+ * ospell.h: Reversed previous commit, which did the opposite of
+ what the commit message
+ said it would. Committer will go to bed now...
+
+2010-08-09 20:41 hardwick
+
+ * ospell.h: Return results in reverse order by weight, i.e. in order
+ of quality
+ (instead of the opposite)
+
+2010-08-09 20:04 hardwick
+
+ * ospell.h: Removed obsolete dependency on cassert
+
+2010-08-09 20:03 hardwick
+
+ * hfst-ol.h: Fixed some comments
+
+2010-08-09 20:01 hardwick
+
+ * ospell.cc: One more formatting fix
+
+2010-08-09 20:00 hardwick
+
+ * ospell.cc: Fixed typo in comment
+
+2010-08-09 19:58 hardwick
+
+ * main.cc, ospell.h: Moved getopt dependency from ospell.h to the
+ demo utility proper
+
+2010-08-09 19:56 hardwick
+
+ * hfst-ol.cc, hfst-ol.h, ospell.cc, ospell.h: Formatting
+ improvements
+
+2010-08-09 15:43 hardwick
+
+ * README, main.cc, ospell.cc, ospell.h: Introduced an exception for
+ handling alphabet translation failure,
+ fixed typo in help string, updated README
+
+2010-08-09 14:26 hardwick
+
+ * main.cc, ospell.cc: Made some changes to correction-storing data
+ structures to make sure each
+ correction string only appears once
+
+2010-08-09 13:37 hardwick
+
+ * hfst-ol.cc, hfst-ol.h, ospell.h, test/editdist.py, test/test.txt:
+ Fatal bug(s) fixed, (more) correct flag diacritic functionality
+
+2010-08-06 12:10 hardwick
+
+ * test/editdist.py, test/french.hfst, test/test2.txt: New test
+ script
+
+2010-08-06 12:05 hardwick
+
+ * ospell.cc: Fixed typo
+
+2010-08-06 11:59 hardwick
+
+ * main.cc, ospell.cc, test/test.txt: Fixed a braindead bug that
+ subtly broke everything; this should make
+ some code redundant
+
+2010-08-02 13:46 hardwick
+
+ * ospell.cc, ospell.h: A way to handle flag diacritics
+
+2010-07-08 15:33 hardwick
+
+ * Makefile: Trivial Makefile fix for commandline tester
+
+2010-07-08 15:26 hardwick
+
+ * README, hfst-ol.cc, hfst-ol.h: Replaced some ungracious exits
+ with exceptions and made small change to README
+
+2010-07-08 15:15 hardwick
+
+ * README, hfst-ol.cc, hfst-ol.h, main.cc, ospell.cc, ospell.h:
+ Added README and some fixes
+
+2010-07-08 14:46 hardwick
+
+ * hfst-ol.cc, hfst-ol.h, main.cc, ospell.cc, ospell.h: Implemented
+ spellchecking and correction library functions; documentation,
+ proper packaging and esp. functioning flag diacritics still to be
+ done.
+
+2010-07-05 11:05 hardwick
+
+ * ospell.h: Temporarily de-autotooled ospell
+
+2010-06-30 09:39 hardwick
+
+ * ospell.cc, ospell.h: Incorporated queue in speller proper
+
+2010-06-29 06:31 hardwick
+
+ * Makefile, hfst-ol.cc, hfst-ol.h, ospell.cc, ospell.h,
+ test/french-symbols.txt, test/french.hfst.ol, test/test.hfst.ol,
+ test/test2.hfst.ol: Fixed behaviour, added weightedness
+ scaffolding
+
+2010-06-22 14:38 moshagen
+
+ * main.cc: Corrected typo.
+
+2010-06-21 20:26 moshagen
+
+ * .: Ignore generated binary.
+
+2010-06-21 17:29 hardwick
+
+ * ., Makefile, hfst-ol.cc, hfst-ol.h, main.cc, ospell.cc, ospell.h,
+ test, test/french-symbols.txt, test/french.hfst,
+ test/french.hfst.ol, test/test.hfst.ol, test/test.txt,
+ test/test2.hfst.ol, test/test2.txt: Initial commit of
+ hfst-ospell.
+ Basic functionality including OTHER symbol (@?@) and runtime
+ alphabet
+ translation is present; weighted transducers (probably to be the
+ only option)
+ and flag diacritic states for the mutator and lexicon
+ forthcoming.
+
diff --git a/Doxyfile b/Doxyfile
new file mode 100644
index 0000000..898fdda
--- /dev/null
+++ b/Doxyfile
@@ -0,0 +1,1902 @@
+# Doxyfile 1.8.3.1
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+# TAG = value [value, ...]
+# For lists items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or sequence of words) that should
+# identify the project. Note that if you do not use Doxywizard you need
+# to put quotes around the project name if it contains spaces.
+
+PROJECT_NAME = "HFST ospell"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give the viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF = "Weighted Finite-State Spell-Checking"
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO = edit2-small.png
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = doc
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF = "The $name class" \
+ "The $name widget" \
+ "The $name file" \
+ is \
+ provides \
+ specifies \
+ contains \
+ represents \
+ a \
+ an \
+ the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip. Note that you specify absolute paths here, but also
+# relative paths, which will be relative from the directory where doxygen is
+# started.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 4
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding
+# "class=itcl::class" will allow you to use the command class in the
+# itcl::class meaning.
+
+TCL_SUBST =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension,
+# and language is one of the parsers supported by doxygen: IDL, Java,
+# Javascript, CSharp, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL, C,
+# C++. For instance to make doxygen treat .inc files as Fortran files (default
+# is PHP), and .f files as C (default is Fortran), use: inc=Fortran f=C. Note
+# that for custom extensions you also need to set FILE_PATTERNS otherwise the
+# files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all
+# comments according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you
+# can mix doxygen, HTML, and XML commands with Markdown formatting.
+# Disable only in case of backward compatibility issues.
+
+MARKDOWN_SUPPORT = YES
+
+# When enabled doxygen tries to link words that correspond to documented classes,
+# or namespaces to their corresponding documentation. Such a link can be
+# prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+
+AUTOLINK_SUPPORT = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES (the
+# default) will make doxygen replace the get and set methods by a property in
+# the documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and
+# unions with only public data fields will be shown inline in the documentation
+# of the scope in which they are defined (i.e. file, namespace, or group
+# documentation), provided this scope is documented. If set to NO (the default),
+# structs, classes, and unions are shown on a separate page (for HTML and Man
+# pages) or section (for LaTeX and RTF).
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT = NO
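+
+# As a sketch of the pattern described above (hypothetical C, not code from
+# this project):
+# typedef struct TypeS { int value; } TypeT;
+# With TYPEDEF_HIDES_STRUCT = YES this is documented as a struct named TypeT;
+# with NO, TypeT appears as a member and the struct is named TypeS.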
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects too small a cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+SYMBOL_CACHE_SIZE = 0
+
+# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
+# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
+# their name and scope. Since this can be an expensive process and often the
+# same symbol appears multiple times in the code, doxygen keeps a cache of
+# pre-resolved symbols. If the cache is too small doxygen will become slower.
+# If the cache is too large, memory is wasted. The cache size is given by this
+# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+LOOKUP_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
+# scope will be included in the documentation.
+
+EXTRACT_PACKAGE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface, are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to
+# do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even
+# if there is only one candidate or it is obvious which candidate to choose
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if section-label ... \endif
+# and \cond section-label ... \endcond blocks.
+
+ENABLED_SECTIONS =
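+
+# A hypothetical example: documentation between \if internal_docs and \endif
+# would only be included with
+# ENABLED_SECTIONS = internal_docs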
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page. This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
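+
+# A hypothetical example for a git checkout (doxygen appends the input file
+# name after the command):
+# FILE_VERSION_FILTER = "git log -1 --format=%h --"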
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files
+# containing the references data. This must be a list of .bib files. The
+# .bib extension is automatically appended if omitted. Using this command
+# requires the bibtex tool to be installed. See also
+# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
+# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this
+# feature you need bibtex and perl available in the search path. Do not use
+# file names with spaces, as bibtex cannot handle them.
+
+CITE_BIB_FILES =
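+
+# For example, with a hypothetical bibliography file refs.bib in the source
+# tree:
+# CITE_BIB_FILES = refs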
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = /home/flammie/Koodit/hfst-ospell
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS = *.c \
+ *.cc \
+ *.cxx \
+ *.cpp \
+ *.c++ \
+ *.d \
+ *.java \
+ *.ii \
+ *.ixx \
+ *.ipp \
+ *.i++ \
+ *.inl \
+ *.h \
+ *.hh \
+ *.hxx \
+ *.hpp \
+ *.h++ \
+ *.idl \
+ *.odl \
+ *.cs \
+ *.php \
+ *.php3 \
+ *.inc \
+ *.m \
+ *.markdown \
+ *.md \
+ *.mm \
+ *.dox \
+ *.py \
+ *.f90 \
+ *.f \
+ *.for \
+ *.vhd \
+ *.vhdl
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
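+
+# For instance, a hypothetical pattern excluding everything under test
+# directories (patterns are matched against absolute paths):
+# EXCLUDE_PATTERNS = */test/*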
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output. If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
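+
+# A hypothetical per-pattern filter, where my_py_filter is a made-up program
+# on the search path:
+# FILTER_PATTERNS = *.py=my_py_filter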
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance
+# GitHub and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER = YES
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C, C++ and Fortran comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code. Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
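+
+# A hypothetical example: if all classes were prefixed with Hfst, the index
+# headers could ignore that prefix with
+# IGNORE_PREFIX = Hfst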
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+# for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If left blank doxygen will
+# generate a default style sheet. Note that it is recommended to use
+# HTML_EXTRA_STYLESHEET instead of this one, as it is more robust and this
+# tag will in the future become obsolete.
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional
+# user-defined cascading style sheet that is included after the standard
+# style sheets created by doxygen. Using this option one can overrule
+# certain style aspects. This is preferred over using HTML_STYLESHEET
+# since it does not replace the standard style sheet and is therefore more
+# robust against future updates. Doxygen will copy the style sheet file to
+# the output directory.
+
+HTML_EXTRA_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the style sheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, the value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of
+# entries shown in the various tree structured indices initially; the user
+# can expand and collapse entries dynamically later on. Doxygen will expand
+# the tree to such a level that at most the specified number of entries are
+# visible (unless a fully collapsed tree already exceeds this amount).
+# So setting the number of entries to 1 will produce a fully collapsed tree
+# by default. 0 is a special value representing an infinite number of entries
+# and will result in a fully expanded tree by default.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely
+# identify the documentation publisher. This should be a reverse domain-name
+# style string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls whether a separate .chi index file is generated (YES) or whether
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs)
+# at top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it. Since the tabs have the same information as the
+# navigation tree you can set this option to YES if you already set
+# GENERATE_TREEVIEW to YES.
+
+DISABLE_INDEX = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+# Since the tree basically has the same information as the tab index you
+# could consider setting DISABLE_INDEX to YES when enabling this option.
+
+GENERATE_TREEVIEW = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want the formulas to look prettier in the HTML
+# output. When enabled you may also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. Supported types are HTML-CSS, NativeMML (i.e. MathML) and
+# SVG. The default value is HTML-CSS, which is slower, but has the best
+# compatibility.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to
+# the MathJax Content Delivery Network so you can quickly see the result without
+# installing MathJax. However, it is strongly recommended to install a local
+# copy of MathJax from http://www.mathjax.org before deployment.
+
+MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension
+# names that should be enabled during MathJax rendering.
+
+MATHJAX_EXTENSIONS =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow; in that case enabling SERVER_BASED_SEARCH may provide a better
+# solution.
+
+SEARCHENGINE = NO
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript.
+# There are two flavours of web server based search depending on the
+# EXTERNAL_SEARCH setting. When disabled, doxygen will generate a PHP script for
+# searching and an index file used by the script. When EXTERNAL_SEARCH is
+# enabled the indexing and searching needs to be provided by external tools.
+# See the manual for details.
+
+SERVER_BASED_SEARCH = NO
+
+# When EXTERNAL_SEARCH is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain
+# the search results. Doxygen ships with an example indexer (doxyindexer) and
+# search engine (doxysearch.cgi) which are based on the open source search engine
+# library Xapian. See the manual for configuration details.
+
+EXTERNAL_SEARCH = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+# Doxygen ships with an example search engine (doxysearch) which is based on
+# the open source search engine library Xapian. See the manual for configuration
+# details.
+
+SEARCHENGINE_URL =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+
+SEARCHDATA_FILE = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+
+EXTERNAL_SEARCH_ID =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id
+# of the project to a relative location where the documentation can be found.
+# The format is: EXTRA_SEARCH_MAPPINGS = id1=loc1 id2=loc2 ...
+
+EXTRA_SEARCH_MAPPINGS =
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See
+# http://en.wikipedia.org/wiki/BibTeX for more info.
+
+LATEX_BIB_STYLE = plain
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load style sheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN = YES
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader. This is useful
+# if you want to understand what is going on. On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files
+# pointed to by INCLUDE_PATH will be searched when a #include is found.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED =
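+
+# A hypothetical example defining one macro with and one without a value:
+# PREDEFINED = HAVE_CONFIG_H DEBUG_LEVEL=2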
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles. For each
+# tag file the location of the external documentation should be added. The
+# format of a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths
+# or URLs. Note that each tag file must have a unique name (where the name does
+# NOT include the path). If a tag file is not located in the directory in which
+# doxygen is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default).
+
+HAVE_DOT = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS = 0
+
+# By default doxygen will use the Helvetica font for all dot files that
+# doxygen generates. When you want a differently looking font you can specify
+# the font name using DOT_FONTNAME. You need to make sure dot is able to find
+# the font, which can be done by putting it in a standard location or by setting
+# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
+# directory containing the font.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the Helvetica font.
+# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to
+# set the path where dot can find it.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside
+# the class node. If there are many fields or methods and many nodes the
+# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS
+# threshold limits the number of items for each type to make the size more
+# managable. Set this to 0 for no limit. Note that the threshold may be
+# exceeded by 50% before the limit is enforced.
+
+UML_LIMIT_NUM_FIELDS = 10
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used. If you choose svg you need to set
+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible in IE 9+ (other browsers do not have this requirement).
+
+DOT_IMAGE_FORMAT = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+# Note that this requires a modern browser other than Internet Explorer.
+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you
+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible. Older versions of IE do not have SVG support.
+
+INTERACTIVE_SVG = NO
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the number
+# of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES, the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..5458714
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,234 @@
+Installation Instructions
+*************************
+
+Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005,
+2006 Free Software Foundation, Inc.
+
+This file is free documentation; the Free Software Foundation gives
+unlimited permission to copy, distribute and modify it.
+
+Basic Installation
+==================
+
+Briefly, the shell commands `./configure; make; make install' should
+configure, build, and install this package. The following
+more-detailed instructions are generic; see the `README' file for
+instructions specific to this package.
+
+ The `configure' shell script attempts to guess correct values for
+various system-dependent variables used during compilation. It uses
+those values to create a `Makefile' in each directory of the package.
+It may also create one or more `.h' files containing system-dependent
+definitions. Finally, it creates a shell script `config.status' that
+you can run in the future to recreate the current configuration, and a
+file `config.log' containing compiler output (useful mainly for
+debugging `configure').
+
+ It can also use an optional file (typically called `config.cache'
+and enabled with `--cache-file=config.cache' or simply `-C') that saves
+the results of its tests to speed up reconfiguring. Caching is
+disabled by default to prevent problems with accidental use of stale
+cache files.
+
+ If you need to do unusual things to compile the package, please try
+to figure out how `configure' could check whether to do them, and mail
+diffs or instructions to the address given in the `README' so they can
+be considered for the next release. If you are using the cache, and at
+some point `config.cache' contains results you don't want to keep, you
+may remove or edit it.
+
+ The file `configure.ac' (or `configure.in') is used to create
+`configure' by a program called `autoconf'. You need `configure.ac' if
+you want to change it or regenerate `configure' using a newer version
+of `autoconf'.
+
+The simplest way to compile this package is:
+
+ 1. `cd' to the directory containing the package's source code and type
+ `./configure' to configure the package for your system.
+
+ Running `configure' might take a while. While running, it prints
+ some messages telling which features it is checking for.
+
+ 2. Type `make' to compile the package.
+
+ 3. Optionally, type `make check' to run any self-tests that come with
+ the package.
+
+ 4. Type `make install' to install the programs and any data files and
+ documentation.
+
+ 5. You can remove the program binaries and object files from the
+ source code directory by typing `make clean'. To also remove the
+ files that `configure' created (so you can compile the package for
+ a different kind of computer), type `make distclean'. There is
+ also a `make maintainer-clean' target, but that is intended mainly
+ for the package's developers. If you use it, you may have to get
+ all sorts of other programs in order to regenerate files that came
+ with the distribution.
+
+Compilers and Options
+=====================
+
+Some systems require unusual options for compilation or linking that the
+`configure' script does not know about. Run `./configure --help' for
+details on some of the pertinent environment variables.
+
+ You can give `configure' initial values for configuration parameters
+by setting variables in the command line or in the environment. Here
+is an example:
+
+ ./configure CC=c99 CFLAGS=-g LIBS=-lposix
+
+ *Note Defining Variables::, for more details.
+
+Compiling For Multiple Architectures
+====================================
+
+You can compile the package for more than one kind of computer at the
+same time, by placing the object files for each architecture in their
+own directory. To do this, you can use GNU `make'. `cd' to the
+directory where you want the object files and executables to go and run
+the `configure' script. `configure' automatically checks for the
+source code in the directory that `configure' is in and in `..'.
+
+ With a non-GNU `make', it is safer to compile the package for one
+architecture at a time in the source code directory. After you have
+installed the package for one architecture, use `make distclean' before
+reconfiguring for another architecture.
+
+Installation Names
+==================
+
+By default, `make install' installs the package's commands under
+`/usr/local/bin', include files under `/usr/local/include', etc. You
+can specify an installation prefix other than `/usr/local' by giving
+`configure' the option `--prefix=PREFIX'.
+
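+   For example, to install everything under a user's home directory (any
+other writable prefix works the same way), run:
+
+     ./configure --prefix=$HOME/local
+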
+ You can specify separate installation prefixes for
+architecture-specific files and architecture-independent files. If you
+pass the option `--exec-prefix=PREFIX' to `configure', the package uses
+PREFIX as the prefix for installing programs and libraries.
+Documentation and other data files still use the regular prefix.
+
+ In addition, if you use an unusual directory layout you can give
+options like `--bindir=DIR' to specify different values for particular
+kinds of files. Run `configure --help' for a list of the directories
+you can set and what kinds of files go in them.
+
+ If the package supports it, you can cause programs to be installed
+with an extra prefix or suffix on their names by giving `configure' the
+option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
+
+Optional Features
+=================
+
+Some packages pay attention to `--enable-FEATURE' options to
+`configure', where FEATURE indicates an optional part of the package.
+They may also pay attention to `--with-PACKAGE' options, where PACKAGE
+is something like `gnu-as' or `x' (for the X Window System). The
+`README' should mention any `--enable-' and `--with-' options that the
+package recognizes.
+
+ For packages that use the X Window System, `configure' can usually
+find the X include and library files automatically, but if it doesn't,
+you can use the `configure' options `--x-includes=DIR' and
+`--x-libraries=DIR' to specify their locations.
+
+Specifying the System Type
+==========================
+
+There may be some features `configure' cannot figure out automatically,
+but needs to determine by the type of machine the package will run on.
+Usually, assuming the package is built to be run on the _same_
+architectures, `configure' can figure that out, but if it prints a
+message saying it cannot guess the machine type, give it the
+`--build=TYPE' option. TYPE can either be a short name for the system
+type, such as `sun4', or a canonical name which has the form:
+
+ CPU-COMPANY-SYSTEM
+
+where SYSTEM can have one of these forms:
+
+ OS KERNEL-OS
+
+ See the file `config.sub' for the possible values of each field. If
+`config.sub' isn't included in this package, then this package doesn't
+need to know the machine type.
+
+ If you are _building_ compiler tools for cross-compiling, you should
+use the option `--target=TYPE' to select the type of system they will
+produce code for.
+
+ If you want to _use_ a cross compiler, that generates code for a
+platform different from the build platform, you should specify the
+"host" platform (i.e., that on which the generated programs will
+eventually be run) with `--host=TYPE'.
+
+Sharing Defaults
+================
+
+If you want to set default values for `configure' scripts to share, you
+can create a site shell script called `config.site' that gives default
+values for variables like `CC', `cache_file', and `prefix'.
+`configure' looks for `PREFIX/share/config.site' if it exists, then
+`PREFIX/etc/config.site' if it exists. Or, you can set the
+`CONFIG_SITE' environment variable to the location of the site script.
+A warning: not all `configure' scripts look for a site script.
+
+Defining Variables
+==================
+
+Variables not defined in a site shell script can be set in the
+environment passed to `configure'. However, some packages may run
+configure again during the build, and the customized values of these
+variables may be lost. In order to avoid this problem, you should set
+them in the `configure' command line, using `VAR=value'. For example:
+
+ ./configure CC=/usr/local2/bin/gcc
+
+causes the specified `gcc' to be used as the C compiler (unless it is
+overridden in the site shell script).
+
+Unfortunately, this technique does not work for `CONFIG_SHELL' due to
+an Autoconf bug. Until the bug is fixed you can use this workaround:
+
+ CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
+
+`configure' Invocation
+======================
+
+`configure' recognizes the following options to control how it operates.
+
+`--help'
+`-h'
+ Print a summary of the options to `configure', and exit.
+
+`--version'
+`-V'
+ Print the version of Autoconf used to generate the `configure'
+ script, and exit.
+
+`--cache-file=FILE'
+ Enable the cache: use and save the results of the tests in FILE,
+ traditionally `config.cache'. FILE defaults to `/dev/null' to
+ disable caching.
+
+`--config-cache'
+`-C'
+ Alias for `--cache-file=config.cache'.
+
+`--quiet'
+`--silent'
+`-q'
+ Do not print messages saying which checks are being made. To
+ suppress all normal output, redirect it to `/dev/null' (any error
+ messages will still be shown).
+
+`--srcdir=DIR'
+ Look for the package's source code in directory DIR. Usually
+ `configure' can determine that directory automatically.
+
+`configure' also accepts some other, not widely useful, options. Run
+`configure --help' for more details.
+
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..129196c
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,215 @@
+## Process this file with automake to produce Makefile.in
+
+# Copyright 2010 University of Helsinki
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# to silence:
+# libtoolize: Consider adding `-I m4' to ACLOCAL_AMFLAGS in Makefile.am.
+ACLOCAL_AMFLAGS=-I m4
+
+# targets
+if EXTRA_DEMOS
+CONFERENCE_DEMOS=hfst-ospell-norvig hfst-ospell-fsmnlp-2012 hfst-ospell-cicling\
+ hfst-ospell-survey hfst-ospell-lrec2013 hfst-ispell
+endif # EXTRA_DEMOS
+
+if HFST_OSPELL_OFFICE
+MAYBE_HFST_OSPELL_OFFICE=hfst-ospell-office
+endif # HFST_OSPELL_OFFICE
+
+bin_PROGRAMS=hfst-ospell $(MAYBE_HFST_OSPELL_OFFICE) $(CONFERENCE_DEMOS)
+lib_LTLIBRARIES=libhfstospell.la
+man1_MANS=hfst-ospell.1 hfst-ospell-office.1
+
+PKG_LIBS=
+PKG_CXXFLAGS=
+
+if WANT_ARCHIVE
+PKG_LIBS+=$(LIBARCHIVE_LIBS)
+PKG_CXXFLAGS+=$(LIBARCHIVE_CFLAGS)
+endif
+
+if WANT_LIBXMLPP
+PKG_LIBS+=$(LIBXMLPP_LIBS)
+PKG_CXXFLAGS+=$(LIBXMLPP_CFLAGS)
+endif
+
+if WANT_TINYXML2
+PKG_LIBS+=$(TINYXML2_LIBS)
+PKG_CXXFLAGS+=$(TINYXML2_CFLAGS)
+endif
+
+# library parts
+libhfstospell_la_SOURCES=hfst-ol.cc ospell.cc \
+ ZHfstOspeller.cc ZHfstOspellerXmlMetadata.cc
+libhfstospell_la_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) $(PKG_CXXFLAGS)
+libhfstospell_la_LDFLAGS=-no-undefined -version-info 4:0:0 \
+ $(PKG_LIBS)
+
+# link sample program against library here
+hfst_ospell_SOURCES=main.cc
+hfst_ospell_LDADD=libhfstospell.la
+hfst_ospell_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
+ $(PKG_CXXFLAGS)
+
+if HFST_OSPELL_OFFICE
+
+hfst_ospell_office_SOURCES=office.cpp
+hfst_ospell_office_LDADD=libhfstospell.la
+hfst_ospell_office_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) $(PKG_CXXFLAGS)
+
+endif # HFST_OSPELL_OFFICE
+
+if EXTRA_DEMOS
+
+hfst_ospell_norvig_SOURCES=main-norvig.cc
+hfst_ospell_norvig_LDADD=libhfstospell.la
+hfst_ospell_norvig_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
+ $(PKG_CXXFLAGS)
+
+hfst_ospell_cicling_SOURCES=main-cicling.cc
+hfst_ospell_cicling_LDADD=libhfstospell.la
+hfst_ospell_cicling_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
+ $(PKG_CXXFLAGS)
+
+hfst_ospell_lrec2013_SOURCES=main-lrec2013.cc
+hfst_ospell_lrec2013_LDADD=libhfstospell.la
+hfst_ospell_lrec2013_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
+ $(PKG_CXXFLAGS)
+
+hfst_ospell_survey_SOURCES=main-survey.cc
+hfst_ospell_survey_LDADD=libhfstospell.la
+hfst_ospell_survey_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
+ $(PKG_CXXFLAGS)
+
+hfst_ospell_fsmnlp_2012_SOURCES=main-fsmnlp-2012.cc
+hfst_ospell_fsmnlp_2012_LDADD=libhfstospell.la
+hfst_ospell_fsmnlp_2012_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
+ $(PKG_CXXFLAGS)
+endif # EXTRA_DEMOS
+
+if EXTRA_DEMOS
+
+hfst_ispell_SOURCES=main-ispell.cc
+hfst_ispell_LDADD=libhfstospell.la
+hfst_ispell_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
+ $(PKG_CXXFLAGS)
+
+endif # EXTRA_DEMOS
+
+# install headers for library in hfst's includedir
+include_HEADERS=hfst-ol.h ospell.h ol-exceptions.h \
+ ZHfstOspeller.h ZHfstOspellerXmlMetadata.h
+
+# pkgconfig
+pkgconfigdir=$(libdir)/pkgconfig
+pkgconfig_DATA=hfstospell.pc
+
+# tests
+if CAN_TEST
+TXTS=acceptor.basic.txt analyser.default.txt \
+ errmodel.basic.txt errmodel.extrachars.txt errmodel.edit1.txt
+check_DATA=speller_basic.zhfst empty_descriptions.zhfst \
+ empty_titles.zhfst empty_locale.zhfst \
+ trailing_spaces.zhfst \
+ acceptor.basic.hfst errmodel.basic.hfst \
+ errmodel.extrachars.hfst bad_errormodel.zhfst \
+ speller_analyser.zhfst no_errormodel.zhfst \
+ speller_edit1.zhfst
+# Actual test scripts:
+TESTS=basic-zhfst.sh basic-edit1.sh \
+ empty-descriptions.sh empty-titles.sh empty-locale.sh \
+ trailing-spaces.sh bad-errormodel.sh empty-zhfst.sh \
+ analyse-spell.sh no-errormodel.sh
+XFAIL_TESTS=empty-descriptions.sh empty-titles.sh empty-locale.sh empty-zhfst.sh
+EXTRA_DIST=$(TXTS) $(TESTS) $(man1_MANS)\
+ basic_test.xml empty_descriptions.xml empty_titles.xml \
+ empty_locale.xml trailing_spaces.xml no_errmodel.xml \
+ test.strings
+
+clean-local:
+ -rm -rf $(check_DATA) index.xml
+endif # CAN_TEST
+
+# N.B. Do not parallel test, race condition exists
+empty_descriptions.zhfst: acceptor.basic.hfst errmodel.basic.hfst empty_descriptions.xml
+ cp -f $(srcdir)/empty_descriptions.xml index.xml
+ cp -f acceptor.basic.hfst acceptor.default.hfst
+ cp -f errmodel.basic.hfst errmodel.default.hfst
+ $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml
+ -rm -f index.xml acceptor.default.hfst errmodel.default.hfst
+
+empty_titles.zhfst: acceptor.basic.hfst errmodel.basic.hfst empty_titles.xml
+ cp -f $(srcdir)/empty_titles.xml index.xml
+ cp -f acceptor.basic.hfst acceptor.default.hfst
+ cp -f errmodel.basic.hfst errmodel.default.hfst
+ $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml
+ -rm -f index.xml acceptor.default.hfst errmodel.default.hfst
+
+empty_locale.zhfst: acceptor.basic.hfst errmodel.basic.hfst empty_locale.xml
+ cp -f $(srcdir)/empty_locale.xml index.xml
+ cp -f acceptor.basic.hfst acceptor.default.hfst
+ cp -f errmodel.basic.hfst errmodel.default.hfst
+ $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml
+ -rm -f index.xml acceptor.default.hfst errmodel.default.hfst
+
+trailing_spaces.zhfst: acceptor.basic.hfst errmodel.basic.hfst trailing_spaces.xml
+ cp -f $(srcdir)/trailing_spaces.xml index.xml
+ cp -f acceptor.basic.hfst acceptor.default.hfst
+ cp -f errmodel.basic.hfst errmodel.default.hfst
+ $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml
+ -rm -f index.xml acceptor.default.hfst errmodel.default.hfst
+
+speller_edit1.zhfst: acceptor.basic.hfst errmodel.edit1.hfst basic_test.xml
+ cp $(srcdir)/basic_test.xml index.xml
+ cp -f acceptor.basic.hfst acceptor.default.hfst
+ cp -f errmodel.edit1.hfst errmodel.default.hfst
+ $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml
+ -rm -f index.xml acceptor.default.hfst errmodel.default.hfst
+
+speller_basic.zhfst: acceptor.basic.hfst errmodel.basic.hfst basic_test.xml
+ cp $(srcdir)/basic_test.xml index.xml
+ cp -f acceptor.basic.hfst acceptor.default.hfst
+ cp -f errmodel.basic.hfst errmodel.default.hfst
+ $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml
+ -rm -f index.xml acceptor.default.hfst errmodel.default.hfst
+
+speller_analyser.zhfst: analyser.default.hfst errmodel.edit1.hfst basic_test.xml
+ cp -f $(srcdir)/basic_test.xml index.xml
+ cp -f analyser.default.hfst acceptor.default.hfst
+ cp -f errmodel.edit1.hfst errmodel.default.hfst
+ $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml
+ -rm -f index.xml acceptor.default.hfst errmodel.default.hfst
+
+bad_errormodel.zhfst: acceptor.basic.hfst errmodel.extrachars.hfst basic_test.xml
+ cp -f $(srcdir)/basic_test.xml index.xml
+ cp -f acceptor.basic.hfst acceptor.default.hfst
+ cp -f errmodel.extrachars.hfst errmodel.default.hfst
+ $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml
+ -rm -f index.xml acceptor.default.hfst errmodel.default.hfst
+
+no_errormodel.zhfst: acceptor.basic.hfst no_errmodel.xml
+ cp -f $(srcdir)/no_errmodel.xml index.xml
+ cp -f acceptor.basic.hfst acceptor.default.hfst
+ $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst index.xml
+ -rm -f index.xml acceptor.default.hfst
+
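+# Suffix rule: compile an AT&T-style text description of an automaton into
+# a weighted optimized-lookup (olw) transducer for the test spellers.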
+.txt.hfst:
+ hfst-txt2fst -e "@0@" $(HFST_FLAGS) $< | hfst-fst2fst $(HFST_FLAGS) -f olw -o $@
+
+if CAN_DOXYGEN
+doxygen:
+ $(DOXYGEN)
+endif
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..f1f7af0
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,70 @@
+NEWS for hfst-ospell
+====================
+
+This file contains all noteworthy changes in HFST-ospell development between
+releases. For a full listing of changes, see the ChangeLog.
+
+Noteworthy changes in 0.3.0
+---------------------------
+
+* New API for analysing and suggesting
+* Moved code from headers to implementation files (API change)
+* Added Doxygen to mark stable API
+* Fixes for bad and malformed metadata handling
+* Limiting number of suggestions now works
+
+Noteworthy changes in 0.2.5
+---------------------------
+
+* optional support for tinyxml2
+* preliminary support for two-tape automata and *analysis* lookup
+* conference demos are no longer built by default
+* libarchive newer than 3 allowed
+
+Noteworthy changes in 0.2.4
+---------------------------
+
+* renamed the package to hfstospell (from hfst-ospell); the previous rename
+  caused build issues.
+
+Noteworthy changes in 0.2.3
+---------------------------
+
+* fixed a bug that caused certain types of paths with flag diacritics not to
+ be accepted.
+
+Noteworthy changes in 0.2.2
+---------------------------
+
+* Memory and speed improvements; data structures for automaton changed
+
+* Tests and bug fixes for building
+
+Noteworthy changes in 0.2.1
+---------------------------
+
+* Added support for extracting zipped transducer collections to memory instead
+ of temporary files
+
+* Changed from libxml to libxml++ for XML parsing
+
+Noteworthy changes in 0.2.0
+---------------------------
+
+* Added support for zipped XML based transducer collection format.
+
+* Few new frontends for various experiments
+
+* Lots of metadata everywhere
+
+Noteworthy changes in 0.1.1
+---------------------------
+
+* Added autoconfiscation to avoid bugs like missing Makefile in tarball
+
+Noteworthy changes in 0.1
+-------------------------
+
+* First release
+
+
diff --git a/README b/README
new file mode 100644
index 0000000..b4a48c3
--- /dev/null
+++ b/README
@@ -0,0 +1,101 @@
+.. -*- mode: rst -*-
+================================================
+ Hfst-ospell library and toy commandline tester
+================================================
+
+This is a minimal spell-checker library based on the HFST optimized-lookup
+format, together with a demonstration command-line spell checker. The
+library is licensed under the Apache License version 2.0; other licences can
+be obtained from the University of Helsinki.
+
+
+Dependencies
+============
+
+ - libxml++2
+ - libarchive
+
+Debian packages for dependencies
+--------------------------------
+
+ - libxml++2-dev
+ - libarchive-dev
+
+Usage
+=====
+
+Usage in external programs::
+
+ #include <ospell.h>
+
+and compile your project with::
+
+ $(pkg-config --cflags hfstospell)
+
+and link with::
+
+ $(pkg-config --libs hfstospell)
+
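+For example, a small test program could be built in one step like this
+(assuming g++ and an installed hfstospell.pc; myspell.cc is a placeholder
+name)::
+
+    g++ -o myspell myspell.cc $(pkg-config --cflags --libs hfstospell)
+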
+Programming examples
+--------------------
+
+The library lives in a namespace called hfst_ol. Pass (weighted!) Transducer
+pointers to the Speller constructor, e.g.::
+
+ FILE * error_source = fopen(error_filename, "r");
+ FILE * lexicon_file = fopen(lexicon_filename, "r");
+ hfst_ol::Transducer * error;
+ hfst_ol::Transducer * lexicon;
+ try {
+ error = new hfst_ol::Transducer(error_source);
+ lexicon = new hfst_ol::Transducer(lexicon_file);
+ } catch (hfst_ol::TransducerParsingException& e) {
+ /* problem with transducer file, usually completely
+ different type of file - there's no magic number
+ in the header to check for this */
+ }
+ hfst_ol::Speller * speller;
+ try {
+ speller = new hfst_ol::Speller(error, lexicon);
+ } catch (hfst_ol::AlphabetTranslationException& e) {
+ /* problem with translating between the two alphabets */
+ }
+
+
+And use the functions::
+
+ // returns true if line is found in lexicon
+ bool hfst_ol::Speller::check(char * line);
+
+ // CorrectionQueue is a priority queue, sorted by weight
+ hfst_ol::CorrectionQueue hfst_ol::Speller::correct(char * line);
+
+
+to communicate with it. See main.cc for a concrete usage example.
+
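+For example, a minimal correction loop over the returned queue could look
+like this (a sketch assuming the speller built above; each entry is a
+(suggestion, weight) pair, best suggestion first)::
+
+    hfst_ol::CorrectionQueue corrections = speller->correct(line);
+    while (corrections.size() > 0) {
+        // print each suggested form together with its weight
+        printf("%s\t%f\n",
+               corrections.top().first.c_str(),
+               corrections.top().second);
+        corrections.pop();
+    }
+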
+Command-line tool
+-----------------
+
+The file main.cc provides a demo utility with the following help message::
+
+ Usage: hfst-ospell [OPTIONS] ERRORSOURCE LEXICON
+ Run a composition of ERRORSOURCE and LEXICON on standard input and
+ print corrected output
+
+ -h, --help Print this help message
+ -V, --version Print version information
+ -v, --verbose Be verbose
+ -q, --quiet Don't be verbose (default)
+ -s, --silent Same as quiet
+
+
+ Report bugs to hfst-bugs at ling.helsinki.fi
+
+Use in real-world applications
+------------------------------
+
+The HFST-based spellers can be used in real applications with the help of
+`voikko <http://voikko.sf.net>`_. Voikko in turn can be used with Enchant,
+LibreOffice, and Firefox.
+
+.. vim: set ft=rst:
diff --git a/ZHfstOspeller.cc b/ZHfstOspeller.cc
new file mode 100644
index 0000000..6f6386a
--- /dev/null
+++ b/ZHfstOspeller.cc
@@ -0,0 +1,472 @@
+/* -*- Mode: C++ -*- */
+// Copyright 2010 University of Helsinki
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+// C
+#if HAVE_LIBARCHIVE
+# include <archive.h>
+# include <archive_entry.h>
+#endif
+// C++
+#if HAVE_LIBXML
+# include <libxml++/libxml++.h>
+#endif
+#include <string>
+#include <map>
+
+using std::string;
+using std::map;
+
+// local
+#include "ospell.h"
+#include "hfst-ol.h"
+#include "ZHfstOspeller.h"
+
+namespace hfst_ol
+ {
+
+#if HAVE_LIBARCHIVE
+#if ZHFST_EXTRACT_TO_MEM
+static
+char*
+extract_to_mem(archive* ar, archive_entry* entry, size_t* n)
+ {
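+ // Read the whole archive entry into a freshly allocated buffer;
+ // the caller owns the returned buffer and receives its length via *n.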
+ size_t full_length = 0;
+ const struct stat* st = archive_entry_stat(entry);
+ size_t buffsize = st->st_size;
+ char * buff = new char[buffsize];
+ for (;;)
+ {
+ ssize_t curr = archive_read_data(ar, buff + full_length, buffsize - full_length);
+ if (0 == curr)
+ {
+ break;
+ }
+ else if (ARCHIVE_RETRY == curr)
+ {
+ continue;
+ }
+ else if (ARCHIVE_FAILED == curr)
+ {
+ throw ZHfstZipReadingError("Archive broken (ARCHIVE_FAILED)");
+ }
+ else if (curr < 0)
+ {
+ throw ZHfstZipReadingError("Archive broken...");
+ }
+ else
+ {
+ full_length += curr;
+ }
+ }
+ *n = full_length;
+ return buff;
+ }
+#endif
+
+#if ZHFST_EXTRACT_TO_TMPDIR
+static
+char*
+extract_to_tmp_dir(archive* ar)
+ {
+ char* rv = strdup("/tmp/zhfstospellXXXXXXXX");
+ int temp_fd = mkstemp(rv);
+ int rr = archive_read_data_into_fd(ar, temp_fd);
+ if ((rr != ARCHIVE_EOF) && (rr != ARCHIVE_OK))
+ {
+ throw ZHfstZipReadingError("Archive not EOF'd or OK'd");
+ }
+ close(temp_fd);
+ return rv;
+ }
+#endif
+
+#endif // HAVE_LIBARCHIVE
+
+ZHfstOspeller::ZHfstOspeller() :
+ suggestions_maximum_(0),
+ maximum_weight_(-1.0),
+ beam_(-1.0),
+ time_cutoff_(0.0),
+ can_spell_(false),
+ can_correct_(false),
+ can_analyse_(true),
+ can_hyphenate_(false),
+ current_speller_(0),
+ current_sugger_(0),
+ current_analyser_(0),
+ current_hyphenator_(0)
+ {
+ }
+
+ZHfstOspeller::~ZHfstOspeller()
+ {
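+ // current_speller_ and current_sugger_ may alias the same object;
+ // take care to delete it only once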
+ if ((current_speller_ != NULL) && (current_sugger_ != NULL))
+ {
+ if (current_speller_ != current_sugger_)
+ {
+ delete current_speller_;
+ delete current_sugger_;
+ }
+ else
+ {
+ delete current_speller_;
+ }
+ current_sugger_ = 0;
+ current_speller_ = 0;
+ }
+ for (map<string, Transducer*>::iterator acceptor = acceptors_.begin();
+ acceptor != acceptors_.end();
+ ++acceptor)
+ {
+ delete acceptor->second;
+ }
+ for (map<string, Transducer*>::iterator errmodel = errmodels_.begin();
+ errmodel != errmodels_.end();
+ ++errmodel)
+ {
+ delete errmodel->second;
+ }
+ can_spell_ = false;
+ can_correct_ = false;
+ }
+
+void
+ZHfstOspeller::inject_speller(Speller * s)
+ {
+ current_speller_ = s;
+ current_sugger_ = s;
+ can_spell_ = true;
+ can_correct_ = true;
+ }
+
+void
+ZHfstOspeller::set_queue_limit(unsigned long limit)
+ {
+ suggestions_maximum_ = limit;
+ }
+
+void
+ZHfstOspeller::set_weight_limit(Weight limit)
+ {
+ maximum_weight_ = limit;
+ }
+
+void
+ZHfstOspeller::set_beam(Weight beam)
+ {
+ beam_ = beam;
+ }
+
+void
+ZHfstOspeller::set_time_cutoff(float time_cutoff)
+ {
+ time_cutoff_ = time_cutoff;
+ }
+
+bool
+ZHfstOspeller::spell(const string& wordform)
+ {
+ if (can_spell_ && (current_speller_ != 0))
+ {
+ char* wf = strdup(wordform.c_str());
+ bool rv = current_speller_->check(wf);
+ free(wf);
+ return rv;
+ }
+ return false;
+ }
+
+CorrectionQueue
+ZHfstOspeller::suggest(const string& wordform)
+ {
+ CorrectionQueue rv;
+ if ((can_correct_) && (current_sugger_ != 0))
+ {
+ char* wf = strdup(wordform.c_str());
+ rv = current_sugger_->correct(wf,
+ suggestions_maximum_,
+ maximum_weight_,
+ beam_,
+ time_cutoff_);
+ free(wf);
+ return rv;
+ }
+ return rv;
+ }
+
+AnalysisQueue
+ZHfstOspeller::analyse(const string& wordform, bool ask_sugger)
+ {
+ AnalysisQueue rv;
+ char* wf = strdup(wordform.c_str());
+ if ((can_analyse_) && (!ask_sugger) && (current_speller_ != 0))
+ {
+ rv = current_speller_->analyse(wf);
+ }
+ else if ((can_analyse_) && (ask_sugger) && (current_sugger_ != 0))
+ {
+ rv = current_sugger_->analyse(wf);
+ }
+ free(wf);
+ return rv;
+ }
+
+AnalysisCorrectionQueue
+ZHfstOspeller::suggest_analyses(const string& wordform)
+ {
+ AnalysisCorrectionQueue rv;
+ // FIXME: should be atomic
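+ // combine each correction with each of its analyses into
+ // (suggestion, analysis) pairs, ordered by the analysis weight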
+ CorrectionQueue cq = suggest(wordform);
+ while (cq.size() > 0)
+ {
+ AnalysisQueue aq = analyse(cq.top().first, true);
+ while (aq.size() > 0)
+ {
+ StringPair sp(cq.top().first, aq.top().first);
+ StringPairWeightPair spwp(sp, aq.top().second);
+ rv.push(spwp);
+ aq.pop();
+ }
+ cq.pop();
+ }
+ return rv;
+ }
+
+void
+ZHfstOspeller::read_zhfst(const string& filename)
+ {
+#if HAVE_LIBARCHIVE
+ struct archive* ar = archive_read_new();
+ struct archive_entry* entry = 0;
+
+#if USE_LIBARCHIVE_2
+ archive_read_support_compression_all(ar);
+#else
+ archive_read_support_filter_all(ar);
+#endif // USE_LIBARCHIVE_2
+
+ archive_read_support_format_all(ar);
+ int rr = archive_read_open_filename(ar, filename.c_str(), 10240);
+ if (rr != ARCHIVE_OK)
+ {
+ throw ZHfstZipReadingError("Archive not OK");
+ }
+ for (int rr = archive_read_next_header(ar, &entry);
+ rr != ARCHIVE_EOF;
+ rr = archive_read_next_header(ar, &entry))
+ {
+ if (rr != ARCHIVE_OK)
+ {
+ throw ZHfstZipReadingError("Archive not OK");
+ }
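+ // Dispatch on the entry name: a zhfst archive may contain
+ // acceptor.*, errmodel.* and index.xml entries.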
+ char* filename = strdup(archive_entry_pathname(entry));
+ if (strncmp(filename, "acceptor.", strlen("acceptor.")) == 0)
+ {
+#if ZHFST_EXTRACT_TO_TMPDIR
+ char* temporary = extract_to_tmp_dir(ar);
+#elif ZHFST_EXTRACT_TO_MEM
+ size_t total_length = 0;
+ char* full_data = extract_to_mem(ar, entry, &total_length);
+#endif
+ char* p = filename;
+ p += strlen("acceptor.");
+ size_t descr_len = 0;
+ for (const char* q = p; *q != '\0'; q++)
+ {
+ if (*q == '.')
+ {
+ break;
+ }
+ else
+ {
+ descr_len++;
+ }
+ }
+ char* descr = hfst_strndup(p, descr_len);
+#if ZHFST_EXTRACT_TO_TMPDIR
+ FILE* f = fopen(temporary, "r");
+ if (f == NULL)
+ {
+ throw ZHfstTemporaryWritingError("reading acceptor back "
+ "from temp file");
+ }
+ Transducer* trans = new Transducer(f);
+#elif ZHFST_EXTRACT_TO_MEM
+ Transducer* trans = new Transducer(full_data);
+ delete[] full_data;
+#endif
+ acceptors_[descr] = trans;
+ free(descr);
+ }
+ else if (strncmp(filename, "errmodel.", strlen("errmodel.")) == 0)
+ {
+#if ZHFST_EXTRACT_TO_TMPDIR
+ char* temporary = extract_to_tmp_dir(ar);
+#elif ZHFST_EXTRACT_TO_MEM
+ size_t total_length = 0;
+ char* full_data = extract_to_mem(ar, entry, &total_length);
+#endif
+ const char* p = filename;
+ p += strlen("errmodel.");
+ size_t descr_len = 0;
+ for (const char* q = p; *q != '\0'; q++)
+ {
+ if (*q == '.')
+ {
+ break;
+ }
+ else
+ {
+ descr_len++;
+ }
+ }
+ char* descr = hfst_strndup(p, descr_len);
+#if ZHFST_EXTRACT_TO_TMPDIR
+ FILE* f = fopen(temporary, "r");
+ if (NULL == f)
+ {
+ throw ZHfstTemporaryWritingError("reading errmodel back "
+ "from temp file");
+ }
+ Transducer* trans = new Transducer(f);
+#elif ZHFST_EXTRACT_TO_MEM
+ Transducer* trans = new Transducer(full_data);
+ delete[] full_data;
+#endif
+ errmodels_[descr] = trans;
+ free(descr);
+ } // if acceptor or errmodel
+ else if (strcmp(filename, "index.xml") == 0)
+ {
+#if ZHFST_EXTRACT_TO_TMPDIR
+ char* temporary = extract_to_tmp_dir(ar);
+ metadata_.read_xml(temporary);
+#elif ZHFST_EXTRACT_TO_MEM
+ size_t xml_len = 0;
+ char* full_data = extract_to_mem(ar, entry, &xml_len);
+ metadata_.read_xml(full_data, xml_len);
+ delete[] full_data;
+#endif
+
+ }
+ else
+ {
+ fprintf(stderr, "Unknown file in archive %s\n", filename);
+ }
+ free(filename);
+ } // for each entry, until ARCHIVE_EOF
+ archive_read_close(ar);
+
+#if USE_LIBARCHIVE_2
+ archive_read_finish(ar);
+#else
+ archive_read_free(ar);
+#endif // USE_LIBARCHIVE_2
+
+ if ((errmodels_.find("default") != errmodels_.end()) &&
+ (acceptors_.find("default") != acceptors_.end()))
+ {
+ current_speller_ = new Speller(
+ errmodels_["default"],
+ acceptors_["default"]
+ );
+ current_sugger_ = current_speller_;
+ can_spell_ = true;
+ can_correct_ = true;
+ }
+ else if ((acceptors_.size() > 0) && (errmodels_.size() > 0))
+ {
+ fprintf(stderr, "Could not find default speller, using %s %s\n",
+ acceptors_.begin()->first.c_str(),
+ errmodels_.begin()->first.c_str());
+ current_speller_ = new Speller(
+ errmodels_.begin()->second,
+ acceptors_.begin()->second
+ );
+ current_sugger_ = current_speller_;
+ can_spell_ = true;
+ can_correct_ = true;
+ }
+ else if ((acceptors_.size() > 0) &&
+ (acceptors_.find("default") != acceptors_.end()))
+ {
+ current_speller_ = new Speller(0, acceptors_["default"]);
+ current_sugger_ = current_speller_;
+ can_spell_ = true;
+ can_correct_ = false;
+ }
+ else if (acceptors_.size() > 0)
+ {
+ current_speller_ = new Speller(0, acceptors_.begin()->second);
+ current_sugger_ = current_speller_;
+ can_spell_ = true;
+ can_correct_ = false;
+ }
+ else
+ {
+ throw ZHfstZipReadingError("No automata found in zip");
+ }
+ can_analyse_ = can_spell_ || can_correct_;
+#else
+ throw ZHfstZipReadingError("Zip support was disabled");
+#endif // HAVE_LIBARCHIVE
+ }
+
+
+const ZHfstOspellerXmlMetadata&
+ZHfstOspeller::get_metadata() const
+ {
+ return metadata_;
+ }
+
+string
+ZHfstOspeller::metadata_dump() const
+ {
+ return metadata_.debug_dump();
+
+ }
+
+ZHfstException::ZHfstException() :
+ what_("unknown")
+ {}
+ZHfstException::ZHfstException(const std::string& message) :
+ what_(message)
+ {}
+
+
+const char*
+ZHfstException::what()
+ {
+ return what_.c_str();
+ }
+
+
+ZHfstMetaDataParsingError::ZHfstMetaDataParsingError(const std::string& message):
+ ZHfstException(message)
+ {}
+ZHfstXmlParsingError::ZHfstXmlParsingError(const std::string& message):
+ ZHfstException(message)
+ {}
+ZHfstZipReadingError::ZHfstZipReadingError(const std::string& message):
+ ZHfstException(message)
+ {}
+ZHfstTemporaryWritingError::ZHfstTemporaryWritingError(const std::string& message):
+ ZHfstException(message)
+ {}
+
+} // namespace hfst_ol
+
+
diff --git a/ZHfstOspeller.h b/ZHfstOspeller.h
new file mode 100644
index 0000000..b5b6e67
--- /dev/null
+++ b/ZHfstOspeller.h
@@ -0,0 +1,204 @@
+/* -*- Mode: C++ -*- */
+// Copyright 2010 University of Helsinki
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! @mainpage API to HFST ospell WFST spell-checking
+//!
+//! The hfst-ospell API has several layers for different end-users. A suggested
+//! starting point for new users is the @c ZHfstOspeller object, which reads an
+//! automaton set from a zipped HFST file with metadata and provides high-level
+//! access to it with generic spell-checking, correction and analysis functions.
+//! The second level of access is the Speller object, which can be used to
+//! construct a spell-checker from two automata, traverse it and query
+//! low-level properties. The Speller is constructed from two Transducer
+//! objects, which are the low-level access points to the automata with all
+//! the gory details of transition tables, symbol translations, headers and such.
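+//!
+//! A minimal usage sketch of the top layer (assuming a zhfst speller
+//! archive named speller.zhfst; error handling omitted):
+//! @code
+//! hfst_ol::ZHfstOspeller speller;
+//! speller.read_zhfst("speller.zhfst");
+//! if (!speller.spell("exmaple"))
+//!   {
+//!     hfst_ol::CorrectionQueue corrections = speller.suggest("exmaple");
+//!   }
+//! @endcode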
+
+#ifndef HFST_OSPELL_ZHFSTOSPELLER_H_
+#define HFST_OSPELL_ZHFSTOSPELLER_H_
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <map>
+
+#include "ospell.h"
+#include "hfst-ol.h"
+#include "ZHfstOspellerXmlMetadata.h"
+
+namespace hfst_ol
+ {
+ //! @brief ZHfstOspeller class holds one speller contained in one
+ //! zhfst file.
+ //! Ospeller can perform all the basic writer-tool functionality that
+ //! is supported by the automata in the zhfst archive.
+ class ZHfstOspeller
+ {
+ public:
+ //! @brief create speller with default values for undefined
+ //! language.
+ ZHfstOspeller();
+ //! @brief destroy all automata used by the speller.
+ ~ZHfstOspeller();
+
+ //! @brief assign a speller-suggestor circumventing the ZHFST format
+ void inject_speller(Speller * s);
+ //! @brief set upper limit to priority queue when performing
+ //! suggestions or analyses.
+ void set_queue_limit(unsigned long limit);
+ //! @brief set upper limit for weights
+ void set_weight_limit(Weight limit);
+ //! @brief set search beam
+ void set_beam(Weight beam);
+ //! @brief set time cutoff for correcting
+ void set_time_cutoff(float time_cutoff);
+ //! @brief construct speller from named file containing valid
+ //! zhfst archive.
+ void read_zhfst(const std::string& filename);
+
+ //! @brief check if the given word is spelled correctly
+ bool spell(const std::string& wordform);
+ //! @brief construct an ordered set of corrections for misspelled
+ //! word form.
+ CorrectionQueue suggest(const std::string& wordform);
+ //! @brief analyse word form morphologically
+ //! @param wordform the string to analyse
+ //! @param ask_sugger whether to use the spelling correction model
+ //! instead of the detection model
+ AnalysisQueue analyse(const std::string& wordform,
+ bool ask_sugger = false);
+ //! @brief construct an ordered set of corrections with analyses
+ AnalysisCorrectionQueue suggest_analyses(const std::string&
+ wordform);
+ //! @brief hyphenate word form
+ HyphenationQueue hyphenate(const std::string& wordform);
+
+ //! @brief get access to metadata read from XML.
+ const ZHfstOspellerXmlMetadata& get_metadata() const;
+ //! @brief create string representation of the speller for
+ //! programmer to debug
+ std::string metadata_dump() const;
+ private:
+ //! @brief file or path where the speller came from
+ std::string filename_;
+ //! @brief upper bound for suggestions generated and given
+ unsigned long suggestions_maximum_;
+ //! @brief upper bound for suggestion weight generated and given
+ Weight maximum_weight_;
+ //! @brief upper bound for search beam around best candidate
+ Weight beam_;
+ //! @brief upper bound for search time in seconds
+ float time_cutoff_;
+ //! @brief whether the automata loaded so far can be used to check
+ //! spelling
+ bool can_spell_;
+ //! @brief whether the automata loaded so far can be used to correct
+ //! word forms
+ bool can_correct_;
+ //! @brief whether the automata loaded so far can be used to analyse
+ //! word forms
+ bool can_analyse_;
+ //! @brief whether the automata loaded so far can be used to hyphenate
+ //! word forms
+ bool can_hyphenate_;
+ //! @brief dictionaries loaded
+ std::map<std::string, Transducer*> acceptors_;
+ //! @brief error models loaded
+ std::map<std::string, Transducer*> errmodels_;
+ //! @brief pointer to current speller
+ Speller* current_speller_;
+ //! @brief pointer to current correction model
+ Speller* current_sugger_;
+ //! @brief pointer to current morphological analyser
+ Speller* current_analyser_;
+ //! @brief pointer to current hyphenator
+ Transducer* current_hyphenator_;
+ //! @brief the metadata of loaded speller
+ ZHfstOspellerXmlMetadata metadata_;
+ };
+
+ //! @brief Top-level exception for zhfst handling.
+
+ //! Contains a human-readable error message that can be displayed to the
+ //! end-user as additional information when handling the exception or
+ //! exiting.
+ class ZHfstException
+ {
+ public:
+ ZHfstException();
+ //! @brief construct error with human readable message.
+ //!
+ //! the message will be displayed when recovering from or dying of
+ //! the exception
+ explicit ZHfstException(const std::string& message);
+ //!
+ //! format error as user-readable message
+ const char* what();
+ private:
+ std::string what_;
+ };
+
+ //! @brief Generic error in metadata parsing.
+ //!
+ //! Gets raised if metadata is erroneous or missing.
+ class ZHfstMetaDataParsingError : public ZHfstException
+ {
+ public:
+ explicit ZHfstMetaDataParsingError(const std::string& message);
+ private:
+ std::string what_;
+ };
+
+ //! @brief Exception for XML parser errors.
+ //!
+ //! Gets raised if underlying XML parser finds an error in XML data.
+ //! Errors include non-valid XML, missing or erroneous attributes or
+ //! elements, etc.
+ class ZHfstXmlParsingError : public ZHfstException
+ {
+ public:
+ explicit ZHfstXmlParsingError(const std::string& message);
+ private:
+ std::string what_;
+ };
+
+ //! @brief Generic error while reading zip file.
+ //!
+ //! Happens when libarchive is unable to proceed reading zip file or
+ //! zip file is missing required files.
+ class ZHfstZipReadingError : public ZHfstException
+ {
+ public:
+ explicit ZHfstZipReadingError(const std::string& message);
+ private:
+ std::string what_;
+ };
+
+ //! @brief Error when writing to temporary location.
+ //!
+ //! This exception gets thrown when, e.g., zip extraction is unable to
+ //! find or open a temporary file for writing.
+ class ZHfstTemporaryWritingError : public ZHfstException
+ {
+ public:
+ explicit ZHfstTemporaryWritingError(const std::string& message);
+ private:
+ std::string what_;
+ };
+
+ } // namespace hfst_ol
+
+
+#endif // HFST_OSPELL_OSPELLER_SET_H_
+// vim: set ft=cpp.doxygen:
diff --git a/ZHfstOspellerXmlMetadata.cc b/ZHfstOspellerXmlMetadata.cc
new file mode 100644
index 0000000..64c0502
--- /dev/null
+++ b/ZHfstOspellerXmlMetadata.cc
@@ -0,0 +1,1023 @@
+/* -*- Mode: C++ -*- */
+// Copyright 2010 University of Helsinki
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! XXX: Valgrind note: all errors about invalid reads in wcscmp are because of:
+//! https://bugzilla.redhat.com/show_bug.cgi?id=755242
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+// C++
+#if HAVE_LIBXML
+# include <libxml++/libxml++.h>
+#elif HAVE_TINYXML2
+# include <tinyxml2.h>
+#endif
+
+#include <string>
+#include <map>
+
+using std::string;
+using std::map;
+
+// local
+#include "ospell.h"
+#include "hfst-ol.h"
+#include "ZHfstOspeller.h"
+#include "ZHfstOspellerXmlMetadata.h"
+
+namespace hfst_ol
+ {
+
+ZHfstOspellerXmlMetadata::ZHfstOspellerXmlMetadata()
+ {
+ info_.locale_ = "und";
+ }
+
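+// Automaton ids in the zhfst metadata follow the pattern
+// <automaton>.<description>.<type>, e.g. acceptor.default.hfst; the
+// helpers below check that both dots are present and extract the
+// description part between them.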
+static
+bool
+validate_automaton_id(const char* id)
+ {
+ const char* p = strchr(id, '.');
+ if (p == NULL)
+ {
+ return false;
+ }
+ const char* q = strchr(p + 1, '.');
+ if (q == NULL)
+ {
+ return false;
+ }
+ return true;
+ }
+
+static
+char*
+get_automaton_descr_from_id(const char* id)
+ {
+ const char* p = strchr(id, '.');
+ const char* q = strchr(p + 1, '.');
+ return hfst_strndup(p + 1, q - p - 1);
+ }
+
+
+#if HAVE_LIBXML
+void
+ZHfstOspellerXmlMetadata::verify_hfstspeller(xmlpp::Node* rootNode)
+ {
+ xmlpp::Element* rootElement = dynamic_cast<xmlpp::Element*>(rootNode);
+ if (NULL == rootElement)
+ {
+ throw ZHfstMetaDataParsingError("Root node is not an element");
+ }
+ const Glib::ustring rootName = rootElement->get_name();
+ if (rootName != "hfstspeller")
+ {
+ throw ZHfstMetaDataParsingError("could not find <hfstspeller> "
+ "root from XML file");
+ }
+ // check versions
+ const xmlpp::Attribute* hfstversion =
+ rootElement->get_attribute("hfstversion");
+ if (NULL == hfstversion)
+ {
+ throw ZHfstMetaDataParsingError("No hfstversion attribute in root");
+ }
+ const Glib::ustring hfstversionValue = hfstversion->get_value();
+ if (hfstversionValue != "3")
+ {
+ throw ZHfstMetaDataParsingError("Unrecognised HFST version...");
+ }
+ const xmlpp::Attribute* dtdversion = rootElement->get_attribute("dtdversion");
+ if (NULL == dtdversion)
+ {
+ throw ZHfstMetaDataParsingError("No dtdversion attribute in root");
+ }
+ const Glib::ustring dtdversionValue = dtdversion->get_value();
+ if (dtdversionValue != "1.0")
+ {
+ throw ZHfstMetaDataParsingError("Unrecognised DTD version...");
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_info(xmlpp::Node* infoNode)
+ {
+ xmlpp::Node::NodeList infos = infoNode->get_children();
+ for (xmlpp::Node::NodeList::iterator info = infos.begin();
+ info != infos.end();
+ ++info)
+ {
+ const Glib::ustring infoName = (*info)->get_name();
+ if (infoName == "locale")
+ {
+ parse_locale(*info);
+ }
+ else if (infoName == "title")
+ {
+ parse_title(*info);
+ }
+ else if (infoName == "description")
+ {
+ parse_description(*info);
+ }
+ else if (infoName == "version")
+ {
+ parse_version(*info);
+ }
+ else if (infoName == "date")
+ {
+ parse_date(*info);
+ }
+ else if (infoName == "producer")
+ {
+ parse_producer(*info);
+ }
+ else if (infoName == "contact")
+ {
+ parse_contact(*info);
+ }
+ else
+ {
+ const xmlpp::TextNode* text = dynamic_cast<xmlpp::TextNode*>(*info);
+ if ((text == NULL) || (!text->is_white_space()))
+ {
+ fprintf(stderr, "DEBUG: unknown info child %s\n",
+ infoName.c_str());
+ }
+ }
+ } // for each info child
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_locale(xmlpp::Node* localeNode)
+ {
+ xmlpp::Element* localeElement = dynamic_cast<xmlpp::Element*>(localeNode);
+ if (NULL == localeElement->get_child_text())
+ {
+ throw ZHfstXmlParsingError("<locale> must be non-empty");
+ }
+ const Glib::ustring localeContent = localeElement->get_child_text()->get_content();
+ if ((info_.locale_ != "und") && (info_.locale_ != localeContent))
+ {
+ // locale in XML mismatches previous definition
+ // warnable, but overridden as per spec.
+ fprintf(stderr, "Warning: mismatched languages in "
+ "file data (%s) and XML (%s)\n",
+ info_.locale_.c_str(), localeContent.c_str());
+ }
+ info_.locale_ = localeContent;
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_title(xmlpp::Node* titleNode)
+ {
+ xmlpp::Element* titleElement = dynamic_cast<xmlpp::Element*>(titleNode);
+ const xmlpp::Attribute* lang = titleElement->get_attribute("lang");
+ if (NULL == titleElement->get_child_text())
+ {
+ throw ZHfstXmlParsingError("<title> must be non-empty");
+ }
+ if (lang != NULL)
+ {
+ info_.title_[lang->get_value()] = titleElement->get_child_text()->get_content();
+ }
+ else
+ {
+ info_.title_[info_.locale_] = titleElement->get_child_text()->get_content();
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_description(xmlpp::Node* descriptionNode)
+ {
+ xmlpp::Element* descriptionElement =
+ dynamic_cast<xmlpp::Element*>(descriptionNode);
+ const xmlpp::Attribute* lang = descriptionElement->get_attribute("lang");
+ if (NULL == descriptionElement->get_child_text())
+ {
+ throw ZHfstXmlParsingError("<description> must be non-empty");
+ }
+ if (lang != NULL)
+ {
+ info_.description_[lang->get_value()] =
+ descriptionElement->get_child_text()->get_content();
+ }
+ else
+ {
+ info_.description_[info_.locale_] =
+ descriptionElement->get_child_text()->get_content();
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_version(xmlpp::Node* versionNode)
+ {
+ xmlpp::Element* versionElement = dynamic_cast<xmlpp::Element*>(versionNode);
+ const xmlpp::Attribute* revision = versionElement->get_attribute("vcsrev");
+ if (revision != NULL)
+ {
+ info_.vcsrev_ = revision->get_value();
+ }
+ info_.version_ = versionElement->get_child_text()->get_content();
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_date(xmlpp::Node* dateNode)
+ {
+ xmlpp::Element* dateElement =
+ dynamic_cast<xmlpp::Element*>(dateNode);
+ info_.date_ = dateElement->get_child_text()->get_content();
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_producer(xmlpp::Node* producerNode)
+ {
+ xmlpp::Element* producerElement =
+ dynamic_cast<xmlpp::Element*>(producerNode);
+ info_.producer_ = producerElement->get_child_text()->get_content();
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_contact(xmlpp::Node* contactNode)
+ {
+ xmlpp::Element* contactElement = dynamic_cast<xmlpp::Element*>(contactNode);
+ const xmlpp::Attribute* email = contactElement->get_attribute("email");
+ const xmlpp::Attribute* website = contactElement->get_attribute("website");
+ if (email != NULL)
+ {
+ info_.email_ = email->get_value();
+ }
+ if (website != NULL)
+ {
+ info_.website_ = website->get_value();
+ }
+ }
+
+
+void
+ZHfstOspellerXmlMetadata::parse_acceptor(xmlpp::Node* acceptorNode)
+ {
+ xmlpp::Element* acceptorElement =
+ dynamic_cast<xmlpp::Element*>(acceptorNode);
+ xmlpp::Attribute* xid = acceptorElement->get_attribute("id");
+ if (xid == NULL)
+ {
+ throw ZHfstMetaDataParsingError("id missing in acceptor");
+ }
+ const Glib::ustring xidValue = xid->get_value();
+ if (validate_automaton_id(xidValue.c_str()) == false)
+ {
+ throw ZHfstMetaDataParsingError("Invalid id in acceptor");
+ }
+ char* descr = get_automaton_descr_from_id(xidValue.c_str());
+ acceptor_[descr].descr_ = descr;
+ acceptor_[descr].id_ = xidValue;
+ const xmlpp::Attribute* trtype =
+ acceptorElement->get_attribute("transtype");
+ if (trtype != NULL)
+ {
+ acceptor_[descr].transtype_ = trtype->get_value();
+ }
+ const xmlpp::Attribute* xtype = acceptorElement->get_attribute("type");
+ if (xtype != NULL)
+ {
+ acceptor_[descr].type_ = xtype->get_value();
+ }
+ xmlpp::Node::NodeList accs = acceptorNode->get_children();
+ for (xmlpp::Node::NodeList::iterator acc = accs.begin();
+ acc != accs.end();
+ ++acc)
+ {
+ const Glib::ustring accName = (*acc)->get_name();
+ if (accName == "title")
+ {
+ parse_title(*acc, descr);
+ }
+ else if (accName == "description")
+ {
+ parse_description(*acc, descr);
+ }
+ else
+ {
+ const xmlpp::TextNode* text = dynamic_cast<xmlpp::TextNode*>(*acc);
+ if ((text == NULL) || (!text->is_white_space()))
+ {
+ fprintf(stderr, "DEBUG: unknown acceptor node %s\n",
+ accName.c_str());
+ }
+ }
+ }
+ free(descr);
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_title(xmlpp::Node* titleNode,
+ const string& descr)
+ {
+ xmlpp::Element* titleElement = dynamic_cast<xmlpp::Element*>(titleNode);
+ const xmlpp::Attribute* lang = titleElement->get_attribute("lang");
+ if (lang != NULL)
+ {
+ acceptor_[descr].title_[lang->get_value()] = titleElement->get_child_text()->get_content();
+ }
+ else
+ {
+ acceptor_[descr].title_[info_.locale_] = titleElement->get_child_text()->get_content();
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_description(xmlpp::Node* descriptionNode,
+ const string& descr)
+ {
+ xmlpp::Element* descriptionElement =
+ dynamic_cast<xmlpp::Element*>(descriptionNode);
+ const xmlpp::Attribute* lang = descriptionElement->get_attribute("lang");
+ if (lang != NULL)
+ {
+ acceptor_[descr].description_[lang->get_value()] = descriptionElement->get_child_text()->get_content();
+ }
+ else
+ {
+ acceptor_[descr].description_[info_.locale_] = descriptionElement->get_child_text()->get_content();
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_errmodel(xmlpp::Node* errmodelNode)
+ {
+ xmlpp::Element* errmodelElement =
+ dynamic_cast<xmlpp::Element*>(errmodelNode);
+ xmlpp::Attribute* xid = errmodelElement->get_attribute("id");
+ if (xid == NULL)
+ {
+ throw ZHfstMetaDataParsingError("id missing in errmodel");
+ }
+ const Glib::ustring xidValue = xid->get_value();
+ if (validate_automaton_id(xidValue.c_str()) == false)
+ {
+ throw ZHfstMetaDataParsingError("Invalid id in errmodel");
+ }
+ char* descr = get_automaton_descr_from_id(xidValue.c_str());
+ errmodel_.push_back(ZHfstOspellerErrModelMetadata());
+ size_t errm_count = errmodel_.size() - 1;
+ if (descr != NULL)
+ {
+ errmodel_[errm_count].descr_ = descr;
+ }
+ free(descr);
+ errmodel_[errm_count].id_ = xidValue;
+ xmlpp::Node::NodeList errms = errmodelNode->get_children();
+ for (xmlpp::Node::NodeList::iterator errm = errms.begin();
+ errm != errms.end();
+ ++errm)
+ {
+ const Glib::ustring errmName = (*errm)->get_name();
+ if (errmName == "title")
+ {
+ parse_title(*errm, errm_count);
+ }
+ else if (errmName == "description")
+ {
+ parse_description(*errm, errm_count);
+ }
+ else if (errmName == "type")
+ {
+ parse_type(*errm, errm_count);
+ }
+ else if (errmName == "model")
+ {
+ parse_model(*errm, errm_count);
+ }
+ else
+ {
+ const xmlpp::TextNode* text = dynamic_cast<xmlpp::TextNode*>(*errm);
+ if ((text == NULL) || (!text->is_white_space()))
+ {
+ fprintf(stderr, "DEBUG: unknown errmodel node %s\n",
+ errmName.c_str());
+ }
+ }
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_title(xmlpp::Node* titleNode,
+ size_t errm_count)
+ {
+ xmlpp::Element* titleElement = dynamic_cast<xmlpp::Element*>(titleNode);
+ const xmlpp::Attribute* lang = titleElement->get_attribute("lang");
+ if (lang != NULL)
+ {
+ errmodel_[errm_count].title_[lang->get_value()] = titleElement->get_child_text()->get_content();
+ }
+ else
+ {
+ errmodel_[errm_count].title_[info_.locale_] = titleElement->get_child_text()->get_content();
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_description(xmlpp::Node* descriptionNode,
+ size_t errm_count)
+ {
+ xmlpp::Element* descriptionElement =
+ dynamic_cast<xmlpp::Element*>(descriptionNode);
+ const xmlpp::Attribute* lang = descriptionElement->get_attribute("lang");
+ if (lang != NULL)
+ {
+ errmodel_[errm_count].description_[lang->get_value()] = descriptionElement->get_child_text()->get_content();
+ }
+ else
+ {
+ errmodel_[errm_count].description_[info_.locale_] = descriptionElement->get_child_text()->get_content();
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_type(xmlpp::Node* typeNode, size_t errm_count)
+ {
+ xmlpp::Element* typeElement = dynamic_cast<xmlpp::Element*>(typeNode);
+ const xmlpp::Attribute* xtype = typeElement->get_attribute("type");
+ if (xtype != NULL)
+ {
+ errmodel_[errm_count].type_.push_back(xtype->get_value());
+ }
+ else
+ {
+ throw ZHfstMetaDataParsingError("No type in type");
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_model(xmlpp::Node* modelNode, size_t errm_count)
+ {
+ xmlpp::Element* modelElement = dynamic_cast<xmlpp::Element*>(modelNode);
+ errmodel_[errm_count].model_.push_back(modelElement->get_child_text()->get_content());
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_xml(const xmlpp::Document* doc)
+ {
+ if (NULL == doc)
+ {
+ throw ZHfstMetaDataParsingError("Cannot parse XML data");
+ }
+ xmlpp::Node* rootNode = doc->get_root_node();
+ // check validity
+ if (NULL == rootNode)
+ {
+ throw ZHfstMetaDataParsingError("No root node in index XML");
+ }
+ verify_hfstspeller(rootNode);
+ // parse
+ xmlpp::Node::NodeList nodes = rootNode->get_children();
+ for (xmlpp::Node::NodeList::iterator node = nodes.begin();
+ node != nodes.end();
+ ++node)
+ {
+ const Glib::ustring nodename = (*node)->get_name();
+ if (nodename == "info")
+ {
+ parse_info(*node);
+ } // if info node
+ else if (nodename == "acceptor")
+ {
+ parse_acceptor(*node);
+ } // acceptor node
+ else if (nodename == "errmodel")
+ {
+ parse_errmodel(*node);
+ } // errmodel node
+ else
+ {
+ const xmlpp::TextNode* text = dynamic_cast<xmlpp::TextNode*>(*node);
+ if ((text == NULL) || (!text->is_white_space()))
+ {
+ fprintf(stderr, "DEBUG: unknown root child %s\n",
+ nodename.c_str());
+ }
+ } // unknown root child node
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::read_xml(const char* xml_data, size_t xml_len)
+ {
+ xmlpp::DomParser parser;
+ parser.set_substitute_entities();
+ parser.parse_memory_raw(reinterpret_cast<const unsigned char*>(xml_data),
+ xml_len);
+ this->parse_xml(parser.get_document());
+ }
+
+void
+ZHfstOspellerXmlMetadata::read_xml(const string& filename)
+ {
+ xmlpp::DomParser parser;
+ parser.set_substitute_entities();
+ parser.parse_file(filename);
+ this->parse_xml(parser.get_document());
+ }
+#elif HAVE_TINYXML2
+
+void
+ZHfstOspellerXmlMetadata::parse_xml(const tinyxml2::XMLDocument& doc)
+ {
+ const tinyxml2::XMLElement* rootNode = doc.RootElement();
+ if (NULL == rootNode)
+ {
+ throw ZHfstMetaDataParsingError("No root node in index XML");
+ }
+ // check validity
+ if (strcmp(rootNode->Name(), "hfstspeller") != 0)
+ {
+ throw ZHfstMetaDataParsingError("could not find <hfstspeller> "
+ "root from XML file");
+ }
+ verify_hfstspeller(*rootNode);
+ // parse
+ const tinyxml2::XMLElement* child = rootNode->FirstChildElement();
+ while (child != NULL)
+ {
+ if (strcmp(child->Name(), "info") == 0)
+ {
+ parse_info(*child);
+ }
+ else if (strcmp(child->Name(), "acceptor") == 0)
+ {
+ parse_acceptor(*child);
+ }
+ else if (strcmp(child->Name(), "errmodel") == 0)
+ {
+ parse_errmodel(*child);
+ }
+ else
+ {
+ fprintf(stderr, "DEBUG: Unknown root child %s\n",
+ child->Name());
+ }
+ child = child->NextSiblingElement();
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::verify_hfstspeller(const tinyxml2::XMLElement& hfstspellerNode)
+ {
+ if (!hfstspellerNode.Attribute("hfstversion"))
+ {
+ throw ZHfstMetaDataParsingError("No hfstversion attribute in root");
+ }
+ if (!hfstspellerNode.Attribute("hfstversion", "3"))
+ {
+ throw ZHfstMetaDataParsingError("Unrecognised HFST version...");
+ }
+ if (!hfstspellerNode.Attribute("dtdversion"))
+ {
+ throw ZHfstMetaDataParsingError("No dtdversion attribute in root");
+ }
+ if (!hfstspellerNode.Attribute("dtdversion", "1.0"))
+ {
+ throw ZHfstMetaDataParsingError("Unrecognised DTD version...");
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_info(const tinyxml2::XMLElement& infoNode)
+ {
+ const tinyxml2::XMLElement* info = infoNode.FirstChildElement();
+ while (info != NULL)
+ {
+ if (strcmp(info->Name(), "locale") == 0)
+ {
+ parse_locale(*info);
+ }
+ else if (strcmp(info->Name(), "title") == 0)
+ {
+ parse_title(*info);
+ }
+ else if (strcmp(info->Name(), "description") == 0)
+ {
+ parse_description(*info);
+ }
+ else if (strcmp(info->Name(), "version") == 0)
+ {
+ parse_version(*info);
+ }
+ else if (strcmp(info->Name(), "date") == 0)
+ {
+ parse_date(*info);
+ }
+ else if (strcmp(info->Name(), "producer") == 0)
+ {
+ parse_producer(*info);
+ }
+ else if (strcmp(info->Name(), "contact") == 0)
+ {
+ parse_contact(*info);
+ }
+ else
+ {
+ fprintf(stderr, "DEBUG: unknown info child %s\n",
+ info->Name());
+ }
+ info = info->NextSiblingElement();
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_locale(const tinyxml2::XMLElement& localeNode)
+ {
+ const char* localeContent = localeNode.GetText();
+ if (NULL == localeNode.GetText())
+ {
+ throw ZHfstXmlParsingError("<locale> must be non-empty");
+ }
+ if ((info_.locale_ != "und") && (info_.locale_ != localeContent))
+ {
+ // locale in XML mismatches previous definition
+ // warnable, but overridden as per spec.
+ fprintf(stderr, "Warning: mismatched languages in "
+ "file data (%s) and XML (%s)\n",
+ info_.locale_.c_str(), localeContent);
+ }
+ info_.locale_ = localeContent;
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_title(const tinyxml2::XMLElement& titleNode)
+ {
+ if (NULL == titleNode.GetText())
+ {
+ throw ZHfstXmlParsingError("<title> must be non-empty");
+ }
+ if (titleNode.Attribute("lang"))
+ {
+ info_.title_[titleNode.Attribute("lang")] = titleNode.GetText();
+ }
+ else
+ {
+ info_.title_[info_.locale_] = titleNode.GetText();
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_description(const tinyxml2::XMLElement& descriptionNode)
+ {
+ if (NULL == descriptionNode.GetText())
+ {
+ throw ZHfstXmlParsingError("<description> must be non-empty");
+ }
+ if (descriptionNode.Attribute("lang"))
+ {
+ info_.description_[descriptionNode.Attribute("lang")] =
+ descriptionNode.GetText();
+ }
+ else
+ {
+ info_.description_[info_.locale_] = descriptionNode.GetText();
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_version(const tinyxml2::XMLElement& versionNode)
+ {
+ if (versionNode.Attribute("vcsrev"))
+ {
+ info_.vcsrev_ = versionNode.Attribute("vcsrev");
+ }
+ info_.version_ = versionNode.GetText();
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_date(const tinyxml2::XMLElement& dateNode)
+ {
+ info_.date_ = dateNode.GetText();
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_producer(const tinyxml2::XMLElement& producerNode)
+ {
+ info_.producer_ = producerNode.GetText();
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_contact(const tinyxml2::XMLElement& contactNode)
+ {
+ if (contactNode.Attribute("email"))
+ {
+ info_.email_ = contactNode.Attribute("email");
+ }
+ if (contactNode.Attribute("website"))
+ {
+ info_.website_ = contactNode.Attribute("website");
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_acceptor(const tinyxml2::XMLElement& acceptorNode)
+ {
+ const char* xid = acceptorNode.Attribute("id");
+ if (xid == NULL)
+ {
+ throw ZHfstMetaDataParsingError("id missing in acceptor");
+ }
+ if (validate_automaton_id(xid) == false)
+ {
+ throw ZHfstMetaDataParsingError("Invalid id in accpetor");
+ }
+ char* descr = get_automaton_descr_from_id(xid);
+ acceptor_[descr].descr_ = descr;
+ acceptor_[descr].id_ = xid;
+ if (acceptorNode.Attribute("trtype"))
+ {
+ acceptor_[descr].transtype_ = acceptorNode.Attribute("trtype");
+ }
+ if (acceptorNode.Attribute("type"))
+ {
+ acceptor_[descr].type_ = acceptorNode.Attribute("type");
+ }
+ const tinyxml2::XMLElement* acc = acceptorNode.FirstChildElement();
+ while (acc != NULL)
+ {
+ if (strcmp(acc->Name(), "title") == 0)
+ {
+ parse_title(*acc, descr);
+ }
+ else if (strcmp(acc->Name(), "description") == 0)
+ {
+ parse_description(*acc, descr);
+ }
+ else
+ {
+ fprintf(stderr, "DEBUG: unknown acceptor child %s\n",
+ acc->Name());
+ }
+ acc = acc->NextSiblingElement();
+ }
+ free(descr);
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_title(const tinyxml2::XMLElement& titleNode,
+ const std::string& accName)
+ {
+ if (titleNode.Attribute("lang"))
+ {
+ acceptor_[accName].title_[titleNode.Attribute("lang")] =
+ titleNode.GetText();
+ }
+ else
+ {
+ acceptor_[accName].title_[info_.locale_] =
+ titleNode.GetText();
+ }
+ }
+
+
+void
+ZHfstOspellerXmlMetadata::parse_description(const tinyxml2::XMLElement& descriptionNode,
+ const std::string& accName)
+ {
+ if (descriptionNode.Attribute("lang"))
+ {
+ acceptor_[accName].description_[descriptionNode.Attribute("lang")] =
+ descriptionNode.GetText();
+ }
+ else
+ {
+ acceptor_[accName].description_[info_.locale_] =
+ descriptionNode.GetText();
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_errmodel(const tinyxml2::XMLElement& errmodelNode)
+ {
+ const char* xid = errmodelNode.Attribute("id");
+ if (xid == NULL)
+ {
+ throw ZHfstMetaDataParsingError("id missing in errmodel");
+ }
+ if (validate_automaton_id(xid) == false)
+ {
+ throw ZHfstMetaDataParsingError("Invalid id in errmodel");
+ }
+ char* descr = get_automaton_descr_from_id(xid);
+ errmodel_.push_back(ZHfstOspellerErrModelMetadata());
+ size_t errm_count = errmodel_.size() - 1;
+ if (descr != NULL)
+ {
+ errmodel_[errm_count].descr_ = descr;
+ }
+ free(descr);
+ errmodel_[errm_count].id_ = xid;
+ const tinyxml2::XMLElement* errm = errmodelNode.FirstChildElement();
+ while (errm != NULL)
+ {
+ if (strcmp(errm->Name(), "title") == 0)
+ {
+ parse_title(*errm, errm_count);
+ }
+ else if (strcmp(errm->Name(), "description") == 0)
+ {
+ parse_description(*errm, errm_count);
+ }
+ else if (strcmp(errm->Name(), "type") == 0)
+ {
+ parse_type(*errm, errm_count);
+ }
+ else if (strcmp(errm->Name(), "model") == 0)
+ {
+ parse_model(*errm, errm_count);
+ }
+ else
+ {
+ fprintf(stderr, "DEBUG: unknown errmodel child %s\n",
+ errm->Name());
+ }
+ errm = errm->NextSiblingElement();
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_title(const tinyxml2::XMLElement& titleNode, size_t errm_count)
+ {
+ if (titleNode.Attribute("lang"))
+ {
+ errmodel_[errm_count].title_[titleNode.Attribute("lang")] =
+ titleNode.GetText();
+ }
+ else
+ {
+ errmodel_[errm_count].title_[info_.locale_] = titleNode.GetText();
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_description(const tinyxml2::XMLElement& descriptionNode, size_t errm_count)
+ {
+ if (descriptionNode.Attribute("lang"))
+ {
+ errmodel_[errm_count].description_[descriptionNode.Attribute("lang")] =
+ descriptionNode.GetText();
+ }
+ else
+ {
+ errmodel_[errm_count].description_[info_.locale_] =
+ descriptionNode.GetText();
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_type(const tinyxml2::XMLElement& typeNode, size_t errm_count)
+ {
+ if (typeNode.Attribute("type"))
+ {
+ errmodel_[errm_count].type_.push_back(typeNode.Attribute("type"));
+ }
+ else
+ {
+ throw ZHfstMetaDataParsingError("No type in type");
+ }
+ }
+
+void
+ZHfstOspellerXmlMetadata::parse_model(const tinyxml2::XMLElement& modelNode, size_t errm_count)
+ {
+ errmodel_[errm_count].model_.push_back(modelNode.GetText());
+ }
+
+void
+ZHfstOspellerXmlMetadata::read_xml(const char* xml_data, size_t xml_len)
+ {
+ tinyxml2::XMLDocument doc;
+ if (doc.Parse(xml_data, xml_len) != tinyxml2::XML_NO_ERROR)
+ {
+ throw ZHfstMetaDataParsingError("Reading XML from memory");
+ }
+ this->parse_xml(doc);
+ }
+
+void
+ZHfstOspellerXmlMetadata::read_xml(const string& filename)
+ {
+ tinyxml2::XMLDocument doc;
+ if (doc.LoadFile(filename.c_str()) != tinyxml2::XML_NO_ERROR)
+ {
+ throw ZHfstMetaDataParsingError("Reading XML from file");
+ }
+ this->parse_xml(doc);
+ }
+#else
+#error configure found no usable XML library
+void
+ ZHfstOspellerXmlMetadata::read_xml(const char*, size_t)
+ {}
+void
+ ZHfstOspellerXmlMetadata::read_xml(const std::string&)
+ {}
+#endif // HAVE_LIBXML
+
+
+string
+ZHfstOspellerXmlMetadata::debug_dump() const
+ {
+ string retval = "locale: " + info_.locale_ + "\n"
+ "version: " + info_.version_ + " [vcsrev: " + info_.vcsrev_ + "]\n"
+ "date: " + info_.date_ + "\n"
+ "producer: " + info_.producer_ + "[email: <" + info_.email_ + ">, "
+ "website: <" + info_.website_ + ">]\n";
+ for (map<string,string>::const_iterator title = info_.title_.begin();
+ title != info_.title_.end();
+ ++title)
+ {
+ retval.append("title [" + title->first + "]: " + title->second + "\n");
+ }
+ for (map<string,string>::const_iterator description = info_.description_.begin();
+ description != info_.description_.end();
+ ++description)
+ {
+ retval.append("description [" + description->first + "]: " +
+ description->second + "\n");
+ }
+ for (map<string,ZHfstOspellerAcceptorMetadata>::const_iterator acc = acceptor_.begin();
+ acc != acceptor_.end();
+ ++acc)
+ {
+ retval.append("acceptor[" + acc->second.descr_ + "] [id: " + acc->second.id_ +
+ ", type: " + acc->second.type_ + "trtype: " + acc->second.transtype_ +
+ "]\n");
+
+ for (LanguageVersions::const_iterator title = acc->second.title_.begin();
+ title != acc->second.title_.end();
+ ++title)
+ {
+ retval.append("title [" + title->first + "]: " + title->second +
+ "\n");
+ }
+ for (LanguageVersions::const_iterator description = acc->second.description_.begin();
+ description != acc->second.description_.end();
+ ++description)
+ {
+ retval.append("description[" + description->first + "]: "
+ + description->second + "\n");
+ }
+ }
+ for (std::vector<ZHfstOspellerErrModelMetadata>::const_iterator errm = errmodel_.begin();
+ errm != errmodel_.end();
+ ++errm)
+ {
+ retval.append("errmodel[" + errm->descr_ + "] [id: " + errm->id_ +
+ "]\n");
+
+ for (LanguageVersions::const_iterator title = errm->title_.begin();
+ title != errm->title_.end();
+ ++title)
+ {
+ retval.append("title [" + title->first + "]: " + title->second +
+ "\n");
+ }
+ for (LanguageVersions::const_iterator description = errm->description_.begin();
+ description != errm->description_.end();
+ ++description)
+ {
+ retval.append("description[" + description->first + "]: "
+ + description->second + "\n");
+ }
+ for (std::vector<string>::const_iterator type = errm->type_.begin();
+ type != errm->type_.end();
+ ++type)
+ {
+ retval.append("type: " + *type + "\n");
+ }
+ for (std::vector<string>::const_iterator model = errm->model_.begin();
+ model != errm->model_.end();
+ ++model)
+ {
+ retval.append("model: " + *model + "\n");
+ }
+ }
+
+ return retval;
+ }
+
+ } // namespace hfst_ol
+
+
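
The two read_xml() overloads above are the usual entry points into this parser. Below is a minimal sketch (not part of the upstream sources) of how they might be driven; it assumes the public API declared in ZHfstOspellerXmlMetadata.h that follows, and the XML literal only uses the root attributes and info children that the parse functions above check for.

    #include <cstring>
    #include <iostream>
    #include "ZHfstOspellerXmlMetadata.h"

    int main()
    {
        // minimal document satisfying verify_hfstspeller(): a
        // <hfstspeller> root with hfstversion="3" and dtdversion="1.0"
        const char* xml =
            "<hfstspeller hfstversion=\"3\" dtdversion=\"1.0\">"
            "<info><locale>se</locale>"
            "<title>Example speller</title></info>"
            "</hfstspeller>";
        hfst_ol::ZHfstOspellerXmlMetadata metadata;
        try
        {
            metadata.read_xml(xml, std::strlen(xml));
            std::cout << metadata.debug_dump();
        }
        catch (...)
        {
            // the parsers above throw ZHfstMetaDataParsingError or
            // ZHfstXmlParsingError on malformed metadata
            std::cerr << "metadata rejected" << std::endl;
            return 1;
        }
        return 0;
    }

Either XML backend (libxml++ or tinyxml2) accepts this document, since both verify the same root attributes before walking the info children.
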
diff --git a/ZHfstOspellerXmlMetadata.h b/ZHfstOspellerXmlMetadata.h
new file mode 100644
index 0000000..066174a
--- /dev/null
+++ b/ZHfstOspellerXmlMetadata.h
@@ -0,0 +1,170 @@
+/* -*- Mode: C++ -*- */
+// Copyright 2010 University of Helsinki
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HFST_OSPELL_ZHFSTOSPELLERXMLMETADATA_H_
+#define HFST_OSPELL_ZHFSTOSPELLERXMLMETADATA_H_ 1
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <map>
+
+using std::map;
+
+#if HAVE_LIBXML
+# include <libxml++/libxml++.h>
+#elif HAVE_TINYXML2
+# include <tinyxml2.h>
+#endif
+
+#include "ospell.h"
+#include "hfst-ol.h"
+
+namespace hfst_ol
+ {
+ //! @brief data type for associating a set of translations with languages.
+ typedef std::map<std::string,std::string> LanguageVersions;
+
+
+ //! @brief ZHfstOspellerInfo represents one info block of a zhfst file.
+ //! @see https://victorio.uit.no/langtech/trunk/plan/proof/doc/lexfile-spec.xml
+ struct ZHfstOspellerInfoMetadata
+ {
+ //! @brief active locale of speller in BCP format
+ std::string locale_;
+ //! @brief translation of titles into all languages
+ LanguageVersions title_;
+ //! @brief translation of descriptions to all languages
+ LanguageVersions description_;
+ //! @brief version definition as a free-form string
+ std::string version_;
+ //! @brief vcs revision as string
+ std::string vcsrev_;
+ //! @brief date for age of speller as string
+ std::string date_;
+ //! @brief producer of the speller
+ std::string producer_;
+ //! @brief email address of the speller
+ std::string email_;
+ //! @brief web address of the speller
+ std::string website_;
+ };
+ //! @brief Represents one acceptor block in XML metadata
+ struct ZHfstOspellerAcceptorMetadata
+ {
+ //! @brief unique id of acceptor
+ std::string id_;
+ //! @brief descr part of acceptor
+ std::string descr_;
+ //! @brief type of dictionary
+ std::string type_;
+ //! @brief type of transducer
+ std::string transtype_;
+ //! @brief titles of dictionary in languages
+ LanguageVersions title_;
+ //! @brief descriptions of dictionary in languages
+ LanguageVersions description_;
+ };
+ //! @brief Represents one errmodel block in XML metadata
+ struct ZHfstOspellerErrModelMetadata
+ {
+ //! @brief id of each error model in set
+ std::string id_;
+ //! @brief descr part of each id
+ std::string descr_;
+ //! @brief title of error models in languages
+ LanguageVersions title_;
+ //! @brief description of error models in languages
+ LanguageVersions description_;
+ //! @brief types of error models
+ std::vector<std::string> type_;
+ //! @brief models
+ std::vector<std::string> model_;
+ };
+ //! @brief holds the index.xml metadata for a whole ospeller
+ class ZHfstOspellerXmlMetadata
+ {
+ public:
+ //! @brief construct metadata for undefined language and other default
+ //! values
+ ZHfstOspellerXmlMetadata();
+ //! @brief read metadata from XML file by @a filename.
+ void read_xml(const std::string& filename);
+ //! @brief read XML from the in-memory @a data pointer of @a data_length bytes
+ //!
+ //! Depending on the XML library compiled in, the data length may
+ //! be ignored and the buffer overrun.
+ void read_xml(const char* data, size_t data_length);
+ //! @brief create a programmer-readable dump of the XML metadata.
+ //!
+ //! Shows a linear serialisation of all header data in arbitrary order.
+ std::string debug_dump() const;
+
+ public:
+ ZHfstOspellerInfoMetadata info_; //!< The info node data
+ //! @brief data for acceptor nodes
+ std::map<std::string,ZHfstOspellerAcceptorMetadata> acceptor_;
+ //! @brief data for errmodel nodes
+ std::vector<ZHfstOspellerErrModelMetadata> errmodel_;
+#if HAVE_LIBXML
+ private:
+ void parse_xml(const xmlpp::Document* doc);
+ void verify_hfstspeller(xmlpp::Node* hfstspellerNode);
+ void parse_info(xmlpp::Node* infoNode);
+ void parse_locale(xmlpp::Node* localeNode);
+ void parse_title(xmlpp::Node* titleNode);
+ void parse_description(xmlpp::Node* descriptionNode);
+ void parse_version(xmlpp::Node* versionNode);
+ void parse_date(xmlpp::Node* dateNode);
+ void parse_producer(xmlpp::Node* producerNode);
+ void parse_contact(xmlpp::Node* contactNode);
+ void parse_acceptor(xmlpp::Node* acceptorNode);
+ void parse_title(xmlpp::Node* titleNode, const std::string& accName);
+ void parse_description(xmlpp::Node* descriptionNode,
+ const std::string& accName);
+ void parse_errmodel(xmlpp::Node* errmodelNode);
+ void parse_title(xmlpp::Node* titleNode, size_t errm_count);
+ void parse_description(xmlpp::Node* descriptionNode, size_t errm_count);
+ void parse_type(xmlpp::Node* typeNode, size_t errm_count);
+ void parse_model(xmlpp::Node* modelNode, size_t errm_count);
+#elif HAVE_TINYXML2
+ private:
+ void parse_xml(const tinyxml2::XMLDocument& doc);
+ void verify_hfstspeller(const tinyxml2::XMLElement& hfstspellerNode);
+ void parse_info(const tinyxml2::XMLElement& infoNode);
+ void parse_locale(const tinyxml2::XMLElement& localeNode);
+ void parse_title(const tinyxml2::XMLElement& titleNode);
+ void parse_description(const tinyxml2::XMLElement& descriptionNode);
+ void parse_version(const tinyxml2::XMLElement& versionNode);
+ void parse_date(const tinyxml2::XMLElement& dateNode);
+ void parse_producer(const tinyxml2::XMLElement& producerNode);
+ void parse_contact(const tinyxml2::XMLElement& contactNode);
+ void parse_acceptor(const tinyxml2::XMLElement& acceptorNode);
+ void parse_title(const tinyxml2::XMLElement& titleNode, const std::string& accName);
+ void parse_description(const tinyxml2::XMLElement& descriptionNode,
+ const std::string& accName);
+ void parse_errmodel(const tinyxml2::XMLElement& errmodelNode);
+ void parse_title(const tinyxml2::XMLElement& titleNode, size_t errm_count);
+ void parse_description(const tinyxml2::XMLElement& descriptionNode, size_t errm_count);
+ void parse_type(const tinyxml2::XMLElement& typeNode, size_t errm_count);
+ void parse_model(const tinyxml2::XMLElement& modelNode, size_t errm_count);
+
+#endif
+ };
+}
+
+#endif // inclusion GUARD
+// vim: set ft=cpp.doxygen:
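
Since info_, acceptor_ and errmodel_ are deliberately public, a host application can inspect them directly after read_xml(). A short illustrative sketch follows; the "index.xml" file name is a placeholder, not something the API mandates, and error handling for malformed input (the parser throws) is omitted for brevity.

    #include <iostream>
    #include <map>
    #include <string>
    #include "ZHfstOspellerXmlMetadata.h"

    int main()
    {
        hfst_ol::ZHfstOspellerXmlMetadata metadata;
        metadata.read_xml("index.xml"); // placeholder file name
        std::cout << "locale: " << metadata.info_.locale_ << std::endl;
        // one acceptor block per descr key
        std::map<std::string, hfst_ol::ZHfstOspellerAcceptorMetadata>::const_iterator acc;
        for (acc = metadata.acceptor_.begin(); acc != metadata.acceptor_.end(); ++acc)
        {
            std::cout << "acceptor " << acc->first
                      << " (id " << acc->second.id_ << ")" << std::endl;
        }
        // error models are kept in document order
        for (size_t i = 0; i < metadata.errmodel_.size(); ++i)
        {
            std::cout << "errmodel " << metadata.errmodel_[i].id_ << std::endl;
        }
        return 0;
    }
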
diff --git a/acceptor.basic.txt b/acceptor.basic.txt
new file mode 100644
index 0000000..ec89513
--- /dev/null
+++ b/acceptor.basic.txt
@@ -0,0 +1,7 @@
+0 1 o o
+1 2 l l
+2 3 u u
+3 4 t t
+4
+0 5 v e
+0 5 s i
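
acceptor.basic.txt, like analyser.default.txt below, is a transition list in the usual AT&T-style tabular form: a four-field line is an arc "source target input output", and a line holding only a state number marks that state as final. A stand-alone reading sketch, assuming exactly that layout (this is illustrative code, not taken from this package):

    #include <iostream>
    #include <sstream>
    #include <string>

    int main()
    {
        std::string line;
        while (std::getline(std::cin, line))
        {
            std::istringstream fields(line);
            long src = 0;
            long dst = 0;
            std::string in, out;
            if (!(fields >> src))
            {
                continue; // skip blank lines
            }
            if (fields >> dst >> in >> out)
            {
                // four fields: an arc with input and output symbols
                std::cout << src << " -> " << dst
                          << " on " << in << ":" << out << std::endl;
            }
            else
            {
                // a lone state number marks a final state
                std::cout << "final: " << src << std::endl;
            }
        }
        return 0;
    }
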
diff --git a/analyse-spell.sh b/analyse-spell.sh
new file mode 100755
index 0000000..6b1f312
--- /dev/null
+++ b/analyse-spell.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+if test -x ./hfst-ospell ; then
+ if ! cat $srcdir/test.strings | ./hfst-ospell -a speller_analyser.zhfst ; then
+ exit 1
+ fi
+else
+ echo ./hfst-ospell not built
+ exit 77
+fi
+
diff --git a/analyser.default.txt b/analyser.default.txt
new file mode 100644
index 0000000..3961b3b
--- /dev/null
+++ b/analyser.default.txt
@@ -0,0 +1,12 @@
+0 1 o o
+0 6 v v
+1 2 l l
+2 3 u u
+3 5 @_EPSILON_SYMBOL_@ +Use/-Spell
+3 4 t t
+4 5 @_EPSILON_SYMBOL_@ +N
+5
+6 7 e e
+7 8 s s
+8 9 i i
+9 5 @_EPSILON_SYMBOL_@ +Use/SpellNoSugg
diff --git a/authors.xml b/authors.xml
new file mode 100644
index 0000000..4429e9c
--- /dev/null
+++ b/authors.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="utf-8"?>
+<authors>
+ <author uid="mie">
+ Tommi A Pirinen <tommi.pirinen at helsinki.fi>
+ </author>
+</authors>
+
diff --git a/autogen.sh b/autogen.sh
new file mode 100755
index 0000000..916bace
--- /dev/null
+++ b/autogen.sh
@@ -0,0 +1,1491 @@
+#!/bin/sh
+# a u t o g e n . s h
+#
+# Copyright (c) 2005-2007 United States Government as represented by
+# the U.S. Army Research Laboratory.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials provided
+# with the distribution.
+#
+# 3. The name of the author may not be used to endorse or promote
+# products derived from this software without specific prior written
+# permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+# OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+###
+#
+# Script for automatically preparing the sources for compilation by
+# performing the myriad of necessary steps. The script attempts to
+# detect proper version support, and outputs warnings about particular
+# systems that have autotool peculiarities.
+#
+# Basically, if everything is set up and installed correctly, the
+# script will validate that minimum versions of the GNU Build System
+# tools are installed, account for several common configuration
+# issues, and then simply run autoreconf for you.
+#
+# If autoreconf fails, which can happen for many valid configurations,
+# this script proceeds to run manual preparation steps effectively
+# providing a POSIX shell script (mostly complete) reimplementation of
+# autoreconf.
+#
+# The AUTORECONF, AUTOCONF, AUTOMAKE, LIBTOOLIZE, ACLOCAL, AUTOHEADER
+# environment variables and corresponding _OPTIONS variables (e.g.
+# AUTORECONF_OPTIONS) may be used to override the default automatic
+# detection behaviors. Similarly the _VERSION variables will override
+# the minimum required version numbers.
+#
+# Examples:
+#
+# To obtain help on usage:
+# ./autogen.sh --help
+#
+# To obtain verbose output:
+# ./autogen.sh --verbose
+#
+# To skip autoreconf and prepare manually:
+# AUTORECONF=false ./autogen.sh
+#
+# To verbosely try running with an older (unsupported) autoconf:
+# AUTOCONF_VERSION=2.50 ./autogen.sh --verbose
+#
+# Author: Christopher Sean Morrison <morrison at brlcad.org>
+#
+######################################################################
+
+# set to minimum acceptable version of autoconf
+if [ "x$AUTOCONF_VERSION" = "x" ] ; then
+ AUTOCONF_VERSION=2.52
+fi
+# set to minimum acceptable version of automake
+if [ "x$AUTOMAKE_VERSION" = "x" ] ; then
+ AUTOMAKE_VERSION=1.6.0
+fi
+# set to minimum acceptable version of libtool
+if [ "x$LIBTOOL_VERSION" = "x" ] ; then
+ LIBTOOL_VERSION=1.4.2
+fi
+
+
+##################
+# ident function #
+##################
+ident ( ) {
+ # extract copyright from header
+ __copyright="`grep Copyright $AUTOGEN_SH | head -${HEAD_N}1 | awk '{print $4}'`"
+ if [ "x$__copyright" = "x" ] ; then
+ __copyright="`date +%Y`"
+ fi
+
+ # extract version from CVS Id string
+ __id="$Id: autogen.sh 920 2011-02-03 00:36:00Z mie $"
+ __version="`echo $__id | sed 's/.*\([0-9][0-9][0-9][0-9]\)[-\/]\([0-9][0-9]\)[-\/]\([0-9][0-9]\).*/\1\2\3/'`"
+ if [ "x$__version" = "x" ] ; then
+ __version=""
+ fi
+
+ echo "autogen.sh build preparation script by Christopher Sean Morrison"
+ echo "revised 3-clause BSD-style license, copyright (c) $__copyright"
+ echo "script version $__version, ISO/IEC 9945 POSIX shell script"
+}
+
+
+##################
+# USAGE FUNCTION #
+##################
+usage ( ) {
+ echo "Usage: $AUTOGEN_SH [-h|--help] [-v|--verbose] [-q|--quiet] [--version]"
+ echo " --help Help on $NAME_OF_AUTOGEN usage"
+ echo " --verbose Verbose progress output"
+ echo " --quiet Quiet suppressed progress output"
+ echo " --version Only perform GNU Build System version checks"
+ echo
+ echo "Description: This script will validate that minimum versions of the"
+ echo "GNU Build System tools are installed and then run autoreconf for you."
+ echo "Should autoreconf fail, manual preparation steps will be run"
+ echo "potentially accounting for several common preparation issues. The"
+ echo "AUTORECONF, AUTOCONF, AUTOMAKE, LIBTOOLIZE, ACLOCAL, AUTOHEADER,"
+ echo "PROJECT, & CONFIGURE environment variables and corresponding _OPTIONS"
+ echo "variables (e.g. AUTORECONF_OPTIONS) may be used to override the"
+ echo "default automatic detection behavior."
+ echo
+
+ ident
+
+ return 0
+}
+
+
+##########################
+# VERSION_ERROR FUNCTION #
+##########################
+version_error ( ) {
+ if [ "x$1" = "x" ] ; then
+ echo "INTERNAL ERROR: version_error was not provided a version"
+ exit 1
+ fi
+ if [ "x$2" = "x" ] ; then
+ echo "INTERNAL ERROR: version_error was not provided an application name"
+ exit 1
+ fi
+ $ECHO
+ $ECHO "ERROR: To prepare the ${PROJECT} build system from scratch,"
+ $ECHO " at least version $1 of $2 must be installed."
+ $ECHO
+ $ECHO "$NAME_OF_AUTOGEN does not need to be run on the same machine that will"
+ $ECHO "run configure or make. Either the GNU Autotools will need to be installed"
+ $ECHO "or upgraded on this system, or $NAME_OF_AUTOGEN must be run on the source"
+ $ECHO "code on another system and then transferred to here. -- Cheers!"
+ $ECHO
+}
+
+##########################
+# VERSION_CHECK FUNCTION #
+##########################
+version_check ( ) {
+ if [ "x$1" = "x" ] ; then
+ echo "INTERNAL ERROR: version_check was not provided a minimum version"
+ exit 1
+ fi
+ _min="$1"
+ if [ "x$2" = "x" ] ; then
+ echo "INTERNAL ERROR: version check was not provided a comparison version"
+ exit 1
+ fi
+ _cur="$2"
+
+ # needed to handle versions like 1.10 and 1.4-p6
+ _min="`echo ${_min}. | sed 's/[^0-9]/./g' | sed 's/\.\././g'`"
+ _cur="`echo ${_cur}. | sed 's/[^0-9]/./g' | sed 's/\.\././g'`"
+
+ _min_major="`echo $_min | cut -d. -f1`"
+ _min_minor="`echo $_min | cut -d. -f2`"
+ _min_patch="`echo $_min | cut -d. -f3`"
+
+ _cur_major="`echo $_cur | cut -d. -f1`"
+ _cur_minor="`echo $_cur | cut -d. -f2`"
+ _cur_patch="`echo $_cur | cut -d. -f3`"
+
+ if [ "x$_min_major" = "x" ] ; then
+ _min_major=0
+ fi
+ if [ "x$_min_minor" = "x" ] ; then
+ _min_minor=0
+ fi
+ if [ "x$_min_patch" = "x" ] ; then
+ _min_patch=0
+ fi
+ if [ "x$_cur_minor" = "x" ] ; then
+ _cur_major=0
+ fi
+ if [ "x$_cur_minor" = "x" ] ; then
+ _cur_minor=0
+ fi
+ if [ "x$_cur_patch" = "x" ] ; then
+ _cur_patch=0
+ fi
+
+ $VERBOSE_ECHO "Checking if ${_cur_major}.${_cur_minor}.${_cur_patch} is greater than ${_min_major}.${_min_minor}.${_min_patch}"
+
+ if [ $_min_major -lt $_cur_major ] ; then
+ return 0
+ elif [ $_min_major -eq $_cur_major ] ; then
+ if [ $_min_minor -lt $_cur_minor ] ; then
+ return 0
+ elif [ $_min_minor -eq $_cur_minor ] ; then
+ if [ $_min_patch -lt $_cur_patch ] ; then
+ return 0
+ elif [ $_min_patch -eq $_cur_patch ] ; then
+ return 0
+ fi
+ fi
+ fi
+ return 1
+}
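
For clarity, the comparison version_check performs is a plain lexicographic test over the major.minor.patch fields. The same logic, restated as a self-contained C++ sketch (illustrative only; the script itself naturally stays in shell):

    #include <iostream>

    // true when current >= minimum, comparing the three fields left to
    // right exactly as the shell function above does
    static bool version_ok(int min_major, int min_minor, int min_patch,
                           int cur_major, int cur_minor, int cur_patch)
    {
        if (cur_major != min_major) return cur_major > min_major;
        if (cur_minor != min_minor) return cur_minor > min_minor;
        return cur_patch >= min_patch;
    }

    int main()
    {
        // e.g. automake 1.10.0 against the required minimum 1.6.0
        std::cout << (version_ok(1, 6, 0, 1, 10, 0) ? "ok" : "too old")
                  << std::endl;
        return 0;
    }
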
+
+
+######################################
+# LOCATE_CONFIGURE_TEMPLATE FUNCTION #
+######################################
+locate_configure_template ( ) {
+ _pwd="`pwd`"
+ if test -f "./configure.ac" ; then
+ echo "./configure.ac"
+ elif test -f "./configure.in" ; then
+ echo "./configure.in"
+ elif test -f "$_pwd/configure.ac" ; then
+ echo "$_pwd/configure.ac"
+ elif test -f "$_pwd/configure.in" ; then
+ echo "$_pwd/configure.in"
+ elif test -f "$PATH_TO_AUTOGEN/configure.ac" ; then
+ echo "$PATH_TO_AUTOGEN/configure.ac"
+ elif test -f "$PATH_TO_AUTOGEN/configure.in" ; then
+ echo "$PATH_TO_AUTOGEN/configure.in"
+ fi
+}
+
+
+##################
+# argument check #
+##################
+ARGS="$*"
+PATH_TO_AUTOGEN="`dirname $0`"
+NAME_OF_AUTOGEN="`basename $0`"
+AUTOGEN_SH="$PATH_TO_AUTOGEN/$NAME_OF_AUTOGEN"
+
+LIBTOOL_M4="${PATH_TO_AUTOGEN}/misc/libtool.m4"
+
+if [ "x$HELP" = "x" ] ; then
+ HELP=no
+fi
+if [ "x$QUIET" = "x" ] ; then
+ QUIET=no
+fi
+if [ "x$VERBOSE" = "x" ] ; then
+ VERBOSE=no
+fi
+if [ "x$VERSION_ONLY" = "x" ] ; then
+ VERSION_ONLY=no
+fi
+if [ "x$AUTORECONF_OPTIONS" = "x" ] ; then
+ AUTORECONF_OPTIONS="-i -f"
+fi
+if [ "x$AUTOCONF_OPTIONS" = "x" ] ; then
+ AUTOCONF_OPTIONS="-f"
+fi
+if [ "x$AUTOMAKE_OPTIONS" = "x" ] ; then
+ AUTOMAKE_OPTIONS="-a -c -f"
+fi
+ALT_AUTOMAKE_OPTIONS="-a -c"
+if [ "x$LIBTOOLIZE_OPTIONS" = "x" ] ; then
+ LIBTOOLIZE_OPTIONS="--automake -c -f"
+fi
+ALT_LIBTOOLIZE_OPTIONS="--automake --copy --force"
+if [ "x$ACLOCAL_OPTIONS" = "x" ] ; then
+ ACLOCAL_OPTIONS=""
+fi
+if [ "x$AUTOHEADER_OPTIONS" = "x" ] ; then
+ AUTOHEADER_OPTIONS=""
+fi
+for arg in $ARGS ; do
+ case "x$arg" in
+ x--help) HELP=yes ;;
+ x-[hH]) HELP=yes ;;
+ x--quiet) QUIET=yes ;;
+ x-[qQ]) QUIET=yes ;;
+ x--verbose) VERBOSE=yes ;;
+ x-[vV]) VERBOSE=yes ;;
+ x--version) VERSION_ONLY=yes ;;
+ *)
+ echo "Unknown option: $arg"
+ echo
+ usage
+ exit 1
+ ;;
+ esac
+done
+
+
+#####################
+# environment check #
+#####################
+
+# sanity check before recursions potentially begin
+if [ ! -f "$AUTOGEN_SH" ] ; then
+ echo "INTERNAL ERROR: $AUTOGEN_SH does not exist"
+ if [ ! "x$0" = "x$AUTOGEN_SH" ] ; then
+ echo "INTERNAL ERROR: dirname/basename inconsistency: $0 != $AUTOGEN_SH"
+ fi
+ exit 1
+fi
+
+# force locale setting to C so that things like date are output as expected
+LC_ALL=C
+
+# commands that this script expects
+for __cmd in echo head tail pwd ; do
+ echo "test" | $__cmd > /dev/null 2>&1
+ if [ $? != 0 ] ; then
+ echo "INTERNAL ERROR: '${__cmd}' command is required"
+ exit 2
+ fi
+done
+echo "test" | grep "test" > /dev/null 2>&1
+if test ! x$? = x0 ; then
+ echo "INTERNAL ERROR: grep command is required"
+ exit 1
+fi
+echo "test" | sed "s/test/test/" > /dev/null 2>&1
+if test ! x$? = x0 ; then
+ echo "INTERNAL ERROR: sed command is required"
+ exit 1
+fi
+
+
+# determine the behavior of echo
+case `echo "testing\c"; echo 1,2,3`,`echo -n testing; echo 1,2,3` in
+ *c*,-n*) ECHO_N= ECHO_C='
+' ECHO_T=' ' ;;
+ *c*,* ) ECHO_N=-n ECHO_C= ECHO_T= ;;
+ *) ECHO_N= ECHO_C='\c' ECHO_T= ;;
+esac
+
+# determine the behavior of head
+case "x`echo 'head' | head -n 1 2>&1`" in
+ *xhead*) HEAD_N="n " ;;
+ *) HEAD_N="" ;;
+esac
+
+# determine the behavior of tail
+case "x`echo 'tail' | tail -n 1 2>&1`" in
+ *xtail*) TAIL_N="n " ;;
+ *) TAIL_N="" ;;
+esac
+
+VERBOSE_ECHO=:
+ECHO=:
+if [ "x$QUIET" = "xyes" ] ; then
+ if [ "x$VERBOSE" = "xyes" ] ; then
+ echo "Verbose output quelled by quiet option. Further output disabled."
+ fi
+else
+ ECHO=echo
+ if [ "x$VERBOSE" = "xyes" ] ; then
+ echo "Verbose output enabled"
+ VERBOSE_ECHO=echo
+ fi
+fi
+
+
+# allow a recursive run to disable further recursions
+if [ "x$RUN_RECURSIVE" = "x" ] ; then
+ RUN_RECURSIVE=yes
+fi
+
+
+################################################
+# check for help arg and bypass version checks #
+################################################
+if [ "x`echo $ARGS | sed 's/.*[hH][eE][lL][pP].*/help/'`" = "xhelp" ] ; then
+ HELP=yes
+fi
+if [ "x$HELP" = "xyes" ] ; then
+ usage
+ $ECHO "---"
+ $ECHO "Help was requested. No preparation or configuration will be performed."
+ exit 0
+fi
+
+
+#######################
+# set up signal traps #
+#######################
+untrap_abnormal ( ) {
+ for sig in 1 2 13 15; do
+ trap - $sig
+ done
+}
+
+# do this cleanup whenever we exit.
+trap '
+ # start from the root
+ if test -d "$START_PATH" ; then
+ cd "$START_PATH"
+ fi
+
+ # restore/delete backup files
+ if test "x$PFC_INIT" = "x1" ; then
+ recursive_restore
+ fi
+' 0
+
+# trap SIGHUP (1), SIGINT (2), SIGPIPE (13), SIGTERM (15)
+for sig in 1 2 13 15; do
+ trap '
+ $ECHO ""
+ $ECHO "Aborting $NAME_OF_AUTOGEN: caught signal '$sig'"
+
+ # start from the root
+ if test -d "$START_PATH" ; then
+ cd "$START_PATH"
+ fi
+
+ # clean up on abnormal exit
+ $VERBOSE_ECHO "rm -rf autom4te.cache"
+ rm -rf autom4te.cache
+
+ if test -f "acinclude.m4.$$.backup" ; then
+ $VERBOSE_ECHO "cat acinclude.m4.$$.backup > acinclude.m4"
+ chmod u+w acinclude.m4
+ cat acinclude.m4.$$.backup > acinclude.m4
+
+ $VERBOSE_ECHO "rm -f acinclude.m4.$$.backup"
+ rm -f acinclude.m4.$$.backup
+ fi
+
+ { (exit 1); exit 1; }
+' $sig
+done
+
+
+#############################
+# look for a configure file #
+#############################
+if [ "x$CONFIGURE" = "x" ] ; then
+ CONFIGURE="`locate_configure_template`"
+ if [ ! "x$CONFIGURE" = "x" ] ; then
+ $VERBOSE_ECHO "Found a configure template: $CONFIGURE"
+ fi
+else
+ $ECHO "Using CONFIGURE environment variable override: $CONFIGURE"
+fi
+if [ "x$CONFIGURE" = "x" ] ; then
+ if [ "x$VERSION_ONLY" = "xyes" ] ; then
+ CONFIGURE=/dev/null
+ else
+ $ECHO
+ $ECHO "A configure.ac or configure.in file could not be located implying"
+ $ECHO "that the GNU Build System is at least not used in this directory. In"
+ $ECHO "any case, there is nothing to do here without one of those files."
+ $ECHO
+ $ECHO "ERROR: No configure.in or configure.ac file found in `pwd`"
+ exit 1
+ fi
+fi
+
+####################
+# get project name #
+####################
+if [ "x$PROJECT" = "x" ] ; then
+ PROJECT="`grep AC_INIT $CONFIGURE | grep -v '.*#.*AC_INIT' | tail -${TAIL_N}1 | sed 's/^[ ]*AC_INIT(\([^,)]*\).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
+ if [ "x$PROJECT" = "xAC_INIT" ] ; then
+ # projects might be using the older/deprecated arg-less AC_INIT .. look for AM_INIT_AUTOMAKE instead
+ PROJECT="`grep AM_INIT_AUTOMAKE $CONFIGURE | grep -v '.*#.*AM_INIT_AUTOMAKE' | tail -${TAIL_N}1 | sed 's/^[ ]*AM_INIT_AUTOMAKE(\([^,)]*\).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
+ fi
+ if [ "x$PROJECT" = "xAM_INIT_AUTOMAKE" ] ; then
+ PROJECT="project"
+ fi
+ if [ "x$PROJECT" = "x" ] ; then
+ PROJECT="project"
+ fi
+else
+ $ECHO "Using PROJECT environment variable override: $PROJECT"
+fi
+$ECHO "Preparing the $PROJECT build system...please wait"
+$ECHO
+
+
+########################
+# check for autoreconf #
+########################
+HAVE_AUTORECONF=no
+if [ "x$AUTORECONF" = "x" ] ; then
+ for AUTORECONF in autoreconf ; do
+ $VERBOSE_ECHO "Checking autoreconf version: $AUTORECONF --version"
+ $AUTORECONF --version > /dev/null 2>&1
+ if [ $? = 0 ] ; then
+ HAVE_AUTORECONF=yes
+ break
+ fi
+ done
+else
+ HAVE_AUTORECONF=yes
+ $ECHO "Using AUTORECONF environment variable override: $AUTORECONF"
+fi
+
+
+##########################
+# autoconf version check #
+##########################
+_acfound=no
+if [ "x$AUTOCONF" = "x" ] ; then
+ for AUTOCONF in autoconf ; do
+ $VERBOSE_ECHO "Checking autoconf version: $AUTOCONF --version"
+ $AUTOCONF --version > /dev/null 2>&1
+ if [ $? = 0 ] ; then
+ _acfound=yes
+ break
+ fi
+ done
+else
+ _acfound=yes
+ $ECHO "Using AUTOCONF environment variable override: $AUTOCONF"
+fi
+
+_report_error=no
+if [ ! "x$_acfound" = "xyes" ] ; then
+ $ECHO "ERROR: Unable to locate GNU Autoconf."
+ _report_error=yes
+else
+ _version="`$AUTOCONF --version | head -${HEAD_N}1 | sed 's/[^0-9]*\([0-9\.][0-9\.]*\)/\1/'`"
+ if [ "x$_version" = "x" ] ; then
+ _version="0.0.0"
+ fi
+ $ECHO "Found GNU Autoconf version $_version"
+ version_check "$AUTOCONF_VERSION" "$_version"
+ if [ $? -ne 0 ] ; then
+ _report_error=yes
+ fi
+fi
+if [ "x$_report_error" = "xyes" ] ; then
+ version_error "$AUTOCONF_VERSION" "GNU Autoconf"
+ exit 1
+fi
+
+
+##########################
+# automake version check #
+##########################
+_amfound=no
+if [ "x$AUTOMAKE" = "x" ] ; then
+ for AUTOMAKE in automake ; do
+ $VERBOSE_ECHO "Checking automake version: $AUTOMAKE --version"
+ $AUTOMAKE --version > /dev/null 2>&1
+ if [ $? = 0 ] ; then
+ _amfound=yes
+ break
+ fi
+ done
+else
+ _amfound=yes
+ $ECHO "Using AUTOMAKE environment variable override: $AUTOMAKE"
+fi
+
+
+_report_error=no
+if [ ! "x$_amfound" = "xyes" ] ; then
+ $ECHO
+ $ECHO "ERROR: Unable to locate GNU Automake."
+ _report_error=yes
+else
+ _version="`$AUTOMAKE --version | head -${HEAD_N}1 | sed 's/[^0-9]*\([0-9\.][0-9\.]*\)/\1/'`"
+ if [ "x$_version" = "x" ] ; then
+ _version="0.0.0"
+ fi
+ $ECHO "Found GNU Automake version $_version"
+ version_check "$AUTOMAKE_VERSION" "$_version"
+ if [ $? -ne 0 ] ; then
+ _report_error=yes
+ fi
+fi
+if [ "x$_report_error" = "xyes" ] ; then
+ version_error "$AUTOMAKE_VERSION" "GNU Automake"
+ exit 1
+fi
+
+
+########################
+# check for libtoolize #
+########################
+HAVE_LIBTOOLIZE=yes
+HAVE_ALT_LIBTOOLIZE=no
+_ltfound=no
+if [ "x$LIBTOOLIZE" = "x" ] ; then
+ LIBTOOLIZE=libtoolize
+ $VERBOSE_ECHO "Checking libtoolize version: $LIBTOOLIZE --version"
+ $LIBTOOLIZE --version > /dev/null 2>&1
+ if [ ! $? = 0 ] ; then
+ HAVE_LIBTOOLIZE=no
+ $ECHO
+ if [ "x$HAVE_AUTORECONF" = "xno" ] ; then
+ $ECHO "Warning: libtoolize does not appear to be available."
+ else
+ $ECHO "Warning: libtoolize does not appear to be available. This means that"
+ $ECHO "the automatic build preparation via autoreconf will probably not work."
+ $ECHO "Preparing the build by running each step individually, however, should"
+ $ECHO "work and will be done automatically for you if autoreconf fails."
+ fi
+
+ # look for some alternates
+ for tool in glibtoolize libtoolize15 libtoolize14 libtoolize13 ; do
+ $VERBOSE_ECHO "Checking libtoolize alternate: $tool --version"
+ _glibtoolize="`$tool --version > /dev/null 2>&1`"
+ if [ $? = 0 ] ; then
+ $VERBOSE_ECHO "Found $tool --version"
+ _glti="`which $tool`"
+ if [ "x$_glti" = "x" ] ; then
+ $VERBOSE_ECHO "Cannot find $tool with which"
+ continue;
+ fi
+ if test ! -f "$_glti" ; then
+ $VERBOSE_ECHO "Cannot use $tool, $_glti is not a file"
+ continue;
+ fi
+ _gltidir="`dirname $_glti`"
+ if [ "x$_gltidir" = "x" ] ; then
+ $VERBOSE_ECHO "Cannot find $tool path with dirname of $_glti"
+ continue;
+ fi
+ if test ! -d "$_gltidir" ; then
+ $VERBOSE_ECHO "Cannot use $tool, $_gltidir is not a directory"
+ continue;
+ fi
+ HAVE_ALT_LIBTOOLIZE=yes
+ LIBTOOLIZE="$tool"
+ $ECHO
+ $ECHO "Fortunately, $tool was found which means that your system may simply"
+ $ECHO "have a non-standard or incomplete GNU Autotools install. If you have"
+ $ECHO "sufficient system access, it may be possible to quell this warning by"
+ $ECHO "running:"
+ $ECHO
+ sudo -V > /dev/null 2>&1
+ if [ $? = 0 ] ; then
+ $ECHO " sudo ln -s $_glti $_gltidir/libtoolize"
+ $ECHO
+ else
+ $ECHO " ln -s $_glti $_gltidir/libtoolize"
+ $ECHO
+ $ECHO "Run that as root or with proper permissions to the $_gltidir directory"
+ $ECHO
+ fi
+ _ltfound=yes
+ break
+ fi
+ done
+ else
+ _ltfound=yes
+ fi
+else
+ _ltfound=yes
+ $ECHO "Using LIBTOOLIZE environment variable override: $LIBTOOLIZE"
+fi
+
+
+############################
+# libtoolize version check #
+############################
+_report_error=no
+if [ ! "x$_ltfound" = "xyes" ] ; then
+ $ECHO
+ $ECHO "ERROR: Unable to locate GNU Libtool."
+ _report_error=yes
+else
+ _version="`$LIBTOOLIZE --version | head -${HEAD_N}1 | sed 's/[^0-9]*\([0-9\.][0-9\.]*\)/\1/'`"
+ if [ "x$_version" = "x" ] ; then
+ _version="0.0.0"
+ fi
+ $ECHO "Found GNU Libtool version $_version"
+ version_check "$LIBTOOL_VERSION" "$_version"
+ if [ $? -ne 0 ] ; then
+ _report_error=yes
+ fi
+fi
+if [ "x$_report_error" = "xyes" ] ; then
+ version_error "$LIBTOOL_VERSION" "GNU Libtool"
+ exit 1
+fi
+
+
+#####################
+# check for aclocal #
+#####################
+if [ "x$ACLOCAL" = "x" ] ; then
+ for ACLOCAL in aclocal ; do
+ $VERBOSE_ECHO "Checking aclocal version: $ACLOCAL --version"
+ $ACLOCAL --version > /dev/null 2>&1
+ if [ $? = 0 ] ; then
+ break
+ fi
+ done
+else
+ $ECHO "Using ACLOCAL environment variable override: $ACLOCAL"
+fi
+
+
+########################
+# check for autoheader #
+########################
+if [ "x$AUTOHEADER" = "x" ] ; then
+ for AUTOHEADER in autoheader ; do
+ $VERBOSE_ECHO "Checking autoheader version: $AUTOHEADER --version"
+ $AUTOHEADER --version > /dev/null 2>&1
+ if [ $? = 0 ] ; then
+ break
+ fi
+ done
+else
+ $ECHO "Using AUTOHEADER environment variable override: $AUTOHEADER"
+fi
+
+
+#########################
+# check if version only #
+#########################
+$VERBOSE_ECHO "Checking whether to only output version information"
+if [ "x$VERSION_ONLY" = "xyes" ] ; then
+ $ECHO
+ ident
+ $ECHO "---"
+ $ECHO "Version requested. No preparation or configuration will be performed."
+ exit 0
+fi
+
+
+#################################
+# PROTECT_FROM_CLOBBER FUNCTION #
+#################################
+protect_from_clobber ( ) {
+ PFC_INIT=1
+
+ # protect COPYING & INSTALL from overwrite by automake. the
+ # automake force option will (inappropriately) ignore the existing
+ # contents of a COPYING and/or INSTALL files (depending on the
+ # version) instead of just forcing *missing* files like it does
+ # for AUTHORS, NEWS, and README. this is broken but extremely
+ # prevalent behavior, so we protect against it by keeping a backup
+ # of the file that can later be restored.
+
+ if test -f COPYING ; then
+ if test -f COPYING.$$.protect_from_automake.backup ; then
+ $VERBOSE_ECHO "Already backed up COPYING in `pwd`"
+ else
+ $VERBOSE_ECHO "Backing up COPYING in `pwd`"
+ $VERBOSE_ECHO "cp -p COPYING COPYING.$$.protect_from_automake.backup"
+ cp -p COPYING COPYING.$$.protect_from_automake.backup
+ fi
+ fi
+ if test -f INSTALL ; then
+ if test -f INSTALL.$$.protect_from_automake.backup ; then
+ $VERBOSE_ECHO "Already backed up INSTALL in `pwd`"
+ else
+ $VERBOSE_ECHO "Backing up INSTALL in `pwd`"
+ $VERBOSE_ECHO "cp -p INSTALL INSTALL.$$.protect_from_automake.backup"
+ cp -p INSTALL INSTALL.$$.protect_from_automake.backup
+ fi
+ fi
+}
+
+
+##############################
+# RECURSIVE_PROTECT FUNCTION #
+##############################
+recursive_protect ( ) {
+
+ # for projects using recursive configure, run the build
+ # preparation steps for the subdirectories. this function assumes
+ # START_PATH was set to pwd before recursion begins so that
+ # relative paths work.
+
+ # git 'r done, protect COPYING and INSTALL from being clobbered
+ protect_from_clobber
+
+ if test -d autom4te.cache ; then
+ $VERBOSE_ECHO "Found an autom4te.cache directory, deleting it"
+ $VERBOSE_ECHO "rm -rf autom4te.cache"
+ rm -rf autom4te.cache
+ fi
+
+ # find configure template
+ _configure="`locate_configure_template`"
+ if [ "x$_configure" = "x" ] ; then
+ return
+ fi
+ # $VERBOSE_ECHO "Looking for configure template found `pwd`/$_configure"
+
+ # look for subdirs
+ # $VERBOSE_ECHO "Looking for subdirs in `pwd`"
+ _det_config_subdirs="`grep AC_CONFIG_SUBDIRS $_configure | grep -v '.*#.*AC_CONFIG_SUBDIRS' | sed 's/^[ ]*AC_CONFIG_SUBDIRS(\(.*\)).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
+ CHECK_DIRS=""
+ for dir in $_det_config_subdirs ; do
+ if test -d "`pwd`/$dir" ; then
+ CHECK_DIRS="$CHECK_DIRS \"`pwd`/$dir\""
+ fi
+ done
+
+ # process subdirs
+ if [ ! "x$CHECK_DIRS" = "x" ] ; then
+ $VERBOSE_ECHO "Recursively scanning the following directories:"
+ $VERBOSE_ECHO " $CHECK_DIRS"
+ for dir in $CHECK_DIRS ; do
+ $VERBOSE_ECHO "Protecting files from automake in $dir"
+ cd "$START_PATH"
+ eval "cd $dir"
+
+ # recursively git 'r done
+ recursive_protect
+ done
+ fi
+} # end of recursive_protect
+
+
+##############################
+# RESTORE_CLOBBERED FUNCTION #
+##############################
+restore_clobbered ( ) {
+
+ # The automake (and autoreconf by extension) -f/--force-missing
+ # option may overwrite COPYING and INSTALL even if they do exist.
+ # Here we restore the files if necessary.
+
+ spacer=no
+
+ # COPYING
+ if test -f COPYING.$$.protect_from_automake.backup ; then
+ if test -f COPYING ; then
+ # compare entire content, restore if needed
+ if test "x`cat COPYING`" != "x`cat COPYING.$$.protect_from_automake.backup`" ; then
+ if test "x$spacer" = "xno" ; then
+ $VERBOSE_ECHO
+ spacer=yes
+ fi
+ # restore the backup
+ $VERBOSE_ECHO "Restoring COPYING from backup (automake -f likely clobbered it)"
+ $VERBOSE_ECHO "rm -f COPYING"
+ rm -f COPYING
+ $VERBOSE_ECHO "mv COPYING.$$.protect_from_automake.backup COPYING"
+ mv COPYING.$$.protect_from_automake.backup COPYING
+ fi # check contents
+ elif test -f COPYING.$$.protect_from_automake.backup ; then
+ $VERBOSE_ECHO "mv COPYING.$$.protect_from_automake.backup COPYING"
+ mv COPYING.$$.protect_from_automake.backup COPYING
+ fi # -f COPYING
+
+ # just in case
+ $VERBOSE_ECHO "rm -f COPYING.$$.protect_from_automake.backup"
+ rm -f COPYING.$$.protect_from_automake.backup
+ fi # -f COPYING.$$.protect_from_automake.backup
+
+ # INSTALL
+ if test -f INSTALL.$$.protect_from_automake.backup ; then
+ if test -f INSTALL ; then
+ # compare entire content, restore if needed
+ if test "x`cat INSTALL`" != "x`cat INSTALL.$$.protect_from_automake.backup`" ; then
+ if test "x$spacer" = "xno" ; then
+ $VERBOSE_ECHO
+ spacer=yes
+ fi
+ # restore the backup
+ $VERBOSE_ECHO "Restoring INSTALL from backup (automake -f likely clobbered it)"
+ $VERBOSE_ECHO "rm -f INSTALL"
+ rm -f INSTALL
+ $VERBOSE_ECHO "mv INSTALL.$$.protect_from_automake.backup INSTALL"
+ mv INSTALL.$$.protect_from_automake.backup INSTALL
+ fi # check contents
+ elif test -f INSTALL.$$.protect_from_automake.backup ; then
+ $VERBOSE_ECHO "mv INSTALL.$$.protect_from_automake.backup INSTALL"
+ mv INSTALL.$$.protect_from_automake.backup INSTALL
+ fi # -f INSTALL
+
+ # just in case
+ $VERBOSE_ECHO "rm -f INSTALL.$$.protect_from_automake.backup"
+ rm -f INSTALL.$$.protect_from_automake.backup
+ fi # -f INSTALL.$$.protect_from_automake.backup
+
+ CONFIGURE="`locate_configure_template`"
+ if [ "x$CONFIGURE" = "x" ] ; then
+ return
+ fi
+
+ _aux_dir="`grep AC_CONFIG_AUX_DIR $CONFIGURE | grep -v '.*#.*AC_CONFIG_AUX_DIR' | tail -${TAIL_N}1 | sed 's/^[ ]*AC_CONFIG_AUX_DIR(\(.*\)).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
+ if test ! -d "$_aux_dir" ; then
+ _aux_dir=.
+ fi
+
+ for file in config.guess config.sub ltmain.sh ; do
+ if test -f "${_aux_dir}/${file}" ; then
+ $VERBOSE_ECHO "rm -f \"${_aux_dir}/${file}.backup\""
+ rm -f "${_aux_dir}/${file}.backup"
+ fi
+ done
+} # end of restore_clobbered
+
+
+##############################
+# RECURSIVE_RESTORE FUNCTION #
+##############################
+recursive_restore ( ) {
+
+ # restore COPYING and INSTALL from backup if they were clobbered
+ # for each directory recursively.
+
+ # git 'r undone
+ restore_clobbered
+
+ # find configure template
+ _configure="`locate_configure_template`"
+ if [ "x$_configure" = "x" ] ; then
+ return
+ fi
+
+ # look for subdirs
+ _det_config_subdirs="`grep AC_CONFIG_SUBDIRS $_configure | grep -v '.*#.*AC_CONFIG_SUBDIRS' | sed 's/^[ ]*AC_CONFIG_SUBDIRS(\(.*\)).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
+ CHECK_DIRS=""
+ for dir in $_det_config_subdirs ; do
+ if test -d "`pwd`/$dir" ; then
+ CHECK_DIRS="$CHECK_DIRS \"`pwd`/$dir\""
+ fi
+ done
+
+ # process subdirs
+ if [ ! "x$CHECK_DIRS" = "x" ] ; then
+ $VERBOSE_ECHO "Recursively scanning the following directories:"
+ $VERBOSE_ECHO " $CHECK_DIRS"
+ for dir in $CHECK_DIRS ; do
+ $VERBOSE_ECHO "Checking files for automake damage in $dir"
+ cd "$START_PATH"
+ eval "cd $dir"
+
+ # recursively git 'r undone
+ recursive_restore
+ done
+ fi
+} # end of recursive_restore
+
+
+#######################
+# INITIALIZE FUNCTION #
+#######################
+initialize ( ) {
+
+ # this routine performs a variety of directory-specific
+ # initializations. some are sanity checks, some are preventive,
+ # and some are necessary setup detection.
+ #
+ # this function sets:
+ # CONFIGURE
+ # SEARCH_DIRS
+ # CONFIG_SUBDIRS
+
+ ##################################
+ # check for a configure template #
+ ##################################
+ CONFIGURE="`locate_configure_template`"
+ if [ "x$CONFIGURE" = "x" ] ; then
+ $ECHO
+    $ECHO "A configure.ac or configure.in file could not be located, implying"
+    $ECHO "that the GNU Build System is not used in this directory. In any"
+    $ECHO "case, there is nothing to do here without one of those files."
+ $ECHO
+ $ECHO "ERROR: No configure.in or configure.ac file found in `pwd`"
+ exit 1
+ fi
+
+ #####################
+ # detect an aux dir #
+ #####################
+ _aux_dir="`grep AC_CONFIG_AUX_DIR $CONFIGURE | grep -v '.*#.*AC_CONFIG_AUX_DIR' | tail -${TAIL_N}1 | sed 's/^[ ]*AC_CONFIG_AUX_DIR(\(.*\)).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
+ if test ! -d "$_aux_dir" ; then
+ _aux_dir=.
+ else
+        $VERBOSE_ECHO "Detected auxiliary directory: $_aux_dir"
+ fi
+
+ ################################
+ # detect a recursive configure #
+ ################################
+ CONFIG_SUBDIRS=""
+ _det_config_subdirs="`grep AC_CONFIG_SUBDIRS $CONFIGURE | grep -v '.*#.*AC_CONFIG_SUBDIRS' | sed 's/^[ ]*AC_CONFIG_SUBDIRS(\(.*\)).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
+ for dir in $_det_config_subdirs ; do
+ if test -d "`pwd`/$dir" ; then
+ $VERBOSE_ECHO "Detected recursive configure directory: `pwd`/$dir"
+ CONFIG_SUBDIRS="$CONFIG_SUBDIRS `pwd`/$dir"
+ fi
+ done
+
+ ##########################################
+ # make sure certain required files exist #
+ ##########################################
+ for file in AUTHORS COPYING ChangeLog INSTALL NEWS README ; do
+ if test ! -f $file ; then
+ $VERBOSE_ECHO "Touching ${file} since it does not exist"
+ touch $file
+ fi
+ done
+
+ ##################################################
+ # make sure certain generated files do not exist #
+ ##################################################
+ for file in config.guess config.sub ltmain.sh ; do
+ if test -f "${_aux_dir}/${file}" ; then
+ $VERBOSE_ECHO "mv -f \"${_aux_dir}/${file}\" \"${_aux_dir}/${file}.backup\""
+ mv -f "${_aux_dir}/${file}" "${_aux_dir}/${file}.backup"
+ fi
+ done
+
+ ############################
+ # search alternate m4 dirs #
+ ############################
+ SEARCH_DIRS=""
+ for dir in m4 ; do
+ if [ -d $dir ] ; then
+ $VERBOSE_ECHO "Found extra aclocal search directory: $dir"
+ SEARCH_DIRS="$SEARCH_DIRS -I $dir"
+ fi
+ done
+
+ ######################################
+ # remove any previous build products #
+ ######################################
+ if test -d autom4te.cache ; then
+ $VERBOSE_ECHO "Found an autom4te.cache directory, deleting it"
+ $VERBOSE_ECHO "rm -rf autom4te.cache"
+ rm -rf autom4te.cache
+ fi
+# tcl/tk (and probably others) have a customized aclocal.m4, so can't delete it
+# if test -f aclocal.m4 ; then
+# $VERBOSE_ECHO "Found an aclocal.m4 file, deleting it"
+# $VERBOSE_ECHO "rm -f aclocal.m4"
+# rm -f aclocal.m4
+# fi
+
+} # end of initialize()
+
+
+##############
+# initialize #
+##############
+
+# stash path
+START_PATH="`pwd`"
+
+# Before running autoreconf or manual steps, some prep detection work
+# is necessary or useful. Only needs to occur once per directory, but
+# does need to traverse the entire subconfigure hierarchy to protect
+# files from being clobbered even by autoreconf.
+recursive_protect
+
+# start from where we started
+cd "$START_PATH"
+
+# get ready to process
+initialize
+
+
+############################################
+# prepare build via autoreconf or manually #
+############################################
+reconfigure_manually=no
+if [ "x$HAVE_AUTORECONF" = "xyes" ] ; then
+ $ECHO
+ $ECHO $ECHO_N "Automatically preparing build ... $ECHO_C"
+
+ $VERBOSE_ECHO "$AUTORECONF $SEARCH_DIRS $AUTORECONF_OPTIONS"
+ autoreconf_output="`$AUTORECONF $SEARCH_DIRS $AUTORECONF_OPTIONS 2>&1`"
+ ret=$?
+ $VERBOSE_ECHO "$autoreconf_output"
+
+ if [ ! $ret = 0 ] ; then
+ if [ "x$HAVE_ALT_LIBTOOLIZE" = "xyes" ] ; then
+ if [ ! "x`echo \"$autoreconf_output\" | grep libtoolize | grep \"No such file or directory\"`" = "x" ] ; then
+ $ECHO
+            $ECHO "Warning: autoreconf failed, apparently due to a common libtool"
+            $ECHO "misconfiguration issue. This problem is encountered on systems that"
+ $ECHO "have installed libtoolize under a different name without providing a"
+ $ECHO "symbolic link or without setting the LIBTOOLIZE environment variable."
+ $ECHO
+ $ECHO "Restarting the preparation steps with LIBTOOLIZE set to $LIBTOOLIZE"
+
+ export LIBTOOLIZE
+ RUN_RECURSIVE=no
+ export RUN_RECURSIVE
+ untrap_abnormal
+
+ $VERBOSE_ECHO sh $AUTOGEN_SH "$1" "$2" "$3" "$4" "$5" "$6" "$7" "$8" "$9"
+ sh "$AUTOGEN_SH" "$1" "$2" "$3" "$4" "$5" "$6" "$7" "$8" "$9"
+ exit $?
+ fi
+ fi
+
+ $ECHO "Warning: $AUTORECONF failed"
+
+ if test -f ltmain.sh ; then
+        $ECHO "libtoolize being run by autoreconf is not creating ltmain.sh in the auxiliary directory as it should"
+ fi
+
+ $ECHO "Attempting to run the preparation steps individually"
+ reconfigure_manually=yes
+ fi
+else
+ reconfigure_manually=yes
+fi
+
+
+############################
+# LIBTOOL_FAILURE FUNCTION #
+############################
+libtool_failure ( ) {
+
+ # libtool is rather error-prone in comparison to the other
+ # autotools and this routine attempts to compensate for some
+ # common failures. the output after a libtoolize failure is
+ # parsed for an error related to AC_PROG_LIBTOOL and if found, we
+ # attempt to inject a project-provided libtool.m4 file.
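+    # the libtoolize-related failure being matched typically looks like:
+    #   configure.ac:NN: error: possibly undefined macro: AC_PROG_LIBTOOL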
+
+ _autoconf_output="$1"
+
+ if [ "x$RUN_RECURSIVE" = "xno" ] ; then
+ # we already tried the libtool.m4, don't try again
+ return 1
+ fi
+
+ if test -f "$LIBTOOL_M4" ; then
+ found_libtool="`$ECHO $_autoconf_output | grep AC_PROG_LIBTOOL`"
+ if test ! "x$found_libtool" = "x" ; then
+ if test -f acinclude.m4 ; then
+ rm -f acinclude.m4.$$.backup
+ $VERBOSE_ECHO "cat acinclude.m4 > acinclude.m4.$$.backup"
+ cat acinclude.m4 > acinclude.m4.$$.backup
+ fi
+ $VERBOSE_ECHO "cat \"$LIBTOOL_M4\" >> acinclude.m4"
+ chmod u+w acinclude.m4
+ cat "$LIBTOOL_M4" >> acinclude.m4
+
+ # don't keep doing this
+ RUN_RECURSIVE=no
+ export RUN_RECURSIVE
+ untrap_abnormal
+
+ $ECHO
+ $ECHO "Restarting the preparation steps with libtool macros in acinclude.m4"
+ $VERBOSE_ECHO sh $AUTOGEN_SH "$1" "$2" "$3" "$4" "$5" "$6" "$7" "$8" "$9"
+ sh "$AUTOGEN_SH" "$1" "$2" "$3" "$4" "$5" "$6" "$7" "$8" "$9"
+ exit $?
+ fi
+ fi
+}
+
+
+###########################
+# MANUAL_AUTOGEN FUNCTION #
+###########################
+manual_autogen ( ) {
+
+ ##################################################
+ # Manual preparation steps taken are as follows: #
+ # aclocal [-I m4] #
+ # libtoolize --automake -c -f #
+ # aclocal [-I m4] #
+ # autoconf -f #
+ # autoheader #
+ # automake -a -c -f #
+ ##################################################
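+    # Taken together these are roughly equivalent to a single
+    # "autoreconf -f -i [-I m4]" run; the tools are invoked one at a time
+    # here so each failure can be detected and worked around separately.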
+
+ ###########
+ # aclocal #
+ ###########
+ $VERBOSE_ECHO "$ACLOCAL $SEARCH_DIRS $ACLOCAL_OPTIONS"
+ aclocal_output="`$ACLOCAL $SEARCH_DIRS $ACLOCAL_OPTIONS 2>&1`"
+ ret=$?
+ $VERBOSE_ECHO "$aclocal_output"
+ if [ ! $ret = 0 ] ; then $ECHO "ERROR: $ACLOCAL failed" && exit 2 ; fi
+
+ ##############
+ # libtoolize #
+ ##############
+ need_libtoolize=no
+ for feature in AC_PROG_LIBTOOL LT_INIT ; do
+ $VERBOSE_ECHO "Searching for $feature in $CONFIGURE"
+ found="`grep \"^$feature.*\" $CONFIGURE`"
+ if [ ! "x$found" = "x" ] ; then
+ need_libtoolize=yes
+ break
+ fi
+ done
+ if [ "x$need_libtoolize" = "xyes" ] ; then
+ if [ "x$HAVE_LIBTOOLIZE" = "xyes" ] ; then
+ $VERBOSE_ECHO "$LIBTOOLIZE $LIBTOOLIZE_OPTIONS"
+ libtoolize_output="`$LIBTOOLIZE $LIBTOOLIZE_OPTIONS 2>&1`"
+ ret=$?
+ $VERBOSE_ECHO "$libtoolize_output"
+
+ if [ ! $ret = 0 ] ; then $ECHO "ERROR: $LIBTOOLIZE failed" && exit 2 ; fi
+ else
+ if [ "x$HAVE_ALT_LIBTOOLIZE" = "xyes" ] ; then
+ $VERBOSE_ECHO "$LIBTOOLIZE $ALT_LIBTOOLIZE_OPTIONS"
+ libtoolize_output="`$LIBTOOLIZE $ALT_LIBTOOLIZE_OPTIONS 2>&1`"
+ ret=$?
+ $VERBOSE_ECHO "$libtoolize_output"
+
+ if [ ! $ret = 0 ] ; then $ECHO "ERROR: $LIBTOOLIZE failed" && exit 2 ; fi
+ fi
+ fi
+
+ ###########
+ # aclocal #
+ ###########
+ # re-run again as instructed by libtoolize
+ $VERBOSE_ECHO "$ACLOCAL $SEARCH_DIRS $ACLOCAL_OPTIONS"
+ aclocal_output="`$ACLOCAL $SEARCH_DIRS $ACLOCAL_OPTIONS 2>&1`"
+ ret=$?
+ $VERBOSE_ECHO "$aclocal_output"
+
+ # libtoolize might put ltmain.sh in the wrong place
+ if test -f ltmain.sh ; then
+ if test ! -f "${_aux_dir}/ltmain.sh" ; then
+ $ECHO
+ $ECHO "Warning: $LIBTOOLIZE is creating ltmain.sh in the wrong directory"
+ $ECHO
+ $ECHO "Fortunately, the problem can be worked around by simply copying the"
+ $ECHO "file to the appropriate location (${_aux_dir}/). This has been done for you."
+ $ECHO
+ $VERBOSE_ECHO "cp -p ltmain.sh \"${_aux_dir}/ltmain.sh\""
+ cp -p ltmain.sh "${_aux_dir}/ltmain.sh"
+ $ECHO $ECHO_N "Continuing build preparation ... $ECHO_C"
+ fi
+ fi # ltmain.sh
+ fi # need_libtoolize
+
+ ############
+ # autoconf #
+ ############
+ $VERBOSE_ECHO
+ $VERBOSE_ECHO "$AUTOCONF $AUTOCONF_OPTIONS"
+ autoconf_output="`$AUTOCONF $AUTOCONF_OPTIONS 2>&1`"
+ ret=$?
+ $VERBOSE_ECHO "$autoconf_output"
+
+ if [ ! $ret = 0 ] ; then
+ # retry without the -f and check for usage of macros that are too new
+ ac2_59_macros="AC_C_RESTRICT AC_INCLUDES_DEFAULT AC_LANG_ASSERT AC_LANG_WERROR AS_SET_CATFILE"
+ ac2_55_macros="AC_COMPILER_IFELSE AC_FUNC_MBRTOWC AC_HEADER_STDBOOL AC_LANG_CONFTEST AC_LANG_SOURCE AC_LANG_PROGRAM AC_LANG_CALL AC_LANG_FUNC_TRY_LINK AC_MSG_FAILURE AC_PREPROC_IFELSE"
+ ac2_54_macros="AC_C_BACKSLASH_A AC_CONFIG_LIBOBJ_DIR AC_GNU_SOURCE AC_PROG_EGREP AC_PROG_FGREP AC_REPLACE_FNMATCH AC_FUNC_FNMATCH_GNU AC_FUNC_REALLOC AC_TYPE_MBSTATE_T"
+
+ macros_to_search=""
+ ac_major="`echo ${AUTOCONF_VERSION}. | cut -d. -f1 | sed 's/[^0-9]//g'`"
+ ac_minor="`echo ${AUTOCONF_VERSION}. | cut -d. -f2 | sed 's/[^0-9]//g'`"
+
+ if [ $ac_major -lt 2 ] ; then
+ macros_to_search="$ac2_59_macros $ac2_55_macros $ac2_54_macros"
+ else
+ if [ $ac_minor -lt 54 ] ; then
+ macros_to_search="$ac2_59_macros $ac2_55_macros $ac2_54_macros"
+ elif [ $ac_minor -lt 55 ] ; then
+ macros_to_search="$ac2_59_macros $ac2_55_macros"
+ elif [ $ac_minor -lt 59 ] ; then
+ macros_to_search="$ac2_59_macros"
+ fi
+ fi
+
+ configure_ac_macros=__none__
+ for feature in $macros_to_search ; do
+ $VERBOSE_ECHO "Searching for $feature in $CONFIGURE"
+ found="`grep \"^$feature.*\" $CONFIGURE`"
+ if [ ! "x$found" = "x" ] ; then
+ if [ "x$configure_ac_macros" = "x__none__" ] ; then
+ configure_ac_macros="$feature"
+ else
+ configure_ac_macros="$feature $configure_ac_macros"
+ fi
+ fi
+ done
+ if [ ! "x$configure_ac_macros" = "x__none__" ] ; then
+ $ECHO
+ $ECHO "Warning: Unsupported macros were found in $CONFIGURE"
+ $ECHO
+      $ECHO "The `basename $CONFIGURE` file was scanned in order to determine if any"
+ $ECHO "unsupported macros are used that exceed the minimum version"
+ $ECHO "settings specified within this file. As such, the following macros"
+ $ECHO "should be removed from configure.ac or the version numbers in this"
+ $ECHO "file should be increased:"
+ $ECHO
+ $ECHO "$configure_ac_macros"
+ $ECHO
+ $ECHO $ECHO_N "Ignorantly continuing build preparation ... $ECHO_C"
+ fi
+
+ ###################
+ # autoconf, retry #
+ ###################
+ $VERBOSE_ECHO
+ $VERBOSE_ECHO "$AUTOCONF"
+ autoconf_output="`$AUTOCONF 2>&1`"
+ ret=$?
+ $VERBOSE_ECHO "$autoconf_output"
+
+ if [ ! $ret = 0 ] ; then
+ # test if libtool is busted
+ libtool_failure "$autoconf_output"
+
+ # let the user know what went wrong
+ cat <<EOF
+$autoconf_output
+EOF
+ $ECHO "ERROR: $AUTOCONF failed"
+ exit 2
+ else
+      # autoconf succeeded without -f (and possibly without unsupported options), so warn verbosely
+ $ECHO
+ $ECHO "Warning: autoconf seems to have succeeded by removing the following options:"
+ $ECHO " AUTOCONF_OPTIONS=\"$AUTOCONF_OPTIONS\""
+ $ECHO
+      $ECHO "Removing those options should not be necessary and indicates some other"
+ $ECHO "problem with the build system. The build preparation is highly suspect"
+ $ECHO "and may result in configuration or compilation errors. Consider"
+ if [ "x$VERBOSE_ECHO" = "x:" ] ; then
+ $ECHO "rerunning the build preparation with verbose output enabled."
+ $ECHO " $AUTOGEN_SH --verbose"
+ else
+ $ECHO "reviewing the minimum GNU Autotools version settings contained in"
+        $ECHO "this script along with the macros being used in your `basename $CONFIGURE` file."
+ fi
+ $ECHO
+ $ECHO $ECHO_N "Continuing build preparation ... $ECHO_C"
+ fi # autoconf ret = 0
+ fi # autoconf ret = 0
+
+ ##############
+ # autoheader #
+ ##############
+ need_autoheader=no
+ for feature in AM_CONFIG_HEADER AC_CONFIG_HEADER ; do
+ $VERBOSE_ECHO "Searching for $feature in $CONFIGURE"
+ found="`grep \"^$feature.*\" $CONFIGURE`"
+ if [ ! "x$found" = "x" ] ; then
+ need_autoheader=yes
+ break
+ fi
+ done
+ if [ "x$need_autoheader" = "xyes" ] ; then
+ $VERBOSE_ECHO "$AUTOHEADER $AUTOHEADER_OPTIONS"
+ autoheader_output="`$AUTOHEADER $AUTOHEADER_OPTIONS 2>&1`"
+ ret=$?
+ $VERBOSE_ECHO "$autoheader_output"
+ if [ ! $ret = 0 ] ; then $ECHO "ERROR: $AUTOHEADER failed" && exit 2 ; fi
+ fi # need_autoheader
+
+ ############
+ # automake #
+ ############
+ need_automake=no
+ for feature in AM_INIT_AUTOMAKE ; do
+ $VERBOSE_ECHO "Searching for $feature in $CONFIGURE"
+ found="`grep \"^$feature.*\" $CONFIGURE`"
+ if [ ! "x$found" = "x" ] ; then
+ need_automake=yes
+ break
+ fi
+ done
+
+ if [ "x$need_automake" = "xyes" ] ; then
+ $VERBOSE_ECHO "$AUTOMAKE $AUTOMAKE_OPTIONS"
+ automake_output="`$AUTOMAKE $AUTOMAKE_OPTIONS 2>&1`"
+ ret=$?
+ $VERBOSE_ECHO "$automake_output"
+
+ if [ ! $ret = 0 ] ; then
+
+ ###################
+ # automake, retry #
+ ###################
+ $VERBOSE_ECHO
+ $VERBOSE_ECHO "$AUTOMAKE $ALT_AUTOMAKE_OPTIONS"
+ # retry without the -f
+ automake_output="`$AUTOMAKE $ALT_AUTOMAKE_OPTIONS 2>&1`"
+ ret=$?
+ $VERBOSE_ECHO "$automake_output"
+
+ if [ ! $ret = 0 ] ; then
+ # test if libtool is busted
+ libtool_failure "$automake_output"
+
+ # let the user know what went wrong
+ cat <<EOF
+$automake_output
+EOF
+ $ECHO "ERROR: $AUTOMAKE failed"
+ exit 2
+ fi # automake retry
+ fi # automake ret = 0
+ fi # need_automake
+} # end of manual_autogen
+
+
+#####################################
+# RECURSIVE_MANUAL_AUTOGEN FUNCTION #
+#####################################
+recursive_manual_autogen ( ) {
+
+ # run the build preparation steps manually for this directory
+ manual_autogen
+
+ # for projects using recursive configure, run the build
+ # preparation steps for the subdirectories.
+ if [ ! "x$CONFIG_SUBDIRS" = "x" ] ; then
+ $VERBOSE_ECHO "Recursively configuring the following directories:"
+ $VERBOSE_ECHO " $CONFIG_SUBDIRS"
+ for dir in $CONFIG_SUBDIRS ; do
+ $VERBOSE_ECHO "Processing recursive configure in $dir"
+ cd "$START_PATH"
+ cd "$dir"
+
+ # new directory, prepare
+ initialize
+
+ # run manual steps for the subdir and any others below
+ recursive_manual_autogen
+ done
+ fi
+}
+
+
+################################
+# run manual preparation steps #
+################################
+if [ "x$reconfigure_manually" = "xyes" ] ; then
+ $ECHO
+ $ECHO $ECHO_N "Preparing build ... $ECHO_C"
+
+ recursive_manual_autogen
+fi
+
+
+#########################
+# restore and summarize #
+#########################
+cd "$START_PATH"
+
+# restore COPYING and INSTALL from backup if necessary
+recursive_restore
+
+# make sure we end up with a configure script
+config_ac="`locate_configure_template`"
+config="`echo $config_ac | sed 's/\.ac$//' | sed 's/\.in$//'`"
+if [ "x$config" = "x" ] ; then
+ $VERBOSE_ECHO "Could not locate the configure template (from `pwd`)"
+fi
+
+# summarize
+$ECHO "done"
+$ECHO
+if test "x$config" = "x" -o ! -f "$config" ; then
+ $ECHO "WARNING: The $PROJECT build system should now be prepared but there"
+ $ECHO "does not seem to be a resulting configure file. This is unexpected"
+ $ECHO "and likely the result of an error. You should run $NAME_OF_AUTOGEN"
+ $ECHO "with the --verbose option to get more details on a potential"
+ $ECHO "misconfiguration."
+else
+ $ECHO "The $PROJECT build system is now prepared. To build here, run:"
+ $ECHO " $config"
+ $ECHO " make"
+fi
+
+
+# Local Variables:
+# mode: sh
+# tab-width: 8
+# sh-basic-offset: 4
+# sh-indentation: 4
+# indent-tabs-mode: t
+# End:
+# ex: shiftwidth=4 tabstop=8
diff --git a/bad-errormodel.sh b/bad-errormodel.sh
new file mode 100755
index 0000000..836ba9c
--- /dev/null
+++ b/bad-errormodel.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+if test -x ./hfst-ospell ; then
+ if ! cat $srcdir/test.strings | ./hfst-ospell -v bad_errormodel.zhfst ; then
+ exit 1
+ fi
+else
+ echo ./hfst-ospell not built
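+    # exit code 77 makes the automake test harness report this test as SKIP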
+ exit 77
+fi
+
diff --git a/basic-edit1.sh b/basic-edit1.sh
new file mode 100755
index 0000000..f67be34
--- /dev/null
+++ b/basic-edit1.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+if test -x ./hfst-ospell ; then
+ if ! cat $srcdir/test.strings | ./hfst-ospell speller_edit1.zhfst ; then
+ exit 1
+ fi
+else
+ echo ./hfst-ospell not built
+ exit 77
+fi
+
diff --git a/basic-zhfst.sh b/basic-zhfst.sh
new file mode 100755
index 0000000..4393dff
--- /dev/null
+++ b/basic-zhfst.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+if test -x ./hfst-ospell ; then
+ if ! cat $srcdir/test.strings | ./hfst-ospell speller_basic.zhfst ; then
+ exit 1
+ fi
+else
+ echo ./hfst-ospell not built
+ exit 77
+fi
+
diff --git a/basic_test.xml b/basic_test.xml
new file mode 100644
index 0000000..5158abe
--- /dev/null
+++ b/basic_test.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<hfstspeller dtdversion="1.0" hfstversion="3">
+ <info>
+ <locale>qtz</locale>
+ <title>Example speller</title>
+ <description>
+ This example is for the automatic test suite of hfst-ospell.
+ </description>
+ <version vcsrev="33459">1.5.73</version>
+ <date>2012-08-15</date>
+ <producer>Flammie</producer>
+ <contact email="flammie at iki.fi"
+ website="http://flammie.dyndns.org/"/>
+ </info>
+ <acceptor type="general" id="acceptor.default.hfst">
+ <title>Example dictionary</title>
+ <title xml:lang="se">Vuola lávlla</title>
+ <description>Example dictionary recognises a word.</description>
+ <description xml:lang="se">
+ Vuola, vuola mun aigon lási
+ vuolas juhkaluvvat,
+ vuola, vuola mun aigon lási
+ vuolas mieladuvvat
+ </description>
+ </acceptor>
+ <errmodel id="errormodel.default.hfst">
+ <title>Sahtiwaari</title>
+ <description>
+ Example error model turns one word into another.
+ </description>
+ <type type="default"/>
+ <model>errormodel.default.hfst</model>
+ </errmodel>
+</hfstspeller>
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..efdc8d8
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,179 @@
+## Process this file with autoconf to produce configure script
+
+## Copyright (C) 2010 University of Helsinki
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# autoconf requirements
+AC_PREREQ([2.62])
+AC_INIT([hfstospell], [0.3.1], [hfst-bugs at helsinki.fi], [hfstospell], [http://hfst.sf.net])
+
+LT_PREREQ([2.2.6])
+
+# init
+AC_CONFIG_AUX_DIR([build-aux])
+AM_INIT_AUTOMAKE([1.11 -Wall -Werror foreign check-news color-tests silent-rules])
+AM_SILENT_RULES([yes])
+AC_REVISION([$Revision: 4541 $])
+AC_CONFIG_MACRO_DIR([m4])
+AC_CONFIG_SRCDIR([ospell.cc])
+AC_CONFIG_HEADERS([config.h])
+
+# Information on package
+HFSTOSPELL_NAME=hfstospell
+HFSTOSPELL_MAJOR=0
+HFSTOSPELL_MINOR=3
+HFSTOSPELL_EXTENSION=.1
+HFSTOSPELL_VERSION=$HFSTOSPELL_MAJOR.$HFSTOSPELL_MINOR$HFSTOSPELL_EXTENSION
+AC_SUBST(HFSTOSPELL_MAJOR)
+AC_SUBST(HFSTOSPELL_MINOR)
+AC_SUBST(HFSTOSPELL_VERSION)
+AC_SUBST(HFSTOSPELL_NAME)
+
+# Check for pkg-config first - the configuration won't work if it isn't available:
+AC_PATH_PROG([PKGCONFIG], [pkg-config], [no])
+AS_IF([test "x$PKGCONFIG" = xno], [AC_MSG_ERROR([pkg-config is required - please install])])
+AC_PATH_PROG([DOXYGEN], [doxygen], [false])
+AM_CONDITIONAL([CAN_DOXYGEN], [test "x$DOXYGEN" != xfalse])
+
+# Settings
+AC_ARG_ENABLE([extra_demos],
+ [AS_HELP_STRING([--enable-extra-demos],
+                    [build conference demos for reproducing published results @<:@default=no@:>@])],
+ [enable_extra_demos=$enableval], [enable_extra_demos=no])
+AM_CONDITIONAL([EXTRA_DEMOS], [test x$enable_extra_demos != xno])
+AC_ARG_ENABLE([hfst_ospell_office],
+ [AS_HELP_STRING([--enable-hfst-ospell-office],
+ [build hfst-ospell-office @<:@default=yes@:>@])],
+                    [enable_hfst_ospell_office=$enableval], [enable_hfst_ospell_office=yes])
+AM_CONDITIONAL([HFST_OSPELL_OFFICE], [test x$enable_hfst_ospell_office != xno])
+AC_ARG_ENABLE([zhfst],
+ [AS_HELP_STRING([--enable-zhfst],
+ [support zipped complex automaton sets @<:@default=check@:>@])],
+ [enable_zhfst=$enableval], [enable_zhfst=check])
+AC_ARG_WITH([libxmlpp],
+ [AS_HELP_STRING([--with-libxmlpp],
+ [support xml metadata for zipped automaton sets with libxml++-2.6 @<:@default=yes@:>@])],
+ [with_libxmlpp=$withval], [with_libxmlpp=yes])
+AC_ARG_WITH([tinyxml2],
+ [AS_HELP_STRING([--with-tinyxml2],
+ [support xml metadata for zipped automaton sets with tinyxml2 @<:@default=no@:>@])],
+ [with_tinyxml2=$withval], [with_tinyxml2=no])
+
+AC_ARG_WITH([extract],
+ [AS_HELP_STRING([--with-extract=TARGET],
+ [extract zhfst archives to tmpdir or mem @<:@default=mem@:>@])],
+ [with_extract=$withval], [with_extract=mem])
+AS_IF([test "x$with_extract" = xmem], [AC_DEFINE([ZHFST_EXTRACT_TO_MEM], [1],
+ [Define to extract zhfst archives to char buffer])],
+ [AS_IF([test "x$with_extract" = xtmpdir],
+ [AC_DEFINE([ZHFST_EXTRACT_TO_TMPDIR], [1],
+ [Define to extract zhfst to tmp dir])],
+              [AC_MSG_ERROR([--with-extract must be either mem or tmpdir])])])
+
+# Checks for programs
+m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
+AC_PROG_CC
+AC_PROG_CXX
+AC_LIBTOOL_WIN32_DLL
+LT_INIT
+AC_PROG_INSTALL
+AC_PROG_LN_S
+AC_PROG_MAKE_SET
+AC_PATH_PROG([HFST_TXT2FST], [hfst-txt2fst], [false])
+AC_PATH_PROG([HFST_FST2FST], [hfst-fst2fst], [false])
+AC_PATH_PROG([ZIP], [zip], [false])
+AM_CONDITIONAL([CAN_TEST],
+ [test x$HFST_TXT2FST != xfalse -a x$HFST_FST2FST != xfalse -a x$ZIP != xfalse])
+
+# Checks for libraries
+AS_IF([test x$enable_zhfst != xno],
+ [PKG_CHECK_MODULES([LIBARCHIVE], [libarchive > 3],
+ [AC_DEFINE([HAVE_LIBARCHIVE], [1], [Use archives])
+ enable_zhfst=yes],
+ [PKG_CHECK_MODULES([LIBARCHIVE], [libarchive > 2],
+ [AC_DEFINE([HAVE_LIBARCHIVE], [1], [Use archives])
+ AC_DEFINE([USE_LIBARCHIVE_2], [1], [Use libarchive2])
+ enable_zhfst=yes],[enable_zhfst=no])])])
+
+AM_CONDITIONAL([WANT_ARCHIVE], [test x$enable_zhfst != xno])
+AS_IF([test x$with_libxmlpp != xno],
+ [PKG_CHECK_MODULES([LIBXMLPP], [libxml++-2.6 >= 2.10.0],
+ [AC_DEFINE([HAVE_LIBXML], [1], [Use libxml++])
+ enable_xml=libxmlpp],
+ [AC_MSG_ERROR([libxml++ failed])
+ enable_xml=no])])
+AM_CONDITIONAL([WANT_LIBXMLPP], [test x$enable_xml = xlibxmlpp])
+AS_IF([test x$with_tinyxml2 != xno -a x$with_libxmlpp = xno],
+ [PKG_CHECK_MODULES([TINYXML2], [tinyxml2 >= 1.0.8],
+ [AC_DEFINE([HAVE_TINYXML2], [1], [Use tinyxml])
+ enable_xml=tinyxml2],
+ [AC_MSG_ERROR([tinyxml missing])
+ enable_xml=no])])
+AM_CONDITIONAL([WANT_TINYXML2], [test x$enable_xml = xtinyxml2])
+
+# Find ICU via pkg-config first, falling back to the legacy icu-config script
+PKG_CHECK_MODULES(ICU, [icu-uc >= 4], [], [
+ AC_PATH_PROG([ICU_CONFIG], [icu-config], [false])
+ AS_IF([test x$ICU_CONFIG != xfalse], [
+ ICU_LIBS=$($ICU_CONFIG --ldflags)
+ ])
+])
+LIBS="$LIBS $ICU_LIBS"
+
+# Checks for header files
+AC_CHECK_HEADERS([getopt.h error.h])
+
+# Checks for types
+AC_TYPE_SIZE_T
+
+# Checks for structures
+
+# Checks for compiler characteristics
+
+# Checks for library functions
+AC_FUNC_MALLOC
+AC_CHECK_FUNCS([strndup error])
+# Checks for system services
+
+# Checks for highest supported C++ standard
+AC_LANG(C++)
+AX_CHECK_COMPILE_FLAG([-std=c++17], [CXXFLAGS="$CXXFLAGS -std=c++17"], [
+ AX_CHECK_COMPILE_FLAG([-std=c++1z], [CXXFLAGS="$CXXFLAGS -std=c++1z"], [
+ AX_CHECK_COMPILE_FLAG([-std=c++14], [CXXFLAGS="$CXXFLAGS -std=c++14"], [
+ AX_CHECK_COMPILE_FLAG([-std=c++1y], [CXXFLAGS="$CXXFLAGS -std=c++1y"], [
+ AX_CHECK_COMPILE_FLAG([-std=c++11], [CXXFLAGS="$CXXFLAGS -std=c++11"], [
+ AX_CHECK_COMPILE_FLAG([-std=c++0x], [CXXFLAGS="$CXXFLAGS -std=c++0x"], [])
+ ])
+ ])
+ ])
+ ])
+])
+
+# config files
+AC_CONFIG_FILES([Makefile hfstospell.pc])
+
+# output
+AC_OUTPUT
+
+cat <<EOF
+-- Building $PACKAGE_STRING
+ * zhfst support: $enable_zhfst
+ * extracting to: $with_extract
+ * xml support: $enable_xml
+ * hfst-ospell-office: $enable_hfst_ospell_office
+ * conference demos: $enable_extra_demos
+EOF
+AS_IF([test x$with_libxmlpp != xno -a x$with_tinyxml2 != xno],
+       [AC_MSG_WARN([You can only have one XML library (e.g., --with-tinyxml2 --without-libxmlpp)])])
diff --git a/doc/index.html b/doc/index.html
new file mode 100644
index 0000000..9a2b219
--- /dev/null
+++ b/doc/index.html
@@ -0,0 +1,34 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <meta charset="UTF-8">
+    <title>HFST ospell – Free WFST spell-checker and library</title>
+ </head>
+ <body>
+ <h1><img src="doc/html/edit2-small.png"
+ alt="[edit distance 2 automaton]"/>
+ HFST ospell</h1>
+ <p>
+ HFST ospell is a free open source spell-checker using weighted
+      finite-state automata. It is a lightweight library for using
+      combinations of two automata, a language model and an error model,
+      for spell-checking and correction.
+ </p>
+ <p>
+ It has optional support for XML-based metadata using libxml++2 or
+ tinyxml2. Automata compression is supported through libarchive,
+ currently with zip format.
+ </p>
+ <p>
+      The API of the library is kept stable so that the shared library can be
+      updated while the automata and the plugins for enchant and LibreOffice
+      stay in place. The <a href="doc/html/">API documentation</a> is
+      maintained with Doxygen.
+ </p>
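+    <p>
+      A minimal usage sketch follows (class and method names as declared in
+      ZHfstOspeller.h in this release; treat it as illustrative rather than
+      normative):
+    </p>
+    <pre><code>#include "ZHfstOspeller.h"
+#include &lt;cstdio&gt;
+
+int main() {
+    hfst_ol::ZHfstOspeller speller;
+    speller.read_zhfst("speller.zhfst"); // language and error model in one archive
+    if (!speller.spell("exmaple")) {
+        // suggestions come out as (string, weight) pairs, best first
+        hfst_ol::CorrectionQueue corrections = speller.suggest("exmaple");
+        while (corrections.size() &gt; 0) {
+            printf("%s\n", corrections.top().first.c_str());
+            corrections.pop();
+        }
+    }
+    return 0;
+}</code></pre>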
+ <p>
+ You can download the library and small demo applications from
+      <a href="http://hfst.sf.net/">HFST’s main SourceForge site</a>.
+ </p>
+ </body>
+</html>
+
diff --git a/edit2-small.png b/edit2-small.png
new file mode 100644
index 0000000..1f312bc
Binary files /dev/null and b/edit2-small.png differ
diff --git a/empty-descriptions.sh b/empty-descriptions.sh
new file mode 100755
index 0000000..707779f
--- /dev/null
+++ b/empty-descriptions.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+if test -x ./hfst-ospell ; then
+ if ! cat $srcdir/test.strings | ./hfst-ospell -v empty_descriptions.zhfst ; then
+ exit 1
+ fi
+else
+ echo ./hfst-ospell not built
+ exit 77
+fi
+
diff --git a/empty-locale.sh b/empty-locale.sh
new file mode 100755
index 0000000..009e408
--- /dev/null
+++ b/empty-locale.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+if test -x ./hfst-ospell ; then
+ if ! cat $srcdir/test.strings | ./hfst-ospell -v empty_locale.zhfst ; then
+ exit 1
+ fi
+else
+ echo ./hfst-ospell not built
+ exit 77
+fi
+
diff --git a/empty-titles.sh b/empty-titles.sh
new file mode 100755
index 0000000..523b321
--- /dev/null
+++ b/empty-titles.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+if test -x ./hfst-ospell ; then
+ if ! cat $srcdir/test.strings | ./hfst-ospell -v empty_titles.zhfst ; then
+ exit 1
+ fi
+else
+ echo ./hfst-ospell not built
+ exit 77
+fi
+
diff --git a/empty-zhfst.sh b/empty-zhfst.sh
new file mode 100755
index 0000000..eb2a6ab
--- /dev/null
+++ b/empty-zhfst.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+if test -x ./hfst-ospell ; then
+ touch empty.zhfst
+ if ! cat $srcdir/test.strings | ./hfst-ospell -v empty.zhfst ; then
+ exit 1
+ fi
+else
+ echo ./hfst-ospell not built
+ exit 77
+fi
+
diff --git a/empty_descriptions.xml b/empty_descriptions.xml
new file mode 100644
index 0000000..1a90acf
--- /dev/null
+++ b/empty_descriptions.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<hfstspeller dtdversion="1.0" hfstversion="3">
+ <info>
+ <locale>qtz</locale>
+ <title>Example speller</title>
+ <description/>
+ <version vcsrev="33459">1.5.73</version>
+ <date>2012-08-15</date>
+ <producer>Flammie</producer>
+ <contact email="flammie at iki.fi"
+ website="http://flammie.dyndns.org/"/>
+ </info>
+ <acceptor type="general" id="acceptor.default.hfst">
+ <title>Example dictionary</title>
+ <title xml:lang="se">Vuola lávlla</title>
+ <description/>
+ </acceptor>
+ <errmodel id="errormodel.default.hfst">
+ <title>Sahtiwaari</title>
+ <description/>
+ <type type="default"/>
+ <model>errormodel.default.hfst</model>
+ </errmodel>
+</hfstspeller>
diff --git a/empty_locale.xml b/empty_locale.xml
new file mode 100644
index 0000000..3922d5f
--- /dev/null
+++ b/empty_locale.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<hfstspeller dtdversion="1.0" hfstversion="3">
+ <info>
+ <locale/>
+ <title>Example speller</title>
+ <description>
+ This example is for the automatic test suite of hfst-ospell.
+ </description>
+ <version vcsrev="33459">1.5.73</version>
+ <date>2012-08-15</date>
+ <producer>Flammie</producer>
+ <contact email="flammie at iki.fi"
+ website="http://flammie.dyndns.org/"/>
+ </info>
+ <acceptor type="general" id="acceptor.default.hfst">
+ <title>Example dictionary</title>
+ <title xml:lang="se">Vuola lávlla</title>
+ <description>Example dictionary recognises a word.</description>
+ <description xml:lang="se">
+ Vuola, vuola mun aigon lási
+ vuolas juhkaluvvat,
+ vuola, vuola mun aigon lási
+ vuolas mieladuvvat
+ </description>
+ </acceptor>
+ <errmodel id="errormodel.default.hfst">
+ <title>Sahtiwaari</title>
+ <description>
+ Example error model turns one word into another.
+ </description>
+ <type type="default"/>
+ <model>errormodel.default.hfst</model>
+ </errmodel>
+</hfstspeller>
diff --git a/empty_titles.xml b/empty_titles.xml
new file mode 100644
index 0000000..cf35de5
--- /dev/null
+++ b/empty_titles.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<hfstspeller dtdversion="1.0" hfstversion="3">
+ <info>
+ <locale>qtz</locale>
+ <title/>
+ <description>
+ This example is for the automatic test suite of hfst-ospell.
+ </description>
+ <version vcsrev="33459">1.5.73</version>
+ <date>2012-08-15</date>
+ <producer>Flammie</producer>
+ <contact email="flammie at iki.fi"
+ website="http://flammie.dyndns.org/"/>
+ </info>
+ <acceptor type="general" id="acceptor.default.hfst">
+ <title/>
+ <description>Example dictionary recognises a word.</description>
+ <description xml:lang="se">
+ Vuola, vuola mun aigon lási
+ vuolas juhkaluvvat,
+ vuola, vuola mun aigon lási
+ vuolas mieladuvvat
+ </description>
+ </acceptor>
+ <errmodel id="errormodel.default.hfst">
+ <title/>
+ <description>
+ Example error model turns one word into another.
+ </description>
+ <type type="default"/>
+ <model>errormodel.default.hfst</model>
+ </errmodel>
+</hfstspeller>
diff --git a/errmodel.basic.txt b/errmodel.basic.txt
new file mode 100644
index 0000000..73fd8e5
--- /dev/null
+++ b/errmodel.basic.txt
@@ -0,0 +1,5 @@
+0 1 v o
+1 2 e l
+2 3 s u
+3 4 i t
+4
diff --git a/errmodel.edit1.txt b/errmodel.edit1.txt
new file mode 100644
index 0000000..0b1ba7a
--- /dev/null
+++ b/errmodel.edit1.txt
@@ -0,0 +1,728 @@
+0 0.0
+0 0 a a 0.0
+0 0 c c 0.0
+0 0 b b 0.0
+0 0 e e 0.0
+0 0 d d 0.0
+0 0 g g 0.0
+0 0 f f 0.0
+0 0 i i 0.0
+0 0 h h 0.0
+0 0 k k 0.0
+0 0 j j 0.0
+0 0 m m 0.0
+0 0 l l 0.0
+0 0 o o 0.0
+0 0 n n 0.0
+0 0 q q 0.0
+0 0 p p 0.0
+0 0 s s 0.0
+0 0 r r 0.0
+0 0 u u 0.0
+0 0 t t 0.0
+0 0 v v 0.0
+0 0 y y 0.0
+0 0 x x 0.0
+0 0 z z 0.0
+0 1 c u 1.0
+0 1 x f 1.0
+0 1 i h 1.0
+0 1 j v 1.0
+0 1 @0@ x 1.0
+0 1 h s 1.0
+0 1 n o 1.0
+0 1 z h 1.0
+0 1 k b 1.0
+0 1 i y 1.0
+0 1 d q 1.0
+0 1 t z 1.0
+0 1 o x 1.0
+0 1 d f 1.0
+0 1 k @0@ 1.0
+0 1 c n 1.0
+0 1 j z 1.0
+0 1 @0@ q 1.0
+0 1 r p 1.0
+0 1 e z 1.0
+0 1 k y 1.0
+0 1 a q 1.0
+0 1 q t 1.0
+0 1 g p 1.0
+0 1 h @0@ 1.0
+0 1 f c 1.0
+0 1 y i 1.0
+0 1 u v 1.0
+0 1 v d 1.0
+0 1 h v 1.0
+0 1 f r 1.0
+0 1 t k 1.0
+0 1 j x 1.0
+0 1 @0@ j 1.0
+0 1 m i 1.0
+0 1 r i 1.0
+0 1 c q 1.0
+0 1 x j 1.0
+0 1 k p 1.0
+0 1 y d 1.0
+0 1 a s 1.0
+0 1 q v 1.0
+0 1 z l 1.0
+0 1 o u 1.0
+0 1 m h 1.0
+0 1 q r 1.0
+0 1 s k 1.0
+0 1 x k 1.0
+0 1 y @0@ 1.0
+0 1 e g 1.0
+0 1 y v 1.0
+0 1 n t 1.0
+0 1 c j 1.0
+0 1 t i 1.0
+0 1 y p 1.0
+0 1 @0@ c 1.0
+0 1 r b 1.0
+0 1 c x 1.0
+0 1 z n 1.0
+0 1 u s 1.0
+0 1 i n 1.0
+0 1 e f 1.0
+0 1 a u 1.0
+0 1 q h 1.0
+0 1 g t 1.0
+0 1 u r 1.0
+0 1 q y 1.0
+0 1 h z 1.0
+0 1 p g 1.0
+0 1 f n 1.0
+0 1 x n 1.0
+0 1 j n 1.0
+0 1 t f 1.0
+0 1 q j 1.0
+0 1 s o 1.0
+0 1 k j 1.0
+0 1 s y 1.0
+0 1 i a 1.0
+0 1 z r 1.0
+0 1 d y 1.0
+0 1 d n 1.0
+0 1 x o 1.0
+0 1 e c 1.0
+0 1 c f 1.0
+0 1 y a 1.0
+0 1 @_UNKNOWN_SYMBOL_@ x 1.0
+0 1 c t 1.0
+0 1 s c 1.0
+0 1 @0@ y 1.0
+0 1 p t 1.0
+0 1 y b 1.0
+0 1 r x 1.0
+0 1 e b 1.0
+0 1 z i 1.0
+0 1 k a 1.0
+0 1 q l 1.0
+0 1 g h 1.0
+0 1 i c 1.0
+0 1 y m 1.0
+0 1 p u 1.0
+0 1 f j 1.0
+0 1 t s 1.0
+0 1 j p 1.0
+0 1 @0@ r 1.0
+0 1 m a 1.0
+0 1 r q 1.0
+0 1 x r 1.0
+0 1 n a 1.0
+0 1 j b 1.0
+0 1 k x 1.0
+0 1 q n 1.0
+0 1 z d 1.0
+0 1 m v 1.0
+0 1 s u 1.0
+0 1 z v 1.0
+0 1 q g 1.0
+0 1 x s 1.0
+0 1 n b 1.0
+0 1 e o 1.0
+0 1 i r 1.0
+0 1 c b 1.0
+0 1 t q 1.0
+0 1 j y 1.0
+0 1 @0@ k 1.0
+0 1 @_UNKNOWN_SYMBOL_@ z 1.0
+0 1 r j 1.0
+0 1 c p 1.0
+0 1 t @0@ 1.0
+0 1 p z 1.0
+0 1 e n 1.0
+0 1 o f 1.0
+0 1 @_UNKNOWN_SYMBOL_@ t 1.0
+0 1 g l 1.0
+0 1 @_UNKNOWN_SYMBOL_@ s 1.0
+0 1 d c 1.0
+0 1 u z 1.0
+0 1 v x 1.0
+0 1 d h 1.0
+0 1 l y 1.0
+0 1 h b 1.0
+0 1 i t 1.0
+0 1 @_UNKNOWN_SYMBOL_@ m 1.0
+0 1 @0@ d 1.0
+0 1 m s 1.0
+0 1 r c 1.0
+0 1 j t 1.0
+0 1 k v 1.0
+0 1 x v 1.0
+0 1 j f 1.0
+0 1 t n 1.0
+0 1 o d 1.0
+0 1 p a 1.0
+0 1 h c 1.0
+0 1 q b 1.0
+0 1 z x 1.0
+0 1 s q 1.0
+0 1 x a 1.0
+0 1 p b 1.0
+0 1 j g 1.0
+0 1 d a 1.0
+0 1 q s 1.0
+0 1 e k 1.0
+0 1 z m 1.0
+0 1 @_UNKNOWN_SYMBOL_@ p 1.0
+0 1 p l 1.0
+0 1 y j 1.0
+0 1 o c 1.0
+0 1 e j 1.0
+0 1 z a 1.0
+0 1 k i 1.0
+0 1 q d 1.0
+0 1 z b 1.0
+0 1 s x 1.0
+0 1 o @0@ 1.0
+0 1 x g 1.0
+0 1 i k 1.0
+0 1 v t 1.0
+0 1 h p 1.0
+0 1 p m 1.0
+0 1 q u 1.0
+0 1 l u 1.0
+0 1 h f 1.0
+0 1 f b 1.0
+0 1 x d 1.0
+0 1 j h 1.0
+0 1 @0@ z 1.0
+0 1 r y 1.0
+0 1 h q 1.0
+0 1 x z 1.0
+0 1 n i 1.0
+0 1 o a 1.0
+0 1 h g 1.0
+0 1 q f 1.0
+0 1 s n 1.0
+0 1 m n 1.0
+0 1 s m 1.0
+0 1 x e 1.0
+0 1 d @0@ 1.0
+0 1 n j 1.0
+0 1 l s 1.0
+0 1 i z 1.0
+0 1 c z 1.0
+0 1 t y 1.0
+0 1 j q 1.0
+0 1 @0@ s 1.0
+0 1 @_UNKNOWN_SYMBOL_@ l 1.0
+0 1 k g 1.0
+0 1 p i 1.0
+0 1 t h 1.0
+0 1 o n 1.0
+0 1 d t 1.0
+0 1 v m 1.0
+0 1 s t 1.0
+0 1 g d 1.0
+0 1 p s 1.0
+0 1 d k 1.0
+0 1 v p 1.0
+0 1 h t 1.0
+0 1 q i 1.0
+0 1 l q 1.0
+0 1 h j 1.0
+0 1 @0@ l 1.0
+0 1 m k 1.0
+0 1 r k 1.0
+0 1 x h 1.0
+0 1 j l 1.0
+0 1 h u 1.0
+0 1 a @0@ 1.0
+0 1 t v 1.0
+0 1 o l 1.0
+0 1 d r 1.0
+0 1 h k 1.0
+0 1 v o 1.0
+0 1 z p 1.0
+0 1 s i 1.0
+0 1 x i 1.0
+0 1 d i 1.0
+0 1 q k 1.0
+0 1 l o 1.0
+0 1 y k 1.0
+0 1 @0@ e 1.0
+0 1 r d 1.0
+0 1 c v 1.0
+0 1 j u 1.0
+0 1 k u 1.0
+0 1 p v 1.0
+0 1 @_UNKNOWN_SYMBOL_@ h 1.0
+0 1 p d 1.0
+0 1 y r 1.0
+0 1 o k 1.0
+0 1 z y 1.0
+0 1 v i 1.0
+0 1 s p 1.0
+0 1 z @0@ 1.0
+0 1 t g 1.0
+0 1 i l 1.0
+0 1 c o 1.0
+0 1 h x 1.0
+0 1 p e 1.0
+0 1 q m 1.0
+0 1 l m 1.0
+0 1 m u 1.0
+0 1 h n 1.0
+0 1 q z 1.0
+0 1 x l 1.0
+0 1 g z 1.0
+0 1 h y 1.0
+0 1 n q 1.0
+0 1 o i 1.0
+0 1 k h 1.0
+0 1 m t 1.0
+0 1 y h 1.0
+0 1 h o 1.0
+0 1 v k 1.0
+0 1 z t 1.0
+0 1 m f 1.0
+0 1 s e 1.0
+0 1 x m 1.0
+0 1 y e 1.0
+0 1 t e 1.0
+0 1 n r 1.0
+0 1 l k 1.0
+0 1 i b 1.0
+0 1 @_UNKNOWN_SYMBOL_@ v 1.0
+0 1 c r 1.0
+0 1 j i 1.0
+0 1 y t 1.0
+0 1 f y 1.0
+0 1 @_UNKNOWN_SYMBOL_@ d 1.0
+0 1 l z 1.0
+0 1 z k 1.0
+0 1 k o 1.0
+0 1 t p 1.0
+0 1 o v 1.0
+0 1 v e 1.0
+0 1 s l 1.0
+0 1 p k 1.0
+0 1 y g 1.0
+0 1 c k 1.0
+0 1 q a 1.0
+0 1 l i 1.0
+0 1 i d 1.0
+0 1 @0@ t 1.0
+0 1 m c 1.0
+0 1 r s 1.0
+0 1 x p 1.0
+0 1 n c 1.0
+0 1 j d 1.0
+0 1 k f 1.0
+0 1 l x 1.0
+0 1 i u 1.0
+0 1 d u 1.0
+0 1 v n 1.0
+0 1 o t 1.0
+0 1 d z 1.0
+0 1 a h 1.0
+0 1 v g 1.0
+0 1 s a 1.0
+0 1 x q 1.0
+0 1 q c 1.0
+0 1 l g 1.0
+0 1 @0@ m 1.0
+0 1 p x 1.0
+0 1 @_UNKNOWN_SYMBOL_@ r 1.0
+0 1 b p 1.0
+0 1 j m 1.0
+0 1 p n 1.0
+0 1 f u 1.0
+0 1 l v 1.0
+0 1 z o 1.0
+0 1 y z 1.0
+0 1 o s 1.0
+0 1 s z 1.0
+0 1 z q 1.0
+0 1 a j 1.0
+0 1 v a 1.0
+0 1 s h 1.0
+0 1 b q 1.0
+0 1 t o 1.0
+0 1 c g 1.0
+0 1 q e 1.0
+0 1 @0@ f 1.0
+0 1 l e 1.0
+0 1 r e 1.0
+0 1 k t 1.0
+0 1 b r 1.0
+0 1 q o 1.0
+0 1 x t 1.0
+0 1 l t 1.0
+0 1 g r 1.0
+0 1 h a 1.0
+0 1 n y 1.0
+0 1 o q 1.0
+0 1 m l 1.0
+0 1 v j 1.0
+0 1 a l 1.0
+0 1 v c 1.0
+0 1 b s 1.0
+0 1 x u 1.0
+0 1 n h 1.0
+0 1 t m 1.0
+0 1 u e 1.0
+0 1 n z 1.0
+0 1 l c 1.0
+0 1 @_UNKNOWN_SYMBOL_@ y 1.0
+0 1 i j 1.0
+0 1 @_UNKNOWN_SYMBOL_@ n 1.0
+0 1 b t 1.0
+0 1 @_UNKNOWN_SYMBOL_@ g 1.0
+0 1 f q 1.0
+0 1 g y 1.0
+0 1 l r 1.0
+0 1 u d 1.0
+0 1 z c 1.0
+0 1 @_UNKNOWN_SYMBOL_@ u 1.0
+0 1 s v 1.0
+0 1 t x 1.0
+0 1 z u 1.0
+0 1 a n 1.0
+0 1 s d 1.0
+0 1 b u 1.0
+0 1 p c 1.0
+0 1 @_UNKNOWN_SYMBOL_@ o 1.0
+0 1 v @0@ 1.0
+0 1 h d 1.0
+0 1 r t 1.0
+0 1 l a 1.0
+0 1 p j 1.0
+0 1 c @0@ 1.0
+0 1 j o 1.0
+0 1 b v 1.0
+0 1 n k 1.0
+0 1 r z 1.0
+0 1 k n 1.0
+0 1 l p 1.0
+0 1 g v 1.0
+0 1 h e 1.0
+0 1 j e 1.0
+0 1 e u 1.0
+0 1 v f 1.0
+0 1 d b 1.0
+0 1 x y 1.0
+0 1 u a 1.0
+0 1 e t 1.0
+0 1 j s 1.0
+0 1 @0@ u 1.0
+0 1 @_UNKNOWN_SYMBOL_@ j 1.0
+0 1 b x 1.0
+0 1 @_UNKNOWN_SYMBOL_@ c 1.0
+0 1 k e 1.0
+0 1 p f 1.0
+0 1 f m 1.0
+0 1 l n 1.0
+0 1 m @0@ 1.0
+0 1 s r 1.0
+0 1 z g 1.0
+0 1 a b 1.0
+0 1 v y 1.0
+0 1 b y 1.0
+0 1 @0@ n 1.0
+0 1 m e 1.0
+0 1 r m 1.0
+0 1 n e 1.0
+0 1 b z 1.0
+0 1 g s 1.0
+0 1 m z 1.0
+0 1 g j 1.0
+0 1 h i 1.0
+0 1 o y 1.0
+0 1 e q 1.0
+0 1 m d 1.0
+0 1 f @0@ 1.0
+0 1 a d 1.0
+0 1 n f 1.0
+0 1 p q 1.0
+0 1 n p 1.0
+0 1 s f 1.0
+0 1 t u 1.0
+0 1 u m 1.0
+0 1 @0@ g 1.0
+0 1 r f 1.0
+0 1 e p 1.0
+0 1 @_UNKNOWN_SYMBOL_@ q 1.0
+0 1 k s 1.0
+0 1 @_UNKNOWN_SYMBOL_@ f 1.0
+0 1 t d 1.0
+0 1 o b 1.0
+0 1 f i 1.0
+0 1 g q 1.0
+0 1 l j 1.0
+0 1 u l 1.0
+0 1 d g 1.0
+0 1 d l 1.0
+0 1 a f 1.0
+0 1 f x 1.0
+0 1 v u 1.0
+0 1 c m 1.0
+0 1 i p 1.0
+0 1 y s 1.0
+0 1 h l 1.0
+0 1 t b 1.0
+0 1 n s 1.0
+0 1 l h 1.0
+0 1 g n 1.0
+0 1 h m 1.0
+0 1 i e 1.0
+0 1 d e 1.0
+0 1 e @0@ 1.0
+0 1 d j 1.0
+0 1 a x 1.0
+0 1 u i 1.0
+0 1 a i 1.0
+0 1 j k 1.0
+0 1 p h 1.0
+0 1 y f 1.0
+0 1 @_UNKNOWN_SYMBOL_@ b 1.0
+0 1 k m 1.0
+0 1 f e 1.0
+0 1 g u 1.0
+0 1 l f 1.0
+0 1 u h 1.0
+0 1 i g 1.0
+0 1 q @0@ 1.0
+0 1 s j 1.0
+0 1 a z 1.0
+0 1 f t 1.0
+0 1 v q 1.0
+0 1 b a 1.0
+0 1 c i 1.0
+0 1 a k 1.0
+0 1 @0@ v 1.0
+0 1 r u 1.0
+0 1 n m 1.0
+0 1 k d 1.0
+0 1 g k 1.0
+0 1 l d 1.0
+0 1 m r 1.0
+0 1 j @0@ 1.0
+0 1 g b 1.0
+0 1 e y 1.0
+0 1 v z 1.0
+0 1 v s 1.0
+0 1 b c 1.0
+0 1 i v 1.0
+0 1 n x 1.0
+0 1 j a 1.0
+0 1 @0@ o 1.0
+0 1 a m 1.0
+0 1 r n 1.0
+0 1 e x 1.0
+0 1 @_UNKNOWN_SYMBOL_@ i 1.0
+0 1 t l 1.0
+0 1 b d 1.0
+0 1 o j 1.0
+0 1 f a 1.0
+0 1 g i 1.0
+0 1 l b 1.0
+0 1 u t 1.0
+0 1 z s 1.0
+0 1 d o 1.0
+0 1 f p 1.0
+0 1 b e 1.0
+0 1 c e 1.0
+0 1 i x 1.0
+0 1 @0@ h 1.0
+0 1 m o 1.0
+0 1 r g 1.0
+0 1 c s 1.0
+0 1 a o 1.0
+0 1 k r 1.0
+0 1 z j 1.0
+0 1 t j 1.0
+0 1 b f 1.0
+0 1 o h 1.0
+0 1 d v 1.0
+0 1 g o 1.0
+0 1 g f 1.0
+0 1 i m 1.0
+0 1 d m 1.0
+0 1 a p 1.0
+0 1 c l 1.0
+0 1 s g 1.0
+0 1 u g 1.0
+0 1 b g 1.0
+0 1 @0@ a 1.0
+0 1 u q 1.0
+0 1 p r 1.0
+0 1 e d 1.0
+0 1 @_UNKNOWN_SYMBOL_@ e 1.0
+0 1 f s 1.0
+0 1 o g 1.0
+0 1 u f 1.0
+0 1 b h 1.0
+0 1 g m 1.0
+0 1 u p 1.0
+0 1 i o 1.0
+0 1 s b 1.0
+0 1 a r 1.0
+0 1 f l 1.0
+0 1 i @0@ 1.0
+0 1 m y 1.0
+0 1 b i 1.0
+0 1 c a 1.0
+0 1 b @0@ 1.0
+0 1 l @0@ 1.0
+0 1 y u 1.0
+0 1 a c 1.0
+0 1 n u 1.0
+0 1 o e 1.0
+0 1 k l 1.0
+0 1 y l 1.0
+0 1 m x 1.0
+0 1 b j 1.0
+0 1 g c 1.0
+0 1 m j 1.0
+0 1 n d 1.0
+0 1 e a 1.0
+0 1 v r 1.0
+0 1 a t 1.0
+0 1 n v 1.0
+0 1 c h 1.0
+0 1 y o 1.0
+0 1 u c 1.0
+0 1 b k 1.0
+0 1 v b 1.0
+0 1 e v 1.0
+0 1 a e 1.0
+0 1 q x 1.0
+0 1 r v 1.0
+0 1 @_UNKNOWN_SYMBOL_@ a 1.0
+0 1 k c 1.0
+0 1 f o 1.0
+0 1 u b 1.0
+0 1 b l 1.0
+0 1 o r 1.0
+0 1 d p 1.0
+0 1 r l 1.0
+0 1 g a 1.0
+0 1 y c 1.0
+0 1 a v 1.0
+0 1 f h 1.0
+0 1 b m 1.0
+0 1 @_UNKNOWN_SYMBOL_@ k 1.0
+0 1 @0@ p 1.0
+0 1 m g 1.0
+0 1 r o 1.0
+0 1 a g 1.0
+0 1 n g 1.0
+0 1 k z 1.0
+0 1 @_UNKNOWN_SYMBOL_@ @0@ 1.0
+0 1 i q 1.0
+0 1 j c 1.0
+0 1 t r 1.0
+0 1 g @0@ 1.0
+0 1 b n 1.0
+0 1 o p 1.0
+0 1 x @0@ 1.0
+0 1 e s 1.0
+0 1 e m 1.0
+0 1 y n 1.0
+0 1 c d 1.0
+0 1 u o 1.0
+0 1 b o 1.0
+0 1 @0@ i 1.0
+0 1 r h 1.0
+0 1 e r 1.0
+0 1 u y 1.0
+0 1 k q 1.0
+0 1 a y 1.0
+0 1 e l 1.0
+0 1 g x 1.0
+0 1 s @0@ 1.0
+0 1 i s 1.0
+0 1 f k 1.0
+0 1 z e 1.0
+0 1 u n 1.0
+0 1 v l 1.0
+0 1 g e 1.0
+0 1 u x 1.0
+0 1 f z 1.0
+0 1 t c 1.0
+0 1 f d 1.0
+0 1 @0@ b 1.0
+0 1 m q 1.0
+0 1 r a 1.0
+0 1 c y 1.0
+0 1 x b 1.0
+0 1 j r 1.0
+0 1 p @0@ 1.0
+0 1 p y 1.0
+0 1 y x 1.0
+0 1 o m 1.0
+0 1 m p 1.0
+0 1 u @0@ 1.0
+0 1 x c 1.0
+0 1 m b 1.0
+0 1 n l 1.0
+0 1 t a 1.0
+0 1 e i 1.0
+0 1 u k 1.0
+0 1 y q 1.0
+0 1 i f 1.0
+0 1 q p 1.0
+0 1 e h 1.0
+0 1 r @0@ 1.0
+0 1 z f 1.0
+0 1 f g 1.0
+0 1 d s 1.0
+0 1 u j 1.0
+0 1 v h 1.0
+0 1 o z 1.0
+0 1 d x 1.0
+0 1 n @0@ 1.0
+0 1 h r 1.0
+0 1 p o 1.0
+0 1 f v 1.0
+1 1 a a 0.0
+1 1 c c 0.0
+1 1 b b 0.0
+1 1 e e 0.0
+1 1 d d 0.0
+1 1 g g 0.0
+1 1 f f 0.0
+1 1 i i 0.0
+1 1 h h 0.0
+1 1 k k 0.0
+1 1 j j 0.0
+1 1 m m 0.0
+1 1 l l 0.0
+1 1 o o 0.0
+1 1 n n 0.0
+1 1 q q 0.0
+1 1 p p 0.0
+1 1 s s 0.0
+1 1 r r 0.0
+1 1 u u 0.0
+1 1 t t 0.0
+1 1 v v 0.0
+1 1 y y 0.0
+1 1 x x 0.0
+1 1 z z 0.0
+1 0.0
diff --git a/errmodel.extrachars.txt b/errmodel.extrachars.txt
new file mode 100644
index 0000000..3dd35ed
--- /dev/null
+++ b/errmodel.extrachars.txt
@@ -0,0 +1,5 @@
+0 1 v z
+1 2 e l
+2 3 s u
+3 4 q t
+4
diff --git a/hfst-ol.cc b/hfst-ol.cc
new file mode 100644
index 0000000..e926cbe
--- /dev/null
+++ b/hfst-ol.cc
@@ -0,0 +1,861 @@
+// Copyright 2010 University of Helsinki
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hfst-ol.h"
+#include <string>
+
+namespace hfst_ol {
+
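+// Read a value of type T from a possibly misaligned address; going through
+// memcpy avoids unaligned loads and strict-aliasing problems.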
+template <typename T>
+inline T hfst_deref(T* ptr)
+{
+ T dest;
+ memcpy(&dest, ptr, sizeof(dest));
+ return dest;
+}
+
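+// Advance *raw past one NUL-terminated string, including the terminator.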
+void skip_c_string(char ** raw)
+{
+ while (**raw != 0) {
+ ++(*raw);
+ }
+ ++(*raw);
+}
+
+void
+TransducerHeader::read_property(bool& property, FILE* f)
+{
+ unsigned int prop;
+ if (fread(&prop,sizeof(unsigned int),1,f) != 1) {
+ HFST_THROW_MESSAGE(HeaderParsingException,
+ "Header ended unexpectedly\n");
+ }
+ if (prop == 0)
+ {
+ property = false;
+ return;
+ }
+ else
+ {
+ property = true;
+ return;
+ }
+}
+
+void
+TransducerHeader::read_property(bool& property, char** raw)
+{
+ unsigned int prop = *((unsigned int *) *raw);
+ (*raw) += sizeof(unsigned int);
+ if (prop == 0)
+ {
+ property = false;
+ return;
+ }
+ else
+ {
+ property = true;
+ return;
+ }
+}
+
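+// An HFST3 header consists of the magic bytes "HFST\0", a 2-byte length
+// field, a '\0' separator, and then that many bytes of NUL-terminated
+// key-value data (including a "type" entry such as "HFST_OLW").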
+void TransducerHeader::skip_hfst3_header(FILE * f)
+{
+ const char* header1 = "HFST";
+ unsigned int header_loc = 0; // how much of the header has been found
+ int c;
+ for(header_loc = 0; header_loc < strlen(header1) + 1; header_loc++)
+ {
+ c = getc(f);
+ if(c != header1[header_loc]) {
+ break;
+ }
+ }
+ if(header_loc == strlen(header1) + 1) // we found it
+ {
+ unsigned short remaining_header_len;
+ if (fread(&remaining_header_len,
+ sizeof(remaining_header_len), 1, f) != 1 ||
+ getc(f) != '\0') {
+ HFST_THROW_MESSAGE(HeaderParsingException,
+ "Found broken HFST3 header\n");
+ }
+ char * headervalue = new char[remaining_header_len];
+ if (fread(headervalue, remaining_header_len, 1, f) != 1)
+ {
+ HFST_THROW_MESSAGE(HeaderParsingException,
+ "HFST3 header ended unexpectedly\n");
+ }
+ if (headervalue[remaining_header_len - 1] != '\0') {
+ HFST_THROW_MESSAGE(HeaderParsingException,
+ "Found broken HFST3 header\n");
+ }
+ std::string header_tail(headervalue, remaining_header_len);
+ size_t type_field = header_tail.find("type");
+ if (type_field != std::string::npos) {
+ if (header_tail.find("HFST_OL") != type_field + 5 &&
+ header_tail.find("HFST_OLW") != type_field + 5) {
+                delete[] headervalue;
+                HFST_THROW_MESSAGE(
+                    TransducerTypeException,
+                    "Transducer has incorrect type, should be "
+                    "hfst-optimized-lookup\n");
+            }
+        }
+        // also free the header buffer when the type check passes
+        delete[] headervalue;
+ } else // nope. put back what we've taken
+ {
+ ungetc(c, f); // first the non-matching character
+ for(int i = header_loc - 1; i>=0; i--) {
+ // then the characters that did match (if any)
+ ungetc(header1[i], f);
+ }
+ }
+}
+
+void TransducerHeader::skip_hfst3_header(char ** raw)
+{
+ const char* header1 = "HFST";
+ unsigned int header_loc = 0; // how much of the header has been found
+
+ for(header_loc = 0; header_loc < strlen(header1) + 1; header_loc++)
+ {
+ if(**raw != header1[header_loc]) {
+ break;
+ }
+ ++(*raw);
+ }
+ if(header_loc == strlen(header1) + 1) // we found it
+ {
+ unsigned short remaining_header_len = *((unsigned short *) *raw);
+ (*raw) += sizeof(unsigned short) + 1 + remaining_header_len;
+ } else // nope. put back what we've taken
+ {
+        // unlike the FILE* variant above, the non-matching byte was never
+        // consumed here, so rewind only past the bytes that did match
+        for(int i = header_loc - 1; i>=0; i--) {
+            --(*raw);
+        }
+ }
+}
+
+TransducerHeader::TransducerHeader(FILE* f)
+{
+ skip_hfst3_header(f); // skip header iff it is present
+ /* The following conditional clause does all the numerical reads
+ and throws an exception if any fails to return 1 */
+ if (fread(&number_of_input_symbols,
+ sizeof(SymbolNumber),1,f) != 1||
+ fread(&number_of_symbols,
+ sizeof(SymbolNumber),1,f) != 1||
+ fread(&size_of_transition_index_table,
+ sizeof(TransitionTableIndex),1,f) != 1||
+ fread(&size_of_transition_target_table,
+ sizeof(TransitionTableIndex),1,f) != 1||
+ fread(&number_of_states,
+ sizeof(TransitionTableIndex),1,f) != 1||
+ fread(&number_of_transitions,
+ sizeof(TransitionTableIndex),1,f) != 1) {
+ HFST_THROW_MESSAGE(HeaderParsingException,
+ "Header ended unexpectedly\n");
+ }
+ read_property(weighted,f);
+ read_property(deterministic,f);
+ read_property(input_deterministic,f);
+ read_property(minimized,f);
+ read_property(cyclic,f);
+ read_property(has_epsilon_epsilon_transitions,f);
+ read_property(has_input_epsilon_transitions,f);
+ read_property(has_input_epsilon_cycles,f);
+ read_property(has_unweighted_input_epsilon_cycles,f);
+}
+
+TransducerHeader::TransducerHeader(char** raw)
+{
+ skip_hfst3_header(raw); // skip header iff it is present
+ number_of_input_symbols = *(SymbolNumber*) *raw;
+ (*raw) += sizeof(SymbolNumber);
+ number_of_symbols = *(SymbolNumber*) *raw;
+ (*raw) += sizeof(SymbolNumber);
+ size_of_transition_index_table = *(TransitionTableIndex*) *raw;
+ (*raw) += sizeof(TransitionTableIndex);
+ size_of_transition_target_table = *(TransitionTableIndex*) *raw;
+ (*raw) += sizeof(TransitionTableIndex);
+ number_of_states = *(TransitionTableIndex*) *raw;
+ (*raw) += sizeof(TransitionTableIndex);
+ number_of_transitions = *(TransitionTableIndex*) *raw;
+ (*raw) += sizeof(TransitionTableIndex);
+ read_property(weighted,raw);
+ read_property(deterministic,raw);
+ read_property(input_deterministic,raw);
+ read_property(minimized,raw);
+ read_property(cyclic,raw);
+ read_property(has_epsilon_epsilon_transitions,raw);
+ read_property(has_input_epsilon_transitions,raw);
+ read_property(has_input_epsilon_cycles,raw);
+ read_property(has_unweighted_input_epsilon_cycles,raw);
+}
+
+SymbolNumber
+TransducerHeader::symbol_count()
+{
+ return number_of_symbols;
+}
+
+SymbolNumber
+TransducerHeader::input_symbol_count()
+{
+ return number_of_input_symbols;
+}
+TransitionTableIndex
+TransducerHeader::index_table_size(void)
+{
+ return size_of_transition_index_table;
+}
+
+TransitionTableIndex
+TransducerHeader::target_table_size()
+{
+ return size_of_transition_target_table;
+}
+
+bool
+TransducerHeader::probe_flag(HeaderFlag flag)
+{
+ switch (flag) {
+ case Weighted:
+ return weighted;
+ case Deterministic:
+ return deterministic;
+ case Input_deterministic:
+ return input_deterministic;
+ case Minimized:
+ return minimized;
+ case Cyclic:
+ return cyclic;
+ case Has_epsilon_epsilon_transitions:
+ return has_epsilon_epsilon_transitions;
+ case Has_input_epsilon_transitions:
+ return has_input_epsilon_transitions;
+ case Has_input_epsilon_cycles:
+ return has_input_epsilon_cycles;
+ case Has_unweighted_input_epsilon_cycles:
+ return has_unweighted_input_epsilon_cycles;
+ }
+ return false;
+}
+
+bool
+FlagDiacriticOperation::isFlag() const
+{
+ return feature != NO_SYMBOL;
+}
+
+FlagDiacriticOperator
+FlagDiacriticOperation::Operation() const
+{
+ return operation;
+}
+
+SymbolNumber
+FlagDiacriticOperation::Feature() const
+{
+ return feature;
+}
+
+
+ValueNumber
+FlagDiacriticOperation::Value() const
+{
+ return value;
+}
+
+
+void TransducerAlphabet::read(FILE * f, SymbolNumber number_of_symbols)
+{
+ char * line = (char *) malloc(MAX_SYMBOL_BYTES);
+ std::map<std::string, SymbolNumber> feature_bucket;
+ std::map<std::string, ValueNumber> value_bucket;
+ value_bucket[std::string()] = 0; // empty value = neutral
+ ValueNumber val_num = 1;
+ SymbolNumber feat_num = 0;
+
+ kt.push_back(std::string("")); // zeroth symbol is epsilon
+ int byte;
+ while ( (byte = fgetc(f)) != 0 ) {
+ /* pass over epsilon */
+ if (byte == EOF) {
+ HFST_THROW(AlphabetParsingException);
+ }
+ }
+
+ for (SymbolNumber k = 1; k < number_of_symbols; ++k) {
+ char * sym = line;
+ while ( (byte = fgetc(f)) != 0 ) {
+ if (byte == EOF) {
+ HFST_THROW(AlphabetParsingException);
+ }
+ *sym = byte;
+ ++sym;
+ }
+ *sym = 0;
+ // Detect and handle special symbols, which begin and end with @
+ if (line[0] == '@' && line[strlen(line) - 1] == '@') {
+ if (strlen(line) >= 5 && line[2] == '.') { // flag diacritic
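+            // flag diacritics have the form @X.FEATURE.VALUE@ or
+            // @X.FEATURE@, where X is one of the operators P/N/R/D/C/U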
+ std::string feat;
+ std::string val;
+ FlagDiacriticOperator op = P; // for the compiler
+ switch (line[1]) {
+ case 'P': op = P; break;
+ case 'N': op = N; break;
+ case 'R': op = R; break;
+ case 'D': op = D; break;
+ case 'C': op = C; break;
+ case 'U': op = U; break;
+ }
+ char * c = line;
+ for (c +=3; *c != '.' && *c != '@'; c++) { feat.append(c,1); }
+ if (*c == '.')
+ {
+ for (++c; *c != '@'; c++) { val.append(c,1); }
+ }
+ if (feature_bucket.count(feat) == 0)
+ {
+ feature_bucket[feat] = feat_num;
+ ++feat_num;
+ }
+ if (value_bucket.count(val) == 0)
+ {
+ value_bucket[val] = val_num;
+ ++val_num;
+ }
+
+ operations.insert(
+ std::pair<SymbolNumber, FlagDiacriticOperation>(
+ k,
+ FlagDiacriticOperation(
+ op, feature_bucket[feat], value_bucket[val])));
+
+ kt.push_back(std::string(""));
+ continue;
+
+ } else if (strcmp(line, "@_UNKNOWN_SYMBOL_@") == 0) {
+ unknown_symbol = k;
+ kt.push_back(std::string(line));
+ continue;
+ } else if (strcmp(line, "@_IDENTITY_SYMBOL_@") == 0) {
+ identity_symbol = k;
+ kt.push_back(std::string(line));
+ continue;
+ } else { // we don't know what this is, ignore and suppress
+ kt.push_back(std::string(""));
+ continue;
+ }
+ }
+ kt.push_back(std::string(line));
+ string_to_symbol[std::string(line)] = k;
+ }
+ free(line);
+ flag_state_size = feature_bucket.size();
+}
+
+void TransducerAlphabet::read(char ** raw, SymbolNumber number_of_symbols)
+{
+ std::map<std::string, SymbolNumber> feature_bucket;
+ std::map<std::string, ValueNumber> value_bucket;
+ value_bucket[std::string()] = 0; // empty value = neutral
+ ValueNumber val_num = 1;
+ SymbolNumber feat_num = 0;
+
+ kt.push_back(std::string("")); // zeroth symbol is epsilon
+ skip_c_string(raw);
+
+ for (SymbolNumber k = 1; k < number_of_symbols; ++k) {
+
+ // Detect and handle special symbols, which begin and end with @
+ if ((*raw)[0] == '@' && (*raw)[strlen(*raw) - 1] == '@') {
+ if (strlen(*raw) >= 5 && (*raw)[2] == '.') { // flag diacritic
+ std::string feat;
+ std::string val;
+ FlagDiacriticOperator op = P; // for the compiler
+ switch ((*raw)[1]) {
+ case 'P': op = P; break;
+ case 'N': op = N; break;
+ case 'R': op = R; break;
+ case 'D': op = D; break;
+ case 'C': op = C; break;
+ case 'U': op = U; break;
+ }
+ char * c = *raw;
+ for (c +=3; *c != '.' && *c != '@'; c++) { feat.append(c,1); }
+ if (*c == '.')
+ {
+ for (++c; *c != '@'; c++) { val.append(c,1); }
+ }
+ if (feature_bucket.count(feat) == 0)
+ {
+ feature_bucket[feat] = feat_num;
+ ++feat_num;
+ }
+ if (value_bucket.count(val) == 0)
+ {
+ value_bucket[val] = val_num;
+ ++val_num;
+ }
+
+ operations.insert(
+ std::pair<SymbolNumber, FlagDiacriticOperation>(
+ k,
+ FlagDiacriticOperation(
+ op, feature_bucket[feat], value_bucket[val])));
+
+ kt.push_back(std::string(""));
+ skip_c_string(raw);
+ continue;
+
+ } else if (strcmp(*raw, "@_UNKNOWN_SYMBOL_@") == 0) {
+ unknown_symbol = k;
+ kt.push_back(std::string(""));
+ skip_c_string(raw);
+ continue;
+ } else if (strcmp(*raw, "@_IDENTITY_SYMBOL_@") == 0) {
+ identity_symbol = k;
+ kt.push_back(std::string(""));
+ skip_c_string(raw);
+ continue;
+ } else { // we don't know what this is, ignore and suppress
+ kt.push_back(std::string(""));
+ skip_c_string(raw);
+ continue;
+ }
+ }
+ kt.push_back(std::string(*raw));
+ string_to_symbol[std::string(*raw)] = k;
+ skip_c_string(raw);
+ }
+ flag_state_size = feature_bucket.size();
+}
+
+TransducerAlphabet::TransducerAlphabet(FILE* f, SymbolNumber number_of_symbols):
+ unknown_symbol(NO_SYMBOL),
+ identity_symbol(NO_SYMBOL),
+ orig_symbol_count(number_of_symbols)
+{
+ read(f, number_of_symbols);
+}
+
+TransducerAlphabet::TransducerAlphabet(char** raw,
+ SymbolNumber number_of_symbols):
+ unknown_symbol(NO_SYMBOL),
+ identity_symbol(NO_SYMBOL),
+ orig_symbol_count(number_of_symbols)
+{
+ read(raw, number_of_symbols);
+}
+
+void TransducerAlphabet::add_symbol(std::string & sym)
+{
+ string_to_symbol[sym] = kt.size();
+ kt.push_back(sym);
+}
+
+void TransducerAlphabet::add_symbol(char * sym)
+{
+ std::string s(sym);
+ add_symbol(s);
+}
+
+KeyTable*
+TransducerAlphabet::get_key_table()
+{
+ return &kt;
+}
+
+OperationMap*
+TransducerAlphabet::get_operation_map()
+{
+ return &operations;
+}
+
+SymbolNumber
+TransducerAlphabet::get_state_size()
+{
+ return flag_state_size;
+}
+
+SymbolNumber
+TransducerAlphabet::get_unknown() const
+{
+ return unknown_symbol;
+}
+
+SymbolNumber
+TransducerAlphabet::get_identity() const
+{
+ return identity_symbol;
+}
+
+SymbolNumber TransducerAlphabet::get_orig_symbol_count() const
+{
+ return orig_symbol_count;
+}
+
+StringSymbolMap*
+TransducerAlphabet::get_string_to_symbol()
+{
+ return &string_to_symbol;
+}
+
+bool TransducerAlphabet::has_string(std::string const & s) const
+{
+ return string_to_symbol.count(s) != 0;
+}
+
+bool
+TransducerAlphabet::is_flag(SymbolNumber symbol)
+{
+ return operations.count(symbol) == 1;
+}
+
+void IndexTable::read(FILE * f,
+ TransitionTableIndex number_of_table_entries)
+{
+ size_t table_size = number_of_table_entries*TransitionIndex::SIZE;
+ indices = (char*)(malloc(table_size));
+ if (fread(indices,table_size, 1, f) != 1) {
+ HFST_THROW(IndexTableReadingException);
+ }
+}
+
+void IndexTable::read(char ** raw,
+ TransitionTableIndex number_of_table_entries)
+{
+ size_t table_size = number_of_table_entries*TransitionIndex::SIZE;
+ indices = (char*)(malloc(table_size));
+ memcpy((void *) indices, (const void *) *raw, table_size);
+ (*raw) += table_size;
+}
+
+void TransitionTable::read(FILE * f,
+ TransitionTableIndex number_of_table_entries)
+{
+ size_t table_size = number_of_table_entries*Transition::SIZE;
+ transitions = (char*)(malloc(table_size));
+ if (fread(transitions, table_size, 1, f) != 1) {
+ HFST_THROW(TransitionTableReadingException);
+ }
+}
+
+void TransitionTable::read(char ** raw,
+ TransitionTableIndex number_of_table_entries)
+{
+ size_t table_size = number_of_table_entries*Transition::SIZE;
+ transitions = (char*)(malloc(table_size));
+ memcpy((void *) transitions, (const void *) *raw, table_size);
+ (*raw) += table_size;
+}
+
+void LetterTrie::add_string(const char * p, SymbolNumber symbol_key)
+{
+ if (*(p+1) == 0)
+ {
+ symbols[(unsigned char)(*p)] = symbol_key;
+ return;
+ }
+ if (letters[(unsigned char)(*p)] == NULL)
+ {
+ letters[(unsigned char)(*p)] = new LetterTrie();
+ }
+ letters[(unsigned char)(*p)]->add_string(p+1,symbol_key);
+}
+
+SymbolNumber LetterTrie::find_key(char ** p)
+{
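+    // Consume bytes greedily along the trie to match the longest known
+    // symbol; if the deeper match fails, back up and fall back to the
+    // symbol (if any) keyed by the current byte alone.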
+ const char * old_p = *p;
+ ++(*p);
+ if (letters[(unsigned char)(*old_p)] == NULL)
+ {
+ return symbols[(unsigned char)(*old_p)];
+ }
+ SymbolNumber s = letters[(unsigned char)(*old_p)]->find_key(p);
+ if (s == NO_SYMBOL)
+ {
+ --(*p);
+ return symbols[(unsigned char)(*old_p)];
+ }
+ return s;
+}
+
+LetterTrie::~LetterTrie()
+{
+ for (LetterTrieVector::iterator i = letters.begin();
+ i != letters.end(); ++i)
+ {
+ if (*i)
+ {
+ delete *i;
+ }
+ }
+}
+
+Encoder::Encoder(KeyTable * kt, SymbolNumber number_of_input_symbols):
+ ascii_symbols(UCHAR_MAX,NO_SYMBOL)
+{
+ read_input_symbols(kt, number_of_input_symbols);
+}
+
+void Encoder::read_input_symbol(const char * s, const int s_num)
+{
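+    // single-byte ASCII symbols get a direct lookup table entry; every
+    // symbol, including multi-byte ones, also goes into the letter trie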
+ if (strlen(s) == 0) { // ignore empty strings
+ return;
+ }
+ if ((strlen(s) == 1) && (unsigned char)(*s) <= 127)
+ {
+ ascii_symbols[(unsigned char)(*s)] = s_num;
+ }
+ letters.add_string(s, s_num);
+}
+
+void Encoder::read_input_symbol(std::string const & s, const int s_num)
+{
+ read_input_symbol(s.c_str(), s_num);
+}
+
+void Encoder::read_input_symbols(KeyTable * kt,
+ SymbolNumber number_of_input_symbols)
+{
+ for (SymbolNumber k = 0; k < number_of_input_symbols; ++k)
+ {
+ const char * p = kt->at(k).c_str();
+ read_input_symbol(p, k);
+ }
+}
+
+TransitionTableIndex
+TransitionIndex::target() const
+{
+ return first_transition_index;
+}
+
+bool
+TransitionIndex::final(void) const
+{
+ return input_symbol == NO_SYMBOL &&
+ first_transition_index != NO_TABLE_INDEX;
+}
+
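+// A final index stores its weight in the same 32-bit slot as the target
+// index, so final_weight() reinterprets those bits through a union (this
+// assumes sizeof(Weight) == sizeof(TransitionTableIndex)).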
+Weight
+TransitionIndex::final_weight(void) const
+{
+ union to_weight
+ {
+ TransitionTableIndex i;
+ Weight w;
+ } weight;
+ weight.i = first_transition_index;
+ return weight.w;
+}
+
+SymbolNumber
+TransitionIndex::get_input(void) const
+{
+ return input_symbol;
+}
+
+TransitionTableIndex
+Transition::target(void) const
+{
+ return target_index;
+}
+
+SymbolNumber
+Transition::get_output(void) const
+{
+ return output_symbol;
+}
+
+SymbolNumber
+Transition::get_input(void) const
+{
+ return input_symbol;
+}
+
+Weight
+Transition::get_weight(void) const
+{
+ return transition_weight;
+}
+
+bool
+Transition::final(void) const
+{
+ return input_symbol == NO_SYMBOL &&
+ output_symbol == NO_SYMBOL &&
+ target_index == 1;
+}
+
+IndexTable::IndexTable(FILE* f,
+ TransitionTableIndex number_of_table_entries):
+ indices(NULL),
+ size(number_of_table_entries)
+{
+ read(f, number_of_table_entries);
+}
+
+IndexTable::IndexTable(char ** raw,
+ TransitionTableIndex number_of_table_entries):
+ indices(NULL),
+ size(number_of_table_entries)
+{
+ read(raw, number_of_table_entries);
+}
+
+IndexTable::~IndexTable(void)
+{
+ if (indices) {
+ free(indices);
+ }
+}
+
+SymbolNumber
+IndexTable::input_symbol(TransitionTableIndex i) const
+{
+ if (i < size) {
+ return hfst_deref((SymbolNumber *)
+ (indices + TransitionIndex::SIZE * i));
+ } else {
+ return NO_SYMBOL;
+ }
+}
+
+TransitionTableIndex
+IndexTable::target(TransitionTableIndex i) const
+{
+ if (i < size) {
+ return hfst_deref((TransitionTableIndex *)
+ (indices + TransitionIndex::SIZE * i +
+ sizeof(SymbolNumber)));
+ } else {
+ return NO_TABLE_INDEX;
+ }
+}
+
+bool
+IndexTable::final(TransitionTableIndex i) const
+{
+ return input_symbol(i) == NO_SYMBOL && target(i) != NO_TABLE_INDEX;
+}
+
+Weight
+IndexTable::final_weight(TransitionTableIndex i) const
+{
+ if (i < size) {
+ return hfst_deref((Weight *)
+ (indices + TransitionIndex::SIZE * i +
+ sizeof(SymbolNumber)));
+ } else {
+ return INFINITE_WEIGHT;
+ }
+}
+
+TransitionTable::TransitionTable(FILE * f,
+ TransitionTableIndex transition_count):
+ transitions(NULL),
+ size(transition_count)
+{
+ read(f, transition_count);
+}
+
+TransitionTable::TransitionTable(char ** raw,
+ TransitionTableIndex transition_count):
+ transitions(NULL),
+ size(transition_count)
+{
+ read(raw, transition_count);
+}
+
+TransitionTable::~TransitionTable(void)
+{
+ if (transitions) {
+ free(transitions);
+ }
+}
+
+SymbolNumber
+TransitionTable::input_symbol(TransitionTableIndex i) const
+{
+ if (i < size) {
+ return hfst_deref((SymbolNumber *)
+ (transitions + Transition::SIZE * i));
+ } else {
+ return NO_SYMBOL;
+ }
+}
+
+SymbolNumber
+TransitionTable::output_symbol(TransitionTableIndex i) const
+{
+ if (i < size) {
+ return hfst_deref((SymbolNumber *)
+ (transitions + Transition::SIZE * i +
+ sizeof(SymbolNumber)));
+ } else {
+ return NO_SYMBOL;
+ }
+}
+
+TransitionTableIndex
+TransitionTable::target(TransitionTableIndex i) const
+{
+ if (i < size) {
+ return hfst_deref((TransitionTableIndex *)
+ (transitions + Transition::SIZE * i +
+ 2*sizeof(SymbolNumber)));
+ } else {
+ return NO_TABLE_INDEX;
+ }
+}
+
+Weight
+TransitionTable::weight(TransitionTableIndex i) const
+{
+ if (i < size) {
+ return hfst_deref((Weight *)
+ (transitions + Transition::SIZE * i +
+ 2*sizeof(SymbolNumber) +
+ sizeof(TransitionTableIndex)));
+ } else {
+ return INFINITE_WEIGHT;
+ }
+}
+
+bool
+TransitionTable::final(TransitionTableIndex i) const
+{
+ return input_symbol(i) == NO_SYMBOL &&
+ output_symbol(i) == NO_SYMBOL &&
+ target(i) == 1;
+}
+
+SymbolNumber Encoder::find_key(char ** p)
+{
+ if (ascii_symbols[(unsigned char)(**p)] == NO_SYMBOL)
+ {
+ return letters.find_key(p);
+ }
+ SymbolNumber s = ascii_symbols[(unsigned char)(**p)];
+ ++(*p);
+ return s;
+}
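+
+// Usage sketch (hypothetical two-symbol key table):
+//   KeyTable kt;
+//   kt.push_back("a");           // symbol 0, a single ASCII byte
+//   kt.push_back("\xc3\xa4");    // symbol 1, two-byte UTF-8 "ä"
+//   Encoder enc(&kt, 2);
+//   char buf[] = "a\xc3\xa4"; char* p = buf;
+//   SymbolNumber s1 = enc.find_key(&p); // 0, via the ASCII fast path
+//   SymbolNumber s2 = enc.find_key(&p); // 1, via the letter trie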
+
+} // namespace hfst_ol
diff --git a/hfst-ol.h b/hfst-ol.h
new file mode 100644
index 0000000..e8a9887
--- /dev/null
+++ b/hfst-ol.h
@@ -0,0 +1,453 @@
+/* -*- Mode: C++ -*- */
+// Copyright 2010 University of Helsinki
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file contains some classes, typedefs and constants common to all
+ * hfst-optimized-lookup code. This is just to get them out of the way
+ * of the actual ospell code.
+ */
+
+#ifndef HFST_OSPELL_HFST_OL_H_
+#define HFST_OSPELL_HFST_OL_H_
+
+#include <vector>
+#include <map>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <cstring>
+#include <set>
+#include <utility>
+#include "ol-exceptions.h"
+
+namespace hfst_ol {
+
+typedef unsigned short SymbolNumber;
+typedef unsigned int TransitionTableIndex;
+typedef std::vector<SymbolNumber> SymbolVector;
+typedef std::vector<std::string> KeyTable;
+typedef std::map<std::string, SymbolNumber> StringSymbolMap;
+typedef short ValueNumber;
+typedef float Weight;
+
+// Forward declarations to typedef some more containers
+class TransitionIndex;
+class Transition;
+class FlagDiacriticOperation;
+
+typedef std::vector<TransitionIndex*> TransitionIndexVector;
+typedef std::vector<Transition*> TransitionVector;
+
+typedef std::map<SymbolNumber, FlagDiacriticOperation> OperationMap;
+
+const SymbolNumber NO_SYMBOL = USHRT_MAX;
+const TransitionTableIndex NO_TABLE_INDEX = UINT_MAX;
+const Weight INFINITE_WEIGHT = static_cast<float>(NO_TABLE_INDEX);
+const unsigned int MAX_SYMBOL_BYTES = 1000;
+
+// This is 2^31, i.e. UINT_MAX/2 rounded up. It cannot be written as
+// (UINT_MAX+1)/2, because UINT_MAX+1 wraps around to 0 in unsigned
+// arithmetic.
+const TransitionTableIndex TARGET_TABLE = 2147483648u;
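+// For instance (unsigned arithmetic sketch):
+//   TransitionTableIndex a = (UINT_MAX + 1u) / 2u; // 0: the sum wraps
+//   TransitionTableIndex b = UINT_MAX / 2u + 1u;   // 2147483648u == TARGET_TABLE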
+
+// the flag diacritic operators as given in
+// Beesley & Karttunen, Finite State Morphology (U of C Press 2003)
+enum FlagDiacriticOperator {P, N, R, D, C, U};
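+// As a mnemonic (a sketch of the Beesley & Karttunen semantics):
+//   P = positive (re)set, e.g. @P.CASE.GEN@ sets feature CASE to GEN
+//   N = negative set, R = require, D = disallow, C = clear, U = unify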
+
+enum HeaderFlag {Weighted, Deterministic, Input_deterministic, Minimized,
+ Cyclic, Has_epsilon_epsilon_transitions,
+ Has_input_epsilon_transitions, Has_input_epsilon_cycles,
+ Has_unweighted_input_epsilon_cycles};
+
+// Utility function for dealing with raw memory
+void skip_c_string(char ** raw);
+
+//! Internal class for Transducer processing.
+
+//! Contains low-level processing stuff.
+class TransducerHeader
+{
+private:
+ SymbolNumber number_of_symbols;
+ SymbolNumber number_of_input_symbols;
+ TransitionTableIndex size_of_transition_index_table;
+ TransitionTableIndex size_of_transition_target_table;
+
+ TransitionTableIndex number_of_states;
+ TransitionTableIndex number_of_transitions;
+
+ bool weighted;
+ bool deterministic;
+ bool input_deterministic;
+ bool minimized;
+ bool cyclic;
+ bool has_epsilon_epsilon_transitions;
+ bool has_input_epsilon_transitions;
+ bool has_input_epsilon_cycles;
+ bool has_unweighted_input_epsilon_cycles;
+ void read_property(bool &property, FILE * f);
+ void read_property(bool &property, char ** raw);
+ void skip_hfst3_header(FILE * f);
+ void skip_hfst3_header(char ** f);
+
+public:
+ //!
+ //! @brief read header from file @a f
+ TransducerHeader(FILE * f);
+
+ //!
+ //! read header from raw memory data @a raw
+ TransducerHeader(char ** raw);
+ //!
+ //! count symbols
+ SymbolNumber symbol_count(void);
+ //!
+ //! count input symbols
+ SymbolNumber input_symbol_count(void);
+ //!
+ //! index table size
+ TransitionTableIndex index_table_size(void);
+ //!
+ //! target table size
+ TransitionTableIndex target_table_size(void);
+ //!
+ //! check for flag
+ bool probe_flag(HeaderFlag flag);
+};
+
+//! Internal class for flag diacritic processing.
+
+//! Contains low-level processing stuff.
+class FlagDiacriticOperation
+{
+private:
+ const FlagDiacriticOperator operation;
+ const SymbolNumber feature;
+ const ValueNumber value;
+public:
+ //!
+ //! Construct a flag diacritic from \@ @a op . @a feat . @a val \@.
+ FlagDiacriticOperation(const FlagDiacriticOperator op,
+ const SymbolNumber feat,
+ const ValueNumber val):
+ operation(op), feature(feat), value(val) {}
+
+ // dummy constructor
+ FlagDiacriticOperation():
+ operation(P), feature(NO_SYMBOL), value(0) {}
+
+ //!
+ //! check if flag
+ bool isFlag(void) const;
+ //!
+ //! get the operator part (P, N, R, D, C or U) of the flag
+ FlagDiacriticOperator Operation(void) const;
+ //!
+ //! get the feature (i.e. flag name) part of the flag
+ SymbolNumber Feature(void) const;
+ //!
+ //! get the value part of the flag
+ ValueNumber Value(void) const;
+
+};
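+
+// For example, the flag diacritic @U.CASE.GEN@ would be represented as
+// FlagDiacriticOperation(U, f, v), where f and v are the reader-assigned
+// numbers of the feature "CASE" and the value "GEN".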
+
+//! Internal class for alphabet processing.
+
+//! Contains low-level processing stuff.
+class TransducerAlphabet
+{
+private:
+ KeyTable kt;
+ OperationMap operations;
+ SymbolNumber unknown_symbol;
+ SymbolNumber identity_symbol;
+ SymbolNumber flag_state_size;
+ SymbolNumber orig_symbol_count;
+ StringSymbolMap string_to_symbol;
+ void process_symbol(char * line);
+
+ void read(FILE * f, SymbolNumber number_of_symbols);
+ void read(char ** raw, SymbolNumber number_of_symbols);
+
+public:
+ //!
+ //! read alphabets from file @a f
+ TransducerAlphabet(FILE *f, SymbolNumber number_of_symbols);
+ //!
+ //! read alphabets from raw data @a raw
+ TransducerAlphabet(char ** raw, SymbolNumber number_of_symbols);
+
+ void add_symbol(std::string & sym);
+ void add_symbol(char * sym);
+ //!
+ //! get alphabet's keytable mapping
+ KeyTable * get_key_table(void);
+ //!
+ //! get the map from symbols to flag diacritic operations
+ OperationMap * get_operation_map(void);
+ //!
+ //! get the size of the flag diacritic state vector
+ SymbolNumber get_state_size(void);
+ //!
+ //! get position of unknown symbol
+ SymbolNumber get_unknown(void) const;
+ SymbolNumber get_identity(void) const;
+ //! get orig symbol count
+ SymbolNumber get_orig_symbol_count(void) const;
+ //!
+ //! get mapping from strings to symbols
+ StringSymbolMap * get_string_to_symbol(void);
+ bool has_string(std::string const & s) const;
+ //!
+ //! check whether the given symbol is a flag diacritic
+ bool is_flag(SymbolNumber symbol);
+};
+
+class LetterTrie;
+typedef std::vector<LetterTrie*> LetterTrieVector;
+
+//! Internal class for alphabet processing.
+
+//! Contains low-level processing stuff.
+class LetterTrie
+{
+private:
+ LetterTrieVector letters;
+ SymbolVector symbols;
+
+public:
+ LetterTrie(void):
+ letters(UCHAR_MAX + 1, static_cast<LetterTrie*>(NULL)),
+ symbols(UCHAR_MAX + 1, NO_SYMBOL)
+ {}
+ //!
+ //! add a string to the trie under the given symbol key
+ void add_string(const char * p, SymbolNumber symbol_key);
+ //!
+ //! find the key of the longest matching prefix, advancing *p past it
+ SymbolNumber find_key(char ** p);
+ ~LetterTrie();
+};
+
+//! Internal class for alphabet processing.
+
+//! Contains low-level processing stuff.
+class Encoder {
+
+private:
+ LetterTrie letters;
+ SymbolVector ascii_symbols;
+
+ void read_input_symbols(KeyTable * kt, SymbolNumber number_of_input_symbols);
+
+public:
+ //!
+ //! create encoder from keytable
+ Encoder(KeyTable * kt, SymbolNumber number_of_input_symbols);
+ SymbolNumber find_key(char ** p);
+ void read_input_symbol(const char * s, const int s_num);
+ void read_input_symbol(std::string const & s, const int s_num);
+};
+
+typedef std::vector<ValueNumber> FlagDiacriticState;
+
+//! Internal class for transition data.
+
+//! Contains low-level processing stuff.
+class TransitionIndex
+{
+protected:
+ SymbolNumber input_symbol; //!< transition's input symbol
+ TransitionTableIndex first_transition_index; //!< first transition location
+
+public:
+
+ //!
+ //! Each TransitionIndex has an input symbol and a target index.
+ static const size_t SIZE =
+ sizeof(SymbolNumber) + sizeof(TransitionTableIndex);
+
+ //!
+ //! Create transition index for symbol
+ TransitionIndex(const SymbolNumber input,
+ const TransitionTableIndex first_transition):
+ input_symbol(input),
+ first_transition_index(first_transition)
+ {}
+ //!
+ //! return target of transition
+ TransitionTableIndex target(void) const;
+ //!
+ //! whether this index marks a final state
+ bool final(void) const;
+ //!
+ //! retrieve final weight
+ Weight final_weight(void) const;
+ //!
+ //! symbol number for transitions input
+ SymbolNumber get_input(void) const;
+};
+
+//! Internal class for transition processing.
+
+//! Contains low-level processing stuff.
+class Transition
+{
+protected:
+ SymbolNumber input_symbol; //!< input symbol
+ SymbolNumber output_symbol; //!< output symbol
+ TransitionTableIndex target_index; //!< location of target of transition
+ Weight transition_weight; //!< transition's weight
+
+public:
+
+ //! Each transition has an input symbol, an output symbol and
+ //! a target index.
+ static const size_t SIZE =
+ 2 * sizeof(SymbolNumber) + sizeof(TransitionTableIndex) + sizeof(Weight);
+
+ //!
+ //! Create transition with input, output, target and weight.
+ Transition(const SymbolNumber input,
+ const SymbolNumber output,
+ const TransitionTableIndex target,
+ const Weight w):
+ input_symbol(input),
+ output_symbol(output),
+ target_index(target),
+ transition_weight(w)
+ {}
+
+ Transition():
+ input_symbol(NO_SYMBOL),
+ output_symbol(NO_SYMBOL),
+ target_index(NO_TABLE_INDEX),
+ transition_weight(INFINITE_WEIGHT)
+ {}
+
+ //!
+ //! get transitions target
+ TransitionTableIndex target(void) const;
+ //!
+ //! get output symbol
+ SymbolNumber get_output(void) const;
+ //!
+ //! get input symbol
+ SymbolNumber get_input(void) const;
+ //!
+ //! get transition weight
+ Weight get_weight(void) const;
+ //!
+ //! whether transition is final
+ bool final(void) const;
+};
+
+//! Internal class for Transducer processing.
+
+//! Contains low-level processing stuff.
+class IndexTable
+{
+private:
+ char * indices;
+
+ void read(FILE * f,
+ TransitionTableIndex number_of_table_entries);
+ void read(char ** raw,
+ TransitionTableIndex number_of_table_entries);
+ TransitionTableIndex size;
+
+public:
+ //!
+ //! read index table from file @a f.
+ IndexTable(FILE * f,
+ TransitionTableIndex number_of_table_entries);
+ //!
+ //! read index table from raw data @a raw.
+ IndexTable(char ** raw,
+ TransitionTableIndex number_of_table_entries);
+ ~IndexTable(void);
+ //!
+ //! input symbol for the index
+ SymbolNumber input_symbol(TransitionTableIndex i) const;
+ //!
+ //! target state location for the index
+ TransitionTableIndex target(TransitionTableIndex i) const;
+ //!
+ //! whether it's a final transition
+ bool final(TransitionTableIndex i) const;
+ //!
+ //! transition's weight
+ Weight final_weight(TransitionTableIndex i) const;
+};
+
+//! Internal class for transition processing.
+
+//! Contains low-level processing stuff.
+class TransitionTable
+{
+protected:
+ //!
+ //! raw transition data
+ char * transitions;
+
+ //!
+ //! read known amount of transitions from file @a f
+ void read(FILE * f,
+ TransitionTableIndex number_of_table_entries);
+ //! read known amount of transitions from raw data @a raw
+ void read(char ** raw,
+ TransitionTableIndex number_of_table_entries);
+ TransitionTableIndex size;
+public:
+ //!
+ //! read transition table from file @a f
+ TransitionTable(FILE * f,
+ TransitionTableIndex transition_count);
+ //!
+ //! read transition table from raw data @a raw
+ TransitionTable(char ** raw,
+ TransitionTableIndex transition_count);
+
+ ~TransitionTable(void);
+ //!
+ //! transition's input symbol
+ SymbolNumber input_symbol(TransitionTableIndex i) const;
+ //!
+ //! transition's output symbol
+ SymbolNumber output_symbol(TransitionTableIndex i) const;
+ //!
+ //! target node location
+ TransitionTableIndex target(TransitionTableIndex i) const;
+ //!
+ //! weight of transition
+ Weight weight(TransitionTableIndex i) const;
+ //!
+ //! whether it's final
+ bool final(TransitionTableIndex i) const;
+
+
+};
+
+template <class printable>
+void debug_print(printable p)
+{
+ if (0) {
+ std::cerr << p;
+ }
+}
+
+} // namespace hfst_ol
+
+#endif // HFST_OSPELL_HFST_OL_H_
diff --git a/hfst-ospell-office.1 b/hfst-ospell-office.1
new file mode 100644
index 0000000..cc6008f
--- /dev/null
+++ b/hfst-ospell-office.1
@@ -0,0 +1,19 @@
+.TH HFST-OSPELL-OFFICE "1" "January 2016" "hfst-ospell-office" "User Commands"
+.SH NAME
+hfst-ospell-office \- Spell checker tool based on HFST
+.SH SYNOPSIS
+.B hfst-ospell-office
+[\fIOPTIONS\fR] [\fIZHFST-ARCHIVE\fR]
+.SH DESCRIPTION
+Use automata in ZHFST\-ARCHIVE or from OPTIONS to check and correct spelling.
+.TP
+\fB\-\-verbatim\fR
+Check the input as-is without any transformations
+.SH "REPORTING BUGS"
+Report bugs to mail@tinodidriksen.com and/or hfst\-bugs@helsinki.fi
+.PP
+hfstospell 0.3.1
+.br
+Jan 11 2016 12:57:25
+.br
+Copyright (C) 2009 \- 2016 University of Helsinki
diff --git a/hfst-ospell.1 b/hfst-ospell.1
new file mode 100644
index 0000000..6f4b372
--- /dev/null
+++ b/hfst-ospell.1
@@ -0,0 +1,58 @@
+.TH HFST-OSPELL "1" "January 2016" "hfst-ospell" "User Commands"
+.SH NAME
+hfst-ospell \- Spell checker tool based on HFST
+.SH SYNOPSIS
+.B hfst-ospell
+[\fIOPTIONS\fR] [\fIZHFST-ARCHIVE\fR]
+.SH DESCRIPTION
+Use automata in ZHFST\-ARCHIVE or from OPTIONS to check and correct spelling.
+.TP
+\fB\-h\fR, \fB\-\-help\fR
+Print this help message
+.TP
+\fB\-V\fR, \fB\-\-version\fR
+Print version information
+.TP
+\fB\-v\fR, \fB\-\-verbose\fR
+Be verbose
+.TP
+\fB\-q\fR, \fB\-\-quiet\fR
+Don't be verbose (default)
+.TP
+\fB\-s\fR, \fB\-\-silent\fR
+Same as quiet
+.TP
+\fB\-a\fR, \fB\-\-analyse\fR
+Analyse strings and corrections
+.TP
+\fB\-n\fR, \fB\-\-limit\fR=\fIN\fR
+Show at most N suggestions
+.TP
+\fB\-w\fR, \fB\-\-max\-weight\fR=\fIW\fR
+Suppress corrections with weights above W
+.TP
+\fB\-b\fR, \fB\-\-beam\fR=\fIW\fR
+Suppress corrections worse than best candidate by more than W
+.TP
+\fB\-t\fR, \fB\-\-time\-cutoff\fR=\fIT\fR
+Stop trying to find better corrections after T seconds (T is a float)
+.TP
+\fB\-S\fR, \fB\-\-suggest\fR
+Suggest corrections to misspellings
+.TP
+\fB\-X\fR, \fB\-\-real\-word\fR
+Also suggest corrections to correct words
+.TP
+\fB\-m\fR, \fB\-\-error\-model\fR
+Use this error model (must also give lexicon as option)
+.TP
+\fB\-l\fR, \fB\-\-lexicon\fR
+Use this lexicon (must also give error model as option)
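+.SH EXAMPLES
+A typical session (the archive name is illustrative; any ZHFST speller works):
+.PP
+.nf
+echo misspeled | hfst\-ospell \-S speller\-en.zhfst
+.fi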
+.SH "REPORTING BUGS"
+Report bugs to hfst\-bugs@helsinki.fi
+.PP
+hfstospell 0.3.1
+.br
+Jan 11 2016 12:57:25
+.br
+Copyright (C) 2009 \- 2016 University of Helsinki
diff --git a/hfstospell.pc.in b/hfstospell.pc.in
new file mode 100644
index 0000000..04259a3
--- /dev/null
+++ b/hfstospell.pc.in
@@ -0,0 +1,10 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: hfstospell
+Description: Finite-state transducer based spell checker library
+Version: @HFSTOSPELL_VERSION@
+Libs: -L${libdir} -l at HFSTOSPELL_NAME@
+Cflags: -I${includedir}
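+
+# Consumers can compile and link against the installed library via
+# pkg-config, e.g. (an illustrative sketch):
+#   c++ $(pkg-config --cflags hfstospell) prog.cc $(pkg-config --libs hfstospell)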
diff --git a/m4/ax_check_compile_flag.m4 b/m4/ax_check_compile_flag.m4
new file mode 100644
index 0000000..51df0c0
--- /dev/null
+++ b/m4/ax_check_compile_flag.m4
@@ -0,0 +1,74 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT])
+#
+# DESCRIPTION
+#
+# Check whether the given FLAG works with the current language's compiler
+# or gives an error. (Warnings, however, are ignored)
+#
+# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
+# success/failure.
+#
+# If EXTRA-FLAGS is defined, it is added to the current language's default
+# flags (e.g. CFLAGS) when the check is done. The check is thus made with
+# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to
+# force the compiler to issue an error when a bad flag is given.
+#
+# INPUT gives an alternative input source to AC_COMPILE_IFELSE.
+#
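+# For example, a typical use (an illustrative sketch):
+#
+#   AX_CHECK_COMPILE_FLAG([-Wextra],
+#     [CXXFLAGS="$CXXFLAGS -Wextra"],
+#     [AC_MSG_WARN([compiler does not accept -Wextra])])
+#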
+# NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this
+# macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Guido U. Draheim <guidod at gmx.de>
+# Copyright (c) 2011 Maarten Bosmans <mkbosmans at gmail.com>
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 3
+
+AC_DEFUN([AX_CHECK_COMPILE_FLAG],
+[AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX
+AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
+AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
+ ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
+ _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
+ AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])],
+ [AS_VAR_SET(CACHEVAR,[yes])],
+ [AS_VAR_SET(CACHEVAR,[no])])
+ _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
+AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes],
+ [m4_default([$2], :)],
+ [m4_default([$3], :)])
+AS_VAR_POPDEF([CACHEVAR])dnl
+])dnl AX_CHECK_COMPILE_FLAG
diff --git a/main-cicling.cc b/main-cicling.cc
new file mode 100644
index 0000000..a22a325
--- /dev/null
+++ b/main-cicling.cc
@@ -0,0 +1,210 @@
+/*
+
+ Copyright 2009 University of Helsinki
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+*/
+
+/*
+ This is a toy commandline utility for testing spellers on standard io.
+ */
+
+#include "ospell.h"
+#include <getopt.h>
+#include <cassert>
+#include <math.h>
+
+#define PACKAGE_NAME "hfst-ospell"
+#define PACKAGE_STRING "hfst-ospell 0.1"
+#define PACKAGE_BUGREPORT "hfst-bugs@ling.helsinki.fi"
+
+bool print_usage(void)
+{
+ std::cerr <<
+ "\n" <<
+ "Usage: " << PACKAGE_NAME << " [OPTIONS] ERRORSOURCE LEXICON\n" <<
+ "Run a composition of ERRORSOURCE and LEXICON on standard input and\n" <<
+ "print corrected output\n" <<
+ "\n" <<
+ " -h, --help Print this help message\n" <<
+ " -V, --version Print version information\n" <<
+ " -v, --verbose Be verbose\n" <<
+ " -q, --quiet Don't be verbose (default)\n" <<
+ " -s, --silent Same as quiet\n" <<
+ "\n" <<
+ "\n" <<
+ "Report bugs to " << PACKAGE_BUGREPORT << "\n" <<
+ "\n";
+ return true;
+}
+
+bool print_version(void)
+{
+ std::cerr <<
+ "\n" <<
+ PACKAGE_STRING << std::endl <<
+ __DATE__ << " " __TIME__ << std::endl <<
+ "copyright (C) 2009 University of Helsinki\n";
+ return true;
+}
+
+bool print_short_help(void)
+{
+ print_usage();
+ return true;
+}
+
+int main(int argc, char **argv)
+{
+
+ FILE * mutator_file = NULL;
+ FILE * lexicon_file = NULL;
+
+ int c;
+ bool verbose = false;
+
+ while (true)
+ {
+ static struct option long_options[] =
+ {
+ // first the hfst-mandated options
+ {"help", no_argument, 0, 'h'},
+ {"version", no_argument, 0, 'V'},
+ {"verbose", no_argument, 0, 'v'},
+ {"quiet", no_argument, 0, 'q'},
+ {"silent", no_argument, 0, 's'},
+ {0, 0, 0, 0 }
+ };
+
+ int option_index = 0;
+ c = getopt_long(argc, argv, "hVvqs", long_options, &option_index);
+
+ if (c == -1) // no more options to look at
+ break;
+
+ switch (c) {
+ case 'h':
+ print_usage();
+ return EXIT_SUCCESS;
+ break;
+
+ case 'V':
+ print_version();
+ return EXIT_SUCCESS;
+ break;
+
+ case 'v':
+ verbose = true;
+ break;
+
+ case 'q': // fallthrough
+ case 's':
+ break;
+
+ default:
+ std::cerr << "Invalid option\n\n";
+ print_short_help();
+ return EXIT_FAILURE;
+ break;
+ }
+ }
+ // no more options, we should now be at the input filenames
+ if ( (optind + 2) < argc) {
+ std::cerr << "More than two input files given\n";
+ return EXIT_FAILURE;
+ } else if ( (optind + 2) > argc)
+ {
+ std::cerr << "Need two input files\n";
+ return EXIT_FAILURE;
+ } else {
+ mutator_file = fopen(argv[(optind)], "r");
+ if (mutator_file == NULL) {
+ std::cerr << "Could not open file " << argv[(optind)]
+ << std::endl;
+ return 1;
+ }
+ lexicon_file = fopen(argv[(optind + 1)], "r");
+ if (lexicon_file == NULL) {
+ std::cerr << "Could not open file " << argv[(optind + 1)]
+ << std::endl;
+ return 1;
+ }
+ }
+ hfst_ol::Transducer * mutator;
+ hfst_ol::Transducer * lexicon;
+ mutator = new hfst_ol::Transducer(mutator_file);
+ if (!mutator->is_weighted()) {
+ std::cerr << "Error source was unweighted, exiting\n\n";
+ return EXIT_FAILURE;
+ }
+ lexicon = new hfst_ol::Transducer(lexicon_file);
+ if (!lexicon->is_weighted()) {
+ std::cerr << "Lexicon was unweighted, exiting\n\n";
+ return EXIT_FAILURE;
+ }
+
+ hfst_ol::Speller * speller;
+
+ try {
+ speller = new hfst_ol::Speller(mutator, lexicon);
+ } catch (hfst_ol::AlphabetTranslationException& e) {
+ std::cerr <<
+ "Unable to build speller - symbol " << e.what() << " not "
+ "present in lexicon's alphabet\n";
+ return EXIT_FAILURE;
+ }
+
+ char * str = (char*) malloc(65535);
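+ // Each input line is expected to carry three tab-separated fields,
+ // MISSPELT<TAB>CORRECT<TAB>CONTEXT (inferred from the strtok() calls
+ // below); the asserts abort on lines that do not match.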
+ while (!std::cin.eof()) {
+ std::cin.getline(str, 65535);
+ if (str[0] == '\0') {
+ continue;
+ }
+ // n += 1
+ char* p = strdup(str);
+ char* tok = strtok(p, "\t");
+ assert(tok != NULL);
+ char* mispelt = strdup(tok);
+ tok = strtok(NULL, "\t");
+ assert(tok != NULL);
+ char* corr = strdup(tok);
+ tok = strtok(NULL, "\t");
+ assert(tok != NULL);
+ char* context = strdup(tok);
+ // unknown += (corr in NWORDS)
+ hfst_ol::CorrectionQueue corrections = speller->correct(mispelt);
+ if (corrections.size() == 0)
+ {
+ // correction too far
+ fprintf(stdout, "%s\t%s\t%s[inf]\t%s\n",
+ mispelt, corr, mispelt, context);
+ }
+ else
+ {
+ fprintf(stdout, "%s\t%s", mispelt, corr);
+ if (speller->check(mispelt))
+ {
+ fprintf(stdout, "\t%s[0]", mispelt);
+ }
+ while (corrections.size() > 0)
+ {
+ fprintf(stdout, "\t%s[%f]", corrections.top().first.c_str(),
+ corrections.top().second);
+ corrections.pop();
+ }
+ fprintf(stdout, "\t%s\n", context);
+ } // corrections size != 0
+ }
+ return EXIT_SUCCESS;
+}
diff --git a/main-fsmnlp-2012.cc b/main-fsmnlp-2012.cc
new file mode 100644
index 0000000..06de881
--- /dev/null
+++ b/main-fsmnlp-2012.cc
@@ -0,0 +1,440 @@
+/*
+
+ Copyright 2009 University of Helsinki
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+*/
+
+/*
+ This is a toy commandline utility for testing spellers on standard io.
+ */
+
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif
+#if HAVE_GETOPT_H
+# include <getopt.h>
+#endif
+
+#include <time.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <errno.h>
+
+#include "ol-exceptions.h"
+#include "ospell.h"
+#include "ZHfstOspeller.h"
+
+
+using hfst_ol::ZHfstOspeller;
+using hfst_ol::Transducer;
+
+static bool quiet = false;
+static bool verbose = false;
+static FILE* profile_file;
+clock_t profile_start, profile_end;
+
+bool print_usage(void)
+{
+ std::cerr <<
+ "\n" <<
+ "Usage: " << PACKAGE_NAME << " [OPTIONS] ERRORSOURCE LEXICON\n" <<
+ " " << PACKAGE_NAME << " [OPTIONS] ZHFST-ARCHIVE\n" <<
+ "Run a composition of ERRORSOURCE and LEXICON on standard input and\n" <<
+ "print corrected output\n" <<
+ "Second form seeks error sources and lexicons from the ZHFST-ARCHIVE\n"
+ "\n" <<
+ " -h, --help Print this help message\n" <<
+ " -V, --version Print version information\n" <<
+ " -v, --verbose Be verbose\n" <<
+ " -q, --quiet Don't be verbose (default)\n" <<
+ " -s, --silent Same as quiet\n" <<
+ "\n" <<
+ "\n" <<
+ "Report bugs to " << PACKAGE_BUGREPORT << "\n" <<
+ "\n";
+ return true;
+}
+
+bool print_version(void)
+{
+ std::cerr <<
+ "\n" <<
+ PACKAGE_STRING << std::endl <<
+ __DATE__ << " " __TIME__ << std::endl <<
+ "copyright (C) 2009 - 2011 University of Helsinki\n";
+ return true;
+}
+
+bool print_short_help(void)
+{
+ print_usage();
+ return true;
+}
+
+int
+legacy_spell(const char* errmodel_filename, const char* acceptor_filename)
+{
+ FILE* mutator_file = fopen(errmodel_filename, "r");
+ if (mutator_file == NULL) {
+ std::cerr << "Could not open file " << errmodel_filename
+ << std::endl;
+ return EXIT_FAILURE;
+ }
+ FILE* lexicon_file = fopen(acceptor_filename, "r");
+ if (lexicon_file == NULL) {
+ std::cerr << "Could not open file " << acceptor_filename
+ << std::endl;
+ return EXIT_FAILURE;
+ }
+ hfst_ol::Transducer * mutator;
+ hfst_ol::Transducer * lexicon;
+ mutator = new hfst_ol::Transducer(mutator_file);
+ if (!mutator->is_weighted()) {
+ std::cerr << "Error source was unweighted, exiting\n\n";
+ return EXIT_FAILURE;
+ }
+ lexicon = new hfst_ol::Transducer(lexicon_file);
+ if (!lexicon->is_weighted()) {
+ std::cerr << "Lexicon was unweighted, exiting\n\n";
+ return EXIT_FAILURE;
+ }
+
+ hfst_ol::Speller * speller;
+
+ try {
+ speller = new hfst_ol::Speller(mutator, lexicon);
+ } catch (hfst_ol::AlphabetTranslationException& e) {
+ std::cerr <<
+ "Unable to build speller - symbol " << e.what() << " not "
+ "present in lexicon's alphabet\n";
+ return EXIT_FAILURE;
+ }
+ char * str = (char*) malloc(2000);
+
+ while (!std::cin.eof()) {
+ std::cin.getline(str, 2000);
+ if (speller->check(str)) {
+ std::cout << "\"" << str << "\" is in the lexicon\n\n";
+ } else {
+ hfst_ol::CorrectionQueue corrections = speller->correct(str);
+ if (corrections.size() > 0) {
+ std::cout << "Corrections for \"" << str << "\":\n";
+ while (corrections.size() > 0)
+ {
+ std::cout << corrections.top().first << " " << corrections.top().second << std::endl;
+ corrections.pop();
+ }
+ std::cout << std::endl;
+ } else {
+ std::cout << "Unable to correct \"" << str << "\"!\n\n";
+ }
+ }
+ }
+ return EXIT_SUCCESS;
+
+}
+
+int
+fallback_spell(const char* errmodel_filename1, const char* errmodel_filename2,
+ const char* acceptor_filename)
+{
+ FILE* mutator_file1 = fopen(errmodel_filename1, "r");
+ if (mutator_file1 == NULL) {
+ std::cerr << "Could not open file " << errmodel_filename1
+ << std::endl;
+ return EXIT_FAILURE;
+ }
+ FILE* mutator_file2 = fopen(errmodel_filename2, "r");
+ if (mutator_file2 == NULL) {
+ std::cerr << "Could not open file " << errmodel_filename2
+ << std::endl;
+ return EXIT_FAILURE;
+ }
+ FILE* lexicon_file = fopen(acceptor_filename, "r");
+ if (lexicon_file == NULL) {
+ std::cerr << "Could not open file " << acceptor_filename
+ << std::endl;
+ return EXIT_FAILURE;
+ }
+ hfst_ol::Transducer * mutator1;
+ hfst_ol::Transducer * mutator2;
+ hfst_ol::Transducer * lexicon;
+ mutator1 = new hfst_ol::Transducer(mutator_file1);
+ if (!mutator1->is_weighted()) {
+ std::cerr << "Error source was unweighted, exiting\n\n";
+ return EXIT_FAILURE;
+ }
+ mutator2 = new hfst_ol::Transducer(mutator_file2);
+ if (!mutator2->is_weighted()) {
+ std::cerr << "Error source was unweighted, exiting\n\n";
+ return EXIT_FAILURE;
+ }
+ lexicon = new hfst_ol::Transducer(lexicon_file);
+ if (!lexicon->is_weighted()) {
+ std::cerr << "Lexicon was unweighted, exiting\n\n";
+ return EXIT_FAILURE;
+ }
+
+ hfst_ol::Speller * speller1;
+ hfst_ol::Speller * speller2;
+
+ try {
+ speller1 = new hfst_ol::Speller(mutator1, lexicon);
+ } catch (hfst_ol::AlphabetTranslationException& e) {
+ std::cerr <<
+ "Unable to build speller - symbol " << e.what() << " not "
+ "present in lexicon's alphabet\n";
+ return EXIT_FAILURE;
+ }
+ try {
+ speller2 = new hfst_ol::Speller(mutator2, lexicon);
+ } catch (hfst_ol::AlphabetTranslationException& e) {
+ std::cerr <<
+ "Unable to build speller - symbol " << e.what() << " not "
+ "present in lexicon's alphabet\n";
+ return EXIT_FAILURE;
+ }
+ char * str = (char*) malloc(2000);
+
+ while (!std::cin.eof()) {
+ std::cin.getline(str, 2000);
+ if (speller1->check(str)) {
+ std::cout << "\"" << str << "\" is in the lexicon 1\n\n";
+ } else {
+ hfst_ol::CorrectionQueue corrections1 = speller1->correct(str);
+ if (corrections1.size() > 0) {
+ std::cout << "Corrections for \"" << str << "\" w/ source 1:\n";
+ while (corrections1.size() > 0)
+ {
+ std::cout << corrections1.top().first << " " << corrections1.top().second << std::endl;
+ corrections1.pop();
+ }
+ std::cout << std::endl;
+ } else {
+ hfst_ol::CorrectionQueue corrections2 = speller2->correct(str);
+ if (corrections2.size() > 0) {
+ std::cout << "Corrections for \"" << str << "\" w/ source 2:\n";
+ while (corrections2.size() > 0)
+ {
+ std::cout << corrections2.top().first << " " << corrections2.top().second << std::endl;
+ corrections2.pop();
+ }
+ std::cout << std::endl;
+ } else {
+ std::cout << "Unable to correct \"" << str << "\"!\n\n";
+ }
+ }
+ }
+ }
+ return EXIT_SUCCESS;
+
+}
+
+int
+zhfst_spell(char* zhfst_filename)
+{
+ ZHfstOspeller speller;
+ try
+ {
+ speller.read_zhfst(zhfst_filename);
+ }
+ catch (hfst_ol::ZHfstZipReadingError zhzre)
+ {
+ std::cerr << "cannot read zhfst archive " << zhfst_filename << ":"
+ << std::endl
+ << zhzre.what() << "." << std::endl
+ << "trying to read as legacy automata directory" << std::endl;
+ speller.read_legacy(zhfst_filename);
+ }
+ catch (hfst_ol::ZHfstLegacyReadingError zhlre)
+ {
+ std::cerr << "cannot read legacy hfst speller dir " << zhfst_filename
+ << ":" << std::endl
+ << zhlre.what() << "." << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ if (verbose)
+ {
+ std::cout << "Following metadata was read from ZHFST archive:" << std::endl
+ << speller.metadata_dump() << std::endl;
+ }
+ char * str = (char*) malloc(2000);
+
+ while (!std::cin.eof()) {
+ std::cin.getline(str, 2000);
+ if (str[0] == '\0') {
+ break;
+ }
+ if (speller.spell(str)) {
+ std::cout << "\"" << str << "\" is in the lexicon\n\n";
+ } else {
+ hfst_ol::CorrectionQueue corrections = speller.suggest(str);
+ if (corrections.size() > 0) {
+ std::cout << "Corrections for \"" << str << "\":\n";
+ while (corrections.size() > 0)
+ {
+ std::cout << corrections.top().first << " " << corrections.top().second << std::endl;
+ corrections.pop();
+ }
+ std::cout << std::endl;
+ } else {
+ std::cout << "Unable to correct \"" << str << "\"!\n\n";
+ }
+ }
+ }
+ return EXIT_SUCCESS;
+}
+
+void
+hfst_print_profile_line()
+ {
+ if (profile_file == 0)
+ {
+ return;
+ }
+ fprintf(profile_file, "ospell");
+ clock_t profile_end = clock();
+ fprintf(profile_file, "\t%f", ((float)(profile_end - profile_start))
+ / CLOCKS_PER_SEC);
+ struct rusage* usage = static_cast<struct rusage*>
+ (malloc(sizeof(struct rusage)));
+ errno = 0;
+ int rv = getrusage(RUSAGE_SELF, usage);
+ if (rv != -1)
+ {
+ fprintf(profile_file, "\t%lu.%lu\t%lu.%lu"
+ "\t%ld\t%ld\t%ld"
+ "\t%ld"
+ "\t%ld\t%ld\t%ld"
+ "\t%ld\t%ld"
+ "\t%ld\t%ld"
+ "\t%ld"
+ "\t%ld\t%ld",
+ usage->ru_utime.tv_sec, usage->ru_utime.tv_usec,
+ usage->ru_stime.tv_sec, usage->ru_stime.tv_usec,
+ usage->ru_maxrss, usage->ru_ixrss, usage->ru_idrss,
+ usage->ru_isrss,
+ usage->ru_minflt, usage->ru_majflt, usage->ru_nswap,
+ usage->ru_inblock, usage->ru_oublock,
+ usage->ru_msgsnd, usage->ru_msgrcv,
+ usage->ru_nsignals,
+ usage->ru_nvcsw, usage->ru_nivcsw);
+ }
+ else
+ {
+ fprintf(profile_file, "\tgetrusage: %s", strerror(errno));
+ }
+ fprintf(profile_file, "\n");
+ }
+
+
+int main(int argc, char **argv)
+{
+
+ int c;
+
+#if HAVE_GETOPT_H
+ while (true) {
+ static struct option long_options[] =
+ {
+ // first the hfst-mandated options
+ {"help", no_argument, 0, 'h'},
+ {"version", no_argument, 0, 'V'},
+ {"verbose", no_argument, 0, 'v'},
+ {"quiet", no_argument, 0, 'q'},
+ {"silent", no_argument, 0, 's'},
+ {"profile", required_argument, 0, 'p'},
+ {0, 0, 0, 0 }
+ };
+
+ int option_index = 0;
+ c = getopt_long(argc, argv, "hVvqsp:", long_options, &option_index);
+
+ if (c == -1) // no more options to look at
+ break;
+
+ switch (c) {
+ case 'h':
+ print_usage();
+ return EXIT_SUCCESS;
+ break;
+
+ case 'V':
+ print_version();
+ return EXIT_SUCCESS;
+ break;
+
+ case 'v':
+ verbose = true;
+ quiet = false;
+ break;
+
+ case 'p':
+ profile_file = fopen(optarg, "a");
+ if (NULL == profile_file)
+ {
+ perror("Couldn't open profiling file for appending");
+ }
+ profile_start = clock();
+ break;
+ case 'q': // fallthrough
+ case 's':
+ quiet = true;
+ verbose = false;
+ break;
+
+ default:
+ std::cerr << "Invalid option\n\n";
+ print_short_help();
+ return EXIT_FAILURE;
+ break;
+ }
+ }
+#else
+ int optind = 1;
+#endif
+ // no more options, we should now be at the input filenames
+ int rv = EXIT_SUCCESS;
+ if (optind == (argc - 3))
+ {
+ rv = fallback_spell(argv[optind], argv[optind+1], argv[optind+2]);
+ }
+ else if (optind == (argc - 2))
+ {
+ rv = legacy_spell(argv[optind], argv[optind+1]);
+ }
+ else if (optind == (argc - 1))
+ {
+ rv = zhfst_spell(argv[optind]);
+ }
+ else if (optind < (argc - 3))
+ {
+ std::cerr << "No more than three free parameters allowed" << std::endl;
+ print_short_help();
+ return EXIT_FAILURE;
+ }
+ else if (optind >= argc)
+ {
+ std::cerr << "Give full path to zhfst spellers or two automata"
+ << std::endl;
+ print_short_help();
+ return EXIT_FAILURE;
+ }
+ hfst_print_profile_line();
+ return rv;
+ }
diff --git a/main-ispell.cc b/main-ispell.cc
new file mode 100644
index 0000000..1e037e6
--- /dev/null
+++ b/main-ispell.cc
@@ -0,0 +1,455 @@
+/*
+
+ Copyright 2009 University of Helsinki
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+*/
+
+/*
+ This is a toy commandline utility for testing spellers on standard io.
+ */
+
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif
+#if HAVE_GETOPT_H
+# include <getopt.h>
+#endif
+#if HAVE_ERROR_H
+# include <error.h>
+#else
+# define error(status, errnum, fmt, ...) do { \
+ fprintf(stderr, fmt, ##__VA_ARGS__); \
+ if (status != 0) exit(status); } while (0)
+#endif
+
+#include "ol-exceptions.h"
+#include "ospell.h"
+#include "ZHfstOspeller.h"
+
+using hfst_ol::ZHfstOspeller;
+using hfst_ol::Transducer;
+
+//static bool quiet = false;
+static bool verbose = false;
+
+char*
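+// Look for speller-LANGCODE.zhfst first in the current directory, then in
+// /usr/share/voikko/3/, then under $HOME/.voikko/3/; returns a malloc'd
+// path, or NULL when no readable archive is found.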
+find_dicts(const char* langcode)
+ {
+ FILE* testhandle = NULL;
+ char* testname = (char*)malloc(sizeof(char) *
+ (strlen(langcode) + strlen("speller-.zhfst") + 1));
+ int rv = sprintf(testname, "speller-%s.zhfst", langcode);
+ if (rv < 0)
+ {
+ perror("formatting path");
+ }
+ testhandle = fopen(testname, "r");
+ if (testhandle != NULL)
+ {
+ fclose(testhandle);
+ return testname;
+ }
+ free(testname);
+ testname = (char*)malloc(sizeof(char) *
+ (strlen(langcode) +
+ strlen("/usr/share/voikko/3/speller-.zhfst") + 1));
+ rv = sprintf(testname, "/usr/share/voikko/3/speller-%s.zhfst",
+ langcode);
+ if (rv < 0)
+ {
+ perror("formatting path");
+ }
+ testhandle = fopen(testname, "r");
+ if (testhandle != NULL)
+ {
+ fclose(testhandle);
+ return testname;
+ }
+ free(testname);
+ char* homepath = getenv("HOME");
+ if (homepath == NULL)
+ {
+ return NULL;
+ }
+ testname = (char*)malloc(sizeof(char) *
+ (strlen(homepath) + strlen("/.voikko/3/speller-.zhfst") +
+ strlen(langcode) + 1));
+ rv = sprintf(testname, "%s/.voikko/3/speller-%s.zhfst", homepath,
+ langcode);
+ if (rv < 0)
+ {
+ perror("formatting path");
+ }
+ testhandle = fopen(testname, "r");
+ if (testhandle != NULL)
+ {
+ fclose(testhandle);
+ return testname;
+ }
+ free(testname);
+ return NULL;
+ }
+
+bool print_usage(void)
+ {
+ fprintf(stdout, "Usage: %s [OPTION]... [FILE]...\n"
+ "Check spelling of each FILE. Without FILE, check standard input."
+ "\n\n", "hfst-ispell");
+ fprintf(stdout,
+ " -1 check only first field in lines "
+ "(delimiter = tabulator)\n"
+ " -a Ispell's pipe interface\n"
+ " --check-url Check URLs, email addresses and directory paths\n"
+ " -d d[,d2,...] used d (d2 etc.) dictionaries\n"
+ " -D show available dictionaries\n"
+ " -G print only correct words or lines\n"
+ " -h, --help display this help and exit\n"
+ " -l print mispelled words\n"
+ " -L print lines with mispelled words\n"
+ " -v, --version print version number\n"
+ " -vv print Ispell compatible version number\n"
+ " -w print misspelled words (= lines) "
+ "from one word/line input\n"
+ "\n");
+ fprintf(stdout, "Examples: %s -d fi file.txt\n"
+ " %s -l file.txt\n\n", "hfst-ispell", "hfst-ispell");
+ fprintf(stdout, "Report bugs to " PACKAGE_BUGREPORT "\n");
+ return true;
+ }
+
+bool print_version(bool ispell_strict)
+ {
+ fprintf(stdout, "@(#) International Ispell Version 3.2.06 (but really "
+ PACKAGE_STRING ")\n\n");
+ if (!ispell_strict)
+ {
+ fprintf(stdout, "Copyright (C) 2013 University of Helsinki. APL\n");
+ fprintf(stdout,
+ "This is free software; see the source for copying conditions. "
+ " There is NO\n"
+ "warranty; not even for MERCHANTABILITY or FITNESS FOR A "
+ " PARTICULAR PURPOSE,\n"
+ "to the extent permitted by law.\n");
+ }
+ return true;
+ }
+
+bool print_short_help(void)
+ {
+ print_usage();
+ return true;
+ }
+
+static
+void
+print_correct(const char* /*s*/)
+ {
+ fprintf(stdout, "*\n");
+ }
+
+static
+void
+print_corrections(const char* s, hfst_ol::CorrectionQueue& c)
+ {
+ fprintf(stdout, "& %s %zu %d: ", s, c.size(), 0);
+ bool comma = false;
+ while (c.size() > 0)
+ {
+ if (comma)
+ {
+ fprintf(stdout, ", ");
+ }
+ fprintf(stdout, "%s", c.top().first.c_str());
+ comma = true;
+ c.pop();
+ }
+ fprintf(stdout, "\n");
+ }
+
+static
+void
+print_no_corrects(const char* s)
+ {
+ fprintf(stdout, "# %s %d\n", s, 0);
+ }
+
+int
+legacy_spell(const char* errmodel_filename, const char* acceptor_filename)
+ {
+ FILE* mutator_file = fopen(errmodel_filename, "r");
+ if (mutator_file == NULL)
+ {
+ std::cerr << "Could not open file " << errmodel_filename
+ << std::endl;
+ return EXIT_FAILURE;
+ }
+ FILE* lexicon_file = fopen(acceptor_filename, "r");
+ if (lexicon_file == NULL)
+ {
+ std::cerr << "Could not open file " << acceptor_filename
+ << std::endl;
+ return EXIT_FAILURE;
+ }
+ hfst_ol::Transducer * mutator;
+ hfst_ol::Transducer * lexicon;
+ mutator = new hfst_ol::Transducer(mutator_file);
+ if (!mutator->is_weighted())
+ {
+ std::cerr << "Error source was unweighted, exiting\n\n";
+ return EXIT_FAILURE;
+ }
+ lexicon = new hfst_ol::Transducer(lexicon_file);
+ if (!lexicon->is_weighted())
+ {
+ std::cerr << "Lexicon was unweighted, exiting\n\n";
+ return EXIT_FAILURE;
+ }
+ hfst_ol::Speller * speller;
+ try
+ {
+ speller = new hfst_ol::Speller(mutator, lexicon);
+ }
+ catch (hfst_ol::AlphabetTranslationException& e)
+ {
+ std::cerr <<
+ "Unable to build speller - symbol " << e.what() << " not "
+ "present in lexicon's alphabet\n";
+ return EXIT_FAILURE;
+ }
+ char * str = (char*) malloc(2000);
+ while (!std::cin.eof())
+ {
+ std::cin.getline(str, 2000);
+ if (str[0] == '\0')
+ {
+ fprintf(stdout, "\n");
+ continue;
+ }
+ if (str[strlen(str) - 1] == '\r')
+ {
+ fprintf(stderr, "\\r is not allowed\n");
+ exit(1);
+ }
+ if (speller->check(str))
+ {
+ print_correct(str);
+ }
+ else
+ {
+ hfst_ol::CorrectionQueue corrections = speller->correct(str, 5);
+ if (corrections.size() > 0)
+ {
+ print_corrections(str, corrections);
+ }
+ else
+ {
+ print_no_corrects(str);
+ }
+ }
+ }
+ return EXIT_SUCCESS;
+ }
+
+int
+zhfst_spell(char* zhfst_filename, FILE* input)
+ {
+ ZHfstOspeller speller;
+ try
+ {
+ speller.read_zhfst(zhfst_filename);
+ }
+ catch (hfst_ol::ZHfstMetaDataParsingError zhmdpe)
+ {
+ std::cerr << "cannot finish reading zhfst archive " << zhfst_filename <<
+ ":" << zhmdpe.what() << "." << std::endl;
+ return EXIT_FAILURE;
+ }
+ catch (hfst_ol::ZHfstZipReadingError zhzre)
+ {
+ std::cerr << "cannot read zhfst archive " << zhfst_filename << ":"
+ << zhzre.what() << "." << std::endl
+ << "trying to read as legacy automata directory" << std::endl;
+ try
+ {
+ speller.read_legacy(zhfst_filename);
+ }
+ catch (hfst_ol::ZHfstLegacyReadingError zhlre)
+ {
+ std::cerr << "cannot fallback to read legacy hfst speller dir "
+ << zhfst_filename
+ << ":" << std::endl
+ << zhlre.what() << "." << std::endl;
+ return EXIT_FAILURE;
+ }
+ }
+ if (verbose)
+ {
+ std::cout << "Following metadata was read from ZHFST archive:" << std::endl
+ << speller.metadata_dump() << std::endl;
+ }
+ char* str = NULL;
+ size_t len = 0;
+ while (getline(&str, &len, input) != -1)
+ {
+ if (str[0] == '\0')
+ {
+ break;
+ }
+ if (str[strlen(str) - 1] == '\r')
+ {
+ fprintf(stderr, "\\r is not allowed\n");
+ exit(1);
+ }
+ else if (str[strlen(str) - 1] == '\n')
+ {
+ str[strlen(str) - 1] = '\0';
+ }
+ if (speller.spell(str))
+ {
+ print_correct(str);
+ }
+ else
+ {
+ hfst_ol::CorrectionQueue corrections = speller.suggest(str);
+ if (corrections.size() > 0)
+ {
+ print_corrections(str, corrections);
+ }
+ else
+ {
+ print_no_corrects(str);
+ }
+ }
+ }
+ free(str);
+ return EXIT_SUCCESS;
+}
+
+int main(int argc, char **argv)
+{
+
+ int c;
+ char* langcode = 0;
+ //std::locale::global(std::locale(""));
+ int version = 0;
+#if HAVE_GETOPT_H
+ while (true) {
+ static struct option long_options[] =
+ {
+ // first the hfst-mandated options
+ {"help", no_argument, 0, 'h'},
+ {"version", no_argument, 0, 'v'},
+ {"one", no_argument, 0, '1'},
+ {"ispell", no_argument, 0, 'a'},
+ {"check-url", no_argument, 0, 'X'},
+ {"dictionary", required_argument, 0, 'd'},
+ {"list", no_argument, 0, 'D'},
+ {"mispelt", no_argument, 0, 'l'},
+ {"misslines", no_argument, 0, 'L'},
+ {"wordperline", no_argument, 0, 'w'},
+ {0, 0, 0, 0 }
+ };
+
+ int option_index = 0;
+ c = getopt_long(argc, argv, "1ad:DGhvlLw", long_options, &option_index);
+
+ if (c == -1) // no more options to look at
+ break;
+
+ switch (c)
+ {
+ case 'h':
+ print_usage();
+ return EXIT_SUCCESS;
+ break;
+ case 'V':
+ version += 1;
+ break;
+ case 'v':
+ version += 1;
+ break;
+ case 'd':
+ langcode = optarg;
+ break;
+ default:
+ std::cerr << "Invalid option\n\n";
+ print_short_help();
+ return EXIT_FAILURE;
+ break;
+ }
+ }
+ if (version == 1)
+ {
+ print_version(false);
+ return EXIT_SUCCESS;
+ }
+ else if (version == 2)
+ {
+ print_version(true);
+ return EXIT_SUCCESS;
+ }
+ else if (version >= 3)
+ {
+ fprintf(stdout, "Come on, really?\n");
+ exit(version);
+ }
+#else
+ int optind = 1;
+#endif
+ // find the dicts
+ char* zhfst_file = 0;
+ if (NULL == langcode)
+ {
+ fprintf(stderr, "Currently -d is required since I'm too lazy to check "
+ "locale\n");
+ exit(1);
+ }
+ else
+ {
+ zhfst_file = find_dicts(langcode);
+ if (NULL == zhfst_file)
+ {
+ fprintf(stderr, "Could not find dictionary %s in standard "
+ "locations\n"
+ "Please install one of:\n"
+ " /usr/share/voikko/3/speller-%s.zhfst\n"
+ " $HOME/.voikko/3/speller-%s.zhfst\n"
+ " ./speller-%s.zhfst\n",
+ langcode, langcode, langcode, langcode);
+ exit(1);
+ }
+ }
+ // no more options, we should now be at the input filenames
+ if (optind == argc)
+ {
+ return zhfst_spell(zhfst_file, stdin);
+ }
+ else if (optind < argc)
+ {
+ while (optind < argc)
+ {
+ FILE* infile = fopen(argv[optind], "r");
+ if (NULL == infile)
+ {
+ fprintf(stderr, "Could not open %s for reading",
+ argv[optind]);
+ exit(1);
+ }
+ zhfst_spell(zhfst_file, infile);
+ optind++;
+ }
+ }
+ return EXIT_SUCCESS;
+ }
diff --git a/main-lrec2013.cc b/main-lrec2013.cc
new file mode 100644
index 0000000..79caaed
--- /dev/null
+++ b/main-lrec2013.cc
@@ -0,0 +1,518 @@
+/*
+
+ Copyright 2009 University of Helsinki
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+*/
+
+/*
+ This is a toy commandline utility for testing spellers on standard io.
+ */
+
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif
+#if HAVE_GETOPT_H
+# include <getopt.h>
+#endif
+#if HAVE_ERROR_H
+# include <error.h>
+#else
+#define error(status, errnum, format, ...) do { \
+ fprintf(stderr, format, ##__VA_ARGS__); \
+ if (status != 0) exit(status); } while (0)
+#endif
+#include <time.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <errno.h>
+
+#include "ol-exceptions.h"
+#include "ospell.h"
+#include "ZHfstOspeller.h"
+
+using hfst_ol::ZHfstOspeller;
+using hfst_ol::Transducer;
+
+static bool quiet = false;
+static bool verbose = false;
+
+static FILE* profile_file = 0;
+static FILE* histogram_file = 0;
+static FILE* statistics_file = 0;
+static clock_t profile_start, profile_end;
+
+static long check_results = 65536;
+
+static long correct_not_in_lm = 0;
+static long lines = 0;
+static long fixable_lines = 0;
+static long some_results = 0;
+static long* results = 0;
+static long* corrects_at = 0;
+static long results_beyond = 0;
+static long corrects_beyond = 0;
+static long no_corrects = 0;
+static long in_language = 0;
+
+static long max_results = 1024;
+
+bool
+print_usage(void)
+{
+ std::cerr <<
+ "\n" <<
+ "Usage: " << PACKAGE_NAME << " [OPTIONS] ZHFST\n" <<
+ "Run a composition of ZHFST and field 1 of standard input and\n" <<
+ "print corrected output on fields 3...\n" <<
+ "\n" <<
+ " -h, --help Print this help message\n" <<
+ " -V, --version Print version information\n" <<
+ " -v, --verbose Be verbose\n" <<
+ " -q, --quiet Don't be verbose (default)\n" <<
+ " -s, --silent Same as quiet\n" <<
+ " -P, --profile=PFILE Save profiling data to PFILE\n" <<
+ " -X, --statistics=SFILE Save statistsics to SFILE\n" <<
+ " -H, --histogram=HFILE Save match numbes to HFILE\n" <<
+ " -n, --n-best=NBEST Collect and provide only NBEST suggestions\n"
+ <<
+ "\n" <<
+ "\n" <<
+ "Report bugs to " << PACKAGE_BUGREPORT << "\n" <<
+ "\n";
+ return true;
+}
+
+bool
+print_version(void)
+{
+ printf(PACKAGE_STRING
+ "\ncopyright (C) 2013 University of Helsinki\n");
+ return true;
+}
+
+bool print_short_help(void)
+{
+ print_usage();
+ return true;
+}
+
+int
+zhfst_spell(char* zhfst_filename)
+ {
+ ZHfstOspeller speller;
+ try
+ {
+ speller.read_zhfst(zhfst_filename);
+ }
+ catch (hfst_ol::ZHfstMetaDataParsingError zhmdpe)
+ {
+ error(EXIT_FAILURE, 0, "error while parsing metadata in %s: %s\n",
+ zhfst_filename, zhmdpe.what());
+ }
+ catch (hfst_ol::ZHfstZipReadingError zhzre)
+ {
+ error(EXIT_FAILURE, 0, "error while unzipping %s: %s\n",
+ zhfst_filename, zhzre.what());
+ }
+ if (verbose)
+ {
+ fprintf(stdout, "Following metadata was read from ZHFST archive:\n%s",
+ speller.metadata_dump().c_str());
+ }
+ char * str = (char*) malloc(2000);
+ char* always_incorrect = strdup("\001\002@ALWAYS INCORRECT@");
+ bool correcting = false;
+ unsigned long linen = 0;
+ if (verbose)
+ {
+ fprintf(stdout, "Reading corrections from <stdin>\n");
+ }
+ else
+ {
+ fprintf(stdout,
+ "Misspelled\tCorrect\tSuggestion 1\tSuggestion 2\t...\n");
+ }
+ while (!std::cin.eof()) {
+ linen++;
+ std::cin.getline(str, 2000);
+ if (str[0] == '\0')
+ {
+ fprintf(stderr, "Skipping empty line at %lu\n", linen);
+ continue;
+ }
+ if (str[strlen(str) - 1] == '\r')
+ {
+ fprintf(stdout, "There is a WINDOWS linebreak in this file\n"
+ "Please convert with dos2unix or fromdos");
+ exit(3);
+ }
+ char* tab = strchr(str, '\t');
+ char* correct = 0;
+ if (tab != NULL)
+ {
+ *tab = '\0';
+ correct = strdup(tab + 1);
+ char* p = correct;
+ while (*p != '\0')
+ {
+ p++;
+ if ((*p == '\n') || (*p == '\t'))
+ {
+ *p = '\0';
+ }
+ }
+ correcting = true;
+ }
+ else
+ {
+ correct = always_incorrect;
+ correcting = false;
+ }
+ if (verbose)
+ {
+ fprintf(stdout, "Checking if %s == %s\n", str, correct);
+ }
+ else
+ {
+ fprintf(stdout, "%s\t%s", str, correct);
+ }
+ lines++;
+ int i = 0;
+ bool any_corrects = false;
+ if (speller.spell(str))
+ {
+ // spelling correct string is as if one suggestion was
+ // made at edit 0 for means of this article;
+ i++;
+ if (strcmp(str, correct) == 0)
+ {
+ corrects_at[i]++;
+ any_corrects = true;
+ }
+ results[i]++;
+ in_language++;
+ if (verbose)
+ {
+ fprintf(stdout, "%s was in the lexicon\n", str);
+ }
+ else
+ {
+ fprintf(stdout, "\t%s", str);
+ }
+ }
+ hfst_ol::CorrectionQueue corrections = speller.suggest(str /*,
+ max_results */);
+ while (corrections.size() > 0)
+ {
+ i++;
+ if (i >= check_results)
+ {
+ break;
+ }
+ if (verbose)
+ {
+ fprintf(stdout, "Trying %s\n",
+ corrections.top().first.c_str());
+ }
+ else
+ {
+ fprintf(stdout, "\t%s",
+ corrections.top().first.c_str());
+ }
+ if (strcmp(corrections.top().first.c_str(), correct) == 0)
+ {
+ if (i >= max_results)
+ {
+ if (verbose)
+ {
+ fprintf(stdout, "%d was correct beyond threshold\n", i);
+ }
+ corrects_beyond++;
+ }
+ else
+ {
+ if (verbose)
+ {
+ fprintf(stdout, "%d was correct\n", i);
+ }
+ corrects_at[i]++;
+ }
+ any_corrects = true;
+ }
+ corrections.pop();
+ } // while corrections
+ if (!any_corrects)
+ {
+ no_corrects++;
+ if (verbose)
+ {
+ fprintf(stdout, "no corrects for %s ?= %s\n", str, correct);
+ }
+ }
+ fprintf(stdout, "\n");
+ if (i >= max_results)
+ {
+ results_beyond++;
+ }
+ else
+ {
+ results[i]++;
+ }
+ if (!speller.spell(correct))
+ {
+ if (verbose)
+ {
+ fprintf(stdout, "could not have been corrected, missing %s\n",
+ correct);
+ }
+ correct_not_in_lm++;
+ }
+ if (correcting)
+ {
+ free(correct);
+ }
+ }
+ return EXIT_SUCCESS;
+
+}
+
+void
+hfst_print_profile_line()
+ {
+ if (profile_file == 0)
+ {
+ return;
+ }
+ if (ftell(profile_file) == 0L)
+ {
+ fprintf(profile_file, "name\tclock\t"
+ "utime\tstime\tmaxrss\tixrss\tidrss\tisrss\t"
+ "minflt\tmajflt\tnswap\tinblock\toublock\t"
+ "msgsnd\tmsgrcv\tnsignals\tnvcsw\tnivcsw\n");
+ }
+
+ fprintf(profile_file, "ospell");
+ profile_end = clock();
+ fprintf(profile_file, "\t%f", ((float)(profile_end - profile_start))
+ / CLOCKS_PER_SEC);
+ struct rusage* usage = static_cast<struct rusage*>
+ (malloc(sizeof(struct rusage)));
+ errno = 0;
+ int rv = getrusage(RUSAGE_SELF, usage);
+ if (rv != -1)
+ {
+ fprintf(profile_file, "\t%lu.%lu\t%lu.%lu"
+ "\t%ld\t%ld\t%ld"
+ "\t%ld"
+ "\t%ld\t%ld\t%ld"
+ "\t%ld\t%ld"
+ "\t%ld\t%ld"
+ "\t%ld"
+ "\t%ld\t%ld",
+ usage->ru_utime.tv_sec, usage->ru_utime.tv_usec,
+ usage->ru_stime.tv_sec, usage->ru_stime.tv_usec,
+ usage->ru_maxrss, usage->ru_ixrss, usage->ru_idrss,
+ usage->ru_isrss,
+ usage->ru_minflt, usage->ru_majflt, usage->ru_nswap,
+ usage->ru_inblock, usage->ru_oublock,
+ usage->ru_msgsnd, usage->ru_msgrcv,
+ usage->ru_nsignals,
+ usage->ru_nvcsw, usage->ru_nivcsw);
+ }
+ else
+ {
+ fprintf(profile_file, "\tgetrusage: %s", strerror(errno));
+ }
+ fprintf(profile_file, "\n");
+ }
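+// Each call above appends one tab-separated row (the header row is written
+// only while the file is still empty), so a profile collected over many runs
+// loads directly into a spreadsheet or an R/pandas data frame.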
+
+void
+print_statistics()
+ {
+ if (NULL == statistics_file)
+ {
+ return;
+ }
+ if (lines == 0)
+ {
+ fprintf(stderr, "DATOISSA VIRHE. END.\n");
+ exit(1);
+ }
+ // calculate stuff
+ unsigned long corrects_rest = 0;
+ unsigned long total_results = 0;
+ for (int i = 6; i < max_results; i++)
+ {
+ corrects_rest += corrects_at[i];
+ }
+ for (int i = 0; i < max_results; i++)
+ {
+ total_results += results[i] * i;
+ }
+ some_results = lines - results[0];
+ fixable_lines = lines - correct_not_in_lm;
+ // print
+ fprintf(statistics_file, "All\tIn LM\tLM+EM\t0s\tImpossible\n");
+ fprintf(statistics_file, "%lu\t%lu\t%lu\t%lu\t%lu\n",
+ lines, in_language, some_results, results[0], correct_not_in_lm);
+ fprintf(statistics_file, "%.2f %%\t%.2f %%\t%.2f %%\t%.2f %%\n",
+ static_cast<float>(lines) / static_cast<float>(lines) * 100.0f,
+ static_cast<float>(in_language) / static_cast<float>(lines) * 100.0f,
+ static_cast<float>(some_results) / static_cast<float>(lines) * 100.0f,
+ static_cast<float>(results[0]) / static_cast<float>(lines) * 100.0f);
+ fprintf(statistics_file, "All\t1sts\t2nds\t3rds\t4ths\t5ths\t"
+ "Rests\tNo corrects\n");
+ fprintf(statistics_file, "%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t"
+ "%lu\t%lu\n",
+ fixable_lines,
+ corrects_at[1], corrects_at[2], corrects_at[3], corrects_at[4],
+ corrects_at[5], corrects_rest, no_corrects - correct_not_in_lm);
+ fprintf(statistics_file, "%.2f %%\t"
+ "%.2f %%\t%.2f %%\t%.2f %%\t%.2f %%\t"
+ "%.2f %%\t"
+ "%.2f %%\t"
+ "%.2f %%\n",
+ static_cast<float>(fixable_lines) / static_cast<float>(fixable_lines) * 100.0f,
+ static_cast<float>(corrects_at[1]) / static_cast<float>(fixable_lines) * 100.0f,
+ static_cast<float>(corrects_at[2]) / static_cast<float>(fixable_lines) * 100.0f,
+ static_cast<float>(corrects_at[3]) / static_cast<float>(fixable_lines) * 100.0f,
+ static_cast<float>(corrects_at[4]) / static_cast<float>(fixable_lines) * 100.0f,
+ static_cast<float>(corrects_at[5]) / static_cast<float>(fixable_lines) * 100.0f,
+ static_cast<float>(corrects_rest) / static_cast<float>(fixable_lines) * 100.0f,
+ static_cast<float>(no_corrects - correct_not_in_lm) / static_cast<float>(fixable_lines) * 100.0f);
+ if (histogram_file == NULL)
+ {
+ return;
+ }
+ fprintf(histogram_file, "Result count\tfrequency\n");
+ for (int i = 0; i < max_results; i++)
+ {
+ fprintf(histogram_file, "%u\t%lu\n", i, results[i]);
+ }
+ }
+
+int main(int argc, char **argv)
+ {
+ results = new long[max_results](); // value-initialise to zero
+ corrects_at = new long[max_results]();
+
+ int c;
+ //std::locale::global(std::locale(""));
+
+#if HAVE_GETOPT_H
+ while (true) {
+ static struct option long_options[] =
+ {
+ // first the hfst-mandated options
+ {"help", no_argument, 0, 'h'},
+ {"version", no_argument, 0, 'V'},
+ {"verbose", no_argument, 0, 'v'},
+ {"quiet", no_argument, 0, 'q'},
+ {"silent", no_argument, 0, 's'},
+ {"profile", required_argument, 0, 'P'},
+ {"statistics", required_argument, 0, 'X'},
+ {"histogram", required_argument, 0, 'H'},
+ {"n-best", required_argument, 0, 'n'},
+ {0, 0, 0, 0 }
+ };
+
+ int option_index = 0;
+ c = getopt_long(argc, argv, "hVvqsP:X:H:n:", long_options, &option_index);
+
+ if (c == -1) // no more options to look at
+ break;
+
+ switch (c) {
+ case 'h':
+ print_usage();
+ return EXIT_SUCCESS;
+ break;
+
+ case 'V':
+ print_version();
+ return EXIT_SUCCESS;
+ break;
+
+ case 'v':
+ verbose = true;
+ quiet = false;
+ break;
+
+ case 'P':
+ profile_file = fopen(optarg, "a");
+ if (NULL == profile_file)
+ {
+ perror("Couldn't open profiling file for append");
+ }
+ profile_start = clock();
+ break;
+ case 'X':
+ statistics_file = fopen(optarg, "w");
+ if (NULL == statistics_file)
+ {
+ perror("Couldn't open statistic file for writing");
+ }
+ break;
+ case 'H':
+ histogram_file = fopen(optarg, "w");
+ if (NULL == histogram_file)
+ {
+ perror("Couldn't open histogram file for wriitng");
+ }
+ break;
+ case 'q': // fallthrough
+ case 's':
+ quiet = true;
+ verbose = false;
+ break;
+ case 'n':
+ char* endptr;
+ max_results = strtoul(optarg, &endptr, 10);
+ if (*endptr != '\0')
+ {
+ fprintf(stderr, "argument not valid for n-best\n");
+ }
+ break;
+ default:
+ std::cerr << "Invalid option\n\n";
+ print_short_help();
+ return EXIT_FAILURE;
+ break;
+ }
+ }
+#else
+ int optind = 1;
+#endif
+ // no more options, we should now be at the input filenames
+ if (optind == (argc - 1))
+ {
+ zhfst_spell(argv[optind]);
+ }
+ else if (optind < (argc - 1))
+ {
+ fprintf(stderr, "No more than one free parameter allowed\n");
+ print_short_help();
+ return EXIT_FAILURE;
+ }
+ else if (optind >= argc)
+ {
+ fprintf(stderr ,"Give full path to zhfst spellers\n");
+ print_short_help();
+ return EXIT_FAILURE;
+ }
+ hfst_print_profile_line();
+ print_statistics();
+ return EXIT_SUCCESS;
+ }
diff --git a/main-norvig.cc b/main-norvig.cc
new file mode 100644
index 0000000..7b46ef8
--- /dev/null
+++ b/main-norvig.cc
@@ -0,0 +1,263 @@
+/*
+
+ Copyright 2009 University of Helsinki
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+*/
+
+/*
+ This is a toy commandline utility for testing spellers on standard io.
+ */
+
+#include "ospell.h"
+#include <getopt.h>
+#include <cassert>
+#include <math.h>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+#include <iostream>
+
+#define PACKAGE_NAME "hfst-ospell"
+#define PACKAGE_STRING "hfst-ospell 0.1"
+#define PACKAGE_BUGREPORT "hfst-bugs@ling.helsinki.fi"
+
+bool print_usage(void)
+{
+ std::cerr <<
+ "\n" <<
+ "Usage: " << PACKAGE_NAME << " [OPTIONS] ERRORSOURCE LEXICON\n" <<
+ "Run a composition of ERRORSOURCE and LEXICON on standard input and\n" <<
+ "print corrected output\n" <<
+ "\n" <<
+ " -h, --help Print this help message\n" <<
+ " -V, --version Print version information\n" <<
+ " -v, --verbose Be verbose\n" <<
+ " -q, --quiet Don't be verbose (default)\n" <<
+ " -s, --silent Same as quiet\n" <<
+ "\n" <<
+ "\n" <<
+ "Report bugs to " << PACKAGE_BUGREPORT << "\n" <<
+ "\n";
+ return true;
+}
+
+bool print_version(void)
+{
+ std::cerr <<
+ "\n" <<
+ PACKAGE_STRING << std::endl <<
+ __DATE__ << " " __TIME__ << std::endl <<
+ "copyright (C) 2009 University of Helsinki\n";
+ return true;
+}
+
+bool print_short_help(void)
+{
+ print_usage();
+ return true;
+}
+
+int main(int argc, char **argv)
+{
+
+ FILE * mutator_file = NULL;
+ FILE * lexicon_file = NULL;
+
+ int c;
+ bool verbose = false;
+
+ while (true)
+ {
+ static struct option long_options[] =
+ {
+ // first the hfst-mandated options
+ {"help", no_argument, 0, 'h'},
+ {"version", no_argument, 0, 'V'},
+ {"verbose", no_argument, 0, 'v'},
+ {"quiet", no_argument, 0, 'q'},
+ {"silent", no_argument, 0, 's'},
+ {0, 0, 0, 0 }
+ };
+
+ int option_index = 0;
+ c = getopt_long(argc, argv, "hVvqs", long_options, &option_index);
+
+ if (c == -1) // no more options to look at
+ break;
+
+ switch (c) {
+ case 'h':
+ print_usage();
+ return EXIT_SUCCESS;
+ break;
+
+ case 'V':
+ print_version();
+ return EXIT_SUCCESS;
+ break;
+
+ case 'v':
+ verbose = true;
+ break;
+
+ case 'q': // fallthrough
+ case 's':
+ break;
+
+ default:
+ std::cerr << "Invalid option\n\n";
+ print_short_help();
+ return EXIT_FAILURE;
+ break;
+ }
+ }
+ // no more options, we should now be at the input filenames
+ if ( (optind + 2) < argc) {
+ std::cerr << "More than two input files given\n";
+ return EXIT_FAILURE;
+ } else if ( (optind + 2) > argc)
+ {
+ std::cerr << "Need two input files\n";
+ return EXIT_FAILURE;
+ } else {
+ mutator_file = fopen(argv[(optind)], "r");
+ if (mutator_file == NULL) {
+ std::cerr << "Could not open file " << argv[(optind)]
+ << std::endl;
+ return 1;
+ }
+ lexicon_file = fopen(argv[(optind + 1)], "r");
+ if (lexicon_file == NULL) {
+ std::cerr << "Could not open file " << argv[(optind + 1)]
+ << std::endl;
+ return 1;
+ }
+ }
+ hfst_ol::Transducer * mutator;
+ hfst_ol::Transducer * lexicon;
+ mutator = new hfst_ol::Transducer(mutator_file);
+ if (!mutator->is_weighted()) {
+ std::cerr << "Error source was unweighted, exiting\n\n";
+ return EXIT_FAILURE;
+ }
+ lexicon = new hfst_ol::Transducer(lexicon_file);
+ if (!lexicon->is_weighted()) {
+ std::cerr << "Lexicon was unweighted, exiting\n\n";
+ return EXIT_FAILURE;
+ }
+
+ hfst_ol::Speller * speller;
+
+ try {
+ speller = new hfst_ol::Speller(mutator, lexicon);
+ } catch (hfst_ol::AlphabetTranslationException& e) {
+ std::cerr <<
+ "Unable to build speller - symbol " << e.what() << " not "
+ "present in lexicon's alphabet\n";
+ return EXIT_FAILURE;
+ }
+
+ char * str = (char*) malloc(2000);
+ // def spelltest(tests, bias=None, verbose=False):
+ // n, bad, unknown, start = 0, 0, 0, time.clock()
+ unsigned long n = 0;
+ unsigned long bad = 0;
+ unsigned long unknown = 0;
+ clock_t start = clock();
+ // if bias:
+ // for target in tests: NWORDS[target] += bias
+ // for target,wrongs in tests.items():
+ // for wrong in wrongs.split():
+ while (!std::cin.eof()) {
+ std::cin.getline(str, 2000);
+ if (str[0] == '\0') {
+ continue;
+ }
+ // n += 1
+ n++;
+ char* p = strdup(str);
+ char* tok = strtok(p, "\t");
+ assert(tok != NULL);
+ char* mispelt = strdup(tok);
+ tok = strtok(NULL, "\t");
+ assert(tok != NULL);
+ //w = correct(wrong)
+ char* corr = strdup(tok);
+ // unknown += (corr in NWORDS)
+ if (!speller->check(corr))
+ {
+ unknown++;
+ }
+ if (speller->check(mispelt))
+ {
+ // real word spelling error
+ bad++;
+ if (verbose)
+ {
+ fprintf(stdout, "correct(%s) => %s; expected %s\n",
+ mispelt, mispelt, corr);
+ }
+ }
+ else
+ {
+
+ hfst_ol::CorrectionQueue corrections = speller->correct(mispelt);
+ if (corrections.size() == 0)
+ {
+ bad++;
+ if (verbose)
+ {
+ fprintf(stdout, "correct(%s) => %s; expected %s\n",
+ mispelt, mispelt, corr);
+ }
+ }
+ else
+ {
+ std::string first = corrections.top().first;
+ //if w!=target:
+ if (first != corr)
+ {
+ //bad += 1
+ bad++;
+ // if verbose:
+ // print 'correct(%r) => %r (%d); expected %r (%d)' % (
+ // wrong, w, NWORDS[w], target, NWORDS[target])
+ if (verbose)
+ {
+ fprintf(stdout, "correct(%s) => %s; "
+ "expected %s\n",
+ mispelt, first.c_str(),
+ corr);
+ }
+ } // first != corr
+ else
+ {
+ if (verbose)
+ {
+ fprintf(stdout, "correct(%s) => %s "
+ "as expected %s\n",
+ mispelt, first.c_str(),
+ corr);
+ }
+ }
+ } // corrections size != 0
+ } // word not in dictionary
+ }
+ //return dict(bad=bad, n=n, bias=bias, pct=int(100. - 100.*bad/n),
+ // unknown=unknown, secs=int(time.clock()-start) )
+ int pct = (int)round(100.0f - 100.0f*(float)bad/(float)n);
+ float secs = (((float)clock()-(float)start)/(float)CLOCKS_PER_SEC);
+ fprintf(stdout,
+ "{'bad': %lu, 'bias': None, 'unknown': %lu, "
+ "'secs': %f, 'pct': %d, 'n': %lu}\n",
+ bad, unknown, secs, pct, n);
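+ // The summary line mimics the dict returned by Norvig's Python
+ // spelltest, e.g. {'bad': 68, 'bias': None, 'unknown': 15,
+ // 'secs': 3.2, 'pct': 74, 'n': 266} (numbers illustrative).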
+ return EXIT_SUCCESS;
+}
diff --git a/main-survey.cc b/main-survey.cc
new file mode 100644
index 0000000..d9426a5
--- /dev/null
+++ b/main-survey.cc
@@ -0,0 +1,523 @@
+/*
+
+ Copyright 2009 University of Helsinki
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+*/
+
+/*
+ This is a toy commandline utility for testing spellers on standard io.
+ */
+
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif
+#if HAVE_GETOPT_H
+# include <getopt.h>
+#endif
+
+#include <time.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <errno.h>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+#include "ol-exceptions.h"
+#include "ospell.h"
+#include "ZHfstOspeller.h"
+
+using hfst_ol::ZHfstOspeller;
+using hfst_ol::Transducer;
+
+static bool quiet = false;
+static bool verbose = false;
+
+static FILE* profile_file = 0;
+static FILE* histogram_file = 0;
+static FILE* statistics_file = 0;
+static clock_t profile_start, profile_end;
+
+static long check_results = 65536;
+
+static long correct_not_in_lm = 0;
+static long lines = 0;
+static long fixable_lines = 0;
+static long some_results = 0;
+static long* results = 0;
+static long* corrects_at = 0;
+static long results_beyond = 0;
+static long corrects_beyond = 0;
+static long no_corrects = 0;
+static long in_language = 0;
+
+static long max_results = 1024;
+
+bool
+print_usage(void)
+{
+ std::cerr <<
+ "\n" <<
+ "Usage: " << PACKAGE_NAME << " [OPTIONS] ERRORSOURCE LEXICON\n" <<
+ "Run a composition of ERRORSOURCE and LEXICON on standard input and\n" <<
+ "print corrected output\n" <<
+ "\n" <<
+ " -h, --help Print this help message\n" <<
+ " -V, --version Print version information\n" <<
+ " -v, --verbose Be verbose\n" <<
+ " -q, --quiet Don't be verbose (default)\n" <<
+ " -s, --silent Same as quiet\n" <<
+ " -P, --profile=PFILE Save profiling data to PFILE\n" <<
+ " -X, --statistics=SFILE Save statistsics to SFILE\n" <<
+ " -H, --histogram=HFILE Save match numbes to HFILE\n" <<
+ " -n, --n-best=NBEST Collect and provide only NBEST suggestions\n"
+ <<
+ "\n" <<
+ "\n" <<
+ "Report bugs to " << PACKAGE_BUGREPORT << "\n" <<
+ "\n";
+ return true;
+}
+
+bool
+print_version(void)
+{
+ std::cerr <<
+ "\n" <<
+ PACKAGE_STRING << std::endl <<
+ __DATE__ << " " __TIME__ << std::endl <<
+ "copyright (C) 2009 - 2011 University of Helsinki\n";
+ return true;
+}
+
+bool print_short_help(void)
+{
+ print_usage();
+ return true;
+}
+
+int
+legacy_spell(const char* errmodel_filename, const char* acceptor_filename)
+{
+ FILE* mutator_file = fopen(errmodel_filename, "r");
+ if (mutator_file == NULL) {
+ std::cerr << "Could not open file " << errmodel_filename
+ << std::endl;
+ return EXIT_FAILURE;
+ }
+ FILE* lexicon_file = fopen(acceptor_filename, "r");
+ if (lexicon_file == NULL) {
+ std::cerr << "Could not open file " << acceptor_filename
+ << std::endl;
+ return EXIT_FAILURE;
+ }
+ hfst_ol::Transducer * mutator;
+ hfst_ol::Transducer * lexicon;
+ mutator = new hfst_ol::Transducer(mutator_file);
+ if (!mutator->is_weighted()) {
+ std::cerr << "Error source was unweighted, exiting\n\n";
+ return EXIT_FAILURE;
+ }
+ lexicon = new hfst_ol::Transducer(lexicon_file);
+ if (!lexicon->is_weighted()) {
+ std::cerr << "Lexicon was unweighted, exiting\n\n";
+ return EXIT_FAILURE;
+ }
+
+ hfst_ol::Speller * speller;
+
+ try {
+ speller = new hfst_ol::Speller(mutator, lexicon);
+ } catch (hfst_ol::AlphabetTranslationException& e) {
+ std::cerr <<
+ "Unable to build speller - symbol " << e.what() << " not "
+ "present in lexicon's alphabet\n";
+ return EXIT_FAILURE;
+ }
+ char * str = (char*) malloc(2000);
+
+ char* always_incorrect = strdup("\001\002@ALWAYS INCORRECT@");
+ bool correcting = false;
+ while (!std::cin.eof()) {
+ std::cin.getline(str, 2000);
+ if (str[0] == '\0')
+ {
+ std::cerr << "Skipping empty lines" << std::endl;
+ continue;
+ }
+ if (str[strlen(str) - 1] == '\r')
+ {
+ std::cerr << "There is a WINDOWS linebreak in this file" <<
+ std::endl <<
+ "Please convert with dos2unix or fromdos" << std::endl;
+ exit(1);
+ }
+ char* tab = strchr(str, '\t');
+ char* correct = 0;
+ if (tab != NULL)
+ {
+ *tab = '\0';
+ correct = strdup(tab + 1);
+ char* p = correct;
+ while (*p != '\0')
+ {
+ p++;
+ if (*p == '\n')
+ {
+ *p = '\0';
+ }
+ }
+ correcting = true;
+ }
+ else
+ {
+ correct = always_incorrect;
+ correcting = false;
+ }
+ if (verbose)
+ {
+ fprintf(stdout, "Checking if %s == %s\n", str, correct);
+ }
+ else
+ {
+ fprintf(stdout, "%s", str);
+ }
+ lines++;
+ int i = 0;
+ bool any_corrects = false;
+ if (speller->check(str))
+ {
+ // a correctly spelled string is treated as if one suggestion was
+ // made at edit distance 0 for the purposes of this article;
+ i++;
+ if (strcmp(str, correct) == 0)
+ {
+ corrects_at[i]++;
+ any_corrects = true;
+ }
+ results[i]++;
+ in_language++;
+ if (verbose)
+ {
+ fprintf(stdout, "%s was in the lexicon\n", str);
+ }
+ else
+ {
+ fprintf(stdout, "\t%s\n", str);
+ }
+ }
+ hfst_ol::CorrectionQueue corrections = speller->correct(str /*,
+ max_results */);
+ while (corrections.size() > 0)
+ {
+ i++;
+ if (i >= check_results)
+ {
+ break;
+ }
+ if (verbose)
+ {
+ fprintf(stdout, "Trying %s\n",
+ corrections.top().first.c_str());
+ }
+ else
+ {
+ fprintf(stdout, "\t%s",
+ corrections.top().first.c_str());
+ }
+ if (strcmp(corrections.top().first.c_str(), correct) == 0)
+ {
+ if (i >= max_results)
+ {
+ if (verbose)
+ {
+ fprintf(stdout, "%d was correct beyond threshold\n", i);
+ }
+ corrects_beyond++;
+ }
+ else
+ {
+ if (verbose)
+ {
+ fprintf(stdout, "%d was correct\n", i);
+ }
+ corrects_at[i]++;
+ }
+ any_corrects = true;
+ }
+ corrections.pop();
+ }
+ if (!any_corrects)
+ {
+ no_corrects++;
+ if (verbose)
+ {
+ fprintf(stdout, "no corrects for %s ?= %s\n", str, correct);
+ }
+ }
+ fprintf(stdout, "\n");
+ if (i >= max_results)
+ {
+ results_beyond++;
+ }
+ else
+ {
+ results[i]++;
+ }
+ if (!speller->check(correct))
+ {
+ if (verbose)
+ {
+ fprintf(stdout, "could not have been corrected, missing %s\n",
+ correct);
+ }
+ correct_not_in_lm++;
+ }
+ if (correcting)
+ {
+ free(correct);
+ }
+ }
+ return EXIT_SUCCESS;
+
+}
+
+void
+hfst_print_profile_line()
+ {
+ if (profile_file == 0)
+ {
+ return;
+ }
+ if (ftell(profile_file) == 0L)
+ {
+ fprintf(profile_file, "name\tclock\t"
+ "utime\tstime\tmaxrss\tixrss\tidrss\tisrss\t"
+ "minflt\tmajflt\tnswap\tinblock\toublock\t"
+ "msgsnd\tmsgrcv\tnsignals\tnvcsw\tnivcsw\n");
+ }
+
+ fprintf(profile_file, "ospell");
+ profile_end = clock();
+ fprintf(profile_file, "\t%f", ((float)(profile_end - profile_start))
+ / CLOCKS_PER_SEC);
+ struct rusage usage_data; // use the stack; the malloc'd buffer was never freed
+ struct rusage* usage = &usage_data;
+ errno = 0;
+ int rv = getrusage(RUSAGE_SELF, usage);
+ if (rv != -1)
+ {
+ fprintf(profile_file, "\t%ld.%06ld\t%ld.%06ld"
+ "\t%ld\t%ld\t%ld"
+ "\t%ld"
+ "\t%ld\t%ld\t%ld"
+ "\t%ld\t%ld"
+ "\t%ld\t%ld"
+ "\t%ld"
+ "\t%ld\t%ld",
+ usage->ru_utime.tv_sec, usage->ru_utime.tv_usec,
+ usage->ru_stime.tv_sec, usage->ru_stime.tv_usec,
+ usage->ru_maxrss, usage->ru_ixrss, usage->ru_idrss,
+ usage->ru_isrss,
+ usage->ru_minflt, usage->ru_majflt, usage->ru_nswap,
+ usage->ru_inblock, usage->ru_oublock,
+ usage->ru_msgsnd, usage->ru_msgrcv,
+ usage->ru_nsignals,
+ usage->ru_nvcsw, usage->ru_nivcsw);
+ }
+ else
+ {
+ fprintf(profile_file, "\tgetrusage: %s", strerror(errno));
+ }
+ fprintf(profile_file, "\n");
+ }
+
+void
+print_statistics()
+ {
+ if (NULL == statistics_file)
+ {
+ return;
+ }
+ if (lines == 0)
+ {
+ fprintf(stderr, "DATOISSA VIRHE. END.\n");
+ exit(1);
+ }
+ // calculate stuff
+ unsigned long corrects_rest = 0;
+ unsigned long total_results = 0;
+ for (int i = 6; i < max_results; i++)
+ {
+ corrects_rest += corrects_at[i];
+ }
+ for (int i = 0; i < max_results; i++)
+ {
+ total_results += results[i] * i;
+ }
+ some_results = lines - results[0];
+ fixable_lines = lines - correct_not_in_lm;
+ // print
+ fprintf(statistics_file, "All\tIn LM\tLM+EM\t0s\tImpossible\n");
+ fprintf(statistics_file, "%lu\t%lu\t%lu\t%lu\t%lu\n",
+ lines, in_language, some_results, results[0], correct_not_in_lm);
+ fprintf(statistics_file, "%.2f %%\t%.2f %%\t%.2f %%\t%.2f %%\n",
+ static_cast<float>(lines) / static_cast<float>(lines) * 100.0f,
+ static_cast<float>(in_language) / static_cast<float>(lines) * 100.0f,
+ static_cast<float>(some_results) / static_cast<float>(lines) * 100.0f,
+ static_cast<float>(results[0]) / static_cast<float>(lines) * 100.0f);
+ fprintf(statistics_file, "All\t1sts\t2nds\t3rds\t4ths\t5ths\t"
+ "Rests\tNo corrects\n");
+ fprintf(statistics_file, "%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t"
+ "%lu\t%lu\n",
+ fixable_lines,
+ corrects_at[1], corrects_at[2], corrects_at[3], corrects_at[4],
+ corrects_at[5], corrects_rest, no_corrects - correct_not_in_lm);
+ fprintf(statistics_file, "%.2f %%\t"
+ "%.2f %%\t%.2f %%\t%.2f %%\t%.2f %%\t"
+ "%.2f %%\t"
+ "%.2f %%\t"
+ "%.2f %%\n",
+ static_cast<float>(fixable_lines) / static_cast<float>(fixable_lines) * 100.0f,
+ static_cast<float>(corrects_at[1]) / static_cast<float>(fixable_lines) * 100.0f,
+ static_cast<float>(corrects_at[2]) / static_cast<float>(fixable_lines) * 100.0f,
+ static_cast<float>(corrects_at[3]) / static_cast<float>(fixable_lines) * 100.0f,
+ static_cast<float>(corrects_at[4]) / static_cast<float>(fixable_lines) * 100.0f,
+ static_cast<float>(corrects_at[5]) / static_cast<float>(fixable_lines) * 100.0f,
+ static_cast<float>(corrects_rest) / static_cast<float>(fixable_lines) * 100.0f,
+ static_cast<float>(no_corrects - correct_not_in_lm) / static_cast<float>(fixable_lines) * 100.0f);
+ if (histogram_file == NULL)
+ {
+ return;
+ }
+ fprintf(histogram_file, "Result count\tfrequency\n");
+ for (int i = 0; i < max_results; i++)
+ {
+ fprintf(histogram_file, "%u\t%lu\n", i, results[i]);
+ }
+ }
+
+int main(int argc, char **argv)
+ {
+ results = new long[max_results](); // value-initialise to zero
+ corrects_at = new long[max_results]();
+
+ int c;
+ //std::locale::global(std::locale(""));
+
+#if HAVE_GETOPT_H
+ while (true) {
+ static struct option long_options[] =
+ {
+ // first the hfst-mandated options
+ {"help", no_argument, 0, 'h'},
+ {"version", no_argument, 0, 'V'},
+ {"verbose", no_argument, 0, 'v'},
+ {"quiet", no_argument, 0, 'q'},
+ {"silent", no_argument, 0, 's'},
+ {"profile", required_argument, 0, 'P'},
+ {"statistics", required_argument, 0, 'X'},
+ {"histogram", required_argument, 0, 'H'},
+ {"n-best", required_argument, 0, 'n'},
+ {0, 0, 0, 0 }
+ };
+
+ int option_index = 0;
+ c = getopt_long(argc, argv, "hVvqsP:X:H:n:", long_options, &option_index);
+
+ if (c == -1) // no more options to look at
+ break;
+
+ switch (c) {
+ case 'h':
+ print_usage();
+ return EXIT_SUCCESS;
+ break;
+
+ case 'V':
+ print_version();
+ return EXIT_SUCCESS;
+ break;
+
+ case 'v':
+ verbose = true;
+ quiet = false;
+ break;
+
+ case 'P':
+ profile_file = fopen(optarg, "a");
+ if (NULL == profile_file)
+ {
+ perror("Couldn't open profiling file for append");
+ }
+ profile_start = clock();
+ break;
+ case 'X':
+ statistics_file = fopen(optarg, "w");
+ if (NULL == statistics_file)
+ {
+ perror("Couldn't open statistic file for writing");
+ }
+ break;
+ case 'H':
+ histogram_file = fopen(optarg, "w");
+ if (NULL == histogram_file)
+ {
+ perror("Couldn't open histogram file for wriitng");
+ }
+ break;
+ case 'q': // fallthrough
+ case 's':
+ quiet = true;
+ verbose = false;
+ break;
+ case 'n':
+ char* endptr;
+ max_results = strtoul(optarg, &endptr, 10);
+ if (*endptr != '\0')
+ {
+ fprintf(stderr, "argument not valid for n-best\n");
+ }
+ break;
+ default:
+ std::cerr << "Invalid option\n\n";
+ print_short_help();
+ return EXIT_FAILURE;
+ break;
+ }
+ }
+#else
+ int optind = 1;
+#endif
+ // no more options, we should now be at the input filenames
+ if (optind == (argc - 2))
+ {
+ legacy_spell(argv[optind], argv[optind+1]);
+ }
+ else if (optind < (argc - 2))
+ {
+ std::cerr << "No more than two free parameters allowed" << std::endl;
+ print_short_help();
+ return EXIT_FAILURE;
+ }
+ else if (optind >= argc)
+ {
+ std::cerr << "Give full path to zhfst spellers or two automata"
+ << std::endl;
+ print_short_help();
+ return EXIT_FAILURE;
+ }
+ hfst_print_profile_line();
+ print_statistics();
+ return EXIT_SUCCESS;
+ }
diff --git a/main.cc b/main.cc
new file mode 100644
index 0000000..438d4e2
--- /dev/null
+++ b/main.cc
@@ -0,0 +1,610 @@
+/*
+
+ Copyright 2009 University of Helsinki
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+*/
+
+/*
+ This is a toy commandline utility for testing spellers on standard io.
+ */
+
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif
+#if HAVE_GETOPT_H
+# include <getopt.h>
+#endif
+
+#ifdef WINDOWS
+# include <windows.h>
+#endif
+
+#include <cstdarg>
+#include <cstring>
+#include <cstdlib>
+#include <stdio.h>
+#include <errno.h>
+
+#include "ol-exceptions.h"
+#include "ospell.h"
+#include "ZHfstOspeller.h"
+
+using hfst_ol::ZHfstOspeller;
+using hfst_ol::Transducer;
+
+static bool quiet = false;
+static bool verbose = false;
+static bool analyse = false;
+static unsigned long suggs = 0;
+static hfst_ol::Weight max_weight = -1.0;
+static hfst_ol::Weight beam = -1.0;
+static float time_cutoff = 0.0;
+static std::string error_model_filename = "";
+static std::string lexicon_filename = "";
+#ifdef WINDOWS
+ static bool output_to_console = false;
+#endif
+static bool suggest = false;
+static bool suggest_reals = false;
+
+#ifdef WINDOWS
+static std::string wide_string_to_string(const std::wstring & wstr)
+{
+ int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL);
+ std::string str( size_needed, 0 );
+ WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &str[0], size_needed, NULL, NULL);
+ return str;
+}
+#endif
+
+static int hfst_fprintf(FILE * stream, const char * format, ...)
+{
+ va_list args;
+ va_start(args, format);
+#ifdef WINDOWS
+ if (output_to_console && (stream == stdout || stream == stderr))
+ {
+ char buffer [1024];
+ int r = vsnprintf(buffer, sizeof buffer, format, args);
+ va_end(args);
+ if (r < 0)
+ return r;
+ HANDLE stdHandle = GetStdHandle(STD_OUTPUT_HANDLE);
+ if (stream == stderr)
+ stdHandle = GetStdHandle(STD_ERROR_HANDLE);
+
+ std::string pstr(buffer);
+ DWORD numWritten = 0;
+ int wchars_num =
+ MultiByteToWideChar(CP_UTF8 , 0 , pstr.c_str() , -1, NULL , 0 );
+ wchar_t* wstr = new wchar_t[wchars_num];
+ MultiByteToWideChar(CP_UTF8 , 0 ,
+ pstr.c_str() , -1, wstr , wchars_num );
+ int retval = WriteConsoleW(stdHandle, wstr, wchars_num-1, &numWritten, NULL);
+ delete[] wstr;
+
+ return retval;
+ }
+ else
+ {
+ int retval = vfprintf(stream, format, args);
+ va_end(args);
+ return retval;
+ }
+#else
+ errno = 0;
+ int retval = vfprintf(stream, format, args);
+ if (retval < 0)
+ {
+ perror("hfst_fprintf");
+ }
+ va_end(args);
+ return retval;
+#endif
+}
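+// (Rationale: on Windows consoles plain stdio mangles UTF-8 output, so when
+// --output-to-console is in effect the wrapper above re-encodes to UTF-16
+// and writes via WriteConsoleW; on other platforms it is just vfprintf with
+// errno checking.)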
+
+
+bool print_usage(void)
+{
+ std::cout <<
+ "\n" <<
+ "Usage: " << PACKAGE_NAME << " [OPTIONS] [ZHFST-ARCHIVE]\n" <<
+ "Use automata in ZHFST-ARCHIVE or from OPTIONS to check and correct\n"
+ "\n" <<
+ " -h, --help Print this help message\n" <<
+ " -V, --version Print version information\n" <<
+ " -v, --verbose Be verbose\n" <<
+ " -q, --quiet Don't be verbose (default)\n" <<
+ " -s, --silent Same as quiet\n" <<
+ " -a, --analyse Analyse strings and corrections\n" <<
+ " -n, --limit=N Show at most N suggestions\n" <<
+ " -w, --max-weight=W Suppress corrections with weights above W\n" <<
+ " -b, --beam=W Suppress corrections worse than best candidate by more than W\n" <<
+ " -t, --time-cutoff=T Stop trying to find better corrections after T seconds (T is a float)\n" <<
+ " -S, --suggest Suggest corrections to mispellings\n" <<
+ " -X, --real-word Also suggest corrections to correct words\n" <<
+ " -m, --error-model Use this error model (must also give lexicon as option)\n" <<
+ " -l, --lexicon Use this lexicon (must also give erro model as option)\n" <<
+#ifdef WINDOWS
+ " -k, --output-to-console Print output to console (Windows-specific)" <<
+#endif
+ "\n" <<
+ "\n" <<
+ "Report bugs to " << PACKAGE_BUGREPORT << "\n" <<
+ "\n";
+ return true;
+}
+
+bool print_version(void)
+{
+ std::cout <<
+ "\n" <<
+ PACKAGE_STRING << std::endl <<
+ __DATE__ << " " __TIME__ << std::endl <<
+ "copyright (C) 2009 - 2014 University of Helsinki\n";
+ return true;
+}
+
+bool print_short_help(void)
+{
+ print_usage();
+ return true;
+}
+
+void
+do_suggest(ZHfstOspeller& speller, const std::string& str)
+ {
+ hfst_ol::CorrectionQueue corrections = speller.suggest(str);
+ if (corrections.size() > 0)
+ {
+ hfst_fprintf(stdout, "Corrections for \"%s\":\n", str.c_str());
+ while (corrections.size() > 0)
+ {
+ const std::string& corr = corrections.top().first;
+ if (analyse)
+ {
+ hfst_ol::AnalysisQueue anals = speller.analyse(corr, true);
+ bool all_discarded = true;
+ while (anals.size() > 0)
+ {
+ if (anals.top().first.find("Use/SpellNoSugg") !=
+ std::string::npos)
+ {
+ hfst_fprintf(stdout, "%s %f %s "
+ "[DISCARDED BY ANALYSES]\n",
+ corr.c_str(), corrections.top().second,
+ anals.top().first.c_str());
+ }
+ else
+ {
+ all_discarded = false;
+ hfst_fprintf(stdout, "%s %f %s\n",
+ corr.c_str(), corrections.top().second,
+ anals.top().first.c_str());
+ }
+ anals.pop();
+ }
+ if (all_discarded)
+ {
+ hfst_fprintf(stdout, "All corrections were "
+ "invalidated by analysis! "
+ "No score!\n");
+ }
+ }
+ else
+ {
+ hfst_fprintf(stdout, "%s %f\n",
+ corr.c_str(),
+ corrections.top().second);
+ }
+ corrections.pop();
+ }
+ hfst_fprintf(stdout, "\n");
+ }
+ else
+ {
+ hfst_fprintf(stdout,
+ "Unable to correct \"%s\"!\n\n", str.c_str());
+ }
+
+ }
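+// A sketch of the output format produced above (token and weights are
+// illustrative, not from any real speller):
+//
+// Corrections for "spelin":
+// spelling 14.199200
+// spelin 17.599200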
+
+void
+do_spell(ZHfstOspeller& speller, const std::string& str)
+ {
+ if (speller.spell(str))
+ {
+ hfst_fprintf(stdout, "\"%s\" is in the lexicon...\n",
+ str.c_str());
+ if (analyse)
+ {
+ hfst_fprintf(stdout, "analysing:\n");
+ hfst_ol::AnalysisQueue anals = speller.analyse(str, false);
+ bool all_no_spell = true;
+ while (anals.size() > 0)
+ {
+ if (anals.top().first.find("Use/-Spell") != std::string::npos)
+ {
+ hfst_fprintf(stdout,
+ "%s %f [DISCARDED AS -Spell]\n",
+ anals.top().first.c_str(),
+ anals.top().second);
+ }
+ else
+ {
+ all_no_spell = false;
+ hfst_fprintf(stdout, "%s %f\n",
+ anals.top().first.c_str(),
+ anals.top().second);
+ }
+ anals.pop();
+ }
+ if (all_no_spell)
+ {
+ hfst_fprintf(stdout,
+ "All spellings were invalidated by analysis! "
+ ".:. Not in lexicon!\n");
+ }
+ }
+ if (suggest_reals)
+ {
+ hfst_fprintf(stdout, "(but correcting anyways)\n", str.c_str());
+ do_suggest(speller, str);
+ }
+ }
+ else
+ {
+ hfst_fprintf(stdout, "\"%s\" is NOT in the lexicon:\n",
+ str.c_str());
+ if (suggest)
+ {
+ do_suggest(speller, str);
+ }
+ }
+ }
+
+int
+zhfst_spell(char* zhfst_filename)
+{
+ ZHfstOspeller speller;
+ try
+ {
+ speller.read_zhfst(zhfst_filename);
+ }
+ catch (hfst_ol::ZHfstMetaDataParsingError& zhmdpe)
+ {
+ hfst_fprintf(stderr, "cannot finish reading zhfst archive %s:\n%s.\n",
+ zhfst_filename, zhmdpe.what());
+ //std::cerr << "cannot finish reading zhfst archive " << zhfst_filename <<
+ // ":\n" << zhmdpe.what() << "." << std::endl;
+ return EXIT_FAILURE;
+ }
+ catch (hfst_ol::ZHfstZipReadingError& zhzre)
+ {
+ //std::cerr << "cannot read zhfst archive " << zhfst_filename << ":\n"
+ // << zhzre.what() << "." << std::endl
+ // << "trying to read as legacy automata directory" << std::endl;
+ hfst_fprintf(stderr,
+ "cannot read zhfst archive %s:\n"
+ "%s.\n",
+ zhfst_filename, zhzre.what());
+ return EXIT_FAILURE;
+ }
+ catch (hfst_ol::ZHfstXmlParsingError& zhxpe)
+ {
+ //std::cerr << "Cannot finish reading index.xml from "
+ // << zhfst_filename << ":" << std::endl
+ // << zhxpe.what() << "." << std::endl;
+ hfst_fprintf(stderr,
+ "Cannot finish reading index.xml from %s:\n"
+ "%s.\n",
+ zhfst_filename, zhxpe.what());
+ return EXIT_FAILURE;
+ }
+ if (verbose)
+ {
+ //std::cout << "Following metadata was read from ZHFST archive:" << std::endl
+ // << speller.metadata_dump() << std::endl;
+ hfst_fprintf(stdout,
+ "Following metadata was read from ZHFST archive:\n"
+ "%s\n",
+ speller.metadata_dump().c_str());
+ }
+ speller.set_queue_limit(suggs);
+ if (suggs != 0 && verbose)
+ {
+ hfst_fprintf(stdout, "Printing only %lu top suggestions per line\n", suggs);
+ }
+ speller.set_weight_limit(max_weight);
+ if (max_weight >= 0.0 && verbose)
+ {
+ hfst_fprintf(stdout, "Not printing suggestions worse than %f\n", max_weight);
+ }
+ speller.set_beam(beam);
+ if (beam >= 0.0 && verbose)
+ {
+ hfst_fprintf(stdout, "Not printing suggestions worse than best by margin %f\n", beam);
+ }
+ speller.set_time_cutoff(time_cutoff);
+ if (time_cutoff > 0.0 && verbose)
+ {
+ hfst_fprintf(stdout, "Not trying to find better suggestions after %f seconds\n", time_cutoff);
+ }
+ char * str = (char*) malloc(2000);
+
+
+#ifdef WINDOWS
+ SetConsoleCP(65001);
+ const HANDLE stdIn = GetStdHandle(STD_INPUT_HANDLE);
+ WCHAR buffer[0x1000];
+ DWORD numRead = 0;
+ while (ReadConsoleW(stdIn, buffer, sizeof buffer / sizeof buffer[0], &numRead, NULL))
+ {
+ std::wstring wstr(buffer, numRead-1); // skip the newline
+ std::string linestr = wide_string_to_string(wstr);
+ free(str);
+ str = strdup(linestr.c_str());
+#else
+ while (!std::cin.eof()) {
+ std::cin.getline(str, 2000);
+#endif
+ if (str[0] == '\0') {
+ continue;
+ }
+ if (str[strlen(str) - 1] == '\r')
+ {
+#ifdef WINDOWS
+ str[strlen(str) - 1] = '\0';
+#else
+ hfst_fprintf(stderr, "There is a WINDOWS linebreak in this file\n"
+ "Please convert with dos2unix or fromdos\n");
+ exit(1);
+#endif
+ }
+ do_spell(speller, str);
+ }
+ free(str);
+ return EXIT_SUCCESS;
+}
+
+int
+ legacy_spell(hfst_ol::Speller * s)
+{
+ ZHfstOspeller speller;
+ speller.inject_speller(s);
+ speller.set_queue_limit(suggs);
+ if (suggs != 0 && verbose)
+ {
+ hfst_fprintf(stdout, "Printing only %lu top suggestions per line\n", suggs);
+ }
+ speller.set_weight_limit(max_weight);
+ if (max_weight >= 0.0 && verbose)
+ {
+ hfst_fprintf(stdout, "Not printing suggestions worse than %f\n", suggs);
+ }
+ speller.set_beam(beam);
+ if (beam >= 0.0 && verbose)
+ {
+ hfst_fprintf(stdout, "Not printing suggestions worse than best by margin %f\n", suggs);
+ }
+ char * str = (char*) malloc(2000);
+
+#ifdef WINDOWS
+ SetConsoleCP(65001);
+ const HANDLE stdIn = GetStdHandle(STD_INPUT_HANDLE);
+ WCHAR buffer[0x1000];
+ DWORD numRead = 0;
+ while (ReadConsoleW(stdIn, buffer, sizeof buffer / sizeof buffer[0], &numRead, NULL))
+ {
+ std::wstring wstr(buffer, numRead-1); // skip the newline
+ std::string linestr = wide_string_to_string(wstr);
+ free(str);
+ str = strdup(linestr.c_str());
+#else
+ while (!std::cin.eof()) {
+ std::cin.getline(str, 2000);
+#endif
+ if (str[0] == '\0') {
+ continue;
+ }
+ if (str[strlen(str) - 1] == '\r')
+ {
+#ifdef WINDOWS
+ str[strlen(str) - 1] = '\0';
+#else
+ hfst_fprintf(stderr, "There is a WINDOWS linebreak in this file\n"
+ "Please convert with dos2unix or fromdos\n");
+ exit(1);
+#endif
+ }
+ do_spell(speller, str);
+ }
+ free(str);
+ return EXIT_SUCCESS;
+}
+
+int main(int argc, char **argv)
+{
+
+ int c;
+ //std::locale::global(std::locale(""));
+
+#if HAVE_GETOPT_H
+ while (true) {
+ static struct option long_options[] =
+ {
+ // first the hfst-mandated options
+ {"help", no_argument, 0, 'h'},
+ {"version", no_argument, 0, 'V'},
+ {"verbose", no_argument, 0, 'v'},
+ {"quiet", no_argument, 0, 'q'},
+ {"silent", no_argument, 0, 's'},
+ {"analyse", no_argument, 0, 'a'},
+ {"limit", required_argument, 0, 'n'},
+ {"max-weight", required_argument, 0, 'w'},
+ {"beam", required_argument, 0, 'b'},
+ {"suggest", no_argument, 0, 'S'},
+ {"time-cutoff", required_argument, 0, 't'},
+ {"real-word", no_argument, 0, 'X'},
+ {"error-model", required_argument, 0, 'm'},
+ {"lexicon", required_argument, 0, 'l'},
+#ifdef WINDOWS
+ {"output-to-console", no_argument, 0, 'k'},
+#endif
+ {0, 0, 0, 0 }
+ };
+
+ int option_index = 0;
+ c = getopt_long(argc, argv, "hVvqsan:w:b:t:SXm:l:k", long_options, &option_index);
+ char* endptr = 0;
+
+ if (c == -1) // no more options to look at
+ break;
+
+ switch (c) {
+ case 'h':
+ print_usage();
+ return EXIT_SUCCESS;
+ break;
+
+ case 'V':
+ print_version();
+ return EXIT_SUCCESS;
+ break;
+
+ case 'v':
+ verbose = true;
+ quiet = false;
+ break;
+
+ case 'q': // fallthrough
+ case 's':
+ quiet = true;
+ verbose = false;
+ break;
+ case 'a':
+ analyse = true;
+ break;
+ case 'n':
+ suggs = strtoul(optarg, &endptr, 10);
+ if (endptr == optarg)
+ {
+ fprintf(stderr, "%s not a strtoul number\n", optarg);
+ exit(1);
+ }
+ else if (*endptr != '\0')
+ {
+ fprintf(stderr, "%s truncated from limit parameter\n", endptr);
+ }
+ break;
+ case 'w':
+ max_weight = strtof(optarg, &endptr);
+ if (endptr == optarg)
+ {
+ fprintf(stderr, "%s is not a float\n", optarg);
+ exit(1);
+ }
+ else if (*endptr != '\0')
+ {
+ fprintf(stderr, "%s truncated from limit parameter\n", endptr);
+ }
+
+ break;
+ case 'b':
+ beam = strtof(optarg, &endptr);
+ if (endptr == optarg)
+ {
+ fprintf(stderr, "%s is not a float\n", optarg);
+ exit(1);
+ }
+ else if (*endptr != '\0')
+ {
+ fprintf(stderr, "%s truncated from limit parameter\n", endptr);
+ }
+
+ break;
+ case 't':
+ time_cutoff = strtof(optarg, &endptr);
+ if (endptr == optarg)
+ {
+ fprintf(stderr, "%s is not a float\n", optarg);
+ exit(1);
+ }
+ else if (*endptr != '\0')
+ {
+ fprintf(stderr, "%s truncated from limit parameter\n", endptr);
+ }
+
+ break;
+#ifdef WINDOWS
+ case 'k':
+ output_to_console = true;
+ break;
+#endif
+ case 'S':
+ suggest = true;
+ break;
+ case 'X':
+ suggest_reals = true;
+ break;
+ case 'm':
+ error_model_filename = optarg;
+ break;
+ case 'l':
+ lexicon_filename = optarg;
+ break;
+ default:
+ std::cerr << "Invalid option\n\n";
+ print_short_help();
+ return EXIT_FAILURE;
+ break;
+ }
+ }
+#else
+ int optind = 1;
+#endif
+ // no more options, we should now be at the input filenames
+ if (optind == (argc - 1))
+ {
+ if (error_model_filename != "" || lexicon_filename != "") {
+ std::cerr << "Give *either* a zhfst speller or --error-model and --lexicon"
+ << std::endl;
+ print_short_help();
+ return EXIT_FAILURE;
+ }
+ return zhfst_spell(argv[optind]);
+ }
+ else if (optind < (argc - 1))
+ {
+ std::cerr << "Too many file parameters" << std::endl;
+ print_short_help();
+ return EXIT_FAILURE;
+ }
+ else if (optind >= argc)
+ {
+ if (error_model_filename == "" || lexicon_filename == "") {
+ std::cerr << "Give *either* a zhfst speller or --error-model and --lexicon"
+ << std::endl;
+ print_short_help();
+ return EXIT_FAILURE;
+ }
+ FILE * err_file = fopen(error_model_filename.c_str(), "r");
+ FILE * lex_file = fopen(lexicon_filename.c_str(), "r");
+ if (err_file == NULL || lex_file == NULL) {
+ perror("Could not open error model or lexicon file");
+ return EXIT_FAILURE;
+ }
+ hfst_ol::Transducer err(err_file);
+ hfst_ol::Transducer lex(lex_file);
+ hfst_ol::Speller * s = new hfst_ol::Speller(&err, &lex);
+ return legacy_spell(s);
+ }
+ return EXIT_SUCCESS;
+ }
diff --git a/no-errormodel.sh b/no-errormodel.sh
new file mode 100755
index 0000000..2d3c5ff
--- /dev/null
+++ b/no-errormodel.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+if test -x ./hfst-ospell ; then
+ if ! cat "$srcdir"/test.strings | ./hfst-ospell no_errormodel.zhfst ; then
+ exit 1
+ fi
+else
+ echo ./hfst-ospell not built
+ exit 77
+fi
+
diff --git a/no_errmodel.xml b/no_errmodel.xml
new file mode 100644
index 0000000..d2d9fd0
--- /dev/null
+++ b/no_errmodel.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<hfstspeller dtdversion="1.0" hfstversion="3">
+ <info>
+ <locale>qtz</locale>
+ <title>Example speller</title>
+ <description>
+ This example is for the automatic test suite of hfst-ospell.
+ </description>
+ <version vcsrev="33459">1.5.73</version>
+ <date>2012-08-15</date>
+ <producer>Flammie</producer>
+ <contact email="flammie at iki.fi"
+ website="http://flammie.dyndns.org/"/>
+ </info>
+ <acceptor type="general" id="acceptor.default.hfst">
+ <title>Example dictionary</title>
+ <title xml:lang="se">Vuola lávlla</title>
+ <description>Example dictionary recognises a word.</description>
+ <description xml:lang="se">
+ Vuola, vuola mun aigon lási
+ vuolas juhkaluvvat,
+ vuola, vuola mun aigon lási
+ vuolas mieladuvvat
+ </description>
+ </acceptor>
+</hfstspeller>
diff --git a/office.cpp b/office.cpp
new file mode 100644
index 0000000..a4f2441
--- /dev/null
+++ b/office.cpp
@@ -0,0 +1,360 @@
+/*
+
+ Copyright 2009 University of Helsinki
+ Copyright 2015 Tino Didriksen <mail at tinodidriksen.com>
+ Code adapted from https://github.com/TinoDidriksen/trie-tools
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+*/
+
+/*
+ Tests up to 16 variations of each input token:
+ - Verbatim
+ - With leading non-alphanumerics removed
+ - With trailing non-alphanumerics removed
+ - With leading and trailing non-alphanumerics removed
+ - Lower-case of all the above
+ - First-upper of all the above
+*/
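+/*
+ For instance (an illustrative token, not from any test data), the input
+ "WORD!" would be tried verbatim, as "WORD" (trailing punctuation trimmed),
+ and then as the lower-case ("word!", "word") and first-upper ("Word!",
+ "Word") forms of those, until one of them is accepted by the speller.
+*/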
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <map>
+#include <sstream>
+#include <stdexcept>
+#include <cmath>
+#include <cerrno>
+#include <cctype>
+
+#define U_CHARSET_IS_UTF8 1
+#include <unicode/uclean.h>
+#include <unicode/ucnv.h>
+#include <unicode/uloc.h>
+#include <unicode/uchar.h>
+#include <unicode/unistr.h>
+
+#include "ol-exceptions.h"
+#include "ospell.h"
+#include "ZHfstOspeller.h"
+
+using hfst_ol::ZHfstOspeller;
+using hfst_ol::Transducer;
+
+typedef std::map<UnicodeString,bool> valid_words_t;
+valid_words_t valid_words;
+
+struct word_t {
+ size_t start, count;
+ UnicodeString buffer;
+};
+std::vector<word_t> words(16); // room for all "up to 16 variations" per token
+std::string buffer;
+UnicodeString ubuffer, uc_buffer;
+size_t cw;
+
+bool verbatim = false;
+bool uc_first = false;
+bool uc_all = true;
+
+bool find_alternatives(ZHfstOspeller& speller, size_t suggs) {
+ for (size_t k=1 ; k <= cw ; ++k) {
+ buffer.clear();
+ words[cw-k].buffer.toUTF8String(buffer);
+ hfst_ol::CorrectionQueue corrections = speller.suggest(buffer);
+
+ if (corrections.size() == 0) {
+ continue;
+ }
+
+ std::cout << "&";
+ // Because speller.set_queue_limit() doesn't actually work, hard limit it here
+ for (size_t i=0, e=corrections.size() ; i<e && i<suggs ; ++i) {
+ std::cout << "\t";
+
+ buffer.clear();
+ if (cw - k != 0) {
+ words[0].buffer.tempSubString(0, words[cw-k].start).toUTF8String(buffer);
+ }
+ if (uc_all) {
+ UnicodeString::fromUTF8(corrections.top().first).toUpper().toUTF8String(buffer);
+ }
+ else if (uc_first) {
+ uc_buffer.setTo(UnicodeString::fromUTF8(corrections.top().first));
+ ubuffer.setTo(uc_buffer, 0, 1);
+ ubuffer.toUpper();
+ ubuffer.append(uc_buffer, 1, uc_buffer.length()-1);
+ ubuffer.toUTF8String(buffer);
+ }
+ else {
+ buffer.append(corrections.top().first);
+ }
+ if (cw - k != 0) {
+ words[0].buffer.tempSubString(words[cw-k].start + words[cw-k].count).toUTF8String(buffer);
+ }
+
+ std::cout << buffer;
+ corrections.pop();
+ }
+ std::cout << std::endl;
+ return true;
+ }
+ return false;
+}
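+// Note: each suggestion printed above is re-fitted to the original token:
+// punctuation trimmed off during matching is put back around it, and the
+// input's casing (all-upper or first-upper) is restored on the correction.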
+
+bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs) {
+ ubuffer.setTo(UnicodeString::fromUTF8(word));
+
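+ // Fast length/character pre-check before the full comparison;
+ // "nuvviDspeller" is apparently a sentinel token from the calling
+ // application and is handled verbatim, with no case variants.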
+ if (word.size() == 13 && word[5] == 'D' && word == "nuvviDspeller") {
+ uc_first = false;
+ uc_all = false;
+ words[0].start = 0;
+ words[0].count = ubuffer.length();
+ words[0].buffer = ubuffer;
+ cw = 1;
+ return false;
+ }
+
+ uc_first = false;
+ uc_all = true;
+ bool has_letters = false;
+ for (int32_t i=0 ; i<ubuffer.length() ; ++i) {
+ if (u_isalpha(ubuffer[i])) {
+ has_letters = true;
+ if (u_isupper(ubuffer[i]) && uc_all) {
+ uc_first = true;
+ }
+ else if (u_islower(ubuffer[i])) {
+ uc_all = false;
+ break;
+ }
+ }
+ }
+
+ // If there are no letters in this token, just ignore it
+ if (has_letters == false) {
+ return true;
+ }
+
+ size_t ichStart = 0, cchUse = ubuffer.length();
+ const UChar *pwsz = ubuffer.getTerminatedBuffer();
+
+ // Always test the full given input
+ words[0].buffer.remove();
+ words[0].start = ichStart;
+ words[0].count = cchUse;
+ words[0].buffer = ubuffer;
+ cw = 1;
+
+ if (cchUse > 1 && !verbatim) {
+ size_t count = cchUse;
+ while (count && !u_isalnum(pwsz[ichStart+count-1])) {
+ --count;
+ }
+ if (count != cchUse) {
+ // If the input ended with non-alphanumerics, test input with non-alphanumerics trimmed from the end
+ words[cw].buffer.remove();
+ words[cw].start = ichStart;
+ words[cw].count = count;
+ words[cw].buffer.append(pwsz, words[cw].start, words[cw].count);
+ ++cw;
+ }
+
+ size_t start = ichStart, count2 = cchUse;
+ while (start < ichStart+cchUse && !u_isalnum(pwsz[start])) {
+ ++start;
+ --count2;
+ }
+ if (start != ichStart) {
+ // If the input started with non-alphanumerics, test input with non-alphanumerics trimmed from the start
+ words[cw].buffer.remove();
+ words[cw].start = start;
+ words[cw].count = count2;
+ words[cw].buffer.append(pwsz, words[cw].start, words[cw].count);
+ ++cw;
+ }
+
+ if (start != ichStart && count != cchUse) {
+ // If the input both started and ended with non-alphanumerics, test input with non-alphanumerics trimmed from both sides
+ words[cw].buffer.remove();
+ words[cw].start = start;
+ words[cw].count = count - (cchUse - count2);
+ words[cw].buffer.append(pwsz, words[cw].start, words[cw].count);
+ ++cw;
+ }
+ }
+
+ for (size_t i=0, e=cw ; i<e ; ++i) {
+ // If we are looking for suggestions, don't use the cache
+ valid_words_t::iterator it = suggs ? valid_words.end() : valid_words.find(words[i].buffer);
+
+ if (it == valid_words.end()) {
+ buffer.clear();
+ words[i].buffer.toUTF8String(buffer);
+ bool valid = speller.spell(buffer);
+ it = valid_words.insert(std::make_pair(words[i].buffer,valid)).first;
+
+ if (!valid && !verbatim) {
+ // If the word was not valid, fold it to lower case and try again
+ buffer.clear();
+ ubuffer = words[i].buffer;
+ ubuffer.toLower();
+ ubuffer.toUTF8String(buffer);
+
+ // Add the lower case variant to the list so that we get suggestions using that, if need be
+ words[cw].start = words[i].start;
+ words[cw].count = words[i].count;
+ words[cw].buffer = ubuffer;
+ ++cw;
+
+ // Don't try again if the lower cased variant has already been tried
+ valid_words_t::iterator itl = suggs ? valid_words.end() : valid_words.find(ubuffer);
+ if (itl != valid_words.end()) {
+ it->second = itl->second;
+ it = itl;
+ }
+ else {
+ valid = speller.spell(buffer);
+ it->second = valid; // Also mark the original mixed case variant as whatever the lower cased one was
+ it = valid_words.insert(std::make_pair(words[i].buffer,valid)).first;
+ }
+ }
+
+ if (!valid && !verbatim && (uc_all || uc_first)) {
+ // If the word was still not valid but had upper case, try a first-upper variant
+ buffer.clear();
+ ubuffer.setTo(words[i].buffer, 0, 1);
+ ubuffer.toUpper();
+ uc_buffer.setTo(words[i].buffer, 1);
+ uc_buffer.toLower();
+ ubuffer.append(uc_buffer);
+ ubuffer.toUTF8String(buffer);
+
+ // Add the first-upper variant to the list so that we get suggestions using that, if need be
+ words[cw].start = words[i].start;
+ words[cw].count = words[i].count;
+ words[cw].buffer = ubuffer;
+ ++cw;
+
+ // Don't try again if the first-upper variant has already been tried
+ valid_words_t::iterator itl = suggs ? valid_words.end() : valid_words.find(ubuffer);
+ if (itl != valid_words.end()) {
+ it->second = itl->second;
+ it = itl;
+ }
+ else {
+ valid = speller.spell(buffer);
+ it->second = valid; // Also mark the original mixed case variant as whatever the first-upper one was
+ it = valid_words.insert(std::make_pair(words[i].buffer,valid)).first;
+ }
+ }
+ }
+
+ if (it->second == true) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+int zhfst_spell(const char* zhfst_filename) {
+ ZHfstOspeller speller;
+ try {
+ speller.read_zhfst(zhfst_filename);
+ }
+ catch (hfst_ol::ZHfstMetaDataParsingError& zhmdpe) {
+ fprintf(stderr, "cannot finish reading zhfst archive %s:\n%s.\n", zhfst_filename, zhmdpe.what());
+ return EXIT_FAILURE;
+ }
+ catch (hfst_ol::ZHfstZipReadingError& zhzre) {
+ fprintf(stderr, "cannot read zhfst archive %s:\n%s.\n", zhfst_filename, zhzre.what());
+ return EXIT_FAILURE;
+ }
+ catch (hfst_ol::ZHfstXmlParsingError& zhxpe) {
+ fprintf(stderr, "Cannot finish reading index.xml from %s:\n%s.\n", zhfst_filename, zhxpe.what());
+ return EXIT_FAILURE;
+ }
+
+ std::cout << "@@ hfst-ospell-office is alive" << std::endl;
+
+ std::string line;
+ std::string word;
+ std::istringstream ss;
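+ // The request protocol, as implemented below, is line-based: each input
+ // line is "<max-suggestions><TAB><token>", e.g. "5\tspelin", and the
+ // reply is "*" (token is valid), "&" followed by tab-separated
+ // corrections, "#" (no corrections found), or "!" (malformed request).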
+ while (std::getline(std::cin, line)) {
+ while (!line.empty() && std::isspace(static_cast<unsigned char>(line[line.size()-1]))) {
+ line.resize(line.size()-1);
+ }
+ if (line.empty()) {
+ continue;
+ }
+ // Just in case anyone decides to use the speller for a minor eternity
+ if (valid_words.size() > 20480) {
+ valid_words.clear();
+ }
+
+ ss.clear();
+ ss.str(line);
+ size_t suggs = 0;
+ char c = 0;
+ if (!(ss >> suggs) || !ss.get(c) || !std::getline(ss, line)) {
+ std::cout << "!" << std::endl;
+ continue;
+ }
+
+ if (is_valid_word(speller, line, suggs)) {
+ std::cout << "*" << std::endl;
+ continue;
+ }
+
+ if (!suggs || !find_alternatives(speller, suggs)) {
+ std::cout << "#" << std::endl;
+ }
+ }
+ return EXIT_SUCCESS;
+}
+
+int main(int argc, char **argv) {
+ UErrorCode status = U_ZERO_ERROR;
+ u_init(&status);
+ if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) {
+ std::cerr << "Error: Cannot initialize ICU. Status = " << u_errorName(status) << std::endl;
+ return -1;
+ }
+
+ ucnv_setDefaultName("UTF-8");
+ uloc_setDefault("en_US_POSIX", &status);
+
+ std::vector<std::string> args(argv, argv+argc);
+ for (std::vector<std::string>::iterator it=args.begin() ; it != args.end() ; ) {
+ if (*it == "--verbatim") {
+ verbatim = true;
+ it = args.erase(it);
+ }
+ else {
+ ++it;
+ }
+ }
+
+ if (args.size() < 2) {
+ throw std::invalid_argument("Must pass a zhfst as argument");
+ }
+
+ int rv = zhfst_spell(args[1].c_str());
+
+ u_cleanup();
+ return rv;
+}
diff --git a/ol-exceptions.h b/ol-exceptions.h
new file mode 100644
index 0000000..2116c08
--- /dev/null
+++ b/ol-exceptions.h
@@ -0,0 +1,84 @@
+#ifndef _OL_EXCEPTIONS_H
+#define _OL_EXCEPTIONS_H
+
+#include <string>
+#include <sstream>
+
+namespace hfst_ol
+{
+
+// Each concrete exception type inherits from this structure. Taken from HFST
+// library code.
+//! @brief Top level exception class for ospell related errors.
+
+//! Ospell exceptions can hold basic back-track information for programmer as
+//! well as human readable explanation.
+struct OspellException
+{
+ std::string name; //!< short description of exception
+ std::string file; //!< file name of exception
+ size_t line; //!< line number of exception
+
+ OspellException(void) {}
+
+//!
+//! construct exception with name, file and location
+OspellException(const std::string &name,const std::string &file,size_t line):
+ name(name),
+ file(file),
+ line(line)
+ {}
+
+ //!
+ //! create string representation of exception for output
+ std::string operator() (void) const
+ {
+ std::ostringstream o;
+ o << "Exception: "<< name << " in file: "
+ << file << " on line: " << line;
+ return o.str();
+ }
+ //!
+ //! create char array representation of exception for output
+ const char* what()
+ {
+ std::ostringstream o;
+ o << file << ":" << line << ":" << name;
+ // keep the text alive in a member; returning c_str() of the
+ // temporary string would hand the caller a dangling pointer
+ what_buffer = o.str();
+ return what_buffer.c_str();
+ }
+ std::string what_buffer; //!< backing storage for the what() message
+};
+
+// These macros are used instead of the normal exception facilities.
+
+#define HFST_THROW(E) throw E(#E,__FILE__,__LINE__)
+
+#define HFST_THROW_MESSAGE(E,M) throw E(std::string(#E)+": "+std::string(M)\
+ ,__FILE__,__LINE__)
+
+#define HFST_EXCEPTION_CHILD_DECLARATION(CHILD) \
+ struct CHILD : public OspellException \
+ { CHILD(const std::string &name,const std::string &file,size_t line):\
+ OspellException(name,file,line) {}}
+
+#define HFST_CATCH(E) \
+ catch (const E &e) \
+ { \
+ std::cerr << e.file << ", line " << e.line << ": " << \
+ e() << std::endl; \
+ }
+
+// Now the exceptions themselves
+
+HFST_EXCEPTION_CHILD_DECLARATION(HeaderParsingException);
+
+HFST_EXCEPTION_CHILD_DECLARATION(AlphabetParsingException);
+
+HFST_EXCEPTION_CHILD_DECLARATION(IndexTableReadingException);
+
+HFST_EXCEPTION_CHILD_DECLARATION(TransitionTableReadingException);
+
+HFST_EXCEPTION_CHILD_DECLARATION(UnweightedSpellerException);
+
+HFST_EXCEPTION_CHILD_DECLARATION(TransducerTypeException);
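+
+// Example usage (a sketch): transducer-reading code can signal a malformed
+// header with
+// HFST_THROW_MESSAGE(HeaderParsingException, "missing header cookie");
+// and a caller may handle it with HFST_CATCH(HeaderParsingException), or
+// catch OspellException directly and inspect e() or e.what() for the
+// file:line context.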
+} // namespace
+#endif // _OL_EXCEPTIONS_H
diff --git a/ospell.cc b/ospell.cc
new file mode 100644
index 0000000..04ad5b9
--- /dev/null
+++ b/ospell.cc
@@ -0,0 +1,1174 @@
+// Copyright 2010 University of Helsinki
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include "ospell.h"
+
+namespace hfst_ol {
+
+int nByte_utf8(unsigned char c)
+{
+ /* utility function: given the first byte of a UTF-8 sequence, return
+ how many bytes long that character is (0 for a continuation byte),
+ so it can be peeled off and represented as an unknown symbol */
+ if (c <= 127) {
+ return 1;
+ } else if ( (c & (128 + 64 + 32 + 16)) == (128 + 64 + 32 + 16) ) {
+ return 4;
+ } else if ( (c & (128 + 64 + 32 )) == (128 + 64 + 32) ) {
+ return 3;
+ } else if ( (c & (128 + 64 )) == (128 + 64)) {
+ return 2;
+ } else {
+ return 0;
+ }
+}
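+// For example, nByte_utf8('a') returns 1, while for the lead bytes of the
+// UTF-8 encodings of "ä" (0xC3), "€" (0xE2) and "𐍈" (0xF0) it returns 2, 3
+// and 4 respectively, and 0 for a continuation byte such as 0xB1.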
+
+bool
+StringWeightComparison::operator()(StringWeightPair lhs, StringWeightPair rhs)
+{ // return true when we want rhs to appear before lhs
+ if (reverse) {
+ return (lhs.second < rhs.second);
+ } else {
+ return (lhs.second > rhs.second);
+ }
+}
+
+bool
+StringPairWeightComparison::operator()(StringPairWeightPair lhs,
+ StringPairWeightPair rhs)
+{ // return true when we want rhs to appear before lhs
+ if (reverse) {
+ return (lhs.second < rhs.second);
+ } else {
+ return (lhs.second > rhs.second);
+ }
+}
+
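+// WeightQueue keeps its contents sorted in ascending order: push() inserts
+// in place, get_lowest() reads the front and get_highest() the back, and an
+// empty queue reports an effectively infinite weight from both accessors.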
+void WeightQueue::push(Weight w)
+{
+ for (WeightQueue::iterator it = begin(); it != end(); ++it) {
+ if (*it > w) {
+ insert(it, w);
+ return;
+ }
+ }
+ push_back(w);
+}
+
+void WeightQueue::pop(void)
+{
+ pop_back();
+}
+
+Weight WeightQueue::get_lowest(void) const
+{
+ if (size() == 0) {
+ return std::numeric_limits<Weight>::max();
+ }
+ return front();
+}
+
+Weight WeightQueue::get_highest(void) const
+{
+ if (size() == 0) {
+ return std::numeric_limits<Weight>::max();
+ }
+ return back();
+}
+
+Transducer::Transducer(FILE* f):
+ header(TransducerHeader(f)),
+ alphabet(TransducerAlphabet(f, header.symbol_count())),
+ keys(alphabet.get_key_table()),
+ encoder(keys,header.input_symbol_count()),
+ indices(f,header.index_table_size()),
+ transitions(f,header.target_table_size())
+ {}
+
+Transducer::Transducer(char* raw):
+ header(TransducerHeader(&raw)),
+ alphabet(TransducerAlphabet(&raw, header.symbol_count())),
+ keys(alphabet.get_key_table()),
+ encoder(keys,header.input_symbol_count()),
+ indices(&raw,header.index_table_size()),
+ transitions(&raw,header.target_table_size())
+ {}
+
+TreeNode TreeNode::update_lexicon(SymbolNumber symbol,
+ TransitionTableIndex next_lexicon,
+ Weight weight)
+{
+ SymbolVector str(this->string);
+ if (symbol != 0) {
+ str.push_back(symbol);
+ }
+ return TreeNode(str,
+ this->input_state,
+ this->mutator_state,
+ next_lexicon,
+ this->flag_state,
+ this->weight + weight);
+}
+
+TreeNode TreeNode::update_mutator(TransitionTableIndex next_mutator,
+ Weight weight)
+{
+ return TreeNode(this->string,
+ this->input_state,
+ next_mutator,
+ this->lexicon_state,
+ this->flag_state,
+ this->weight + weight);
+}
+
+TreeNode TreeNode::update(SymbolNumber symbol,
+ unsigned int next_input,
+ TransitionTableIndex next_mutator,
+ TransitionTableIndex next_lexicon,
+ Weight weight)
+{
+ SymbolVector str(this->string);
+ if (symbol != 0) {
+ str.push_back(symbol);
+ }
+ return TreeNode(str,
+ next_input,
+ next_mutator,
+ next_lexicon,
+ this->flag_state,
+ this->weight + weight);
+}
+
+TreeNode TreeNode::update(SymbolNumber symbol,
+ TransitionTableIndex next_mutator,
+ TransitionTableIndex next_lexicon,
+ Weight weight)
+{
+ SymbolVector str(this->string);
+ if (symbol != 0) {
+ str.push_back(symbol);
+ }
+ return TreeNode(str,
+ this->input_state,
+ next_mutator,
+ next_lexicon,
+ this->flag_state,
+ this->weight + weight);
+}
+
+bool TreeNode::try_compatible_with(FlagDiacriticOperation op)
+{
+ switch (op.Operation()) {
+
+ case P: // positive set
+ flag_state[op.Feature()] = op.Value();
+ return true;
+
+ case N: // negative set (literally, in this implementation)
+ flag_state[op.Feature()] = -1*op.Value();
+ return true;
+
+ case R: // require
+ if (op.Value() == 0) { // "plain" require, return false if unset
+ return (flag_state[op.Feature()] != 0);
+ }
+ return (flag_state[op.Feature()] == op.Value());
+
+ case D: // disallow
+ if (op.Value() == 0) { // "plain" disallow, return true if unset
+ return (flag_state[op.Feature()] == 0);
+ }
+ return (flag_state[op.Feature()] != op.Value());
+
+ case C: // clear
+ flag_state[op.Feature()] = 0;
+ return true;
+
+ case U: // unification
+ /* if the feature is unset OR the feature is to this value already OR
+ the feature is negatively set to something else than this value */
+ if (flag_state[op.Feature()] == 0 ||
+ flag_state[op.Feature()] == op.Value() ||
+ (flag_state[op.Feature()] < 0 &&
+ (flag_state[op.Feature()] * -1 != op.Value()))
+ ) {
+ flag_state[op.Feature()] = op.Value();
+ return true;
+ }
+ return false;
+ }
+
+ return false; // to make the compiler happy
+}
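+
+/* Example of the unification case above: with feature F unset
+   (flag_state[F] == 0), @U.F.V@ sets flag_state[F] to V and succeeds; a later
+   @R.F.V@ on the same search path then succeeds, while @D.F.V@ fails and
+   prunes that branch. */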
+
+Speller::Speller(Transducer* mutator_ptr, Transducer* lexicon_ptr):
+ mutator(mutator_ptr),
+ lexicon(lexicon_ptr),
+ input(),
+ queue(TreeNodeQueue()),
+ next_node(FlagDiacriticState(get_state_size(), 0)),
+ limit(std::numeric_limits<Weight>::max()),
+ alphabet_translator(SymbolVector()),
+ operations(lexicon->get_operations()),
+ limiting(None),
+ mode(Correct),
+ max_clock(-1)
+ {
+ if (mutator != NULL) {
+ build_alphabet_translator();
+ cache = std::vector<CacheContainer>(
+ mutator->get_key_table()->size(), CacheContainer());
+ }
+ }
+
+
+SymbolNumber
+Speller::get_state_size()
+{
+ return lexicon->get_state_size();
+}
+
+
+void Speller::lexicon_epsilons(void)
+{
+ if (!lexicon->has_epsilons_or_flags(next_node.lexicon_state + 1)) {
+ return;
+ }
+ TransitionTableIndex next = lexicon->next(next_node.lexicon_state, 0);
+ STransition i_s = lexicon->take_epsilons_and_flags(next);
+
+ while (i_s.symbol != NO_SYMBOL) {
+ if (is_under_weight_limit(next_node.weight + i_s.weight)) {
+ if (lexicon->transitions.input_symbol(next) == 0) {
+ queue.push_back(next_node.update_lexicon((mode == Correct) ? 0 : i_s.symbol,
+ i_s.index,
+ i_s.weight));
+ } else {
+ FlagDiacriticState old_flags = next_node.flag_state;
+ if (next_node.try_compatible_with( // this is terrible
+ operations->operator[](
+ lexicon->transitions.input_symbol(next)))) {
+ queue.push_back(next_node.update_lexicon(0,
+ i_s.index,
+ i_s.weight));
+ next_node.flag_state = old_flags;
+ }
+ }
+ }
+ ++next;
+ i_s = lexicon->take_epsilons_and_flags(next);
+ }
+}
+
+void Speller::lexicon_consume(void)
+{
+ unsigned int input_state = next_node.input_state;
+ if (input_state >= input.size()) {
+ // no more input
+ return;
+ }
+ SymbolNumber this_input = alphabet_translator[input[input_state]];
+ if(!lexicon->has_transitions(
+ next_node.lexicon_state + 1, this_input)) {
+ // we have no regular transitions for this
+ if (this_input >= lexicon->get_alphabet()->get_orig_symbol_count()) {
+ // this input was not originally in the alphabet, so unknown or identity
+ // may apply
+ if (lexicon->get_unknown() != NO_SYMBOL &&
+ lexicon->has_transitions(next_node.lexicon_state + 1,
+ lexicon->get_unknown())) {
+ queue_lexicon_arcs(lexicon->get_unknown(),
+ next_node.mutator_state,
+ 0.0, 1);
+ }
+ if (lexicon->get_identity() != NO_SYMBOL &&
+ lexicon->has_transitions(next_node.lexicon_state + 1,
+ lexicon->get_identity())) {
+ queue_lexicon_arcs(lexicon->get_identity(),
+ next_node.mutator_state,
+ 0.0, 1);
+ }
+ }
+ return;
+ }
+ queue_lexicon_arcs(this_input,
+ next_node.mutator_state, 0.0, 1);
+}
+
+void Speller::queue_lexicon_arcs(SymbolNumber input_sym,
+ unsigned int mutator_state,
+ Weight mutator_weight,
+ int input_increment)
+{
+ TransitionTableIndex next = lexicon->next(next_node.lexicon_state,
+ input_sym);
+ STransition i_s = lexicon->take_non_epsilons(next, input_sym);
+ while (i_s.symbol != NO_SYMBOL) {
+ if (i_s.symbol == lexicon->get_identity()) {
+ i_s.symbol = input[next_node.input_state];
+ }
+ if (is_under_weight_limit(next_node.weight + i_s.weight + mutator_weight)) {
+ queue.push_back(next_node.update(
+ (mode == Correct) ? input_sym : i_s.symbol,
+ next_node.input_state + input_increment,
+ mutator_state,
+ i_s.index,
+ i_s.weight + mutator_weight));
+ }
+ ++next;
+ i_s = lexicon->take_non_epsilons(next, input_sym);
+ }
+}
+
+void Speller::mutator_epsilons(void)
+{
+ if (!mutator->has_transitions(next_node.mutator_state + 1, 0)) {
+ return;
+ }
+ TransitionTableIndex next_m = mutator->next(next_node.mutator_state, 0);
+ STransition mutator_i_s = mutator->take_epsilons(next_m);
+
+ while (mutator_i_s.symbol != NO_SYMBOL) {
+ if (mutator_i_s.symbol == 0) {
+ if (is_under_weight_limit(
+ next_node.weight + mutator_i_s.weight)) {
+ queue.push_back(next_node.update_mutator(mutator_i_s.index,
+ mutator_i_s.weight));
+ }
+ ++next_m;
+ mutator_i_s = mutator->take_epsilons(next_m);
+ continue;
+ } else if (!lexicon->has_transitions(
+ next_node.lexicon_state + 1,
+ alphabet_translator[mutator_i_s.symbol])) {
+ // we have no regular transitions for this
+ if (alphabet_translator[mutator_i_s.symbol] >= lexicon->get_alphabet()->get_orig_symbol_count()) {
+ // this input was not originally in the alphabet, so unknown or identity
+ // may apply
+ if (lexicon->get_unknown() != NO_SYMBOL &&
+ lexicon->has_transitions(next_node.lexicon_state + 1,
+ lexicon->get_unknown())) {
+ queue_lexicon_arcs(lexicon->get_unknown(),
+ mutator_i_s.index, mutator_i_s.weight);
+ }
+ if (lexicon->get_identity() != NO_SYMBOL &&
+ lexicon->has_transitions(next_node.lexicon_state + 1,
+ lexicon->get_identity())) {
+ queue_lexicon_arcs(lexicon->get_identity(),
+ mutator_i_s.index, mutator_i_s.weight);
+ }
+ }
+ ++next_m;
+ mutator_i_s = mutator->take_epsilons(next_m);
+ continue;
+ }
+ queue_lexicon_arcs(alphabet_translator[mutator_i_s.symbol],
+ mutator_i_s.index, mutator_i_s.weight);
+ ++next_m;
+ mutator_i_s = mutator->take_epsilons(next_m);
+ }
+}
+
+
+bool Speller::is_under_weight_limit(Weight w) const
+{
+ if (limiting == Nbest) {
+ return w < limit;
+ }
+ return w <= limit;
+}
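+
+/* Note that the Nbest comparison above is strict: a candidate whose weight
+   only ties the current n-best cutoff is not expanded further, presumably
+   since it could not improve the n-best set, while under the other limiting
+   modes a candidate exactly at the limit is still acceptable. */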
+
+void Speller::consume_input()
+{
+ if (next_node.input_state >= input.size()) {
+ return; // not enough input to consume
+ }
+ SymbolNumber input_sym = input[next_node.input_state];
+ if (!mutator->has_transitions(next_node.mutator_state + 1,
+ input_sym)) {
+ // we have no regular transitions for this
+ if (input_sym >= mutator->get_alphabet()->get_orig_symbol_count()) {
+ // this input was not originally in the alphabet, so unknown or identity
+ // may apply
+ if (mutator->get_identity() != NO_SYMBOL &&
+ mutator->has_transitions(next_node.mutator_state + 1,
+ mutator->get_identity())) {
+ queue_mutator_arcs(mutator->get_identity());
+ }
+ if (mutator->get_unknown() != NO_SYMBOL &&
+ mutator->has_transitions(next_node.mutator_state + 1,
+ mutator->get_unknown())) {
+ queue_mutator_arcs(mutator->get_unknown());
+ }
+ }
+ } else {
+ queue_mutator_arcs(input_sym);
+ }
+}
+
+void Speller::queue_mutator_arcs(SymbolNumber input_sym)
+{
+ TransitionTableIndex next_m = mutator->next(next_node.mutator_state,
+ input_sym);
+ STransition mutator_i_s = mutator->take_non_epsilons(next_m,
+ input_sym);
+ while (mutator_i_s.symbol != NO_SYMBOL) {
+ if (mutator_i_s.symbol == 0) {
+ if (is_under_weight_limit(
+ next_node.weight + mutator_i_s.weight)) {
+ queue.push_back(next_node.update(0, next_node.input_state + 1,
+ mutator_i_s.index,
+ next_node.lexicon_state,
+ mutator_i_s.weight));
+ }
+ ++next_m;
+ mutator_i_s = mutator->take_non_epsilons(next_m, input_sym);
+ continue;
+ } else if (!lexicon->has_transitions(
+ next_node.lexicon_state + 1,
+ alphabet_translator[mutator_i_s.symbol])) {
+ // we have no regular transitions for this
+ if (alphabet_translator[mutator_i_s.symbol] >= lexicon->get_alphabet()->get_orig_symbol_count()) {
+ // this input was not originally in the alphabet, so unknown or identity
+ // may apply
+ if (lexicon->get_unknown() != NO_SYMBOL &&
+ lexicon->has_transitions(next_node.lexicon_state + 1,
+ lexicon->get_unknown())) {
+ queue_lexicon_arcs(lexicon->get_unknown(),
+ mutator_i_s.index, mutator_i_s.weight, 1);
+ }
+ if (lexicon->get_identity() != NO_SYMBOL &&
+ lexicon->has_transitions(next_node.lexicon_state + 1,
+ lexicon->get_identity())) {
+ queue_lexicon_arcs(lexicon->get_identity(),
+ mutator_i_s.index, mutator_i_s.weight, 1);
+ }
+ }
+ ++next_m;
+ mutator_i_s = mutator->take_non_epsilons(next_m, input_sym);
+ continue;
+ }
+ queue_lexicon_arcs(alphabet_translator[mutator_i_s.symbol],
+ mutator_i_s.index, mutator_i_s.weight, 1);
+ ++next_m;
+ mutator_i_s = mutator->take_non_epsilons(next_m,
+ input_sym);
+ }
+}
+
+bool Transducer::initialize_input_vector(SymbolVector & input_vector,
+ Encoder * encoder,
+ char * line)
+{
+ input_vector.clear();
+ SymbolNumber k = NO_SYMBOL;
+ char ** inpointer = &line;
+ char * oldpointer;
+ while (**inpointer != '\0') {
+ oldpointer = *inpointer;
+ k = encoder->find_key(inpointer);
+ if (k == NO_SYMBOL) { // no tokenization from alphabet
+ // for real handling of other and identity for unseen symbols,
+ // use the Speller interface analyse()!
+ return false;
+ }
+ input_vector.push_back(k);
+ }
+ return true;
+}
+
+AnalysisQueue Transducer::lookup(char * line)
+{
+ std::map<std::string, Weight> outputs;
+ AnalysisQueue analyses;
+ SymbolVector input;
+ TreeNodeQueue queue;
+ if (!initialize_input_vector(input, &encoder, line)) {
+ return analyses;
+ }
+ TreeNode start_node(FlagDiacriticState(get_state_size(), 0));
+ queue.assign(1, start_node);
+
+ while (queue.size() > 0) {
+ TreeNode next_node = queue.back();
+ queue.pop_back();
+
+ // Final states
+ if (next_node.input_state == input.size() &&
+ is_final(next_node.lexicon_state)) {
+ Weight weight = next_node.weight +
+ final_weight(next_node.lexicon_state);
+ std::string output = stringify(get_key_table(),
+ next_node.string);
+ /* if the result is novel or lower weighted than before, insert it */
+ if (outputs.count(output) == 0 ||
+ outputs[output] > weight) {
+ outputs[output] = weight;
+ }
+ }
+
+ TransitionTableIndex next_index;
+ // epsilon loop
+ if (has_epsilons_or_flags(next_node.lexicon_state + 1)) {
+ next_index = next(next_node.lexicon_state, 0);
+ STransition i_s = take_epsilons_and_flags(next_index);
+ while (i_s.symbol != NO_SYMBOL) {
+ if (transitions.input_symbol(next_index) == 0) {
+ queue.push_back(next_node.update_lexicon(i_s.symbol,
+ i_s.index,
+ i_s.weight));
+ // Not a true epsilon but a flag diacritic
+ } else {
+ FlagDiacriticState old_flags = next_node.flag_state;
+ if (next_node.try_compatible_with(
+ get_operations()->operator[](
+ transitions.input_symbol(next_index)))) {
+ queue.push_back(next_node.update_lexicon(i_s.symbol,
+ i_s.index,
+ i_s.weight));
+ next_node.flag_state = old_flags;
+ }
+ }
+ ++next_index;
+ i_s = take_epsilons_and_flags(next_index);
+ }
+ }
+
+ // input consumption loop
+ unsigned int input_state = next_node.input_state;
+ if (input_state < input.size() &&
+ has_transitions(
+ next_node.lexicon_state + 1, input[input_state])) {
+
+ next_index = next(next_node.lexicon_state,
+ input[input_state]);
+ STransition i_s = take_non_epsilons(next_index,
+ input[input_state]);
+
+ while (i_s.symbol != NO_SYMBOL) {
+ queue.push_back(next_node.update(
+ i_s.symbol,
+ input_state + 1,
+ next_node.mutator_state,
+ i_s.index,
+ i_s.weight));
+
+ ++next_index;
+ i_s = take_non_epsilons(next_index, input[input_state]);
+ }
+ }
+
+ }
+
+ std::map<std::string, Weight>::const_iterator it;
+ for (it = outputs.begin(); it != outputs.end(); ++it) {
+ analyses.push(StringWeightPair(it->first, it->second));
+ }
+
+ return analyses;
+}
+
+bool
+Transducer::final_transition(TransitionTableIndex i)
+{
+ return transitions.final(i);
+}
+
+bool
+Transducer::final_index(TransitionTableIndex i)
+{
+ return indices.final(i);
+}
+
+KeyTable*
+Transducer::get_key_table()
+{
+ return keys;
+}
+
+SymbolNumber
+Transducer::find_next_key(char** p)
+{
+ return encoder.find_key(p);
+}
+
+Encoder*
+Transducer::get_encoder()
+{
+ return &encoder;
+}
+
+unsigned int
+Transducer::get_state_size()
+{
+ return alphabet.get_state_size();
+}
+
+SymbolNumber
+Transducer::get_unknown() const
+{
+ return alphabet.get_unknown();
+}
+
+SymbolNumber
+Transducer::get_identity() const
+{
+ return alphabet.get_identity();
+}
+
+TransducerAlphabet*
+Transducer::get_alphabet()
+{
+ return &alphabet;
+}
+
+OperationMap*
+Transducer::get_operations()
+{
+ return alphabet.get_operation_map();
+}
+
+TransitionTableIndex Transducer::next(const TransitionTableIndex i,
+ const SymbolNumber symbol) const
+{
+ if (i >= TARGET_TABLE) {
+ return i - TARGET_TABLE + 1;
+ } else {
+ return indices.target(i+1+symbol) - TARGET_TABLE;
+ }
+}
+
+bool Transducer::has_transitions(const TransitionTableIndex i,
+ const SymbolNumber symbol) const
+{
+ if (symbol == NO_SYMBOL) {
+ return false;
+ }
+ if (i >= TARGET_TABLE) {
+ return (transitions.input_symbol(i - TARGET_TABLE) == symbol);
+ } else {
+ return (indices.input_symbol(i+symbol) == symbol);
+ }
+}
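+
+/* Indexing convention used above: a TransitionTableIndex below TARGET_TABLE
+   refers to the index table, while i >= TARGET_TABLE stands for position
+   (i - TARGET_TABLE) in the transition table. next() strips the bias and
+   returns a plain transition-table position, which is why callers can walk
+   consecutive transitions with ++next. */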
+
+bool Transducer::has_epsilons_or_flags(const TransitionTableIndex i)
+{
+ if (i >= TARGET_TABLE) {
+ return(transitions.input_symbol(i - TARGET_TABLE) == 0||
+ is_flag(transitions.input_symbol(i - TARGET_TABLE)));
+ } else {
+ return (indices.input_symbol(i) == 0);
+ }
+}
+
+bool Transducer::has_non_epsilons_or_flags(const TransitionTableIndex i)
+{
+ if (i >= TARGET_TABLE) {
+ SymbolNumber this_input = transitions.input_symbol(i - TARGET_TABLE);
+ return((this_input != 0 && this_input != NO_SYMBOL) &&
+ !is_flag(this_input));
+ } else {
+ SymbolNumber max_symbol = get_key_table()->size();
+ for (SymbolNumber sym = 1; sym < max_symbol; ++sym) {
+ if (indices.input_symbol(i + sym) == sym) {
+ return true;
+ }
+ }
+ return false;
+ }
+}
+
+STransition Transducer::take_epsilons(const TransitionTableIndex i) const
+{
+ if (transitions.input_symbol(i) != 0) {
+ return STransition(0, NO_SYMBOL);
+ }
+ return STransition(transitions.target(i),
+ transitions.output_symbol(i),
+ transitions.weight(i));
+}
+
+STransition Transducer::take_epsilons_and_flags(const TransitionTableIndex i)
+{
+ if (transitions.input_symbol(i) != 0 &&
+ !is_flag(transitions.input_symbol(i))) {
+ return STransition(0, NO_SYMBOL);
+ }
+ return STransition(transitions.target(i),
+ transitions.output_symbol(i),
+ transitions.weight(i));
+}
+
+STransition Transducer::take_non_epsilons(const TransitionTableIndex i,
+ const SymbolNumber symbol) const
+{
+ if (transitions.input_symbol(i) != symbol) {
+ return STransition(0, NO_SYMBOL);
+ }
+ return STransition(transitions.target(i),
+ transitions.output_symbol(i),
+ transitions.weight(i));
+}
+
+bool Transducer::is_final(const TransitionTableIndex i)
+{
+ if (i >= TARGET_TABLE) {
+ return final_transition(i - TARGET_TABLE);
+ } else {
+ return final_index(i);
+ }
+}
+
+Weight Transducer::final_weight(const TransitionTableIndex i) const
+{
+ if (i >= TARGET_TABLE) {
+ return transitions.weight(i - TARGET_TABLE);
+ } else {
+ return indices.final_weight(i);
+ }
+}
+
+bool
+Transducer::is_flag(const SymbolNumber symbol)
+{
+ return alphabet.is_flag(symbol);
+}
+
+bool
+Transducer::is_weighted(void)
+{
+ return header.probe_flag(Weighted);
+}
+
+
+AnalysisQueue Speller::analyse(char * line, int nbest)
+{
+ mode = Lookup;
+ if (!init_input(line)) {
+ return AnalysisQueue();
+ }
+ std::map<std::string, Weight> outputs;
+ AnalysisQueue analyses;
+ TreeNode start_node(FlagDiacriticState(get_state_size(), 0));
+ queue.assign(1, start_node);
+ while (queue.size() > 0) {
+ next_node = queue.back();
+ queue.pop_back();
+ // Final states
+ if (next_node.input_state == input.size() &&
+ lexicon->is_final(next_node.lexicon_state)) {
+ Weight weight = next_node.weight +
+ lexicon->final_weight(next_node.lexicon_state);
+ std::string output = stringify(lexicon->get_key_table(),
+ next_node.string);
+ /* if the result is novel or lower weighted than before, insert it */
+ if (outputs.count(output) == 0 ||
+ outputs[output] > weight) {
+ outputs[output] = weight;
+ }
+ }
+ lexicon_epsilons();
+ lexicon_consume();
+ }
+ std::map<std::string, Weight>::const_iterator it;
+ for (it = outputs.begin(); it != outputs.end(); ++it) {
+ analyses.push(StringWeightPair(it->first, it->second));
+ }
+ return analyses;
+}
+
+
+
+void Speller::build_cache(SymbolNumber first_sym)
+{
+ TreeNode start_node(FlagDiacriticState(get_state_size(), 0));
+ queue.assign(1, start_node);
+ limit = std::numeric_limits<Weight>::max();
+ // A placeholding map, only one weight per correction
+ StringWeightMap corrections_len_0;
+ StringWeightMap corrections_len_1;
+ while (queue.size() > 0) {
+ next_node = queue.back();
+ queue.pop_back();
+ lexicon_epsilons();
+ mutator_epsilons();
+ if (mutator->is_final(next_node.mutator_state) &&
+ lexicon->is_final(next_node.lexicon_state)) {
+ // complete result of length 0 or 1
+ Weight weight = next_node.weight +
+ lexicon->final_weight(next_node.lexicon_state) +
+ mutator->final_weight(next_node.mutator_state);
+ std::string string = stringify(lexicon->get_key_table(), next_node.string);
+ /* if the correction is novel or better than before, insert it
+ */
+ if (next_node.input_state == 0) {
+ if (corrections_len_0.count(string) == 0 ||
+ corrections_len_0[string] > weight) {
+ corrections_len_0[string] = weight;
+ }
+ } else {
+ if (corrections_len_1.count(string) == 0 ||
+ corrections_len_1[string] > weight) {
+ corrections_len_1[string] = weight;
+ }
+ }
+ }
+ if (next_node.input_state == 1) {
+ cache[first_sym].nodes.push_back(next_node);
+ } else {
+// std::cerr << "discarded node\n";
+ }
+ if (first_sym > 0 && next_node.input_state == 0) {
+ consume_input();
+ }
+ }
+ cache[first_sym].results_len_0.assign(corrections_len_0.begin(), corrections_len_0.end());
+ cache[first_sym].results_len_1.assign(corrections_len_1.begin(), corrections_len_1.end());
+ cache[first_sym].empty = false;
+}
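+
+/* The cache works per first input symbol: build_cache() explores the search
+   space up to input depth 1 and stores both the finished results for inputs
+   of length 0 and 1 (results_len_0 / results_len_1) and the frontier of live
+   TreeNodes at depth 1, so that correct() can resume longer inputs from that
+   frontier instead of restarting at the root. */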
+
+CorrectionQueue Speller::correct(char * line, int nbest,
+ Weight maxweight, Weight beam,
+ float time_cutoff)
+{
+ mode = Correct;
+ // if input initialization fails, return empty correction queue
+ if (!init_input(line)) {
+ return CorrectionQueue();
+ }
+ if (time_cutoff > 0.0) {
+ max_clock = clock() + CLOCKS_PER_SEC*time_cutoff;
+ } else {
+ max_clock = -1;
+ }
+ set_limiting_behaviour(nbest, maxweight, beam);
+ nbest_queue = WeightQueue();
+ // The queue for our suggestions
+ CorrectionQueue correction_queue;
+ // A placeholding map, only one weight per correction
+ std::map<std::string, Weight> corrections;
+ SymbolNumber first_input = (input.size() == 0) ? 0 : input[0];
+ if (cache[first_input].empty) {
+ build_cache(first_input);
+ }
+ if (input.size() <= 1) {
+ // get the cached results and we're done
+ StringWeightVector * results;
+ if (input.size() == 0) {
+ results = &cache[first_input].results_len_0;
+ } else {
+ results = &cache[first_input].results_len_1;
+ }
+        // First establish the correct weight limit
+        for(StringWeightVector::const_iterator it = results->begin();
+            it != results->end(); ++it) {
+ best_suggestion = std::min(best_suggestion, it->second);
+ if (nbest > 0) {
+ nbest_queue.push(it->second);
+ if (nbest_queue.size() > nbest) {
+ nbest_queue.pop();
+ }
+ }
+ }
+ adjust_weight_limits(nbest, beam);
+        // Then collect the results
+        for(StringWeightVector::const_iterator it = results->begin();
+            it != results->end(); ++it) {
+            if (it->second <= limit && (nbest == 0 || // we either don't have an nbest condition or
+                                        (it->second <= nbest_queue.get_highest() && // we're below the worst nbest weight and
+                                        correction_queue.size() < nbest && // still under the requested number of results
+                                        nbest_queue.size() > 0))) {
+ correction_queue.push(StringWeightPair(it->first, it->second));
+ if (nbest != 0) {
+ nbest_queue.pop();
+ }
+ }
+ }
+ return correction_queue;
+ } else {
+ // populate the tree node queue
+ queue.assign(cache[first_input].nodes.begin(), cache[first_input].nodes.end());
+ }
+ // TreeNode start_node(FlagDiacriticState(get_state_size(), 0));
+ // queue.assign(1, start_node);
+
+ while (queue.size() > 0) {
+ if (max_clock > -1 && clock() > max_clock) {
+ break;
+ }
+ /*
+ For depth-first searching, we save the back node now, remove it
+ from the queue and add new nodes to the search at the back.
+ */
+ next_node = queue.back();
+ queue.pop_back();
+ adjust_weight_limits(nbest, beam);
+ // if we can't get an acceptable result, never mind
+ if (next_node.weight > limit) {
+ continue;
+ }
+ if (next_node.input_state > 1) {
+ // Early epsilons were handled during the caching stage
+ lexicon_epsilons();
+ mutator_epsilons();
+ }
+ if (next_node.input_state == input.size()) {
+            /* if both transducers are in final states,
+             * we generate the correction
+             */
+ if (mutator->is_final(next_node.mutator_state) &&
+ lexicon->is_final(next_node.lexicon_state)) {
+ Weight weight = next_node.weight +
+ lexicon->final_weight(next_node.lexicon_state) +
+ mutator->final_weight(next_node.mutator_state);
+ if (weight > limit) {
+ continue;
+ }
+ std::string string = stringify(lexicon->get_key_table(), next_node.string);
+ /* if the correction is novel or better than before, insert it
+ */
+ if (corrections.count(string) == 0 ||
+ corrections[string] > weight) {
+ corrections[string] = weight;
+ best_suggestion = std::min(best_suggestion, weight);
+ if (nbest > 0) {
+ nbest_queue.push(weight);
+ if (nbest_queue.size() > nbest) {
+ nbest_queue.pop();
+ }
+ }
+ }
+ }
+ } else {
+ consume_input();
+ }
+ }
+ adjust_weight_limits(nbest, beam);
+ std::map<std::string, Weight>::iterator it;
+ for (it = corrections.begin(); it != corrections.end(); ++it) {
+        if (it->second <= limit && // we're not over our weight limit and
+            (nbest == 0 || // we either don't have an nbest condition or
+             (it->second <= nbest_queue.get_highest() && // we're below the worst nbest weight and
+             correction_queue.size() < nbest && // still under the requested number of results
+             nbest_queue.size() > 0))) {
+ correction_queue.push(StringWeightPair(it->first, it->second));
+ if (nbest != 0) {
+ nbest_queue.pop();
+ }
+ }
+ }
+ return correction_queue;
+}
+
+void Speller::set_limiting_behaviour(int nbest, Weight maxweight, Weight beam)
+{
+ limiting = None;
+ limit = std::numeric_limits<Weight>::max();
+ best_suggestion = std::numeric_limits<Weight>::max();
+ if (maxweight >= 0.0 && nbest > 0 && beam >= 0.0) {
+ limiting = MaxWeightNbestBeam;
+ limit = maxweight;
+ } else if (maxweight >= 0.0 && nbest > 0 && beam < 0.0) {
+ limiting = MaxWeightNbest;
+ limit = maxweight;
+ } else if (maxweight >= 0.0 && beam >= 0.0 && nbest == 0) {
+ limiting = MaxWeightBeam;
+ limit = maxweight;
+ } else if (maxweight < 0.0 && nbest > 0 && beam >= 0.0) {
+ limiting = NbestBeam;
+ } else if (maxweight >= 0.0 && nbest == 0 && beam < 0.0) {
+ limiting = MaxWeight;
+ limit = maxweight;
+ } else if (maxweight < 0.0 && nbest > 0 && beam < 0.0) {
+ limiting = Nbest;
+ } else if (maxweight < 0.0 && nbest == 0 && beam >= 0.0) {
+ limiting = Beam;
+ }
+}
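+
+/* The dispatch above treats maxweight >= 0.0, nbest > 0 and beam >= 0.0 as
+   three independent switches, one per constraint, which yields the eight
+   LimitingBehaviour values; e.g. correct(line, 3, 10.0, -1.0) selects
+   MaxWeightNbest with an initial limit of 10.0 that adjust_weight_limits()
+   below then tightens as suggestions accumulate. */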
+
+void Speller::adjust_weight_limits(int nbest, Weight beam)
+{
+ if (limiting == Nbest && nbest_queue.size() >= nbest) {
+ limit = nbest_queue.get_highest();
+ } else if (limiting == MaxWeightNbest && nbest_queue.size() >= nbest) {
+ limit = std::min(limit, nbest_queue.get_lowest());
+ } else if (limiting == Beam && best_suggestion < std::numeric_limits<Weight>::max()) {
+ limit = best_suggestion + beam;
+ } else if (limiting == NbestBeam) {
+ if (best_suggestion < std::numeric_limits<Weight>::max()) {
+ if (nbest_queue.size() >= nbest) {
+ limit = std::min(best_suggestion + beam, nbest_queue.get_lowest());
+ } else {
+ limit = best_suggestion + beam;
+ }
+ }
+ } else if (limiting == MaxWeightBeam) {
+ if (best_suggestion < std::numeric_limits<Weight>::max()) {
+ limit = std::min(best_suggestion + beam, limit);
+ }
+ } else if (limiting == MaxWeightNbestBeam) {
+ if (best_suggestion < std::numeric_limits<Weight>::max()) {
+ limit = std::min(limit, best_suggestion + beam);
+ }
+ if (nbest_queue.size() >= nbest) {
+ limit = std::min(limit, nbest_queue.get_lowest());
+ }
+ }
+}
+
+bool Speller::check(char * line)
+{
+ mode = Check;
+ if (!init_input(line)) {
+ return false;
+ }
+ TreeNode start_node(FlagDiacriticState(get_state_size(), 0));
+ queue.assign(1, start_node);
+ limit = std::numeric_limits<Weight>::max();
+
+ while (queue.size() > 0) {
+ next_node = queue.back();
+ queue.pop_back();
+ if (next_node.input_state == input.size() &&
+ lexicon->is_final(next_node.lexicon_state)) {
+ return true;
+ }
+ lexicon_epsilons();
+ lexicon_consume();
+ }
+ return false;
+}
+
+std::string stringify(KeyTable * key_table,
+ SymbolVector & symbol_vector)
+{
+ std::string s;
+ for (SymbolVector::iterator it = symbol_vector.begin();
+ it != symbol_vector.end(); ++it) {
+ if (*it < key_table->size()) {
+ s.append(key_table->at(*it));
+ }
+ }
+ return s;
+}
+
+void Speller::build_alphabet_translator(void)
+{
+ TransducerAlphabet * from = mutator->get_alphabet();
+ TransducerAlphabet * to = lexicon->get_alphabet();
+ KeyTable * from_keys = from->get_key_table();
+ StringSymbolMap * to_symbols = to->get_string_to_symbol();
+ alphabet_translator.push_back(0); // zeroth element is always epsilon
+ for (SymbolNumber i = 1; i < from_keys->size(); ++i) {
+ if (to_symbols->count(from_keys->operator[](i)) != 1) {
+ // A symbol in the error source isn't present in the
+ // lexicon, so we add it.
+ std::string sym = from_keys->operator[](i);
+ SymbolNumber lexicon_key = lexicon->get_key_table()->size();
+ lexicon->get_encoder()->read_input_symbol(sym, lexicon_key);
+ lexicon->get_alphabet()->add_symbol(sym);
+ alphabet_translator.push_back(lexicon_key);
+ continue;
+ }
+ // translator at i points to lexicon's symbol for mutator's string for
+ // mutator's symbol number i
+ alphabet_translator.push_back(
+ to_symbols->operator[](
+ from_keys->operator[](i)));
+ }
+}
+
+bool Speller::init_input(char * line)
+{
+ // Initialize the symbol vector to the tokenization given by encoder.
+ // In the case of tokenization failure, valid utf-8 characters
+ // are tokenized as unknown and tokenization is reattempted from
+ // such a character onwards. The empty string is tokenized as an
+ // empty vector; there is no end marker.
+ input.clear();
+ SymbolNumber k = NO_SYMBOL;
+ char ** inpointer = &line;
+ char * oldpointer;
+
+ while (**inpointer != '\0') {
+ oldpointer = *inpointer;
+ k = mutator->get_encoder()->find_key(inpointer);
+ if (k == NO_SYMBOL) { // no tokenization from alphabet
+ int bytes_to_tokenize = nByte_utf8(static_cast<unsigned char>(*oldpointer));
+ if (bytes_to_tokenize == 0) {
+ return false; // can't parse utf-8 character, admit failure
+ } else {
+ char new_symbol[5]; // max size plus NUL
+ memcpy(new_symbol, oldpointer, bytes_to_tokenize);
+ new_symbol[bytes_to_tokenize] = '\0';
+ std::string new_symbol_string(new_symbol);
+ oldpointer += bytes_to_tokenize;
+ *inpointer = oldpointer;
+ cache.push_back(CacheContainer());
+ if (!lexicon->get_alphabet()->has_string(new_symbol_string)) {
+ lexicon->get_alphabet()->add_symbol(new_symbol_string);
+ }
+ SymbolNumber k_lexicon = lexicon->get_alphabet()->get_string_to_symbol()
+ ->operator[](new_symbol_string);
+ lexicon->get_encoder()->read_input_symbol(new_symbol, k_lexicon);
+ if (!mutator->get_alphabet()->has_string(new_symbol_string)) {
+ mutator->get_alphabet()->add_symbol(new_symbol_string);
+ }
+ k = mutator->get_alphabet()->get_string_to_symbol()->
+ operator[](new_symbol_string);
+ mutator->get_encoder()->read_input_symbol(new_symbol, k);
+ if (k >= alphabet_translator.size()) {
+ add_symbol_to_alphabet_translator(k_lexicon);
+ }
+ input.push_back(k);
+ continue;
+ }
+ } else {
+ input.push_back(k);
+ }
+ }
+ return true;
+}
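+
+/* For instance, if the error model was built without "ä" and the input is
+   "päü", find_key() fails at the first byte it cannot tokenize; the loop
+   above then peels off the two-byte sequence, registers "ä" as a fresh
+   symbol in both the lexicon and the mutator, extends the alphabet
+   translator, and resumes tokenizing at "ü". (The concrete symbols here are
+   only illustrative.) */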
+
+void Speller::add_symbol_to_alphabet_translator(SymbolNumber to_sym)
+{
+ alphabet_translator.push_back(to_sym);
+}
+
+} // namespace hfst_ol
+
+char*
+hfst_strndup(const char* s, size_t n)
+ {
+ char* rv = static_cast<char*>(malloc(sizeof(char)*n+1));
+ if (rv == NULL)
+ {
+ return rv;
+ }
+ rv = static_cast<char*>(memcpy(rv, s, n));
+ if (rv == NULL)
+ {
+ return rv;
+ }
+ rv[n] = '\0';
+ return rv;
+ }
+
diff --git a/ospell.h b/ospell.h
new file mode 100644
index 0000000..17435e3
--- /dev/null
+++ b/ospell.h
@@ -0,0 +1,445 @@
+/* -*- Mode: C++ -*- */
+// Copyright 2010 University of Helsinki
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HFST_OSPELL_OSPELL_H_
+#define HFST_OSPELL_OSPELL_H_ 1
+
+#include <string>
+#include <deque>
+#include <queue>
+#include <list>
+#include <stdexcept>
+#include <limits>
+#include <ctime>
+#include "hfst-ol.h"
+
+namespace hfst_ol {
+
+struct TreeNode;
+struct CacheContainer;
+typedef std::pair<std::string, std::string> StringPair;
+typedef std::pair<std::string, Weight> StringWeightPair;
+typedef std::vector<StringWeightPair> StringWeightVector;
+typedef std::pair<std::pair<std::string, std::string>, Weight>
+ StringPairWeightPair;
+typedef std::vector<TreeNode> TreeNodeVector;
+typedef std::map<std::string, Weight> StringWeightMap;
+
+//! Internal class for transition processing.
+
+//! Contains low-level processing stuff.
+struct STransition{
+ TransitionTableIndex index; //!< index to transition
+ SymbolNumber symbol; //!< symbol of transition
+ Weight weight; //!< weight of transition
+
+ //!
+ //! create transition without weight
+ STransition(TransitionTableIndex i,
+ SymbolNumber s):
+ index(i),
+ symbol(s),
+ weight(0.0)
+ {}
+
+ //! create transition with weight
+ STransition(TransitionTableIndex i,
+ SymbolNumber s,
+ Weight w):
+ index(i),
+ symbol(s),
+ weight(w)
+ {}
+
+};
+//! @brief comparison for establishing order for priority queue for suggestions.
+
+//! The suggestions that are stored in a priority queue are arranged in
+//! ascending order of the weight component, following the basic penalty
+//! weight logic of tropical semiring that is present in most weighted
+//! finite-state spell-checking automata.
+class StringWeightComparison
+/* the comparison is reversed by default because greater weights represent
+   worse results; to undo the reversal, construct with a true argument */
+
+{
+ bool reverse;
+public:
+ //!
+ //! construct a result comparator with ascending or descending weight order
+ StringWeightComparison(bool reverse_result=false):
+ reverse(reverse_result)
+ {}
+
+ //!
+ //! compare two string weight pairs for weights
+ bool operator() (StringWeightPair lhs, StringWeightPair rhs);
+};
+
+//! @brief comparison for complex analysis queues
+//
+//! Follows weight value logic.
+//! @see StringWeightComparison.
+class StringPairWeightComparison
+{
+ bool reverse;
+public:
+ //!
+ //! create result comparator with ascending or descending weight order
+ StringPairWeightComparison(bool reverse_result=false):
+ reverse(reverse_result)
+ {}
+
+ //!
+ //! compare two analysis corrections for weights
+ bool operator() (StringPairWeightPair lhs, StringPairWeightPair rhs);
+};
+
+typedef std::priority_queue<StringWeightPair,
+ std::vector<StringWeightPair>,
+ StringWeightComparison> CorrectionQueue;
+typedef std::priority_queue<StringWeightPair,
+ std::vector<StringWeightPair>,
+ StringWeightComparison> AnalysisQueue;
+typedef std::priority_queue<StringWeightPair,
+ std::vector<StringWeightPair>,
+ StringWeightComparison> HyphenationQueue;
+typedef std::priority_queue<StringPairWeightPair,
+ std::vector<StringPairWeightPair>,
+ StringPairWeightComparison> AnalysisCorrectionQueue;
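+
+// Since the comparison returns true when lhs carries the larger weight, these
+// priority queues keep the pair with the smallest weight at top(), i.e. the
+// best suggestion or analysis is popped first.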
+
+struct WeightQueue: public std::list<Weight>
+{
+ void push(Weight w); // add a new weight
+ void pop(void); // delete the biggest weight
+ Weight get_lowest(void) const;
+ Weight get_highest(void) const;
+};
+
+//! Internal class for Transducer processing.
+
+//! Contains low-level processing stuff.
+class Transducer
+{
+protected:
+ TransducerHeader header; //!< header data
+ TransducerAlphabet alphabet; //!< alphabet data
+ KeyTable * keys; //!< key symbol mappings
+ Encoder encoder; //!< encoder to convert the strings
+
+    static const TransitionTableIndex START_INDEX = 0; //!< position of the start state
+
+public:
+ //!
+ //! read transducer from file @a f
+ Transducer(FILE * f);
+ //!
+    //! read transducer from raw data @a raw
+ Transducer(char * raw);
+ IndexTable indices; //!< index table
+ TransitionTable transitions; //!< transition table
+ //!
+    //! Deprecated functions for single-transducer lookup;
+    //! Speller::analyse() is recommended instead
+ bool initialize_input_vector(SymbolVector & input_vector,
+ Encoder * encoder,
+ char * line);
+ AnalysisQueue lookup(char * line);
+ //!
+ //! whether it's final transition in this transducer
+ bool final_transition(TransitionTableIndex i);
+ //!
+ //! whether it's final index
+ bool final_index(TransitionTableIndex i);
+ //!
+ //! get transducers symbol table mapping
+ KeyTable * get_key_table(void);
+ //!
+ //! find key for string or create it
+ SymbolNumber find_next_key(char ** p);
+ //!
+    //! get encoder for mapping strings and symbols
+ Encoder * get_encoder(void);
+ //!
+ //! get size of a state
+ unsigned int get_state_size(void);
+ //!
+    //! get symbol numbers of the unknown and identity (?) symbols
+ SymbolNumber get_unknown(void) const;
+ SymbolNumber get_identity(void) const;
+ //!
+ //! get alphabet of automaton
+ TransducerAlphabet * get_alphabet(void);
+ //!
+    //! get flag diacritic operations of the automaton
+ OperationMap * get_operations(void);
+ //!
+ //! follow epsilon transitions from index
+ STransition take_epsilons(const TransitionTableIndex i) const;
+ //!
+    //! follow epsilon and flag transitions from index
+ STransition take_epsilons_and_flags(const TransitionTableIndex i);
+ //!
+ //! follow real transitions from index
+ STransition take_non_epsilons(const TransitionTableIndex i,
+ const SymbolNumber symbol) const;
+ //!
+ //! get next index
+ TransitionTableIndex next(const TransitionTableIndex i,
+ const SymbolNumber symbol) const;
+ //!
+    //! get next epsilon index
+ TransitionTableIndex next_e(const TransitionTableIndex i) const;
+ //!
+ //! whether state has any transitions with @a symbol
+ bool has_transitions(const TransitionTableIndex i,
+ const SymbolNumber symbol) const;
+ //!
+    //! whether state has epsilons or flags
+ bool has_epsilons_or_flags(const TransitionTableIndex i);
+ //!
+    //! whether state has transitions that are neither epsilons nor flags
+ bool has_non_epsilons_or_flags(const TransitionTableIndex i);
+ //!
+ //! whether it's final
+ bool is_final(const TransitionTableIndex i);
+ //!
+ //! get final weight
+ Weight final_weight(const TransitionTableIndex i) const;
+ //!
+ //! whether it's a flag
+ bool is_flag(const SymbolNumber symbol);
+ //!
+    //! whether it's weighted
+ bool is_weighted(void);
+
+};
+
+//! Internal class for alphabet processing.
+
+//! Contains low-level processing stuff.
+struct TreeNode
+{
+// SymbolVector input_string; //<! the current input vector
+ SymbolVector string; //!< the current output vector
+ unsigned int input_state; //!< its input state
+ TransitionTableIndex mutator_state; //!< state in error model
+ TransitionTableIndex lexicon_state; //!< state in language model
+ FlagDiacriticState flag_state; //!< state of flags
+ Weight weight; //!< weight
+
+ //!
+ //! construct a node in trie from all that stuff
+ TreeNode(SymbolVector prev_string,
+ unsigned int i,
+ TransitionTableIndex mutator,
+ TransitionTableIndex lexicon,
+ FlagDiacriticState state,
+ Weight w):
+ string(prev_string),
+ input_state(i),
+ mutator_state(mutator),
+ lexicon_state(lexicon),
+ flag_state(state),
+ weight(w)
+ { }
+
+ //!
+ //! construct empty node with a starting state for flags
+ TreeNode(FlagDiacriticState start_state): // starting state node
+ string(SymbolVector()),
+ input_state(0),
+ mutator_state(0),
+ lexicon_state(0),
+ flag_state(start_state),
+ weight(0.0)
+ { }
+
+ //!
+    //! check if tree node is compatible with flag diacritic
+ bool try_compatible_with(FlagDiacriticOperation op);
+
+ //!
+ //! traverse some node in lexicon
+ TreeNode update_lexicon(SymbolNumber next_symbol,
+ TransitionTableIndex next_lexicon,
+ Weight weight);
+
+ //!
+ //! traverse some node in error model
+ TreeNode update_mutator(TransitionTableIndex next_mutator,
+ Weight weight);
+
+ //!
+ //! The update functions return updated copies of this state
+ TreeNode update(SymbolNumber output_symbol,
+ unsigned int next_input,
+ TransitionTableIndex next_mutator,
+ TransitionTableIndex next_lexicon,
+ Weight weight);
+
+ TreeNode update(SymbolNumber output_symbol,
+ TransitionTableIndex next_mutator,
+ TransitionTableIndex next_lexicon,
+ Weight weight);
+
+
+};
+
+typedef std::vector<TreeNode> TreeNodeQueue;
+
+int nByte_utf8(unsigned char c);
+
+//! Exception when speller cannot map characters of error model to language
+//! model.
+
+//! May get raised if error model automaton has output characters that are not
+//! present in language model.
+class AlphabetTranslationException: public std::runtime_error
+{ // "what" should hold the first untranslatable symbol
+public:
+
+ //!
+    //! create alphabet exception with symbol as explanation
+ AlphabetTranslationException(const std::string what):
+ std::runtime_error(what)
+ { }
+};
+
+//! @brief Basic spell-checking automata pair unit.
+
+//! Speller consists of two automata, one for language modeling and one for
+//! error modeling. The speller object has low-level access to the automata
+//! and convenience functions for checking, analysing and correction.
+//! @see ZHfstOspeller for high level access.
+class Speller
+{
+public:
+ Transducer * mutator; //!< error model
+    Transducer * lexicon; //!< language model
+ SymbolVector input; //!< current input
+    TreeNodeQueue queue; //!< current traversal stack (LIFO, for depth-first search)
+ TreeNode next_node; //!< current next node
+ Weight limit; //!< current limit for weights
+ Weight best_suggestion; //!< best suggestion so far
+ WeightQueue nbest_queue; //!< queue to keep track of current n best results
+    SymbolVector alphabet_translator; //!< maps error-model symbols to language-model symbols
+    OperationMap * operations; //!< flag diacritic operations of the language model
+    //! A cache of search results per first input symbol
+    std::vector<CacheContainer> cache;
+    //! what kind of limiting behaviour we have
+ enum LimitingBehaviour { None, MaxWeight, Nbest, Beam, MaxWeightNbest,
+ MaxWeightBeam, NbestBeam, MaxWeightNbestBeam } limiting;
+ //! what mode we're in
+ enum Mode { Check, Correct, Lookup } mode;
+
+ //! stop doing work after this time
+ clock_t max_clock;
+
+ //!
+    //! Create a speller object from error model and language model automata.
+ Speller(Transducer * mutator_ptr, Transducer * lexicon_ptr);
+ //!
+ //! size of states
+ SymbolNumber get_state_size(void);
+ //!
+ //! initialise string conversions
+ void build_alphabet_translator(void);
+ void add_symbol_to_alphabet_translator(SymbolNumber to_sym);
+ //!
+ //! initialize input string
+ bool init_input(char * line);
+ //!
+    //! traverse epsilons in language model
+ void lexicon_epsilons(void);
+ bool has_lexicon_epsilons(void) const
+ {
+ return lexicon->has_epsilons_or_flags(next_node.lexicon_state + 1);
+ }
+ //!
+    //! traverse epsilons in error model
+ void mutator_epsilons(void);
+ bool has_mutator_epsilons(void) const
+ {
+ return mutator->has_transitions(next_node.mutator_state + 1, 0);
+ }
+ //!
+ //! traverse along input
+ void consume_input();
+ //! helper functions for traversal
+ void queue_mutator_arcs(SymbolNumber input);
+ void lexicon_consume(void);
+ void queue_lexicon_arcs(SymbolNumber input,
+ unsigned int mutator_state,
+ Weight mutator_weight = 0.0,
+ int input_increment = 0);
+    //! @brief Check if the given string is accepted by the speller.
+    //
+    //! Returns true iff @a line is in the language model.
+ bool check(char * line);
+ //! @brief suggest corrections for given string @a line.
+ //
+    //! The number of corrections given and stored at any given time
+    //! is limited by @a nbest if it is greater than 0.
+ CorrectionQueue correct(char * line, int nbest = 0,
+ Weight maxweight = -1.0,
+ Weight beam = -1.0,
+ float time_cutoff = 0.0);
+
+ bool is_under_weight_limit(Weight w) const;
+ void set_limiting_behaviour(int nbest, Weight maxweight, Weight beam);
+ void adjust_weight_limits(int nbest, Weight beam);
+
+ //! @brief analyse given string @a line.
+ //
+ //! If language model is two-tape, give a list of analyses for string.
+    //! If not, this should return a queue with the single result @a line
+    //! if the string is in the language model and no results if it isn't.
+ AnalysisQueue analyse(char * line, int nbest = 0);
+
+    //! @brief Construct a cache entry for @a first_sym.
+    void build_cache(SymbolNumber first_sym);
+
+};
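+
+/* A minimal usage sketch (the file names and the test word are made up for
+   illustration; see ZHfstOspeller for the high-level interface):
+
+       FILE* err = fopen("errmodel.hfst", "rb");
+       FILE* lex = fopen("lexicon.hfst", "rb");
+       hfst_ol::Transducer mutator(err);
+       hfst_ol::Transducer lexicon(lex);
+       hfst_ol::Speller speller(&mutator, &lexicon);
+       char word[] = "exampel";
+       if (!speller.check(word)) {
+           hfst_ol::CorrectionQueue q = speller.correct(word, 5);
+           while (q.size() > 0) {
+               // q.top().first is a suggestion, q.top().second its weight
+               q.pop();
+           }
+       }
+*/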
+
+struct CacheContainer
+{
+ // All the nodes that ultimately result from searching at input depth 1
+ TreeNodeVector nodes;
+    // The cached results cover inputs of length at most one
+ StringWeightVector results_len_0;
+ StringWeightVector results_len_1;
+ bool empty;
+
+ CacheContainer(void): empty(true) {}
+
+ void clear(void)
+ {
+ nodes.clear();
+ results_len_0.clear();
+ results_len_1.clear();
+ }
+
+};
+
+std::string stringify(KeyTable * key_table,
+ SymbolVector & symbol_vector);
+
+} // namespace hfst_ol
+
+// Some platforms lack strndup
+char* hfst_strndup(const char* s, size_t n);
+
+#endif // HFST_OSPELL_OSPELL_H_
diff --git a/test.strings b/test.strings
new file mode 100644
index 0000000..7efd06e
--- /dev/null
+++ b/test.strings
@@ -0,0 +1,5 @@
+olut
+vesi
+sivolutesi
+olu
+ßþ”×\
diff --git a/test/editdist.py b/test/editdist.py
new file mode 100755
index 0000000..ef46fa3
--- /dev/null
+++ b/test/editdist.py
@@ -0,0 +1,457 @@
+#!/usr/bin/python
+# see editdist.py --help for usage
+
+import sys
+import struct
+import codecs
+from optparse import OptionParser
+
+debug = False
+
+usage_string = "usage: %prog [options] alphabet"
+
+info_string = """
+Produce an edit distance transducer.
+
+Output is either an arc-by-arc generated ATT listing (deprecated) or a large
+regular expression.
+
+There are three ways to give the alphabet and weights:
+
+* giving the alphabet as a command line argument
+ (weights are implicitly 1.0 per error)
+* giving a file with specialized configuration syntax
+* giving a transducer in optimized-lookup format to induce an alphabet
+ (in this case only symbols with length 1 are considered,
+ and weights are implicitly 1.0 per error)
+
+These ways may be combined freely.
+
+The specification file should be in the following format:
+* First, an (optional) list of tokens separated by newlines
+ All transitions involving these tokens that are otherwise unspecified
+ are generated with weight 1.0. Symbol weights can be specified by appending
+ a tab and a weight to the symbol. Transitions involving such a symbol
+ will have the user-specified weight added to it.
+* If you want to exclude symbols that may be induced from a transducer,
+ add a leading ~ character to that line.
+* If you want to specify transitions, insert a line with the content "@@"
+ (without the quotes)
+* In the following lines, specify transitions with the form
+ FROM <TAB> TO <TAB> WEIGHT
+ where FROM is the source token, TO is the destination token and WEIGHT is
+ a nonnegative floating point number specifying the weight. By default,
+ if only one transition involving FROM and TO is specified, the same WEIGHT
+ will be used to specify the transition TO -> FROM (assuming that both are
+ listed in the list of tokens).
+* If the command line option to generate swaps is set, you can also specify swap
+ weights with
+ FROM,TO <TAB> TO,FROM <TAB> WEIGHT
+ Again, unspecified swaps will be generated automatically with weight 1.0.
+* Lines starting with ## are comments.
+
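+For example, a small specification file might look like this (<TAB> marks a
+real tab character; the weights are invented for illustration):
+
+  a
+  b <TAB> 0.5
+  ~c
+  @@
+  a <TAB> b <TAB> 0.3
+  a,b <TAB> b,a <TAB> 0.8
+
+This lists tokens a and b (b with an extra symbol weight of 0.5), excludes c
+even if it would be induced from a transducer, gives the substitution a -> b
+the weight 0.3, and (when swap generation is enabled) gives swapping adjacent
+a and b the weight 0.8.
+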
+With d for distance and S for size of alphabet plus one
+(for epsilon), the expected output is a transducer in ATT format with
+* Swapless:
+** d + 1 states
+** d*(S^2 + S - 1) transitions
+* Swapful:
+** d*(S^2 - 3S + 3) + 1 states
+** d*(3S^2 - 5S + 3) transitions
+"""
+
+OTHER = u'@_UNKNOWN_SYMBOL_@'
+
+class MyOptionParser(OptionParser):
+ # This is needed to override the formatting of the help string
+ def format_epilog(self, formatter):
+ return self.epilog
+
+parser = MyOptionParser(usage=usage_string, epilog=info_string)
+parser.add_option("-r", "--regex", action = "store_true", dest = "make_regex",
+ help = "write a regular expression")
+parser.add_option("-e", "--epsilon", dest = "epsilon",
+ help = "specify symbol to use as epsilon, default is @0@",
+ metavar = "EPS")
+parser.add_option("-d", "--distance", type = "int", dest = "distance",
+ help = "specify edit depth, default is 1",
+ metavar = "DIST")
+parser.add_option("-w", "--default-weight", type = "float", dest = "default_weight",
+ help = "weight per correction when nothing else is specified (the default default is 1.0)",
+ metavar = "DIST")
+parser.add_option("-s", "--swap", action = "store_true", dest="swap",
+ help = "generate swaps (as well as insertions and deletions)")
+parser.add_option("", "--no-elim", action = "store_true", dest="no_elim",
+ help = "don't do redundancy elimination")
+parser.add_option("-m", "--minimum-edit", type = "int", dest = "minimum_edit",
+ help = "minimum accepting edit (default is 1)")
+parser.add_option("", "--no-string-initial-correction", action = "store_true",
+ dest = "no_initial",
+ help = "don't make corrections at the beginning of the string")
+parser.add_option("-i", "--input", dest = "inputfile",
+ help = "optional file with special edit-distance syntax",
+ metavar = "INPUT")
+parser.add_option("-o", "--output-file", dest = "outputfile", help = "output file (default is stdout)", metavar = "OUTPUT")
+parser.add_option("-a", "--alphabet", dest = "alphabetfile",
+ help = "read the alphabet from an existing optimized-lookup format transducer",
+ metavar = "ALPHABET")
+parser.add_option("-v", "--verbose", action = "store_true", dest="verbose",
+ help = "print some diagnostics to standard error")
+parser.set_defaults(make_regex = False)
+parser.set_defaults(epsilon = '@0@')
+parser.set_defaults(distance = 1)
+parser.set_defaults(default_weight = 1.0)
+parser.set_defaults(swap = False)
+parser.set_defaults(no_elim = False)
+parser.set_defaults(no_initial = False)
+parser.set_defaults(minimum_edit = 0)
+parser.set_defaults(verbose = False)
+(options, args) = parser.parse_args()
+
+# Some utility classes
+
+class Header:
+ """Read and provide interface to header"""
+
+ def __init__(self, file):
+ bytes = file.read(5) # "HFST\0"
+ if str(struct.unpack_from("<5s", bytes, 0)) == "('HFST\\x00',)":
+ # just ignore any hfst3 header
+ remaining = struct.unpack_from("<H", file.read(3), 0)[0]
+ self.handle_hfst3_header(file, remaining)
+ bytes = file.read(56) # 2 unsigned shorts, 4 unsigned ints and 9 uint-bools
+ else:
+ bytes = bytes + file.read(56 - 5)
+ self.number_of_input_symbols = struct.unpack_from("<H", bytes, 0)[0]
+ self.number_of_symbols = struct.unpack_from("<H", bytes, 2)[0]
+ self.size_of_transition_index_table = struct.unpack_from("<I", bytes, 4)[0]
+ self.size_of_transition_target_table = struct.unpack_from("<I", bytes, 8)[0]
+ self.number_of_states = struct.unpack_from("<I", bytes, 12)[0]
+ self.number_of_transitions = struct.unpack_from("<I", bytes, 16)[0]
+ self.weighted = struct.unpack_from("<I", bytes, 20)[0] != 0
+ self.deterministic = struct.unpack_from("<I", bytes, 24)[0] != 0
+ self.input_deterministic = struct.unpack_from("<I", bytes, 28)[0] != 0
+ self.minimized = struct.unpack_from("<I", bytes, 32)[0] != 0
+ self.cyclic = struct.unpack_from("<I", bytes, 36)[0] != 0
+ self.has_epsilon_epsilon_transitions = struct.unpack_from("<I", bytes, 40)[0] != 0
+ self.has_input_epsilon_transitions = struct.unpack_from("<I", bytes, 44)[0] != 0
+ self.has_input_epsilon_cycles = struct.unpack_from("<I", bytes, 48)[0] != 0
+ self.has_unweighted_input_epsilon_cycles = struct.unpack_from("<I", bytes, 52)[0] != 0
+
+ def handle_hfst3_header(self, file, remaining):
+ chars = struct.unpack_from("<" + str(remaining) + "c",
+ file.read(remaining), 0)
+ # assume the h3-header doesn't say anything surprising for now
+
+class Alphabet:
+ """Read and provide interface to alphabet"""
+
+ def __init__(self, file, number_of_symbols):
+ stderr_u8 = codecs.getwriter('utf-8')(sys.stderr)
+ self.keyTable = [] # list of unicode objects, use foo.encode("utf-8") to print
+ for x in range(number_of_symbols):
+ symbol = ""
+ while True:
+ byte = file.read(1)
+ if byte == '\0': # a symbol has ended
+ symbol = unicode(symbol, "utf-8")
+ if len(symbol) != 1:
+ stderr_u8.write("Ignored symbol " + symbol + "\n")
+ else:
+ self.keyTable.append(symbol)
+ break
+ symbol += byte
+
+def p(string): # stupid python, or possibly stupid me
+ return string.encode('utf-8')
+
+def maketrans(from_st, to_st, from_sy, to_sy, weight):
+ return str(from_st) + "\t" + str(to_st) + "\t" + p(from_sy) + "\t" + p(to_sy) + "\t" + str(weight)
+
+class Transducer:
+ def __init__(self, alphabet, _other = OTHER, _epsilon = options.epsilon, _distance = options.distance):
+ self.alphabet = alphabet
+ self.substitutions = {}
+ self.swaps = {}
+ self.other = _other
+ self.epsilon = _epsilon
+ self.distance = _distance
+ self.transitions = []
+ # the first self.distance states are always used, for others we
+ # grab state numbers from this counter
+ self.state_clock = self.distance + 1
+ self.debug_messages = []
+
+ def process_pair_info(self, specification):
+ for pair, weight in specification["edits"].iteritems():
+ self.substitutions[pair] = weight
+ for pairpair, weight in specification["swaps"].iteritems():
+ self.swaps[pairpair] = weight
+
+ def generate(self):
+ # for substitutions and swaps that weren't defined by the user,
+ # generate standard subs and swaps
+ if (self.other, self.epsilon) not in self.substitutions:
+ self.substitutions[(self.other, self.epsilon)] = options.default_weight
+ for symbol in self.alphabet.keys():
+ if (self.other, symbol) not in self.substitutions:
+ self.substitutions[(self.other, symbol)] = options.default_weight + alphabet[symbol]
+ if (self.epsilon, symbol) not in self.substitutions:
+ self.substitutions[(self.epsilon, symbol)] = options.default_weight + alphabet[symbol]
+ if (symbol, self.epsilon) not in self.substitutions:
+ self.substitutions[(symbol, self.epsilon)] = options.default_weight + alphabet[symbol]
+ for symbol2 in self.alphabet.keys():
+ if symbol == symbol2: continue
+ if ((symbol, symbol2), (symbol2, symbol)) not in self.swaps:
+ if ((symbol2, symbol), (symbol, symbol2)) in self.swaps:
+ self.swaps[((symbol, symbol2), (symbol2, symbol))] = self.swaps[((symbol2, symbol), (symbol, symbol2))]
+ else:
+ self.swaps[((symbol, symbol2), (symbol2, symbol))] = options.default_weight + alphabet[symbol] + alphabet[symbol2]
+ if (symbol, symbol2) not in self.substitutions:
+ if (symbol2, symbol) in self.substitutions:
+ self.substitutions[(symbol, symbol2)] = self.substitutions[(symbol2, symbol)]
+ else:
+ self.substitutions[(symbol, symbol2)] = options.default_weight + alphabet[symbol] + alphabet[symbol2]
+
+ def make_identities(self, state, nextstate = None):
+ if nextstate is None:
+ nextstate = state
+ ret = []
+ for symbol in self.alphabet.keys():
+ if symbol not in (self.epsilon, self.other):
+ ret.append(maketrans(state, nextstate, symbol, symbol, 0.0))
+ return ret
+
+ def make_swaps(self, state, nextstate = None):
+ if nextstate is None:
+ nextstate = state + 1
+ ret = []
+ if options.swap:
+ for swap in self.swaps:
+ swapstate = self.state_clock
+ self.state_clock += 1
+ self.debug_messages.append(str(swapstate) + " is a swap state for " + swap[0][0] + " and " + swap[0][1])
+ ret.append(maketrans(state, swapstate, swap[0][0], swap[0][1], self.swaps[swap]))
+ ret.append(maketrans(swapstate, nextstate, swap[1][0], swap[1][1], 0.0))
+ return ret
+
+ # for substitutions, we try to eliminate redundancies by refusing to do
+ # deletion right after insertion and insertion right after deletion
+ def make_substitutions(self, state, nextstate = None):
+ if nextstate is None:
+ nextstate = state + 1
+ ret = []
+ eliminate = False
+ # unless we're about to hit the maximum edit or we're not eliminating
+ # redundancies, make skip states for delete and insert
+ if (nextstate < options.distance) and not options.no_elim:
+ eliminate = True
+ delete_skip = self.state_clock
+ self.state_clock += 1
+ insert_skip = self.state_clock
+ self.state_clock += 1
+ ret += self.make_identities(delete_skip, nextstate)
+ ret += self.make_swaps(delete_skip, nextstate + 1)
+ ret += self.make_identities(insert_skip, nextstate)
+ ret += self.make_swaps(insert_skip, nextstate + 1)
+
+ for sub in self.substitutions:
+ if not eliminate:
+ ret.append(maketrans(state, nextstate, sub[0], sub[1], self.substitutions[sub]))
+            elif sub[1] == self.epsilon: # (eliminating) deletion
+ ret.append(maketrans(state, delete_skip, sub[0], sub[1], self.substitutions[sub]))
+ for sub2 in self.substitutions:
+ # after deletion, refuse to do insertion
+ if sub2[0] != self.epsilon:
+ ret.append(maketrans(delete_skip, nextstate + 1, sub2[0], sub2[1], self.substitutions[sub2]))
+            elif sub[0] == self.epsilon: # (eliminating) insertion
+ ret.append(maketrans(state, insert_skip, sub[0], sub[1], self.substitutions[sub]))
+ for sub2 in self.substitutions:
+ # after insertion, refuse to do deletion
+ if sub2[1] != self.epsilon:
+ ret.append(maketrans(insert_skip, nextstate + 1, sub2[0], sub2[1], self.substitutions[sub2]))
+ else:
+ ret.append(maketrans(state, nextstate, sub[0], sub[1], self.substitutions[sub]))
+ return ret
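+
+    # E.g. with redundancy elimination on (the default) and distance >= 2,
+    # the path "delete a, then insert b" is never generated, since the plain
+    # substitution a -> b covers it in one edit; the skip states above
+    # enforce this by forbidding an insertion arc directly after a deletion
+    # arc and vice versa.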
+
+ def make_transitions(self):
+ # If we're not editing in the initial state, there's an extra state
+ # where we just want identities
+ for state in range(options.distance + options.no_initial):
+ if options.minimum_edit != 0:
+ options.minimum_edit -= 1
+ else:
+ self.transitions.append(str(state) + "\t0.0") # final states
+ if state == 0 and options.no_initial:
+ self.transitions += self.make_identities(state, state + 1)
+ continue # Don't do initial corrections
+ else:
+ self.transitions += self.make_identities(state)
+ self.transitions += self.make_substitutions(state)
+ self.transitions += self.make_swaps(state)
+ self.transitions += self.make_identities(options.distance + options.no_initial)
+ self.transitions.append(str(options.distance + options.no_initial) + "\t0.0")
+
+alphabet = {}
+exclusions = set()
+pair_info = {"edits": {}, "swaps": {}}
+
+if options.inputfile == None and options.alphabetfile == None \
+ and len(args) == 0:
+ print "Specify at least one of INPUT, ALPHABET or alphabet string"
+ sys.exit()
+if len(args) > 1:
+ print "Too many options!"
+ sys.exit()
+
+if options.outputfile == None:
+ outputfile = codecs.getwriter('utf-8')(sys.stdout)
+else:
+ outputfile = codecs.getwriter('utf-8')(open(options.outputfile, 'w'))
+
+if options.inputfile != None:
+ try:
+ inputfile = open(options.inputfile)
+ except IOError:
+ print "Couldn't open " + options.inputfile
+ sys.exit()
+ while True:
+ # first the single-symbol info
+ line = unicode(inputfile.readline(), 'utf-8')
+ if line in ("@@\n", ""):
+ break
+ if line.strip() != "":
+ if line.startswith(u'##'):
+ continue
+ if len(line) > 1 and line.startswith(u'~'):
+ exclusions.add(line[1:].strip())
+ continue
+ if '\t' in line:
+ weight = float(line.split('\t')[1])
+ symbol = line.split('\t')[0]
+ else:
+ weight = 0.0
+ symbol = line.strip("\n")
+ alphabet[symbol] = weight
+ while True:
+ # then pairs
+ line = unicode(inputfile.readline(), 'utf-8')
+ if line.startswith('##'):
+ continue
+ if line == "\n":
+ continue
+ if line == "":
+ break
+ parts = line.split('\t')
+ if len(parts) != 3:
+ raise ValueError("Got specification with " + str(len(parts)) +\
+ " parts, expected 3:\n" + specification)
+ weight = float(parts[2])
+ if ',' in parts[0]:
+ frompair = tuple(parts[0].split(','))
+ topair = tuple(parts[1].split(','))
+ if not (len(frompair) == len(topair) == 2):
+ raise ValueError("Got swap-specification with incorrect number "
+ "of comma separators:\n" + specification)
+ if (frompair, topair) not in pair_info["swaps"]:
+ pair_info["swaps"][(frompair, topair)] = weight
+ for sym in [frompair[0], frompair[1], topair[0], topair[1]]:
+ if sym != '' and sym not in alphabet:
+ alphabet[sym] = weight
+ else:
+ if not (parts[0], parts[1]) in pair_info["edits"]:
+ pair_info["edits"][(parts[0], parts[1])] = weight
+ for sym in [parts[0], parts[1]]:
+ if sym != '' and sym not in alphabet:
+ alphabet[sym] = weight
+
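+# For reference, an input file accepted by the loops above looks roughly like
+# this (an illustrative sketch reconstructed from the parsing code, not a
+# shipped example; <TAB> marks a literal tab, annotations are not file content):
+#
+#   ## comment lines and blank lines are skipped
+#   ~q                       exclude q from the alphabet
+#   a<TAB>0.5                symbol a with weight 0.5
+#   b                        symbol b, default weight 0.0
+#   @@                       end of the single-symbol section
+#   a<TAB>b<TAB>1.0          edit a -> b with weight 1.0
+#   a,b<TAB>b,a<TAB>2.0      swap ab -> ba with weight 2.0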
+if len(args) == 1:
+ for c in unicode(args[0], 'utf-8'):
+        if c not in alphabet and c not in exclusions:
+ alphabet[c] = 0.0
+if options.alphabetfile is not None:
+ afile = open(options.alphabetfile, "rb")
+ ol_header = Header(afile)
+ ol_alphabet = Alphabet(afile, ol_header.number_of_symbols)
+ for c in filter(lambda x: x.strip() != '', ol_alphabet.keyTable[:]):
+        if c not in alphabet and c not in exclusions:
+ alphabet[c] = 0.0
+epsilon = unicode(options.epsilon, 'utf-8')
+
+def replace_rules(alphabet, pair_info, weight=options.default_weight):
+ corr = ' "<CORR>" '
+ unk = OTHER
+ corrections = "["
+ # first, the empty string may become the empty string anywhere
+ corrections += '"" -> \t[ "" |\n'
+ for a in alphabet:
+ this_weight = weight
+ # insertions
+ if ('', a) in pair_info["edits"]:
+ this_weight = pair_info["edits"][('', a)] + alphabet[a]
+ corrections += '\t[ "' + a + '" ' + corr + ' ]::' + str(this_weight) + ' |\n'
+    # trim the trailing " |\n" left by the last iteration
+ corrections = corrections[:-3]
+ corrections += ' ] ,,\n'
+ for a in alphabet:
+ this_weight = weight
+ # the left-hand side of the rule
+ corrections += '"' + a + '" ->\t[ '
+ # identity
+ corrections += '"' + a + '" |\n'
+ # deletion
+ if (a, '') in pair_info["edits"]:
+ this_weight = pair_info["edits"][(a, '')]
+ corrections += '\t[ ""' + corr + ']::' + str(this_weight) + ' |\n'
+ for b in alphabet:
+ this_weight = weight + alphabet[b]
+            # substitutions
+ if a == b:
+ # we don't handle identities here
+ continue
+ if (a, b) in pair_info["edits"]:
+ this_weight = pair_info["edits"][(a, b)] + alphabet[b]
+ corrections += '\t[ "' + b + '"' + corr + ']::' + str(this_weight) + ' |\n'
+ corrections = corrections[:-3]
+ corrections += ' ] ,,\n'
+ # now the unknown symbol
+ corrections += '"' + unk + '" -> [\n\t[""' + corr + ']::' + str(weight) + ' |\n'
+ for a in alphabet:
+ corrections += '\t[ "' + a + '"' + corr + ']::' + str(weight) + ' |\n'
+ # trim the end again
+ corrections = corrections[:-3]
+ corrections += ' ]]'
+ return corrections
+
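+# Shape of what replace_rules emits, for a one-letter alphabet {a} (an
+# illustrative sketch of the generated xfst-style rule set, not captured
+# output; W stands for the weight chosen per pair and "<OTHER>" for the
+# script's unknown-symbol marker defined further up):
+#
+#   [ "" ->  [ "" | [ "a" "<CORR>" ]::W ] ,,
+#     "a" -> [ "a" | [ "" "<CORR>" ]::W ] ,,
+#     "<OTHER>" -> [ [ "" "<CORR>" ]::W | [ "a" "<CORR>" ]::W ]]
+#
+# Every applied correction writes a "<CORR>" marker; corr_counter below
+# admits at most options.distance markers and rewrites them to epsilon,
+# while corr_eater keeps the markers out of the input side.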
+
+if options.make_regex:
+ corrections = replace_rules(alphabet, pair_info)
+ corr_counter = '[[ [? - "<CORR>"]* ( "<CORR>":0 ) [? - "<CORR>"]* ]^' + str(options.distance) + ']'
+ corr_eater = '[[? - "<CORR>"]*]'
+ full_regex = corr_eater + '\n.o.\n' + corrections.encode('utf-8') + '\n.o.\n' + corr_counter + ";\n"
+ outputfile.write(full_regex.decode('utf-8'))
+else:
+ transducer = Transducer(alphabet)
+ transducer.process_pair_info(pair_info)
+ transducer.generate()
+ transducer.make_transitions()
+ for transition in transducer.transitions:
+ outputfile.write(transition.decode('utf-8'))
+ outputfile.write('\n')
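+    # The lines written above are tab-separated AT&T-style transducer text
+    # (arcs as "source target input output weight", final states as
+    # "state weight"), the format hfst-txt2fst compiles.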
+
+ stderr_u8 = codecs.getwriter('utf-8')(sys.stderr)
+
+ if options.verbose:
+        stderr_u8.write("\n" + str(transducer.state_clock) + " states and " +
+                        str(len(transducer.transitions)) + " transitions written for " +
+                        "distance " + str(options.distance) + " and base alphabet size " +
+                        str(len(transducer.alphabet)) + "\n\n")
+ stderr_u8.write("The alphabet was:\n")
+ for symbol, weight in alphabet.iteritems():
+ stderr_u8.write(symbol + "\t" + str(weight) + "\n")
+ if len(exclusions) != 0:
+ stderr_u8.write("The exclusions were:\n")
+ for symbol in exclusions:
+ stderr_u8.write(symbol + "\n")
+        stderr_u8.write("\n")
+ if debug:
+ for message in transducer.debug_messages:
+ print message
diff --git a/trailing-spaces.sh b/trailing-spaces.sh
new file mode 100755
index 0000000..e0f7b99
--- /dev/null
+++ b/trailing-spaces.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+if test -x ./hfst-ospell ; then
+ if ! cat $srcdir/test.strings | ./hfst-ospell -v trailing_spaces.zhfst ; then
+ exit 1
+ fi
+else
+ echo ./hfst-ospell not built
+ exit 77
+fi
+
diff --git a/trailing_spaces.xml b/trailing_spaces.xml
new file mode 100644
index 0000000..2840cfb
--- /dev/null
+++ b/trailing_spaces.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<hfstspeller dtdversion="1.0" hfstversion="3">
+ <info>
+ <locale>qtz</locale>
+ <title>Example speller</title>
+ <description>
+ This example is for the automatic test suite of hfst-ospell.
+ </description>
+ <version vcsrev="33459">1.5.73</version>
+ <date>2012-08-15</date>
+ <producer>Flammie</producer>
+ <contact
+email="flammie at iki.fi"
+website="http://flammie.dyndns.org/"/>
+ </info>
+ <acceptor
+ type="general" id="acceptor.default.hfst">
+ <title>Example dictionary</title>
+ <title xml:lang="se">Vuola lávlla</title>
+ <description>Example dictionary recognises a word.</description>
+ <description xml:lang="se">
+ Vuola, vuola mun aigon lási
+ vuolas juhkaluvvat,
+ vuola, vuola mun aigon lási
+ vuolas mieladuvvat
+ </description>
+ </acceptor>
+ <errmodel id="errormodel.default.hfst">
+ <title>Sahtiwaari</title>
+ <description>
+ Example error model turns one word into another.
+ </description>
+ <type type="default"/>
+ <model>errormodel.default.hfst</model>
+ </errmodel>
+</hfstspeller>
diff --git a/windows-Makefile.am b/windows-Makefile.am
new file mode 100644
index 0000000..6f3af6c
--- /dev/null
+++ b/windows-Makefile.am
@@ -0,0 +1,278 @@
+## Process this file with automake to produce Makefile.in
+
+# Copyright 2010 University of Helsinki
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# to silence:
+# libtoolize: Consider adding `-I m4' to ACLOCAL_AMFLAGS in Makefile.am.
+ACLOCAL_AMFLAGS=-I m4
+
+# targets
+if EXTRA_DEMOS
+CONFERENCE_DEMOS=hfst-ospell-norvig hfst-ospell-fsmnlp-2012 hfst-ospell-cicling\
+ hfst-ospell-survey hfst-ospell-lrec2013 hfst-ispell
+endif # EXTRA_DEMOS
+
+bin_PROGRAMS=hfst-ospell $(CONFERENCE_DEMOS)
+#lib_LTLIBRARIES=libhfstospell.la
+man1_MANS=hfst-ospell.1
+
+PKG_LIBS=
+PKG_CXXFLAGS=
+
+if WANT_ARCHIVE
+PKG_LIBS+=$(LIBARCHIVE_LIBS)
+PKG_CXXFLAGS+=$(LIBARCHIVE_CFLAGS)
+endif
+
+#if WANT_LIBXMLPP
+#PKG_LIBS+=$(LIBXMLPP_LIBS)
+#PKG_CXXFLAGS+=$(LIBXMLPP_CFLAGS)
+#endif
+
+if WANT_TINYXML2
+PKG_LIBS+=$(TINYXML2_LIBS)
+PKG_CXXFLAGS+=$(TINYXML2_CFLAGS)
+endif
+
+# library parts
+#libhfstospell_la_SOURCES=
+
+#libhfstospell_la_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) $(PKG_CXXFLAGS)
+#libhfstospell_la_LDFLAGS=-no-undefined -version-info 4:0:0 \
+#	$(PKG_LIBS)
+
+# link sample program against library here
+hfst_ospell_SOURCES=main.cc hfst-ol.cc ospell.cc \
+ ZHfstOspeller.cc ZHfstOspellerXmlMetadata.cc \
+ tinyxml2.cc \
+ libarchive/archive_acl.c \
+ libarchive/archive_acl_private.h \
+ libarchive/archive_check_magic.c \
+ libarchive/archive_crc32.h \
+ libarchive/archive_crypto.c \
+ libarchive/archive_crypto_private.h \
+ libarchive/archive_endian.h \
+ libarchive/archive_entry.c \
+ libarchive/archive_entry.h \
+ libarchive/archive_entry_copy_stat.c \
+ libarchive/archive_entry_link_resolver.c \
+ libarchive/archive_entry_locale.h \
+ libarchive/archive_entry_private.h \
+ libarchive/archive_entry_sparse.c \
+ libarchive/archive_entry_stat.c \
+ libarchive/archive_entry_strmode.c \
+ libarchive/archive_entry_xattr.c \
+ libarchive/archive_options.c \
+ libarchive/archive_options_private.h \
+ libarchive/archive_platform.h \
+ libarchive/archive_ppmd_private.h \
+ libarchive/archive_ppmd7.c \
+ libarchive/archive_ppmd7_private.h \
+ libarchive/archive_private.h \
+ libarchive/archive_rb.c \
+ libarchive/archive_rb.h \
+ libarchive/archive_read.c \
+ libarchive/archive_read_data_into_fd.c \
+ libarchive/archive_read_disk_entry_from_file.c \
+ libarchive/archive_read_disk_posix.c \
+ libarchive/archive_read_disk_private.h \
+ libarchive/archive_read_disk_set_standard_lookup.c \
+ libarchive/archive_read_extract.c \
+ libarchive/archive_read_open_fd.c \
+ libarchive/archive_read_open_file.c \
+ libarchive/archive_read_open_filename.c \
+ libarchive/archive_read_open_memory.c \
+ libarchive/archive_read_private.h \
+ libarchive/archive_read_set_options.c \
+ libarchive/archive_read_support_filter_all.c \
+ libarchive/archive_read_support_filter_compress.c \
+ libarchive/archive_read_support_filter_gzip.c \
+ libarchive/archive_read_support_filter_none.c \
+ libarchive/archive_read_support_filter_program.c \
+ libarchive/archive_read_support_filter_rpm.c \
+ libarchive/archive_read_support_filter_uu.c \
+ libarchive/archive_read_support_filter_xz.c \
+ libarchive/archive_read_support_format_7zip.c \
+ libarchive/archive_read_support_format_all.c \
+ libarchive/archive_read_support_format_ar.c \
+ libarchive/archive_read_support_format_by_code.c \
+ libarchive/archive_read_support_format_cab.c \
+ libarchive/archive_read_support_format_cpio.c \
+ libarchive/archive_read_support_format_empty.c \
+ libarchive/archive_read_support_format_iso9660.c \
+ libarchive/archive_read_support_format_lha.c \
+ libarchive/archive_read_support_format_mtree.c \
+ libarchive/archive_read_support_format_rar.c \
+ libarchive/archive_read_support_format_raw.c \
+ libarchive/archive_read_support_format_tar.c \
+ libarchive/archive_read_support_format_xar.c \
+ libarchive/archive_read_support_format_zip.c \
+ libarchive/archive_string.c \
+ libarchive/archive_string.h \
+ libarchive/archive_string_composition.h \
+ libarchive/archive_string_sprintf.c \
+ libarchive/archive_util.c \
+ libarchive/archive_virtual.c \
+ libarchive/archive_write.c \
+ libarchive/archive_write_disk_posix.c \
+ libarchive/archive_write_disk_private.h \
+ libarchive/archive_write_disk_set_standard_lookup.c \
+ libarchive/archive_write_open_fd.c \
+ libarchive/archive_write_open_file.c \
+ libarchive/archive_write_open_filename.c \
+ libarchive/archive_write_open_memory.c \
+ libarchive/archive_write_private.h \
+ libarchive/archive_write_add_filter_compress.c \
+ libarchive/archive_write_add_filter_gzip.c \
+ libarchive/archive_write_add_filter_none.c \
+ libarchive/archive_write_add_filter_program.c \
+ libarchive/archive_write_add_filter_xz.c \
+ libarchive/archive_write_set_format.c \
+ libarchive/archive_write_set_format_7zip.c \
+ libarchive/archive_write_set_format_ar.c \
+ libarchive/archive_write_set_format_by_name.c \
+ libarchive/archive_write_set_format_cpio.c \
+ libarchive/archive_write_set_format_cpio_newc.c \
+ libarchive/archive_write_set_format_iso9660.c \
+ libarchive/archive_write_set_format_mtree.c \
+ libarchive/archive_write_set_format_pax.c \
+ libarchive/archive_write_set_format_shar.c \
+ libarchive/archive_write_set_format_ustar.c \
+ libarchive/archive_write_set_format_gnutar.c \
+ libarchive/archive_write_set_format_xar.c \
+ libarchive/archive_write_set_format_zip.c \
+ libarchive/archive_write_set_options.c \
+ libarchive/config_freebsd.h \
+ libarchive/archive_read_support_filter_bzip2.c \
+ libarchive/archive_write_add_filter_bzip2.c \
+ libarchive/filter_fork.c \
+ libarchive/filter_fork.h \
+ libarchive/archive_entry_copy_bhfi.c \
+ libarchive/archive_read_disk_windows.c \
+ libarchive/archive_windows.h \
+ libarchive/archive_windows.c \
+ libarchive/archive_write_disk_windows.c \
+ libarchive/filter_fork_windows.c
+
+#hfst_ospell_LDADD=libhfstospell.la
+hfst_ospell_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
+ $(PKG_CXXFLAGS)
+
+if EXTRA_DEMOS
+hfst_ospell_norvig_SOURCES=main-norvig.cc
+#hfst_ospell_norvig_LDADD=libhfstospell.la
+hfst_ospell_norvig_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
+ $(PKG_CXXFLAGS)
+
+hfst_ospell_cicling_SOURCES=main-cicling.cc
+#hfst_ospell_cicling_LDADD=libhfstospell.la
+hfst_ospell_cicling_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
+ $(PKG_CXXFLAGS)
+
+hfst_ospell_lrec2013_SOURCES=main-lrec2013.cc
+#hfst_ospell_lrec2013_LDADD=libhfstospell.la
+hfst_ospell_lrec2013_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
+ $(PKG_CXXFLAGS)
+
+hfst_ospell_survey_SOURCES=main-survey.cc
+#hfst_ospell_survey_LDADD=libhfstospell.la
+hfst_ospell_survey_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
+ $(PKG_CXXFLAGS)
+
+hfst_ospell_fsmnlp_2012_SOURCES=main-fsmnlp-2012.cc
+#hfst_ospell_fsmnlp_2012_LDADD=libhfstospell.la
+hfst_ospell_fsmnlp_2012_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
+ $(PKG_CXXFLAGS)
+endif # EXTRA_DEMOS
+
+hfst_ispell_SOURCES=main-ispell.cc
+#hfst_ispell_LDADD=libhfstospell.la
+hfst_ispell_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \
+ $(PKG_CXXFLAGS)
+
+# install headers for library in hfst's includedir
+include_HEADERS=hfst-ol.h ospell.h ol-exceptions.h \
+ ZHfstOspeller.h ZHfstOspellerXmlMetadata.h tinyxml2.h
+
+# pkgconfig
+pkgconfigdir=$(libdir)/pkgconfig
+pkgconfig_DATA=hfstospell.pc
+
+# tests
+if CAN_TEST
+TXTS=acceptor.default.txt errmodel.default.txt errmodel.extrachars.txt
+check_DATA=speller_basic.zhfst empty_descriptions.zhfst \
+ empty_titles.zhfst empty_locale.zhfst \
+ trailing_spaces.zhfst \
+ spl.hfstol sug.hfstol err.hfstol \
+ acceptor.default.hfst errmodel.default.hfst \
+ errmodel.extrachars.hfst bad_errormodel.zhfst
+# Actual test scripts:
+TESTS=basic-zhfst.sh basic-legacy.sh basic-zhfst-fallback.sh \
+ empty-descriptions.sh empty-titles.sh empty-locale.sh \
+ trailing-spaces.sh bad-errormodel.sh empty-zhfst.sh
+XFAIL_TESTS=empty-descriptions.sh empty-titles.sh empty-locale.sh empty-zhfst.sh
+EXTRA_DIST=$(TXTS) $(TESTS) \
+ basic_test.xml empty_descriptions.xml empty_titles.xml \
+ empty_locale.xml trailing_spaces.xml \
+ test.strings
+
+clean-local:
+ -rm -rf $(check_DATA) index.xml
+endif # CAN_TEST
+
+# N.B. Do not parallel test, race condition exists
+empty_descriptions.zhfst: acceptor.default.hfst errmodel.default.hfst empty_descriptions.xml
+ cp -f $(srcdir)/empty_descriptions.xml index.xml
+ $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml
+
+empty_titles.zhfst: acceptor.default.hfst errmodel.default.hfst empty_titles.xml
+ cp -f $(srcdir)/empty_titles.xml index.xml
+ $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml
+
+empty_locale.zhfst: acceptor.default.hfst errmodel.default.hfst empty_locale.xml
+ cp -f $(srcdir)/empty_locale.xml index.xml
+ $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml
+
+trailing_spaces.zhfst: acceptor.default.hfst errmodel.default.hfst trailing_spaces.xml
+ cp -f $(srcdir)/trailing_spaces.xml index.xml
+ $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml
+
+# N.B. Do not parallel test, race condition exists
+speller_basic.zhfst: acceptor.default.hfst errmodel.default.hfst basic_test.xml
+ cp $(srcdir)/basic_test.xml index.xml
+ $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml
+
+bad_errormodel.zhfst: acceptor.default.hfst errmodel.extrachars.hfst index.xml
+ cp -f $(srcdir)/errmodel.extrachars.hfst errmodel.default.hfst
+ $(ZIP) $(ZIPFLAGS) $@ acceptor.default.hfst errmodel.default.hfst index.xml
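+# A .zhfst speller is simply a zip archive containing the acceptor, the
+# error model and the metadata under the fixed name index.xml; every rule
+# above therefore copies its XML file to index.xml before zipping, which is
+# also why these rules must not run in parallel.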
+
+sug.hfstol: acceptor.default.hfst
+ -ln -sf $< $@
+
+spl.hfstol: acceptor.default.hfst
+ -ln -sf $< $@
+
+err.hfstol: errmodel.default.hfst
+ -ln -sf $< $@
+
+.txt.hfst:
+ hfst-txt2fst $(HFST_FLAGS) $< | hfst-fst2fst $(HFST_FLAGS) -f olw -o $@
+
+.txt.hfstol:
+ hfst-txt2fst $(HFST_FLAGS) $< | hfst-fst2fst $(HFST_FLAGS) -f olw -o $@
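+# Both suffix rules compile tab-separated AT&T transducer text into weighted
+# optimized-lookup format (-f olw), the runtime format hfst-ospell loads.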
+
+hfst-ospell.1: hfst-ospell
+ help2man --no-discard-stderr $< > $@
diff --git a/windows-configure.ac b/windows-configure.ac
new file mode 100644
index 0000000..b0ab7d6
--- /dev/null
+++ b/windows-configure.ac
@@ -0,0 +1,143 @@
+## Process this file with autoconf to produce a configure script
+
+## Copyright (C) 2010 University of Helsinki
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# autoconf requirements
+AC_PREREQ([2.62])
+LT_PREREQ([2.2.6])
+
+# init
+AC_INIT([hfstospell], [0.3.0], [hfst-bugs at helsinki.fi], [hfstospell], [http://hfst.sf.net])
+AC_CONFIG_AUX_DIR([build-aux])
+AM_INIT_AUTOMAKE([1.11 -Wall -Werror foreign check-news color-tests silent-rules])
+AM_SILENT_RULES([yes])
+AC_REVISION([$Revision: 3876 $])
+AC_CONFIG_MACRO_DIR([m4])
+AC_CONFIG_SRCDIR([ospell.cc])
+AC_CONFIG_HEADERS([config.h])
+
+# Information on package
+HFSTOSPELL_NAME=hfstospell
+HFSTOSPELL_MAJOR=0
+HFSTOSPELL_MINOR=3
+HFSTOSPELL_EXTENSION=.0
+HFSTOSPELL_VERSION=$HFSTOSPELL_MAJOR.$HFSTOSPELL_MINOR$HFSTOSPELL_EXTENSION
+AC_SUBST(HFSTOSPELL_MAJOR)
+AC_SUBST(HFSTOSPELL_MINOR)
+AC_SUBST(HFSTOSPELL_VERSION)
+AC_SUBST(HFSTOSPELL_NAME)
+
+# Check for pkg-config first - the configuration won't work if it isn't available:
+#AC_PATH_PROG([PKGCONFIG], [pkg-config], [no])
+#AS_IF([test "x$PKGCONFIG" = xno], [AC_MSG_ERROR([pkg-config is required - please install])])
+#AC_PATH_PROG([DOXYGEN], [doxygen], [false])
+#AM_CONDITIONAL([CAN_DOXYGEN], [test "x$DOXYGEN" != xfalse])
+
+
+# Settings
+AC_ARG_ENABLE([extra_demos],
+ [AS_HELP_STRING([--enable-extra-demos],
+ [build conference demos for science reproduction @<:@default=no@:>@])],
+ [enable_extra_demos=$enableval], [enable_extra_demos=no])
+AM_CONDITIONAL([EXTRA_DEMOS], [test x$enable_extra_demos != xno])
+AC_ARG_ENABLE([zhfst],
+ [AS_HELP_STRING([--enable-zhfst],
+ [support zipped complex automaton sets @<:@default=check@:>@])],
+ [enable_zhfst=$enableval], [enable_zhfst=check])
+AC_ARG_ENABLE([xml],
+ [AS_HELP_STRING([--enable-xml=LIBXML],
+ [support xml metadata for zipped automaton sets with library LIBXML @<:@default=libxmlpp@:>@])],
+ [enable_xml=$enableval], [enable_xml=libxmlpp])
+AC_ARG_WITH([extract],
+ [AS_HELP_STRING([--with-extract=TARGET],
+ [extract zhfst archives to tmpdir or mem @<:@default=mem@:>@])],
+ [with_extract=$withval], [with_extract=mem])
+AS_IF([test "x$with_extract" = xmem], [AC_DEFINE([ZHFST_EXTRACT_TO_MEM], [1],
+ [Define to extract zhfst archives to char buffer])],
+ [AS_IF([test "x$with_extract" = xtmpdir],
+ [AC_DEFINE([ZHFST_EXTRACT_TO_TMPDIR], [1],
+ [Define to extract zhfst to tmp dir])],
+    [AC_MSG_ERROR([--with-extract takes either mem or tmpdir])])])
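+# Usage sketch (illustrative, not from upstream documentation):
+#   ./configure --with-extract=tmpdir --enable-xml=tinyxml2
+# extracts zhfst archives to a temporary directory and parses the XML
+# metadata with tinyxml2 rather than the libxml++ default.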
+
+# Checks for programs
+m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
+AC_PROG_CC
+AC_PROG_CXX
+AC_LIBTOOL_WIN32_DLL
+LT_INIT
+AC_PROG_INSTALL
+AC_PROG_LN_S
+AC_PROG_MAKE_SET
+AC_PATH_PROG([HFST_TXT2FST], [hfst-txt2fst], [false])
+AC_PATH_PROG([HFST_FST2FST], [hfst-fst2fst], [false])
+AC_PATH_PROG([ZIP], [zip], [false])
+AM_CONDITIONAL([CAN_TEST],
+ [test x$HFST_TXT2FST != xfalse -a x$HFST_FST2FST != xfalse -a x$ZIP != xfalse])
+
+# Checks for libraries
+#AS_IF([test x$enable_zhfst != xno],
+# [PKG_CHECK_MODULES([LIBARCHIVE], [libarchive > 3],
+# [AC_DEFINE([HAVE_LIBARCHIVE], [1], [Use archives])
+# enable_zhfst=yes],
+# [enable_zhfst=no])])
+
+AC_DEFINE([HAVE_LIBARCHIVE], [1], [Use archives])
+AM_CONDITIONAL([WANT_ARCHIVE], [test x$enable_zhfst != xno])
+
+#AS_IF([test x$enable_xml = xlibxmlpp],
+# [PKG_CHECK_MODULES([LIBXMLPP], [libxml++-2.6 >= 2.10.0],
+# [AC_DEFINE([HAVE_LIBXML], [1], [Use libxml++])],
+# [AC_MSG_WARN([libxml++ failed, disabling xml])
+# enable_xml=no])])
+#AM_CONDITIONAL([WANT_LIBXMLPP], [test x$enable_xml = xlibxmlpp])
+
+#AS_IF([test x$enable_xml = xtinyxml2],
+# [PKG_CHECK_MODULES([TINYXML2], [tinyxml2 >= 1.0.8],
+# [AC_DEFINE([HAVE_TINYXML2], [1], [Use tinyxml])],
+# [AC_MSG_WARN([tinyxml missing, xml disabled])
+# enable_xml=no])])
+
+AC_DEFINE([HAVE_TINYXML2], [1], [Use tinyxml])
+AM_CONDITIONAL([WANT_TINYXML2], [test x$enable_xml = xtinyxml2])
+
+# Checks for header files
+AC_CHECK_HEADERS([getopt.h error.h])
+
+# Checks for types
+AC_TYPE_SIZE_T
+
+# Checks for structures
+
+# Checks for compiler characteristics
+
+# Checks for library functions
+AC_FUNC_MALLOC
+AC_CHECK_FUNCS([strndup error])
+# Checks for system services
+
+# config files
+AC_CONFIG_FILES([Makefile hfstospell.pc])
+
+# output
+AC_OUTPUT
+
+cat <<EOF
+-- Building $PACKAGE_STRING
+ * zhfst support: $enable_zhfst
+ * extracting to: $with_extract
+ * xml support: $enable_xml
+ * conference demos: $enable_extra_demos
+EOF
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/hfst-ospell.git