[hfst] 01/02: Imported Upstream version 3.8.2~r4145
Tino Didriksen
tinodidriksen-guest at moszumanska.debian.org
Fri Dec 19 18:11:17 UTC 2014
This is an automated email from the git hooks/post-receive script.
tinodidriksen-guest pushed a commit to branch master
in repository hfst.
commit 0b8a798b927fd758a05b5509ed5df9ba19466578
Author: Tino Didriksen <mail at tinodidriksen.com>
Date: Fri Dec 19 18:09:10 2014 +0000
Imported Upstream version 3.8.2~r4145
---
ChangeLog | 424 ++++++++++++++++
ChangeLog.old | 535 +++++++++++++++++++++
NEWS | 12 +
check_installation/copy-tool-tests.sh | 2 +
configure.ac | 16 +-
libhfst/src/HfstTokenizer.cc | 130 ++++-
libhfst/src/HfstTokenizer.h | 9 +
libhfst/src/HfstTransducer.cc | 43 ++
libhfst/src/HfstTransducer.h | 4 +
libhfst/src/Makefile.am | 2 +-
libhfst/src/implementations/ConvertOlTransducer.cc | 174 +++----
libhfst/src/implementations/HfstTransitionGraph.h | 149 ++++--
.../implementations/optimized-lookup/convert.cc | 58 +--
.../src/implementations/optimized-lookup/convert.h | 246 ++++++----
.../src/implementations/optimized-lookup/pmatch.cc | 268 ++++++++---
.../src/implementations/optimized-lookup/pmatch.h | 40 +-
.../implementations/optimized-lookup/transducer.cc | 5 +
.../implementations/optimized-lookup/transducer.h | 5 +-
libhfst/src/parsers/LexcCompiler.cc | 158 ++++--
libhfst/src/parsers/LexcCompiler.h | 12 +-
libhfst/src/parsers/XreCompiler.cc | 41 +-
libhfst/src/parsers/XreCompiler.h | 30 ++
libhfst/src/parsers/lexc-lexer.ll | 3 -
libhfst/src/parsers/lexc-parser.yy | 72 ++-
libhfst/src/parsers/pmatch_lex.ll | 36 ++
libhfst/src/parsers/pmatch_parse.yy | 280 +++++++++--
libhfst/src/parsers/pmatch_utils.cc | 255 ++++++++--
libhfst/src/parsers/pmatch_utils.h | 9 +-
libhfst/src/parsers/xre_lex.ll | 2 +
libhfst/src/parsers/xre_parse.yy | 19 +-
libhfst/src/parsers/xre_utils.cc | 83 +++-
libhfst/src/parsers/xre_utils.h | 5 +
test/tools/Makefile.am | 8 +-
test/tools/lexc-compiler-functionality.sh | 4 +-
test/tools/warn.one-sided-flags.lexc | 7 +
test/tools/warn.one-sided-flags.lexc.flag.result | Bin 0 -> 461 bytes
test/tools/warn.one-sided-flags.lexc.result | Bin 0 -> 347 bytes
.../xfail.sublexicon-defined-more-than-once.lexc | 15 +
tools/src/HfstStrings2FstTokenizer.cc | 20 +-
tools/src/hfst-compose.cc | 62 ++-
tools/src/hfst-fst2strings.cc | 4 +-
tools/src/hfst-lexc-compiler.cc | 26 +-
tools/src/hfst-pmatch.cc | 17 +-
tools/src/hfst-proc2.cc | 3 +-
tools/src/hfst-summarize.cc | 61 ++-
tools/src/parsers/XfstCompiler.cc | 192 ++++++--
tools/src/parsers/XfstCompiler.h | 3 +-
tools/src/parsers/hfst-xfst.cc | 1 +
tools/src/parsers/test/Makefile.am | 8 +-
tools/src/parsers/test/compile_replace_1.output | 8 +
tools/src/parsers/test/compile_replace_1.xfst | 12 +
tools/src/parsers/test/compile_replace_2.output | 8 +
tools/src/parsers/test/compile_replace_2.xfst | 10 +
tools/src/parsers/test/compile_replace_3.output | 17 +
tools/src/parsers/test/compile_replace_3.xfst | 21 +
tools/src/parsers/test/merge.att | 39 ++
tools/src/parsers/test/merge.xfst | 17 +
tools/src/parsers/test/merge_weighted.att | 39 ++
tools/src/parsers/test/merge_weighted.xfst | 18 +
tools/src/parsers/test/one_transition_regex.att | 10 +
tools/src/parsers/test/one_transition_regex.xfst | 11 +
tools/src/parsers/test/test.sh | 30 +-
62 files changed, 3195 insertions(+), 603 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 56e3859..50487c4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,427 @@
+2014-12-17 13:28 eaxelson
+
+ * configure.ac, libhfst/src/HfstTransducer.cc,
+ libhfst/src/parsers/XreCompiler.cc,
+ libhfst/src/parsers/XreCompiler.h,
+ libhfst/src/parsers/xre_lex.ll,
+ tools/src/parsers/XfstCompiler.cc: Small fixes to
+ compile-replace. Also restarting the char counter every time a
+ new xre parsing is started. Removed the xml2 dependency in
+ configure.
+
+2014-12-12 10:49 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/pmatch.cc: Omit
+ unnecessary step in stringification
+
+2014-12-12 10:22 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/pmatch.cc,
+ libhfst/src/implementations/optimized-lookup/pmatch.h: Another
+ slight speed improvement
+
+2014-12-12 09:56 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/pmatch.cc: Some
+ double free insurance related to previous commit
+
+2014-12-12 09:53 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/pmatch.cc,
+ libhfst/src/implementations/optimized-lookup/pmatch.h: Some more
+ pmatch runtime speedup (around 5-10% in most cases)
+
+2014-12-11 21:54 hardwick
+
+ * libhfst/src/implementations/ConvertOlTransducer.cc: Remove
+ commented-out lines (same ones I was intending in the previous
+ commit)
+
+2014-12-11 21:08 hardwick
+
+ * libhfst/src/implementations/ConvertOlTransducer.cc: Remove
+ extraneous index table fitting test
+
+2014-12-11 20:33 hardwick
+
+ * libhfst/src/implementations/ConvertOlTransducer.cc,
+ libhfst/src/implementations/optimized-lookup/convert.h: Further
+ improvements to conversion to optimized lookup format
+
+ This speeds up conversion a bit more, and also improves packing
+ in one case by around 25% (implying that this commit *does*
+ change the
+ output of the conversion, but in a functionally equivalent way).
+
+ The space gain was mainly accomplished by changing what happens
+ when we fail
+ to find a suitable index table location several times in a row
+ starting in the
+ same place. We used to jump up to the previous successful
+ location and its
+ indices, now we keep trying to fill in the gaps between those
+ first.
+
+ This commit also removes the state-relabeling facility, because
+ it didn't seem
+ to be doing anything (perhaps something in HfstBasicTransducer
+ has changed;
+ it seems state numbers can no longer be non-contiguous). So we
+ now assume
+ states are contiguous and iterated in order. If this assumption
+ is broken,
+ this needs to be changed. But as I said earlier, looks like
+ relabeling wasn't
+ doing anything now anyway.
+
+2014-12-11 13:25 hardwick
+
+ * libhfst/src/implementations/ConvertOlTransducer.cc,
+ libhfst/src/implementations/optimized-lookup/convert.cc,
+ libhfst/src/implementations/optimized-lookup/convert.h: Large
+ speedup in conversion to optimized-lookup format
+
+ In some cases this speeds up conversion by up to 80%. This
+ involved
+ a) making the code a bit more convoluted by using multiple data
+ structures
+ where there used to be just one
+ b) omitting some space-oriented optimizations that appear to be
+ usually
+ very minor and occasionally even harmful
+
+ While I've tested this with the major transducers I've found,
+ it's not
+ unthinkable that refactoring such hairy code as this has
+ introduced bugs.
+ No new functionality is introduced so it's ok to roll this back
+ if it causes
+ problems.
+
+2014-12-05 16:03 eaxelson
+
+ * tools/src/parsers/test/Makefile.am,
+ tools/src/parsers/test/compile_replace_1.output,
+ tools/src/parsers/test/compile_replace_1.xfst,
+ tools/src/parsers/test/compile_replace_2.output,
+ tools/src/parsers/test/compile_replace_2.xfst,
+ tools/src/parsers/test/compile_replace_3.output,
+ tools/src/parsers/test/compile_replace_3.xfst,
+ tools/src/parsers/test/test.sh: Added test cases for
+ compile-replace.
+
+2014-12-05 14:49 eaxelson
+
+ * tools/src/parsers/test/Makefile.am,
+ tools/src/parsers/test/merge.att,
+ tools/src/parsers/test/merge.xfst,
+ tools/src/parsers/test/merge_weighted.att,
+ tools/src/parsers/test/merge_weighted.xfst,
+ tools/src/parsers/test/test.sh: Added test cases for merge
+ operation.
+
+2014-12-05 12:22 eaxelson
+
+ * tools/src/hfst-fst2strings.cc: Fixed an error in hfst-fst2strings
+ --print-separator where two consecutive lines of -- were printed
+ between non-empty transducers in some cases.
+
+2014-12-04 13:04 eaxelson
+
+ * libhfst/src/parsers/xre_utils.cc,
+ tools/src/parsers/XfstCompiler.cc: Now minimizing the merging
+ automaton before merge operation so that epsilons do not cut a
+ successful merge path. Also allowing epsilon-to-regexp-marker
+ transitions in the merge filter.
+
+2014-12-03 15:35 eaxelson
+
+ * libhfst/src/implementations/HfstTransitionGraph.h,
+ tools/src/parsers/XfstCompiler.cc: Improvements to
+ compile-replace function, now it should work for input and output
+ sides of a transducer.
+
+2014-12-03 09:36 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy: Revert bungled change to
+ precedence order
+
+2014-12-02 20:34 eaxelson
+
+ * libhfst/src/HfstTransducer.cc,
+ libhfst/src/parsers/XreCompiler.cc,
+ libhfst/src/parsers/XreCompiler.h,
+ libhfst/src/parsers/xre_lex.ll, tools/src/parsers/hfst-xfst.cc:
+ Now xre compiler of function merge does not increment the char
+ counter, making it possible to have many merge operators inside
+ one regex.
+
+2014-12-02 16:07 eaxelson
+
+ * libhfst/src/HfstTransducer.cc, libhfst/src/HfstTransducer.h,
+ libhfst/src/parsers/XreCompiler.cc,
+ libhfst/src/parsers/XreCompiler.h,
+ libhfst/src/parsers/xre_utils.cc: Added a constructor
+ XreCompiler(XreConstructorArguments & args) to facilitate passing
+ xre variables to merge function which needs them in its internal
+ xre compiler.
+
+2014-12-02 14:06 eaxelson
+
+ * libhfst/src/parsers/xre_utils.cc: Now using internal startptr
+ variables in functions hfst::xre::compile and
+ hfst::xre::compile_first instead of global hfst::xre::startptr.
+ This should fix the strange memory errors which occurred when
+ calling merge operation inside a regular expression.
+
+2014-12-02 12:33 eaxelson
+
+ * libhfst/src/HfstTransducer.cc,
+ libhfst/src/implementations/HfstTransitionGraph.h: Now merge
+ operation filters out non-optimal paths.
+
+2014-11-27 14:49 eaxelson
+
+ * libhfst/src/HfstTokenizer.cc, libhfst/src/HfstTokenizer.h:
+ Tentatively added a function 'tokenize_and_align_flag_diacritics'
+ to HfstTokenizer.
+
+2014-11-26 16:50 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/pmatch.cc,
+ libhfst/src/implementations/optimized-lookup/pmatch.h: Runtime
+ speed improvements
+ Prereserve table vectors, eliminate special_symbols map
+
+2014-11-26 13:00 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/pmatch.cc,
+ libhfst/src/implementations/optimized-lookup/pmatch.h,
+ libhfst/src/implementations/optimized-lookup/transducer.h,
+ libhfst/src/parsers/pmatch_lex.ll,
+ libhfst/src/parsers/pmatch_parse.yy,
+ libhfst/src/parsers/pmatch_utils.cc,
+ libhfst/src/parsers/pmatch_utils.h, tools/src/hfst-pmatch.cc,
+ tools/src/hfst-proc2.cc: Profiling support with Counter() and
+ --profile & a bunch of smaller changes
+
+ Counter(arg) in a ruleset inserts a profiling point, hfst-pmatch
+ --profile
+ prints profiling info.
+
+ When --verbose, warn about undefined symbols being interpreted as
+ labels.
+ When --flatten, flatten Lst() definitions too.
+
+2014-11-25 14:14 eaxelson
+
+ * tools/src/hfst-compose.cc: Allowing 1-to-n composition of
+ automata in archives. Fixes bug (or feature request) #277.
+
+2014-11-25 10:54 eaxelson
+
+ * libhfst/src/parsers/LexcCompiler.cc,
+ libhfst/src/parsers/LexcCompiler.h,
+ libhfst/src/parsers/lexc-lexer.ll,
+ libhfst/src/parsers/lexc-parser.yy,
+ tools/src/hfst-lexc-compiler.cc: Now passing verbosity to
+ LexcCompiler as an unsigned integer via setVerbosity(uint). Also
+ made small fixes to warning prints in lexc compiler.
+
+2014-11-24 13:51 eaxelson
+
+ * test/tools/Makefile.am,
+ test/tools/lexc-compiler-functionality.sh,
+ test/tools/warn.one-sided-flags.lexc,
+ test/tools/warn.one-sided-flags.lexc.flag.result,
+ test/tools/warn.one-sided-flags.lexc.result: Added tests for
+ one-sided flag diacritics for hfst-lexc.
+
+2014-11-24 13:50 eaxelson
+
+ * libhfst/src/parsers/LexcCompiler.cc,
+ libhfst/src/parsers/lexc-parser.yy: Now lexc compiler warns about
+ one-sided flag diacritics in verbose mode.
+
+2014-11-24 13:49 eaxelson
+
+ * libhfst/src/HfstTokenizer.cc, libhfst/src/HfstTokenizer.h: Added
+ a tokenizing function that warns about symbol pairs, if needed.
+
+2014-11-24 11:07 eaxelson
+
+ * test/tools/Makefile.am,
+ test/tools/lexc-compiler-functionality.sh,
+ test/tools/xfail.sublexicon-defined-more-than-once.lexc: Added
+ test case for previous lexc commit (sublexicon defined more than
+ once treated as an error).
+
+2014-11-24 10:33 eaxelson
+
+ * libhfst/src/parsers/LexcCompiler.cc,
+ libhfst/src/parsers/LexcCompiler.h,
+ libhfst/src/parsers/lexc-parser.yy: Now multiple definitions of
+ the same lexicon in lexc are treated as an error unless
+ LexcCompiler::setAllowMultipleLexiconDefinitions(true) is called
+ first.
+
+2014-11-24 09:04 eaxelson
+
+ * libhfst/src/parsers/lexc-lexer.ll,
+ libhfst/src/parsers/lexc-parser.yy: Now lexc parser updates the
+ error status hlexcnerrs when hlexcerror is called. In case of
+ warnings, the error status is not updated.
+
+2014-11-20 12:15 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/pmatch.cc,
+ libhfst/src/implementations/optimized-lookup/pmatch.h,
+ libhfst/src/implementations/optimized-lookup/transducer.cc,
+ libhfst/src/implementations/optimized-lookup/transducer.h: Speed
+ up list arc processing by replacing some maps and sets with
+ vectors
+
+2014-11-19 09:17 eaxelson
+
+ * libhfst/src/HfstTransducer.cc, libhfst/src/HfstTransducer.h,
+ libhfst/src/implementations/HfstTransitionGraph.h,
+ libhfst/src/parsers/LexcCompiler.cc,
+ libhfst/src/parsers/XreCompiler.cc,
+ libhfst/src/parsers/XreCompiler.h,
+ libhfst/src/parsers/xre_parse.yy,
+ libhfst/src/parsers/xre_utils.cc,
+ libhfst/src/parsers/xre_utils.h,
+ tools/src/parsers/XfstCompiler.cc,
+ tools/src/parsers/XfstCompiler.h: Changed list to set in merge
+ operation. Removed commented code.
+
+2014-11-18 17:00 eaxelson
+
+ * tools/src/hfst-lexc-compiler.cc: Added option --encode-weights to
+ hfst-lexc.
+
+2014-11-17 13:55 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy: Reinstate undefined symbols
+ as valid tokens as per documentation
+
+2014-11-17 13:53 hardwick
+
+ * libhfst/src/parsers/pmatch_lex.ll,
+ libhfst/src/parsers/pmatch_parse.yy,
+ libhfst/src/parsers/pmatch_utils.cc: Add defined lists
+
+2014-11-17 13:14 hardwick
+
+ * libhfst/src/parsers/pmatch_lex.ll,
+ libhfst/src/parsers/pmatch_parse.yy,
+ libhfst/src/parsers/pmatch_utils.cc,
+ libhfst/src/parsers/pmatch_utils.h: Add Sigma()
+
+2014-11-17 11:48 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/pmatch.cc,
+ libhfst/src/implementations/optimized-lookup/pmatch.h,
+ libhfst/src/parsers/pmatch_lex.ll,
+ libhfst/src/parsers/pmatch_parse.yy,
+ libhfst/src/parsers/pmatch_utils.cc,
+ libhfst/src/parsers/pmatch_utils.h: Added Lst() and support for
+ list arcs in runtime
+
+2014-11-17 11:21 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy: Sync precedence rules
+
+2014-11-17 11:17 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy: Switch order of precedence
+ of concatenation and other binary operations
+
+2014-11-17 07:47 hardwick
+
+ * libhfst/src/parsers/pmatch_lex.ll,
+ libhfst/src/parsers/pmatch_parse.yy: Added Lit()
+
+2014-11-13 07:21 eaxelson
+
+ * libhfst/src/HfstTransducer.cc, libhfst/src/HfstTransducer.h,
+ libhfst/src/implementations/HfstTransitionGraph.h: Modified the
+ function merge, it now takes as an argument a map of list
+ symbols.
+
+2014-11-13 07:16 eaxelson
+
+ * libhfst/src/parsers/XreCompiler.cc,
+ libhfst/src/parsers/XreCompiler.h,
+ libhfst/src/parsers/xre_parse.yy,
+ libhfst/src/parsers/xre_utils.cc,
+ libhfst/src/parsers/xre_utils.h,
+ tools/src/parsers/XfstCompiler.cc,
+ tools/src/parsers/test/Makefile.am,
+ tools/src/parsers/test/one_transition_regex.att,
+ tools/src/parsers/test/one_transition_regex.xfst,
+ tools/src/parsers/test/test.sh: Fixed a bug in xre parser, now
+ definitions and unknowns can be used together in expressions such
+ as 'regex [def:?] ;'.
+
+2014-11-11 14:06 eaxelson
+
+ * libhfst/src/HfstTokenizer.cc,
+ tools/src/HfstStrings2FstTokenizer.cc: Now the epsilon symbol is
+ not added as a multichar symbol to hfst-lookup tokenizer if it is
+ the empty string. Should fix bug #275.
+
+2014-11-10 21:26 hardwick
+
+ * libhfst/src/parsers/pmatch_lex.ll,
+ libhfst/src/parsers/pmatch_utils.cc: Allow \U00NNNNNN syntax for
+ code points in utf-8 but above U+FFFF
+
+2014-11-10 20:55 hardwick
+
+ * libhfst/src/parsers/pmatch_lex.ll,
+ libhfst/src/parsers/pmatch_utils.cc: Allow \UNNNN as well as
+ \uNNNN
+
+2014-11-10 20:41 hardwick
+
+ * libhfst/src/parsers/pmatch_lex.ll,
+ libhfst/src/parsers/pmatch_utils.cc: Allow \uNNNN in range
+ notation and make some fixes to utf-8 handling
+
+2014-11-10 18:58 hardwick
+
+ * libhfst/src/parsers/pmatch_lex.ll,
+ libhfst/src/parsers/pmatch_parse.yy,
+ libhfst/src/parsers/pmatch_utils.cc,
+ libhfst/src/parsers/pmatch_utils.h: Added utf-8 character range
+ expressions
+
+2014-11-10 14:57 eaxelson
+
+ * libhfst/src/HfstTransducer.cc, libhfst/src/HfstTransducer.h,
+ libhfst/src/implementations/HfstTransitionGraph.h,
+ libhfst/src/parsers/xre_lex.ll, libhfst/src/parsers/xre_parse.yy:
+ An untested implementation of the merge operation added to
+ hfst-xfst.
+
+2014-11-10 13:23 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy,
+ libhfst/src/parsers/pmatch_utils.cc,
+ libhfst/src/parsers/pmatch_utils.h: Syntax-level completion of
+ functions (arg placement still not completely free)
+
+2014-11-04 16:35 eaxelson
+
+ * check_installation/copy-tool-tests.sh: Fixed a small bug in hfst
+ tool tester.
+
+2014-11-04 10:23 eaxelson
+
+ * ChangeLog, ChangeLog.old, NEWS, configure.ac,
+ libhfst/src/Makefile.am, swig/setup.py: Ready for release 3.8.1.
+
2014-10-31 16:44 eaxelson
* tools/src/hfst-guess.cc: Fixed std::cout into &std::cout in
diff --git a/ChangeLog.old b/ChangeLog.old
index 982c46a..56e3859 100644
--- a/ChangeLog.old
+++ b/ChangeLog.old
@@ -1,3 +1,538 @@
+2014-10-31 16:44 eaxelson
+
+ * tools/src/hfst-guess.cc: Fixed std::cout into &std::cout in
+ stream pointer comparison.
+
+2014-10-31 14:36 eaxelson
+
+ * back-ends/openfst/src/include/fst/accumulator.h,
+ back-ends/openfst/src/include/fst/arc-map.h,
+ back-ends/openfst/src/include/fst/determinize.h,
+ back-ends/openfst/src/include/fst/encode.h,
+ back-ends/openfst/src/include/fst/epsnormalize.h,
+ back-ends/openfst/src/include/fst/equivalent.h,
+ back-ends/openfst/src/include/fst/factor-weight.h,
+ back-ends/openfst/src/include/fst/label-reachable.h,
+ back-ends/openfst/src/include/fst/relabel.h,
+ back-ends/openfst/src/include/fst/replace-util.h,
+ back-ends/openfst/src/include/fst/replace.h,
+ back-ends/openfst/src/include/fst/rmepsilon.h,
+ back-ends/openfst/src/include/fst/rmfinalepsilon.h,
+ back-ends/openfst/src/include/fst/sparse-tuple-weight.h,
+ back-ends/openfst/src/include/fst/state-map.h,
+ back-ends/openfst/src/include/fst/symbol-table-ops.h,
+ back-ends/openfst/src/include/fst/synchronize.h,
+ back-ends/openfst/src/include/fst/test-properties.h,
+ back-ends/openfst/src/include/fst/util.h, configure.ac,
+ tools/src/hfst-tagger/src/use_model_src/DataTypes.h,
+ tools/src/hfst-tagger/src/use_model_src/NewLexicalModel.h: Now
+ using definitions USE_TR1_UNORDERED_(MAP|SET) when defining what
+ unordered maps and sets to use.
+
+2014-10-30 15:29 eaxelson
+
+ * configure.ac: Fixed a typo tr2 -> tr1.
+
+2014-10-29 13:50 eaxelson
+
+ * configure.ac, tools/src/HfstAlphabet.h,
+ tools/src/hfst-tagger/src/use_model_src/NewLexicalModel.h:
+ Unordered maps and sets are used from std namespace if
+ -std=gnu++11 is requested.
+
+2014-10-28 11:40 eaxelson
+
+ * swig/setup.py: Forgot to update version number in swig bindings.
+
+2014-10-27 16:40 hardwick
+
+ * libhfst/src/parsers/pmatch_lex.ll,
+ libhfst/src/parsers/pmatch_parse.yy: Various improvements and
+ additions to function syntax,
+ particularly empty args and string-args
+
+2014-10-27 14:48 eaxelson
+
+ * libhfst/src/implementations/HfstTransitionGraph.h,
+ tools/src/parsers/XfstCompiler.cc: Added functions for merge
+ operation in HfstTransitionGraph.
+
+2014-10-24 11:03 eaxelson
+
+ * libhfst/src/implementations/HfstTransitionGraph.h,
+ libhfst/src/implementations/HfstTropicalTransducerTransitionData.h:
+ Modified weight handling in HfstTransitionGraph::intersect.
+
+2014-10-24 08:38 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/transducer.cc: When
+ the alphabet is constructed from a symbol table, set identity to
+ NO_SYM
+ this was supposed to always happen anyway but didn't matter until
+ recently
+
+2014-10-22 13:16 eaxelson
+
+ * libhfst/src/implementations/HfstTransitionGraph.h,
+ libhfst/src/implementations/HfstTropicalTransducerTransitionData.h:
+ Modified intersection algorithms in HfstTransitionGraph.
+
+2014-10-22 10:21 eaxelson
+
+ * libhfst/src/implementations/HfstTransitionGraph.h: Added
+ functions to be used in xerox's merge operation.
+
+2014-10-22 10:20 eaxelson
+
+ * tools/src/parsers/XfstCompiler.cc,
+ tools/src/parsers/xfst-parser.yy: Small fixes to list definitions
+ in hfst-xfst.
+
+2014-10-17 15:59 eaxelson
+
+ * libhfst/src/implementations/HfstTransitionGraph.h,
+ tools/src/parsers/XfstCompiler.cc,
+ tools/src/parsers/XfstCompiler.h: Added an implementation for
+ compile-replace in hfst-xfst, it still needs lot of testing.
+
+2014-10-16 14:36 eaxelson
+
+ * libhfst/src/implementations/HfstTransitionGraph.h,
+ tools/src/parsers/XfstCompiler.cc: Yet some more functions added
+ to compile-replace.
+
+2014-10-16 11:48 eaxelson
+
+ * libhfst/src/implementations/HfstTransitionGraph.h,
+ tools/src/parsers/XfstCompiler.cc: Added more functions for
+ compile-replace.
+
+2014-10-15 14:23 eaxelson
+
+ * libhfst/src/implementations/HfstTransitionGraph.h: Tentatively
+ added functions in HfstTransitionGraph to be used in
+ compile-replace.
+
+2014-10-15 13:26 eaxelson
+
+ * tools/src/parsers/XfstCompiler.cc: Added function
+ is_well_formed_for_compile_replace to be used in compile-replace
+ command.
+
+2014-10-15 11:30 eaxelson
+
+ * test/libhfst/test_transducer_functions.cc: Forgot to comment out
+ debugging prints in tests.
+
+2014-10-15 11:28 eaxelson
+
+ * back-ends/openfst/src/include/fst/interval-set.h,
+ test/libhfst/test_transducer_functions.cc: Added brackets around
+ member calls 'Interval.end' and 'Interval.begin' to avoid them
+ getting confused with std::end() and std::begin() templates in
+ C++11.
+
+2014-10-15 10:21 eaxelson
+
+ * back-ends/openfst/src/include/fst/interval-set.h: Rolled back
+ earlier revision in interval-set.h
+
+2014-10-15 09:56 eaxelson
+
+ * back-ends/openfst/src/include/fst/interval-set.h,
+ libhfst/src/implementations/ConvertTransducerFormat.h,
+ libhfst/src/implementations/FomaTransducer.h: Made small
+ modifications for better c++11/c++0x support.
+
+2014-10-15 09:32 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/transducer.cc: Fix
+ bug where identity wasn't being set to NO_SYMBOL when absent from
+ alphabet
+
+2014-10-15 08:46 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/find_epsilon_loops.cc,
+ libhfst/src/implementations/optimized-lookup/transducer.cc,
+ libhfst/src/implementations/optimized-lookup/transducer.h: Refuse
+ to enter flag loops more than once
+ fixes bug #250 - arguably this could be applied to epsilon
+ loops too,
+ pending discussion
+
+2014-10-15 08:09 eaxelson
+
+ * man/Makefile.am: Fixed a typo in Makefile.
+
+2014-10-14 15:01 eaxelson
+
+ * tools/src/parsers/XfstCompiler.cc: Now hfst-xfst gives a warning
+ (or exits) if a binary command tries to access a stack with less
+ than 2 transducers.
+
+2014-10-14 14:51 eaxelson
+
+ * tools/src/parsers/XfstCompiler.cc,
+ tools/src/parsers/XfstCompiler.h: Now hfst-xfst exits if a
+ command tries to access an empty stack if quit-on-fail is ON and
+ hfst-xfst is not in interactive mode.
+
+2014-10-14 13:44 mpsilfve
+
+ * scripts/hfst-fst2tesseract.xfst: Comment explaining usage of
+ hfst-fst2tesseract.xfst.
+
+2014-10-14 13:40 mpsilfve
+
+ * scripts/hfst-fst2tesseract.xfst: Added script for converting
+ morphological analyzers to Tesseract word models.
+
+2014-10-14 12:27 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/find_epsilon_loops.cc,
+ libhfst/src/implementations/optimized-lookup/transducer.h:
+ Improvements to loop finding
+
+2014-10-14 12:08 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/find_epsilon_loops.cc,
+ libhfst/src/implementations/optimized-lookup/transducer.h:
+ Further corrections to loop detection
+ slowdown back to ~10x but may be improved from here
+
+2014-10-14 11:51 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/find_epsilon_loops.cc,
+ libhfst/src/implementations/optimized-lookup/transducer.h: Fix
+ some cases of overdetecting infinite ambiguity, there's still
+ some left
+
+2014-10-14 10:51 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/find_epsilon_loops.cc:
+ Forgot to keep adding the repeated states in the loop detection
+ phase
+
+2014-10-13 18:15 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/find_epsilon_loops.cc,
+ libhfst/src/implementations/optimized-lookup/transducer.h: Only
+ try to catch infinite ambiguity at epsilon arcs
+ This is the big speed win and presumably correct.
+
+2014-10-13 17:22 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/find_epsilon_loops.cc:
+ This order of comparison is a bit faster since sizes never differ
+
+2014-10-13 16:34 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/find_epsilon_loops.cc,
+ libhfst/src/implementations/optimized-lookup/transducer.h: Speed
+ up is_lookup_infinitely_ambiguous() somewhat
+
+2014-10-11 04:10 mie
+
+ * tools/src/hfst-optimized-lookup.cc: Try to avoid using negative
+ indexes for arrays
+
+2014-10-11 03:59 mie
+
+ * test/tools/Makefile.am,
+ test/tools/optimized-lookup-functionality.sh: Optimised lookup
+ tests
+
+2014-10-09 17:22 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/transducer.cc,
+ libhfst/src/implementations/optimized-lookup/transducer.h:
+ Runtime handling of identity and unknown
+
+2014-10-09 12:30 hardwick
+
+ * libhfst/src/parsers/pmatch_utils.h: Forgot to remove one thing in
+ the last commit
+
+2014-10-09 12:22 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy,
+ libhfst/src/parsers/pmatch_utils.cc: Take out our own
+ harmonization hacks now that they're unneeded
+
+2014-10-08 12:45 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/pmatch.cc,
+ libhfst/src/implementations/optimized-lookup/pmatch.h,
+ libhfst/src/parsers/pmatch_parse.yy,
+ libhfst/src/parsers/pmatch_utils.cc,
+ libhfst/src/parsers/pmatch_utils.h: Don't use delimiters when
+ they're not necessary,
+ also don't insert everything to RTNs anymore and provide the
+ is_special()
+ function the previous commit required
+
+2014-10-08 12:29 hardwick
+
+ * libhfst/src/HarmonizeUnknownAndIdentitySymbols.cc: Treat special
+ pmatch symbols like flag diacritics for harmonization,
+ also after harmonization add all symbols, including flags, to the
+ alphabets
+
+2014-10-08 11:51 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy: Fix bug where delimiters
+ were shadowing the named transducers' names
+ also remove extraneous parsing path
+
+2014-10-06 21:33 janiemi
+
+ * test/tools/pmatch-tester.sh, test/tools/pmatch-tests.sh: Updated
+ pmatch functionality tests.
+
+ Current pmatch syntax: string literals in {...}, symbols in
+ double quotes.
+ Added tests: Ins maximizing globally; Difference and character
+ sets in
+ named expressions; Named expressions in OptCap, ToUpper; Named
+ expressions
+ in replace; Long input lines; Ins should not throw
+ std::out_of_range;
+ Disjunction of two Ins expressions.
+ Added options: --include-tests, --exclude-tests,
+ --no-number-tests,
+ --truncate-lines, --truncate-log-lines. Minor new features in
+ test runner.
+
+2014-10-06 17:27 hardwick
+
+ * libhfst/src/parsers/pmatch_lex.ll: Add string literal syntax for
+ standalone %-escaped chars
+ (they used to be considered symbols which now have to be
+ defined or cause an error)
+
+2014-10-06 16:44 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/pmatch.cc,
+ libhfst/src/implementations/optimized-lookup/pmatch.h,
+ libhfst/src/parsers/pmatch_parse.yy,
+ libhfst/src/parsers/pmatch_utils.cc,
+ libhfst/src/parsers/pmatch_utils.h: Use minimization guards to
+ keep multiple negative contexts separate in disjunctions
+
+2014-10-06 15:36 eaxelson
+
+ * libhfst/src/HfstInputStream.h,
+ libhfst/src/implementations/HfstTransitionGraph.h,
+ libhfst/src/parsers/XreCompiler.h, swig/doc/libhfst.py,
+ swig/hfstBot.py, swig/test/test_examples.py: Fixed some more
+ spelling errors noticed by lintian.
+
+2014-10-06 14:45 eaxelson
+
+ * tools/src/hfst-lexc-wrapper.cc,
+ tools/src/hfst-twolc/src/commandline_src/CommandLine.cc: Fixed
+ spelling errors found by lintian.
+
+2014-10-06 14:33 eaxelson
+
+ * man/hfst-train-tagger.1: Added again hfst-train-tagger man page
+ which is no more a symlink.
+
+2014-10-06 14:31 eaxelson
+
+ * man/Makefile.am, man/hfst-build-tagger.1,
+ man/hfst-foma-wrapper.1, man/hfst-open-input-file-for-tagger.1,
+ man/hfst-reweight-tagger.1, man/hfst-train-tagger.1,
+ man/hfst-twolc-loc.1, man/hfst-twolc-system.1, man/hfst-twolc.1,
+ man/hfst_tagger_compute_data_statistics.py.1, man/htwolcpre1.1,
+ man/htwolcpre2.1, man/htwolcpre3.1: Added missing man pages.
+
+2014-10-06 14:00 eaxelson
+
+ * tools/src/hfst-tagger/src/hfst-reweight-tagger.cc: Now
+ hfst-reweight-tagger --help returns EXIT_SUCCESS before trying to
+ access uninitialized values.
+
+2014-10-06 13:24 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy,
+ libhfst/src/parsers/pmatch_utils.cc,
+ libhfst/src/parsers/pmatch_utils.h: Warn about shadowing
+ definitions
+
+2014-10-06 13:05 hardwick
+
+ * libhfst/src/parsers/pmatch_lex.ll,
+ libhfst/src/parsers/pmatch_parse.yy: Alternate syntaxes regex for
+ Define TOP and .#. for #
+
+2014-10-06 12:47 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy: Revamp LABEL parsing and
+ introduce curly literal pairs
+
+2014-10-06 11:08 eaxelson
+
+ * man/Makefile.am, man/hfst-affix-guessify.1,
+ man/hfst-apertium-proc.1, man/hfst-calculate.1,
+ man/hfst-compare.1, man/hfst-compose-intersect.1,
+ man/hfst-compose.1, man/hfst-concatenate.1, man/hfst-conjunct.1,
+ man/hfst-determinise.1, man/hfst-determinize.1,
+ man/hfst-disjunct.1, man/hfst-edit-metadata.1,
+ man/hfst-expand-equivalences.1, man/hfst-expand.1,
+ man/hfst-format.1, man/hfst-fst2fst.1, man/hfst-fst2strings.1,
+ man/hfst-fst2txt.1, man/hfst-grep.1, man/hfst-guess.1,
+ man/hfst-guessify.1, man/hfst-head.1, man/hfst-info.1,
+ man/hfst-intersect.1, man/hfst-invert.1, man/hfst-lexc-wrapper.1,
+ man/hfst-lexc.1, man/hfst-lookup.1, man/hfst-minimise.1,
+ man/hfst-minimize.1, man/hfst-minus.1, man/hfst-multiply.1,
+ man/hfst-name.1, man/hfst-open-input-file-for-tagger.1,
+ man/hfst-optimised-lookup.1, man/hfst-optimized-lookup.1,
+ man/hfst-pair-test.1, man/hfst-pmatch.1, man/hfst-pmatch2fst.1,
+ man/hfst-proc2.1, man/hfst-project.1, man/hfst-prune-alphabet.1,
+ man/hfst-push-weights.1, man/hfst-regexp2fst.1,
+ man/hfst-remove-epsilons.1, man/hfst-repeat.1,
+ man/hfst-reverse.1, man/hfst-reweight.1, man/hfst-sfstpl2fst.1,
+ man/hfst-shuffle.1, man/hfst-split.1, man/hfst-strings2fst.1,
+ man/hfst-substitute.1, man/hfst-subtract.1, man/hfst-summarise.1,
+ man/hfst-summarize.1, man/hfst-tag.1, man/hfst-tail.1,
+ man/hfst-train-tagger-loc.1, man/hfst-train-tagger-system.1,
+ man/hfst-train-tagger.1, man/hfst-traverse.1, man/hfst-txt2fst.1,
+ man/hfst-union.1, man/hfst-xfst.1: Updated and added man pages.
+
+2014-10-06 11:06 eaxelson
+
+ * tools/src/hfst-expand-equivalences.cc: Moved option checking
+ after possible returning from program so that option --help will
+ not generate error messages.
+
+2014-10-05 11:06 hardwick
+
+ * libhfst/src/parsers/pmatch_utils.cc: Require backslash character
+ to be escaped as \\ in curly literals
+
+2014-10-05 10:53 hardwick
+
+ * libhfst/src/parsers/pmatch_utils.cc: Fix bug in unescaping
+ function
+
+2014-10-03 12:28 eaxelson
+
+ * ChangeLog, ChangeLog.old, NEWS, configure.ac,
+ libhfst/src/Makefile.am: Ready for release 3.8.0.
+
+2014-10-03 11:55 eaxelson
+
+ * tools/src/parsers/XfstCompiler.cc: Added variable
+ 'lexc-rename-flags' to hfst-xfst.
+
+2014-10-03 11:48 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy: Desperately fiddle with the
+ way the minus operation expands things
+
+2014-10-01 16:29 hardwick
+
+ * libhfst/src/parsers/pmatch_lex.ll,
+ libhfst/src/parsers/pmatch_parse.yy: Add lambda-like anonymous
+ definitions for controlling subexpression boundaries
+
+2014-10-01 12:46 eaxelson
+
+ * tools/src/hfst-lexc-compiler.cc, tools/src/hfst-regexp2fst.cc,
+ tools/src/parsers/XfstCompiler.cc: Changed the flag handling
+ behavior of hfst-xfst and hfst-lexc. Now both tools by default
+ use Xerox's way when composing, i.e. flag diacritics match
+ unknown and identity symbols. This can be controlled with
+ variable 'xerox-composition' (the default is ON) in hfst-xfst and
+ with option '--xerox-composition={ON,OFF}' (the default is also
+ ON) in hfst-lexc. hfst-regexp2fst also has the option
+ '--xerox-composition' which by default is OFF, as it was earlier.
+
+2014-10-01 11:28 eaxelson
+
+ * tools/src/hfst-regexp2fst.cc: Fixed a typo in hfst-regexp2fst
+ option handling.
+
+2014-09-30 15:38 eaxelson
+
+ * libhfst/src/HfstTransducer.cc: Now one-sided flag diacritics are
+ allowed in composition when flag-is-epsilon is used.
+
+2014-09-30 13:54 eaxelson
+
+ * tools/src/hfst-strings2fst.cc: Added option --log10 for 10-based
+ logarithmic weights in hfst-strings2fst.
+
+2014-09-30 13:18 eaxelson
+
+ * libhfst/src/HfstTransducer.cc, libhfst/src/parsers/xre_parse.yy,
+ tools/src/parsers/XfstCompiler.cc: Now an error is thrown if
+ flags are not twosided in composition when xerox composition is
+ used.
+
+2014-09-30 13:05 eaxelson
+
+ * libhfst/src/HfstExceptionDefs.cc,
+ libhfst/src/HfstExceptionDefs.h: Added exception class
+ FlagDiacriticsAreNotIdentitesException.
+
+2014-09-29 11:22 hardwick
+
+ * test/tools/Makefile.am, test/tools/pmatch-functionality.sh,
+ test/tools/pmatch-tester.sh, test/tools/pmatch-tests.sh: Add
+ pmatch functionality test suite
+
+2014-09-29 10:49 eaxelson
+
+ * tools/src/hfst-regexp2fst.cc: Added option -X flag-is-epsilon to
+ hfst-regexp2fst.
+
+2014-09-25 10:38 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy: Fix another symbol-leaking
+ issue
+
+2014-09-25 10:17 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy: Allow nested logical
+ operations on contexts
+
+2014-09-23 07:15 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy,
+ libhfst/src/parsers/pmatch_utils.cc,
+ libhfst/src/parsers/pmatch_utils.h: We need to avoid symbol
+ pollution for more than just special symbols
+ (this doesn't completely resolve pollution issues, just some
+ urgent ones)
+
+2014-09-23 06:47 hardwick
+
+ * libhfst/src/implementations/optimized-lookup/pmatch.cc: Don't
+ forget to pop the rtn stack when there's nothing matched
+
+2014-09-23 06:36 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy: Minimize after adding
+ delimiters, not before
+
+2014-09-23 05:14 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy: Revert bracket-bounding
+ behaviour
+
+2014-09-22 15:47 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy: Use brackets for extra
+ delimiters to control tag and context boundaries more
+
+2014-09-22 15:28 hardwick
+
+ * libhfst/src/parsers/pmatch_parse.yy,
+ libhfst/src/parsers/pmatch_utils.cc: add_delimiters was happening
+ in the wrong place since recent syntax changes
+
2014-09-22 12:57 eaxelson
* libhfst/src/implementations/HfstOlTransducer.cc: Fixed a too
diff --git a/NEWS b/NEWS
index 5c1c381..e937203 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,18 @@
This file contains all noteworthy changes in HFST development between releases.
For full listing of changes see ChangeLog.
+Noteworthy changes in 3.8.2
+---------------------------
+
+* Runtime speed improvements in pmatch
+
+* Speedup in conversion to optimized lookup format
+
+* Merge and compile-replace operations supported in hfst-xfst
+
+* Added option --print-symbol-pair-statistics(=N) to hfst-summarize
+
+
Noteworthy changes in 3.8.1
---------------------------
diff --git a/check_installation/copy-tool-tests.sh b/check_installation/copy-tool-tests.sh
index d91adfa..72881c5 100755
--- a/check_installation/copy-tool-tests.sh
+++ b/check_installation/copy-tool-tests.sh
@@ -38,6 +38,8 @@ do
sed -i 's/-loc / /g' $file
fi
done
+# this file uses 'tooldir' instead of 'TOOLDIR'
+sed -i 's/$\tooldir\//$1/g' pmatch-tester.sh
# These tests are rewritten in directory check_installation
rm empty-input.sh
diff --git a/configure.ac b/configure.ac
index 9089334..f4c8c58 100644
--- a/configure.ac
+++ b/configure.ac
@@ -19,7 +19,7 @@
HFST_NAME=hfst
HFST_MAJOR=3
HFST_MINOR=8
-HFST_EXTENSION=1
+HFST_EXTENSION=2
HFST_VERSION=$HFST_MAJOR.$HFST_MINOR.$HFST_EXTENSION
### When the VERSION is INCREMENTED, REMEMBER to increment the LONGVERSION too.
@@ -28,10 +28,10 @@ HFST_VERSION=$HFST_MAJOR.$HFST_MINOR.$HFST_EXTENSION
LIBHFST_NAME=hfst
LIBHFST_MAJOR=3
LIBHFST_MINOR=8
-LIBHFST_EXTENSION=1
+LIBHFST_EXTENSION=2
LIBHFST_VERSION=$LIBHFST_MAJOR.$LIBHFST_MINOR.$LIBHFST_EXTENSION
-AC_INIT([hfst], [3.8.1], [hfst-bugs at helsinki.fi], [hfst])
+AC_INIT([hfst], [3.8.2], [hfst-bugs at helsinki.fi], [hfst])
AC_CONFIG_AUX_DIR([build-aux])
AM_INIT_AUTOMAKE([-Wall std-options foreign check-news])
@@ -44,8 +44,8 @@ AC_CONFIG_HEADERS([config.h libhfst/src/hfst.hpp])
AC_SUBST([LIBHFST_MAJOR], [3])
AC_SUBST([LIBHFST_MINOR], [8])
-AC_SUBST([LIBHFST_EXTENSION], [1])
-AC_SUBST([LIBHFST_VERSION], [3.8.1])
+AC_SUBST([LIBHFST_EXTENSION], [2])
+AC_SUBST([LIBHFST_VERSION], [3.8.2])
AC_SUBST([LIBHFST_NAME], [hfst])
# long version = version vector cast in base 10000, for automatic comparisons
@@ -56,9 +56,9 @@ AC_SUBST([LIBHFST_NAME], [hfst])
# $LIBHFST_MINOR * 10000 + $LIBHFST_EXTENSION + "L"
# NB! It turned out to be not portable, and can't be used!
-AC_DEFINE([HFST_LONGVERSION], [300080001L],
+AC_DEFINE([HFST_LONGVERSION], [300080002L],
[Define to hfst version vector as long in base 10000])
-AC_DEFINE([HFST_REVISION], ["$Revision: 4088 $"],
+AC_DEFINE([HFST_REVISION], ["$Revision: 4145 $"],
[Automatically substitute to configure.ac revision])
AC_DEFINE_UNQUOTED([HFST_STRING], ["$PACKAGE_STRING"],
[Define to libhfst pretty name for programs to print])
@@ -570,7 +570,7 @@ AM_CONDITIONAL([CAN_DOXYGEN], [test x$DOXYGEN != xno])
# Checks for libraries
-AC_CHECK_LIB([xml2], [main])
+#AC_CHECK_LIB([xml2], [main])
AC_LANG_PUSH([C++])
AS_IF([test "x$with_openfst" != "xno" -a "x$enable_mingw" == "xno"],
[AC_CHECK_LIB([dl], [main])])
diff --git a/libhfst/src/HfstTokenizer.cc b/libhfst/src/HfstTokenizer.cc
index 27c1260..8dd27b3 100644
--- a/libhfst/src/HfstTokenizer.cc
+++ b/libhfst/src/HfstTokenizer.cc
@@ -11,7 +11,9 @@
// along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "HfstTokenizer.h"
+#include "HfstFlagDiacritics.h"
#include <string>
+#include <cassert>
#ifndef MAIN_TEST
@@ -111,7 +113,9 @@ const
void
HfstTokenizer::add_multichar_symbol(const string& symbol)
-{ multi_char_symbols.add(symbol.c_str()); }
+{ if (symbol == "")
+ { return; }
+ multi_char_symbols.add(symbol.c_str()); }
void
HfstTokenizer::add_skip_symbol(const std::string &symbol)
@@ -232,6 +236,130 @@ StringPairVector HfstTokenizer::tokenize
}
return spv;
}
+/* Tokenize \a input_string and \a output_string independently and pair the
+ * resulting tokens position by position.
+ *
+ * When one side yields fewer tokens than the other, the left-over tokens of
+ * the longer side are paired with internal_epsilon on the shorter side.
+ * Every pair that ends up in the result is first handed to
+ * \a warn_about_pair; the callback is invoked unconditionally (there is no
+ * NULL check), so the caller decides inside the callback which pairs merit a
+ * warning. Both strings are passed through check_utf8_correctness before
+ * anything else happens.
+ *
+ * Returns the aligned vector of (input symbol, output symbol) pairs.
+ */
+StringPairVector HfstTokenizer::tokenize
+(const string& input_string,const string& output_string,
+ void (*warn_about_pair)(const std::pair<std::string, std::string> &symbol_pair)) const
+{
+ check_utf8_correctness(input_string);
+ check_utf8_correctness(output_string);
+
+ StringPairVector spv; // aligned result
+
+ StringPairVector input_spv = tokenize(input_string.c_str()); // only ->first of each token is read below
+ StringPairVector output_spv = tokenize(output_string.c_str());
+
+ if (input_spv.size() < output_spv.size()) // output is longer: the input side gets padded
+ {
+ StringPairVector::iterator jt = output_spv.begin();
+ for (StringPairVector::iterator it = input_spv.begin();
+ it != input_spv.end();
+ ++it)
+ {
+ StringPair sp(it->first, jt->first);
+ warn_about_pair(sp);
+ spv.push_back(sp);
+ ++jt; } // jt advances in lock-step with it
+ for ( ; jt != output_spv.end(); ++jt) // remaining output tokens: epsilon on the input side
+ { StringPair sp(internal_epsilon,jt->first);
+ warn_about_pair(sp);
+ spv.push_back(sp); }
+ }
+ else // input is at least as long: the output side gets padded (no padding when sizes are equal)
+ {
+ StringPairVector::iterator it = input_spv.begin();
+ for (StringPairVector::iterator jt = output_spv.begin();
+ jt != output_spv.end();
+ ++jt)
+ { StringPair sp(it->first, jt->first);
+ warn_about_pair(sp);
+ spv.push_back(sp);
+ ++it; } // it advances in lock-step with jt
+ for ( ; it != input_spv.end(); ++it) // remaining input tokens: epsilon on the output side
+ { StringPair sp(it->first,internal_epsilon);
+ warn_about_pair(sp);
+ spv.push_back(sp); }
+ }
+ return spv;
+}
+/* Tokenize \a input_string and \a output_string independently and align the
+ * tokens into symbol pairs so that a flag diacritic (as recognised by
+ * FdOperation::is_diacritic) is never paired with a different symbol.
+ *
+ * - Two tokens are paired directly when neither is a flag diacritic or when
+ *   they are equal.
+ * - Otherwise the offending pair is reported through \a warn_about_pair and
+ *   replaced by two identity pairs: first the input token, then the output
+ *   token.
+ * - Once one side is exhausted, a remaining flag diacritic is copied to both
+ *   sides; any other remaining token is paired with internal_epsilon.
+ *
+ * \a warn_about_pair is called only for the mismatched pairs and is not
+ * checked for NULL. Both strings are passed through check_utf8_correctness
+ * first. Returns the aligned vector of symbol pairs.
+ */
+StringPairVector HfstTokenizer::tokenize_and_align_flag_diacritics
+(const string& input_string,const string& output_string,
+ void (*warn_about_pair)(const std::pair<std::string, std::string> &symbol_pair)) const
+{
+ check_utf8_correctness(input_string);
+ check_utf8_correctness(output_string);
+
+ StringPairVector spv; // aligned result
+
+ StringPairVector input_spv = tokenize(input_string.c_str()); // only ->first of each token is read below
+ StringPairVector output_spv = tokenize(output_string.c_str());
+
+ assert(input_spv.size() > 0 && output_spv.size() > 0); /* NOTE(review): in builds
+   where assert is active this aborts if either string produces no tokens,
+   although the loop below has branches for an exhausted side -- confirm
+   that the precondition is intended. */
+ StringPairVector::const_iterator it = input_spv.begin();
+ StringPairVector::const_iterator jt = output_spv.begin();
+
+ // proceed until both token vectors are exhausted
+ while(it != input_spv.end() || jt != output_spv.end())
+ {
+ StringPair sp("", ""); // string pair to push back to the result
+ StringPair sp_cont("", ""); // possible continuation in case of misaligned flags
+
+ if (it == input_spv.end()) // only output tokens are left
+ {
+ if (FdOperation::is_diacritic(jt->first)) // copy diacritic to other side
+ {
+ sp = StringPair(jt->first, jt->first);
+ }
+ else // pad input with epsilons
+ {
+ sp = StringPair(internal_epsilon, jt->first);
+ }
+ jt++;
+ }
+ else if (jt == output_spv.end()) // only input tokens are left
+ {
+ if (FdOperation::is_diacritic(it->first)) // copy diacritic to other side
+ {
+ sp = StringPair(it->first, it->first);
+ }
+ else // pad output with epsilons
+ {
+ sp = StringPair(it->first, internal_epsilon);
+ }
+ it++;
+ }
+ else
+ {
+ // take from both vectors (cases foo:bar, foo:foo, flag1:flag1)
+ if ((!FdOperation::is_diacritic(it->first) && !FdOperation::is_diacritic(jt->first)) ||
+ *it == *jt)
+ {
+ sp = StringPair(it->first, jt->first);
+ }
+ // at least one token is a flag diacritic and the tokens differ:
+ // report the pair, then emit the input token and the output token as
+ // two separate identity pairs (cases flag1:flag2, flag1:bar, foo:flag2)
+ else
+ {
+ StringPair wrong_pair(it->first, jt->first);
+ warn_about_pair(wrong_pair);
+ sp = StringPair(it->first, it->first);
+ sp_cont = StringPair(jt->first, jt->first);
+ }
+ it++;
+ jt++;
+ }
+
+ spv.push_back(sp);
+ if (sp_cont.first.size() != 0 && sp_cont.second.size() != 0) // sp_cont was filled in by the mismatch branch
+ {
+ spv.push_back(sp_cont);
+ }
+ }
+
+ return spv;
+}
+
void
HfstTokenizer::check_utf8_correctness(const std::string &input_string)
diff --git a/libhfst/src/HfstTokenizer.h b/libhfst/src/HfstTokenizer.h
index 4174d99..d8e0cf5 100644
--- a/libhfst/src/HfstTokenizer.h
+++ b/libhfst/src/HfstTokenizer.h
@@ -128,6 +128,15 @@ namespace hfst
StringPairVector tokenize(const std::string &input_string,
const std::string &output_string) const;
+ StringPairVector tokenize(const std::string &input_string,
+ const std::string &output_string,
+ void (*warn_about_pair)(const std::pair<std::string, std::string> &symbol_pair)) const;
+
+ StringPairVector tokenize_and_align_flag_diacritics
+ (const std::string &input_string,
+ const std::string &output_string,
+ void (*warn_about_pair)(const std::pair<std::string, std::string> &symbol_pair)) const;
+
//! \brief If @a input_String is not valid utf-8, throw an
//! @a IncorrectUtf8CodingException.
//!
diff --git a/libhfst/src/HfstTransducer.cc b/libhfst/src/HfstTransducer.cc
index 2806202..d7c0083 100644
--- a/libhfst/src/HfstTransducer.cc
+++ b/libhfst/src/HfstTransducer.cc
@@ -3556,6 +3556,49 @@ bool substitute_unknown_identity_pairs
}
+HfstTransducer &HfstTransducer::merge
+(const HfstTransducer &another, const struct hfst::xre::XreConstructorArguments & args)
+{ /* Merge \a another into this transducer and return *this.
+   *
+   * Both operands are converted to HfstBasicTransducer and handed to
+   * HfstBasicTransducer::merge together with args.list_definitions; that
+   * call also reports the marker symbols it inserted (markers_added). The
+   * result is converted back to this transducer's type, and for every
+   * marker the paths matched by a "worsener" expression are subtracted
+   * before the marker itself is replaced by epsilon. \a args is also used
+   * to construct the XreCompiler that compiles the worsener expressions.
+   */
+ HfstBasicTransducer this_basic(*this);
+ HfstBasicTransducer another_basic(another);
+ std::set<std::string> markers_added; // filled in by HfstBasicTransducer::merge
+ HfstBasicTransducer result = hfst::implementations::HfstBasicTransducer::merge(this_basic, another_basic, args.list_definitions, markers_added);
+ HfstTransducer initial_merge(result, this->get_type());
+ initial_merge.minimize();
+
+ // filter non-optimal paths
+ // [ ? | #V ?:? ]* %#V:V ?:0 [ ? | #V ?:? | %#V:V ?:0 ]*
+ hfst::xre::XreCompiler xre_(args);
+ xre_.set_verbosity(false, NULL);
+ // one filtering round per marker that the basic merge inserted
+ for (std::set<std::string>::const_iterator it = markers_added.begin(); it != markers_added.end(); it++)
+ {
+ std::string marker = *it;
+ std::string symbol(1, it->at(1)); // @X@ -> X; NOTE(review): keeps only the single character at index 1, so a marker wrapping a longer symbol would be truncated -- confirm markers are always of the form @X@
+ std::string worsener_string("[ ? | \"" + marker + "\" ?:? ]* \"" + marker + "\":" + symbol + " ?:0 [ ? | \"" + marker + "\" ?:? | \"" + marker + "\":" + symbol + " ?:0 ]* ;");
+ // compile the expression; the compiler hands back a heap object that is deleted below
+ HfstTransducer * worsener = xre_.compile(worsener_string);
+ assert(worsener != NULL);
+ worsener->minimize();
+ HfstTransducer cp(initial_merge); // work on a copy so initial_merge stays intact for the subtraction
+ cp.compose(*worsener).output_project().minimize();
+ delete worsener;
+ // drop the paths produced above, then turn the marker into epsilon
+ initial_merge.subtract(cp).minimize();
+ initial_merge.substitute(marker, internal_epsilon);
+ // if no transition uses the symbol any more, remove it from the alphabet as well
+ HfstBasicTransducer fsm(initial_merge);
+ StringSet symbols = fsm.symbols_used();
+ if (symbols.find(symbol) == symbols.end())
+ {
+ initial_merge.remove_from_alphabet(symbol);
+ }
+ }
+
+ *this = initial_merge;
+ return *this;
+}
+
HfstTransducer &HfstTransducer::compose
(const HfstTransducer &another,
bool harmonize)
diff --git a/libhfst/src/HfstTransducer.h b/libhfst/src/HfstTransducer.h
index d1835c3..8fad045 100644
--- a/libhfst/src/HfstTransducer.h
+++ b/libhfst/src/HfstTransducer.h
@@ -1249,6 +1249,10 @@ ccc : ddd
HfstTransducer &compose(const HfstTransducer &another,
bool harmonize=true);
+ HfstTransducer &merge(const HfstTransducer &another, const std::map<std::string, std::set<std::string> > & list_symbols);
+
+ HfstTransducer &merge(const HfstTransducer &another, const struct hfst::xre::XreConstructorArguments & args);
+
/** \brief Compose this transducer with the intersection of
transducers in \a v. If \a invert is true, then compose the
intersection of the transducers in \a v with this transducer.
diff --git a/libhfst/src/Makefile.am b/libhfst/src/Makefile.am
index 93f21d9..3da3c4e 100644
--- a/libhfst/src/Makefile.am
+++ b/libhfst/src/Makefile.am
@@ -97,7 +97,7 @@ HFST_HDRS = \
hfstinclude_HEADERS = $(HFST_HDRS)
-libhfst_la_LDFLAGS = -no-undefined -version-info 39:0:0
+libhfst_la_LDFLAGS = -no-undefined -version-info 40:0:0
LIBHFST_TSTS=HfstApply HfstInputStream HfstTransducer \
HfstOutputStream HfstXeroxRules HfstRules HfstSymbolDefs \
diff --git a/libhfst/src/implementations/ConvertOlTransducer.cc b/libhfst/src/implementations/ConvertOlTransducer.cc
index ad46db3..365d8ef 100644
--- a/libhfst/src/implementations/ConvertOlTransducer.cc
+++ b/libhfst/src/implementations/ConvertOlTransducer.cc
@@ -171,23 +171,17 @@ void get_states_and_symbols(
StringSet * flag_diacritics = new StringSet();
StringSet * other_symbols = new StringSet();
- std::map<unsigned int, unsigned int> * relabeled_states =
- new std::map<unsigned int, unsigned int>();
unsigned int first_transition = 0;
- unsigned int source_state=0;
+ unsigned int state_number = 0;
for (HfstBasicTransducer::const_iterator it = t->begin();
it != t->end(); ++it) {
- unsigned int state_number = state_placeholders.size();
- if (state_number != source_state) {
- relabeled_states->operator[](source_state) = state_number;
- }
hfst_ol::Weight final_w = 0.0;
- if (t->is_final_state(source_state)) {
- final_w = t->get_final_weight(source_state);
+ if (t->is_final_state(state_number)) {
+ final_w = t->get_final_weight(state_number);
}
state_placeholders.push_back(hfst_ol::StatePlaceholder(
state_number,
- t->is_final_state(source_state),
+ t->is_final_state(state_number),
first_transition,
final_w));
++first_transition; // there's a padding entry between states
@@ -207,7 +201,7 @@ void get_states_and_symbols(
other_symbols->insert(tr_it->get_output_symbol());
}
}
- source_state++;
+ ++state_number;
}
std::map<std::string, SymbolNumber> string_symbol_map;
@@ -265,41 +259,31 @@ void get_states_and_symbols(
delete input_symbols;
delete flag_diacritics;
delete other_symbols;
-
+
// Do a second pass over the transitions, figuring out everything
// about the states except starting indices
- source_state=0;
+ state_number = 0;
for (HfstBasicTransducer::const_iterator it = t->begin();
it != t->end(); ++it) {
for (HfstBasicTransducer::HfstTransitions::const_iterator tr_it
= it->begin();
tr_it != it->end(); ++tr_it) {
- unsigned int state_number = source_state;
- if (relabeled_states->count(state_number) != 0) {
- state_number = relabeled_states->operator[](state_number);
- }
- // check for previously unseen inputs
- if (state_placeholders[state_number].inputs.count(
- string_symbol_map[tr_it->get_input_symbol()]) == 0) {
- state_placeholders[state_number].inputs[
- string_symbol_map[tr_it->get_input_symbol()]] =
- std::vector<hfst_ol::TransitionPlaceholder>();
- }
- unsigned int target = tr_it->get_target_state();
- if (relabeled_states->count(target) != 0) {
- target = relabeled_states->operator[](target);
- }
+ // add input in case we're seeing it the first time
+ state_placeholders[state_number].add_input(
+ string_symbol_map[tr_it->get_input_symbol()],
+ flag_symbols);
+ unsigned int target = tr_it->get_target_state();
hfst_ol::TransitionPlaceholder trans(
target,
+ string_symbol_map[tr_it->get_input_symbol()],
string_symbol_map[tr_it->get_output_symbol()],
tr_it->get_weight());
- state_placeholders[state_number]
- .inputs[string_symbol_map[tr_it->get_input_symbol()]].push_back(trans);
+ SymbolNumber input_sym = string_symbol_map[tr_it->get_input_symbol()];
+ state_placeholders[state_number].add_transition(trans);
}
- source_state++;
+ ++state_number;
}
- delete relabeled_states;
}
/* Create an hfst_ol::Transducer equivalent to HfstBasicTransducer \a t.
@@ -333,12 +317,12 @@ void get_states_and_symbols(
flag_symbols,
harmonizer_ol);
- // For determining the index table we first sort the states (excepting
+ // For determining the index table we first sort the states (excepting
// the starting state) by number of different input symbols.
- if (state_placeholders.begin() != state_placeholders.end()) {
- std::sort(state_placeholders.begin() + 1, state_placeholders.end(),
- hfst_ol::compare_states_by_input_size);
- }
+ // if (state_placeholders.begin() != state_placeholders.end()) {
+ // std::sort(state_placeholders.begin() + 1, state_placeholders.end(),
+ // hfst_ol::compare_states_by_input_size);
+ // }
hfst_ol::IndexPlaceholders * used_indices =
new hfst_ol::IndexPlaceholders();
@@ -360,107 +344,85 @@ void get_states_and_symbols(
for (std::vector<hfst_ol::StatePlaceholder>::iterator it =
state_placeholders.begin();
it != state_placeholders.end(); ++it) {
- if (it->is_simple(flag_symbols) and it->state_number != 0) {
+ if (it->is_simple()) {
continue;
}
unsigned int i = first_available_index;
// While this index is not suitable for a starting index, keep looking
- if (!quick) {
while (!used_indices->fits(*it, flag_symbols, i)) {
- ++i;
+ ++i;
}
- }
it->start_index = i;
- previous_successful_index = i;
+ previous_successful_index = i;
// Once we've found a starting index, insert a finality marker and
- // mark all the used indices
- used_indices->operator[](i) =
- std::pair<unsigned int, SymbolNumber>(
- it->state_number, NO_SYMBOL_NUMBER);
- for (std::map<SymbolNumber,
- std::vector<hfst_ol::TransitionPlaceholder> >
- ::iterator sym_it = it->inputs.begin();
- sym_it != it->inputs.end(); ++sym_it) {
- SymbolNumber index_offset = sym_it->first;
+ // mark all the used indices
+ used_indices->assign(i, it->state_number, NO_SYMBOL_NUMBER);
+ for (std::vector<std::vector<hfst_ol::TransitionPlaceholder> >
+ ::const_iterator tr_it = it->transition_placeholders.begin();
+ tr_it != it->transition_placeholders.end(); ++tr_it) {
+ SymbolNumber index_offset = tr_it->at(0).input;
if (flag_symbols.count(index_offset) != 0) {
index_offset = 0;
}
- used_indices->operator[](i + index_offset + 1) =
- std::pair<unsigned int, SymbolNumber>
- (it->state_number, index_offset);
+ used_indices->assign(i + index_offset + 1, it->state_number, index_offset);
}
- if (quick) {
- first_available_index = used_indices->rbegin()->first + 1;
- continue;
- }
- while (used_indices->unsuitable(
- first_available_index, seen_input_symbols,
- packing_aggression)) {
- ++first_available_index;
- }
- if (first_available_index == previous_first_index) {
- if (floor_stuck_counter > floor_jump_threshold) {
- SymbolNumber index_offset = it->inputs.rbegin()->first;
- if (flag_symbols.count(index_offset) != 0) {
- index_offset = 0;
- }
- first_available_index =
- previous_successful_index + 1 + index_offset;
- while (used_indices->unsuitable(
- first_available_index,
- seen_input_symbols, packing_aggression)) {
+
+ while (used_indices->unsuitable(first_available_index, seen_input_symbols, packing_aggression)) {
++first_available_index;
}
- floor_stuck_counter = 0;
- previous_first_index = first_available_index;
+ if (first_available_index == previous_first_index) {
+ if (floor_stuck_counter > floor_jump_threshold) {
+ first_available_index = previous_successful_index + 1;
+ floor_stuck_counter = 0;
+ previous_first_index = first_available_index;
+ } else {
+ ++floor_stuck_counter;
+ }
} else {
- ++floor_stuck_counter;
+ previous_first_index = first_available_index;
+ floor_stuck_counter = 0;
}
- } else {
- previous_first_index = first_available_index;
- floor_stuck_counter = 0;
- }
}
// Now resort by state number for the rest
// (this could definitely be neater...)
- if (state_placeholders.begin() != state_placeholders.end()) {
- std::sort(state_placeholders.begin() + 1, state_placeholders.end(),
- hfst_ol::compare_states_by_state_number);
- }
+ // if (state_placeholders.begin() != state_placeholders.end()) {
+ // std::sort(state_placeholders.begin() + 1, state_placeholders.end(),
+ // hfst_ol::compare_states_by_state_number);
+ // }
// Now for each index entry we write its input symbol and target
hfst_ol::TransducerTable<hfst_ol::TransitionWIndex> windex_table;
unsigned int greatest_index = 0;
- if (used_indices->size() != 0) {
- greatest_index = used_indices->rbegin()->first;
+ if (used_indices->indices.size() != 0) {
+ greatest_index = used_indices->indices.size() - 1;
}
for(unsigned int i = 0; i <= greatest_index; ++i) {
- if (used_indices->count(i) == 0) { // blank entries
+ if (!used_indices->used(i)) { // blank entries
windex_table.append(hfst_ol::TransitionWIndex());
- } else if (used_indices->operator[](i).second ==
- NO_SYMBOL_NUMBER) { // finality markers
- if (state_placeholders[used_indices->operator[](i).first].final) {
- windex_table.append(
- hfst_ol::TransitionWIndex::create_final(
- state_placeholders[
- used_indices->operator[](i).first].final_weight));
- } else {
- windex_table.append(hfst_ol::TransitionWIndex());
- }
- } else { // actual entries
- unsigned int idx = used_indices->operator[](i).first;
- SymbolNumber sym = used_indices->operator[](i).second;
+ } else if (used_indices->get_target(i).second ==
+ NO_SYMBOL_NUMBER) { // finality markers
+ if (state_placeholders[used_indices->get_target(i).first].final) {
+ windex_table.append(
+ hfst_ol::TransitionWIndex::create_final(
+ state_placeholders[
+ used_indices->get_target(i).first].final_weight));
+ } else {
+ windex_table.append(hfst_ol::TransitionWIndex());
+ }
+ } else { // actual entries
+ unsigned int idx = used_indices->get_target(i).first;
+ SymbolNumber sym = used_indices->get_target(i).second;
windex_table.append(
- hfst_ol::TransitionWIndex(
- sym,
- state_placeholders[idx].first_transition +
- state_placeholders[idx].symbol_offset(
- sym, flag_symbols) + TA_OFFSET));
+ hfst_ol::TransitionWIndex(
+ sym,
+ state_placeholders[idx].first_transition +
+ state_placeholders[idx].symbol_offset(
+ sym, flag_symbols) + TA_OFFSET));
}
}
diff --git a/libhfst/src/implementations/HfstTransitionGraph.h b/libhfst/src/implementations/HfstTransitionGraph.h
index 24096f2..8641699 100644
--- a/libhfst/src/implementations/HfstTransitionGraph.h
+++ b/libhfst/src/implementations/HfstTransitionGraph.h
@@ -276,7 +276,7 @@
/* Check that all symbols that occur in the transitions of the graph
are also in the alphabet. */
- bool check_alphabet()
+ bool check_alphabet()
{
for (iterator it = begin(); it != end(); it++)
{
@@ -2732,7 +2732,7 @@
{
HfstState new_state = add_state();
std::string marker = weight2marker(IT->get_weight());
- std::cerr << "got marker '" << marker << "'" << std::endl;
+ //std::cerr << "got marker '" << marker << "'" << std::endl;
HfstTransition <C> marker_transition(IT->get_target_state(),
marker,
marker,
@@ -2949,7 +2949,7 @@
if ( (!marker2weight(data.get_input_symbol(), weight)) &&
marker2weight(data.get_output_symbol(), weight) )
{
- std::cerr << "got weight '" << weight << "'" << std::endl;
+ //std::cerr << "got weight '" << weight << "'" << std::endl;
// schedule a substitution
new_transitions.push_back
(HfstTransition <C> (tr_it->get_target_state(),
@@ -2963,7 +2963,7 @@
else if (marker2weight(data.get_input_symbol(), weight) &&
marker2weight(data.get_output_symbol(), weight) )
{
- std::cerr << "got weight '" << weight << "'" << std::endl;
+ //std::cerr << "got weight '" << weight << "'" << std::endl;
// schedule the old transition to be deleted
old_transitions.push(tr_it);
}
@@ -3869,24 +3869,24 @@
}
// Returns whether tr is "^]":"^]". If tr is not allowed, throws an error message.
- bool check_regexp_transition_end(const HfstBasicTransition & tr)
+ bool check_regexp_transition_end(const HfstBasicTransition & tr, bool input_side)
{
std::string istr = tr.get_input_symbol();
std::string ostr = tr.get_output_symbol();
- if (is_special_symbol(istr) || is_special_symbol(ostr))
+ if ((input_side && is_special_symbol(istr)) || (!input_side && is_special_symbol(ostr)))
{
throw "error: special symbol detected in compile-replace regular expression";
}
- if (("^[" == istr) || ("^[" == ostr))
+ if ((input_side && ("^[" == istr)) || (!input_side && ("^[" == ostr)))
{
throw "error: ^[ detected inside compile-replace regular expression";
}
- if (("^]" == istr) || ("^]" == ostr))
+ if ((input_side && ("^]" == istr)) || (!input_side && ("^]" == ostr)))
{
- if (istr != ostr)
+ /*if (istr != ostr)
{
throw "error: ^] detected on only one side of transition inside compile-replace regular expression";
- }
+ }*/
return true;
}
return false;
@@ -3904,7 +3904,7 @@
(HfstState s,
std::set<HfstState> & states_visited,
std::vector<std::pair<std::string, std::string> > & path,
- HfstReplacements & full_paths)
+ HfstReplacements & full_paths, bool input_side)
{
// no cycles allowed inside "^[" and "^]"
check_regexp_state_for_cycle(s, states_visited);
@@ -3918,11 +3918,12 @@
it != transitions.end(); it++)
{
// closing bracket..
- if (check_regexp_transition_end(*it)) // throws error message if *it is not a valid transition
+ if (check_regexp_transition_end(*it, input_side)) // throws error message if *it is not a valid transition
{
// ..cannot lead to a state already visited..
check_regexp_state_for_cycle(it->get_target_state(), states_visited);
// ..but else we can add the expression that it ends to the results
+ path.push_back(std::pair<std::string, std::string>(it->get_input_symbol(), it->get_output_symbol()));
full_paths.push_back
(HfstReplacement(it->get_target_state(), path));
}
@@ -3934,7 +3935,7 @@
(it->get_target_state(),
states_visited,
path,
- full_paths);
+ full_paths, input_side);
path.pop_back();
}
}
@@ -3953,7 +3954,8 @@
// Weights are currently ignored.
void find_regexp_paths
(HfstState s,
- std::vector<std::pair<HfstState, std::vector<std::pair<std::string, std::string> > > > & full_paths)
+ std::vector<std::pair<HfstState, std::vector<std::pair<std::string, std::string> > > > & full_paths,
+ bool input_side)
{
// go through all transitions
const HfstBasicTransducer::HfstTransitions &transitions
@@ -3964,17 +3966,18 @@
{
std::string istr = it->get_input_symbol();
std::string ostr = it->get_output_symbol();
- if ("^[" == istr || "^[" == ostr)
+ if ((input_side && ("^[" == istr)) || (!input_side && ("^[" == ostr)))
{
- if (istr != ostr)
+ /*if (istr != ostr)
{
throw "error: ^[ detected on only one side of transition";
- }
+ }*/
std::set<HfstState> states_visited;
states_visited.insert(s);
std::vector<std::pair<std::string, std::string> > path;
- find_regexp_paths(it->get_target_state(), states_visited, path, full_paths);
- fprintf(stderr, "%u regexp paths found for state %u\n", (unsigned int)full_paths.size(), s);
+ path.push_back(std::pair<std::string, std::string>(istr, ostr));
+ find_regexp_paths(it->get_target_state(), states_visited, path, full_paths, input_side);
+ //fprintf(stderr, "%u regexp paths found for state %u\n", (unsigned int)full_paths.size(), s); // debug
}
}
}
@@ -3982,15 +3985,15 @@
// Find all subpaths of form "^[" [x:y]* "^]" (x and y cannot be "^[" or "^]") and return them.
// retval[start_state] == vector(pair(end_state, vector(pair(isymbol,osymbol) ) ) )
// Weights are currently ignored.
- HfstReplacementsMap find_replacements()
+ HfstReplacementsMap find_replacements(bool input_side)
{
HfstReplacementsMap replacements;
unsigned int state = 0;
for (iterator it = begin(); it != end(); it++)
{
- fprintf(stderr, "state %u......\n", state);
+ //fprintf(stderr, "state %u......\n", state); // debug
HfstReplacements full_paths;
- find_regexp_paths(state, full_paths);
+ find_regexp_paths(state, full_paths, input_side);
if (full_paths.size() > 0)
{
replacements[state] = full_paths;
@@ -4184,7 +4187,7 @@
graph1.sort_arcs();
graph2.sort_arcs();
state_map[StatePair(0, 0)] = 0; // initial states
-
+
if (graph1.is_final_state(0) && graph2.is_final_state(0))
{
float final_weight = std::min(graph1.get_final_weight(0), graph2.get_final_weight(0));
@@ -4198,6 +4201,9 @@
+
+ // HERE BEGINS
+
// A function used by find_matches_for_merge
// Copy matching transition graph_tr/merger_tr to state \a result_state in \a result and return
// the target state of that transition. Also make that state final, if needed.
@@ -4229,8 +4235,7 @@
// the target state of that transition. Also make that state final, if needed.
static HfstState handle_list_match(const HfstTransitionGraph & graph, const HfstTransition <C> & graph_transition,
const HfstTransitionGraph & merger, const HfstTransition <C> & merger_transition,
- HfstTransitionGraph & result, HfstState result_state, StateMap & state_map)
-
+ HfstTransitionGraph & result, HfstState result_state, StateMap & state_map, std::set<std::string> & markers_added)
{
HfstState graph_target = graph_transition.get_target_state();
HfstState merger_target = merger_transition.get_target_state();
@@ -4239,8 +4244,16 @@
(graph_target, merger_target, state_map, result, was_new_state);
// The sum of weight is copied to the resulting intersection.
float transition_weight = graph_transition.get_weight() + merger_transition.get_weight();
+
+ // testing: add a marker
+ HfstState extra_state = result.add_state();
result.add_transition
(result_state, HfstTransition <C>
+ (extra_state, "@" + graph_transition.get_input_symbol() + "@", "@" + graph_transition.get_output_symbol() + "@", 0));
+ markers_added.insert("@" + graph_transition.get_input_symbol() + "@");
+
+ result.add_transition
+ (extra_state /*result_state*/, HfstTransition <C>
(retval, merger_transition.get_input_symbol(), merger_transition.get_output_symbol(), transition_weight));
// For each new state added, check if the corresponding states in \a graph1 and \a graph2
// are final. If they are, make the new state final with the sum of final weights.
@@ -4251,17 +4264,55 @@
}
return retval;
}
+
-
- static bool is_list_symbol(const C & transition_data)
+
+ static bool is_list_symbol(const C & transition_data, const std::map<std::string, std::set<std::string> > & list_symbols)
{
- return false;
+ std::string isymbol = transition_data.get_input_symbol();
+ std::string osymbol = transition_data.get_output_symbol();
+
+ if (isymbol != osymbol)
+ {
+ throw "is_list_symbol: input and output symbols must be the same";
+ }
+ return (list_symbols.find(isymbol) != list_symbols.end());
}
- static bool is_list_match(const C & graph_transition_data, const C & merger_transition_data)
+ /*
+ // @pre \a transition_data is a list symbol
+ // @pre list symbols cannot contain '_' or '@'
+ static std::set<std::string> get_list_symbols(const std::string & list_symbol)
{
- return false;
- }
+ std::set<std::string> result;
+ unsigned int i = 6;
+
+ // skip list name
+ while(list_symbol[i] != '_')
+ {
+ i++;
+ }
+ i++;
+
+ // extract symbols
+ std::string symbol("");
+ while (list_symbol[i] != '@')
+ {
+ if (list_symbol[i] == '_')
+ {
+ result.insert(symbol);
+ symbol = std::string("");
+ }
+ else
+ {
+ symbol.append(1, list_symbol[i]);
+ }
+ i++;
+ }
+ result.insert(symbol);
+
+ return result;
+ }*/
// A recursive function used by function intersect.
//
@@ -4278,7 +4329,8 @@
// @pre \a graph and \a merger must be deterministic. (todo: handle equivalent transitions, maybe even epsilons?)
static void find_matches_for_merge
(HfstTransitionGraph & graph, HfstState graph_state, HfstTransitionGraph & merger, HfstState merger_state,
- HfstTransitionGraph & result, HfstState result_state, StateMap & state_map, std::set<HfstState> & agenda)
+ HfstTransitionGraph & result, HfstState result_state, StateMap & state_map, std::set<HfstState> & agenda,
+ const std::map<std::string, std::set<std::string> > & list_symbols, std::set<std::string> & markers_added)
{
agenda.insert(result_state); // do not handle \a result_state twice
HfstTransitions & graph_transitions = graph.state_vector[graph_state]; // transitions of graph
@@ -4296,22 +4348,30 @@
const C & graph_transition_data = graph_transition.get_transition_data();
// List symbols must be checked separately
- if (is_list_symbol(graph_transition_data))
+ if (is_list_symbol(graph_transition_data, list_symbols))
{
+ const std::set<std::string> & symbol_list = list_symbols.find(graph_transition_data.get_input_symbol())->second;
bool list_match_found=false;
// Find all matches
for(unsigned int j=0; j < merger_transitions.size(); j++)
{
HfstTransition <C> & merger_transition = merger_transitions[j];
const C & merger_transition_data = merger_transition.get_transition_data();
+ const std::string & isymbol = merger_transition_data.get_input_symbol();
+ const std::string & osymbol = merger_transition_data.get_output_symbol();
+
+ if (isymbol != osymbol)
+ {
+ throw "find_matches_for_merge: input and output symbols must be the same";
+ }
- if (is_list_match(graph_transition_data, merger_transition_data))
+ if (symbol_list.find(isymbol) != symbol_list.end())
{
list_match_found=true;
- HfstState target = handle_list_match(graph, graph_transition, merger, merger_transition, result, result_state, state_map);
+ HfstState target = handle_list_match(graph, graph_transition, merger, merger_transition, result, result_state, state_map, markers_added);
if (agenda.find(target) == agenda.end())
{
- find_matches_for_merge(graph, graph_transition.get_target_state(), merger, merger_transition.get_target_state(), result, target, state_map, agenda);
+ find_matches_for_merge(graph, graph_transition.get_target_state(), merger, merger_transition.get_target_state(), result, target, state_map, agenda, list_symbols, markers_added);
}
}
}
@@ -4325,7 +4385,7 @@
HfstState target = handle_non_list_match(graph, graph_transition, merger, merger_state, result, result_state, state_map);
if (agenda.find(target) == agenda.end())
{
- find_matches_for_merge(graph, graph_transition.get_target_state(), merger, /*merger_transition.get_target_state()*/ merger_state, result, target, state_map, agenda);
+ find_matches_for_merge(graph, graph_transition.get_target_state(), merger, /*merger_transition.get_target_state()*/ merger_state, result, target, state_map, agenda, list_symbols, markers_added);
}
// --- A transition in graph compared for all corresponding transitions in merger, compare next transition. ---
}
@@ -4334,7 +4394,7 @@
}
static HfstTransitionGraph merge
- (HfstTransitionGraph & graph, HfstTransitionGraph & merger)
+ (HfstTransitionGraph & graph, HfstTransitionGraph & merger, const std::map<std::string, std::set<std::string> > & list_symbols, std::set<std::string> & markers_added)
{
HfstTransitionGraph result;
StateMap state_map;
@@ -4349,7 +4409,14 @@
result.set_final_weight(0, final_weight);
}
- find_matches_for_merge(graph, 0, merger, 0, result, 0, state_map, agenda);
+ try
+ {
+ find_matches_for_merge(graph, 0, merger, 0, result, 0, state_map, agenda, list_symbols, markers_added);
+ }
+ catch (const char * msg)
+ {
+ HFST_THROW_MESSAGE(TransducersAreNotAutomataException, std::string(msg));
+ }
return result;
}
@@ -4400,8 +4467,8 @@
HfstFastTransducer;
- }
+ }
-}
+ }
#endif // #ifndef _HFST_TRANSITION_GRAPH_H_
diff --git a/libhfst/src/implementations/optimized-lookup/convert.cc b/libhfst/src/implementations/optimized-lookup/convert.cc
index 90b5da6..a7f3363 100644
--- a/libhfst/src/implementations/optimized-lookup/convert.cc
+++ b/libhfst/src/implementations/optimized-lookup/convert.cc
@@ -36,36 +36,36 @@ void write_transitions_from_state_placeholders(
// Then we iterate through the symbols each state has.
// First we do a pass for epsilon and flags (they have to come
// first), then everything else.
- if (it->inputs.count(0) != 0) {
- add_transitions_with(0, it->inputs[0],
- transition_table,
- state_placeholders,
- flag_symbols);
+ if (it->input_present(0)) {
+ add_transitions_with(0, it->get_transition_placeholders(0),
+ transition_table,
+ state_placeholders,
+ flag_symbols);
}
for (std::set<hfst_ol::SymbolNumber>::iterator flag_it =
flag_symbols.begin(); flag_it != flag_symbols.end();
++flag_it) {
- if (it->inputs.count(*flag_it) != 0) {
- hfst_ol::add_transitions_with(*flag_it,
- it->inputs[*flag_it],
- transition_table,
- state_placeholders,
- flag_symbols);
-
+ if (it->input_present(*flag_it)) {
+ hfst_ol::add_transitions_with(
+ *flag_it,
+ it->get_transition_placeholders(*flag_it),
+ transition_table,
+ state_placeholders,
+ flag_symbols);
+
}
}
- for (std::map<SymbolNumber,
- std::vector<TransitionPlaceholder> >::iterator sym_it =
- it->inputs.begin();
- sym_it != it->inputs.end(); ++sym_it) {
- if (sym_it->first == 0 or flag_symbols.count(sym_it->first) != 0) {
+ for (unsigned int i = 1; i < it->symbol_to_transition_placeholder_v.size();
+ ++i) {
+ if (!it->input_present(i) ||
+ flag_symbols.count(i) != 0) {
continue;
}
- hfst_ol::add_transitions_with(sym_it->first,
- it->inputs[sym_it->first],
- transition_table,
- state_placeholders,
- flag_symbols);
+ hfst_ol::add_transitions_with(i,
+ it->get_transition_placeholders(i),
+ transition_table,
+ state_placeholders,
+ flag_symbols);
}
}
@@ -88,7 +88,7 @@ void add_transitions_with(SymbolNumber symbol,
// before writing each transition, find out whether its
// target is simple (ie. should point directly to TA entry)
unsigned int target;
- if (state_placeholders[it->target].is_simple(flag_symbols)) {
+ if (state_placeholders[it->target].is_simple()) {
target = state_placeholders[it->target].first_transition +
TRANSITION_TARGET_TABLE_START - 1;
} else {
@@ -106,7 +106,7 @@ bool compare_states_by_input_size(
const StatePlaceholder & lhs, const StatePlaceholder & rhs)
{
// descending by input size
- return lhs.inputs.size() > rhs.inputs.size();
+ return lhs.inputs > rhs.inputs;
}
bool compare_states_by_state_number(
@@ -504,7 +504,7 @@ void ConvertFstState::set_transition_indices(void)
previous_symbol = input_symbol;
}
}
- if(input_symbol == 0) { zero_transitions = true; }
+ if(input_symbol == 0) { zero_transitions = true; }
++position;
}
}
@@ -724,9 +724,9 @@ PlaceHolderVector::size_type ConvertTransitionTableIndices::add_state(
{
// Only try the first 100 indices.
// if (index > lower_bound+100000)
- // {
- // index = last_full_index()+1;
- // }
+ //{
+ // index = last_full_index()+1;
+ //}
if((index + number_of_input_symbols + 1) >= indices.size())
get_more_space();
@@ -872,7 +872,7 @@ void ConvertTransducerHeader::compute_header(TransducerHeader& header,
if(!header.weighted)
header.has_unweighted_input_epsilon_cycles =
header.has_input_epsilon_cycles;
-}
+}
ConvertTransducer* ConvertTransducer::constructing_transducer = NULL;
diff --git a/libhfst/src/implementations/optimized-lookup/convert.h b/libhfst/src/implementations/optimized-lookup/convert.h
index b68e4ef..58d137d 100644
--- a/libhfst/src/implementations/optimized-lookup/convert.h
+++ b/libhfst/src/implementations/optimized-lookup/convert.h
@@ -27,25 +27,32 @@ namespace hfst_ol {
struct TransitionPlaceholder {
unsigned int target;
+ SymbolNumber input;
SymbolNumber output;
float weight;
-TransitionPlaceholder(unsigned int t, SymbolNumber o, float w):
+ TransitionPlaceholder(unsigned int t, SymbolNumber i, SymbolNumber o, float w):
target(t),
+ input(i),
output(o),
weight(w)
{}
};
-typedef std::map<SymbolNumber, std::vector<TransitionPlaceholder> >
- SymbolTransitionsMap;
+//typedef std::map<SymbolNumber, std::vector<TransitionPlaceholder> >
+// SymbolTransitionsMap;
struct StatePlaceholder {
+ enum indexing_type {empty, simple_zero_index, simple_nonzero_index, nonsimple};
+
unsigned int state_number;
unsigned int start_index;
unsigned int first_transition;
- SymbolTransitionsMap inputs;
+ std::vector<unsigned int> symbol_to_transition_placeholder_v;
+ std::vector<std::vector<TransitionPlaceholder> > transition_placeholders;
+ indexing_type type;
+ SymbolNumber inputs;
bool final;
float final_weight;
StatePlaceholder (unsigned int state, bool finality, unsigned int first,
@@ -54,101 +61,136 @@ struct StatePlaceholder {
start_index(UINT_MAX),
first_transition(first),
final(finality),
- final_weight(final_weight)
- {}
+ final_weight(final_weight),
+ type(state == 0 ? nonsimple: empty),
+ inputs(0)
+ { }
StatePlaceholder ():
state_number(UINT_MAX),
start_index(UINT_MAX),
first_transition(UINT_MAX),
final(false),
- final_weight(0.0)
+ final_weight(0.0),
+ type(empty),
+ inputs(0)
{ }
- bool is_simple(std::set<SymbolNumber> const & flag_symbols) const
+ bool is_simple(void) const
{
- if (state_number == 0) {
- return false;
- }
- if (flag_symbols.size() == 0) {
- return inputs.size() < 2;
- }
- bool have_zero = false;
- SymbolNumber input_symbols = 0;
- for(SymbolTransitionsMap::const_iterator it = inputs.begin();
- it != inputs.end(); ++it) {
- if ((it->first == 0) or (flag_symbols.count(it->first) != 0)) {
- if (!have_zero) {
- have_zero = true;
- ++input_symbols;
- }
- } else {
- ++input_symbols;
- }
- if (input_symbols > 1) {
- return false;
- }
- }
- return true;
+ return type != nonsimple;
}
unsigned int number_of_transitions(void) const {
- unsigned int count = 0;
- for(SymbolTransitionsMap::const_iterator it = inputs.begin();
- it != inputs.end(); ++it) {
- count += it->second.size();
+ unsigned int count = 0;
+ for(std::vector<std::vector<TransitionPlaceholder> >::const_iterator it
+ = transition_placeholders.begin();
+ it != transition_placeholders.end(); ++it) {
+ count += it->size();
+ }
+ return count;
}
- return count;
+
+ // Tell whether this state already has a transition bucket for \a input.
+ // A symbol counts as present when it lies inside the lookup vector and
+ // its slot holds something other than the UINT_MAX "unused" marker.
+ bool input_present(SymbolNumber input) const {
+     if (input >= symbol_to_transition_placeholder_v.size()) {
+         return false;
+     }
+     return symbol_to_transition_placeholder_v[input] != UINT_MAX;
}
+
+ // Register \a input as an input symbol of this state (no-op if it is
+ // already registered): reserve an empty transition bucket for it, bump
+ // the input count and update the state's indexing type.
+ // \a flag_symbols is the set of flag symbol numbers; like epsilon (0)
+ // they index to position 0.
+ void add_input(SymbolNumber input, std::set<SymbolNumber> const & flag_symbols)
+ {
+     if (input_present(input)) {
+         return;
+     }
+     // Pad the lookup vector with "unused" markers up to and including input.
+     if (symbol_to_transition_placeholder_v.size() <= input) {
+         symbol_to_transition_placeholder_v.resize(input + 1, UINT_MAX);
+     }
+     symbol_to_transition_placeholder_v[input] = transition_placeholders.size();
+     transition_placeholders.push_back(std::vector<TransitionPlaceholder>());
+     ++inputs;
+
+     // Epsilons and flags both index to 0. A state stays simple only as
+     // long as it has a single kind of input.
+     const bool indexes_to_zero =
+         (input == 0) || (flag_symbols.count(input) == 1);
+     switch (type) {
+     case nonsimple:
+         // Once nonsimple, always nonsimple.
+         break;
+     case empty:
+         type = indexes_to_zero ? simple_zero_index : simple_nonzero_index;
+         break;
+     case simple_zero_index:
+         if (!indexes_to_zero) {
+             type = nonsimple;
+         }
+         break;
+     case simple_nonzero_index:
+         if (inputs > 1 || indexes_to_zero) {
+             type = nonsimple;
+         }
+         break;
+     }
+ }
+
+ // Return the input symbol belonging to the last slot of the symbol
+ // lookup vector, i.e. the largest symbol number registered through
+ // add_input(). Returns 0 for a state without any registered input.
+ SymbolNumber get_largest_index(void)
+ {
+     if (symbol_to_transition_placeholder_v.empty()) {
+         // Nothing registered yet: calling back() would be undefined.
+         return 0;
+     }
+     const std::vector<TransitionPlaceholder> & last_bucket =
+         transition_placeholders[symbol_to_transition_placeholder_v.back()];
+     if (last_bucket.empty()) {
+         // The symbol was registered but has no transition stored yet.
+         // add_input() indexes the lookup vector by symbol number, so the
+         // position of the last slot is the symbol itself.
+         return static_cast<SymbolNumber>(
+             symbol_to_transition_placeholder_v.size() - 1);
+     }
+     return last_bucket[0].input;
+ }
+
+ // Append \a trans to the bucket of its input symbol.
+ // NOTE(review): looks like add_input(trans.input, ...) must have been
+ // called first, the lookup below is unchecked -- confirm with callers.
+ void add_transition(TransitionPlaceholder & trans)
+ {
+     const unsigned int bucket = symbol_to_transition_placeholder_v[trans.input];
+     transition_placeholders[bucket].push_back(trans);
+ }
+
+ // Give mutable access to the transitions stored for \a input.
+ // The lookup is unchecked; test input_present(input) beforehand.
+ std::vector<TransitionPlaceholder> & get_transition_placeholders(SymbolNumber input)
+ {
+     const unsigned int bucket = symbol_to_transition_placeholder_v[input];
+     return transition_placeholders[bucket];
+ }
unsigned int symbol_offset(
SymbolNumber const symbol,
- std::set<SymbolNumber> const & flag_symbols) const {
+ std::set<SymbolNumber> const & flag_symbols) {
if (symbol == 0) {
return 0;
}
unsigned int offset = 0;
- if (flag_symbols.size() == 0) {
- for(SymbolTransitionsMap::const_iterator it = inputs.begin();
- it!= inputs.end(); ++it) {
- if (symbol == it->first) {
- return offset;
- }
- offset += it->second.size();
- }
-
- } else {
- if (inputs.count(0) != 0) { // if there are epsilons
- offset = inputs.find(0)->second.size();
- }
- for(std::set<SymbolNumber>::iterator flag_it = flag_symbols.begin();
- flag_it != flag_symbols.end(); ++flag_it) {
- if (inputs.count(*flag_it) != 0) { // if this flag is present
- if (symbol == *flag_it) {
- // Flags go to 0 (even if there's no epsilon)
- return 0;
- }
- offset += inputs.find(*flag_it)->second.size();
+ // if (flag_symbols.size() == 0) {
+ // for(int i = 0; i < symbol_to_transition_placeholder_v.size(); ++i) {
+ // if (symbol_to_transition_placeholder_v[i] != UINT_MAX) {
+ // if (symbol == i) {
+ // return offset;
+ // }
+ // offset += get_transition_placeholders(i).size();
+ // }
+ // }
+ // } else {
+ if (input_present(0)) { // if there are epsilons
+ offset = get_transition_placeholders(0).size();
+ }
+ for(std::set<SymbolNumber>::iterator flag_it = flag_symbols.begin();
+ flag_it != flag_symbols.end(); ++flag_it) {
+ if (input_present(*flag_it)) {
+ if (symbol == *flag_it) {
+ // Flags go to 0 (even if there's no epsilon)
+ return 0;
}
+ offset += get_transition_placeholders(*flag_it).size();
}
- for(SymbolTransitionsMap::const_iterator it = inputs.begin();
- it!= inputs.end(); ++it) {
- if (it->first == 0 || flag_symbols.count(it->first) != 0) {
+ }
+ for(unsigned int i = 1; i < symbol_to_transition_placeholder_v.size(); ++i) {
+ if (input_present(i)) {
+ if (flag_symbols.count(i) != 0) {
+ // already counted
continue;
}
- if (symbol == it->first) {
+ if (symbol == i) {
return offset;
}
- offset += it->second.size();
+ offset += get_transition_placeholders(i).size();
}
- std::string message("error in conversion between optimized lookup "
- "format and HfstTransducer;\ntried to calculate "
- "symbol_offset for symbol not present in state");
- HFST_THROW_MESSAGE
- (HfstFatalException,
- message);
}
- std::string message("error in function StatePlaceholder::symbol_offset");
- HFST_THROW_MESSAGE(HfstFatalException, message);
+ std::string message("error in conversion between optimized lookup "
+ "format and HfstTransducer;\ntried to calculate "
+ "symbol_offset for symbol not present in state");
+ HFST_THROW_MESSAGE
+ (HfstFatalException,
+ message);
}
};
@@ -157,35 +199,55 @@ bool compare_states_by_input_size(
bool compare_states_by_state_number(
const StatePlaceholder & lhs, const StatePlaceholder & rhs);
-class IndexPlaceholders: public std::map<unsigned int,
- std::pair<unsigned int, SymbolNumber> >
+struct IndexPlaceholders
{
-public:
+ std::vector<unsigned int> indices;
+ std::vector<std::pair<unsigned int, SymbolNumber> > targets;
+
+ // Tell whether \a position has been given a target through assign().
+ // Positions beyond the end of the index vector are unused.
+ bool used(unsigned int const position) const
+ {
+     if (position >= indices.size()) {
+         return false;
+     }
+     return indices[position] != NO_TABLE_INDEX;
+ }
+
+ // Store the (target, symbol) pair for \a position, growing the index
+ // vector with NO_TABLE_INDEX fillers when \a position lies past its end.
+ void assign(unsigned int const position, unsigned int target, SymbolNumber sym)
+ {
+     if (position >= indices.size()) {
+         indices.resize(static_cast<size_t>(position) + 1, NO_TABLE_INDEX);
+     }
+     // The new pair goes to the end of targets; remember where.
+     indices[position] = targets.size();
+     targets.push_back(std::make_pair(target, sym));
+ }
+
+ // Return the (target, symbol) pair assigned to position \a index.
+ // The lookup is unchecked; test used(index) beforehand.
+ std::pair<unsigned int, SymbolNumber> get_target(unsigned int index)
+ {
+     const unsigned int slot = indices[index];
+     return targets[slot];
+ }
+
bool fits(StatePlaceholder const & state,
std::set<SymbolNumber> const & flag_symbols,
unsigned int const position) const
- {
- if (count(position) != 0) {
- return false;
- }
- for (SymbolTransitionsMap::const_iterator it = state.inputs.begin();
- it != state.inputs.end(); ++it) {
- SymbolNumber index_offset = it->first;
- if (flag_symbols.count(index_offset) != 0) {
- index_offset = 0;
- }
- if (count(index_offset + position + 1) != 0) {
- return false;
+ {
+ if (used(position)) {
+ return false;
+ }
+ for (std::vector<std::vector<TransitionPlaceholder> >::const_iterator it = state.transition_placeholders.begin();
+ it != state.transition_placeholders.end(); ++it) {
+ SymbolNumber index_offset = it->at(0).input;
+ if (flag_symbols.count(index_offset) != 0) {
+ index_offset = 0;
+ }
+ if (used(index_offset + position + 1)) {
+ return false;
+ }
+ }
+ return true;
}
- }
- return true;
- }
bool unsuitable(unsigned int const index,
SymbolNumber const symbols,
float const packing_aggression) const
{
- if (count(index) != 0) {
+ if (used(index)) {
return true;
}
@@ -200,9 +262,9 @@ public:
unsigned int filled = 0;
for (unsigned int i = 0; i < symbols; ++i) {
- filled += count(index + i + 1);
+ filled += used(index + i + 1);
if (filled >= (packing_aggression*symbols)) {
- return true; // too full
+ return true; // too full
}
}
return false;
diff --git a/libhfst/src/implementations/optimized-lookup/pmatch.cc b/libhfst/src/implementations/optimized-lookup/pmatch.cc
index 5f7d1e7..cd7383e 100644
--- a/libhfst/src/implementations/optimized-lookup/pmatch.cc
+++ b/libhfst/src/implementations/optimized-lookup/pmatch.cc
@@ -5,22 +5,20 @@ namespace hfst_ol {
PmatchAlphabet::PmatchAlphabet(std::istream & inputstream,
SymbolNumber symbol_count):
- TransducerAlphabet(inputstream, symbol_count, false)
-{
- special_symbols[entry] = NO_SYMBOL_NUMBER;
- special_symbols[exit] = NO_SYMBOL_NUMBER;
- special_symbols[LC_entry] = NO_SYMBOL_NUMBER;
- special_symbols[LC_exit] = NO_SYMBOL_NUMBER;
- special_symbols[RC_entry] = NO_SYMBOL_NUMBER;
- special_symbols[RC_exit] = NO_SYMBOL_NUMBER;
- special_symbols[NLC_entry] = NO_SYMBOL_NUMBER;
- special_symbols[NLC_exit] = NO_SYMBOL_NUMBER;
- special_symbols[NRC_entry] = NO_SYMBOL_NUMBER;
- special_symbols[NRC_exit] = NO_SYMBOL_NUMBER;
- special_symbols[Pmatch_passthrough] = NO_SYMBOL_NUMBER;
- special_symbols[boundary] = NO_SYMBOL_NUMBER;
+ TransducerAlphabet(inputstream, symbol_count, false),
+ special_symbols(12, NO_SYMBOL_NUMBER) // SpecialSymbols enum
+{
+ symbol2lists = SymbolNumberVector(orig_symbol_count, NO_SYMBOL_NUMBER);
+ list2symbols = SymbolNumberVector(orig_symbol_count, NO_SYMBOL_NUMBER);
+ rtns = RtnVector(orig_symbol_count, NULL);
+ // We initialize the vector of which symbols have a printable representation
+ // with false, then flip those that actually do to true
+ printable_vector = std::vector<bool>(orig_symbol_count, false);
for (SymbolNumber i = 1; i < symbol_table.size(); ++i) {
add_special_symbol(symbol_table[i], i);
+ if (is_flag_diacritic(i)) {
+ printable_vector[i] = false;
+ }
}
}
@@ -28,19 +26,18 @@ PmatchAlphabet::PmatchAlphabet(void):
TransducerAlphabet()
{}
+// Add \a symbol to the alphabet through the base class and append a default
+// entry to each per-symbol vector kept by PmatchAlphabet: printable, no rtn,
+// not a list, member of no list.
+void PmatchAlphabet::add_symbol(const std::string & symbol)
+{
+    TransducerAlphabet::add_symbol(symbol);
+    printable_vector.push_back(true);
+    rtns.push_back(NULL);
+    list2symbols.push_back(NO_SYMBOL_NUMBER);
+    symbol2lists.push_back(NO_SYMBOL_NUMBER);
+}
+
bool PmatchAlphabet::is_printable(SymbolNumber symbol)
{
- if (symbol == 0 || symbol == NO_SYMBOL_NUMBER ||
- is_flag_diacritic(symbol) || is_end_tag(symbol) || is_guard(symbol)) {
- return false;
- }
- for (std::map<SpecialSymbol, SymbolNumber>::const_iterator it = special_symbols.begin();
- it != special_symbols.end(); ++it) {
- if (it->second == symbol) {
- return false;
- }
- }
- return true;
+ return symbol < printable_vector.size() && printable_vector[symbol];
}
void PmatchAlphabet::add_special_symbol(const std::string & str,
@@ -79,24 +76,90 @@ void PmatchAlphabet::add_special_symbol(const std::string & str,
rtn_names[name_from_insertion(str)] = symbol_number;
} else if (is_guard(str)) {
guards.push_back(symbol_number);
+ } else if (is_list(str)) {
+ process_symbol_list(str, symbol_number);
+ } else if (is_counter(str)) {
+ process_counter(str, symbol_number);
+ } else {
+ printable_vector[symbol_number] = true;
+ }
+}
+
+// Register the symbol list encoded in the special symbol \a str, which has
+// the form @PMATCH_LIST_<member>_<member>..._<member>@, under symbol number
+// \a sym. Members are separated by underscores; a member that is itself an
+// underscore shows up as an empty field. Members not yet in the alphabet
+// are added to it. For every member, \a sym is recorded in symbol_lists
+// (reached through symbol2lists); the member numbers themselves are stored
+// in symbol_list_members (reached through list2symbols[sym]).
+void PmatchAlphabet::process_symbol_list(std::string str, SymbolNumber sym)
+{
+    SymbolNumberVector list_symbols;
+    StringSymbolMap ss = build_string_symbol_map();
+    size_t begin = strlen("@PMATCH_LIST_");
+    size_t stop;
+    std::vector<std::string> collected_symbols;
+    while ((stop = str.find('_', begin)) != std::string::npos) {
+        // For each underscore after the prelude, grab the substring
+        std::string symbol = str.substr(begin, stop - begin);
+        if (symbol.size() == 0) {
+            // If the symbol _is_ an underscore it looks like we got an
+            // empty string; skip the member and the separator after it
+            symbol = "_";
+            begin = stop + 2;
+        } else {
+            begin = stop + 1;
+        }
+        collected_symbols.push_back(symbol);
+    }
+    // One at the end, between the last separator and the closing "@".
+    // When the list is empty or its last member was an underscore, nothing
+    // is left there: the length below would underflow and an empty-string
+    // member would be registered, so only take a non-empty remainder.
+    if (begin + strlen("@") < str.size()) {
+        collected_symbols.push_back(
+            str.substr(begin, str.size() - begin - strlen("@")));
+    }
+    // Process the symbols we found
+    for (std::vector<std::string>::const_iterator it = collected_symbols.begin();
+         it != collected_symbols.end(); ++it) {
+        SymbolNumber str_sym;
+        if (ss.count(*it) == 0) {
+            // This symbol isn't mentioned elsewhere in the alphabet
+            add_symbol(*it);
+            str_sym = orig_symbol_count;
+            ++orig_symbol_count;
+            // ss is a snapshot taken above; record the new symbol so that
+            // a repeated member reuses it instead of being added again
+            ss[*it] = str_sym;
+        } else {
+            str_sym = ss[*it];
+        }
+        list_symbols.push_back(str_sym);
+        if (symbol2lists[str_sym] == NO_SYMBOL_NUMBER) {
+            // First list this symbol belongs to
+            symbol2lists[str_sym] = symbol_lists.size();
+            symbol_lists.push_back(SymbolNumberVector(1, sym));
+        } else {
+            symbol_lists[symbol2lists[str_sym]].push_back(sym);
+        }
}
+    list2symbols[sym] = symbol_list_members.size();
+    symbol_list_members.push_back(list_symbols);
}
SymbolNumberVector PmatchAlphabet::get_specials(void) const
{
SymbolNumberVector v;
- for (std::map<SpecialSymbol, SymbolNumber>::const_iterator it =
+ for (SymbolNumberVector::const_iterator it =
special_symbols.begin(); it != special_symbols.end(); ++it) {
- if (it->second != NO_SYMBOL_NUMBER) {
- v.push_back(it->second);
+ if (*it != NO_SYMBOL_NUMBER) {
+ v.push_back(*it);
}
}
return v;
}
-PmatchContainer::PmatchContainer(std::istream & inputstream,
- bool _verbose, bool _extract_tags):
- verbose(_verbose),
+// Mark symbol \a sym as a counter and start its count at zero.
+// \a str (the counter's symbol string) is not needed here;
+// get_counter_name() reads the name back from the symbol table.
+void PmatchAlphabet::process_counter(std::string str, SymbolNumber sym)
+{
+    (void) str;
+    // Fill up non-counter spots in the counter vector with blanks. The slot
+    // is then addressed by index rather than appended, so the counter ends
+    // up at position sym even if the vector is already longer than that.
+    if (counters.size() <= sym) {
+        counters.resize(sym + 1, NO_COUNTER);
+    }
+    counters[sym] = 0;
+}
+
+// Increment the counter of \a sym; symbols that are not counters are ignored.
+void PmatchAlphabet::count(SymbolNumber sym)
+{
+    if (!is_counter(sym)) {
+        return;
+    }
+    ++counters[sym];
+}
+
+PmatchContainer::PmatchContainer(std::istream & inputstream):
+ verbose(false),
locate_mode(false),
recursion_depth_left(PMATCH_MAX_RECURSION_DEPTH),
entry_stack()
@@ -107,11 +170,11 @@ PmatchContainer::PmatchContainer(std::istream & inputstream,
// for once more established
TransducerHeader header(inputstream);
- orig_symbol_count = symbol_count = header.symbol_count();
alphabet = PmatchAlphabet(inputstream, header.symbol_count());
- alphabet.extract_tags = _extract_tags;
+ orig_symbol_count = symbol_count = alphabet.get_orig_symbol_count();
+ alphabet.extract_tags = locate_mode;
line_number = 0;
- encoder = new Encoder(alphabet.get_symbol_table(), header.input_symbol_count());
+ encoder = new Encoder(alphabet.get_symbol_table(), orig_symbol_count);
toplevel = new hfst_ol::PmatchTransducer(
inputstream,
header.index_table_size(),
@@ -158,9 +221,9 @@ PmatchContainer::PmatchContainer(std::istream & inputstream,
alphabet.get_rtn(*it)->collect_possible_first_symbols();
std::set<SymbolNumber> rtn_firsts =
alphabet.get_rtn(*it)->possible_first_symbols;
- for (RtnMap::iterator it = alphabet.rtns.begin();
- it != alphabet.rtns.end(); ++it) {
- if (rtn_firsts.count(it->first) == 1) {
+ for (RtnNameMap::const_iterator it = alphabet.rtn_names.begin();
+ it != alphabet.rtn_names.end(); ++it) {
+ if (rtn_firsts.count(it->second) == 1) {
// For now we are very conservative:
// if we can go through two levels of rtns
// without any input, we just assume the full
@@ -181,9 +244,9 @@ PmatchContainer::PmatchContainer(std::istream & inputstream,
}
}
}
- for (RtnMap::iterator it = alphabet.rtns.begin();
- it != alphabet.rtns.end(); ++it) {
- possible_firsts.erase(it->first);
+ for (RtnNameMap::const_iterator it = alphabet.rtn_names.begin();
+ it != alphabet.rtn_names.end(); ++it) {
+ possible_firsts.erase(it->second);
}
if (!possible_firsts.empty() &&
alphabet.get_special(boundary) != NO_SYMBOL_NUMBER) {
@@ -227,6 +290,16 @@ bool PmatchAlphabet::is_guard(const std::string & symbol)
return symbol.find("@PMATCH_GUARD_") == 0 && symbol.rfind("@") == symbol.size() - 1;
}
+// Tell whether \a symbol is a counter symbol, i.e. starts with
+// "@PMATCH_COUNTER_" and ends with "@".
+bool PmatchAlphabet::is_counter(const std::string & symbol)
+{
+    const std::string prefix("@PMATCH_COUNTER_");
+    if (symbol.compare(0, prefix.size(), prefix) != 0) {
+        return false;
+    }
+    // The prefix matched, so symbol is known to be non-empty here.
+    return symbol[symbol.size() - 1] == '@';
+}
+
+// Tell whether \a symbol is a symbol list, i.e. starts with
+// "@PMATCH_LIST_" and ends with "@".
+bool PmatchAlphabet::is_list(const std::string & symbol)
+{
+    const std::string prefix("@PMATCH_LIST_");
+    if (symbol.compare(0, prefix.size(), prefix) != 0) {
+        return false;
+    }
+    // The prefix matched, so symbol is known to be non-empty here.
+    return symbol[symbol.size() - 1] == '@';
+}
+
bool PmatchAlphabet::is_special(const std::string & symbol)
{
if (symbol.size() == 0) {
@@ -251,6 +324,11 @@ bool PmatchAlphabet::is_guard(const SymbolNumber symbol) const
return false;
}
+// Tell whether symbol number \a symbol has a counter slot: it must lie
+// inside the counter vector and not hold the NO_COUNTER filler.
+bool PmatchAlphabet::is_counter(const SymbolNumber symbol) const
+{
+    if (symbol >= counters.size()) {
+        return false;
+    }
+    return counters[symbol] != NO_COUNTER;
+}
+
std::string PmatchAlphabet::name_from_insertion(const std::string & symbol)
{
return symbol.substr(sizeof("@I.") - 1, symbol.size() - (sizeof("@I.@") - 1));
@@ -283,9 +361,10 @@ PmatchContainer::~PmatchContainer(void)
PmatchAlphabet::~PmatchAlphabet(void)
{
- for (RtnMap::iterator it = rtns.begin();
+ for (RtnVector::iterator it = rtns.begin();
it != rtns.end(); ++it) {
- delete it->second;
+ delete *it;
+ *it = NULL;
}
}
@@ -357,17 +436,17 @@ std::string PmatchContainer::parse_name_from_hfst3_header(std::istream & f)
void PmatchAlphabet::add_rtn(PmatchTransducer * rtn, std::string const & name)
{
SymbolNumber symbol = rtn_names[name];
- rtns.insert(std::pair<SymbolNumber, PmatchTransducer *>(symbol, rtn));
+ rtns[symbol] = rtn;
}
bool PmatchAlphabet::has_rtn(std::string const & name) const
{
- return rtns.count(rtn_names.at(name)) != 0;
+ return rtn_names.at(name) < rtns.size() && rtns[rtn_names.at(name)] != NULL;
}
bool PmatchAlphabet::has_rtn(SymbolNumber symbol) const
{
- return rtns.count(symbol) != 0;
+ return symbol < rtns.size() && rtns[symbol] != NULL;
}
PmatchTransducer * PmatchAlphabet::get_rtn(SymbolNumber symbol)
@@ -375,6 +454,19 @@ PmatchTransducer * PmatchAlphabet::get_rtn(SymbolNumber symbol)
return rtns[symbol];
}
+// Return the name of the counter with symbol number \a symbol: the text
+// between "@PMATCH_COUNTER_" and the closing "@" of its symbol string.
+// Returns "INVALID_COUNTER" when \a symbol is outside the symbol table or
+// its string is not a counter symbol.
+std::string PmatchAlphabet::get_counter_name(SymbolNumber symbol)
+{
+    const std::string invalid("INVALID_COUNTER");
+    if (symbol >= symbol_table.size()) {
+        return invalid;
+    }
+    const std::string full_name = symbol_table[symbol];
+    if (!is_counter(full_name)) {
+        return invalid;
+    }
+    const size_t prefix_len = strlen("@PMATCH_COUNTER_");
+    // Drop the prefix in front and the single "@" at the end.
+    const size_t name_len = full_name.size() - prefix_len - 1;
+    return full_name.substr(prefix_len, name_len);
+}
+
SymbolNumber PmatchAlphabet::get_special(SpecialSymbol special) const
{
return special_symbols.at(special);
@@ -440,6 +532,47 @@ LocationVectorVector PmatchContainer::locate(std::string & input)
return locations;
}
+// Ordering predicate used by get_profiling_info when sorting
+// (counter name, count) pairs: true when \a l has the larger count,
+// which yields a descending sort.
+bool counter_comp(std::pair<std::string, unsigned long> l,
+                  std::pair<std::string, unsigned long> r)
+{
+    return r.second < l.second;
+}
+
+// Build a textual report of the alphabet's counters: a heading followed by
+// one line per counter with its name and count, sorted by descending count
+// (see counter_comp), the counts lined up in a column after the names.
+std::string PmatchContainer::get_profiling_info(void)
+{
+    typedef std::pair<std::string, unsigned long> NamedCount;
+    std::vector<NamedCount> named_counts;
+    size_t widest_name = 0;
+
+    // Collect every used counter slot, tracking the longest name.
+    for (SymbolNumber sym = 0; sym < alphabet.counters.size(); ++sym) {
+        if (alphabet.counters[sym] == NO_COUNTER) {
+            continue;
+        }
+        const std::string name = alphabet.get_counter_name(sym);
+        if (name.size() > widest_name) {
+            widest_name = name.size();
+        }
+        named_counts.push_back(NamedCount(name, alphabet.counters[sym]));
+    }
+    std::sort(named_counts.begin(), named_counts.end(), counter_comp);
+
+    std::stringstream report;
+    report << "Profiling information:\n";
+    report << " Traversals of Counter() positions:\n";
+    for (std::vector<NamedCount>::const_iterator entry = named_counts.begin();
+         entry != named_counts.end(); ++entry) {
+        report << " " << entry->first;
+        // Pad so that every count starts 8 columns after the longest name.
+        for (size_t pad = widest_name + 8 - entry->first.size();
+             pad != 0; --pad) {
+            report << " ";
+        }
+        report << entry->second << "\n";
+    }
+    return report.str();
+}
+
void PmatchContainer::copy_to_output(const DoubleTape & best_result)
{
for (DoubleTape::const_iterator it = best_result.begin();
@@ -486,11 +619,9 @@ std::string PmatchAlphabet::stringify(const DoubleTape & str)
}
retval.insert(pos, start_tag(output));
retval.append(end_tag(output));
- } else if (output == special_symbols[boundary]
- || is_guard(output)) {
- continue;
} else {
- if (!extract_tags || start_tag_pos.size() != 0) {
+ if ((!extract_tags || start_tag_pos.size() != 0)
+ && is_printable(output)) {
retval.append(string_from_symbol(output));
}
}
@@ -582,6 +713,7 @@ PmatchTransducer::PmatchTransducer(std::istream & is,
is.read(indextab, TransitionWIndex::size * index_table_size);
is.read(transitiontab, TransitionW::size * transition_table_size);
char * orig_p = indextab;
+ index_table.reserve(index_table_size);
while(index_table_size) {
// index_table.push_back(
// SimpleIndex(*(SymbolNumber *) indextab,
@@ -593,6 +725,7 @@ PmatchTransducer::PmatchTransducer(std::istream & is,
}
free(orig_p);
orig_p = transitiontab;
+ transition_table.reserve(transition_table_size);
while(transition_table_size) {
transition_table.push_back(TransitionW(transitiontab));
// SimpleTransition(*(SymbolNumber *) transitiontab,
@@ -714,7 +847,16 @@ void PmatchTransducer::collect_first_transition(TransitionTableIndex i,
container->reset_recursion();
throw true;
}
- possible_first_symbols.insert(*it);
+ if (alphabet.list2symbols[*it] != NO_SYMBOL_NUMBER) {
+// If this is a list, collect everything in the list
+ for (SymbolNumberVector::const_iterator sym_it =
+ alphabet.symbol_list_members[alphabet.list2symbols[*it]].begin();
+ sym_it != alphabet.symbol_list_members[alphabet.list2symbols[*it]].end(); ++sym_it) {
+ possible_first_symbols.insert(*sym_it);
+ }
+ } else {
+ possible_first_symbols.insert(*it);
+ }
} else {
// faking through a context check
collect_first(transition_table[i].get_target(),
@@ -893,9 +1035,9 @@ void PmatchTransducer::note_analysis(unsigned int input_pos,
rtn_stack.top().candidate_tape_pos = tape_pos;
rtn_stack.top().candidate_input_pos = input_pos;
rtn_stack.top().best_weight = local_stack.top().running_weight;
- } else if (container->is_verbose() &&
- input_pos == rtn_stack.top().candidate_input_pos &&
- rtn_stack.top().best_weight == local_stack.top().running_weight) {
+ } else if (container->verbose &&
+ input_pos == rtn_stack.top().candidate_input_pos &&
+ rtn_stack.top().best_weight == local_stack.top().running_weight) {
DoubleTape discarded(container->tape.extract_slice(
rtn_stack.top().tape_entry, tape_pos));
std::cerr << "\n\tline " << container->line_number << ": conflicting equally weighted matches found, keeping:\n\t"
@@ -937,6 +1079,9 @@ void PmatchTransducer::take_epsilons(unsigned int input_pos,
Weight weight = transition_table[i].get_weight();
// We handle paths where we're checking contexts here
if (input == 0) {
+ if (container->profile_mode) {
+ alphabet.count(output);
+ }
if (!checking_context()) {
if (!try_entering_context(output)) {
// no context to enter, regular input epsilon
@@ -950,7 +1095,6 @@ void PmatchTransducer::take_epsilons(unsigned int input_pos,
} else if (output == alphabet.get_special(exit)) {
container->entry_stack.pop();
}
-
get_analyses(input_pos, tape_pos + 1, target);
@@ -1081,8 +1225,9 @@ void PmatchTransducer::take_transitions(SymbolNumber input,
} else if (this_input == input) {
if (!checking_context()) {
if (this_output == alphabet.get_identity_symbol() ||
- (this_output == alphabet.get_unknown_symbol())) {
- // we got here via identity or unknown, so look back in the
+ (this_output == alphabet.get_unknown_symbol()) ||
+ (alphabet.list2symbols[this_output] != NO_SYMBOL_NUMBER)) {
+ // we got here via a meta-arc, so look back in the
// input tape to find the symbol we want to write
this_output = container->input[input_pos];
}
@@ -1113,7 +1258,7 @@ void PmatchTransducer::get_analyses(unsigned int input_pos,
TransitionTableIndex i)
{
if (!container->try_recurse()) {
- if (container->is_verbose()) {
+ if (container->verbose) {
std::cerr << "pmatch: out of stack space, truncating result\n";
}
return;
@@ -1142,6 +1287,15 @@ void PmatchTransducer::get_analyses(unsigned int input_pos,
} else {
input = container->input[input_pos];
}
+
+ if (alphabet.symbol2lists[input] != NO_SYMBOL_NUMBER) {
+// At least one symbol list contains this symbol
+ for(SymbolNumberVector::const_iterator it =
+ alphabet.symbol_lists[alphabet.symbol2lists[input]].begin();
+ it != alphabet.symbol_lists[alphabet.symbol2lists[input]].end(); ++it) {
+ take_transitions(*it, input_pos, tape_pos, i+1);
+ }
+ }
if (input < orig_symbol_count) {
take_transitions(input, input_pos, tape_pos, i+1);
diff --git a/libhfst/src/implementations/optimized-lookup/pmatch.h b/libhfst/src/implementations/optimized-lookup/pmatch.h
index 3bf523d..b97f2dd 100644
--- a/libhfst/src/implementations/optimized-lookup/pmatch.h
+++ b/libhfst/src/implementations/optimized-lookup/pmatch.h
@@ -16,7 +16,8 @@ namespace hfst_ol {
const unsigned int PMATCH_MAX_RECURSION_DEPTH = 5000;
- typedef std::map<SymbolNumber, PmatchTransducer *> RtnMap;
+ typedef std::vector<PmatchTransducer *> RtnVector;
+ typedef std::map<std::string, SymbolNumber> RtnNameMap;
typedef std::vector<Location> LocationVector;
typedef std::vector<LocationVector> LocationVectorVector;
typedef std::vector<WeightedDoubleTape> WeightedDoubleTapeVector;
@@ -86,13 +87,22 @@ namespace hfst_ol {
class PmatchAlphabet: public TransducerAlphabet {
protected:
- RtnMap rtns;
- std::map<SpecialSymbol, SymbolNumber> special_symbols;
+ RtnVector rtns;
+ SymbolNumberVector special_symbols;
std::map<SymbolNumber, std::string> end_tag_map;
- std::map<std::string, SymbolNumber> rtn_names;
+ RtnNameMap rtn_names;
+// For each symbol, either NO_SYMBOL_NUMBER for "no corresponding list" or an index into symbol_lists
+ SymbolNumberVector symbol2lists;
+// For each symbol, either NO_SYMBOL_NUMBER for "this is not a list" or an index into symbol_list_members
+ SymbolNumberVector list2symbols;
+ std::vector<SymbolNumberVector> symbol_lists;
+ std::vector<SymbolNumberVector> symbol_list_members;
+ std::vector<unsigned long int> counters;
SymbolNumberVector guards;
+ std::vector<bool> printable_vector;
bool is_end_tag(const SymbolNumber symbol) const;
bool is_guard(const SymbolNumber symbol) const;
+ bool is_counter(const SymbolNumber symbol) const;
std::string end_tag(const SymbolNumber symbol);
std::string start_tag(const SymbolNumber symbol);
bool extract_tags;
@@ -101,18 +111,25 @@ namespace hfst_ol {
PmatchAlphabet(std::istream& is, SymbolNumber symbol_count);
PmatchAlphabet(void);
~PmatchAlphabet(void);
+ virtual void add_symbol(const std::string & symbol);
static bool is_end_tag(const std::string & symbol);
static bool is_insertion(const std::string & symbol);
static bool is_guard(const std::string & symbol);
+ static bool is_list(const std::string & symbol);
+ static bool is_counter(const std::string & symbol);
static bool is_special(const std::string & symbol);
static std::string name_from_insertion(
const std::string & symbol);
bool is_printable(SymbolNumber symbol);
void add_special_symbol(const std::string & str, SymbolNumber symbol_number);
+ void process_symbol_list(std::string str, SymbolNumber sym);
+ void process_counter(std::string str, SymbolNumber sym);
+ void count(SymbolNumber sym);
void add_rtn(PmatchTransducer * rtn, std::string const & name);
bool has_rtn(std::string const & name) const;
bool has_rtn(SymbolNumber symbol) const;
PmatchTransducer * get_rtn(SymbolNumber symbol);
+ std::string get_counter_name(SymbolNumber symbol);
SymbolNumber get_special(SpecialSymbol special) const;
SymbolNumberVector get_specials(void) const;
std::string stringify(const DoubleTape & str);
@@ -140,16 +157,16 @@ namespace hfst_ol {
std::vector<char> possible_first_symbols;
bool verbose;
bool locate_mode;
+ bool profile_mode;
unsigned int recursion_depth_left;
public:
- PmatchContainer(std::istream & is, bool verbose = false,
- bool extract_tags = false);
+ PmatchContainer(std::istream & is);
PmatchContainer(void);
~PmatchContainer(void);
- long line_number;
+ unsigned long line_number;
void initialize_input(const char * input);
bool has_unsatisfied_rtns(void) const;
@@ -157,6 +174,7 @@ namespace hfst_ol {
void process(std::string & input);
std::string match(std::string & input);
LocationVectorVector locate(std::string & input);
+ std::string get_profiling_info(void);
bool has_queued_input(unsigned int input_pos);
bool not_possible_first_symbol(SymbolNumber sym)
{
@@ -171,8 +189,12 @@ namespace hfst_ol {
std::string stringify_output(void);
// LocationVector locatefy_output(void);
static std::string parse_name_from_hfst3_header(std::istream & f);
- void be_verbose(void) { verbose = true; }
- bool is_verbose(void) { return verbose; }
+ void set_verbose(bool b) { verbose = b; }
+ void set_locate_mode(bool b) {
+ locate_mode = b;
+ alphabet.extract_tags = b;
+ }
+ void set_profile(bool b) { profile_mode = b; }
bool try_recurse(void)
{
if (recursion_depth_left > 0) {
diff --git a/libhfst/src/implementations/optimized-lookup/transducer.cc b/libhfst/src/implementations/optimized-lookup/transducer.cc
index 5ca763e..3bdcf7d 100644
--- a/libhfst/src/implementations/optimized-lookup/transducer.cc
+++ b/libhfst/src/implementations/optimized-lookup/transducer.cc
@@ -60,6 +60,11 @@ void TransducerAlphabet::add_symbol(char * symbol)
symbol_table.push_back(symbol);
}
+// Overload of add_symbol taking a std::string: appends the symbol to the
+// end of symbol_table.
+void TransducerAlphabet::add_symbol(const std::string & symbol)
+{
+ symbol_table.push_back(symbol);
+}
+
TransducerAlphabet::TransducerAlphabet(const SymbolTable& st):
symbol_table(st)
{
diff --git a/libhfst/src/implementations/optimized-lookup/transducer.h b/libhfst/src/implementations/optimized-lookup/transducer.h
index 25dd01b..4e6990b 100644
--- a/libhfst/src/implementations/optimized-lookup/transducer.h
+++ b/libhfst/src/implementations/optimized-lookup/transducer.h
@@ -78,6 +78,7 @@ typedef std::set<TraversalState> TraversalStates;
const SymbolNumber NO_SYMBOL_NUMBER = std::numeric_limits<SymbolNumber>::max();
const TransitionTableIndex NO_TABLE_INDEX =
std::numeric_limits<TransitionTableIndex>::max();
+const unsigned long NO_COUNTER = std::numeric_limits<unsigned long>::max();
const Weight INFINITE_WEIGHT = static_cast<float>(NO_TABLE_INDEX);
enum HeaderFlag {Weighted, Deterministic, Input_deterministic, Minimized,
@@ -419,7 +420,9 @@ public:
{ return identity_symbol; }
SymbolNumber get_orig_symbol_count(void) const
{ return orig_symbol_count; }
- void add_symbol(char * symbol);
+ virtual void add_symbol(char * symbol);
+ virtual void add_symbol(const std::string & symbol);
+
};
diff --git a/libhfst/src/parsers/LexcCompiler.cc b/libhfst/src/parsers/LexcCompiler.cc
index a0b1262..67cd416 100644
--- a/libhfst/src/parsers/LexcCompiler.cc
+++ b/libhfst/src/parsers/LexcCompiler.cc
@@ -85,7 +85,8 @@ LexcCompiler::LexcCompiler() :
parseErrors_(false),
with_flags_(false),
minimize_flags_(false),
- rename_flags_(false)
+ rename_flags_(false),
+ allow_multiple_sublexicon_definitions_(false)
{
xre_.set_expand_definitions(true);
}
@@ -102,7 +103,8 @@ LexcCompiler::LexcCompiler(ImplementationType impl) :
parseErrors_(false),
with_flags_(false),
minimize_flags_(false),
- rename_flags_(false)
+ rename_flags_(false),
+ allow_multiple_sublexicon_definitions_(false)
{
tokenizer_.add_multichar_symbol("@_EPSILON_SYMBOL_@");
tokenizer_.add_multichar_symbol("@0@");
@@ -126,7 +128,8 @@ LexcCompiler::LexcCompiler(ImplementationType impl, bool withFlags) :
parseErrors_(false),
with_flags_(withFlags),
minimize_flags_(false),
- rename_flags_(false)
+ rename_flags_(false),
+ allow_multiple_sublexicon_definitions_(false)
{
tokenizer_.add_multichar_symbol("@_EPSILON_SYMBOL_@");
tokenizer_.add_multichar_symbol("@0@");
@@ -180,12 +183,38 @@ LexcCompiler& LexcCompiler::parse(const char* filename)
return *this;
}
+// Returns the current value of the quiet_ flag.
+bool LexcCompiler::isQuiet()
+{
+ return quiet_;
+}
+
LexcCompiler&
-LexcCompiler::setVerbosity(bool verbose)
+LexcCompiler::setVerbosity(unsigned int verbose)
{
- quiet_ = !verbose;
- verbose_ = verbose;
- return *this;
+    // Verbosity levels: 0 = quiet, 1 = default (neither quiet nor verbose),
+    // 2 or more = verbose.
+    quiet_ = (verbose == 0);
+    verbose_ = (verbose >= 2);
+    return *this;
+}
+
+// Returns the current value of the treat_warnings_as_errors_ flag.
+bool
+LexcCompiler::areWarningsTreatedAsErrors()
+{
+ return treat_warnings_as_errors_;
}
LexcCompiler&
@@ -196,6 +225,14 @@ LexcCompiler::setTreatWarningsAsErrors(bool value)
}
+// Stores value in allow_multiple_sublexicon_definitions_ and returns *this.
+// While the flag is false, setCurrentLexiconName throws when it is given a
+// lexicon name that is already present in lexiconNames_.
LexcCompiler&
+LexcCompiler::setAllowMultipleSublexiconDefinitions(bool value)
+{
+ allow_multiple_sublexicon_definitions_ = value;
+ return *this;
+}
+
+
+LexcCompiler&
LexcCompiler::setWithFlags(bool value)
{
with_flags_ = value;
@@ -284,6 +321,48 @@ LexcCompiler::addStringEntry(const string& data,
return *this;
}
+// to handle information to warn_about_one_sided_flags_
+static bool treat_one_sided_flags_as_errors_ = false;
+static bool quiet_one_sided_flags_ = false;
+
+// Report one one-sided flag diacritic pair. When warnings are treated as
+// errors, the error text is always printed (even in quiet mode) and a
+// const char * is thrown; otherwise a warning is issued unless quiet.
+static void report_one_sided_flag(const std::pair<std::string, std::string> & symbol_pair)
+{
+    if (treat_one_sided_flags_as_errors_)
+      {
+        std::cerr << std::endl << "*** ERROR: one-sided flag diacritic: " << symbol_pair.first << ":" << symbol_pair.second << " [--Werror]" << std::endl;
+        throw "one-sided flag";
+      }
+    if (!quiet_one_sided_flags_)
+      {
+        hfst::lexc::error_at_current_token(0, 0, "Warning: one-sided flag diacritic.");
+      }
+}
+
+// Callback handed to the tokenizer for every symbol pair. A pair is
+// "one-sided" when its first symbol is a flag diacritic that differs from the
+// second symbol, or when only the second symbol is a flag diacritic.
+static void warn_about_one_sided_flags(const std::pair<std::string, std::string> & symbol_pair)
+{
+    // The second symbol is only inspected when the first is not a diacritic.
+    const bool one_sided = FdOperation::is_diacritic(symbol_pair.first)
+        ? (symbol_pair.first != symbol_pair.second)
+        : FdOperation::is_diacritic(symbol_pair.second);
+    if (one_sided)
+      {
+        report_one_sided_flag(symbol_pair);
+      }
+}
+
LexcCompiler&
LexcCompiler::addStringPairEntry(const string& upper, const string& lower,
const string& continuation, double weight)
@@ -352,6 +431,10 @@ LexcCompiler::addStringPairEntry(const string& upper, const string& lower,
StringPairVector newVector;
+ // information for function pointer &warn_about_one_sided_flags
+ treat_one_sided_flags_as_errors_ = treat_warnings_as_errors_;
+ quiet_one_sided_flags_ = quiet_;
+
if ( upperSize > lowerSize)
{
std::string epsilons = "";
@@ -362,7 +445,8 @@ LexcCompiler::addStringPairEntry(const string& upper, const string& lower,
}
newVector = tokenizer_.tokenize(joinerEnc + upper_string + encodedCont,
- joinerEnc + lower_string + epsilons + encodedCont);
+ joinerEnc + lower_string + epsilons + encodedCont,
+ &warn_about_one_sided_flags);
}
else if (upperSize < lowerSize)
@@ -375,12 +459,14 @@ LexcCompiler::addStringPairEntry(const string& upper, const string& lower,
}
newVector = tokenizer_.tokenize(joinerEnc + upper_string + epsilons + encodedCont,
- joinerEnc + lower_string + encodedCont);
+ joinerEnc + lower_string + encodedCont,
+ &warn_about_one_sided_flags);
}
else
{
newVector = tokenizer_.tokenize(joinerEnc + upper_string + encodedCont,
- joinerEnc + lower_string + encodedCont);
+ joinerEnc + lower_string + encodedCont,
+ &warn_about_one_sided_flags);
}
stringsTrie_.disjunct(newVector, weight);
@@ -504,6 +590,13 @@ LexcCompiler::setCurrentLexiconName(const string& lexiconName)
{
static bool firstLexicon = true;
currentLexiconName_ = lexiconName;
+
+ if (!allow_multiple_sublexicon_definitions_)
+ {
+ if (lexiconNames_.find(lexiconName) != lexiconNames_.end())
+ throw("Lexicon is defined more than once!");
+ }
+
lexiconNames_.insert(lexiconName);
if (noFlags_.find(lexiconName) == noFlags_.end())
{
@@ -575,53 +668,23 @@ LexcCompiler::compileLexical()
if (!quiet_) fprintf(stderr, "*** ERROR: could not parse lexc file: treating warnings as errors [--Werror] ***\n");
return 0;
}
-/*
- if( with_flags_)
- fprintf(stderr, "With Flags \n \n");
- else
- fprintf(stderr, "no Flags \n \n");
-*/
HfstTransducer lexicons(stringsTrie_, format_);
lexicons.minimize();
-
- // DEBUG
- //fprintf(stderr, "lexicons: \n");
- //lexicons.write_in_att_format(stderr, 1);
-
-
// repeat star to overgenerate
lexicons.repeat_star().minimize();
-
-
- //printf("lexicons: \n");
- //lexicons.write_in_att_format(stdout, 1);
-
-
-
-
HfstSymbolSubstitutions smallSubstitutions;
smallSubstitutions.insert(StringPair("@0@", "@_EPSILON_SYMBOL_@"));
smallSubstitutions.insert(StringPair("@@ANOTHER_EPSILON@@", "@_EPSILON_SYMBOL_@"));
smallSubstitutions.insert(StringPair("@ZERO@", "0"));
- /*
- lexicons.substitute("@0@", "@_EPSILON_SYMBOL_@");
- lexicons.substitute("@@ANOTHER_EPSILON@@", "@_EPSILON_SYMBOL_@");
- lexicons.substitute("@ZERO@", "0");
- */
lexicons.substitute(smallSubstitutions);
lexicons.prune_alphabet();
-
- //printf("lexicons: \n");
- //lexicons.write_in_att_format(stdout, 1);
-
-
HfstBasicTransducer joinersTrie_;
HfstSymbolSubstitutions allJoinersToEpsilon;
@@ -1122,27 +1185,30 @@ main(int argc, char** argv)
#if HAVE_SFST
std::cout << " (SFST)...";
LexcCompiler lexcSfst(SFST_TYPE);
+ lexcSfst.setAllowMultipleSublexiconDefinitions(true);
#endif
#if HAVE_OPENFST
std::cout << " (OpenFST)...";
LexcCompiler lexcOfst(TROPICAL_OPENFST_TYPE);
+ lexcOfst.setAllowMultipleSublexiconDefinitions(true);
#endif
#if HAVE_FOMA
std::cout << " (foma)...";
LexcCompiler lexcFoma(FOMA_TYPE);
+ lexcFoma.setAllowMultipleSublexiconDefinitions(true);
#endif
std::cout << std::endl << "set verbose:";
#if HAVE_SFST
- lexcSfst.setVerbosity(true);
- lexcSfst.setVerbosity(false);
+ lexcSfst.setVerbosity(1);
+ lexcSfst.setVerbosity(2);
#endif
#if HAVE_OFST
- lexcOfst.setVerbosity(true);
- lexcOfst.setVerbosity(false);
+ lexcOfst.setVerbosity(1);
+ lexcOfst.setVerbosity(2);
#endif
#if HAVE_FOMA
- lexcFoma.setVerbosity(true);
- lexcFoma.setVerbosity(false);
+ lexcFoma.setVerbosity(1);
+ lexcFoma.setVerbosity(2);
#endif
FILE* existence_check = fopen("LexcCompiler_test.lexc", "r");
if (existence_check == NULL)
diff --git a/libhfst/src/parsers/LexcCompiler.h b/libhfst/src/parsers/LexcCompiler.h
index 2bf4818..fd14d60 100644
--- a/libhfst/src/parsers/LexcCompiler.h
+++ b/libhfst/src/parsers/LexcCompiler.h
@@ -65,12 +65,19 @@ class LexcCompiler
LexcCompiler& parse(const char* filename);
//! @brief set verbosity options.
- //! When verbose is true, LexcCompiler will output the messages that Xerox
+ //! 0 means quiet, 1 the default and 2 (or bigger) the verbose mode.
+ //! When verbose is 2, LexcCompiler will output the messages that Xerox
//! lexc compiler does.
- LexcCompiler& setVerbosity(bool verbose);
+ LexcCompiler& setVerbosity(unsigned int verbose);
+
+ bool isQuiet();
LexcCompiler& setTreatWarningsAsErrors(bool value);
+ bool areWarningsTreatedAsErrors();
+
+ LexcCompiler& setAllowMultipleSublexiconDefinitions(bool value);
+
LexcCompiler& setWithFlags(bool value);
LexcCompiler& setMinimizeFlags(bool value);
@@ -137,6 +144,7 @@ class LexcCompiler
bool minimize_flags_;
bool rename_flags_;
bool treat_warnings_as_errors_;
+ bool allow_multiple_sublexicon_definitions_;
hfst::ImplementationType format_;
hfst::HfstTokenizer tokenizer_;
diff --git a/libhfst/src/parsers/XreCompiler.cc b/libhfst/src/parsers/XreCompiler.cc
index 0a453e9..41ce1db 100644
--- a/libhfst/src/parsers/XreCompiler.cc
+++ b/libhfst/src/parsers/XreCompiler.cc
@@ -10,7 +10,7 @@
namespace hfst { namespace xre {
- unsigned int cr=0;
+ unsigned int cr=0; // chars read from xre input
std::set<unsigned int> positions;
char * position_symbol = NULL;
std::string error_message;
@@ -19,6 +19,7 @@ XreCompiler::XreCompiler() :
definitions_(),
function_definitions_(),
function_arguments_(),
+ list_definitions_(),
format_(hfst::TROPICAL_OPENFST_TYPE)
{}
@@ -26,9 +27,18 @@ XreCompiler::XreCompiler(hfst::ImplementationType impl) :
definitions_(),
function_definitions_(),
function_arguments_(),
+ list_definitions_(),
format_(impl)
{}
+// Construct a compiler from a bundle of previously collected state: each
+// member is copy-initialized from the field of the same name in args.
+// NOTE(review): definitions maps names to raw HfstTransducer pointers, so
+// only the pointers are copied here -- confirm who owns the transducers.
+ XreCompiler::XreCompiler(const struct XreConstructorArguments & args) :
+ definitions_(args.definitions),
+ function_definitions_(args.function_definitions),
+ function_arguments_(args.function_arguments),
+ list_definitions_(args.list_definitions),
+ format_(args.format)
+{}
+
void
XreCompiler::define(const std::string& name, const std::string& xre)
{
@@ -42,6 +52,12 @@ XreCompiler::define(const std::string& name, const std::string& xre)
definitions_[name] = compiled;
}
+// Store symbol_list under name in list_definitions_, replacing any list
+// previously stored under the same name.
+void
+XreCompiler::define_list(const std::string& name, const std::set<std::string>& symbol_list)
+{
+ list_definitions_[name] = symbol_list;
+}
+
void
XreCompiler::define(const std::string& name, const HfstTransducer & transducer)
{
@@ -129,13 +145,26 @@ XreCompiler::contained_only_comments()
+// Compile xre with this compiler's definitions, functions, lists and format.
+// Returns whatever hfst::xre::compile returns.
HfstTransducer*
XreCompiler::compile(const std::string& xre)
{
- return hfst::xre::compile(xre, definitions_, function_definitions_, function_arguments_, format_);
+ // debug
+ //std::cerr << "XreCompiler: " << this << " : compile(\"" << xre << "\")" << std::endl;
+ // cr is the namespace-level count of chars read from xre input. Save it
+ // and start this compilation from zero.
+ unsigned int cr_before = cr;
+ cr = 0;
+ HfstTransducer * retval = hfst::xre::compile(xre, definitions_, function_definitions_, function_arguments_, list_definitions_, format_);
+ // Put the saved count back so the caller's value of cr is unchanged.
+ cr = cr_before;
+ return retval;
}
+// Like compile(), but forwards to hfst::xre::compile_first, which also
+// receives chars_read by reference.
HfstTransducer*
XreCompiler::compile_first(const std::string& xre, unsigned int & chars_read)
{
- return hfst::xre::compile_first(xre, definitions_, function_definitions_, function_arguments_, format_, chars_read);
+ // debug
+ //std::cerr << "XreCompiler: " << this << " : compile_first(\"" << xre << "\"";
+ // Save the namespace-level chars-read counter and start from zero.
+ unsigned int cr_before = cr;
+ cr = 0;
+ HfstTransducer * retval = hfst::xre::compile_first(xre, definitions_, function_definitions_, function_arguments_, list_definitions_, format_, chars_read);
+ //std::cerr << ", " << chars_read << ")" << std::endl;
+ // Put the saved count back before returning.
+ cr = cr_before;
+ return retval;
}
bool XreCompiler::get_positions_of_symbol_in_xre
@@ -143,9 +172,10 @@ bool XreCompiler::get_positions_of_symbol_in_xre
{
position_symbol = strdup(symbol.c_str());
positions.clear();
- cr=0;
+ unsigned int cr_before = cr;
+ cr = 0;
HfstTransducer * compiled =
- hfst::xre::compile(xre, definitions_, function_definitions_, function_arguments_, format_);
+ hfst::xre::compile(xre, definitions_, function_definitions_, function_arguments_, list_definitions_, format_);
free(position_symbol);
position_symbol = NULL;
if (compiled == NULL)
@@ -156,6 +186,7 @@ bool XreCompiler::get_positions_of_symbol_in_xre
return false;
}
positions_ = positions;
+ cr = cr_before;
return true;
}
diff --git a/libhfst/src/parsers/XreCompiler.h b/libhfst/src/parsers/XreCompiler.h
index b1b2153..665f52f 100644
--- a/libhfst/src/parsers/XreCompiler.h
+++ b/libhfst/src/parsers/XreCompiler.h
@@ -38,6 +38,31 @@ namespace hfst {
//! @brief hfst::xre namespace is used for all functions related to Xerox
//! Regular Expresisions (XRE) parsing.
namespace xre {
+
+ // needed for merge operation
+// Plain bundle of the state an XreCompiler can be constructed from; the
+// fields correspond one-to-one to XreCompiler's private members.
+struct XreConstructorArguments
+{
+    std::map<std::string,hfst::HfstTransducer*> definitions;
+    std::map<std::string, std::string> function_definitions;
+    std::map<std::string, unsigned int > function_arguments;
+    std::map<std::string, std::set<std::string> > list_definitions;
+    hfst::ImplementationType format;
+
+    // Copies every argument into the field of the same name (the trailing
+    // underscore marks the parameter).
+    XreConstructorArguments
+    (std::map<std::string,hfst::HfstTransducer*> definitions_,
+     std::map<std::string, std::string> function_definitions_,
+     std::map<std::string, unsigned int > function_arguments_,
+     std::map<std::string, std::set<std::string> > list_definitions_,
+     hfst::ImplementationType format_)
+        : definitions(definitions_),
+          function_definitions(function_definitions_),
+          function_arguments(function_arguments_),
+          list_definitions(list_definitions_),
+          format(format_)
+    {}
+};
+
//! @brief A compiler holding information needed to compile XREs.
class XreCompiler
{
@@ -46,12 +71,16 @@ class XreCompiler
XreCompiler();
//! @brief Create compiler for @a impl format transducers
XreCompiler(hfst::ImplementationType impl);
+ // ...
+ XreCompiler(const struct XreConstructorArguments & args);
//! @brief Add a definition macro.
//! Compilers will replace arcs labeled @a name, with the transducer
//! defined by @a xre in later phases of compilation.
void define(const std::string& name, const std::string& xre);
+ void define_list(const std::string& name, const std::set<std::string>& symbol_list);
+
//! @brief Add a function macro.
//! Compilers will replace call to function \a name with the transducer
//! defined by \a xre when the function is called.
@@ -126,6 +155,7 @@ class XreCompiler
std::map<std::string,hfst::HfstTransducer*> definitions_;
std::map<std::string, std::string> function_definitions_;
std::map<std::string, unsigned int > function_arguments_;
+ std::map<std::string, std::set<std::string> > list_definitions_;
hfst::ImplementationType format_;
}
diff --git a/libhfst/src/parsers/lexc-lexer.ll b/libhfst/src/parsers/lexc-lexer.ll
index 6ff7d23..156566f 100644
--- a/libhfst/src/parsers/lexc-lexer.ll
+++ b/libhfst/src/parsers/lexc-lexer.ll
@@ -29,7 +29,6 @@
#include <assert.h>
extern void hlexcerror(const char *text);
-
%}
/* c.f. Unicode Standard 5.1 D92 Table 3-7 */
@@ -126,7 +125,6 @@ LWSP [\r\n\t ]
hlexclval.name = hfst::lexc::strdup_nonconst_part(lexicon_start, "Lexicon",
NULL, true);
free(lexicon_start);
- hlexcerror("Titlecase Lexicon parsed as LEXICON");
return LEXICON_START_WRONG_CASE;
}
@@ -196,7 +194,6 @@ LWSP [\r\n\t ]
lexicon_start = hfst::lexc::strstrip(hlexctext);
hlexclval.name = hfst::lexc::strdup_nonconst_part(lexicon_start, "Lexicon", 0, true);
free(lexicon_start);
- hlexcerror("Titlecase Lexicon parsed as LEXICON");
return LEXICON_START_WRONG_CASE;
}
diff --git a/libhfst/src/parsers/lexc-parser.yy b/libhfst/src/parsers/lexc-parser.yy
index 2bf2cb2..74cba6b 100644
--- a/libhfst/src/parsers/lexc-parser.yy
+++ b/libhfst/src/parsers/lexc-parser.yy
@@ -64,10 +64,18 @@ handle_definition(const string& variable_name, const string& reg_exp)
}
+// Make lexiconName the current lexicon of the global compiler.
+// Returns false when setCurrentLexiconName throws a const char * (the
+// message itself is discarded here), true otherwise.
static
-void
+bool
handle_lexicon_name(const string& lexiconName)
{
+ try
+ {
hfst::lexc::lexc_->setCurrentLexiconName(lexiconName);
+ }
+ catch(const char * msg)
+ {
+ return false;
+ }
+ return true;
}
static
@@ -116,7 +124,7 @@ handle_string_entry(const string& data, const string& cont, const string& gloss)
}
static
-void
+bool
handle_string_pair_entry(const string& upper, const string& lower,
const string& cont, const string& gloss)
{
@@ -127,7 +135,11 @@ handle_string_pair_entry(const string& upper, const string& lower,
// handle epsilon "0"
if (upper != "0" && lower != "0")
{
- hfst::lexc::lexc_->addStringPairEntry(upper, lower, cont, weight);
+ try {
+ hfst::lexc::lexc_->addStringPairEntry(upper, lower, cont, weight);
+ } catch(const char * msg) {
+ return false;
+ }
}
else
{
@@ -137,8 +149,13 @@ handle_string_pair_entry(const string& upper, const string& lower,
upper_ = std::string("");
if (lower == "0")
lower_ = std::string("");
- hfst::lexc::lexc_->addStringPairEntry(upper_, lower_, cont, weight);
+ try {
+ hfst::lexc::lexc_->addStringPairEntry(upper_, lower_, cont, weight);
+ } catch(const char * msg) {
+ return false;
+ }
}
+ return true;
}
static
@@ -155,6 +172,15 @@ handle_regexp_entry(const string& reg_exp, const string& cont,
+// Report text at the current token unless the compiler is in quiet mode.
+// Unlike hlexcerror, this does not increment hlexcnerrs.
static
void
+hlexcwarn(const char* text)
+{
+ if (! hfst::lexc::lexc_->isQuiet())
+ { hfst::lexc::error_at_current_token(0, 0, text); }
+}
+
+
+static
+void
handle_eof()
{
}
@@ -255,12 +281,20 @@ LEXICONS: LEXICONS LEXICON2 LEXICON_LINES
;
+// A sublexicon header. The lexicon name is heap-allocated by the lexer and
+// is released on every path out of these actions, including the aborting
+// ones. A duplicate definition (reported by handle_lexicon_name returning
+// false) aborts the parse.
LEXICON2: LEXICON_START {
- handle_lexicon_name($1);
+ bool retval = handle_lexicon_name($1);
free($1);
+ if (!retval)
+ { hlexcerror("Sublexicon defined more than once."); YYABORT; }
}
| LEXICON_START_WRONG_CASE {
- handle_lexicon_name($1);
+ // Titlecase keyword: an error under --Werror, otherwise only a warning.
+ if (hfst::lexc::lexc_->areWarningsTreatedAsErrors())
+ {
+   hlexcerror("Keyword 'Lexicon' used instead of 'LEXICON'. [--Werror]");
+   // Release the name before aborting; the free() below is never reached.
+   free($1);
+   YYABORT;
+ }
+ else
+ { hlexcwarn("Titlecase Lexicon parsed as LEXICON"); }
+ bool retval = handle_lexicon_name($1);
free($1);
+ if (!retval)
+ { hlexcerror("Sublexicon defined more than once."); YYABORT; }
}
;
@@ -275,24 +309,30 @@ LEXICON_LINE: ULSTRING LEXICON_NAME ';' {
}
| ULSTRING ':' ULSTRING
LEXICON_NAME ';' {
- handle_string_pair_entry($1, $3, $4, "");
+ bool retval = handle_string_pair_entry($1, $3, $4, "");
free( $1);
free( $3);
free( $4);
+ if (!retval)
+ { hlexcerror("Erroneous string pair entry."); YYABORT; }
}
| LEXICON_NAME ';' {
handle_string_entry("", $1, "");
free( $1);
}
| ULSTRING ':' LEXICON_NAME ';' {
- handle_string_pair_entry($1, "", $3, "");
+ bool retval = handle_string_pair_entry($1, "", $3, "");
free( $1);
free( $3);
+ if (!retval)
+ { hlexcerror("Erroneous string pair entry."); YYABORT; }
}
| ':' ULSTRING LEXICON_NAME ';' {
- handle_string_pair_entry("", $2, $3, "");
+ bool retval = handle_string_pair_entry("", $2, $3, "");
free( $2);
free( $3);
+ if (!retval)
+ { hlexcerror("Erroneous string pair entry."); YYABORT; }
}
| ':' LEXICON_NAME ';' {
handle_string_entry("", $2, "");
@@ -306,11 +346,13 @@ LEXICON_LINE: ULSTRING LEXICON_NAME ';' {
}
| ULSTRING ':' ULSTRING
LEXICON_NAME ENTRY_GLOSS ';' {
- handle_string_pair_entry($1, $3, $4, $5);
+ bool retval = handle_string_pair_entry($1, $3, $4, $5);
free( $1);
free( $3);
free( $4);
free( $5);
+ if (!retval)
+ { hlexcerror("Erroneous string pair entry."); YYABORT; }
}
| LEXICON_NAME ENTRY_GLOSS ';' {
handle_string_entry("", $1, $2);
@@ -318,16 +360,20 @@ LEXICON_LINE: ULSTRING LEXICON_NAME ';' {
free( $2);
}
| ULSTRING ':' LEXICON_NAME ENTRY_GLOSS ';' {
- handle_string_pair_entry($1, "", $3, $4);
+ bool retval = handle_string_pair_entry($1, "", $3, $4);
free( $1);
free( $3);
free( $4);
+ if (!retval)
+ { hlexcerror("Erroneous string pair entry."); YYABORT; }
}
| ':' ULSTRING LEXICON_NAME ENTRY_GLOSS ';' {
- handle_string_pair_entry("", $2, $3, $4);
+ bool retval = handle_string_pair_entry("", $2, $3, $4);
free( $2);
free( $3);
free( $4);
+ if (!retval)
+ { hlexcerror("Erroneous string pair entry."); YYABORT; }
}
| ':' LEXICON_NAME ENTRY_GLOSS ';' {
handle_string_entry("", $2, $3);
@@ -363,7 +409,7 @@ void
hlexcerror(const char* text)
{
hfst::lexc::error_at_current_token(0, 0, text);
+ hlexcnerrs++;
}
-
// vim: set ft=yacc:
diff --git a/libhfst/src/parsers/pmatch_lex.ll b/libhfst/src/parsers/pmatch_lex.ll
index ae88028..e13e3d7 100644
--- a/libhfst/src/parsers/pmatch_lex.ll
+++ b/libhfst/src/parsers/pmatch_lex.ll
@@ -53,6 +53,9 @@ UINTEGER [1-9][0-9]*
INTEGER -?[1-9][0-9]*
WSP [\t ]
LWSP [\t\r\n ]
+
+HEXCHAR [0-9]|[a-f]
+UNICODE_ESCAPE ("\\u"{HEXCHAR}{HEXCHAR}{HEXCHAR}{HEXCHAR})|("\\U00"{HEXCHAR}{HEXCHAR}{HEXCHAR}{HEXCHAR}{HEXCHAR}{HEXCHAR})
%%
[Dd]"efine" {
@@ -63,6 +66,10 @@ LWSP [\t\r\n ]
return REGEX;
}
+"list" {
+ return DEFINED_LIST;
+}
+
"DefIns" {
return DEFINS;
}
@@ -107,6 +114,14 @@ return REGEX;
return TOUPPER_LEFT;
}
+"UpCase(" {
+    /* Upper-casing function: yields the TOUPPER token. */
+    return TOUPPER_LEFT;
+}
+
+"DownCase(" {
+    /* Lower-casing function: yields the TOLOWER token. */
+    return TOLOWER_LEFT;
+}
+
"Ins(" {
return INS_LEFT;
}
@@ -147,6 +162,22 @@ return REGEX;
return MAP_LEFT;
}
+"Lit(" {
+ return LIT_LEFT;
+}
+
+"Lst(" {
+ return LST_LEFT;
+}
+
+"Sigma(" {
+ return SIGMA_LEFT;
+}
+
+"Counter(" {
+ return COUNTER_LEFT;
+}
+
"~" { return COMPLEMENT; }
"\\" { return TERM_COMPLEMENT; }
"&" { return INTERSECTION; }
@@ -285,6 +316,11 @@ return REGEX;
return CURLY_LITERAL;
}
+"\""(({UNICODE_ESCAPE}|{U8C})"-"({UNICODE_ESCAPE}|{U8C}))+"\"" {
+ pmatchlval.transducer = hfst::pmatch::parse_range(pmatchtext);
+ return CHARACTER_RANGE;
+}
+
"\""([^"\""]|"\\\"")+"\"" {
pmatchlval.label = hfst::pmatch::parse_quoted(pmatchtext);
return QUOTED_LITERAL;
diff --git a/libhfst/src/parsers/pmatch_parse.yy b/libhfst/src/parsers/pmatch_parse.yy
index e3d5b40..b64c70c 100644
--- a/libhfst/src/parsers/pmatch_parse.yy
+++ b/libhfst/src/parsers/pmatch_parse.yy
@@ -49,6 +49,7 @@
std::pair<std::string, hfst::HfstTransducer*>* transducerDefinition;
std::map<std::string, hfst::HfstTransducer>* transducerDefinitions;
hfst::pmatch::PmatchAstNode* ast_node;
+ std::vector<hfst::pmatch::PmatchAstNode *>* ast_node_vector;
std::vector<std::string>* string_vector;
std::pair<hfst::xeroxRules::ReplaceArrow, std::vector<hfst::xeroxRules::Rule> >* replaceRuleVectorWithArrow;
@@ -75,7 +76,8 @@
REGEXP8 REGEXP9 REGEXP10 REGEXP11 REGEXP12 LABEL_PAIR
REPLACE REGEXP3 FUNCALL MAP FUNCALL_ARG
%type <label> LABEL
-%type <ast_node> FUNCBODY1 FUNCBODY2 FUNCBODY3 FUNCBODY4 FUNCBODY5 FUNCBODY6 FUNC_LABEL_PAIR
+%type <ast_node> FUNCBODY1 FUNCBODY2 FUNCBODY3 FUNCBODY4 FUNCBODY5 FUNCBODY6 FUNC_LABEL_PAIR FUN_OR_CONTEXT FUN_AND_CONTEXT FUN_CONTEXT_CONDITION FUN_CONTEXT
+%type <ast_node_vector> FUN_CONTEXT_CONDITIONS
%type <string_vector> ARGLIST
%type <transducerVector> FUNCALL_ARGLIST
@@ -96,6 +98,7 @@ REPLACE REGEXP3 FUNCALL MAP FUNCALL_ARG
%nonassoc <weight> WEIGHT END_OF_WEIGHTED_EXPRESSION
%nonassoc <label> QUOTED_LITERAL CURLY_LITERAL SYMBOL SYMBOL_WITH_LEFT_PAREN
+%nonassoc <transducer> CHARACTER_RANGE
%left CROSS_PRODUCT COMPOSITION LENIENT_COMPOSITION INTERSECTION
%left CENTER_MARKER MARKUP_MARKER
@@ -140,9 +143,10 @@ PAIR_SEPARATOR_WO_RIGHT PAIR_SEPARATOR_WO_LEFT
%token EPSILON_TOKEN ANY_TOKEN BOUNDARY_MARKER
%token LEXER_ERROR
-%nonassoc DEFINE REGEX DEFINS DEFFUN ALPHA LOWERALPHA UPPERALPHA NUM PUNCT WHITESPACE
-OPTCAP_LEFT TOLOWER_LEFT TOUPPER_LEFT INS_LEFT DEFINE_LEFT ENDTAG_LEFT LC_LEFT
-RC_LEFT NLC_LEFT NRC_LEFT MAP_LEFT SYM_LEFT OR_LEFT AND_LEFT
+%nonassoc DEFINE REGEX DEFINED_LIST DEFINS DEFFUN ALPHA LOWERALPHA UPPERALPHA
+NUM PUNCT WHITESPACE OPTCAP_LEFT TOLOWER_LEFT TOUPPER_LEFT INS_LEFT DEFINE_LEFT
+ENDTAG_LEFT LC_LEFT RC_LEFT NLC_LEFT NRC_LEFT MAP_LEFT LIT_LEFT LST_LEFT
+SIGMA_LEFT COUNTER_LEFT OR_LEFT AND_LEFT
%%
@@ -211,6 +215,12 @@ DEFINITION: DEFINE BINDING { $$ = $2; }
$2->minimize();
$$ = new std::pair<std::string, hfst::HfstTransducer*>("TOP", $2);
}
+| DEFINED_LIST BINDING {
+ $$ = new std::pair<std::string, hfst::HfstTransducer*>($2->first,
+ hfst::pmatch::make_sigma($2->second));
+ delete $2->second;
+ delete $2;
+ }
| DEFFUN FUNCTION { $$ = $2; }
;
@@ -297,20 +307,9 @@ FUNCBODY1: FUNCBODY2 { }
| FUNCBODY1 LENIENT_COMPOSITION FUNCBODY2 {
$$ = new PmatchAstNode($1, $3, hfst::pmatch::AstLenientCompose);
}
-| FUNCBODY1 FUN_RIGHT_CONTEXT {
- $$ = new PmatchAstNode($1, $2, hfst::pmatch::AstConcatenate);
- }
-| FUNCBODY1 FUN_LEFT_CONTEXT {
- $$ = new PmatchAstNode($2, $1, hfst::pmatch::AstConcatenate);
- }
-| FUNCBODY1 FUN_NEGATIVE_RIGHT_CONTEXT {
- $$ = new PmatchAstNode($1, $2, hfst::pmatch::AstConcatenate);
- }
-| FUNCBODY1 FUN_NEGATIVE_LEFT_CONTEXT {
- $$ = new PmatchAstNode($2, $1, hfst::pmatch::AstConcatenate);
- }
;
+
FUNCBODY2: FUNCBODY3 { }
| FUNCBODY2 UNION FUNCBODY3 {
$$ = new PmatchAstNode($1, $3, hfst::pmatch::AstDisjunct);
@@ -321,30 +320,28 @@ FUNCBODY2: FUNCBODY3 { }
| FUNCBODY2 MINUS FUNCBODY3 {
$$ = new PmatchAstNode($1, $3, hfst::pmatch::AstSubtract);
}
-// | REGEXP5 UPPER_MINUS REGEXP6 {
+// | REGEXP3 UPPER_MINUS REGEXP4 {
// pmatcherror("No upper minus");
// $$ = $1;
// delete $3;
// }
-// | REGEXP5 LOWER_MINUS REGEXP6 {
+// | REGEXP3 LOWER_MINUS REGEXP4 {
// pmatcherror("No lower minus");
// $$ = $1;
// delete $3;
// }
-// | REGEXP5 UPPER_PRIORITY_UNION REGEXP6 {
+// | REGEXP3 UPPER_PRIORITY_UNION REGEXP4 {
// pmatcherror("No upper priority union");
// $$ = $1;
// delete $3;
// }
-// | REGEXP5 LOWER_PRIORITY_UNION REGEXP6 {
+// | REGEXP3 LOWER_PRIORITY_UNION REGEXP3 {
// pmatcherror("No lower priority union");
// $$ = $1;
// delete $3;
// }
;
-
-
FUNCBODY3: FUNCBODY4 { }
| FUNCBODY3 FUNCBODY4 {
$$ = new PmatchAstNode($1, $2, hfst::pmatch::AstConcatenate);
@@ -376,6 +373,18 @@ FUNCBODY4: FUNCBODY5
$$ = new PmatchAstNode(hfst::pmatch::get_utils()->latin1_whitespace_acceptor);
}
| INSERT { $$ = new PmatchAstNode($1); }
+| ANONYMOUS_DEFINITION {
+$$ = new PmatchAstNode($1, hfst::pmatch::AstAddDelimiters);
+// Fixme: no funargs inside this
+}
+| FUNCALL {
+ $$ = new PmatchAstNode($1);
+// Fixme: no funargs inside this
+ }
+| MAP {
+ $$ = new PmatchAstNode($1);
+// Fixme: no funargs inside this
+}
| FUN_OPTCAP { }
| FUN_TOUPPER { }
| FUN_TOLOWER { }
@@ -428,9 +437,6 @@ FUNCBODY6: FUNC_LABEL_PAIR { }
$$ = new PmatchAstNode($1);
free($1);
}
-| BOUNDARY_MARKER {
- $$ = new PmatchAstNode(new HfstTransducer("@BOUNDARY@", "@BOUNDARY@", hfst::pmatch::format));
- }
| ENDTAG_LEFT SYMBOL RIGHT_PARENTHESIS {
$$ = new PmatchAstNode(hfst::pmatch::make_end_tag($2));
hfst::pmatch::need_delimiters = true;
@@ -468,20 +474,181 @@ FUNCBODY6: FUNC_LABEL_PAIR { }
$$ = new PmatchAstNode(hfst::HfstTransducer::read_lexc_ptr($1, hfst::TROPICAL_OPENFST_TYPE, hfst::pmatch::verbose));
free($1);
}
+| FUN_CONTEXT_CONDITION { }
+
+
+
+
+;
+
+FUN_CONTEXT_CONDITION: // a single context, or an OR_LEFT/AND_LEFT group of context conditions
+FUN_CONTEXT { $$ = $1; hfst::pmatch::need_delimiters = true; } // a plain context switches need_delimiters on
+| FUN_OR_CONTEXT { } // empty action; presumably relies on the parser presetting $$ to $1 -- confirm
+| FUN_AND_CONTEXT { }
;
+FUN_CONTEXT: // exactly one of the four context kinds: right/left, plain/negative
+FUN_RIGHT_CONTEXT { }
+| FUN_NEGATIVE_RIGHT_CONTEXT { }
+| FUN_LEFT_CONTEXT { }
+| FUN_NEGATIVE_LEFT_CONTEXT { };
+
+FUN_OR_CONTEXT: OR_LEFT FUN_CONTEXT_CONDITIONS RIGHT_PARENTHESIS
+{ // Fold the collected conditions into one left-nested AstDisjunct tree.
+ $$ = NULL;
+ for (std::vector<PmatchAstNode *>::iterator it = $2->begin();
+ it != $2->end(); ++it) {
+ if ($$ == NULL) { // the first condition seeds the result
+ $$ = *it;
+ } else {
+ PmatchAstNode * tmp = $$;
+ $$ = new PmatchAstNode(tmp, *it, hfst::pmatch::AstDisjunct);
+ }
+ }
+ delete $2; // frees only the vector; the nodes are now referenced from the tree
+ // Zero the counter for making minimization
+ // guards for disjuncted negative contexts
+ hfst::pmatch::zero_minimization_guard();
+};
+
+FUN_AND_CONTEXT: AND_LEFT FUN_CONTEXT_CONDITIONS RIGHT_PARENTHESIS
+{ // Fold the collected conditions into one left-nested AstConcatenate chain.
+ $$ = NULL;
+ for (std::vector<PmatchAstNode *>::iterator it = $2->begin();
+ it != $2->end(); ++it) {
+ if ($$ == NULL) { // comparison, not assignment: the first condition seeds the result
+ $$ = *it;
+ } else {
+ PmatchAstNode * tmp = $$;
+ $$ = new PmatchAstNode(tmp, *it, hfst::pmatch::AstConcatenate);
+ }
+ }
+ delete $2; // frees only the vector; the nodes are now referenced from the tree
+};
+
+FUN_CONTEXT_CONDITIONS: // comma-separated conditions, collected into a heap-allocated vector
+FUN_CONTEXT_CONDITION {
+ $$ = new std::vector<PmatchAstNode *>(1, $1);
+ }
+| FUN_CONTEXT_CONDITION COMMA FUN_CONTEXT_CONDITIONS {
+ $3->push_back($1); // right recursion: the tail is built first, so the vector is in reverse source order
+ $$ = $3;
+ };
+
+
+
+
FUNC_LABEL_PAIR:
LABEL {
$$ = new PmatchAstNode(new HfstTransducer($1, hfst::pmatch::format));
free($1);
-} |
-CURLY_LITERAL {
+}
+| CURLY_LITERAL {
HfstTokenizer tok;
$$ = new PmatchAstNode(new HfstTransducer($1, tok, hfst::pmatch::format));
free($1);
}
+| LABEL PAIR_SEPARATOR LABEL {
+ $$ = new PmatchAstNode(new HfstTransducer($1, $3, hfst::pmatch::format));
+ free($1); free($3);
+}
+| ANY_TOKEN PAIR_SEPARATOR ANY_TOKEN {
+ $$ = new PmatchAstNode(new HfstTransducer(hfst::internal_unknown, hfst::pmatch::format));
+}
+| LABEL PAIR_SEPARATOR ANY_TOKEN {
+ $$ = new PmatchAstNode(new HfstTransducer($1, hfst::internal_unknown, hfst::pmatch::format));
+ free($1);
+}
+| ANY_TOKEN PAIR_SEPARATOR LABEL {
+ $$ = new PmatchAstNode(new HfstTransducer(hfst::internal_unknown, $3, hfst::pmatch::format));
+ free($3);
+}
+| LABEL PAIR_SEPARATOR_WO_RIGHT {
+ $$ = new PmatchAstNode(new HfstTransducer($1, hfst::internal_unknown, hfst::pmatch::format));
+ free($1);
+ }
+| ANY_TOKEN PAIR_SEPARATOR_WO_RIGHT {
+ $$ = new PmatchAstNode(new HfstTransducer(hfst::internal_unknown, hfst::internal_unknown,
+ hfst::pmatch::format));
+}
+| PAIR_SEPARATOR_WO_LEFT LABEL {
+ $$ = new PmatchAstNode(new HfstTransducer(hfst::internal_unknown, $2, hfst::pmatch::format));
+ free($2);
+}
+| PAIR_SEPARATOR_WO_LEFT ANY_TOKEN {
+ $$ = new PmatchAstNode(new HfstTransducer(hfst::internal_unknown, hfst::internal_unknown,
+ hfst::pmatch::format));
+}
+| PAIR_SEPARATOR_SOLE {
+ $$ = new PmatchAstNode(new HfstTransducer(hfst::internal_unknown,
+ hfst::pmatch::format));
+ }
+| ANY_TOKEN {
+ $$ = new PmatchAstNode(new HfstTransducer(hfst::internal_identity,
+ hfst::pmatch::format));
+ }
+| CURLY_LITERAL PAIR_SEPARATOR CURLY_LITERAL {
+ HfstTokenizer tok;
+ HfstTransducer * left = new HfstTransducer($1, tok, hfst::pmatch::format);
+ HfstTransducer right($3, tok, hfst::pmatch::format);
+ HfstTransducer destroy(hfst::internal_unknown, hfst::internal_epsilon, hfst::pmatch::format);
+ HfstTransducer construct(hfst::internal_epsilon, hfst::internal_unknown, hfst::pmatch::format);
+ left->compose(destroy.repeat_star());
+ left->compose(construct.repeat_star());
+ left->compose(right);
+ $$ = new PmatchAstNode(left);
+ free($1); free($3);
+}
+| LABEL PAIR_SEPARATOR CURLY_LITERAL {
+ HfstTokenizer tok;
+ HfstTransducer * left = new HfstTransducer(
+ $1, hfst::internal_epsilon, hfst::pmatch::format);
+ HfstTransducer right($3, tok, hfst::pmatch::format);
+ HfstTransducer construct(hfst::internal_epsilon, hfst::internal_unknown, hfst::pmatch::format);
+ left->compose(construct.repeat_star());
+ left->compose(right);
+ $$ = new PmatchAstNode(left);
+ free($1); free($3);
+}
+| CURLY_LITERAL PAIR_SEPARATOR LABEL {
+ HfstTokenizer tok;
+ HfstTransducer * left = new HfstTransducer($1, tok, hfst::pmatch::format);
+ HfstTransducer right(hfst::internal_epsilon, $3, hfst::pmatch::format);
+ HfstTransducer destroy(hfst::internal_unknown, hfst::internal_epsilon, hfst::pmatch::format);
+ left->compose(destroy.repeat_star());
+ left->compose(right);
+ $$ = new PmatchAstNode(left);
+ free($1); free($3);
+}
+
;
+// MAP: MAP_LEFT SYMBOL COMMA READ_TEXT RIGHT_PARENTHESIS {
+// if (hfst::pmatch::functions.count($2) == 0) {
+// std::string errstring = "Function not defined: " + std::string($2);
+// pmatcherror(errstring.c_str());
+// }
+// std::vector<string> & callee_args = hfst::pmatch::functions[$2].args;
+// std::vector<std::vector<std::string> > caller_strings =
+// hfst::pmatch::read_args($4, callee_args.size());
+// std::map<std::string, HfstTransducer*> caller_args;
+// HfstTokenizer tok;
+// $$ = new HfstTransducer(hfst::pmatch::format);
+// for (std::vector<std::vector<std::string> >::iterator it =
+// caller_strings.begin(); it != caller_strings.end(); ++it) {
+// for (int i = 0; i < it->size(); ++i) {
+// caller_args[callee_args[i]] = new HfstTransducer(it->at(i), tok, hfst::pmatch::format);
+// }
+// $$->disjunct(*hfst::pmatch::functions[$2].evaluate(caller_args));
+// // Clean up the string transducers we allocated each time
+// for (std::map<std::string, HfstTransducer *>::iterator it = caller_args.begin();
+// it != caller_args.end(); ++it) {
+// delete it->second;
+// }
+// caller_args.clear();
+// }
+// $$->minimize();
+// };
REGEXP2: REPLACE
{ }
@@ -1147,8 +1314,13 @@ LABEL_PAIR: LABEL PAIR_SEPARATOR LABEL {
if (strlen($1) == 0) {
$$ = new HfstTransducer(hfst::pmatch::format);
} else {
- std::string errstring = "Unknown symbol: " + std::string($1);
- pmatcherror(errstring.c_str());
+ if (hfst::pmatch::verbose) {
+ std::cerr << "Warning: interpreting undefined symbol \"" << $1
+ << "\" as label on line " << pmatchlineno << "\n";
+ }
+ $$ = new HfstTransducer($1, hfst::pmatch::format);
+// std::string errstring = "Unknown symbol: " + std::string($1);
+// pmatcherror(errstring.c_str());
}
}
free($1);
@@ -1223,12 +1395,32 @@ LABEL_PAIR: LABEL PAIR_SEPARATOR LABEL {
$$ = hfst::pmatch::make_end_tag($2);
hfst::pmatch::need_delimiters = true;
}
-
+| CHARACTER_RANGE { $$ = $1; }
+| LST_LEFT REGEXP2 RIGHT_PARENTHESIS
+{
+ if (!hfst::pmatch::flatten) {
+ $$ = hfst::pmatch::make_list($2);
+ free($2);
+ } else {
+ $$ = $2;
+ }
+}
+| SIGMA_LEFT REGEXP2 RIGHT_PARENTHESIS
+{
+ $$ = hfst::pmatch::make_sigma($2);
+ free($2);
+}
+| COUNTER_LEFT SYMBOL RIGHT_PARENTHESIS
+{
+ $$ = hfst::pmatch::make_counter($2);
+ free($2);
+}
;
LABEL: QUOTED_LITERAL { }
| EPSILON_TOKEN { $$ = strdup(hfst::internal_epsilon.c_str()); }
| BOUNDARY_MARKER { $$ = strdup("@BOUNDARY@"); }
+| LIT_LEFT SYMBOL RIGHT_PARENTHESIS { $$ = $2; } // hands the SYMBOL string on unchanged as the label
;
CONTEXT_CONDITION:
@@ -1330,6 +1522,9 @@ SYMBOL {
$$ = new HfstTransducer($1, hfst::pmatch::format);
free($1);
}
+| LEFT_BRACKET REGEXP2 RIGHT_BRACKET {
+ $$ = $2;
+}
;
MAP: MAP_LEFT SYMBOL COMMA READ_TEXT RIGHT_PARENTHESIS {
@@ -1469,15 +1664,16 @@ FUN_RIGHT_CONTEXT: RC_LEFT FUNCBODY2 RIGHT_PARENTHESIS {
hfst::pmatch::format),
$2, hfst::pmatch::AstConcatenate);
$$ = new PmatchAstNode(rc_entry,
- new HfstTransducer(
- hfst::internal_epsilon,
- hfst::pmatch::RC_EXIT_SYMBOL,
- hfst::pmatch::format),
- hfst::pmatch::AstConcatenate);
+ new HfstTransducer(
+ hfst::internal_epsilon,
+ hfst::pmatch::RC_EXIT_SYMBOL,
+ hfst::pmatch::format),
+ hfst::pmatch::AstConcatenate);
}
;
FUN_NEGATIVE_RIGHT_CONTEXT: NRC_LEFT FUNCBODY2 RIGHT_PARENTHESIS {
+ HfstTransducer * guard = hfst::pmatch::get_minimization_guard();
PmatchAstNode * nrc_entry =
new PmatchAstNode(
new HfstTransducer(hfst::internal_epsilon,
@@ -1491,11 +1687,12 @@ FUN_NEGATIVE_RIGHT_CONTEXT: NRC_LEFT FUNCBODY2 RIGHT_PARENTHESIS {
hfst::pmatch::NRC_EXIT_SYMBOL,
hfst::pmatch::format),
hfst::pmatch::AstConcatenate);
- $$ = new PmatchAstNode(
+ PmatchAstNode * context = new PmatchAstNode(
nrc_main_branch,
new HfstTransducer("@PMATCH_PASSTHROUGH@",
hfst::internal_epsilon, hfst::pmatch::format),
hfst::pmatch::AstDisjunct);
+ $$ = new PmatchAstNode(guard, context, hfst::pmatch::AstConcatenate);
}
;
@@ -1516,6 +1713,7 @@ FUN_LEFT_CONTEXT: LC_LEFT FUNCBODY2 RIGHT_PARENTHESIS {
;
FUN_NEGATIVE_LEFT_CONTEXT: NLC_LEFT FUNCBODY2 RIGHT_PARENTHESIS {
+ HfstTransducer * guard = hfst::pmatch::get_minimization_guard();
PmatchAstNode * reverse = new PmatchAstNode(
$2, hfst::pmatch::AstReverse);
@@ -1529,10 +1727,12 @@ FUN_NEGATIVE_LEFT_CONTEXT: NLC_LEFT FUNCBODY2 RIGHT_PARENTHESIS {
PmatchAstNode * main_branch = new PmatchAstNode(
entry, nlc_exit, hfst::pmatch::AstConcatenate);
- $$ = new PmatchAstNode(main_branch,
- new HfstTransducer("@PMATCH_PASSTHROUGH@",
- hfst::internal_epsilon, hfst::pmatch::format),
- hfst::pmatch::AstDisjunct);
+ PmatchAstNode * context =
+ new PmatchAstNode(main_branch,
+ new HfstTransducer("@PMATCH_PASSTHROUGH@",
+ hfst::internal_epsilon, hfst::pmatch::format),
+ hfst::pmatch::AstDisjunct);
+ $$ = new PmatchAstNode(guard, context, hfst::pmatch::AstConcatenate);
}
;
diff --git a/libhfst/src/parsers/pmatch_utils.cc b/libhfst/src/parsers/pmatch_utils.cc
index 95304fe..4a33c63 100644
--- a/libhfst/src/parsers/pmatch_utils.cc
+++ b/libhfst/src/parsers/pmatch_utils.cc
@@ -12,6 +12,7 @@
#include "pmatch_utils.h"
#include "HfstTransducer.h"
#include "tools/src/HfstUtf8.h"
+#include "implementations/optimized-lookup/pmatch.h"
using std::string;
using std::map;
@@ -254,6 +255,47 @@ HfstTransducer * make_end_tag(std::string tag)
return end_tag;
}
+HfstTransducer * make_counter(std::string name) // builds the pair epsilon : "@PMATCH_COUNTER_<name>@"
+{
+ HfstTransducer * counter = new HfstTransducer(
+ hfst::internal_epsilon,
+ "@PMATCH_COUNTER_" + name + "@",
+ format);
+ return counter; // heap-allocated with new and returned as a raw pointer
+}
+
+HfstTransducer * make_list(HfstTransducer * t) // packs the alphabet of t into one "@PMATCH_LIST_..._@" symbol
+{
+ std::string arc = "@PMATCH_LIST_";
+ hfst::StringSet alphabet = t->get_alphabet();
+ for (hfst::StringSet::const_iterator it = alphabet.begin();
+ it != alphabet.end(); ++it) {
+ if (!hfst_ol::PmatchAlphabet::is_special(*it) && // leave out symbols is_special() reports
+ *it != hfst::internal_epsilon && *it != hfst::internal_unknown &&
+ *it != hfst::internal_identity && *it != hfst::internal_default) {
+ arc.append(*it);
+ arc.append("_"); // every kept symbol is followed by '_', including the last one
+ }
+ }
+ arc.append("@");
+ return new HfstTransducer(arc, format); // t itself is neither modified nor freed here
+}
+
+HfstTransducer * make_sigma(HfstTransducer * t) // disjunction of one-symbol transducers over the alphabet of t
+{
+ HfstTransducer * retval =
+ new HfstTransducer(format);
+ hfst::StringSet alphabet = t->get_alphabet();
+ for (hfst::StringSet::const_iterator it = alphabet.begin();
+ it != alphabet.end(); ++it) {
+ if (!hfst_ol::PmatchAlphabet::is_special(*it) && // same symbol filter as make_list
+ *it != hfst::internal_epsilon && *it != hfst::internal_unknown &&
+ *it != hfst::internal_identity && *it != hfst::internal_default) {
+ retval->disjunct(HfstTransducer(*it, format));
+ }
+ }
+ return retval; // t itself is neither modified nor freed here
+}
char * get_delimited(const char *s, char delim_left, char delim_right)
{
@@ -364,55 +406,37 @@ parse_quoted(const char *s)
case 'u':
if (strlen(p) < 6) {
// Can't be a valid escape sequence
- *r++ = '\\';
- *r++ = 'u';
+ *r++ = *p;
+ *r++ = *(p+1);
p += 2;
} else {
char buf[5];
memcpy(buf, p+2, 4);
buf[4] = '\0';
unsigned int codepoint = strtol(buf, NULL, 16);
- bool u_parse_err = false;
- // The following is adapted from an answer at
- // http://stackoverflow.com/questions/4607413/c-library-to-convert-unicode-code-points-to-utf8
- // My understanding of the magic numbers:
- // 0x80 = 128 = 2^7
- // 64 = 2^6, 192 = 2^6 + 2^7
- // 0x800 = 2048 = 2^11
- // 0x1000 = 2^16 etc.
- if (codepoint < 0x80) {
- buf[0] = codepoint;
- buf[1] = '\0';
- } else if (codepoint < 0x800) {
- buf[0] = 192 + codepoint / 64;
- buf[1] = 128 + codepoint % 64;
- buf[2] = '\0';
- } else if (codepoint - 0xd800u < 0x800) {
- u_parse_err = true;
- } else if (codepoint < 0x10000) {
- buf[0] = 224 + codepoint / 4096;
- buf[1] = 128 + codepoint / 64 % 64;
- buf[2] = 128 + codepoint % 64;
- buf[3] = '\0';
- } else if (codepoint < 0x110000) {
- buf[0] = 240 + codepoint / 262144;
- buf[1] = 128 + codepoint / 4096 % 64;
- buf[2] = 128 + codepoint / 64 % 64;
- buf[3] = 128 + codepoint % 64;
- buf[4] = '\0';
- } else {
- u_parse_err = true;
- }
- if (u_parse_err) {
- fprintf(stderr, "PMATCH: Failed to parse unicode codepoint\n");
- *r++ = '\0';
- } else {
- strcpy(r, buf);
- r += strlen(buf) + 1;
- }
+ std::string utf8_char = codepoint_to_utf8(codepoint);
+ strcpy(r, utf8_char.c_str());
+ r += utf8_char.size() + 1;
p += 6;
}
break;
+ case 'U':
+ if (strlen(p) < 10) {
+ // Can't be a valid escape sequence
+ *r++ = *p;
+ *r++ = *(p+1);
+ p += 2;
+ } else {
+ char buf[9];
+ memcpy(buf, p+2, 8);
+ buf[8] = '\0';
+ unsigned int codepoint = strtol(buf, NULL, 16);
+ std::string utf8_char = codepoint_to_utf8(codepoint);
+ strcpy(r, utf8_char.c_str());
+ r += utf8_char.size() + 1;
+ p += 10;
+ }
+ break;
case 'v':
*r = '\v';
r++;
@@ -456,6 +480,141 @@ parse_quoted(const char *s)
return rv;
}
+unsigned int next_utf8_to_codepoint(unsigned char **c) // decodes one UTF-8 character at *c and advances *c past it
+{
+ unsigned int codepoint = 0;
+ int bytes_in_char = 0;
+ if (**c <= 127) {
+ bytes_in_char = 1;
+ codepoint = **c & 127;
+ } else if ( (**c & (128 + 64 + 32)) == (128 + 64) ) { // lead byte 110xxxxx
+ bytes_in_char = 2;
+ codepoint = **c & 31;
+ } else if ( (**c & (128 + 64 + 32 + 16)) == (128 + 64 + 32) ) { // lead byte 1110xxxx
+ bytes_in_char = 3;
+ codepoint = **c & 15;
+ } else if ( (**c & (128 + 64 + 32 + 16 + 8)) == (128 + 64 + 32 + 16)) { // lead byte 11110xxx
+ bytes_in_char = 4;
+ codepoint = **c & 7;
+ } else {
+ return 0; // not a valid lead byte; *c is left where it was
+ }
+ for (int i = 1; i < bytes_in_char; ++i) {
+ codepoint = ((codepoint << 6) | (unsigned long)(*(*c + i) & 63)); // append 6 payload bits per continuation byte
+ }
+ *c += bytes_in_char;
+ return codepoint;
+}
+
+std::string codepoint_to_utf8(unsigned int codepoint) // UTF-8 encoding of codepoint, or "" if it cannot be encoded
+{
+ char buf[5];
+ bool u_parse_err = false;
+ // The following is adapted from an answer at
+ // http://stackoverflow.com/questions/4607413/c-library-to-convert-unicode-code-points-to-utf8
+ // My understanding of the magic numbers:
+ // 0x80 = 128 = 2^7
+ // 64 = 2^6, 192 = 2^6 + 2^7
+ // 0x800 = 2048 = 2^11
+ // 0x10000 = 2^16 etc.
+ if (codepoint < 0x80) {
+ buf[0] = codepoint;
+ buf[1] = '\0';
+ } else if (codepoint < 0x800) {
+ buf[0] = 192 + codepoint / 64;
+ buf[1] = 128 + codepoint % 64;
+ buf[2] = '\0';
+ } else if (codepoint - 0xd800u < 0x800) { // 0xD800..0xDFFF (surrogates) are rejected
+ u_parse_err = true;
+ } else if (codepoint < 0x10000) {
+ buf[0] = 224 + codepoint / 4096;
+ buf[1] = 128 + codepoint / 64 % 64;
+ buf[2] = 128 + codepoint % 64;
+ buf[3] = '\0';
+ } else if (codepoint < 0x110000) {
+ buf[0] = 240 + codepoint / 262144;
+ buf[1] = 128 + codepoint / 4096 % 64;
+ buf[2] = 128 + codepoint / 64 % 64;
+ buf[3] = 128 + codepoint % 64;
+ buf[4] = '\0';
+ } else { // 0x110000 and above cannot be encoded
+ u_parse_err = true;
+ }
+ if (u_parse_err) {
+ return "";
+ } else {
+ return std::string(buf);
+ }
+}
+
+HfstTransducer * parse_range(const char * s) // disjunction of every character in the "<first>-<last>" ranges of s
+{
+ char * quoted = get_delimited(s, '"');
+ char * orig_quoted = quoted;
+ char ** c = & quoted;
+ // Each pass of the loop below consumes one "<first>-<last>" item from the quoted text.
+ HfstTransducer * retval = new HfstTransducer(format);
+ while (**c != '\0') {
+ unsigned int codepoint1 = 0;
+ unsigned int codepoint2 = 0;
+ if (**c == '\\' && ((*(*c + 1) == 'u' && strlen(*c) >= 6) ||
+ (*(*c + 1) == 'U' && strlen(*c) >= 10))) {
+ // an escape sequence; the length test above guarantees all its hex digits are present
+ char buf[9];
+ if (*(*c + 1) == 'u') {
+ memcpy(buf, *c+2, 4);
+ buf[4] = '\0';
+ *c += 6;
+ } else {
+ memcpy(buf, *c+2, 8);
+ buf[8] = '\0';
+ *c += 10;
+ }
+ codepoint1 = strtol(buf, NULL, 16);
+ } else {
+ codepoint1 = next_utf8_to_codepoint((unsigned char**) c);
+ }
+ if (**c != '-') {
+ std::string errstring("Could not parse range expression: ");
+ errstring.append(std::string(s));
+ pmatcherror(errstring.c_str());
+ }
+ *c += 1;
+ if (**c == '\\' && ((*(*c + 1) == 'u' && strlen(*c) >= 6) ||
+ (*(*c + 1) == 'U' && strlen(*c) >= 10))) {
+ char buf[9];
+ if (*(*c + 1) == 'u') {
+ memcpy(buf, *c+2, 4);
+ buf[4] = '\0';
+ *c += 6;
+ } else {
+ memcpy(buf, *c+2, 8);
+ buf[8] = '\0';
+ *c += 10;
+ }
+ codepoint2 = strtol(buf, NULL, 16);
+ } else {
+ codepoint2 = next_utf8_to_codepoint((unsigned char**) c);
+ }
+ if (codepoint1 == 0 || codepoint2 == 0) {
+ std::string errstring("Malformed character in range expression: ");
+ errstring.append(std::string(s));
+ pmatcherror(errstring.c_str());
+ }
+ if (codepoint2 < codepoint1) {
+ std::string errstring("Range expression goes from higher to lower: ");
+ errstring.append(std::string(s));
+ pmatcherror(errstring.c_str());
+ }
+ while (codepoint1 <= codepoint2) { // both end points are included
+ retval->disjunct(HfstTransducer(codepoint_to_utf8(codepoint1), format));
+ ++codepoint1;
+ }
+ }
+ free(orig_quoted); // quoted has been advanced by the loop, so free the saved start pointer
+ return retval;
+}
+
double
get_weight(const char *s)
{
@@ -918,6 +1077,22 @@ HfstTransducer * PmatchAstNode::evaluate(
} else if (type == AstSymbol) {
if (funargs.count(symbol) == 1) {
return new HfstTransducer(*funargs[symbol]);
+ } else if (definitions.count(symbol) == 1) {
+ if (!flatten && def_insed_transducers.count(symbol) == 1) {
+ inserted_transducers.insert(symbol);
+ if (verbose) {
+ used_definitions.insert(symbol);
+ }
+ return new HfstTransducer(*def_insed_transducers[symbol]);
+ } else {
+ if (verbose) {
+ std::cerr << "including " <<
+ definitions[symbol]->get_name() << " with ";
+ print_size_info(hfst::pmatch::definitions[symbol]);
+ used_definitions.insert(symbol);
+ }
+ return new HfstTransducer(*hfst::pmatch::definitions[symbol]);
+ }
} else {
std::string errstring = "Symbol " + std::string(symbol) + " not found";
pmatcherror(errstring.c_str());
diff --git a/libhfst/src/parsers/pmatch_utils.h b/libhfst/src/parsers/pmatch_utils.h
index 18f1258..817a76b 100644
--- a/libhfst/src/parsers/pmatch_utils.h
+++ b/libhfst/src/parsers/pmatch_utils.h
@@ -88,6 +88,9 @@ HfstTransducer * add_pmatch_delimiters(HfstTransducer * regex);
*/
void add_end_tag(HfstTransducer * regex, std::string tag);
HfstTransducer * make_end_tag(std::string tag);
+HfstTransducer * make_counter(std::string name);
+HfstTransducer * make_list(HfstTransducer * t);
+HfstTransducer * make_sigma(HfstTransducer * t);
/**
* @brief find first segment from strign @a s delimited by char delim.
@@ -100,6 +103,10 @@ char* unescape_delimited(char *s, char delim);
char* parse_quoted(const char *s);
+unsigned int next_utf8_to_codepoint(unsigned char **c);
+std::string codepoint_to_utf8(unsigned int codepoint);
+HfstTransducer * parse_range(const char *s);
+
int* get_n_to_k(const char* s);
double get_weight(const char* s);
@@ -326,7 +333,7 @@ struct PmatchAstNode {
transducer(NULL) { }
PmatchAstNode(PmatchAstNode * l,
- HfstTransducer * r,
+ HfstTransducer * r,
PmatchAstOperation o):
left_child(l),
right_child(new PmatchAstNode(r)),
diff --git a/libhfst/src/parsers/xre_lex.ll b/libhfst/src/parsers/xre_lex.ll
index 1803eca..79f13cc 100644
--- a/libhfst/src/parsers/xre_lex.ll
+++ b/libhfst/src/parsers/xre_lex.ll
@@ -102,6 +102,8 @@ BRACED [{]([^}]|[\300-\337].|[\340-\357]..|[\360-\367]...)+[}]
".o." { CR; return COMPOSITION; }
".O." { CR; return LENIENT_COMPOSITION; }
+".m>." { CR; return MERGE_RIGHT_ARROW; }
+".<m." { CR; return MERGE_LEFT_ARROW; }
".x." { CR; return CROSS_PRODUCT; }
".P." { CR; return UPPER_PRIORITY_UNION; }
".p." { CR; return LOWER_PRIORITY_UNION; }
diff --git a/libhfst/src/parsers/xre_parse.yy b/libhfst/src/parsers/xre_parse.yy
index c75168e..2593c0b 100644
--- a/libhfst/src/parsers/xre_parse.yy
+++ b/libhfst/src/parsers/xre_parse.yy
@@ -121,7 +121,7 @@ int xrelex ( YYSTYPE * , yyscan_t );
%nonassoc <weight> WEIGHT END_OF_WEIGHTED_EXPRESSION
%nonassoc <label> SYMBOL CURLY_BRACKETS
-%left CROSS_PRODUCT COMPOSITION LENIENT_COMPOSITION INTERSECTION
+%left CROSS_PRODUCT COMPOSITION LENIENT_COMPOSITION INTERSECTION MERGE_RIGHT_ARROW MERGE_LEFT_ARROW
%left CENTER_MARKER MARKUP_MARKER
%left SHUFFLE
%right LEFT_RESTRICTION LEFT_ARROW RIGHT_ARROW LEFT_RIGHT_ARROW
@@ -242,6 +242,23 @@ REGEXP2: REPLACE
$$ = & $1->lenient_composition(*$3).minimize();
delete $3;
}
+ | REGEXP2 MERGE_RIGHT_ARROW REPLACE {
+ try {
+ $$ = hfst::xre::merge_first_to_second($1, $3);
+ }
+ catch (const TransducersAreNotAutomataException & e)
+ {
+ xreerror("Error: transducers must be automata in merge operation.");
+ delete $1;
+ delete $3;
+ YYABORT;
+ }
+ delete $1;
+ }
+ | REGEXP2 MERGE_LEFT_ARROW REPLACE {
+ $$ = hfst::xre::merge_first_to_second($3, $1);
+ delete $3;
+ }
// substitute
| SUB1 HALFARC PAIR_SEPARATOR HALFARC COMMA HALFARC PAIR_SEPARATOR HALFARC RIGHT_BRACKET {
$1->substitute(StringPair($2,$4), StringPair($6,$8));
diff --git a/libhfst/src/parsers/xre_utils.cc b/libhfst/src/parsers/xre_utils.cc
index e9edade..996bfe3 100644
--- a/libhfst/src/parsers/xre_utils.cc
+++ b/libhfst/src/parsers/xre_utils.cc
@@ -70,11 +70,12 @@ namespace hfst
namespace xre
{
-char* data;
-std::map<std::string,hfst::HfstTransducer*> definitions;
+ char* data;
+ std::map<std::string,hfst::HfstTransducer*> definitions;
std::map<std::string,std::string> function_definitions;
std::map<std::string,unsigned int> function_arguments;
-char* startptr;
+ std::map<std::string,std::set<string> > symbol_lists;
+ char* startptr; // changed this to an internal variable in compile functions
hfst::HfstTransducer* last_compiled;
bool contains_only_comments = false;
hfst::ImplementationType format;
@@ -438,29 +439,32 @@ HfstTransducer*
compile(const string& xre, map<string,HfstTransducer*>& defs,
map<string, string>& func_defs,
map<string, unsigned int > func_args,
+ map<string, std::set<string> >& lists,
ImplementationType impl)
{
// lock here?
data = strdup(xre.c_str());
- startptr = data;
+ // use an internal variable startptr_ instead of global startptr
+ char * startptr_ = data;
len = strlen(data);
definitions = defs;
function_definitions = func_defs;
function_arguments = func_args;
+ symbol_lists = lists;
format = impl;
contains_only_comments = false;
yyscan_t scanner;
xrelex_init(&scanner);
- YY_BUFFER_STATE bs = xre_scan_string(startptr,scanner);
+ YY_BUFFER_STATE bs = xre_scan_string(startptr_,scanner);
int parse_retval = xreparse(scanner);
xre_delete_buffer(bs,scanner);
xrelex_destroy(scanner);
- free(startptr);
+ free(startptr_);
data = 0;
len = 0;
if (parse_retval == 0 && !contains_only_comments) // if (yynerrs == 0)
@@ -480,23 +484,26 @@ HfstTransducer*
compile_first(const string& xre, map<string,HfstTransducer*>& defs,
map<string, string>& func_defs,
map<string, unsigned int > func_args,
+ map<string, std::set<string> >& lists,
ImplementationType impl,
unsigned int & chars_read)
{
// lock here?
data = strdup(xre.c_str());
- startptr = data;
+ // use an internal variable startptr_ instead of global startptr
+ char * startptr_ = data;
len = strlen(data);
definitions = defs;
function_definitions = func_defs;
function_arguments = func_args;
+ symbol_lists = lists;
format = impl;
contains_only_comments = false;
yyscan_t scanner;
xrelex_init(&scanner);
- YY_BUFFER_STATE bs = xre_scan_string(startptr,scanner);
+ YY_BUFFER_STATE bs = xre_scan_string(startptr_,scanner);
bool tmp = hfst::xre::allow_extra_text_at_end;
hfst::xre::allow_extra_text_at_end = true;
@@ -509,7 +516,8 @@ compile_first(const string& xre, map<string,HfstTransducer*>& defs,
xre_delete_buffer(bs,scanner);
xrelex_destroy(scanner);
- free(startptr);
+ free(startptr_);
+
data = 0;
len = 0;
if (parse_retval == 0 && !contains_only_comments) // if (yynerrs == 0)
@@ -725,30 +733,49 @@ xfst_label_to_transducer(const char* input, const char* output)
{
HfstTransducer * retval = NULL;
- // non-matching definitions
- if ( (is_definition(input) || is_definition(output)) &&
- strcmp(input, output) != 0 )
+ bool input_is_definition = is_definition(input);
+ bool output_is_definition = is_definition(output);
+ bool input_is_unknown = (strcmp(input, hfst::internal_unknown.c_str()) == 0);
+ bool output_is_unknown = (strcmp(output, hfst::internal_unknown.c_str()) == 0);
+
+ // definitions -> use cross-product
+ if (input_is_definition || output_is_definition)
{
- // TODO, FIX:
- //char msg[256];
- //sprintf(msg, "invalid use of definitions in label %s:%s",
- // get_print_format(input), get_print_format(output));
- //yyerror(msg);
+ HfstTransducer * tmp = NULL; // temporary transducer for cross-product calculation
+ if (input_is_unknown)
+ {
+ retval = new HfstTransducer(hfst::internal_identity, hfst::xre::format);
+ tmp = expand_definition(output);
+ }
+ else if (output_is_unknown)
+ {
+ tmp = new HfstTransducer(hfst::internal_identity, hfst::xre::format);
+ retval = expand_definition(input);
+ }
+ else // neither is unknown
+ {
+ retval = expand_definition(input);
+ tmp = expand_definition(output);
+ }
+ retval->cross_product(*tmp);
+ delete tmp;
+ return retval;
}
- if (strcmp(input, hfst::internal_unknown.c_str()) == 0 &&
- strcmp(output, hfst::internal_unknown.c_str()) == 0)
+
+ // no definitions
+ if (input_is_unknown && output_is_unknown)
{
retval = new HfstTransducer(hfst::internal_unknown, hfst::internal_unknown, hfst::xre::format);
HfstTransducer id(hfst::internal_identity, hfst::internal_identity, hfst::xre::format);
retval->disjunct(id).minimize();
}
- else if (strcmp(input, hfst::internal_unknown.c_str()) == 0)
+ else if (input_is_unknown)
{
retval = new HfstTransducer(hfst::internal_unknown, output, hfst::xre::format);
HfstTransducer output_tr(output, output, hfst::xre::format);
retval->disjunct(output_tr).minimize();
}
- else if (strcmp(output, hfst::internal_unknown.c_str()) == 0)
+ else if (output_is_unknown)
{
retval = new HfstTransducer(input, hfst::internal_unknown, hfst::xre::format);
HfstTransducer input_tr(input, input, hfst::xre::format);
@@ -758,10 +785,6 @@ xfst_label_to_transducer(const char* input, const char* output)
{
retval = new HfstTransducer(input, output, hfst::xre::format);
}
-
- if (is_definition(input))
- retval = expand_definition(input); // changed
-
return retval;
}
@@ -914,6 +937,16 @@ xfst_label_to_transducer(const char* input, const char* output)
return retval;
}
+ HfstTransducer * merge_first_to_second(HfstTransducer * tr1, HfstTransducer * tr2) // merges tr1 into tr2 and returns tr2
+ {
+ // Merge operation creates an XreCompiler that needs this information below. Otherwise, it will overwrite all this.
+ struct XreConstructorArguments args(hfst::xre::definitions, hfst::xre::function_definitions, hfst::xre::function_arguments, hfst::xre::symbol_lists, hfst::xre::format);
+
+ tr1->minimize(); // note: minimizes the caller's tr1 in place
+ tr2->merge(*tr1, args);
+ return tr2; // the same object that was passed in, not a copy; tr1 is not freed here
+ }
+
void warn(const char * msg)
{
if (!verbose_)
diff --git a/libhfst/src/parsers/xre_utils.h b/libhfst/src/parsers/xre_utils.h
index f90c579..f8a9226 100644
--- a/libhfst/src/parsers/xre_utils.h
+++ b/libhfst/src/parsers/xre_utils.h
@@ -19,6 +19,7 @@ extern size_t len;
extern std::map<std::string,hfst::HfstTransducer*> definitions;
extern std::map<std::string,std::string> function_definitions;
extern std::map<std::string,unsigned int> function_arguments;
+extern std::map<std::string, std::set<std::string> > symbol_lists;
extern HfstTransducer* last_compiled;
extern bool contains_only_comments;
extern ImplementationType format;
@@ -79,6 +80,7 @@ HfstTransducer* compile(const std::string& xre,
std::map<std::string,hfst::HfstTransducer*>& defs,
std::map<std::string,std::string>& func_defs,
std::map<std::string,unsigned int> func_args,
+ std::map<std::string, std::set<std::string> >& lists,
hfst::ImplementationType type);
/**
@@ -88,6 +90,7 @@ HfstTransducer* compile_first(const std::string& xre,
std::map<std::string,hfst::HfstTransducer*>& defs,
std::map<std::string,std::string>& func_defs,
std::map<std::string,unsigned int> func_args,
+ std::map<std::string, std::set<std::string> >& lists,
hfst::ImplementationType type,
unsigned int & chars_read);
@@ -125,6 +128,8 @@ bool is_valid_function_call(const char * name, const std::vector<HfstTransducer>
HfstTransducer * contains_once_optional(const HfstTransducer * t);
+ HfstTransducer * merge_first_to_second(HfstTransducer * tr1, HfstTransducer * tr2);
+
void warn(const char * msg);
void warn_about_special_symbols_in_replace(HfstTransducer *t);
/* Warn about \a symbol if it is of form "@_.*_@" and verbose mode is on. */
diff --git a/test/tools/Makefile.am b/test/tools/Makefile.am
index 794ade4..2dc396f 100644
--- a/test/tools/Makefile.am
+++ b/test/tools/Makefile.am
@@ -299,6 +299,7 @@ LEXC_TXTS=basic.cat-dog-bird.lexc basic.colons.lexc basic.comments.lexc \
basic.two-lexicons.lexc basic.UTF-8.lexc basic.zeros-epsilons.lexc \
cat.lexc hfst.weights.lexc stress.random-lexicons-100.lexc \
xfail.bogus.lexc xfail.ISO-8859-1.lexc xfail.lexicon-semicolon.lexc \
+ xfail.sublexicon-defined-more-than-once.lexc \
xre.any-variations.lexc xre.at-file.lexc \
xre.automatic-multichar-symbols.lexc xre.basic.lexc \
xre.definitions.lexc xre.months.lexc xre.nested-definitions.lexc \
@@ -306,7 +307,8 @@ LEXC_TXTS=basic.cat-dog-bird.lexc basic.colons.lexc basic.comments.lexc \
xre.star-plus-optional.lexc basic.lowercase-lexicon-end.lexc \
basic.multichar-flag-with-zero.lexc basic.almost-reserved-words.lexc \
basic.regexps.lexc no-newline-before-sublexicon.lexc \
- warn.sublexicon-mentioned-but-not-defined.lexc
+ warn.sublexicon-mentioned-but-not-defined.lexc \
+ warn.one-sided-flags.lexc
#XFST_TXTS=cat-regex.xfst
ALL_SRCS=$(FST_TXTS) $(FST_STRINGS) $(FST_PAIRS) $(FST_PAIRSTRINGS) \
@@ -412,7 +414,9 @@ xre.star-plus-optional.lexc.result \
no-newline-before-sublexicon.lexc.flag.result \
no-newline-before-sublexicon.lexc.result \
warn.sublexicon-mentioned-but-not-defined.lexc.flag.result \
-warn.sublexicon-mentioned-but-not-defined.lexc.result
+warn.sublexicon-mentioned-but-not-defined.lexc.result \
+warn.one-sided-flags.lexc.flag.result \
+warn.one-sided-flags.lexc.result
# files needed for test programs
EXTRA_DIST=empty-file $(FST_TXTS) $(FST_STRINGS) $(FST_PAIRS) $(FST_PAIRSTRINGS) $(FST_SPACESTRINGS) $(SUBSTITUTE_TXTS) $(XRE_TXTS) $(XFST_TXTS) $(TESTS) $(EXTRA_FILES) $(LEXC_TXTS) $(PMATCH_TXTS) $(PMATCHSCRIPTS) script.xfst lexc2fst-stress.sh lookup-stress.sh proc-stress.sh valgrind.sh \
diff --git a/test/tools/lexc-compiler-functionality.sh b/test/tools/lexc-compiler-functionality.sh
index 25883bc..20d8ff8 100755
--- a/test/tools/lexc-compiler-functionality.sh
+++ b/test/tools/lexc-compiler-functionality.sh
@@ -30,9 +30,9 @@ LEXCTESTS="basic.cat-dog-bird.lexc basic.colons.lexc basic.comments.lexc
-LEXCXFAIL="xfail.bogus.lexc xfail.ISO-8859-1.lexc xfail.lexicon-semicolon.lexc"
+LEXCXFAIL="xfail.bogus.lexc xfail.ISO-8859-1.lexc xfail.lexicon-semicolon.lexc xfail.sublexicon-defined-more-than-once.lexc"
-LEXCWARN="warn.sublexicon-mentioned-but-not-defined.lexc"
+LEXCWARN="warn.sublexicon-mentioned-but-not-defined.lexc warn.one-sided-flags.lexc"
if test "$srcdir" = ""; then
srcdir="./"
diff --git a/test/tools/warn.one-sided-flags.lexc b/test/tools/warn.one-sided-flags.lexc
new file mode 100644
index 0000000..5183fd5
--- /dev/null
+++ b/test/tools/warn.one-sided-flags.lexc
@@ -0,0 +1,7 @@
+Multichar_Symbols
+ @U.FOO.ON@
+ @U.FOO.OFF@
+
+LEXICON Root
+@U.FOO.ON@foo:bar # ;
+# ;
diff --git a/test/tools/warn.one-sided-flags.lexc.flag.result b/test/tools/warn.one-sided-flags.lexc.flag.result
new file mode 100644
index 0000000..fb4d840
Binary files /dev/null and b/test/tools/warn.one-sided-flags.lexc.flag.result differ
diff --git a/test/tools/warn.one-sided-flags.lexc.result b/test/tools/warn.one-sided-flags.lexc.result
new file mode 100644
index 0000000..4250076
Binary files /dev/null and b/test/tools/warn.one-sided-flags.lexc.result differ
diff --git a/test/tools/xfail.sublexicon-defined-more-than-once.lexc b/test/tools/xfail.sublexicon-defined-more-than-once.lexc
new file mode 100644
index 0000000..aa21b15
--- /dev/null
+++ b/test/tools/xfail.sublexicon-defined-more-than-once.lexc
@@ -0,0 +1,15 @@
+LEXICON Root
+Noun ;
+Verb ;
+
+LEXICON Noun
+cat #;
+dog #;
+
+LEXICON Verb
+mew #;
+bark #;
+
+LEXICON Noun
+cat #;
+dog #;
diff --git a/tools/src/HfstStrings2FstTokenizer.cc b/tools/src/HfstStrings2FstTokenizer.cc
index ef59041..5ea13ab 100644
--- a/tools/src/HfstStrings2FstTokenizer.cc
+++ b/tools/src/HfstStrings2FstTokenizer.cc
@@ -7,8 +7,10 @@ HfstStrings2FstTokenizer::HfstStrings2FstTokenizer
eps(eps)
{
// \: \\ \<space> and eps are special cases.
- add_multichar_symbol( eps );
-
+ if (!eps.empty())
+ {
+ add_multichar_symbol( eps );
+ }
tokenizer.add_multichar_symbol( BACKSLASH COL );
tokenizer.add_multichar_symbol( BACKSLASH SPACE );
@@ -122,10 +124,10 @@ StringPairVector HfstStrings2FstTokenizer::make_pair_vector
std::string output_symbol = unescape(*output_it);
spv.push_back
- (StringPair(input_symbol.empty() or input_symbol == eps ?
- EPSILON_SYMBOL : input_symbol,
- output_symbol.empty() or output_symbol == eps ?
- EPSILON_SYMBOL : output_symbol));
+ (StringPair(input_symbol.empty() or input_symbol == eps ?
+ EPSILON_SYMBOL : input_symbol,
+ output_symbol.empty() or output_symbol == eps ?
+ EPSILON_SYMBOL : output_symbol));
++input_it;
++output_it;
}
@@ -173,7 +175,7 @@ std::string HfstStrings2FstTokenizer::unescape(std::string symbol)
pos = 0;
while ((pos = symbol.find(TAB_ESCAPE)) != std::string::npos)
- { symbol.replace(pos,strlen(TAB_ESCAPE)," "); }
+ { symbol.replace(pos,strlen(TAB_ESCAPE)," "); }
pos = 0;
while ((pos = symbol.find(COL_ESCAPE)) != std::string::npos)
@@ -277,7 +279,7 @@ void test_ps
it != spv.end();
++it)
{
- if (it->first != it->second)
+ if (it->first != it->second)
{ std::cout << it->first << " : " << it->second << std::endl; }
else
{ std::cout << it->first << std::endl; }
@@ -295,7 +297,7 @@ void test_sp
it != spv.end();
++it)
{
- if (it->first != it->second)
+ if (it->first != it->second)
{ std::cout << it->first << " : " << it->second << std::endl; }
else
{ std::cout << it->first << std::endl; }
diff --git a/tools/src/hfst-compose.cc b/tools/src/hfst-compose.cc
index f6fec45..86c7208 100644
--- a/tools/src/hfst-compose.cc
+++ b/tools/src/hfst-compose.cc
@@ -139,8 +139,11 @@ compose_streams(HfstInputStream& firststream, HfstInputStream& secondstream,
size_t transducer_n_first = 0; // transducers read from first stream
size_t transducer_n_second = 0; // transducers read from second stream
while (continueReading) {
- first = new HfstTransducer(firststream);
- transducer_n_first++;
+ if (firststream.is_good())
+ {
+ first = new HfstTransducer(firststream);
+ transducer_n_first++;
+ }
if (secondstream.is_good())
{
second = new HfstTransducer(secondstream);
@@ -194,19 +197,42 @@ compose_streams(HfstInputStream& firststream, HfstInputStream& secondstream,
hfst_strformat(secondstream.get_type()));
}
- continueReading = firststream.is_good() &&
- (secondstream.is_good() || transducer_n_second == 1);
+ continueReading =
+ (firststream.is_good() && secondstream.is_good()) ||
+ (firststream.is_good() && (transducer_n_second == 1)) ||
+ ((transducer_n_first == 1) && secondstream.is_good());
- delete first;
- first=0;
- // delete the transducer of second stream, unless we continue reading
- // the first stream and there is only one transducer in the second
- // stream
- if ((continueReading && secondstream.is_good()) || not continueReading)
+ if (!continueReading)
{
+ delete first;
delete second;
- second=0;
}
+ else
+ {
+ if (firststream.is_good())
+ {
+ delete first;
+ }
+ if (secondstream.is_good())
+ {
+ delete second;
+ }
+ }
+
+ //continueReading = firststream.is_good() &&
+ // (secondstream.is_good() || transducer_n_second == 1);
+
+ //delete first;
+ //first=0;
+ // delete the transducer of second stream, unless we continue reading
+ // the first stream and there is only one transducer in the second
+ // stream
+ //if ((continueReading && secondstream.is_good()) || not continueReading)
+ // {
+ // delete second;
+ // second=0;
+ // }
+
free(firstname);
free(secondname);
}
@@ -221,12 +247,14 @@ compose_streams(HfstInputStream& firststream, HfstInputStream& secondstream,
}
if (secondstream.is_good())
- {
- error(EXIT_FAILURE, 0,
- "first input '%s' contains fewer transducers than second input"
- " '%s'",
- firstfilename, secondfilename);
- }
+ {
+ error(EXIT_FAILURE, 0,
+ "first input '%s' contains fewer transducers than second input"
+ " '%s'; this is only possible if the first input contains"
+ " exactly one transducer",
+ firstfilename, secondfilename);
+ }
+
firststream.close();
secondstream.close();
outstream.close();
diff --git a/tools/src/hfst-fst2strings.cc b/tools/src/hfst-fst2strings.cc
index 5d7a985..b2a9ae3 100644
--- a/tools/src/hfst-fst2strings.cc
+++ b/tools/src/hfst-fst2strings.cc
@@ -563,8 +563,8 @@ process_stream(HfstInputStream& instream, std::ostream& outstream)
verbose_printf("Printed %i random string(s)\n", cb.count);
}
- if (print_separator_after_each_transducer)
- outstream << "--" << std::endl;
+ //if (print_separator_after_each_transducer)
+ // outstream << "--" << std::endl;
}
instream.close();
diff --git a/tools/src/hfst-lexc-compiler.cc b/tools/src/hfst-lexc-compiler.cc
index bcf6c40..aa64185 100644
--- a/tools/src/hfst-lexc-compiler.cc
+++ b/tools/src/hfst-lexc-compiler.cc
@@ -60,6 +60,8 @@ static bool minimize_flags = false;
static bool rename_flags = false;
static bool treat_warnings_as_errors = false;
static bool xerox_composition = true; // Compatibility with Xerox tools is the default
+static bool encode_weights = false;
+static bool enc = false;
void
print_usage()
@@ -73,6 +75,7 @@ print_usage()
" -f, --format=FORMAT compile into FORMAT transducer\n"
" -o, --output=OUTFILE write result into OUTFILE\n");
fprintf(message_out, "Lexc options:\n"
+ " -E, --encode-weights encode weights when minimizing (default is false)\n"
" -F, --withFlags use flags to hyperminimize result\n"
" -M, --minimizeFlags if --withFlags is used, minimize the number of flags\n"
" -R, --renameFlags if --withFlags and --minimizeFlags are used, rename\n"
@@ -122,6 +125,7 @@ parse_options(int argc, char** argv)
static const struct option long_options[] =
{
HFST_GETOPT_COMMON_LONG,
+ {"encode-weights", no_argument, 0, 'E'},
{"format", required_argument, 0, 'f'},
{"output", required_argument, 0, 'o'},
{"withFlags", no_argument, 0, 'F'},
@@ -134,7 +138,7 @@ parse_options(int argc, char** argv)
};
int option_index = 0;
char c = getopt_long(argc, argv, HFST_GETOPT_COMMON_SHORT
- "f:o:FMRx:X:W",
+ "Ef:o:FMRx:X:W",
long_options, &option_index);
if (-1 == c)
{
@@ -143,6 +147,9 @@ parse_options(int argc, char** argv)
switch (c)
{
#include "inc/getopt-cases-common.h"
+ case 'E':
+ encode_weights = true;
+ break;
case 'f':
format = hfst_parse_format_name(optarg);
break;
@@ -276,6 +283,12 @@ lexc_streams(LexcCompiler& lexc, HfstOutputStream& outstream)
verbose_printf("done\n");
delete res;
outstream.close();
+
+ if (encode_weights)
+ {
+ hfst::set_encode_weights(enc);
+ }
+
return EXIT_SUCCESS;
}
@@ -304,6 +317,13 @@ int main( int argc, char **argv ) {
{
fclose(outfile);
}
+
+ enc = hfst::get_encode_weights();
+ if (encode_weights)
+ {
+ hfst::set_encode_weights(true);
+ }
+
verbose_printf("Reading from ");
for (unsigned int i = 0; i < lexccount; i++)
{
@@ -321,11 +341,11 @@ int main( int argc, char **argv ) {
// lexc.with_flags_ = with_flags;
if (silent)
{
- lexc.setVerbosity(false);
+ lexc.setVerbosity(0);
}
else
{
- lexc.setVerbosity(verbose);
+ lexc.setVerbosity(verbose ? 2 : 1);
}
if (treat_warnings_as_errors)
{
diff --git a/tools/src/hfst-pmatch.cc b/tools/src/hfst-pmatch.cc
index cf223b9..d9e3d1d 100644
--- a/tools/src/hfst-pmatch.cc
+++ b/tools/src/hfst-pmatch.cc
@@ -51,6 +51,7 @@ using std::pair;
bool blankline_separated = true;
bool extract_tags = false;
bool locate_mode = false;
+bool profile = false;
std::string pmatch_filename;
void
@@ -64,7 +65,8 @@ print_usage()
fprintf(message_out,
" -n --newline Newline as input separator (default is blank line)\n"
" -x --extract-tags Only print tagged parts in output\n"
- " -l --locate Only print locations of matches\n");
+ " -l --locate Only print locations of matches\n"
+ " -p --profile Produce profiling data\n");
fprintf(message_out,
"Use standard streams for input and output.\n"
"\n"
@@ -121,6 +123,9 @@ int process_input(hfst_ol::PmatchContainer & container,
if (blankline_separated && !input_text.empty()) {
match_and_print(container, outstream, input_text);
}
+ if (profile) {
+ outstream << "\n" << container.get_profiling_info() << "\n";
+ }
return EXIT_SUCCESS;
}
@@ -137,6 +142,7 @@ int parse_options(int argc, char** argv)
{"newline", no_argument, 0, 'n'},
{"extract-tags", no_argument, 0, 'x'},
{"locate", no_argument, 0, 'l'},
+ {"profile", no_argument, 0, 'p'},
{0,0,0,0}
};
int option_index = 0;
@@ -160,6 +166,9 @@ int parse_options(int argc, char** argv)
case 'l':
locate_mode = true;
break;
+ case 'p':
+ profile = true;
+ break;
#include "inc/getopt-cases-error.h"
}
@@ -212,7 +221,11 @@ int main(int argc, char ** argv)
std::cerr << "Could not open file " << pmatch_filename << std::endl;
return EXIT_FAILURE;
}
- hfst_ol::PmatchContainer container(instream, verbose, extract_tags);
+ hfst_ol::PmatchContainer container(instream);
+ container.set_verbose(verbose);
+// the locate_mode bool in this tool only affects its own processing
+ container.set_locate_mode(extract_tags);
+ container.set_profile(profile);
// if (outfile != stdout) {
// std::filebuf fb;
// fb.open(outfilename, std::ios::out);
diff --git a/tools/src/hfst-proc2.cc b/tools/src/hfst-proc2.cc
index 46c2e9a..8a257f4 100644
--- a/tools/src/hfst-proc2.cc
+++ b/tools/src/hfst-proc2.cc
@@ -213,7 +213,8 @@ int main(int argc, char ** argv)
std::cerr << "Could not open file " << tokenizer_filename << std::endl;
return EXIT_FAILURE;
}
- hfst_ol::PmatchContainer container(instream, verbose);
+ hfst_ol::PmatchContainer container(instream);
+ container.set_verbose(verbose);
// if (outfile != stdout) {
// std::filebuf fb;
// fb.open(outfilename, std::ios::out);
diff --git a/tools/src/hfst-summarize.cc b/tools/src/hfst-summarize.cc
index 63ef7bd..ec4b1a5 100644
--- a/tools/src/hfst-summarize.cc
+++ b/tools/src/hfst-summarize.cc
@@ -54,6 +54,8 @@ using hfst::StringSet;
#include "inc/globals-unary.h"
// add tools-specific variables here
+static bool print_symbol_pair_statistics = false;
+static int symbol_pair_threshold = -1;
void
print_usage()
@@ -66,6 +68,9 @@ print_usage()
print_common_program_options(message_out);
print_common_unary_program_options(message_out);
// fprintf(message_out, (tool-specific options and short descriptions)
+ fprintf(message_out, "Summarize options:\n");
+ fprintf(message_out, " -p, --print-symbol-pair-statistics=N Print info about symbol pairs that occur\n");
+ fprintf(message_out, " at most N times (default is infinity)\n");
fprintf(message_out, "\n");
print_common_unary_program_parameter_instructions(message_out);
fprintf(message_out, "\n");
@@ -89,13 +94,14 @@ parse_options(int argc, char** argv)
{
HFST_GETOPT_COMMON_LONG,
HFST_GETOPT_UNARY_LONG,
- // add tool-specific options here
- {0,0,0,0}
+ // add tool-specific options here
+ {"print-symbol-pair-statistics", optional_argument, 0, 'S'},
+ {0,0,0,0}
};
int option_index = 0;
// add tool-specific options here
char c = getopt_long(argc, argv, HFST_GETOPT_COMMON_SHORT
- HFST_GETOPT_UNARY_SHORT,
+ HFST_GETOPT_UNARY_SHORT "S::",
long_options, &option_index);
if (-1 == c)
{
@@ -107,6 +113,25 @@ parse_options(int argc, char** argv)
#include "inc/getopt-cases-common.h"
#include "inc/getopt-cases-unary.h"
// add tool-specific cases here
+ case 'S':
+ print_symbol_pair_statistics = true ;
+ if (optarg != NULL)
+ {
+ if (optarg[0] == '=')
+ {
+ optarg++;
+ }
+ symbol_pair_threshold = hfst_strtoul(optarg, 10);
+ if (symbol_pair_threshold < 0)
+ {
+ error(EXIT_FAILURE, 0, "%u is not a valid argument for option --print-symbol-pair-statistics\n", symbol_pair_threshold);
+ }
+ if (symbol_pair_threshold == 0)
+ {
+ error(EXIT_FAILURE, 0, "0 is not a valid argument for option --print-symbol-pair-statistics\n");
+ }
+ }
+ break;
#include "inc/getopt-cases-error.h"
}
}
@@ -208,6 +233,8 @@ process_stream(HfstInputStream& instream)
is_mutable = false;
break;
}
+
+ std::map<std::pair<std::string, std::string>,unsigned int> symbol_pairs;
// iterate states in random order
HfstState source_state=0;
for (HfstBasicTransducer::const_iterator it = mutt->begin();
@@ -232,6 +259,13 @@ process_stream(HfstInputStream& instream)
arcs_here++;
foundAlphabet.insert(tr_it->get_input_symbol());
foundAlphabet.insert(tr_it->get_output_symbol());
+
+ // ADDED
+ if (print_symbol_pair_statistics)
+ {
+ symbol_pairs[std::pair<std::string,std::string>(tr_it->get_input_symbol(), tr_it->get_output_symbol())]++;
+ }
+
if (tr_it->get_input_symbol() != tr_it->get_output_symbol())
{
acceptor = false;
@@ -521,6 +555,27 @@ process_stream(HfstInputStream& instream)
}
fprintf(outfile, "\n");
}
+ // ADDED
+ if (print_symbol_pair_statistics)
+ {
+ if (symbol_pair_threshold > -1)
+ {
+ fprintf(outfile, "symbol pairs that occur at most %u times:\n", symbol_pair_threshold);
+ }
+ else
+ {
+ fprintf(outfile, "symbol pairs:\n");
+ }
+ for (std::map<std::pair<std::string,std::string>,unsigned int>::const_iterator it = symbol_pairs.begin(); it != symbol_pairs.end(); it++)
+ {
+ if (it->second <= symbol_pair_threshold)
+ {
+ fprintf(outfile, "%s:%s\t%u\n", it->first.first.c_str(), it->first.second.c_str(), it->second);
+ }
+ }
+ fprintf(outfile, "\n");
+ }
+
delete trans;
}
}
diff --git a/tools/src/parsers/XfstCompiler.cc b/tools/src/parsers/XfstCompiler.cc
index dae3c9e..31344e3 100644
--- a/tools/src/parsers/XfstCompiler.cc
+++ b/tools/src/parsers/XfstCompiler.cc
@@ -20,11 +20,9 @@
#include <string>
#include <map>
-#include <list>
#include <queue>
#include <stack>
-using std::list;
using std::string;
using std::map;
using std::queue;
@@ -999,13 +997,13 @@ namespace xfst {
{
hfst_fprintf(errorstream_, "unsupported unicode range %s-%s\n", start, end);
}
- list<string> l;
+ std::set<string> l;
for (char c = *start; c < *end; c++)
{
char *s = static_cast<char*>(malloc(sizeof(char)*2));
*s = c;
*(s+1) = '\0';
- l.push_back(s);
+ l.insert(s);
}
lists_[name] = l;
return *this;
@@ -1022,16 +1020,17 @@ namespace xfst {
MAYBE_QUIT;
PROMPT_AND_RETURN_THIS;
}
- list<string> l;
+ std::set<string> l;
char* p = strdup(values);
char* token = strtok(p, " ");
while (token != NULL)
{
- l.push_back(token);
+ l.insert(token);
token = strtok(NULL, " ");
}
free(p);
lists_[name] = l;
+ xre_.define_list(name, l);
PROMPT_AND_RETURN_THIS;
}
@@ -2528,31 +2527,43 @@ namespace xfst {
XfstCompiler&
XfstCompiler::print_list(const char* name, FILE* outfile)
{
- list<string> l = lists_[name];
- hfst_fprintf(outfile, "%10s:", name);
- for (list<string>::const_iterator s = l.begin();
+ if (lists_.find(name) == lists_.end())
+ {
+ hfst_fprintf(outfile, "No such list defined: %s\n", name);
+ PROMPT_AND_RETURN_THIS;
+ }
+ std::set<string> l = lists_[name];
+ hfst_fprintf(outfile, "%10s: ", name);
+ for (std::set<string>::const_iterator s = l.begin();
s != l.end();
++s)
{
hfst_fprintf(outfile, "%s ", s->c_str());
}
+ hfst_fprintf(outfile, "\n");
PROMPT_AND_RETURN_THIS;
}
XfstCompiler&
XfstCompiler::print_list(FILE* outfile)
{
- for (map<string,list<string> >::const_iterator l = lists_.begin();
+ if (lists_.size() == 0)
+ {
+ hfst_fprintf(outfile, "No lists defined.\n");
+ PROMPT_AND_RETURN_THIS;
+ }
+ for (map<string,std::set<string> >::const_iterator l = lists_.begin();
l != lists_.end();
++l)
{
- hfst_fprintf(outfile, "%10s:", l->first.c_str());
- for (list<string>::const_iterator s = l->second.begin();
+ hfst_fprintf(outfile, "%10s: ", l->first.c_str());
+ for (std::set<string>::const_iterator s = l->second.begin();
s != l->second.end();
++s)
{
hfst_fprintf(outfile, "%s ", s->c_str());
}
+ hfst_fprintf(outfile, "\n");
}
PROMPT_AND_RETURN_THIS;
}
@@ -3585,12 +3596,6 @@ namespace xfst {
switch (operation)
{
case INTERSECT_NET:
- /*{
- HfstBasicTransducer basic(*t);
- HfstBasicTransducer merge_tr = HfstBasicTransducer::merge(fsm, basic);
- fprintf(stderr, "result of merge is:\n");
- merge_tr.write_in_att_format(stderr);
- }*/
result->intersect(*t);
break;
case IGNORE_NET:
@@ -4183,6 +4188,7 @@ namespace xfst {
PROMPT_AND_RETURN_THIS;
}
+ // Returns an automaton that contains one ore more "^[" "^]" expressions.
static HfstTransducer * contains_regexps(hfst::xre::XreCompiler & xre_)
{
HfstTransducer * not_bracket_star = xre_.compile("[? - \"^[\" - \"^]\"]* ;");
@@ -4194,6 +4200,22 @@ namespace xfst {
return well_formed;
}
+ static HfstTransducer * contains_regexp_markers_on_one_side(hfst::xre::XreCompiler & xre_, bool input_side)
+ {
+ HfstTransducer * retval = NULL;
+ if (input_side)
+ {
+ retval = xre_.compile("[?:?|0:?|?:0]* [\"^[\":? | \"^]\":? | \"^[\":0 | \"^]\":0] [?:?|0:?|?:0]*");
+ }
+ else // output side
+ {
+ retval = xre_.compile("[?:?|0:?|?:0]* [?:\"^[\" | ?:\"^]\" | 0:\"^[\" | 0:\"^]\"] [?:?|0:?|?:0]*");
+ }
+ assert(retval != NULL);
+ return retval;
+ }
+
+ // @pre \a t must be an automaton
static bool is_well_formed_for_compile_replace(const HfstTransducer * t, hfst::xre::XreCompiler & xre_)
{
HfstTransducer * well_formed = contains_regexps(xre_);
@@ -4212,23 +4234,65 @@ namespace xfst {
return value;
}
- static std::string to_literal_regexp(const hfst::StringPairVector & path)
+ static std::string to_literal_regexp(const hfst::StringPairVector & path, bool input_side)
{
std::string pathstr("[ ");
for (hfst::StringPairVector::const_iterator it = path.begin(); it != path.end(); it++)
{
- pathstr.append("\"").append(it->first).append("\" ");
+ std::string symbol = (input_side) ? it->first : it->second ;
+ pathstr.append("\"").append(symbol).append("\" ");
}
pathstr.append("]");
return pathstr;
}
- static std::string to_regexp(const hfst::StringPairVector & path)
+ static HfstTransducer * to_literal_transducer(const hfst::StringPairVector & path, hfst::xre::XreCompiler & xre_)
{
std::string pathstr("[ ");
for (hfst::StringPairVector::const_iterator it = path.begin(); it != path.end(); it++)
{
- pathstr.append(it->first).append(" ");
+ std::string isymbol = it->first;
+ if (isymbol == hfst::internal_epsilon)
+ {
+ pathstr.append("0");
+ }
+ else
+ {
+ pathstr.append("\"").append(isymbol).append("\"");
+ }
+
+ pathstr.append(":");
+
+ std::string osymbol = it->second;
+ if (osymbol == hfst::internal_epsilon)
+ {
+ pathstr.append("0 ");
+ }
+ else
+ {
+ pathstr.append("\"").append(osymbol).append("\" ");
+ }
+ }
+ pathstr.append("];");
+ // debug
+ //std::cerr << "to_literal_transducer: compiling expression: " << pathstr << std::endl;
+ char * p = strdup(pathstr.c_str());
+ HfstTransducer * retval = xre_.compile(p);
+ free(p);
+ return retval;
+ }
+
+ static std::string to_regexp(const hfst::StringPairVector & path, bool input_side)
+ {
+ std::string pathstr("[ ");
+ for (hfst::StringPairVector::const_iterator it = path.begin(); it != path.end(); it++)
+ {
+ std::string symbol = (input_side) ? it->first : it->second ;
+ // ignore "^[" and "^]"
+ if (symbol != "^]" && symbol != "^[")
+ {
+ pathstr.append(symbol).append(" ");
+ }
}
pathstr.append("]");
return pathstr;
@@ -4243,18 +4307,35 @@ namespace xfst {
using hfst::implementations::HfstReplacementsMap;
GET_TOP(tmp);
- if (is_well_formed_for_compile_replace(tmp, xre_))
+ HfstTransducer tmp_cp(*tmp);
+
+ if (level == UPPER_LEVEL)
+ {
+ tmp_cp.input_project();
+ }
+ else // LOWER_LEVEL
+ {
+ tmp_cp.output_project();
+ }
+
+ if (is_well_formed_for_compile_replace(&tmp_cp, xre_))
{
- fprintf(stderr, "Network is well-formed.\n");
+ if (verbose_)
+ fprintf(stderr, "Network is well-formed.\n");
}
else
{
- fprintf(stderr, "Network is not well-formed.\n");
+ if (verbose_)
+ fprintf(stderr, "Network is not well-formed.\n");
+ xfst_lesser_fail();
+ prompt();
+ return *this;
}
+
HfstBasicTransducer fsm(*tmp);
try
{
- HfstReplacementsMap replacement_map = fsm.find_replacements();
+ HfstReplacementsMap replacement_map = fsm.find_replacements((level == UPPER_LEVEL)); // input_side
for (HfstReplacementsMap::const_iterator it = replacement_map.begin();
it != replacement_map.end(); it++)
@@ -4268,22 +4349,54 @@ namespace xfst {
std::string CPR(""); // Cross-Product Regexp
if (level == LOWER_LEVEL)
{
- CPR = to_literal_regexp(rit->second) + std::string(" .x. ") + to_regexp(rit->second);
- CPR = std::string("\"^[\":0") + std::string(" [") + CPR + std::string("] ") + std::string("\"^]\":0 ;");
+ CPR = to_literal_regexp(rit->second, false /*output side*/) + std::string(" .x. ") + to_regexp(rit->second, false /*output side*/);
+ //CPR = std::string("\"^[\":0") + std::string(" [") + CPR + std::string("] ") + std::string("\"^]\":0 ;");
+ CPR = std::string("[") + CPR + std::string("] ;");
}
- else
+ else // UPPER_LEVEL
{
- CPR = to_regexp(rit->second) + std::string(" .x. ") + to_literal_regexp(rit->second);
- CPR = std::string("0:\"^[\"") + std::string(" [") + CPR + std::string("] ") + std::string("0:\"^]\" ;");
+ CPR = to_regexp(rit->second, true /*input side*/) + std::string(" .x. ") + to_literal_regexp(rit->second, true /*input side*/);
+ //CPR = std::string("0:\"^[\"") + std::string(" [") + CPR + std::string("] ") + std::string("0:\"^]\" ;");
+ CPR = std::string("[") + CPR + std::string("] ;");
}
char * cpr = strdup(CPR.c_str());
- fprintf(stderr, "compiling replacement '%s'...\n", cpr);
+ // debug
+ //fprintf(stderr, "compiling replacement '%s'...\n", cpr);
HfstTransducer * replacement = xre_.compile(cpr);
- assert(replacement != NULL); // todo
+ if (replacement == NULL)
+ {
+ fprintf(stderr, "Could not compile regular expression in compile-replace: %s.\n", cpr);
+ xfst_lesser_fail();
+ prompt();
+ return *this;
+ }
replacement->minimize();
+
+ // debug
+ //std::cerr << "replacement is:" << std::endl << *replacement << std::endl;
+
+ // compose with opposite level
+ if (level == UPPER_LEVEL)
+ {
+ HfstTransducer * original_path = to_literal_transducer(rit->second, xre_);
+ original_path->minimize();
+ replacement->compose(*original_path);
+ delete original_path;
+ replacement->minimize();
+ }
+ else // LOWER_LEVEL
+ {
+ HfstTransducer * original_path = to_literal_transducer(rit->second, xre_);
+ original_path->minimize();
+ original_path->compose(*replacement);
+ original_path->minimize();
+ delete replacement;
+ replacement = original_path;
+ }
+
HfstBasicTransducer repl(*replacement);
// DEBUG
- std::cerr << "inserting transducer:" << std::endl << *replacement << std::endl << "between states " << start_state << " and " << end_state << "." << std::endl ;
+ //std::cerr << "inserting transducer:" << std::endl << *replacement << std::endl << "between states " << start_state << " and " << end_state << "." << std::endl ;
delete replacement;
fsm.insert_transducer(start_state, end_state, repl);
}
@@ -4295,9 +4408,16 @@ namespace xfst {
}
HfstTransducer * result = new HfstTransducer(fsm, format_);
- std::cerr << "result from compile-replace is:" << std::endl << *result << std::endl;
+ // debug
+ //std::cerr << "result from compile-replace before filtering is:" << std::endl << *result << std::endl;
+
+ // filter out regexps (todo: possible that there are regexps on opposite side)
+ HfstTransducer * cr = contains_regexp_markers_on_one_side(xre_, (level == UPPER_LEVEL) /*input side*/);
+ cr->minimize();
+
+ // debug
+ //std::cerr << "filter is:" << std::endl << *cr << std::endl;
- HfstTransducer * cr = contains_regexps(xre_);
result->subtract(*cr).minimize();
delete cr;
stack_.pop();
diff --git a/tools/src/parsers/XfstCompiler.h b/tools/src/parsers/XfstCompiler.h
index 7495231..e73e2b8 100644
--- a/tools/src/parsers/XfstCompiler.h
+++ b/tools/src/parsers/XfstCompiler.h
@@ -29,7 +29,6 @@
#endif
#include <string>
-#include <list>
#include <map>
#include <stack>
@@ -670,7 +669,7 @@ class XfstCompiler
std::map<std::string,std::string> aliases_;
std::map<std::string,std::string> variables_;
std::map<std::string,std::string> properties_;
- std::map<std::string,std::list<string> > lists_;
+ std::map<std::string,std::set<string> > lists_;
hfst::HfstTransducer* last_defined_;
hfst::ImplementationType format_;
bool verbose_;
diff --git a/tools/src/parsers/hfst-xfst.cc b/tools/src/parsers/hfst-xfst.cc
index cef05da..3b1af24 100644
--- a/tools/src/parsers/hfst-xfst.cc
+++ b/tools/src/parsers/hfst-xfst.cc
@@ -187,6 +187,7 @@ int parse_file(const char* filename, hfst::xfst::XfstCompiler &comp)
error(EXIT_FAILURE, 0, "error when reading file %s\n", filename);
return EXIT_FAILURE;
}
+
if (0 != comp.parse_line(line))
{
error(EXIT_FAILURE, 0, "error when parsing file %s\n", filename);
diff --git a/tools/src/parsers/test/Makefile.am b/tools/src/parsers/test/Makefile.am
index 46d332d..3fbbab3 100644
--- a/tools/src/parsers/test/Makefile.am
+++ b/tools/src/parsers/test/Makefile.am
@@ -21,6 +21,7 @@ EXTRA_DIST=test.sh \
reverse_net.xfst reverse_net.att \
upper_side_net.xfst upper_side_net.att \
lower_side_net.xfst lower_side_net.att \
+ one_transition_regex.xfst one_transition_regex.att \
substitute_defined.xfst substitute_defined.att \
substitute_symbol_1.xfst substitute_symbol_1.att \
substitute_symbol_2.xfst substitute_symbol_2.att \
@@ -42,6 +43,8 @@ EXTRA_DIST=test.sh \
substitute_defined_4.xfst substitute_defined_4.att \
substitute_defined_5.xfst substitute_defined_5.att \
substitute_defined_6.xfst substitute_defined_6.att \
+ merge.xfst merge.att \
+ merge_weighted.xfst merge_weighted.att \
replace_identity.xfst replace_identity.att \
quoted_literals.xfst quoted_literals.att \
define.xfst define.att \
@@ -162,5 +165,8 @@ EXTRA_DIST=test.sh \
weighted_parallel_rules_9.xfst weighted_parallel_rules_9.output \
weighted_parallel_rules_10.xfst weighted_parallel_rules_10.output \
weighted_parallel_rules_11.xfst weighted_parallel_rules_11.output \
- xerox_composition.xfst xerox_composition.output
+ xerox_composition.xfst xerox_composition.output \
+ compile_replace_1.xfst compile_replace_1.output \
+ compile_replace_2.xfst compile_replace_2.output \
+ compile_replace_3.xfst compile_replace_3.output
check_DATA=test.sh
diff --git a/tools/src/parsers/test/compile_replace_1.output b/tools/src/parsers/test/compile_replace_1.output
new file mode 100644
index 0000000..a3cabe7
--- /dev/null
+++ b/tools/src/parsers/test/compile_replace_1.output
@@ -0,0 +1,8 @@
+match
+match
+match
+match
+???
+???
+???
+???
diff --git a/tools/src/parsers/test/compile_replace_1.xfst b/tools/src/parsers/test/compile_replace_1.xfst
new file mode 100644
index 0000000..0d88fbd
--- /dev/null
+++ b/tools/src/parsers/test/compile_replace_1.xfst
@@ -0,0 +1,12 @@
+regex "^[":m a "+":t "^]":c 0:h ;
+compile-replace upper
+apply up a
+apply up aa
+apply up aaa
+apply up aaaaaaaaa
+apply up b
+apply up ab
+apply up ba
+apply up abba
+
+
diff --git a/tools/src/parsers/test/compile_replace_2.output b/tools/src/parsers/test/compile_replace_2.output
new file mode 100644
index 0000000..a3cabe7
--- /dev/null
+++ b/tools/src/parsers/test/compile_replace_2.output
@@ -0,0 +1,8 @@
+match
+match
+match
+match
+???
+???
+???
+???
diff --git a/tools/src/parsers/test/compile_replace_2.xfst b/tools/src/parsers/test/compile_replace_2.xfst
new file mode 100644
index 0000000..ee916d5
--- /dev/null
+++ b/tools/src/parsers/test/compile_replace_2.xfst
@@ -0,0 +1,10 @@
+regex m:"^[" a t:"+" c:"^]" h:0 ;
+compile-replace lower
+apply down a
+apply down aa
+apply down aaa
+apply down aaaaaaaaa
+apply down b
+apply down ab
+apply down ba
+apply down abba
diff --git a/tools/src/parsers/test/compile_replace_3.output b/tools/src/parsers/test/compile_replace_3.output
new file mode 100644
index 0000000..10dc3a2
--- /dev/null
+++ b/tools/src/parsers/test/compile_replace_3.output
@@ -0,0 +1,17 @@
+^[a+^]
+^[a+^]
+^[a+^]
+^[a+^]
+???
+???
+???
+???
+--
+^[a+^]
+^[a+^]
+^[a+^]
+^[a+^]
+???
+???
+???
+???
diff --git a/tools/src/parsers/test/compile_replace_3.xfst b/tools/src/parsers/test/compile_replace_3.xfst
new file mode 100644
index 0000000..0eb6ea5
--- /dev/null
+++ b/tools/src/parsers/test/compile_replace_3.xfst
@@ -0,0 +1,21 @@
+regex "^[" a "+" "^]" ;
+compile-replace upper
+apply up a
+apply up aa
+apply up aaa
+apply up aaaaaaaaa
+apply up b
+apply up ab
+apply up ba
+apply up abba
+echo --
+regex "^[" a "+" "^]" ;
+compile-replace lower
+apply down a
+apply down aa
+apply down aaa
+apply down aaaaaaaaa
+apply down b
+apply down ab
+apply down ba
+apply down abba
diff --git a/tools/src/parsers/test/merge.att b/tools/src/parsers/test/merge.att
new file mode 100644
index 0000000..53b30c2
--- /dev/null
+++ b/tools/src/parsers/test/merge.att
@@ -0,0 +1,39 @@
+0 1 k k
+1 2 a a
+2 3 t t
+3 4 a a
+4 5 b b
+5 6 i i
+6
+--
+0 1 k k
+1 2 a a
+2 3 t t
+3 4 a a
+4 5 b b
+5 6 i i
+6
+--
+0 1 k k
+1 2 a a
+2 3 t t
+3 4 a a
+4 5 b b
+5 6 i i
+6
+--
+0 1 k k
+1 2 a a
+2 3 t t
+3 4 a a
+4 5 b b
+5 6 i i
+6
+--
+0 1 k k
+1 2 a a
+2 3 t t
+3 4 a a
+4 5 b b
+5 6 i i
+6
diff --git a/tools/src/parsers/test/merge.xfst b/tools/src/parsers/test/merge.xfst
new file mode 100644
index 0000000..089e6f8
--- /dev/null
+++ b/tools/src/parsers/test/merge.xfst
@@ -0,0 +1,17 @@
+list V a e i o u ;
+regex [a+ i] .m>. {kVtVbV} ;
+write att
+echo --
+list C k t b ;
+regex [k t b] .m>. {CaCaCi} ;
+write att
+echo --
+regex [k t b] .m>. [ [a+ i] .m>. {CVCVCV} ] ;
+write att
+echo --
+regex [a+ i] .m>. [ [k t b] .m>. {CVCVCV} ] ;
+write att
+echo --
+regex [ {CVCVCV} .<m. [k t b] ] .<m. [a+ i] ;
+write att
+quit
diff --git a/tools/src/parsers/test/merge_weighted.att b/tools/src/parsers/test/merge_weighted.att
new file mode 100644
index 0000000..3761a51
--- /dev/null
+++ b/tools/src/parsers/test/merge_weighted.att
@@ -0,0 +1,39 @@
+0 1 k k 7.000000
+1 2 a a 0.000000
+2 3 t t 0.000000
+3 4 a a 0.000000
+4 5 b b 0.000000
+5 6 i i 0.000000
+6 0.000000
+--
+0 1 k k 11.000000
+1 2 a a 0.000000
+2 3 t t 0.000000
+3 4 a a 0.000000
+4 5 b b 0.000000
+5 6 i i 0.000000
+6 0.000000
+--
+0 1 k k 3.000000
+1 2 a a 0.000000
+2 3 t t 0.000000
+3 4 a a 0.000000
+4 5 b b 0.000000
+5 6 i i 0.000000
+6 0.000000
+--
+0 1 k k 2.000000
+1 2 a a 0.000000
+2 3 t t 0.000000
+3 4 a a 0.000000
+4 5 b b 0.000000
+5 6 i i 0.000000
+6 0.000000
+--
+0 1 k k 4.000000
+1 2 a a 0.000000
+2 3 t t 0.000000
+3 4 a a 0.000000
+4 5 b b 0.000000
+5 6 i i 0.000000
+6 0.000000
diff --git a/tools/src/parsers/test/merge_weighted.xfst b/tools/src/parsers/test/merge_weighted.xfst
new file mode 100644
index 0000000..4f2779b
--- /dev/null
+++ b/tools/src/parsers/test/merge_weighted.xfst
@@ -0,0 +1,18 @@
+set print-weight ON
+list V a e i o u ;
+regex [[a::1]+ i::5] .m>. {kVtVbV} ;
+write att
+echo --
+list C k t b ;
+regex [k::1 t::3 b::7] .m>. {CaCaCi} ;
+write att
+echo --
+regex [k t::2 b] .m>. [ [a+ i::1] .m>. {CVCVCV} ] ;
+write att
+echo --
+regex [[a::0.5]+ i] .m>. [ [k::0.3 t b::0.7] .m>. {CVCVCV} ] ;
+write att
+echo --
+regex [ {CVCVCV} .<m. [k t b]::1 ] .<m. [a+ i]::3 ;
+write att
+quit
diff --git a/tools/src/parsers/test/one_transition_regex.att b/tools/src/parsers/test/one_transition_regex.att
new file mode 100644
index 0000000..1016f8a
--- /dev/null
+++ b/tools/src/parsers/test/one_transition_regex.att
@@ -0,0 +1,10 @@
+0 1 x y
+1
+--
+0 1 x @_UNKNOWN_SYMBOL_@
+0 1 x x
+1
+--
+0 1 @_UNKNOWN_SYMBOL_@ y
+0 1 y y
+1
diff --git a/tools/src/parsers/test/one_transition_regex.xfst b/tools/src/parsers/test/one_transition_regex.xfst
new file mode 100644
index 0000000..309bd31
--- /dev/null
+++ b/tools/src/parsers/test/one_transition_regex.xfst
@@ -0,0 +1,11 @@
+define foo x ;
+define bar y ;
+regex foo:bar ;
+write att
+echo --
+regex foo:? ;
+write att
+echo --
+regex ?:bar ;
+write att
+quit
diff --git a/tools/src/parsers/test/test.sh b/tools/src/parsers/test/test.sh
index cc38eff..9f82ab7 100755
--- a/tools/src/parsers/test/test.sh
+++ b/tools/src/parsers/test/test.sh
@@ -99,7 +99,7 @@ do
substitute_defined_1 substitute_defined_2 substitute_defined_3 \
substitute_defined_4 substitute_defined_5 substitute_defined_6 \
at_re_1 at_re_2 at_re_3 at_txt at_stxt at_txt_and_stxt at_pl \
- quoted_literals replace_identity
+ quoted_literals replace_identity one_transition_regex merge
# substitute_symbol_6 fails on sfst
# angle_brackets omitted, since xfst and foma handle them differently
do
@@ -122,6 +122,31 @@ do
fi
done
+ ## The same as above but only for openfst format
+ if [ "$format" = "openfst-tropical" ]; then
+ for testfile in merge_weighted
+ do
+ rm -f result result1 result2
+ if ! (ls $testfile.xfst 2> /dev/null); then
+ echo "skipping missing test for "$testfile"..."
+ continue
+ fi
+ if ! (cat $testfile.xfst | ../hfst-xfst --pipe-mode -q -f $format > result 2> /dev/null); then
+ echo "ERROR: in compiling "$testfile".xfst"
+ exit 1;
+ fi
+ if ! (cat result | ${TXT2FST} > tmp1; cat $testfile.att | ${TXT2FST} > tmp2; ); then
+ echo "ERROR: in compiling "$testfile".att"
+ exit 1;
+ fi
+ if ! (${COMPARE} tmp1 tmp2); then
+ echo "ERROR: "$testfile" test failed"
+ exit 1;
+ fi
+ done
+ fi
+
+
## Test that testfile_fail fails.
#for testfile in define_fail
#do
@@ -142,7 +167,8 @@ do
## Test that the result of testfile.xfst (written to standard output)
## is the same as testfile.output
for testfile in print_stack print_labels print_label_tally \
- shortest_string set_variable info print_net eliminate_flag empty_context xerox_composition
+ shortest_string set_variable info print_net eliminate_flag empty_context xerox_composition \
+ compile_replace_1 compile_replace_2 compile_replace_3
do
if ! (ls $testfile.xfst 2> /dev/null); then
echo "skipping missing test for "$testfile"..."
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/hfst.git
More information about the debian-science-commits
mailing list