[irstlm] 03/146: Updated patches.
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:37:01 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to branch master
in repository irstlm.
commit 70659ce294923cba985d2d781ef927703a290b3a
Author: Giulio Paci <giuliopaci at gmail.com>
Date: Tue May 1 00:46:28 2012 +0200
Updated patches.
- Added a patch to fix commands help.
- Added a patch to import latest changes from upstream SVN.
- Removed scripts_fix.patch
---
.../patches/online_documentation_enhancments.patch | 953 +++++++++++++++++++++
debian/patches/scripts_fix.patch | 11 -
debian/patches/series | 4 +-
debian/patches/upstream_revision_487.patch | 405 +++++++++
4 files changed, 1360 insertions(+), 13 deletions(-)
diff --git a/debian/patches/online_documentation_enhancments.patch b/debian/patches/online_documentation_enhancments.patch
new file mode 100644
index 0000000..dbe4224
--- /dev/null
+++ b/debian/patches/online_documentation_enhancments.patch
@@ -0,0 +1,953 @@
+Description: enhance on-line documentation
+ This patch tries to enhance on-line documentation by providing:
+ - common help format for all the IRSTLM binaries
+ - common help option for all the IRSTLM binaries
+ - a brief description of each binary
+Author: Giulio Paci <giuliopaci at gmail.com>
+Forwarded: no
+--- a/scripts/add-start-end.sh
++++ b/scripts/add-start-end.sh
+@@ -1,5 +1,32 @@
+ #! /bin/bash
+
++function usage()
++{
++ cmnd=$(basename $0);
++ cat<<EOF
++
++$cmnd - adds sentence start/end symbols and trims words longer
++ than 80 characters
++
++USAGE:
++ $cmnd [options]
++
++OPTIONS:
++ -h Show this message
++
++EOF
++}
++
++# Parse options
++while getopts h OPT; do
++ case "$OPT" in
++ h)
++ usage >&2;
++ exit 0;
++ ;;
++ esac
++done
++
+ #adds sentence start/end symbols to standard input and
+ #trims words longer than 80 characters
+
+--- a/scripts/build-lm-qsub.sh
++++ b/scripts/build-lm-qsub.sh
+@@ -1,26 +1,29 @@
+ #! /bin/bash
+
+-usage()
++function usage()
+ {
+-cat << EOF
+-usage: $0 options
++ cmnd=$(basename $0);
++ cat<<EOF
+
+-This script estimates a language model file.
++$cmnd - estimates a language model file
++
++USAGE:
++ $cmnd [options]
+
+ OPTIONS:
+- -h Show this message
+- -i Input training file e.g. 'gunzip -c train.gz'
+- -o Output gzipped LM, e.g. lm.gz
+- -k Number of splits (default 5)
+- -n Order of language model (default 3)
+- -t Directory for temporary files (default ./stat_PID)
+- -p Prune singleton n-grams (default false)
+- -u Use uniform word frequency for dictionary splitting (default false)
+- -q parameters for qsub ("-q <queue>", and any other)
+- -s Smoothing methods: witten-bell (default), kneser-ney (approximated kneser-ney), improved-kneser-ney
+- -b Include sentence boundary n-grams (optional)
+- -d Define subdictionary for n-grams (optional)
+- -v Verbose
++ -h Show this message
++ -i Input training file e.g. 'gunzip -c train.gz'
++ -o Output gzipped LM, e.g. lm.gz
++ -k Number of splits (default 5)
++ -n Order of language model (default 3)
++ -t Directory for temporary files (default ./stat_PID)
++ -p Prune singleton n-grams (default false)
++ -u Use uniform word frequency for dictionary splitting (default false)
++ -q Parameters for qsub ("-q <queue>", and any other)
++ -s Smoothing methods: witten-bell (default), kneser-ney (approximated kneser-ney), improved-kneser-ney
++ -b Include sentence boundary n-grams (optional)
++ -d Define subdictionary for n-grams (optional)
++ -v Verbose
+
+ EOF
+ }
+@@ -69,7 +72,7 @@
+ case $OPTION in
+ h)
+ usage
+- exit 1
++ exit 0
+ ;;
+ v)
+ verbose="--verbose";
+--- a/scripts/build-lm.sh
++++ b/scripts/build-lm.sh
+@@ -2,26 +2,29 @@
+
+ set -m # Enable Job Control
+
+-usage()
++function usage()
+ {
+-cat << EOF
+-usage: $0 options
++ cmnd=$(basename $0);
++ cat<<EOF
+
+-This script estimates a language model file.
++$cmnd - estimates a language model file
++
++USAGE:
++ $cmnd [options]
+
+ OPTIONS:
+- -h Show this message
+- -i Input training file e.g. 'gunzip -c train.gz'
+- -o Output gzipped LM, e.g. lm.gz
+- -k Number of splits (default 5)
+- -n Order of language model (default 3)
+- -t Directory for temporary files (default ./stat_PID)
+- -p Prune singleton n-grams (default false)
+- -u Use uniform word frequency for dictionary splitting (default false)
+- -s Smoothing methods: witten-bell (default), kneser-ney, improved-kneser-ney
+- -b Include sentence boundary n-grams (optional)
+- -d Define subdictionary for n-grams (optional)
+- -v Verbose
++ -h Show this message
++ -i Input training file e.g. 'gunzip -c train.gz'
++ -o Output gzipped LM, e.g. lm.gz
++ -k Number of splits (default 5)
++ -n Order of language model (default 3)
++ -t Directory for temporary files (default ./stat_PID)
++ -p Prune singleton n-grams (default false)
++ -u Use uniform word frequency for dictionary splitting (default false)
++ -s Smoothing methods: witten-bell (default), kneser-ney, improved-kneser-ney
++ -b Include sentence boundary n-grams (optional)
++ -d Define subdictionary for n-grams (optional)
++ -v Verbose
+
+ EOF
+ }
+@@ -62,7 +65,7 @@
+ case $OPTION in
+ h)
+ usage
+- exit 1
++ exit 0
+ ;;
+ v)
+ verbose="--verbose";
+@@ -107,7 +110,6 @@
+ exit 4;
+ esac
+ ;;
+-
+ p)
+ prune='--prune-singletons';
+ ;;
+--- a/scripts/rm-start-end.sh
++++ b/scripts/rm-start-end.sh
+@@ -1,6 +1,30 @@
+ #! /bin/bash
+
+-#rm start-end symbols
++function usage()
++{
++ cmnd=$(basename $0);
++ cat<<EOF
++
++$cmnd - removes sentence start/end symbols
++
++USAGE:
++ $cmnd [options]
++
++OPTIONS:
++ -h Show this message
++
++EOF
++}
++
++# Parse options
++while getopts h OPT; do
++ case "$OPT" in
++ h)
++ usage >&2;
++ exit 0;
++ ;;
++ esac
++done
+
+ sed 's/<s>//g' | sed 's/<\/s>//g' | sed 's/^ *//' | sed 's/ *$//' | sed '/^$/d'
+
+--- a/scripts/split-ngt.sh
++++ b/scripts/split-ngt.sh
+@@ -1,5 +1,37 @@
+ #! /bin/bash
+
++function usage()
++{
++ cmnd=$(basename $0);
++ cat<<EOF
++
++$cmnd - creates partition files with ngram statistics in Google format
++
++USAGE:
++ $cmnd [options] <input> <output> <order> <parts>
++
++DESCRIPTION:
++ <input> Input file name
++ <output> Partition files name prefix
++ <order> Order of the ngrams
++ <parts> Number of partitions
++
++OPTIONS:
++ -h Show this message
++
++EOF
++}
++
++# Parse options
++while getopts h OPT; do
++ case "$OPT" in
++ h)
++ usage >&2;
++ exit 0;
++ ;;
++ esac
++done
++
+ #usage:
+ #ngt-split.sh <input> <output> <size> <parts>
+ #It creates <parts> files (named <output.000>, ... <output.999>)
+--- a/scripts/build-sublm.pl
++++ b/scripts/build-sublm.pl
+@@ -36,6 +36,7 @@
+
+ use strict;
+ use Getopt::Long "GetOptions";
++use File::Basename;
+
+ my $gzip=`which gzip 2> /dev/null`;
+ my $gunzip=`which gunzip 2> /dev/null`;
+@@ -62,24 +63,29 @@
+ 'improved-kneser-ney=s' => \$improved_kneser_ney,
+ 'prune-singletons' => \$prune_singletons,
+ 'cross-sentence' => \$cross_sentence,
+- 'help' => \$help,
++ 'h|help' => \$help,
+ 'verbose' => \$verbose);
+
+
+ if ($help || !$size || !$ngrams || !$sublm) {
+- print "build-sublm.pl <options>\n",
+- "--size <int> maximum n-gram size for the language model\n",
+- "--ngrams <string> input file or command to read the ngram table\n",
+- "--sublm <string> output file prefix to write the sublm statistics \n",
+- "--freq-shift <int> (optional) value to be subtracted from all frequencies\n",
+- "--witten-bell (optional) use witten bell linear smoothing (default)\n",
+- "--kneser-ney <string> (optional) use kneser-ney smoothing with statistics in <string> \n",
+- "--improved-kneser-ney <string> (optional) use improved kneser-ney smoothing with statistics in <string> \n",
+- "--good-turing (optional) use good-turing linear smoothing\n",
+- "--prune-singletons (optional) remove n-grams occurring once, for n=3,4,5,... (disabled by default)\n",
+- "--cross-sentence (optional) include cross-sentence bounds (disabled by default)\n",
+- "--verbose (optional) print debugging info\n",
+- "--help (optional) print these instructions\n";
++ my $cmnd = basename($0);
++ print "\n$cmnd - estimates single LMs\n",
++ "\nUSAGE:\n",
++ " $cmnd [options]\n",
++ "\nOPTIONS:\n",
++ " --size <int> maximum n-gram size for the language model\n",
++ " --ngrams <string> input file or command to read the ngram table\n",
++ " --sublm <string> output file prefix to write the sublm statistics \n",
++ " --freq-shift <int> (optional) value to be subtracted from all frequencies\n",
++ " --witten-bell (optional) use witten bell linear smoothing (default)\n",
++ " --kneser-ney <string> (optional) use kneser-ney smoothing with statistics in <string> \n",
++ " --improved-kneser-ney <string> (optional) use improved kneser-ney smoothing with statistics in <string> \n",
++ " --good-turing (optional) use good-turing linear smoothing\n",
++ " --prune-singletons (optional) remove n-grams occurring once, for n=3,4,5,... (disabled by default)\n",
++ " --cross-sentence (optional) include cross-sentence bounds (disabled by default)\n",
++ " --verbose (optional) print debugging info\n",
++ " -h, --help (optional) print these instructions\n",
++ "\n";
+
+ exit(1);
+ }
+--- a/scripts/merge-sublm.pl
++++ b/scripts/merge-sublm.pl
+@@ -23,21 +23,28 @@
+
+ use strict;
+ use Getopt::Long "GetOptions";
++use File::Basename;
+
+ my ($help,$lm,$size,$sublm)=();
+ $help=1 unless
+ &GetOptions('size=i' => \$size,
+ 'lm=s' => \$lm,
+ 'sublm=s' => \$sublm,
+- 'help' => \$help,);
++ 'h|help' => \$help,);
+
+
+-if ($help || !$size || !$lm || !$sublm){
+- print "merge-sublm.pl <options>\n",
+- "--size <int> maximum n-gram size for the language model\n",
+- "--sublm <string> path identifying all prefix sub LMs \n",
+- "--lm <string> name of final LM file (will be gzipped)\n",
+- "--help (optional) print these instructions\n";
++if ($help || !$size || !$lm || !$sublm) {
++ my $cmnd = basename($0);
++ print "\n$cmnd - merge single LMs\n",
++ "\nUSAGE:\n",
++ " $cmnd [options]\n",
++ "\nOPTIONS:\n",
++ " --size <int> maximum n-gram size for the language model\n",
++ " --sublm <string> path identifying all input prefix sub LMs\n",
++ " --lm <string> name of the output LM file (will be gzipped)\n",
++ " -h, --help (optional) print these instructions\n",
++ "\n";
++
+ exit(1);
+ }
+
+--- a/scripts/split-dict.pl
++++ b/scripts/split-dict.pl
+@@ -29,6 +29,7 @@
+
+ use strict;
+ use Getopt::Long "GetOptions";
++use File::Basename;
+
+ my ($help,$input,$output,$parts)=();
+
+@@ -36,18 +37,24 @@
+ &GetOptions('input=s' => \$input,
+ 'output=s' => \$output,
+ 'parts=i' => \$parts,
+- 'help' => \$help,);
++ 'h|help' => \$help,);
+
+-if ($help || !$input || !$output || !$parts){
+-
+- print "split-dict.pl <options>\n",
+- "--input <string> input dictionary with frequencies\n",
+- "--output <string> prefix of output dictionaries\n",
+- "--parts <int> number of parts to split dictionary into \n",
+- "--help (optional) print these instructions\n",
+- "Remarks: dictionary must be generated with IRSTLM command dict\n",
+- " if dictionary does not contain frequencies, then a\n",
+- " frequency 1 is assumed for all words.\n";
++if ($help || !$input || !$output || !$parts) {
++ my $cmnd = basename($0);
++ print "\n$cmnd - splits a dictionary into frequency-balanced partitions\n",
++ "\nUSAGE:\n",
++ " $cmnd [options]\n",
++ "\nDESCRIPTION:\n",
++ " $cmnd splits a dictionary into frequency-balanced partitions.\n",
++ " The dictionary must be generated with IRSTLM command dict.\n",
++ " If dictionary does not contain frequencies, then a frequency 1 is\n",
++ " assumed for all words.\n",
++ "\nOPTIONS:\n",
++ " --input <string> input dictionary with frequencies\n",
++ " --output <string> prefix of output dictionaries\n",
++ " --parts <int> number of partitions to create\n",
++ " -h, --help (optional) print these instructions\n",
++ "\n";
+
+ exit(1);
+ }
+--- a/scripts/goograms2ngrams.pl
++++ b/scripts/goograms2ngrams.pl
+@@ -44,18 +44,24 @@
+ 'startfrom=i' => \$from,
+ 'googledir=s' => \$googledir,
+ 'ngramdir=s' => \$ngramdir,
+- 'help' => \$help,
++ 'h|help' => \$help,
+ 'verbose' => \$verbose);
+
+
+-if ($help || !$maxsize || !$googledir || !$ngramdir ){
+- print "goograms2ngrams.pl <options>\n",
+- "--maxsize <int> maximum n-gram level of conversion\n",
+- "--startfrom <int> skip initial levels if already available (default 2)\n",
+- "--googledir <string> directory containing the google-grams dirs (1gms,2gms, ...)\n",
+- "--ngramdir <string> directory where to write the n-grams \n",
+- "--verbose (optional) very talktive output\n",
+- "--help (optional) print these instructions\n";
++if ($help || !$maxsize || !$googledir || !$ngramdir ) {
++ my $cmnd = "goograms2ngrams.pl";
++ print "\n$cmnd - transforms google n-grams into real n-grams so that\n",
++ " counts are consistent with respect to lower order n-grams\n",
++ "\nUSAGE:\n",
++ " $cmnd [options]\n",
++ "\nOPTIONS:\n",
++ " --maxsize <int> maximum n-gram level of conversion\n",
++ " --startfrom <int> skip initial levels if already available (default 2)\n",
++ " --googledir <string> directory containing the google-grams dirs (1gms,2gms,...)\n",
++ " --ngramdir <string> directory where to write the n-grams \n",
++ " --verbose (optional) very talktive output\n",
++ " -h, --help (optional) print these instructions\n",
++ "\n";
+
+ exit(1);
+ }
+--- a/scripts/lm-stat.pl
++++ b/scripts/lm-stat.pl
+@@ -23,19 +23,26 @@
+
+ use strict;
+ use Getopt::Long "GetOptions";
++use File::Basename;
+
+ my ($help,$lm,$txt)=();
+ $help=1 unless
+
+ &GetOptions('lm=s' => \$lm,
+ 'txt=s' => \$txt,
+- 'help' => \$help,);
++ 'h|help' => \$help,);
++
++if ($help || !$lm || !$txt) {
++ my $cmnd = basename($0);
++ print "\n$cmnd - computes LM statistics over a string\n",
++ "\nUSAGE:\n",
++ " $cmnd [options]\n",
++ "\nOPTIONS:\n",
++ " --lm <string> language model file \n",
++ " --txt <string> text file\n",
++ " -h, --help (optional) print these instructions\n",
++ "\n";
+
+-if ($help || !$lm || !$txt){
+- print "lm-stat.pl <options>\n",
+- "--lm <string> language model file \n",
+- "--txt <string> text file\n",
+- "--help print these instructions\n";
+ exit(1);
+ }
+
+--- a/scripts/ngram-split.pl
++++ b/scripts/ngram-split.pl
+@@ -24,6 +24,29 @@
+ #n-grams starting with a given word (prefix) are all
+ #contained in one file.
+
++use Getopt::Long "GetOptions";
++use File::Basename;
++
++my ($help,$lm,$size,$sublm)=();
++$help=1 unless
++&GetOptions('h|help' => \$help);
++
++if ($help) {
++ my $cmnd = basename($0);
++ print "\n$cmnd - re-segment google n-gram count files so that n-grams\n",
++ " starting with a given word (prefix) are all contained in one file\n",
++ "\nUSAGE:\n",
++ " $cmnd [options] [<output_prefix>]\n",
++ "\nDESCRIPTION:\n",
++ " Input is expected on STDIN.\n",
++ " <output_prefix> prefix of files to be created\n",
++ "\nOPTIONS:\n",
++ " -h, --help (optional) print these instructions\n",
++ "\n";
++
++ exit(1);
++}
++
+
+ $max_pref=10000; #number of prefixes to be put in one file
+ $max_ngram=5000000;#number of n-grams to be put in one file
+--- a/scripts/sort-lm.pl
++++ b/scripts/sort-lm.pl
+@@ -25,6 +25,7 @@
+
+ use strict;
+ use Getopt::Long "GetOptions";
++use File::Basename;
+
+ my ($help,$ilm,$olm,$inv)=();
+ $help=1 unless
+@@ -35,14 +36,24 @@
+ &GetOptions('ilm=s' => \$ilm,
+ 'olm=s' => \$olm,
+ 'inv' => \$inv,
+- 'help' => \$help,);
++ 'h|help' => \$help,);
++
++if ($help || !$ilm || !$olm) {
++ my $cmnd = basename($0);
++ print "\n$cmnd - sorts n-grams according to lexicographic order\n",
++ "\nUSAGE:\n",
++ " $cmnd [options]\n",
++ "\nDESCRIPTION:\n",
++ " $cmnd sorts n-grams of an ARPA file according to lexicographic order.\n",
++ " Inverted sorting option is propedeutic to building a binary\n",
++ " lmtable with compile-lm with n-grams stored in reverted order.\n",
++ "\nOPTIONS:\n",
++ " -ilm <fname> input ARPA LM filename (default STDIN) \n",
++ " -olm <fname> output ARPA LM filename (default STDOUT)\n",
++ " -inv inverted n-gram sort for compile-lm \n",
++ " -h, --help (optional) print these instructions\n",
++ "\n";
+
+-if ($help || !$ilm || !$olm){
+- print "sort-lm.pl [--ilm <fname>] [--olm <fname>] [--inv] \n",
+- "-ilm <fname> input ARPA LM filename (default /dev/stdin) \n",
+- "-olm <fname> output ARPA LM filename (default /dev/stdout)\n",
+- "-inv inverted n-gram sort for compile-lm \n",
+- "-help print these instructions\n";
+ exit(1);
+ }
+
+--- a/src/compile-lm.cpp
++++ b/src/compile-lm.cpp
+@@ -55,27 +55,29 @@
+ void usage(const char *msg = 0) {
+
+ if (msg) { std::cerr << msg << std::endl; }
+- std::cerr << "Usage: compile-lm [options] input-file.lm [output-file.blm]" << std::endl;
+- if (!msg) std::cerr << std::endl
+- << " compile-lm reads a standard LM file in ARPA format and produces" << std::endl
+- << " a compiled representation that the IRST LM toolkit can quickly" << std::endl
+- << " read and process. LM file can be compressed with gzip." << std::endl << std::endl;
+- std::cerr << "Options:\n"
+- << "--text|-t [yes|no] (output is again in text format)" << std::endl
+- << "--invert|-i [yes|no] (build an inverted n-gram binary table for fast access: default no)" << std::endl
+- << "--filter|-f wordlist (filter a binary language model with a word list)"<< std::endl
+- << "--keepunigrams|-ku [yes|no] (filter by keeping all unigrams in the table: default yes)"<< std::endl
+- << "--eval|-e text-file (computes perplexity of text-file and returns)"<< std::endl
+- << "--randcalls|-r N (computes N random calls on the eval text-file)"<< std::endl
+- << "--dub dict-size (dictionary upperbound to compute OOV word penalty: default 10^7)"<< std::endl
+- << "--score|-s [yes|no] (computes log-prob scores from standard input)"<< std::endl
+- << "--debug|-d 1 (verbose output for --eval option)"<< std::endl
+- << "--sentence [yes|no] (compute pperplexity at sentence level (identified through the end symbol)"<< std::endl
+- << "--memmap|-mm 1 (uses memory map to read a binary LM)"<< std::endl
+- << "--ngram_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
+- << "--dict_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
+- << "--level|l <value> (set the maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken)" << std::endl
+- << "--tmpdir <directory> (directory for temporary computation, default is either the environment variable TMP if defined or \"/tmp\")" << std::endl;
++ std::cerr << std::endl << "compile-lm - compiles an ARPA format LM into an IRSTLM format one" << std::endl;
++ std::cerr << std::endl << "USAGE:" << std::endl
++ << " compile-lm [options] <input-file.lm> [output-file.blm]" << std::endl;
++ if (!msg) std::cerr << std::endl << "DESCRIPTION:" << std::endl
++ << " compile-lm reads a standard LM file in ARPA format and produces" << std::endl
++ << " a compiled representation that the IRST LM toolkit can quickly" << std::endl
++ << " read and process. LM file can be compressed with gzip." << std::endl;
++ std::cerr << std::endl << "OPTIONS:" << std::endl
++ << " --text|-t [yes|no] (output is again in text format)" << std::endl
++ << " --invert|-i [yes|no] (build an inverted n-gram binary table for fast access: default no)" << std::endl
++ << " --filter|-f wordlist (filter a binary language model with a word list)"<< std::endl
++ << " --keepunigrams|-ku [yes|no] (filter by keeping all unigrams in the table: default yes)"<< std::endl
++ << " --eval|-e text-file (computes perplexity of text-file and returns)"<< std::endl
++ << " --randcalls|-r N (computes N random calls on the eval text-file)"<< std::endl
++ << " --dub dict-size (dictionary upperbound to compute OOV word penalty: default 10^7)"<< std::endl
++ << " --score|-s [yes|no] (computes log-prob scores of n-grams from standard input)"<< std::endl
++ << " --debug|-d 1 (verbose output for --eval option)"<< std::endl
++ << " --sentence [yes|no] (compute pperplexity at sentence level (identified through the end symbol)"<< std::endl
++ << " --memmap|-mm 1 (uses memory map to read a binary LM)" << std::endl
++ << " --ngram_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
++ << " --dict_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
++ << " --level|l <value> (set the maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken)" << std::endl
++ << " --tmpdir <directory> (directory for temporary computation, default is either the environment variable TMP if defined or \"/tmp\")" << std::endl;
+ }
+
+ bool starts_with(const std::string &s, const std::string &pre) {
+--- a/src/dict.cpp
++++ b/src/dict.cpp
+@@ -78,23 +78,26 @@
+
+ if (inp==NULL)
+ {
+- std::cerr << "\nUsage: \ndict -i=inputfile [options]\n";
+- std::cerr << "(inputfile can be a corpus or a dictionary)\n\n";
+- std::cerr << "Options:\n";
+- std::cerr << "-o=outputfile\n";
+- std::cerr << "-f=[yes|no] (output word frequencies, default is false)\n";
+- std::cerr << "-sort=[yes|no] (sort dictionary by frequency, default is false)\n";
+- std::cerr << "-pf=<freq> (prune words with frequency below <freq>\n";
+- std::cerr << "-pr=<rank> (prune words with frequency rank above <rank>\n";
+- std::cerr << "-is= (interruption symbol) \n";
+- std::cerr << "-c=[yes|no] (show dictionary growth curve)\n";
+- std::cerr << "-cs=curvesize (default 10)\n";
+- std::cerr << "-t=testfile (compute OOV rates on test corpus)\n";
+- std::cerr << "-LoadFactor=<value> (set the load factor for cache; it should be a positive real value; if not defined a default value is used)\n";
+- std::cerr << "-listOOV=[yes|no] (print OOV words to stderr, default is false)\n\n";
++ std::cerr << std::endl << "dict - extracts a dictionary" << std::endl;
++ std::cerr << std::endl << "USAGE:" << std::endl
++ << " dict -i=<inputfile> [options]" << std::endl;
++ std::cerr << std::endl << "DESCRIPTION:" << std::endl
++ << " dict extracts a dictionary from a corpus or a dictionary." << std::endl;
++ std::cerr << std::endl << "OPTIONS:" << std::endl;
++ std::cerr << " -o=outputfile" << std::endl;
++ std::cerr << " -f=[yes|no] (output word frequencies, default is false)" << std::endl;
++ std::cerr << " -sort=[yes|no] (sort dictionary by frequency, default is false)" << std::endl;
++ std::cerr << " -pf=<freq> (prune words with frequency below <freq>" << std::endl;
++ std::cerr << " -pr=<rank> (prune words with frequency rank above <rank>" << std::endl;
++ std::cerr << " -is= (interruption symbol)" << std::endl;
++ std::cerr << " -c=[yes|no] (show dictionary growth curve)" << std::endl;
++ std::cerr << " -cs=curvesize (default 10)" << std::endl;
++ std::cerr << " -t=testfile (compute OOV rates on test corpus)" << std::endl;
++ std::cerr << " -LoadFactor=<value> (set the load factor for cache; it should be a positive real value; if not defined a default value is used)" << std::endl;
++ std::cerr << " -listOOV=[yes|no] (print OOV words to stderr, default is false)" << std::endl << std::endl;
+
+
+- exit(1);
++ exit(1);
+ };
+
+ // options compatibility issues:
+--- a/src/interpolate-lm.cpp
++++ b/src/interpolate-lm.cpp
+@@ -51,27 +51,28 @@
+
+ void usage(const char *msg = 0) {
+ if (msg) { std::cerr << msg << std::endl; }
+- std::cerr << "Usage: interpolate-lm [options] lm-list-file [lm-list-file.out]" << std::endl;
+- if (!msg) std::cerr << std::endl
+- << " interpolate-lm reads a LM list file including interpolation weights " << std::endl
+- << " with the format: N\\n w1 lm1 \\n w2 lm2 ...\\n wN lmN\n" << std::endl
+- << " It estimates new weights on a development text, " << std::endl
+- << " computes the perplexity on an evaluation text, " << std::endl
+- << " computes probabilities of n-grams read from stdin." << std::endl
+- << " It reads LMs in ARPA and IRSTLM binary format." << std::endl << std::endl;
+-
+- std::cerr << "Options:\n"
+- << "--learn|-l text-file learn optimal interpolation for text-file"<< std::endl
+- << "--order|-o n order of n-grams used in --learn (optional)"<< std::endl
+- << "--eval|-e text-file computes perplexity on text-file"<< std::endl
+- << "--dub dict-size dictionary upperbound (default 10^7)"<< std::endl
+- << "--score|-s [yes|no] compute log-probs of n-grams from stdin"<< std::endl
+- << "--debug|-d [1-3] verbose output for --eval option (see compile-lm)"<< std::endl
+- << "--sentence [yes|no] (compute pperplexity at sentence level (identified through the end symbol)"<< std::endl
+- << "--memmap| -mm 1 use memory map to read a binary LM" << std::endl
+- << "--ngram_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
+- << "--dict_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
+- << "--level|lev <value> (set the maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken)" << std::endl;
++ std::cerr << std::endl << "interpolate-lm - interpolates language models" << std::endl;
++ std::cerr << std::endl << "USAGE:" << std::endl
++ << " interpolate-lm [options] <lm-list-file> [lm-list-file.out]" << std::endl;
++ if (!msg) std::cerr << std::endl << "DESCRIPTION:" << std::endl
++ << " interpolate-lm reads a LM list file including interpolation weights " << std::endl
++ << " with the format: N\\n w1 lm1 \\n w2 lm2 ...\\n wN lmN\n" << std::endl
++ << " It estimates new weights on a development text, " << std::endl
++ << " computes the perplexity on an evaluation text, " << std::endl
++ << " computes probabilities of n-grams read from stdin." << std::endl
++ << " It reads LMs in ARPA and IRSTLM binary format." << std::endl;
++ std::cerr << std::endl << "OPTIONS:" << std::endl
++ << " --learn|-l text-file learn optimal interpolation for text-file"<< std::endl
++ << " --order|-o n order of n-grams used in --learn (optional)"<< std::endl
++ << " --eval|-e text-file (computes perplexity of text-file and returns)"<< std::endl
++ << " --dub dict-size (dictionary upperbound to compute OOV word penalty: default 10^7)"<< std::endl
++ << " --score|-s [yes|no] (computes log-prob scores of n-grams from standard input)"<< std::endl
++ << " --debug|-d [1-3] verbose output for --eval option (see compile-lm)"<< std::endl
++ << " --sentence [yes|no] (compute pperplexity at sentence level (identified through the end symbol)"<< std::endl
++ << " --memmap|-mm 1 (uses memory map to read a binary LM)" << std::endl
++ << " --ngram_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
++ << " --dict_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
++ << " --level|lev <value> (set the maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken)" << std::endl;
+ }
+
+
+--- a/src/ngt.cpp
++++ b/src/ngt.cpp
+@@ -125,9 +125,35 @@
+ GetParams(&argc, &argv, (char*) NULL);
+
+ if (inp==NULL){
+- cerr <<"No input was specified\n";
+- exit(1);
+- };
++ std::cerr << std::endl << "ngt - collects n-grams" << std::endl;
++ std::cerr << std::endl << "USAGE:" << std::endl
++ << " ngt -i=<inputfile> [options]" << std::endl;
++ std::cerr << std::endl << "OPTIONS:" << std::endl;
++ std::cerr << " -Dictionary|-d=dictfile (dictionary filename)" << std::endl;
++ std::cerr << " -IntSymb|-is=string (interruption symbol)" << std::endl;
++ std::cerr << " -NgramSize|-n=[1-" << MAX_NGRAM << "] (n-gram default size, default: 0)" << std::endl;
++ std::cerr << " -InputFile|-i=inputfile" << std::endl;
++ std::cerr << " -OutputFile|-o=outputfile" << std::endl;
++ std::cerr << " -InputGoogleFormat|-gooinp=[yes|no] (default: no)" << std::endl;
++ std::cerr << " -OutputGoogleFormat|-gooout=[yes|no] (default: no)" << std::endl;
++ std::cerr << " -SaveBinaryTable|-b=[yes|no] (default: no)" << std::endl;
++ std::cerr << " -LmTable|-lm=[yes|no] (work with LM table, default: no)" << std::endl;
++ std::cerr << " -DistCo|-dc=int (compute distance co-occurrences, default: 0)" << std::endl;
++ std::cerr << " -AugmentFile|-aug=string (augmentation data)" << std::endl;
++ std::cerr << " -SaveSingle|-ss=[yes|no] (generate single table, default: no)" << std::endl;
++ std::cerr << " -SubDict|-sd|-ConvDict|-cd=dictfile (subdictionary)" << std::endl;
++ std::cerr << " -FilterDict|-fd=dictfile (filter dictionary)" << std::endl;
++ std::cerr << " -FilterTable|-ft=file (ngramtable filename)" << std::endl;
++ std::cerr << " -ftr|-FilterTableRate=double (minimum hit rate of filter, default: 1.0)" << std::endl;
++ std::cerr << " -HistoMask|-hm=string (history mask)" << std::endl;
++ std::cerr << " -InpLen|-il=int (input length for mask generation, default: 0)" << std::endl;
++ std::cerr << " -tlm=[yes|no] (test LM table, default: no)" << std::endl;
++ std::cerr << " -ftlm (file to test LM table)" << std::endl;
++ std::cerr << " -memuse=[yes|no] (default: no)" << std::endl;
++ std::cerr << " -iknstat=file (filename to save IKN statistics)" << std::endl;
++ std::cerr << std::endl;
++ exit(1);
++ };
+
+ if (out==NULL)
+ cerr << "Warning: no output file specified!\n";
+--- a/src/plsa.cpp
++++ b/src/plsa.cpp
+@@ -124,28 +124,55 @@
+ GetParams(&argc, &argv, (char*) NULL);
+
+ if (argc==1 || help){
+- cerr <<"plsa: IRSTLM tool for Probabilistic Latent Semantic Analysis LM inference\n\n";
+-
+- cerr <<"Usage (1): plsa -c=<collection> -d=<dictionary> -m=<model> -t=<topics> -it=<iter>\n\n";
+- cerr <<"Train a PLSA model. Parameters specify collection and dictionary filenames\n";
+- cerr <<"number of EM iterations, number of topics, and model filename. The collection\n";
+- cerr <<"must begin with the number of documents and documents should be separated\n";
+- cerr <<"with the </d> tag. The begin document tag <d> is not considered.\n";
+- cerr <<"Example:\n";
+- cerr <<"3\n";
+- cerr <<"<d> hello world ! </d>\n";
+- cerr <<"<d> good morning good afternoon </d>\n";
+- cerr <<"<d> welcome aboard </d>\n\n";
+-
+- cerr <<"Usage (2): plsa -c=<text collection> -d=<dictionary> -b=<binary collection>\n\n";
+- cerr <<"Binarize a textual document collection to speed-up training (1)\n";
+- cerr <<"\n";
+-
+- cerr <<"Usage (3): plsa -d=<dictionary> -m=<model> -t=<topics> -inf=<text> -f=<features> -it=<iterations>\n\n";
+- cerr <<"Infer a full 1-gram distribution from a model and a small text. The 1-gram\n";
+- cerr <<"is saved in the feature file. The 1-gram\n";
+- cerr <<"\n";
+- exit(1);
++ std::cerr << std::endl << "plsa - performs probabilistic latent semantic analysis LM inference" << std::endl;
++ std::cerr << std::endl << "USAGE:" << std::endl
++ << " plsa -c=<text_collection> -d=<dictionary> -m=<model> -t=<topics> -it=<iter> [options]" << std::endl
++ << " plsa -c=<text_collection> -d=<dictionary> -b=<binary_collection> [options]" << std::endl
++ << " plsa plsa -d=<dictionary> -m=<model> -t=<topics> -inf=<text> -f=<features> -it=<iterations> [options]" << std::endl;
++ std::cerr << std::endl << "DESCRIPTION:" << std::endl;
++ std::cerr << " plsa is a tool for probabilistic latent semantic analysis" << std::endl;
++ std::cerr << " LM inference. It can be used to train a PLSA model, to binarize" << std::endl;
++ std::cerr << " a textual document collection to speed-up training or to" << std::endl;
++ std::cerr << " infer a full n-gram distribution from a model and a small text." << std::endl;
++ std::cerr << std::endl << "OPTIONS:" << std::endl;
++ std::cerr << " plsa is a tool for probabilistic latent semantic analysis" << std::endl;
++ std::cerr << " -Dictionary|-d=<file> (dictionary file)" << std::endl;
++ std::cerr << " -Binary|-b=<file> (binary file)" << std::endl;
++ std::cerr << " -SplitData|-sd=<int> (size of binary file, default: unlimited)" << std::endl;
++ std::cerr << " -Collection|-c=<file> (text collection file)" << std::endl;
++ std::cerr << " -Model|-m=<file> (model file)" << std::endl;
++ std::cerr << " -HFile|-hf=<file>" << std::endl;
++ std::cerr << " -WFile|-wf=<file>" << std::endl;
++ std::cerr << " -TFile|-tf=<file>" << std::endl;
++ std::cerr << " -CombineTFile|-ct=<file>" << std::endl;
++ std::cerr << " -TxtFile|-txt=<file>" << std::endl;
++ std::cerr << " -Inference|-inf=<file>" << std::endl;
++ std::cerr << " -Features|-f=<file>" << std::endl;
++ std::cerr << " -Topics|-t=<int> (number of topics, default: 0)" << std::endl;
++ std::cerr << " -SpecialTopic|-st=<int> (special topic: first dictionary words, default: 0)" << std::endl;
++ std::cerr << " -Iterations|-it=<int> (number of EM iterations, default: 0)" << std::endl;
++ std::cerr << " -Help|-h=[yes|no]" << std::endl;
++ std::cerr << std::endl << "EXAMPLES:" << std::endl;
++ std::cerr <<" (1) plsa -c=<text_collection> -d=<dictionary> -m=<model> -t=<topics> -it=<iter>" << std::endl;
++ std::cerr <<" Train a PLSA model, <model>, from the text collection" << std::endl;
++ std::cerr <<" <text_collection> using the dictionary <dictionary>. The" << std::endl;
++ std::cerr <<" number of EM iterations is specified by <iter> and the" << std::endl;
++ std::cerr <<" number of topics is specified by <topics>." << std::endl;
++ std::cerr <<" The <text_collection> content must begin with the number of" << std::endl;
++ std::cerr <<" documents and documents should be separated with the </d> tag." << std::endl;
++ std::cerr <<" The begin document tag <d> is not considered." << std::endl;
++ std::cerr <<" Example of <text_collection> content:" << std::endl;
++ std::cerr <<" 3" << std::endl;
++ std::cerr <<" <d> hello world ! </d>" << std::endl;
++ std::cerr <<" <d> good morning good afternoon </d>" << std::endl;
++ std::cerr <<" <d> welcome aboard </d>" << std::endl;
++ std::cerr <<" (2) plsa -c=<text_collection> -d=<dictionary> -b=<binary collection>" << std::endl;
++ std::cerr <<" Binarize a textual document collection to speed-up training (1)" << std::endl;
++ std::cerr <<" (3) plsa -d=<dictionary> -m=<model> -t=<topics> -inf=<text> -f=<features> -it=<iterations>" << std::endl;
++ std::cerr <<" Infer a full 1-gram distribution from a model and a small" << std::endl;
++ std::cerr <<" text. The 1-gram is saved in the feature file." << std::endl;
++ std::cerr << std::endl;
++ exit(1);
+ }
+
+ if (!dictfile)
+--- a/src/prune-lm.cpp
++++ b/src/prune-lm.cpp
+@@ -41,16 +41,20 @@
+
+ void usage(const char *msg = 0) {
+ if (msg) { std::cerr << msg << std::endl; }
+- std::cerr << "Usage: prune-lm [--threshold=th2,th3,...] [--abs=1|0] input-file [output-file]" << std::endl << std::endl;
+- std::cerr << " prune-lm reads a LM in either ARPA or compiled format and" << std::endl;
+- std::cerr << " prunes out n-grams (n=2,3,..) for which backing-off to the" << std::endl;
+- std::cerr << " lower order n-gram results in a small difference in probability." << std::endl;
+- std::cerr << " The pruned LM is saved in ARPA format" << std::endl << std::endl;
+- std::cerr << " Options:" << std::endl;
+- std::cerr << " --threshold=th2,th3,th4,... (pruning threshods for 2-grams, 3-grams, 4-grams,..." << std::endl;
+- std::cerr << " If less thresholds are specified, the last one is " << std::endl;
+- std::cerr << " applied to all following n-gram levels. " << std::endl << std::endl;
+- std::cerr << " --abs=1|0 if 1, use absolute value of weighted difference"<< std::endl;
++ std::cerr << std::endl << "prune-lm - prunes language models" << std::endl;
++ std::cerr << std::endl << "USAGE:" << std::endl
++ << " prune-lm [options] <inputfile> [<outputfile>]" << std::endl;
++ if (!msg) std::cerr << std::endl << "DESCRIPTION:" << std::endl
++ << " prune-lm reads a LM in either ARPA or compiled format and" << std::endl
++ << " prunes out n-grams (n=2,3,..) for which backing-off to the" << std::endl
++ << " lower order n-gram results in a small difference in probability." << std::endl
++ << " The pruned LM is saved in ARPA format" << std::endl;
++ std::cerr << std::endl << "OPTIONS:" << std::endl;
++ std::cerr << " --threshold=th2,th3,th4,... (pruning thresholds for 2-grams, 3-grams, 4-grams,..." << std::endl;
++ std::cerr << " If fewer thresholds are specified, the last one is " << std::endl;
++ std::cerr << " applied to all following n-gram levels. (default: 0)" << std::endl;
++ std::cerr << " --abs=1|0 if 1, use absolute value of weighted difference (default: 0)" << std::endl;
++ std::cerr << std::endl;
+
+ }
+
+--- a/src/quantize-lm.cpp
++++ b/src/quantize-lm.cpp
+@@ -74,17 +74,25 @@
+
+ void usage(const char *msg = 0) {
+ if (msg) { std::cerr << msg << std::endl; }
+- std::cerr << "Usage: quantize-lm input-file.lm [output-file.qlm [tmpfile]] " << std::endl;
+- if (!msg) std::cerr << std::endl
+- << " quantize-lm reads a standard LM file in ARPA format and produces" << std::endl
+- << " a version of it with quantized probabilities and back-off weights"<< std::endl
+- << " that the IRST LMtoolkit can compile. Accepts LMs with .gz suffix." << std::endl
+- << " You can specify the output file to be created and also the pathname " << std::endl
+- << " of a temporary file used by the program. As default, the temporary " << std::endl
+- << " file is created in the /tmp directory. Output file can be " << std::endl
+- << " written to standard output by using the special name -. " << std::endl;
++ std::cerr << std::endl << "quantize-lm - quantizes probabilities and back-off weights" << std::endl;
++ std::cerr << std::endl << "USAGE:" << std::endl
++ << " quantize-lm <input-file.lm> [<output-file.qlm> [<tmpfile>]]" << std::endl;
++ if (!msg) std::cerr << std::endl << "DESCRIPTION:" << std::endl
++ << " quantize-lm reads a standard LM file in ARPA format and produces" << std::endl
++ << " a version of it with quantized probabilities and back-off weights"<< std::endl
++ << " that the IRST LM toolkit can compile. Accepts LMs with .gz suffix." << std::endl
++ << " You can specify the output file to be created and also the pathname" << std::endl
++ << " of a temporary file used by the program. As default, the temporary " << std::endl
++ << " file is created in the /tmp directory. Output file can be" << std::endl
++ << " written to standard output by using the special name -." << std::endl;
++ std::cerr << std::endl;
+ }
+
++void handle_option(const std::string& opt, int argc, const char **argv, int& argi)
++{
++ if (opt == "--help" || opt == "-h") { usage(); exit(1); }
++}
++
+
+ int main(int argc, const char **argv)
+ {
+@@ -95,6 +103,7 @@
+ std::vector<std::string> files;
+ for (int i=1; i < argc; i++) {
+ std::string opt = argv[i];
++ if(opt[0] == '-') handle_option(opt, argc, argv, i);
+ files.push_back(opt);
+ }
+ if (files.size() > 3) { usage("Too many arguments"); exit(1); }
+--- a/src/score-lm.cpp
++++ b/src/score-lm.cpp
+@@ -30,12 +30,16 @@
+
+
+ void usage() {
+- std::cerr << "Usage: score-lm -lm <model> [-dub <dub>] [-mm 1]\n"
+- " score sentences with a language model\n"
+- " -lm language model to use (must be specified)\n"
+- " -dub dictionary upper bound (default: 10000000)\n"
+- " -level max level to load from the language models (default: 1000, meaning the actual LM order)\n"
+- " -mm 1 memory-mapped access to lm\n";
++ std::cerr << std::endl << "score-lm - scores sentences with a language model" << std::endl;
++ std::cerr << std::endl << "USAGE:" << std::endl
++ << " score-lm -lm <model> [options]" << std::endl;
++ std::cerr << std::endl << "OPTIONS:" << std::endl;
++ std::cerr << " -lm language model to use (must be specified)" << std::endl;
++ std::cerr << " -dub dictionary upper bound (default: 10000000)" << std::endl;
++ std::cerr << " -level max level to load from the language models (default: 1000," << std::endl;
++ std::cerr << " meaning the actual LM order)" << std::endl;
++ std::cerr << " -mm 1 memory-mapped access to lm (default: 0)" << std::endl;
++ std::cerr << std::endl;
+ exit(1);
+ }
+
+--- a/src/tlm.cpp
++++ b/src/tlm.cpp
+@@ -236,8 +236,53 @@
+
+ if (!trainfile || !lmtype)
+ {
+- cerr <<"Missing parameters\n";
+- exit(1);
++ std::cerr << std::endl << "tlm - trains and tests language models" << std::endl;
++ std::cerr << std::endl << "USAGE:" << std::endl
++ << " tlm [options]" << std::endl;
++ std::cerr << std::endl << "OPTIONS:" << std::endl;
++ std::cerr << " -Back-off|-bo=[yes|no] (yes: back-off or no: interpolation, default: no)" << std::endl;
++ std::cerr << " -Dictionary|-d=<file>" << std::endl;
++ std::cerr << " -DictionaryUpperBound|-dub=<int> (default: 0)" << std::endl;
++ std::cerr << " -NgramSize|-n=[1-" << MAX_NGRAM << "] (default: 0)" << std::endl;
++ std::cerr << " -Ngram|-TrainOn|-tr=<file>" << std::endl;
++ std::cerr << " -oASR|-oasr=<file>" << std::endl;
++ std::cerr << " -o|-oARPA|-oarpa=<file>" << std::endl;
++ std::cerr << " -oBIN|-obin=<file>" << std::endl;
++ std::cerr << " -TestOn|-te=<file>" << std::endl;
++ std::cerr << " -AdaptOn|-ad=<file>" << std::endl;
++ std::cerr << " -AdaptRate|-ar=<double> (default: 1.0)" << std::endl;
++ std::cerr << " -AdaptLevel|-al=[1-" << MAX_NGRAM << "] (default: 0)" << std::endl;
++ std::cerr << " -AdaptOOV|-ao=[yes|no] (default: no)" << std::endl;
++ std::cerr << " -SaveScaleFactor|-ssf=<file>" << std::endl;
++ std::cerr << " -LanguageModelType|-lm=<LM_TYPE> (default: none)" << std::endl;
++ std::cerr << " ModifiedShiftBeta|msb" << std::endl;
++ std::cerr << " InterpShiftBeta|ShiftBeta|sb" << std::endl;
++ std::cerr << " InterpShiftOne|ShiftOne|s1" << std::endl;
++ std::cerr << " LinearWittenBell|wb" << std::endl;
++ std::cerr << " LinearGoodTuring" << std::endl;
++ std::cerr << " Mixture|mix" << std::endl;
++ std::cerr << " -Interactive|-i=<INTERACTIVE_TYPE> (default: none)" << std::endl;
++ std::cerr << " Ngram|Yes" << std::endl;
++ std::cerr << " Sequence" << std::endl;
++ std::cerr << " Adapt" << std::endl;
++ std::cerr << " Turn" << std::endl;
++ std::cerr << " Text" << std::endl;
++ std::cerr << " -Statistics|-s=[1-3] (default: 0)" << std::endl;
++ std::cerr << " -PruneThresh|-p=[1-1000] (default: 0)" << std::endl;
++ std::cerr << " -PruneSingletons|-ps=[yes|no] (default: yes)" << std::endl;
++ std::cerr << " -PruneTopSingletons|-pts=[yes|no] (default: no)" << std::endl;
++ std::cerr << " -ComputeLMSize|-sz=[yes|no] (default: 0)" << std::endl;
++ std::cerr << " -MaximumCachingLevel|-mcl=<int> (default: 0)" << std::endl;
++ std::cerr << " -MemoryMap|-memmap|-mm=[yes|no] (default: no)" << std::endl;
++ std::cerr << " -CheckProb|-cp=[yes|no] (default: no)" << std::endl;
++ std::cerr << " -OutProb|-op=<string> (default: )" << std::endl;
++ std::cerr << " -SubLMInfo|-slmi=<string>" << std::endl;
++ std::cerr << " -SaveMixParam|-smp=<file>" << std::endl;
++ std::cerr << " -LoadMixParam|-lmp=<file>" << std::endl;
++ std::cerr << " -SetOovRate|-or=<double> (default: 0)" << std::endl;
++ std::cerr << " -Beta|-beta=<double> (default: -1.0)" << std::endl;
++ std::cerr << std::endl;
++ exit(1);
+ };
+
+
diff --git a/debian/patches/scripts_fix.patch b/debian/patches/scripts_fix.patch
deleted file mode 100644
index 3c0cd3a..0000000
--- a/debian/patches/scripts_fix.patch
+++ /dev/null
@@ -1,11 +0,0 @@
-Description: fixes minor errors in scripts
-Author: Giulio Paci <giuliopaci at gmail.com>
-Forwarded: no
---- a/scripts/build-lm-qsub.sh
-+++ b/scripts/build-lm-qsub.sh
-@@ -1,4 +1,4 @@
--##! /bin/sh
-+#! /bin/sh
-
- usage()
- {
diff --git a/debian/patches/series b/debian/patches/series
index e2f4189..5291d17 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,5 +1,5 @@
+upstream_revision_487.patch
compilation_fixes.patch
spelling_fixes.patch
-
-scripts_fix.patch
+online_documentation_enhancments.patch
diff --git a/debian/patches/upstream_revision_487.patch b/debian/patches/upstream_revision_487.patch
new file mode 100644
index 0000000..bc286d8
--- /dev/null
+++ b/debian/patches/upstream_revision_487.patch
@@ -0,0 +1,405 @@
+Description: upstream changes since SVN revision 487
+Origin: upstream, https://irstlm.svn.sourceforge.net/svnroot/irstlm/trunk
+Forwarded: not-needed
+Applied-Upstream: revision 487, https://irstlm.svn.sourceforge.net/svnroot/irstlm/trunk
+--- a/scripts/add-start-end.sh
++++ b/scripts/add-start-end.sh
+@@ -1,4 +1,4 @@
+-#! /bin/sh
++#! /bin/bash
+
+ #adds sentence start/end symbols to standard input and
+ #trims words longer than 80 characters
+--- a/scripts/build-lm-qsub.sh
++++ b/scripts/build-lm-qsub.sh
+@@ -1,4 +1,4 @@
+-##! /bin/sh
++#! /bin/bash
+
+ usage()
+ {
+@@ -13,7 +13,7 @@
+ -o Output gzipped LM, e.g. lm.gz
+ -k Number of splits (default 5)
+ -n Order of language model (default 3)
+- -t Directory for temporary files (default ./stat)
++ -t Directory for temporary files (default ./stat_PID)
+ -p Prune singleton n-grams (default false)
+ -u Use uniform word frequency for dictionary splitting (default false)
+ -q parameters for qsub ("-q <queue>", and any other)
+@@ -40,6 +40,8 @@
+ #paths to scripts and commands in irstlm
+ scr=$IRSTLM/bin
+ bin=$IRSTLM/bin
++gzip=`which gzip 2> /dev/null`;
++gunzip=`which gunzip 2> /dev/null`;
+
+ #check irstlm installation
+ if [ ! -e $bin/dict -o ! -e $scr/split-dict.pl ]; then
+@@ -49,7 +51,7 @@
+
+ #default parameters
+ logfile=/dev/null
+-tmpdir=stat$$
++tmpdir=stat_$$
+ order=3
+ parts=3
+ inpfile="";
+@@ -109,7 +111,7 @@
+ ;;
+ *)
+ echo "wrong smoothing setting";
+- exiti 4;
++ exit 4;
+ esac
+ ;;
+ p)
+@@ -132,8 +134,8 @@
+ done
+
+
+-if [ $verbose ];then
+-echo inpfile=\"$inpfile\" outfile=$outfile order=$order parts=$parts tmpdir=$tmpdir prune=$prune smoothing=$smoothing dictionary=$dictionary
++if [ $verbose ]; then
++echo inpfile=\"$inpfile\" outfile=$outfile order=$order parts=$parts tmpdir=$tmpdir prune=$prune smoothing=$smoothing dictionary=$dictionary verbose=$verbose
+ fi
+
+ if [ ! "$inpfile" -o ! "$outfile" ]; then
+@@ -152,17 +154,20 @@
+ fi
+
+ #check tmpdir
++tmpdir_created=0;
+ if [ ! -d $tmpdir ]; then
+- echo "Temporary directory $tmpdir not found";
++ echo "Temporary directory $tmpdir does not exist";
+ echo "creating $tmpdir";
+ mkdir -p $tmpdir;
++ tmpdir_created=1;
+ else
+- echo "Cleaning temporary directory $tmpdir";
+- rm $tmpdir/dict* $tmpdir/ngram.dict.* $tmpdir/lm.dict.* $tmpdir/ikn.stat.dict.* 2> /dev/null
++ echo "Cleaning temporary directory $tmpdir";
++ rm -r $tmpdir 2> /dev/null
++ if [ $? != 0 ]; then
++ echo "Warning: some temporary files could not be removed"
++ fi
+ fi
+
+-
+-
+ workingdir=`pwd | perl -pe 's/\/nfsmnt//g'`
+ cd $workingdir
+
+@@ -198,13 +203,16 @@
+
+ unset getpids
+ echo "Extracting n-gram statistics for each word list"
++echo "Important: dictionary must be ordered according to order of appearance of words in data"
++echo "used to generate n-gram blocks, so that sub language model blocks results ordered too"
++
+ for sfx in ${suffix[@]} ; do
+
+ (\
+ qsub $queueparameters -b no -j yes -sync no -o $qsubout.$sfx -e $qsuberr.$sfx -N $qsubname-$sfx << EOF
+ cd $workingdir
+ echo exit status $?
+-$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="gzip -c > $tmpdir/ngram.dict.${sfx}.gz" -fd="$tmpdir/dict.${sfx}" $dictionary -iknstat="$tmpdir/ikn.stat.dict.${sfx}"
++$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.dict.${sfx}.gz" -fd="$tmpdir/dict.${sfx}" $dictionary -iknstat="$tmpdir/ikn.stat.dict.${sfx}"
+ echo exit status $?
+ echo
+ EOF
+@@ -237,7 +245,7 @@
+ cd $workingdir
+ echo exit status $?
+
+-$scr/build-sublm.pl $verbose $prune $smoothing "cat $tmpdir/ikn.stat.dict*" --size $order --ngrams "gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx}
++$scr/build-sublm.pl $verbose $prune $smoothing "cat $tmpdir/ikn.stat.dict*" --size $order --ngrams "$gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx}
+ echo exit status $?
+
+ echo
+@@ -258,7 +266,7 @@
+ cd $workingdir
+ echo exit status $?
+
+-$scr/build-sublm.pl $verbose $prune $smoothing --size $order --ngrams "gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx}
++$scr/build-sublm.pl $verbose $prune $smoothing --size $order --ngrams "$gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx}
+
+ echo
+ EOF
+@@ -292,9 +300,16 @@
+ ) 2>&1 > $qsublog
+
+ echo "Cleaning temporary directory $tmpdir";
+-rm -r $tmpdir 2> /dev/null
++rm $tmpdir/* 2> /dev/null
+ rm $qsubout* $qsuberr* $qsublog* 2> /dev/null
+
+-exit
++if [ $tmpdir_created -eq 1 ]; then
++ echo "Removing temporary directory $tmpdir";
++ rmdir $tmpdir 2> /dev/null
++ if [ $? != 0 ]; then
++ echo "Warning: the temporary directory could not be removed."
++ fi
++fi
+
++exit 0
+
+--- a/scripts/build-lm.sh
++++ b/scripts/build-lm.sh
+@@ -1,4 +1,6 @@
+-#! /bin/sh
++#! /bin/bash
++
++set -m # Enable Job Control
+
+ usage()
+ {
+@@ -13,7 +15,7 @@
+ -o Output gzipped LM, e.g. lm.gz
+ -k Number of splits (default 5)
+ -n Order of language model (default 3)
+- -t Directory for temporary files (default ./stat)
++ -t Directory for temporary files (default ./stat_PID)
+ -p Prune singleton n-grams (default false)
+ -u Use uniform word frequency for dictionary splitting (default false)
+ -s Smoothing methods: witten-bell (default), kneser-ney, improved-kneser-ney
+@@ -30,7 +32,7 @@
+ fi
+
+ #paths to scripts and commands in irstlm
+-scr=$IRSTLM/bin/
++scr=$IRSTLM/bin
+ bin=$IRSTLM/bin
+ gzip=`which gzip 2> /dev/null`;
+ gunzip=`which gunzip 2> /dev/null`;
+@@ -43,7 +45,7 @@
+
+ #default parameters
+ logfile=/dev/null
+-tmpdir=stat
++tmpdir=stat_$$
+ order=3
+ parts=3
+ inpfile="";
+@@ -143,13 +145,18 @@
+ fi
+
+ #check tmpdir
++tmpdir_created=0;
+ if [ ! -d $tmpdir ]; then
+- echo "Temporary directory $tmpdir not found";
++ echo "Temporary directory $tmpdir does not exist";
+ echo "creating $tmpdir";
+ mkdir -p $tmpdir;
++ tmpdir_created=1;
+ else
+- echo "Cleaning temporary directory $tmpdir";
+- rm $tmpdir/dict* $tmpdir/ngram.dict.* $tmpdir/lm.dict.* $tmpdir/ikn.stat.* 2> /dev/null
++ echo "Cleaning temporary directory $tmpdir";
++ rm $tmpdir/* 2> /dev/null
++ if [ $? != 0 ]; then
++ echo "Warning: some temporary files could not be removed"
++ fi
+ fi
+
+
+@@ -166,9 +173,11 @@
+ for sdict in $tmpdir/dict.*;do
+ sdict=`basename $sdict`
+ echo $sdict;
+-$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.${sdict}.gz" -fd="$tmpdir/$sdict" $dictionary -iknstat="$tmpdir/ikn.stat.$sdict" >> $logfile 2>&1
++$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.${sdict}.gz" -fd="$tmpdir/$sdict" $dictionary -iknstat="$tmpdir/ikn.stat.$sdict" >> $logfile 2>&1 &
+ done
+
++# Wait for all parallel jobs to finish
++while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done
+
+ echo "Estimating language models for each word list"
+ for sdict in `ls $tmpdir/dict.*` ; do
+@@ -176,20 +185,32 @@
+ echo $sdict;
+
+ if [ $smoothing = "--kneser-ney" -o $smoothing = "--improved-kneser-ney" ]; then
+-$scr/build-sublm.pl $verbose $prune $smoothing "cat $tmpdir/ikn.stat.dict.*" --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict >> $logfile #2>&1
++$scr/build-sublm.pl $verbose $prune $smoothing "cat $tmpdir/ikn.stat.dict.*" --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict >> $logfile 2>&1 &
+ else
+-$scr/build-sublm.pl $verbose $prune $smoothing --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict >> $logfile #2>&1
++$scr/build-sublm.pl $verbose $prune $smoothing --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict >> $logfile 2>&1 &
+ fi
+
+ done
+
++# Wait for all parallel jobs to finish
++while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done
++
+ echo "Merging language models into $outfile"
+ $scr/merge-sublm.pl --size $order --sublm $tmpdir/lm.dict -lm $outfile >> $logfile 2>&1
+
+ echo "Cleaning temporary directory $tmpdir";
+-rm $tmpdir/dict* $tmpdir/ngram.dict.* $tmpdir/lm.dict.* $tmpdir/ikn.stat.dict.* 2> /dev/null
++rm $tmpdir/* 2> /dev/null
++
++if [ $tmpdir_created -eq 1 ]; then
++ echo "Removing temporary directory $tmpdir";
++ rmdir $tmpdir 2> /dev/null
++ if [ $? != 0 ]; then
++ echo "Warning: the temporary directory could not be removed."
++ fi
++fi
++
++exit 0
++
++
+
+-echo "Removing temporary directory $tmpdir";
+-rmdir $tmpdir 2> /dev/null
+
+-exit
+--- a/scripts/rm-start-end.sh
++++ b/scripts/rm-start-end.sh
+@@ -1,6 +1,6 @@
+-#! /bin/sh
++#! /bin/bash
+
+-#rm star-end symbols
++#rm start-end symbols
+
+ sed 's/<s>//g' | sed 's/<\/s>//g' | sed 's/^ *//' | sed 's/ *$//' | sed '/^$/d'
+
+--- a/scripts/split-ngt.sh
++++ b/scripts/split-ngt.sh
+@@ -1,4 +1,4 @@
+-#! /bin/sh
++#! /bin/bash
+
+ #usage:
+ #ngt-split.sh <input> <output> <size> <parts>
+--- a/src/lmtable.h
++++ b/src/lmtable.h
+@@ -44,7 +44,7 @@
+ #define MIN(a,b) (((a)<(b))?(a):(b))
+
+ #define LMTMAXLEV 20
+-#define MAX_LINE 1024
++#define MAX_LINE 100000
+
+ #ifndef LMTCODESIZE
+ #define LMTCODESIZE (int)3
+--- a/src/plsa.cpp
++++ b/src/plsa.cpp
+@@ -54,6 +54,7 @@
+ char *featurefile=NULL;
+ char *basefile=NULL;
+ char *hfile=NULL;
++ char *tmphfile=NULL;
+ char *tfile=NULL;
+ char *wfile=NULL;
+ char *ctfile=NULL;
+@@ -83,8 +84,8 @@
+ "Model", CMDSTRINGTYPE, &basefile,
+ "m", CMDSTRINGTYPE, &basefile,
+
+- "HFile", CMDSTRINGTYPE, &hfile,
+- "hf", CMDSTRINGTYPE, &hfile,
++ "HFile", CMDSTRINGTYPE, &tmphfile,
++ "hf", CMDSTRINGTYPE, &tmphfile,
+
+ "WFile", CMDSTRINGTYPE, &wfile,
+ "wf", CMDSTRINGTYPE, &wfile,
+@@ -119,6 +120,7 @@
+ (char *)NULL
+ );
+
++
+ GetParams(&argc, &argv, (char*) NULL);
+
+ if (argc==1 || help){
+@@ -170,9 +172,14 @@
+ exit(1);
+ }
+
+- if (!hfile){
++ if (!tmphfile){
+ //set default value
+- strcpy(hfile,"hfff");
++ hfile=new char[4+1];
++ strcpy(hfile,"hfff");
++ }else{
++ //set the value of the parameter
++ hfile=new char[strlen(tmphfile)+1];
++ strcpy(hfile,tmphfile);
+ }
+
+ dictionary dict(dictfile);
+@@ -219,6 +226,7 @@
+ tc.train(adafile,it,.0);
+ }
+ if (strcmp(hfile,"hfff")==0) system("rm -f hfff");
++ delete hfile;
+
+ exit(1);
+ }
+--- a/src/quantize-lm.cpp
++++ b/src/quantize-lm.cpp
+@@ -31,7 +31,7 @@
+ #include "math.h"
+ #include "util.h"
+
+-#define MAX_LINE 1024
++#define MAX_LINE 100000
+
+ //----------------------------------------------------------------------
+ // Special type and global variable for the BIN CLUSTERING algorithm
+--- a/src/util.h
++++ b/src/util.h
+@@ -27,7 +27,7 @@
+
+
+ #define LMTMAXLEV 20
+-#define MAX_LINE 1024
++#define MAX_LINE 100000
+
+ std::string gettempfolder();
+ void createtempfile(std::ofstream &fileStream, std::string &filePath, std::ios_base::openmode flags);
+--- a/config.h.in
++++ b/config.h.in
+@@ -33,6 +33,10 @@
+ /* Define to 1 if you have the <unistd.h> header file. */
+ #undef HAVE_UNISTD_H
+
++/* Define to the sub-directory in which libtool stores uninstalled libraries.
++ */
++#undef LT_OBJDIR
++
+ /* Name of package */
+ #undef PACKAGE
+
+@@ -48,6 +52,9 @@
+ /* Define to the one symbol short name of this package. */
+ #undef PACKAGE_TARNAME
+
++/* Define to the home page for this package. */
++#undef PACKAGE_URL
++
+ /* Define to the version of this package. */
+ #undef PACKAGE_VERSION
+
+--- a/regenerate-makefiles.sh
++++ b/regenerate-makefiles.sh
+@@ -1,4 +1,4 @@
+-#!/bin/bash
++#!/bin/sh
+
+ # NOTE:
+ # Versions 1.9 (or higher) of aclocal and automake are required.
+@@ -14,7 +14,7 @@
+ force=$1;
+ # set parameter force to the value "--force" if you want to recreate all links to the autotools
+
+-function die () {
++die () {
+ echo "$@" >&2
+ exit 1
+ }
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list