[irstlm] 03/146: Updated patches.

Giulio Paci giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:37:01 UTC 2016


This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to branch master
in repository irstlm.

commit 70659ce294923cba985d2d781ef927703a290b3a
Author: Giulio Paci <giuliopaci at gmail.com>
Date:   Tue May 1 00:46:28 2012 +0200

    Updated patches.
    
    - Added a patch to fix the commands' help output.
    - Added a patch to import the latest changes from upstream SVN.
    - Removed scripts_fix.patch.
---
 .../patches/online_documentation_enhancments.patch | 953 +++++++++++++++++++++
 debian/patches/scripts_fix.patch                   |  11 -
 debian/patches/series                              |   4 +-
 debian/patches/upstream_revision_487.patch         | 405 +++++++++
 4 files changed, 1360 insertions(+), 13 deletions(-)

diff --git a/debian/patches/online_documentation_enhancments.patch b/debian/patches/online_documentation_enhancments.patch
new file mode 100644
index 0000000..dbe4224
--- /dev/null
+++ b/debian/patches/online_documentation_enhancments.patch
@@ -0,0 +1,953 @@
+Description: enhance on-line documentation
+ This patch tries to enhance on-line documentation by providing:
+  - common help format for all the IRSTLM binaries
+  - common help option for all the IRSTLM binaries
+  - a brief description of each binary
+Author: Giulio Paci <giuliopaci at gmail.com>
+Forwarded: no
+--- a/scripts/add-start-end.sh
++++ b/scripts/add-start-end.sh
+@@ -1,5 +1,32 @@
+ #! /bin/bash
+ 
++function usage()
++{
++    cmnd=$(basename $0);
++    cat<<EOF
++
++$cmnd - adds sentence start/end symbols and trims words longer
++       than 80 characters
++
++USAGE:
++       $cmnd [options]
++
++OPTIONS:
++       -h        Show this message
++
++EOF
++}
++
++# Parse options
++while getopts h OPT; do
++    case "$OPT" in
++        h)
++            usage >&2;
++            exit 0;
++            ;;
++    esac
++done
++
+ #adds sentence start/end symbols to standard input and 
+ #trims words longer than 80 characters
+ 
+--- a/scripts/build-lm-qsub.sh
++++ b/scripts/build-lm-qsub.sh
+@@ -1,26 +1,29 @@
+ #! /bin/bash
+ 
+-usage()
++function usage()
+ {
+-cat << EOF
+-usage: $0 options
++    cmnd=$(basename $0);
++    cat<<EOF
+ 
+-This script estimates a language model file. 
++$cmnd - estimates a language model file
++
++USAGE:
++       $cmnd [options]
+ 
+ OPTIONS:
+-   -h      Show this message
+-   -i      Input training file e.g. 'gunzip -c train.gz'
+-   -o      Output gzipped LM, e.g. lm.gz
+-   -k      Number of splits (default 5)
+-   -n      Order of language model (default 3)
+-   -t      Directory for temporary files (default ./stat_PID)
+-   -p      Prune singleton n-grams (default false)
+-   -u      Use uniform word frequency for dictionary splitting (default false)
+-   -q      parameters for qsub ("-q <queue>", and any other)
+-   -s      Smoothing methods: witten-bell (default), kneser-ney (approximated kneser-ney), improved-kneser-ney
+-   -b      Include sentence boundary n-grams (optional)
+-   -d      Define subdictionary for n-grams (optional)
+-   -v      Verbose
++       -h        Show this message
++       -i        Input training file e.g. 'gunzip -c train.gz'
++       -o        Output gzipped LM, e.g. lm.gz
++       -k        Number of splits (default 5)
++       -n        Order of language model (default 3)
++       -t        Directory for temporary files (default ./stat_PID)
++       -p        Prune singleton n-grams (default false)
++       -u        Use uniform word frequency for dictionary splitting (default false)
++       -q        Parameters for qsub ("-q <queue>", and any other)
++       -s        Smoothing methods: witten-bell (default), kneser-ney (approximated kneser-ney), improved-kneser-ney
++       -b        Include sentence boundary n-grams (optional)
++       -d        Define subdictionary for n-grams (optional)
++       -v        Verbose
+ 
+ EOF
+ }
+@@ -69,7 +72,7 @@
+      case $OPTION in
+          h)
+              usage
+-             exit 1
++             exit 0
+              ;;
+          v)
+              verbose="--verbose";
+--- a/scripts/build-lm.sh
++++ b/scripts/build-lm.sh
+@@ -2,26 +2,29 @@
+ 
+ set -m # Enable Job Control
+ 
+-usage()
++function usage()
+ {
+-cat << EOF
+-usage: $0 options
++    cmnd=$(basename $0);
++    cat<<EOF
+ 
+-This script estimates a language model file. 
++$cmnd - estimates a language model file
++
++USAGE:
++       $cmnd [options]
+ 
+ OPTIONS:
+-   -h      Show this message
+-   -i      Input training file e.g. 'gunzip -c train.gz'
+-   -o      Output gzipped LM, e.g. lm.gz
+-   -k      Number of splits (default 5)
+-   -n      Order of language model (default 3)
+-   -t      Directory for temporary files (default ./stat_PID)
+-   -p      Prune singleton n-grams (default false)
+-   -u      Use uniform word frequency for dictionary splitting (default false)
+-   -s      Smoothing methods: witten-bell (default), kneser-ney, improved-kneser-ney
+-   -b      Include sentence boundary n-grams (optional)
+-   -d      Define subdictionary for n-grams (optional)
+-   -v      Verbose
++       -h        Show this message
++       -i        Input training file e.g. 'gunzip -c train.gz'
++       -o        Output gzipped LM, e.g. lm.gz
++       -k        Number of splits (default 5)
++       -n        Order of language model (default 3)
++       -t        Directory for temporary files (default ./stat_PID)
++       -p        Prune singleton n-grams (default false)
++       -u        Use uniform word frequency for dictionary splitting (default false)
++       -s        Smoothing methods: witten-bell (default), kneser-ney, improved-kneser-ney
++       -b        Include sentence boundary n-grams (optional)
++       -d        Define subdictionary for n-grams (optional)
++       -v        Verbose
+ 
+ EOF
+ }
+@@ -62,7 +65,7 @@
+      case $OPTION in
+          h)
+              usage
+-             exit 1
++             exit 0
+              ;;
+          v)
+              verbose="--verbose";
+@@ -107,7 +110,6 @@
+ 		 exit 4;
+ 	     esac
+              ;;
+-  
+          p)
+              prune='--prune-singletons';
+              ;;
+--- a/scripts/rm-start-end.sh
++++ b/scripts/rm-start-end.sh
+@@ -1,6 +1,30 @@
+ #! /bin/bash
+ 
+-#rm start-end symbols
++function usage()
++{
++    cmnd=$(basename $0);
++    cat<<EOF
++
++$cmnd - removes sentence start/end symbols
++
++USAGE:
++       $cmnd [options]
++
++OPTIONS:
++       -h        Show this message
++
++EOF
++}
++
++# Parse options
++while getopts h OPT; do
++    case "$OPT" in
++        h)
++            usage >&2;
++            exit 0;
++            ;;
++    esac
++done
+ 
+ sed 's/<s>//g' | sed 's/<\/s>//g' | sed 's/^ *//' | sed 's/ *$//' | sed '/^$/d'
+ 
+--- a/scripts/split-ngt.sh
++++ b/scripts/split-ngt.sh
+@@ -1,5 +1,37 @@
+ #! /bin/bash
+ 
++function usage()
++{
++    cmnd=$(basename $0);
++    cat<<EOF
++
++$cmnd - creates partition files with ngram statistics in Google format
++
++USAGE:
++       $cmnd [options] <input> <output> <order> <parts>
++
++DESCRIPTION:
++       <input>   Input file name
++       <output>  Partition files name prefix
++       <order>   Order of the ngrams
++       <parts>   Number of partitions
++
++OPTIONS:
++       -h        Show this message
++
++EOF
++}
++
++# Parse options
++while getopts h OPT; do
++    case "$OPT" in
++        h)
++            usage >&2;
++            exit 0;
++            ;;
++    esac
++done
++
+ #usage:
+ #ngt-split.sh <input> <output> <size> <parts>
+ #It creates <parts> files (named <output.000>, ... <output.999>)
+--- a/scripts/build-sublm.pl
++++ b/scripts/build-sublm.pl
+@@ -36,6 +36,7 @@
+ 
+ use strict;
+ use Getopt::Long "GetOptions";
++use File::Basename;
+ 
+ my $gzip=`which gzip 2> /dev/null`;
+ my $gunzip=`which gunzip 2> /dev/null`;
+@@ -62,24 +63,29 @@
+ 	      'improved-kneser-ney=s' => \$improved_kneser_ney,
+ 	      'prune-singletons' => \$prune_singletons,
+ 	      'cross-sentence' => \$cross_sentence,
+-	      'help' => \$help,
++	      'h|help' => \$help,
+ 	      'verbose' => \$verbose);
+ 
+ 
+ if ($help || !$size || !$ngrams || !$sublm) {
+-  print "build-sublm.pl <options>\n",
+-    "--size <int>          maximum n-gram size for the language model\n",
+-    "--ngrams <string>     input file or command to read the ngram table\n",
+-    "--sublm <string>      output file prefix to write the sublm statistics \n",
+-    "--freq-shift <int>    (optional) value to be subtracted from all frequencies\n",
+-    "--witten-bell        (optional) use witten bell linear smoothing (default)\n",
+-    "--kneser-ney <string> (optional) use kneser-ney smoothing with statistics in <string> \n",
+-    "--improved-kneser-ney <string> (optional) use improved kneser-ney smoothing with statistics in <string> \n",
+-    "--good-turing        (optional) use good-turing linear smoothing\n",
+-    "--prune-singletons   (optional) remove n-grams occurring once, for n=3,4,5,... (disabled by default)\n",
+-    "--cross-sentence     (optional) include cross-sentence bounds (disabled by default)\n",
+-    "--verbose            (optional) print debugging info\n",
+-    "--help               (optional) print these instructions\n";    
++	my $cmnd = basename($0);
++  print "\n$cmnd - estimates single LMs\n",
++	"\nUSAGE:\n",
++	"       $cmnd [options]\n",
++	"\nOPTIONS:\n",
++    "       --size <int>          maximum n-gram size for the language model\n",
++    "       --ngrams <string>     input file or command to read the ngram table\n",
++    "       --sublm <string>      output file prefix to write the sublm statistics \n",
++    "       --freq-shift <int>    (optional) value to be subtracted from all frequencies\n",
++    "       --witten-bell         (optional) use witten bell linear smoothing (default)\n",
++    "       --kneser-ney <string> (optional) use kneser-ney smoothing with statistics in <string> \n",
++    "       --improved-kneser-ney <string> (optional) use improved kneser-ney smoothing with statistics in <string> \n",
++    "       --good-turing         (optional) use good-turing linear smoothing\n",
++    "       --prune-singletons    (optional) remove n-grams occurring once, for n=3,4,5,... (disabled by default)\n",
++    "       --cross-sentence      (optional) include cross-sentence bounds (disabled by default)\n",
++    "       --verbose             (optional) print debugging info\n",
++    "       -h, --help            (optional) print these instructions\n",
++    "\n";
+ 
+   exit(1);
+ }
+--- a/scripts/merge-sublm.pl
++++ b/scripts/merge-sublm.pl
+@@ -23,21 +23,28 @@
+ 
+ use strict;
+ use Getopt::Long "GetOptions";
++use File::Basename;
+ 
+ my ($help,$lm,$size,$sublm)=();
+ $help=1 unless
+ &GetOptions('size=i' => \$size,
+             'lm=s' => \$lm,
+             'sublm=s' => \$sublm,
+-            'help' => \$help,);
++            'h|help' => \$help,);
+ 
+ 
+-if ($help || !$size || !$lm || !$sublm){
+-  print "merge-sublm.pl <options>\n",
+-  "--size <int>        maximum n-gram size for the language model\n",
+-  "--sublm <string>    path identifying  all prefix sub LMs \n",
+-  "--lm <string>       name of final LM file (will be gzipped)\n",
+-  "--help              (optional) print these instructions\n";    
++if ($help || !$size || !$lm || !$sublm) {
++	my $cmnd = basename($0);
++  print "\n$cmnd - merge single LMs\n",
++	"\nUSAGE:\n",
++	"       $cmnd [options]\n",
++	"\nOPTIONS:\n",
++    "       --size <int>          maximum n-gram size for the language model\n",
++    "       --sublm <string>      path identifying all input prefix sub LMs\n",
++    "       --lm <string>         name of the output LM file (will be gzipped)\n",
++    "       -h, --help            (optional) print these instructions\n",
++    "\n";
++
+   exit(1);
+ }
+ 
+--- a/scripts/split-dict.pl
++++ b/scripts/split-dict.pl
+@@ -29,6 +29,7 @@
+ 
+ use strict;
+ use Getopt::Long "GetOptions";
++use File::Basename;
+ 
+ my ($help,$input,$output,$parts)=();
+ 
+@@ -36,18 +37,24 @@
+ &GetOptions('input=s' => \$input,
+             'output=s' => \$output, 
+              'parts=i' => \$parts,           
+-             'help' => \$help,);
++             'h|help' => \$help,);
+ 
+-if ($help || !$input || !$output || !$parts){
+-
+-  print "split-dict.pl <options>\n",
+-        "--input <string>    input dictionary with frequencies\n",
+-        "--output <string>   prefix of output dictionaries\n",
+-        "--parts <int>       number of parts to split dictionary into \n",
+-        "--help              (optional) print these instructions\n",
+-        "Remarks: dictionary must be generated with IRSTLM command dict\n",
+-        "         if dictionary does not contain frequencies, then a\n",
+-        "         frequency 1 is assumed for all words.\n";
++if ($help || !$input || !$output || !$parts) {
++	my $cmnd = basename($0);
++  print "\n$cmnd - splits a dictionary into frequency-balanced partitions\n",
++	"\nUSAGE:\n",
++	"       $cmnd [options]\n",
++	"\nDESCRIPTION:\n",
++	"       $cmnd splits a dictionary into frequency-balanced partitions.\n",
++	"       The dictionary must be generated with IRSTLM command dict.\n",
++	"       If the dictionary does not contain frequencies, a frequency of 1 is\n",
++	"       assumed for all words.\n",
++	"\nOPTIONS:\n",
++    "       --input <string>      input dictionary with frequencies\n",
++    "       --output <string>     prefix of output dictionaries\n",
++    "       --parts <int>         number of partitions to create\n",
++    "       -h, --help            (optional) print these instructions\n",
++    "\n";
+ 
+   exit(1);
+ }
+--- a/scripts/goograms2ngrams.pl
++++ b/scripts/goograms2ngrams.pl
+@@ -44,18 +44,24 @@
+ 			'startfrom=i' => \$from,
+ 			'googledir=s' => \$googledir,
+ 			'ngramdir=s' => \$ngramdir,
+-			'help' => \$help,
++			'h|help' => \$help,
+ 			'verbose' => \$verbose);
+ 
+ 
+-if ($help || !$maxsize || !$googledir || !$ngramdir ){
+-  print "goograms2ngrams.pl <options>\n",
+-        "--maxsize <int>       maximum n-gram level of conversion\n",
+-        "--startfrom <int>     skip initial levels if already available (default 2)\n",
+-        "--googledir <string>  directory containing the google-grams dirs (1gms,2gms, ...)\n",
+-        "--ngramdir <string>   directory where to write the n-grams \n",
+-        "--verbose            (optional) very talktive output\n",
+-        "--help               (optional) print these instructions\n";    
++if ($help || !$maxsize || !$googledir || !$ngramdir ) {
++	my $cmnd = "goograms2ngrams.pl";
++  print "\n$cmnd - transforms google n-grams into real n-grams so that\n",
++	"       counts are consistent with respect to lower order n-grams\n",
++	"\nUSAGE:\n",
++	"       $cmnd [options]\n",
++	"\nOPTIONS:\n",
++    "       --maxsize <int>       maximum n-gram level of conversion\n",
++    "       --startfrom <int>     skip initial levels if already available (default 2)\n",
++    "       --googledir <string>  directory containing the google-grams dirs (1gms,2gms,...)\n",
++    "       --ngramdir <string>   directory where to write the n-grams \n",
++    "       --verbose             (optional) very talktive output\n",
++    "       -h, --help            (optional) print these instructions\n",
++    "\n";
+ 
+   exit(1);
+ }
+--- a/scripts/lm-stat.pl
++++ b/scripts/lm-stat.pl
+@@ -23,19 +23,26 @@
+ 
+ use strict;
+ use Getopt::Long "GetOptions";
++use File::Basename;
+ 
+ my ($help,$lm,$txt)=();
+ $help=1 unless
+ 
+ &GetOptions('lm=s' => \$lm,
+             'txt=s' => \$txt,
+-            'help' => \$help,);
++            'h|help' => \$help,);
++
++if ($help || !$lm || !$txt) {
++	my $cmnd = basename($0);
++  print "\n$cmnd - computes LM statistics over a string\n",
++	"\nUSAGE:\n",
++	"       $cmnd [options]\n",
++	"\nOPTIONS:\n",
++    "       --lm  <string>        language model file \n",
++    "       --txt <string>        text file\n",
++    "       -h, --help            (optional) print these instructions\n",
++    "\n";
+ 
+-if ($help || !$lm || !$txt){
+-  print "lm-stat.pl <options>\n",
+-  "--lm  <string>    language model file \n",
+-  "--txt <string>    text file\n",
+-  "--help            print these instructions\n";    
+   exit(1);
+ }
+ 
+--- a/scripts/ngram-split.pl
++++ b/scripts/ngram-split.pl
+@@ -24,6 +24,29 @@
+ #n-grams starting with a given word (prefix) are all 
+ #contained in one file.
+ 
++use Getopt::Long "GetOptions";
++use File::Basename;
++
++my ($help,$lm,$size,$sublm)=();
++$help=1 unless
++&GetOptions('h|help' => \$help);
++
++if ($help) {
++	my $cmnd = basename($0);
++  print "\n$cmnd - re-segment google n-gram count files so that n-grams\n",
++    "       starting with a given word (prefix) are all contained in one file\n",
++	"\nUSAGE:\n",
++	"       $cmnd [options] [<output_prefix>]\n",
++	"\nDESCRIPTION:\n",
++	"       Input is expected on STDIN.\n",
++	"       <output_prefix>       prefix of files to be created\n",
++	"\nOPTIONS:\n",
++    "       -h, --help            (optional) print these instructions\n",
++    "\n";
++
++  exit(1);
++}
++
+ 
+ $max_pref=10000;   #number of prefixes to be put in one file 
+ $max_ngram=5000000;#number of n-grams to be put in one file
+--- a/scripts/sort-lm.pl
++++ b/scripts/sort-lm.pl
+@@ -25,6 +25,7 @@
+ 
+ use strict;
+ use Getopt::Long "GetOptions";
++use File::Basename;
+ 
+ my ($help,$ilm,$olm,$inv)=();
+ $help=1 unless
+@@ -35,14 +36,24 @@
+ &GetOptions('ilm=s' => \$ilm,
+ 			'olm=s' => \$olm,
+             'inv' => \$inv,
+-            'help' => \$help,);
++            'h|help' => \$help,);
++
++if ($help || !$ilm || !$olm) {
++	my $cmnd = basename($0);
++  print "\n$cmnd - sorts n-grams according to lexicographic order\n",
++	"\nUSAGE:\n",
++	"       $cmnd [options]\n",
++	"\nDESCRIPTION:\n",
++	"       $cmnd sorts n-grams of an ARPA file according to lexicographic order.\n",
++	"       Inverted sorting option is propedeutic to building a binary\n",
++	"       lmtable with compile-lm with n-grams stored in reverted order.\n",
++	"\nOPTIONS:\n",
++    "       -ilm  <fname>         input ARPA LM filename (default STDIN) \n",
++    "       -olm <fname>          output ARPA LM filename (default STDOUT)\n",
++    "       -inv                  inverted n-gram sort for compile-lm \n",
++    "       -h, --help            (optional) print these instructions\n",
++    "\n";
+ 
+-if ($help || !$ilm || !$olm){
+-  print "sort-lm.pl [--ilm <fname>]  [--olm <fname>] [--inv]   \n",
+-  "-ilm  <fname>   input ARPA LM filename (default /dev/stdin) \n",
+-  "-olm <fname>    output ARPA LM filename (default /dev/stdout)\n",
+-  "-inv            inverted n-gram sort for compile-lm \n",
+-  "-help           print these instructions\n";    
+   exit(1);
+ }
+ 
+--- a/src/compile-lm.cpp
++++ b/src/compile-lm.cpp
+@@ -55,27 +55,29 @@
+ void usage(const char *msg = 0) {
+ 
+   if (msg) { std::cerr << msg << std::endl; }
+-  std::cerr << "Usage: compile-lm [options] input-file.lm [output-file.blm]" << std::endl;
+-  if (!msg) std::cerr << std::endl
+-		      << "  compile-lm reads a standard LM file in ARPA format and produces" << std::endl
+-		      << "  a compiled representation that the IRST LM toolkit can quickly" << std::endl
+-		      << "  read and process. LM file can be compressed with gzip." << std::endl << std::endl;
+-  std::cerr << "Options:\n"
+-	    << "--text|-t [yes|no]  (output is again in text format)" << std::endl
+-	    << "--invert|-i [yes|no]  (build an inverted n-gram binary table for fast access: default no)" << std::endl
+-	    << "--filter|-f wordlist (filter a binary language model with a word list)"<< std::endl
+-	    << "--keepunigrams|-ku [yes|no] (filter by keeping all unigrams in the table: default yes)"<< std::endl
+-	    << "--eval|-e text-file (computes perplexity of text-file and returns)"<< std::endl
+-	    << "--randcalls|-r N (computes N random calls on the eval text-file)"<< std::endl
+-	    << "--dub dict-size (dictionary upperbound to compute OOV word penalty: default 10^7)"<< std::endl
+-	    << "--score|-s [yes|no]  (computes log-prob scores from standard input)"<< std::endl
+-	    << "--debug|-d 1 (verbose output for --eval option)"<< std::endl
+-	    << "--sentence [yes|no] (compute pperplexity at sentence level (identified through the end symbol)"<< std::endl
+-	    << "--memmap|-mm 1 (uses memory map to read a binary LM)"<< std::endl
+-	    << "--ngram_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
+-	    << "--dict_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
+-	    << "--level|l <value> (set the maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken)" << std::endl
+-	    << "--tmpdir <directory> (directory for temporary computation, default is either the environment variable TMP if defined or \"/tmp\")" << std::endl;
++  std::cerr << std::endl << "compile-lm - compiles an ARPA format LM into an IRSTLM format one" << std::endl;
++  std::cerr << std::endl << "USAGE:"  << std::endl
++	    << "       compile-lm [options] <input-file.lm> [output-file.blm]" << std::endl;
++  if (!msg) std::cerr << std::endl << "DESCRIPTION:" << std::endl
++		      << "       compile-lm reads a standard LM file in ARPA format and produces" << std::endl
++		      << "       a compiled representation that the IRST LM toolkit can quickly" << std::endl
++		      << "       read and process. LM file can be compressed with gzip." << std::endl;
++  std::cerr << std::endl << "OPTIONS:" << std::endl
++	    << "       --text|-t [yes|no]  (output is again in text format)" << std::endl
++	    << "       --invert|-i [yes|no]  (build an inverted n-gram binary table for fast access: default no)" << std::endl
++	    << "       --filter|-f wordlist (filter a binary language model with a word list)"<< std::endl
++	    << "       --keepunigrams|-ku [yes|no] (filter by keeping all unigrams in the table: default yes)"<< std::endl
++	    << "       --eval|-e text-file (computes perplexity of text-file and returns)"<< std::endl
++	    << "       --randcalls|-r N (computes N random calls on the eval text-file)"<< std::endl
++	    << "       --dub dict-size (dictionary upperbound to compute OOV word penalty: default 10^7)"<< std::endl
++	    << "       --score|-s [yes|no]  (computes log-prob scores of n-grams from standard input)"<< std::endl
++	    << "       --debug|-d 1 (verbose output for --eval option)"<< std::endl
++	    << "       --sentence [yes|no] (compute perplexity at sentence level, identified through the end symbol)"<< std::endl
++	    << "       --memmap|-mm 1 (uses memory map to read a binary LM)" << std::endl
++	    << "       --ngram_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
++	    << "       --dict_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
++	    << "       --level|l <value> (set the maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken)" << std::endl
++	    << "       --tmpdir <directory> (directory for temporary computation, default is either the environment variable TMP if defined or \"/tmp\")" << std::endl;
+ }
+ 
+ bool starts_with(const std::string &s, const std::string &pre) {
+--- a/src/dict.cpp
++++ b/src/dict.cpp
+@@ -78,23 +78,26 @@
+ 	
+ 	if (inp==NULL)
+     {
+-		std::cerr << "\nUsage: \ndict -i=inputfile [options]\n";
+-		std::cerr << "(inputfile can be a corpus or a dictionary)\n\n";
+-		std::cerr << "Options:\n";
+-		std::cerr << "-o=outputfile\n";
+-		std::cerr << "-f=[yes|no] (output word frequencies, default is false)\n";
+-		std::cerr << "-sort=[yes|no] (sort dictionary by frequency, default is false)\n";
+-		std::cerr << "-pf=<freq>  (prune words with frequency below <freq>\n";
+-		std::cerr << "-pr=<rank>  (prune words with frequency rank above <rank>\n";
+-		std::cerr << "-is= (interruption symbol) \n";
+-		std::cerr << "-c=[yes|no] (show dictionary growth curve)\n";
+-		std::cerr << "-cs=curvesize (default 10)\n";
+-		std::cerr << "-t=testfile (compute OOV rates on test corpus)\n";
+-		std::cerr << "-LoadFactor=<value> (set the load factor for cache; it should be a positive real value; if not defined a default value is used)\n";
+-		std::cerr << "-listOOV=[yes|no] (print OOV words to stderr, default is false)\n\n";
++  std::cerr << std::endl << "dict - extracts a dictionary" << std::endl;
++  std::cerr << std::endl << "USAGE:"  << std::endl
++	    << "       dict -i=<inputfile> [options]" << std::endl;
++  std::cerr << std::endl << "DESCRIPTION:" << std::endl
++	    << "       dict extracts a dictionary from a corpus or a dictionary." << std::endl;
++  std::cerr << std::endl << "OPTIONS:" << std::endl;
++  std::cerr << "       -o=outputfile" << std::endl;
++  std::cerr << "       -f=[yes|no] (output word frequencies, default is false)" << std::endl;
++  std::cerr << "       -sort=[yes|no] (sort dictionary by frequency, default is false)" << std::endl;
++  std::cerr << "       -pf=<freq>  (prune words with frequency below <freq>" << std::endl;
++  std::cerr << "       -pr=<rank>  (prune words with frequency rank above <rank>" << std::endl;
++  std::cerr << "       -is= (interruption symbol)" << std::endl;
++  std::cerr << "       -c=[yes|no] (show dictionary growth curve)" << std::endl;
++  std::cerr << "       -cs=curvesize (default 10)" << std::endl;
++  std::cerr << "       -t=testfile (compute OOV rates on test corpus)" << std::endl;
++  std::cerr << "       -LoadFactor=<value> (set the load factor for cache; it should be a positive real value; if not defined a default value is used)" << std::endl;
++  std::cerr << "       -listOOV=[yes|no] (print OOV words to stderr, default is false)" << std::endl << std::endl;
+ 		
+ 		
+-		exit(1);
++  exit(1);
+     };
+ 	
+ 	// options compatibility issues:
+--- a/src/interpolate-lm.cpp
++++ b/src/interpolate-lm.cpp
+@@ -51,27 +51,28 @@
+ 
+ void usage(const char *msg = 0) {
+   if (msg) { std::cerr << msg << std::endl; }
+-  std::cerr << "Usage: interpolate-lm [options] lm-list-file [lm-list-file.out]" << std::endl;
+-  if (!msg) std::cerr << std::endl
+-		      << "  interpolate-lm reads a LM list file including interpolation weights " << std::endl
+-		      << "  with the format: N\\n w1 lm1 \\n w2 lm2 ...\\n wN lmN\n" << std::endl
+-		      << "  It estimates new weights on a development text, " << std::endl
+-		      << "  computes the perplexity on an evaluation text, " << std::endl
+-		      << "  computes probabilities of n-grams read from stdin." << std::endl
+-		      << "  It reads LMs in ARPA and IRSTLM binary format." << std::endl  << std::endl;
+-			
+-  std::cerr << "Options:\n"
+-            << "--learn|-l text-file learn optimal interpolation for text-file"<< std::endl
+-            << "--order|-o n         order of n-grams used in --learn (optional)"<< std::endl
+-            << "--eval|-e text-file  computes perplexity on text-file"<< std::endl
+-            << "--dub dict-size      dictionary upperbound (default 10^7)"<< std::endl
+-            << "--score|-s [yes|no]  compute log-probs of n-grams from stdin"<< std::endl
+-            << "--debug|-d [1-3]     verbose output for --eval option (see compile-lm)"<< std::endl
+-            << "--sentence [yes|no] (compute pperplexity at sentence level (identified through the end symbol)"<< std::endl
+-            << "--memmap| -mm 1      use memory map to read a binary LM" << std::endl
+-            << "--ngram_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
+-            << "--dict_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
+-            << "--level|lev <value> (set the maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken)" << std::endl; 
++  std::cerr << std::endl << "interpolate-lm - interpolates language models" << std::endl;
++  std::cerr << std::endl << "USAGE:"  << std::endl
++	    << "       interpolate-lm [options] <lm-list-file> [lm-list-file.out]" << std::endl;
++  if (!msg) std::cerr << std::endl << "DESCRIPTION:" << std::endl
++		      << "       interpolate-lm reads a LM list file including interpolation weights " << std::endl
++		      << "       with the format: N\\n w1 lm1 \\n w2 lm2 ...\\n wN lmN\n" << std::endl
++		      << "       It estimates new weights on a development text, " << std::endl
++		      << "       computes the perplexity on an evaluation text, " << std::endl
++		      << "       computes probabilities of n-grams read from stdin." << std::endl
++		      << "       It reads LMs in ARPA and IRSTLM binary format." << std::endl;
++  std::cerr << std::endl << "OPTIONS:" << std::endl
++	    << "       --learn|-l text-file learn optimal interpolation for text-file"<< std::endl
++	    << "       --order|-o n order of n-grams used in --learn (optional)"<< std::endl
++	    << "       --eval|-e text-file (computes perplexity of text-file and returns)"<< std::endl
++	    << "       --dub dict-size (dictionary upperbound to compute OOV word penalty: default 10^7)"<< std::endl
++	    << "       --score|-s [yes|no]  (computes log-prob scores of n-grams from standard input)"<< std::endl
++	    << "       --debug|-d [1-3] verbose output for --eval option (see compile-lm)"<< std::endl
++	    << "       --sentence [yes|no] (compute perplexity at sentence level, identified through the end symbol)"<< std::endl
++	    << "       --memmap|-mm 1 (uses memory map to read a binary LM)" << std::endl
++	    << "       --ngram_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
++	    << "       --dict_load_factor <value> (set the load factor for ngram cache ; it should be a positive real value; if not defined a default value is used)" << std::endl
++	    << "       --level|lev <value> (set the maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken)" << std::endl;
+ }
+ 
+ 
+--- a/src/ngt.cpp
++++ b/src/ngt.cpp
+@@ -125,9 +125,35 @@
+   GetParams(&argc, &argv, (char*) NULL);
+   
+   if (inp==NULL){
+-    cerr <<"No input was specified\n";
+-    exit(1);
+-  };
++  std::cerr << std::endl << "ngt - collects n-grams" << std::endl;
++  std::cerr << std::endl << "USAGE:"  << std::endl
++	    << "       ngt -i=<inputfile> [options]" << std::endl;
++  std::cerr << std::endl << "OPTIONS:" << std::endl;
++  std::cerr << "       -Dictionary|-d=dictfile (dictionary filename)" << std::endl;
++  std::cerr << "       -IntSymb|-is=string (interruption symbol)" << std::endl;
++  std::cerr << "       -NgramSize|-n=[1-" << MAX_NGRAM << "] (n-gram default size, default: 0)" << std::endl;
++  std::cerr << "       -InputFile|-i=inputfile" << std::endl;
++  std::cerr << "       -OutputFile|-o=outputfile" << std::endl;
++  std::cerr << "       -InputGoogleFormat|-gooinp=[yes|no] (default: no)" << std::endl;
++  std::cerr << "       -OutputGoogleFormat|-gooout=[yes|no] (default: no)" << std::endl;
++  std::cerr << "       -SaveBinaryTable|-b=[yes|no] (default: no)" << std::endl;
++  std::cerr << "       -LmTable|-lm=[yes|no] (work with LM table, default: no)" << std::endl;
++  std::cerr << "       -DistCo|-dc=int (compute distance co-occurrences, default: 0)" << std::endl;
++  std::cerr << "       -AugmentFile|-aug=string (augmentation data)" << std::endl;
++  std::cerr << "       -SaveSingle|-ss=[yes|no] (generate single table, default: no)" << std::endl;
++  std::cerr << "       -SubDict|-sd|-ConvDict|-cd=dictfile (subdictionary)" << std::endl;
++  std::cerr << "       -FilterDict|-fd=dictfile (filter dictionary)" << std::endl;
++  std::cerr << "       -FilterTable|-ft=file (ngramtable filename)" << std::endl;
++  std::cerr << "       -ftr|-FilterTableRate=double (minimum hit rate of filter, default: 1.0)" << std::endl;
++  std::cerr << "       -HistoMask|-hm=string (history mask)" << std::endl;
++  std::cerr << "       -InpLen|-il=int (input length for mask generation, default: 0)" << std::endl;
++  std::cerr << "       -tlm=[yes|no] (test LM table, default: no)" << std::endl;
++  std::cerr << "       -ftlm (file to test LM table)" << std::endl;
++  std::cerr << "       -memuse=[yes|no] (default: no)" << std::endl;
++  std::cerr << "       -iknstat=file (filename to save IKN statistics)" << std::endl;
++  std::cerr << std::endl;
++  exit(1);
++    };
+ 	
+   if (out==NULL)
+     cerr << "Warning: no output file specified!\n";
+--- a/src/plsa.cpp
++++ b/src/plsa.cpp
+@@ -124,28 +124,55 @@
+ 	GetParams(&argc, &argv, (char*) NULL);
+ 	
+ 	if (argc==1 || help){
+-		cerr <<"plsa: IRSTLM tool for Probabilistic Latent Semantic Analysis LM inference\n\n";
+-
+-		cerr <<"Usage (1): plsa -c=<collection> -d=<dictionary> -m=<model> -t=<topics> -it=<iter>\n\n";
+-		cerr <<"Train a PLSA model. Parameters specify collection and dictionary filenames\n";
+-		cerr <<"number of EM iterations, number of topics, and model filename. The collection\n";
+-		cerr <<"must begin with the number of documents and documents should be separated\n";
+-		cerr <<"with the </d> tag. The begin document tag <d> is not considered.\n";
+-		cerr <<"Example:\n";
+-		cerr <<"3\n";
+-		cerr <<"<d> hello world ! </d>\n";
+-		cerr <<"<d> good morning good afternoon </d>\n";
+-		cerr <<"<d> welcome aboard </d>\n\n";
+-
+-		cerr <<"Usage (2): plsa -c=<text collection> -d=<dictionary> -b=<binary collection>\n\n";
+-		cerr <<"Binarize a textual document collection to speed-up training (1)\n";
+-		cerr <<"\n";
+-		
+-		cerr <<"Usage (3): plsa -d=<dictionary> -m=<model> -t=<topics> -inf=<text> -f=<features> -it=<iterations>\n\n";
+-		cerr <<"Infer a full 1-gram distribution from a model and a small text. The 1-gram\n";
+-		cerr <<"is saved in the feature file. The 1-gram\n";
+-		cerr <<"\n";
+-		exit(1);	
++  std::cerr << std::endl << "plsa - performs probabilistic latent semantic analysis LM inference" << std::endl;
++  std::cerr << std::endl << "USAGE:"  << std::endl
++	    << "       plsa -c=<text_collection> -d=<dictionary> -m=<model> -t=<topics> -it=<iter> [options]" << std::endl
++	    << "       plsa -c=<text_collection> -d=<dictionary> -b=<binary_collection> [options]" << std::endl
++	    << "       plsa -d=<dictionary> -m=<model> -t=<topics> -inf=<text> -f=<features> -it=<iterations> [options]" << std::endl;
++  std::cerr << std::endl << "DESCRIPTION:" << std::endl;
++  std::cerr << "       plsa is a tool for probabilistic latent semantic analysis" << std::endl;
++  std::cerr << "       LM inference. It can be used to train a PLSA model, to binarize" << std::endl;
++  std::cerr << "       a textual document collection to speed-up training or to" << std::endl;
++  std::cerr << "       infer a full n-gram distribution from a model and a small text." << std::endl;
++  std::cerr << std::endl << "OPTIONS:" << std::endl;
++  std::cerr << "       plsa is a tool for probabilistic latent semantic analysis" << std::endl;
++  std::cerr << "       -Dictionary|-d=<file> (dictionary file)" << std::endl;
++  std::cerr << "       -Binary|-b=<file> (binary file)" << std::endl;
++  std::cerr << "       -SplitData|-sd=<int> (size of binary file, default: unlimited)" << std::endl;
++  std::cerr << "       -Collection|-c=<file> (text collection file)" << std::endl;
++  std::cerr << "       -Model|-m=<file> (model file)" << std::endl;
++  std::cerr << "       -HFile|-hf=<file>" << std::endl;
++  std::cerr << "       -WFile|-wf=<file>" << std::endl;
++  std::cerr << "       -TFile|-tf=<file>" << std::endl;
++  std::cerr << "       -CombineTFile|-ct=<file>" << std::endl;
++  std::cerr << "       -TxtFile|-txt=<file>" << std::endl;
++  std::cerr << "       -Inference|-inf=<file>" << std::endl;
++  std::cerr << "       -Features|-f=<file>" << std::endl;
++  std::cerr << "       -Topics|-t=<int> (number of topics, default: 0)" << std::endl;
++  std::cerr << "       -SpecialTopic|-st=<int> (special topic: first dictionary words, default: 0)" << std::endl;
++  std::cerr << "       -Iterations|-it=<int> (number of EM iterations, default: 0)" << std::endl;
++  std::cerr << "       -Help|-h=[yes|no]" << std::endl;
++  std::cerr << std::endl << "EXAMPLES:" << std::endl;
++  std::cerr <<"       (1) plsa -c=<text_collection> -d=<dictionary> -m=<model> -t=<topics> -it=<iter>" << std::endl;
++  std::cerr <<"           Train a PLSA model, <model>, from the text collection" << std::endl;
++  std::cerr <<"           <text_collection> using the dictionary <dictionary>. The" << std::endl;
++  std::cerr <<"           number of EM iterations is specified by <iter> and the" << std::endl;
++  std::cerr <<"           number of topics is specified by <topics>." << std::endl;
++  std::cerr <<"           The <text_collection> content must begin with the number of" << std::endl;
++  std::cerr <<"           documents and documents should be separated with the </d> tag." << std::endl;
++  std::cerr <<"           The begin document tag <d> is not considered." << std::endl;
++  std::cerr <<"           Example of <text_collection> content:" << std::endl;
++  std::cerr <<"           3" << std::endl;
++  std::cerr <<"           <d> hello world ! </d>" << std::endl;
++  std::cerr <<"           <d> good morning good afternoon </d>" << std::endl;
++  std::cerr <<"           <d> welcome aboard </d>" << std::endl;
++  std::cerr <<"       (2) plsa -c=<text_collection> -d=<dictionary> -b=<binary collection>" << std::endl;
++  std::cerr <<"           Binarize a textual document collection to speed-up training (1)" << std::endl;
++  std::cerr <<"       (3) plsa -d=<dictionary> -m=<model> -t=<topics> -inf=<text> -f=<features> -it=<iterations>" << std::endl;
++  std::cerr <<"           Infer a full 1-gram distribution from a model and a small" << std::endl;
++  std::cerr <<"           text. The 1-gram is saved in the feature file. The 1-gram" << std::endl;
++  std::cerr << std::endl;
++  exit(1);
+ 	}
+ 	
+ 	if (!dictfile)
+--- a/src/prune-lm.cpp
++++ b/src/prune-lm.cpp
+@@ -41,16 +41,20 @@
+ 
+ void usage(const char *msg = 0) {
+   if (msg) { std::cerr << msg << std::endl; }
+-  std::cerr << "Usage: prune-lm [--threshold=th2,th3,...] [--abs=1|0] input-file [output-file]" << std::endl << std::endl;
+-  std::cerr << "    prune-lm reads a LM in either ARPA or compiled format and" << std::endl;
+-  std::cerr << "    prunes out n-grams (n=2,3,..) for which backing-off to the" << std::endl;
+-  std::cerr << "    lower order n-gram results in a small difference in probability." << std::endl;
+-  std::cerr << "    The pruned LM is saved in ARPA format" << std::endl << std::endl;
+-  std::cerr << "    Options:" << std::endl;
+-  std::cerr << "    --threshold=th2,th3,th4,... (pruning threshods for 2-grams, 3-grams, 4-grams,..." << std::endl;
+-  std::cerr << "                                 If less thresholds are specified, the last one is  " << std::endl;
+-  std::cerr << "                                 applied to all following n-gram levels.            " << std::endl << std::endl;
+-  std::cerr << "    --abs=1|0 	if 1, use absolute value of weighted difference"<< std::endl;
++  std::cerr << std::endl << "prune-lm - prunes language models" << std::endl;
++  std::cerr << std::endl << "USAGE:"  << std::endl
++	    << "       prune-lm [options] <inputfile> [<outputfile>]" << std::endl;
++  if (!msg) std::cerr << std::endl << "DESCRIPTION:" << std::endl
++		      << "       prune-lm reads a LM in either ARPA or compiled format and" << std::endl
++		      << "       prunes out n-grams (n=2,3,..) for which backing-off to the" << std::endl
++		      << "       lower order n-gram results in a small difference in probability." << std::endl
++		      << "       The pruned LM is saved in ARPA format" << std::endl;
++  std::cerr << std::endl << "OPTIONS:" << std::endl;
++  std::cerr << "       --threshold=th2,th3,th4,... (pruning threshods for 2-grams, 3-grams, 4-grams,..." << std::endl;
++  std::cerr << "           If less thresholds are specified, the last one is  " << std::endl;
++  std::cerr << "           applied to all following n-gram levels. (default: 0)" << std::endl;
++  std::cerr << "       --abs=1|0 if 1, use absolute value of weighted difference (default: 0)" << std::endl;
++  std::cerr << std::endl;
+ 
+ }
+ 
+--- a/src/quantize-lm.cpp
++++ b/src/quantize-lm.cpp
+@@ -74,17 +74,25 @@
+ 
+ void usage(const char *msg = 0) {
+   if (msg) { std::cerr << msg << std::endl; }
+-  std::cerr << "Usage: quantize-lm input-file.lm [output-file.qlm [tmpfile]] " << std::endl;
+-  if (!msg) std::cerr << std::endl
+-    << "  quantize-lm reads a standard LM file in ARPA format and produces" << std::endl
+-    << "  a version of it with quantized probabilities and back-off weights"<< std::endl
+-    << "  that the IRST LMtoolkit can compile. Accepts LMs with .gz suffix." << std::endl
+-    << "  You can specify the output file to be created and also the pathname " << std::endl
+-    << "  of a temporary file used by the program. As default, the temporary "  << std::endl 
+-    << "  file is created in the /tmp directory. Output file can be " << std::endl
+-    << "  written to standard output by using the special name -. "  << std::endl;
++  std::cerr << std::endl << "quantize-lm - quantizes probabilities and back-off weights" << std::endl;
++  std::cerr << std::endl << "USAGE:"  << std::endl
++	    << "       quantize-lm <input-file.lm> [<output-file.qlm> [<tmpfile>]]" << std::endl;
++  if (!msg) std::cerr << std::endl << "DESCRIPTION:" << std::endl
++    << "       quantize-lm reads a standard LM file in ARPA format and produces" << std::endl
++    << "       a version of it with quantized probabilities and back-off weights"<< std::endl
++    << "       that the IRST LM toolkit can compile. Accepts LMs with .gz suffix." << std::endl
++    << "       You can specify the output file to be created and also the pathname" << std::endl
++    << "       of a temporary file used by the program. As default, the temporary "  << std::endl
++    << "       file is created in the /tmp directory. Output file can be" << std::endl
++    << "       written to standard output by using the special name -."  << std::endl;
++  std::cerr << std::endl;
+   }
+ 
++void handle_option(const std::string& opt, int argc, const char **argv, int& argi)
++{
++  if (opt == "--help" || opt == "-h") { usage(); exit(1); }
++}
++
+ 
+ int main(int argc, const char **argv)
+ {
+@@ -95,6 +103,7 @@
+   std::vector<std::string> files;
+   for (int i=1; i < argc; i++) {
+     std::string opt = argv[i];
++    if(opt[0] == '-') handle_option(opt, argc, argv, i);
+     files.push_back(opt);
+   }
+   if (files.size() > 3) { usage("Too many arguments"); exit(1); }
+--- a/src/score-lm.cpp
++++ b/src/score-lm.cpp
+@@ -30,12 +30,16 @@
+ 
+ 
+ void usage() {
+-	std::cerr <<	"Usage: score-lm -lm <model> [-dub <dub>] [-mm 1]\n"
+-			"       score sentences with a language model\n"
+-			"       -lm      language model to use (must be specified)\n"
+-			"       -dub     dictionary upper bound (default: 10000000)\n"
+-			"       -level   max level to load from the language models (default: 1000, meaning the actual LM order)\n"
+-			"       -mm 1    memory-mapped access to lm\n";
++  std::cerr << std::endl << "score-lm - scores sentences with a language model" << std::endl;
++  std::cerr << std::endl << "USAGE:"  << std::endl
++	    << "       score-lm -lm <model>  [options]" << std::endl;
++  std::cerr << std::endl << "OPTIONS:" << std::endl;
++  std::cerr << "       -lm      language model to use (must be specified)" << std::endl;
++  std::cerr << "       -dub     dictionary upper bound (default: 10000000" << std::endl;
++  std::cerr << "       -level   max level to load from the language models (default: 1000," << std::endl;
++  std::cerr << "           meaning the actual LM order)" << std::endl;
++  std::cerr << "       -mm 1    memory-mapped access to lm (default: 0)" << std::endl;
++  std::cerr << std::endl;
+ 	exit(1);
+ }
+ 
+--- a/src/tlm.cpp
++++ b/src/tlm.cpp
+@@ -236,8 +236,53 @@
+ 	
+ 	if (!trainfile || !lmtype)
+     {
+-		cerr <<"Missing parameters\n";
+-		exit(1);
++  std::cerr << std::endl << "tlm - trains and tests language models" << std::endl;
++  std::cerr << std::endl << "USAGE:"  << std::endl
++	    << "       tlm [options]" << std::endl;
++  std::cerr << std::endl << "OPTIONS:" << std::endl;
++  std::cerr << "       -Back-off|-bo=[yes|no] (yes: back-off or no: interpolation, default: no)" << std::endl;
++  std::cerr << "       -Dictionary|-d=<file>" << std::endl;
++  std::cerr << "       -DictionaryUpperBound|-dub=<int> (default: 0)" << std::endl;
++  std::cerr << "       -NgramSize|-n=[1-" << MAX_NGRAM << "] (default: 0)" << std::endl;
++  std::cerr << "       -Ngram|-TrainOn|-tr=<file>" << std::endl;
++  std::cerr << "       -oASR|-oasr=<file>" << std::endl;
++  std::cerr << "       -o|-oARPA|-oarpa=<file>" << std::endl;
++  std::cerr << "       -oBIN|-obin=<file>" << std::endl;
++  std::cerr << "       -TestOn|-te=<file>" << std::endl;
++  std::cerr << "       -AdaptOn|-ad=<file>" << std::endl;
++  std::cerr << "       -AdaptRate|-ar=<double> (default:1.0 )" << std::endl;
++  std::cerr << "       -AdaptLevel|-al=[1-" << MAX_NGRAM << "] (default: 0)" << std::endl;
++  std::cerr << "       -AdaptOOV|-ao[yes|no] (default: no)" << std::endl;
++  std::cerr << "       -SaveScaleFactor|-ssf=<file>" << std::endl;
++  std::cerr << "       -LanguageModelType|-lm=<LM_TYPE> (default: none)" << std::endl;
++  std::cerr << "           ModifiedShiftBeta|msb" << std::endl;
++  std::cerr << "           InterpShiftBeta|ShiftBeta|sb" << std::endl;
++  std::cerr << "           InterpShiftOne|ShiftOne|s1" << std::endl;
++  std::cerr << "           LinearWittenBell|wb" << std::endl;
++  std::cerr << "           LinearGoodTuring" << std::endl;
++  std::cerr << "           Mixture|mix" << std::endl;
++  std::cerr << "       -Interactive|-i=<INTERACTIVE_TYPE> (default: none)" << std::endl;
++  std::cerr << "           Ngram|Yes" << std::endl;
++  std::cerr << "           Sequence" << std::endl;
++  std::cerr << "           Adapt" << std::endl;
++  std::cerr << "           Turn" << std::endl;
++  std::cerr << "           Text" << std::endl;
++  std::cerr << "       -Statistics|-s=[1-3] (default: 0)" << std::endl;
++  std::cerr << "       -PruneThresh|-p=[1-1000] (default: 0)" << std::endl;
++  std::cerr << "       -PruneSingletons|-ps=[yes|no] (default: yes)" << std::endl;
++  std::cerr << "       -PruneTopSingletons|-pts=[yes|no] (default: no)" << std::endl;
++  std::cerr << "       -ComputeLMSize|-sz=[yes|no] (default: 0)" << std::endl;
++  std::cerr << "       -MaximumCachingLevel|-mcl=<int> (default: 0)" << std::endl;
++  std::cerr << "       -MemoryMap|-memmap|-mm=[yes|no] (default: no)" << std::endl;
++  std::cerr << "       -CheckProb|-cp=[yes|no] (default: no)" << std::endl;
++  std::cerr << "       -OutProb|-op=<string> (default: )" << std::endl;
++  std::cerr << "       -SubLMInfo|-slmi=<string>" << std::endl;
++  std::cerr << "       -SaveMixParam|-smp=<file>" << std::endl;
++  std::cerr << "       -LoadMixParam|-lmp=<file>" << std::endl;
++  std::cerr << "       -SetOovRate|-or=<double> (default: 0)" << std::endl;
++  std::cerr << "       -Beta|-beta=<double> (default: -1.0)" << std::endl;
++  std::cerr << std::endl;
++  exit(1);
+     };
+ 	
+ 	
diff --git a/debian/patches/scripts_fix.patch b/debian/patches/scripts_fix.patch
deleted file mode 100644
index 3c0cd3a..0000000
--- a/debian/patches/scripts_fix.patch
+++ /dev/null
@@ -1,11 +0,0 @@
-Description: fixes minor errors in scripts
-Author: Giulio Paci <giuliopaci at gmail.com>
-Forwarded: no
---- a/scripts/build-lm-qsub.sh
-+++ b/scripts/build-lm-qsub.sh
-@@ -1,4 +1,4 @@
--##! /bin/sh
-+#! /bin/sh
- 
- usage()
- {
diff --git a/debian/patches/series b/debian/patches/series
index e2f4189..5291d17 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,5 +1,5 @@
 
+upstream_revision_487.patch
 compilation_fixes.patch
 spelling_fixes.patch
-
-scripts_fix.patch
+online_documentation_enhancments.patch
diff --git a/debian/patches/upstream_revision_487.patch b/debian/patches/upstream_revision_487.patch
new file mode 100644
index 0000000..bc286d8
--- /dev/null
+++ b/debian/patches/upstream_revision_487.patch
@@ -0,0 +1,405 @@
+Description: upstream changes since SVN revision 487
+Origin: upstream, https://irstlm.svn.sourceforge.net/svnroot/irstlm/trunk
+Forwarded: not-needed
+Applied-Upstream: revision 487, https://irstlm.svn.sourceforge.net/svnroot/irstlm/trunk
+--- a/scripts/add-start-end.sh
++++ b/scripts/add-start-end.sh
+@@ -1,4 +1,4 @@
+-#! /bin/sh
++#! /bin/bash
+ 
+ #adds sentence start/end symbols to standard input and 
+ #trims words longer than 80 characters
+--- a/scripts/build-lm-qsub.sh
++++ b/scripts/build-lm-qsub.sh
+@@ -1,4 +1,4 @@
+-##! /bin/sh
++#! /bin/bash
+ 
+ usage()
+ {
+@@ -13,7 +13,7 @@
+    -o      Output gzipped LM, e.g. lm.gz
+    -k      Number of splits (default 5)
+    -n      Order of language model (default 3)
+-   -t      Directory for temporary files (default ./stat)
++   -t      Directory for temporary files (default ./stat_PID)
+    -p      Prune singleton n-grams (default false)
+    -u      Use uniform word frequency for dictionary splitting (default false)
+    -q      parameters for qsub ("-q <queue>", and any other)
+@@ -40,6 +40,8 @@
+ #paths to scripts and commands in irstlm
+ scr=$IRSTLM/bin
+ bin=$IRSTLM/bin
++gzip=`which gzip 2> /dev/null`;
++gunzip=`which gunzip 2> /dev/null`;
+ 
+ #check irstlm installation
+ if [ ! -e $bin/dict -o  ! -e $scr/split-dict.pl ]; then
+@@ -49,7 +51,7 @@
+ 
+ #default parameters
+ logfile=/dev/null
+-tmpdir=stat$$
++tmpdir=stat_$$
+ order=3
+ parts=3
+ inpfile="";
+@@ -109,7 +111,7 @@
+                      ;;
+ 	     *) 
+ 		 echo "wrong smoothing setting";
+-		 exiti 4;
++		 exit 4;
+ 	     esac
+              ;;
+          p)
+@@ -132,8 +134,8 @@
+ done
+ 
+ 
+-if [ $verbose ];then
+-echo inpfile=\"$inpfile\" outfile=$outfile order=$order parts=$parts tmpdir=$tmpdir prune=$prune smoothing=$smoothing dictionary=$dictionary
++if [ $verbose ]; then
++echo inpfile=\"$inpfile\" outfile=$outfile order=$order parts=$parts tmpdir=$tmpdir prune=$prune smoothing=$smoothing dictionary=$dictionary verbose=$verbose
+ fi
+ 
+ if [ ! "$inpfile" -o ! "$outfile" ]; then
+@@ -152,17 +154,20 @@
+ fi
+ 
+ #check tmpdir
++tmpdir_created=0;
+ if [ ! -d $tmpdir ]; then
+-   echo "Temporary directory $tmpdir not found";
++   echo "Temporary directory $tmpdir does not exist";
+    echo "creating $tmpdir";
+    mkdir -p $tmpdir;
++   tmpdir_created=1;
+ else
+-    echo "Cleaning temporary directory $tmpdir";
+-    rm $tmpdir/dict* $tmpdir/ngram.dict.* $tmpdir/lm.dict.* $tmpdir/ikn.stat.dict.* 2> /dev/null
++   echo "Cleaning temporary directory $tmpdir";
++   rm -r $tmpdir 2> /dev/null
++   if [ $? != 0 ]; then
++      echo "Warning: some temporary files could not be removed"
++   fi
+ fi
+ 
+-
+-
+ workingdir=`pwd | perl -pe 's/\/nfsmnt//g'`
+ cd $workingdir
+ 
+@@ -198,13 +203,16 @@
+ 
+ unset getpids
+ echo "Extracting n-gram statistics for each word list"
++echo "Important: dictionary must be ordered according to order of appearance of words in data"
++echo "used to generate n-gram blocks,  so that sub language model blocks results ordered too"
++
+ for sfx in ${suffix[@]} ; do
+ 
+ (\
+ qsub $queueparameters -b no -j yes -sync no -o $qsubout.$sfx -e $qsuberr.$sfx -N $qsubname-$sfx << EOF
+ cd $workingdir
+ echo exit status $?
+-$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="gzip -c > $tmpdir/ngram.dict.${sfx}.gz" -fd="$tmpdir/dict.${sfx}" $dictionary -iknstat="$tmpdir/ikn.stat.dict.${sfx}" 
++$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.dict.${sfx}.gz" -fd="$tmpdir/dict.${sfx}" $dictionary -iknstat="$tmpdir/ikn.stat.dict.${sfx}" 
+ echo exit status $?
+ echo
+ EOF
+@@ -237,7 +245,7 @@
+ cd $workingdir
+ echo exit status $?
+ 
+-$scr/build-sublm.pl $verbose $prune $smoothing "cat $tmpdir/ikn.stat.dict*" --size $order --ngrams "gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx}  
++$scr/build-sublm.pl $verbose $prune $smoothing "cat $tmpdir/ikn.stat.dict*" --size $order --ngrams "$gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx}  
+ echo exit status $?
+ 
+ echo
+@@ -258,7 +266,7 @@
+ cd $workingdir
+ echo exit status $?
+ 
+-$scr/build-sublm.pl $verbose $prune $smoothing --size $order --ngrams "gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx}  
++$scr/build-sublm.pl $verbose $prune $smoothing --size $order --ngrams "$gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx}  
+ 
+ echo
+ EOF
+@@ -292,9 +300,16 @@
+ ) 2>&1 > $qsublog
+ 
+ echo "Cleaning temporary directory $tmpdir";
+-rm -r $tmpdir 2> /dev/null
++rm $tmpdir/* 2> /dev/null
+ rm $qsubout* $qsuberr* $qsublog* 2> /dev/null
+ 
+-exit
++if [ $tmpdir_created -eq 1 ]; then
++    echo "Removing temporary directory $tmpdir";
++    rmdir $tmpdir 2> /dev/null
++    if [ $? != 0 ]; then
++        echo "Warning: the temporary directory could not be removed."
++    fi
++fi
+ 
++exit 0
+ 
+--- a/scripts/build-lm.sh
++++ b/scripts/build-lm.sh
+@@ -1,4 +1,6 @@
+-#! /bin/sh
++#! /bin/bash
++
++set -m # Enable Job Control
+ 
+ usage()
+ {
+@@ -13,7 +15,7 @@
+    -o      Output gzipped LM, e.g. lm.gz
+    -k      Number of splits (default 5)
+    -n      Order of language model (default 3)
+-   -t      Directory for temporary files (default ./stat)
++   -t      Directory for temporary files (default ./stat_PID)
+    -p      Prune singleton n-grams (default false)
+    -u      Use uniform word frequency for dictionary splitting (default false)
+    -s      Smoothing methods: witten-bell (default), kneser-ney, improved-kneser-ney
+@@ -30,7 +32,7 @@
+ fi
+ 
+ #paths to scripts and commands in irstlm
+-scr=$IRSTLM/bin/
++scr=$IRSTLM/bin
+ bin=$IRSTLM/bin
+ gzip=`which gzip 2> /dev/null`;
+ gunzip=`which gunzip 2> /dev/null`;
+@@ -43,7 +45,7 @@
+ 
+ #default parameters
+ logfile=/dev/null
+-tmpdir=stat
++tmpdir=stat_$$
+ order=3
+ parts=3
+ inpfile="";
+@@ -143,13 +145,18 @@
+ fi
+ 
+ #check tmpdir
++tmpdir_created=0;
+ if [ ! -d $tmpdir ]; then
+-   echo "Temporary directory $tmpdir not found";
++   echo "Temporary directory $tmpdir does not exist";
+    echo "creating $tmpdir";
+    mkdir -p $tmpdir;
++   tmpdir_created=1;
+ else
+-    echo "Cleaning temporary directory $tmpdir";
+-    rm $tmpdir/dict* $tmpdir/ngram.dict.* $tmpdir/lm.dict.* $tmpdir/ikn.stat.* 2> /dev/null
++   echo "Cleaning temporary directory $tmpdir";
++    rm $tmpdir/* 2> /dev/null
++    if [ $? != 0 ]; then
++        echo "Warning: some temporary files could not be removed"
++    fi
+ fi
+ 
+ 
+@@ -166,9 +173,11 @@
+ for sdict in $tmpdir/dict.*;do
+ sdict=`basename $sdict`
+ echo $sdict;
+-$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.${sdict}.gz" -fd="$tmpdir/$sdict" $dictionary  -iknstat="$tmpdir/ikn.stat.$sdict" >> $logfile 2>&1
++$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.${sdict}.gz" -fd="$tmpdir/$sdict" $dictionary  -iknstat="$tmpdir/ikn.stat.$sdict" >> $logfile 2>&1 &
+ done
+ 
++# Wait for all parallel jobs to finish
++while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done
+ 
+ echo "Estimating language models for each word list"
+ for sdict in `ls $tmpdir/dict.*` ; do
+@@ -176,20 +185,32 @@
+ echo $sdict;
+ 
+ if [ $smoothing = "--kneser-ney" -o $smoothing = "--improved-kneser-ney" ]; then
+-$scr/build-sublm.pl $verbose $prune $smoothing "cat $tmpdir/ikn.stat.dict.*" --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict  >> $logfile #2>&1
++$scr/build-sublm.pl $verbose $prune $smoothing "cat $tmpdir/ikn.stat.dict.*" --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict  >> $logfile 2>&1 &
+ else
+-$scr/build-sublm.pl $verbose $prune $smoothing  --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict  >> $logfile #2>&1
++$scr/build-sublm.pl $verbose $prune $smoothing  --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict  >> $logfile 2>&1 &
+ fi
+ 
+ done
+ 
++# Wait for all parallel jobs to finish
++while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done
++
+ echo "Merging language models into $outfile"
+ $scr/merge-sublm.pl --size $order --sublm $tmpdir/lm.dict -lm $outfile  >> $logfile 2>&1
+ 
+ echo "Cleaning temporary directory $tmpdir";
+-rm $tmpdir/dict* $tmpdir/ngram.dict.* $tmpdir/lm.dict.* $tmpdir/ikn.stat.dict.* 2> /dev/null
++rm $tmpdir/* 2> /dev/null
++
++if [ $tmpdir_created -eq 1 ]; then
++    echo "Removing temporary directory $tmpdir";
++    rmdir $tmpdir 2> /dev/null
++    if [ $? != 0 ]; then
++        echo "Warning: the temporary directory could not be removed."
++    fi
++fi
++ 
++exit 0
++
++
+ 
+-echo "Removing temporary directory $tmpdir";
+-rmdir $tmpdir 2> /dev/null
+ 
+-exit
+--- a/scripts/rm-start-end.sh
++++ b/scripts/rm-start-end.sh
+@@ -1,6 +1,6 @@
+-#! /bin/sh
++#! /bin/bash
+ 
+-#rm star-end symbols
++#rm start-end symbols
+ 
+ sed 's/<s>//g' | sed 's/<\/s>//g' | sed 's/^ *//' | sed 's/ *$//' | sed '/^$/d'
+ 
+--- a/scripts/split-ngt.sh
++++ b/scripts/split-ngt.sh
+@@ -1,4 +1,4 @@
+-#! /bin/sh
++#! /bin/bash
+ 
+ #usage:
+ #ngt-split.sh <input> <output> <size> <parts>
+--- a/src/lmtable.h
++++ b/src/lmtable.h
+@@ -44,7 +44,7 @@
+ #define MIN(a,b) (((a)<(b))?(a):(b))
+ 
+ #define LMTMAXLEV  20
+-#define MAX_LINE  1024
++#define MAX_LINE  100000
+ 
+ #ifndef  LMTCODESIZE
+ #define  LMTCODESIZE  (int)3
+--- a/src/plsa.cpp
++++ b/src/plsa.cpp
+@@ -54,6 +54,7 @@
+ 	char *featurefile=NULL;
+ 	char *basefile=NULL;
+ 	char *hfile=NULL;
++	char *tmphfile=NULL;
+ 	char *tfile=NULL;
+ 	char *wfile=NULL;
+ 	char *ctfile=NULL;
+@@ -83,8 +84,8 @@
+ 				  "Model", CMDSTRINGTYPE, &basefile,
+ 				  "m", CMDSTRINGTYPE, &basefile,
+ 				  
+-				  "HFile", CMDSTRINGTYPE, &hfile,
+-				  "hf", CMDSTRINGTYPE, &hfile,
++				  "HFile", CMDSTRINGTYPE, &tmphfile,
++				  "hf", CMDSTRINGTYPE, &tmphfile,
+ 				  
+ 				  "WFile", CMDSTRINGTYPE, &wfile,
+ 				  "wf", CMDSTRINGTYPE, &wfile,
+@@ -119,6 +120,7 @@
+ 				  (char *)NULL
+ 				  );
+ 	
++
+ 	GetParams(&argc, &argv, (char*) NULL);
+ 	
+ 	if (argc==1 || help){
+@@ -170,9 +172,14 @@
+ 		exit(1);
+     }
+ 	
+-	if (!hfile){
++	if (!tmphfile){
+ 		//set default value
+-		 strcpy(hfile,"hfff");
++		hfile=new char[4+1];
++		strcpy(hfile,"hfff");
++	}else{
++		//set the value of the parameter
++		hfile=new char[strlen(tmphfile)+1];
++		strcpy(hfile,tmphfile);
+ 	}
+ 
+ 	dictionary dict(dictfile);
+@@ -219,6 +226,7 @@
+ 		tc.train(adafile,it,.0);
+ 	}
+ 	if (strcmp(hfile,"hfff")==0)  system("rm -f hfff");
++	delete hfile;
+ 	
+ 	exit(1); 
+ }
+--- a/src/quantize-lm.cpp
++++ b/src/quantize-lm.cpp
+@@ -31,7 +31,7 @@
+ #include "math.h"
+ #include "util.h"
+ 
+-#define MAX_LINE 1024
++#define MAX_LINE 100000
+ 
+ //----------------------------------------------------------------------
+ //  Special type and global variable for the BIN CLUSTERING algorithm
+--- a/src/util.h
++++ b/src/util.h
+@@ -27,7 +27,7 @@
+ 
+ 
+ #define LMTMAXLEV  20
+-#define MAX_LINE  1024
++#define MAX_LINE  100000
+ 
+ std::string gettempfolder();
+ void createtempfile(std::ofstream  &fileStream, std::string &filePath, std::ios_base::openmode flags);
+--- a/config.h.in
++++ b/config.h.in
+@@ -33,6 +33,10 @@
+ /* Define to 1 if you have the <unistd.h> header file. */
+ #undef HAVE_UNISTD_H
+ 
++/* Define to the sub-directory in which libtool stores uninstalled libraries.
++   */
++#undef LT_OBJDIR
++
+ /* Name of package */
+ #undef PACKAGE
+ 
+@@ -48,6 +52,9 @@
+ /* Define to the one symbol short name of this package. */
+ #undef PACKAGE_TARNAME
+ 
++/* Define to the home page for this package. */
++#undef PACKAGE_URL
++
+ /* Define to the version of this package. */
+ #undef PACKAGE_VERSION
+ 
+--- a/regenerate-makefiles.sh
++++ b/regenerate-makefiles.sh
+@@ -1,4 +1,4 @@
+-#!/bin/bash
++#!/bin/sh
+ 
+ # NOTE:
+ # Versions 1.9 (or higher) of aclocal and automake are required.
+@@ -14,7 +14,7 @@
+ force=$1;
+ # set parameter force to the value "--force" if you want to recreate all links to the autotools
+ 
+-function die () {
++die () {
+   echo "$@" >&2
+   exit 1
+ }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git


