[irstlm] 09/126: added scripts folder
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:39 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.
commit a78296cb54f7febd2c8afb11f84ce9b7e7108be7
Author: Marcello Federico <mrcfdr at gmail.com>
Date: Mon Jul 20 09:38:02 2015 +0200
added scripts folder
---
scripts/CMakeLists.txt | 5 +
scripts/add-start-end.sh | 72 +++++++
scripts/build-lm-qsub.sh | 318 ++++++++++++++++++++++++++++++
scripts/build-lm.sh | 254 ++++++++++++++++++++++++
scripts/build-sublm.pl | 467 ++++++++++++++++++++++++++++++++++++++++++++
scripts/cmake_install.cmake | 48 +++++
scripts/goograms2ngrams.pl | 145 ++++++++++++++
scripts/lm-stat.pl | 63 ++++++
scripts/mdtsel.sh | 219 +++++++++++++++++++++
scripts/merge-sublm.pl | 208 ++++++++++++++++++++
scripts/ngram-split.pl | 84 ++++++++
scripts/other/beautify.perl | 22 +++
scripts/plsa.sh | 346 ++++++++++++++++++++++++++++++++
scripts/qplsa.sh | 183 +++++++++++++++++
scripts/rm-start-end.sh | 30 +++
scripts/sort-lm.pl | 124 ++++++++++++
scripts/split-dict.pl | 157 +++++++++++++++
scripts/split-ngt.sh | 89 +++++++++
scripts/wrapper | 10 +
19 files changed, 2844 insertions(+)
diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt
new file mode 100755
index 0000000..f47cddf
--- /dev/null
+++ b/scripts/CMakeLists.txt
@@ -0,0 +1,5 @@
+INSTALL(PROGRAMS
+ add-start-end.sh build-lm-qsub.sh build-lm.sh build-sublm.pl goograms2ngrams.pl lm-stat.pl mdtsel.sh merge-sublm.pl ngram-split.pl rm-start-end.sh sort-lm.pl split-dict.pl split-ngt.sh wrapper
+ DESTINATION bin
+ PERMISSIONS OWNER_EXECUTE OWNER_READ OWNER_WRITE
+ )
diff --git a/scripts/add-start-end.sh b/scripts/add-start-end.sh
new file mode 100755
index 0000000..393e30e
--- /dev/null
+++ b/scripts/add-start-end.sh
@@ -0,0 +1,72 @@
+#! /bin/bash
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+#******************************************************************************
+
+function usage()
+{
+ cmnd=$(basename $0);
+ cat<<EOF
+
+$cmnd - adds sentence start/end symbols in each line and trims very very long words
+
+USAGE:
+ $cmnd [options]
+
+OPTIONS:
+ -h Show this message
+ -r count Specify symbol repetitions (default 1)
+ -t length Trim words up to _length_ chars (default 80)
+ -s char Specify symbol (default s)
+
+EOF
+}
+
+#default setting
+repeat=1;
+maxwordlen=80;
+symbol="s"
+
+# Parse options
+while getopts "hr:t:s:" OPT; do
+ case "$OPT" in
+ h)
+ usage >&2;
+ exit 0;
+ ;;
+ r) repeat=$OPTARG
+ ;;
+ t) maxwordlen=$OPTARG
+ ;;
+ s) symbol=$OPTARG
+ ;;
+ esac
+done
+
+#adds start/end symbols to standard input and
+#trims words longer than 80 characters
+eos="";
+bos="";
+
+for i in `seq $repeat`; do bos="$bos<${symbol}> "; eos="$eos <\/${symbol}>";done
+
+(sed "s/^/$bos/" | sed "s/\$/ $eos/";) |\
+sed "s/\([^ ]\{$maxwordlen\}\)\([^ ]\{1,\}\)/\1/g"
+
diff --git a/scripts/build-lm-qsub.sh b/scripts/build-lm-qsub.sh
new file mode 100755
index 0000000..36100f4
--- /dev/null
+++ b/scripts/build-lm-qsub.sh
@@ -0,0 +1,318 @@
+#! /bin/bash
+
+function usage()
+{
+ cmnd=$(basename $0);
+ cat<<EOF
+
+$cmnd - estimates a language model file
+
+USAGE:
+ $cmnd [options]
+
+OPTIONS:
+ -h Show this message
+ -i Input training file e.g. 'gunzip -c train.gz'
+ -o Output gzipped LM, e.g. lm.gz
+ -k Number of splits (default 5)
+ -n Order of language model (default 3)
+ -t Directory for temporary files (default ./stat_PID)
+ -p Prune singleton n-grams (default false)
+ -u Use uniform word frequency for dictionary splitting (default false)
+ -q Parameters for qsub ("-q <queue>", and any other)
+ -s Smoothing methods: witten-bell (default), kneser-ney (approximated kneser-ney), improved-kneser-ney
+ -b Include sentence boundary n-grams (optional)
+ -d Define subdictionary for n-grams (optional)
+ -v Verbose
+
+EOF
+}
+
+hostname=`uname -n`
+if [ $hostname == "voxgate" ] ; then
+echo "voxgate can not be used as submission host"
+echo "use any other cluster machine"
+exit
+fi
+
+if [ ! $IRSTLM ]; then
+ echo "Set IRSTLM environment variable with path to irstlm"
+ exit 2;
+fi
+
+#paths to scripts and commands in irstlm
+scr=$IRSTLM/bin
+bin=$IRSTLM/bin
+gzip=`which gzip 2> /dev/null`;
+gunzip=`which gunzip 2> /dev/null`;
+
+#check irstlm installation
+if [ ! -e $bin/dict -o ! -e $scr/split-dict.pl ]; then
+ echo "$IRSTLM does not contain a proper installation of IRSTLM"
+ exit 3;
+fi
+
+#default parameters
+logfile=/dev/null
+tmpdir=stat_$$
+order=3
+parts=3
+inpfile="";
+outfile=""
+verbose="";
+smoothing="--witten-bell";
+prune="";
+boundaries="";
+dictionary="";
+uniform="-f=y";
+queueparameters=""
+
+while getopts "hvi:o:n:k:t:s:q:pbl:d:u" OPTION
+do
+ case $OPTION in
+ h)
+ usage
+ exit 0
+ ;;
+ v)
+ verbose="--verbose";
+ ;;
+ i)
+ inpfile=$OPTARG
+ ;;
+ d)
+ dictionary="-sd=$OPTARG"
+ ;;
+
+ u)
+ uniform=" "
+ ;;
+
+ o)
+ outfile=$OPTARG
+ ;;
+ n)
+ order=$OPTARG
+ ;;
+ k)
+ parts=$OPTARG
+ ;;
+ t)
+ tmpdir=$OPTARG
+ ;;
+ s)
+ smoothing=$OPTARG
+ case $smoothing in
+ witten-bell)
+ smoothing="--witten-bell"
+ ;;
+ kneser-ney)
+ smoothing="--kneser-ney"
+ ;;
+ improved-kneser-ney)
+ smoothing="--improved-kneser-ney"
+ ;;
+ *)
+ echo "wrong smoothing setting";
+ exit 4;
+ esac
+ ;;
+ p)
+ prune='--prune-singletons';
+ ;;
+ q)
+ queueparameters=$OPTARG;
+ ;;
+ b)
+ boundaries='--cross-sentence';
+ ;;
+ l)
+ logfile=$OPTARG
+ ;;
+ ?)
+ usage
+ exit
+ ;;
+ esac
+done
+
+
+if [ $verbose ]; then
+echo inpfile=\"$inpfile\" outfile=$outfile order=$order parts=$parts tmpdir=$tmpdir prune=$prune smoothing=$smoothing dictionary=$dictionary verbose=$verbose
+fi
+
+if [ ! "$inpfile" -o ! "$outfile" ]; then
+ usage
+ exit 5
+fi
+
+if [ -e $outfile ]; then
+ echo "Output file $outfile already exists! either remove or rename it."
+ exit 6;
+fi
+
+if [ -e $logfile -a $logfile != "/dev/null" -a $logfile != "/dev/stdout" ]; then
+ echo "Logfile $logfile already exists! either remove or rename it."
+ exit 7;
+fi
+
+#check tmpdir
+tmpdir_created=0;
+if [ ! -d $tmpdir ]; then
+ echo "Temporary directory $tmpdir does not exist";
+ echo "creating $tmpdir";
+ mkdir -p $tmpdir;
+ tmpdir_created=1;
+else
+ echo "Cleaning temporary directory $tmpdir";
+ rm $tmpdir 2> /dev/null
+ if [ $? != 0 ]; then
+ echo "Warning: some temporary files could not be removed"
+ fi
+fi
+
+workingdir=`pwd | perl -pe 's/\/nfsmnt//g'`
+cd $workingdir
+
+qsubout="$workingdir/DICT-OUT$$"
+qsuberr="$workingdir/DICT-ERR$$"
+qsublog="$workingdir/DICT-LOG$$"
+qsubname="DICT"
+
+(\
+qsub $queueparameters -b no -sync yes -o $qsubout -e $qsuberr -N $qsubname << EOF
+cd $workingdir
+echo exit status $?
+echo "Extracting dictionary from training corpus"
+$bin/dict -i="$inpfile" -o=$tmpdir/dictionary $uniform -sort=no
+echo exit status $?
+echo "Splitting dictionary into $parts lists"
+$scr/split-dict.pl --input $tmpdir/dictionary --output $tmpdir/dict. --parts $parts
+echo exit status $?
+EOF
+) 2>&1 > $qsublog
+
+unset suffix
+#getting list of suffixes
+for file in `ls $tmpdir/dict.*` ; do
+sfx=`echo $file | perl -pe 's/^.+\.(\d+)$/$1/'`
+suffix[${#suffix[@]}]=$sfx
+done
+
+qsubout="$workingdir/NGT-OUT$$"
+qsuberr="$workingdir/NGT-ERR$$"
+qsublog="$workingdir/NGT-LOG$$"
+qsubname="NGT"
+
+unset getpids
+echo "Extracting n-gram statistics for each word list"
+echo "Important: dictionary must be ordered according to order of appearance of words in data"
+echo "used to generate n-gram blocks, so that sub language model blocks results ordered too"
+
+for sfx in ${suffix[@]} ; do
+
+(\
+qsub $queueparameters -b no -j yes -sync no -o $qsubout.$sfx -e $qsuberr.$sfx -N $qsubname-$sfx << EOF
+cd $workingdir
+echo exit status $?
+$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.dict.${sfx}.gz" -fd="$tmpdir/dict.${sfx}" $dictionary -iknstat="$tmpdir/ikn.stat.dict.${sfx}"
+echo exit status $?
+echo
+EOF
+) 2>&1 > $qsublog.$sfx
+
+id=`cat $qsublog.$sfx | grep 'Your job' | awk '{print $3}'`
+sgepid[${#sgepid[@]}]=$id
+
+done
+
+waiting=""
+for id in ${sgepid[@]} ; do waiting="$waiting -hold_jid $id" ; done
+
+qsub $queueparameters -sync yes $waiting -j y -o /dev/null -e /dev/null -N $qsubname.W -b y /bin/ls 2>&1 > $qsubname.W.log
+rm $qsubname.W.log
+
+qsubout="$workingdir/SUBLM-OUT$$"
+qsuberr="$workingdir/SUBLM-ERR$$"
+qsublog="$workingdir/SUBLM-LOG$$"
+qsubname="SUBLM"
+
+unset getpids
+echo "Estimating language models for each word list"
+
+if [ $smoothing = "--kneser-ney" -o $smoothing = "--improved-kneser-ney" ]; then
+
+for sfx in ${suffix[@]} ; do
+(\
+qsub $queueparameters -b no -j yes -sync no -o $qsubout.$sfx -e $qsuberr.$sfx -N $qsubname-$sfx << EOF
+cd $workingdir
+echo exit status $?
+
+$scr/build-sublm.pl $verbose $prune $smoothing "cat $tmpdir/ikn.stat.dict*" --size $order --ngrams "$gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx}
+echo exit status $?
+
+echo
+EOF
+) 2>&1 > $qsublog.$sfx
+
+id=`cat $qsublog.$sfx | grep 'Your job' | awk '{print $3}'`
+sgepid[${#sgepid[@]}]=$id
+
+done
+
+else
+
+
+for sfx in ${suffix[@]} ; do
+(\
+qsub $queueparameters -b no -j yes -sync no -o $qsubout.$sfx -e $qsuberr.$sfx -N $qsubname-$sfx << EOF
+cd $workingdir
+echo exit status $?
+
+$scr/build-sublm.pl $verbose $prune $smoothing --size $order --ngrams "$gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx}
+
+echo
+EOF
+) 2>&1 > $qsublog.$sfx
+
+id=`cat $qsublog.$sfx | grep 'Your job' | awk '{print $3}'`
+sgepid[${#sgepid[@]}]=$id
+
+done
+
+fi
+
+
+waiting=""
+for id in ${sgepid[@]} ; do waiting="$waiting -hold_jid $id" ; done
+
+
+qsub $queueparameters -sync yes $waiting -o /dev/null -e /dev/null -N $qsubname.W -b yes /bin/ls 2>&1 > $qsubname.W.log
+rm $qsubname.W.log
+
+echo "Merging language models into $outfile"
+qsubout="$workingdir/MERGE-OUT$$"
+qsuberr="$workingdir/MERGE-ERR$$"
+qsublog="$workingdir/MERGE-LOG$$"
+qsubname="MERGE"
+(\
+qsub $queueparameters -b no -j yes -sync yes -o $qsubout -e $qsuberr -N $qsubname << EOF
+cd $workingdir
+$scr/merge-sublm.pl --size $order --sublm $tmpdir/lm.dict -lm $outfile
+EOF
+) 2>&1 > $qsublog
+
+echo "Cleaning temporary directory $tmpdir";
+rm $tmpdir/* 2> /dev/null
+rm $qsubout* $qsuberr* $qsublog* 2> /dev/null
+
+if [ $tmpdir_created -eq 1 ]; then
+ echo "Removing temporary directory $tmpdir";
+ rmdir $tmpdir 2> /dev/null
+ if [ $? != 0 ]; then
+ echo "Warning: the temporary directory could not be removed."
+ fi
+fi
+
+exit 0
+
diff --git a/scripts/build-lm.sh b/scripts/build-lm.sh
new file mode 100755
index 0000000..82ef71b
--- /dev/null
+++ b/scripts/build-lm.sh
@@ -0,0 +1,254 @@
+#! /bin/bash
+
+set -m # Enable Job Control
+
+function usage()
+{
+ cmnd=$(basename $0);
+ cat<<EOF
+
+$cmnd - estimates a language model file and saves it in intermediate ARPA format
+
+USAGE:
+ $cmnd [options]
+
+OPTIONS:
+ -i|--InputFile Input training file e.g. 'gunzip -c train.gz'
+ -o|--OutputFile Output gzipped LM, e.g. lm.gz
+ -k|--Parts Number of splits (default 5)
+ -n|--NgramSize Order of language model (default 3)
+ -d|--Dictionary Define subdictionary for n-grams (optional, default is without any subdictionary)
+ -s|--LanguageModelType Smoothing methods: witten-bell (default), shift-beta, improved-shift-beta, stupid-backoff; kneser-ney and improved-kneser-ney still accepted for back-compatibility, but mapped into shift-beta and improved-shift-beta, respectively
+ -p|--PruneSingletons Prune singleton n-grams (default false)
+ -f|--PruneFrequencyThreshold Pruning frequency threshold for each level; comma-separated list of values; (default is '0,0,...,0', for all levels)
+ -t|--TmpDir Directory for temporary files (default ./stat_PID)
+ -l|--LogFile File to store logging info (default /dev/null)
+ -u|--uniform Use uniform word frequency for dictionary splitting (default false)
+ -b|--boundaries Include sentence boundary n-grams (optional, default false)
+ -v|--verbose Verbose
+ -h|-?|--help Show this message
+
+EOF
+}
+
+if [ ! $IRSTLM ]; then
+ echo "Set IRSTLM environment variable with path to irstlm"
+ exit 2
+fi
+
+#paths to scripts and commands in irstlm
+scr=$IRSTLM/bin
+bin=$IRSTLM/bin
+gzip=`which gzip 2> /dev/null`;
+gunzip=`which gunzip 2> /dev/null`;
+
+#check irstlm installation
+if [ ! -e $bin/dict -o ! -e $scr/split-dict.pl ]; then
+ echo "$IRSTLM does not contain a proper installation of IRSTLM"
+ exit 3
+fi
+
+#default parameters
+logfile=/dev/null
+tmpdir=stat_$$
+order=3
+parts=3
+inpfile="";
+outfile=""
+verbose="";
+smoothing="witten-bell";
+prune="";
+prune_thr_str="";
+boundaries="";
+dictionary="";
+uniform="-f=y";
+backoff=""
+
+while [ "$1" != "" ]; do
+ case $1 in
+ -i | --InputFile ) shift;
+ inpfile=$1;
+ ;;
+ -o | --OutputFile ) shift;
+ outfile=$1;
+ ;;
+ -n | --NgramSize ) shift;
+ order=$1;
+ ;;
+ -k | --Parts ) shift;
+ parts=$1;
+ ;;
+ -d | --Dictionary ) shift;
+ dictionary="-sd=$1";
+ ;;
+ -s | --LanguageModelType ) shift;
+ smoothing=$1;
+ ;;
+ -f | --PruneFrequencyThreshold ) shift;
+ prune_thr_str="--PruneFrequencyThreshold=$1";
+ ;;
+ -p | --PruneSingletons ) prune='--prune-singletons';
+ ;;
+ -l | --LogFile ) shift;
+ logfile=$1;
+ ;;
+ -t | --TmpDir ) shift;
+ tmpdir=$1;
+ ;;
+ -u | --uniform ) uniform=' ';
+ ;;
+ -b | --boundaries ) boundaries='--cross-sentence';
+ ;;
+ -v | --verbose ) verbose='--verbose';
+ ;;
+ -h | -? | --help ) usage;
+ exit 0;
+ ;;
+ * ) usage;
+ exit 1;
+ esac
+ shift
+done
+
+case $smoothing in
+witten-bell)
+smoothing="--witten-bell";
+;;
+kneser-ney)
+## kneser-ney still accepted for back-compatibility, but mapped into shift-beta
+smoothing="--shift-beta";
+;;
+improved-kneser-ney)
+## improved-kneser-ney still accepted for back-compatibility, but mapped into improved-shift-beta
+smoothing="--improved-shift-beta";
+;;
+shift-beta)
+smoothing="--shift-beta";
+;;
+improved-shift-beta)
+smoothing="--improved-shift-beta";
+;;
+stupid-backoff)
+smoothing="--stupid-backoff";
+backoff="--backoff"
+;;
+*)
+echo "wrong smoothing setting; '$smoothing' does not exist";
+exit 4
+esac
+
+
+echo "LOGFILE:$logfile"
+
+
+if [ $verbose ] ; then
+echo inpfile='"'$inpfile'"' outfile=$outfile order=$order parts=$parts tmpdir=$tmpdir prune=$prune smoothing=$smoothing dictionary=$dictionary verbose=$verbose prune_thr_str=$prune_thr_str >> $logfile 2>&1
+fi
+
+if [ ! "$inpfile" -o ! "$outfile" ] ; then
+ usage
+ exit 5
+fi
+
+if [ -e $outfile ]; then
+ echo "Output file $outfile already exists! either remove or rename it."
+ exit 6
+fi
+
+if [ -e $logfile -a $logfile != "/dev/null" -a $logfile != "/dev/stdout" ]; then
+ echo "Logfile $logfile already exists! either remove or rename it."
+ exit 7
+fi
+
+echo "BIS LOGFILE:$logfile" >> $logfile 2>&1
+
+#check tmpdir
+tmpdir_created=0;
+if [ ! -d $tmpdir ]; then
+ echo "Temporary directory $tmpdir does not exist" >> $logfile 2>&1
+ echo "creating $tmpdir" >> $logfile 2>&1
+ mkdir -p $tmpdir
+ tmpdir_created=1
+else
+ echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1
+ rm $tmpdir/* 2> /dev/null
+ if [ $? != 0 ]; then
+ echo "Warning: some temporary files could not be removed" >> $logfile 2>&1
+ fi
+fi
+
+
+echo "Extracting dictionary from training corpus" >> $logfile 2>&1
+$bin/dict -i="$inpfile" -o=$tmpdir/dictionary $uniform -sort=no 2> $logfile
+
+echo "Splitting dictionary into $parts lists" >> $logfile 2>&1
+$scr/split-dict.pl --input $tmpdir/dictionary --output $tmpdir/dict. --parts $parts >> $logfile 2>&1
+
+echo "Extracting n-gram statistics for each word list" >> $logfile 2>&1
+echo "Important: dictionary must be ordered according to order of appearance of words in data" >> $logfile 2>&1
+echo "used to generate n-gram blocks, so that sub language model blocks results ordered too" >> $logfile 2>&1
+
+for sdict in $tmpdir/dict.*;do
+sdict=`basename $sdict`
+echo "Extracting n-gram statistics for $sdict" >> $logfile 2>&1
+if [ $smoothing = "--shift-beta" -o $smoothing = "--improved-shift-beta" ]; then
+additional_parameters="-iknstat=$tmpdir/ikn.stat.$sdict"
+else
+additional_parameters=""
+fi
+
+$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.${sdict}.gz" -fd="$tmpdir/$sdict" $dictionary $additional_parameters >> $logfile 2>&1 &
+
+#$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.${sdict}.gz" -fd="$tmpdir/$sdict" $dictionary -iknstat="$tmpdir/ikn.stat.$sdict" >> $logfile 2>&1 &
+#else
+#$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.${sdict}.gz" -fd="$tmpdir/$sdict" $dictionary >> $logfile 2>&1 &
+#fi
+done
+
+# Wait for all parallel jobs to finish
+while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done
+
+echo "Estimating language models for each word list" >> $logfile 2>&1
+for sdict in `ls $tmpdir/dict.*` ; do
+sdict=`basename $sdict`
+echo "Estimating language models for $sdict" >> $logfile 2>&1
+
+if [ $smoothing = "--shift-beta" -o $smoothing = "--improved-shift-beta" ]; then
+additional_smoothing_parameters="cat $tmpdir/ikn.stat.dict.*"
+additional_parameters="$backoff"
+else
+additional_smoothing_parameters=""
+additional_parameters=""
+fi
+$scr/build-sublm.pl $verbose $prune $prune_thr_str $smoothing "$additional_smoothing_parameters" --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict $additional_parameters >> $logfile 2>&1 &
+
+#if [ $smoothing = "--shift-beta" -o $smoothing = "--improved-shift-beta" ]; then
+#$scr/build-sublm.pl $verbose $prune $prune_thr_str $smoothing "cat $tmpdir/ikn.stat.dict.*" --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict $backoff >> $logfile 2>&1 &
+#else
+#$scr/build-sublm.pl $verbose $prune $prune_thr_str $smoothing --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict >> $logfile 2>&1 &
+#fi
+
+done
+
+# Wait for all parallel jobs to finish
+while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done
+
+echo "Merging language models into $outfile" >> $logfile 2>&1
+$scr/merge-sublm.pl --size $order --sublm $tmpdir/lm.dict -lm $outfile $backoff >> $logfile 2>&1
+
+echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1
+rm $tmpdir/* 2> /dev/null
+
+if [ $tmpdir_created -eq 1 ]; then
+ echo "Removing temporary directory $tmpdir" >> $logfile 2>&1
+ rmdir $tmpdir 2> /dev/null
+ if [ $? != 0 ]; then
+ echo "Warning: the temporary directory could not be removed." >> $logfile 2>&1
+ fi
+fi
+
+exit 0
+
+
+
+
diff --git a/scripts/build-sublm.pl b/scripts/build-sublm.pl
new file mode 100755
index 0000000..0bbe875
--- /dev/null
+++ b/scripts/build-sublm.pl
@@ -0,0 +1,467 @@
+#! /usr/bin/perl
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+#******************************************************************************
+
+
+
+#first pass: read dictionary and generate 1-grams
+#second pass:
+#for n=2 to N
+# foreach n-1-grams
+# foreach n-grams with history n-1
+# compute smoothing statistics
+# store successors
+# compute back-off probability
+# compute smoothing probability
+# write n-1 gram with back-off prob
+# write all n-grams with smoothed probability
+
+use strict;
+use Getopt::Long "GetOptions";
+use File::Basename;
+
+my $gzip=`which gzip 2> /dev/null`;
+my $gunzip=`which gunzip 2> /dev/null`;
+chomp($gzip);
+chomp($gunzip);
+my $cutoffword="<CUTOFF>"; #special word for Google 1T-ngram cut-offs
+my $cutoffvalue=39; #cut-off threshold for Google 1T-ngram cut-offs
+
+#set defaults for optional parameters
+my ($verbose,$size,$ngrams,$sublm)=(0, 0, undef, undef);
+my ($witten_bell,$good_turing,$shift_beta,$improved_shift_beta,$stupid_backoff)=(0, 0, "", "", "");
+my ($witten_bell_flag,$good_turing_flag,$shift_beta_flag,$improved_shift_beta_flag,$stupid_backoff_flag)=(0, 0, 0, 0, 0);
+my ($freqshift,$prune_singletons,$prune_thr_str,$cross_sentence)=(0, 0, "", 0);
+
+my $help = 0;
+$help = 1 unless
+&GetOptions('size=i' => \$size,
+'freq-shift=i' => \$freqshift,
+'ngrams=s' => \$ngrams,
+'sublm=s' => \$sublm,
+'witten-bell' => \$witten_bell,
+'good-turing' => \$good_turing,
+'shift-beta=s' => \$shift_beta,
+'improved-shift-beta=s' => \$improved_shift_beta,
+'stupid-backoff' => \$stupid_backoff,
+'prune-singletons' => \$prune_singletons,
+'pft|PruneFrequencyThreshold=s' => \$prune_thr_str,
+'cross-sentence' => \$cross_sentence,
+'h|help' => \$help,
+'verbose' => \$verbose);
+
+
+if ($help || !$size || !$ngrams || !$sublm) {
+ my $cmnd = basename($0);
+ print "\n$cmnd - estimates single LMs\n",
+ "\nUSAGE:\n",
+ " $cmnd [options]\n",
+ "\nOPTIONS:\n",
+ " --size <int> maximum n-gram size for the language model\n",
+ " --ngrams <string> input file or command to read the ngram table\n",
+ " --sublm <string> output file prefix to write the sublm statistics \n",
+ " --freq-shift <int> (optional) value to be subtracted from all frequencies\n",
+ " --witten-bell (optional) use Witten-Bell linear smoothing (default) \n",
+ " --shift-beta <string> (optional) use Shift-Beta smoothing with statistics in <string>\n",
+ " --improved-shift-beta <string> (optional) use Improved Shift-Beta smoothing with statistics in <string>, similar to Improved Kneser Ney but without corrected counts\n",
+ " --good-turing (optional) use Good-Turing linear smoothing\n",
+ " --stupid-backoff (optional) use Stupid-Backoff smoothing\n",
+ " --prune-singletons (optional) remove n-grams occurring once, for n=3,4,5,... (disabled by default)\n",
+ " -pft, --PruneFrequencyThreshold <string> (optional) pruning frequency threshold for each level; comma-separated list of values; (default is \"0,0,...,0\", for all levels)\n",
+ " --cross-sentence (optional) include cross-sentence bounds (disabled by default)\n",
+ " --verbose (optional) print debugging info\n",
+ " -h, --help (optional) print these instructions\n",
+ "\n";
+
+ exit(1);
+}
+
+$good_turing_flag = 1 if ($good_turing);
+die "build-sublm: This LM is no more supported\n\n" if ($good_turing_flag==1);
+
+$witten_bell_flag = 1 if ($witten_bell);
+$shift_beta_flag = 1 if ($shift_beta);
+$stupid_backoff_flag = 1 if ($stupid_backoff);
+$improved_shift_beta_flag = 1 if ($improved_shift_beta);
+$witten_bell = $witten_bell_flag = 1 if ($witten_bell_flag + $shift_beta_flag + $improved_shift_beta_flag + $stupid_backoff_flag) == 0;
+
+print STDERR "build-sublm: size=$size ngrams=$ngrams sublm=$sublm witten-bell=$witten_bell shift-beta=$shift_beta improved-shift-beta=$improved_shift_beta stupid-backoff=$stupid_backoff prune-singletons=$prune_singletons cross-sentence=$cross_sentence PruneFrequencyThreshold=$prune_thr_str\n" if $verbose;
+
+
+die "build-sublm: choose only one smoothing method\n" if ($witten_bell_flag + $shift_beta_flag + $improved_shift_beta_flag + $stupid_backoff_flag) > 1;
+
+die "build-sublm: value of --size must be larger than 0\n" if $size<1;
+
+
+
+my @pruneFreqThr=();
+my $i=0;
+while ($i<=$size){
+ $pruneFreqThr[$i++]=0;
+}
+
+print STDERR "Pruning frequency threshold values:$prune_thr_str\n" if ($verbose);
+
+my @v=split(/,/,$prune_thr_str);
+$i=0;
+while ($i<scalar(@v)){
+ $pruneFreqThr[$i+1]=$v[$i];
+ $i++;
+ if ($i>=$size){
+ print STDERR "too many pruning frequency threshold values; kept the first values and skipped the others\n" if ($verbose);
+ last;
+ };
+}
+
+$i=1;
+while ($i<=$size){
+ if ($pruneFreqThr[$i] < $pruneFreqThr[$i-1]){
+ $pruneFreqThr[$i]=$pruneFreqThr[$i-1];
+ print STDERR "the value of the pruning frequency threshold for level $i has been adjusted to value $pruneFreqThr[$i]\n" if ($verbose);
+ }
+ $i++;
+}
+
+if ($verbose){
+ $i=0;
+ while ($i<=$size){
+ print STDERR "pruneFreqThr[$i]=$pruneFreqThr[$i]\n";
+ $i++;
+ }
+}
+
+my $log10=log(10.0); #service variable to convert log into log10
+my $oldwrd=""; #variable to check if 1-gram changed
+my @cnt=(); #counter of n-grams
+my $totcnt=0; #total counter of n-grams
+my ($ng,@ng); #read ngrams
+my $ngcnt=0; #store ngram frequency
+my $n;
+
+print STDERR "Collecting 1-gram counts\n" if $verbose;
+
+open(INP,"$ngrams") || open(INP,"$ngrams|") || die "cannot open $ngrams\n";
+open(GR,"|$gzip -c >${sublm}.1gr.gz") || die "cannot create ${sublm}.1gr.gz\n";
+
+while ($ng=<INP>) {
+
+ chomp($ng); @ng=split(/[ \t]+/,$ng); $ngcnt=(pop @ng) - $freqshift;
+
+ # warn "ng: |@ng| ngcnt:$ngcnt\n";
+
+ if ($oldwrd ne $ng[0]) {
+ # warn "$totcnt,$oldwrd,$ng[0]\n" if $oldwrd ne '';
+ printf (GR "%s\t%s\n",$totcnt,$oldwrd) if $oldwrd ne '';
+ $totcnt=0;$oldwrd=$ng[0];
+ }
+
+ #update counter
+ $totcnt+=$ngcnt;
+}
+
+printf GR "%s\t%s\n",$totcnt,$oldwrd;
+close(INP);
+close(GR);
+
+my (@h,$h,$hpr); #n-gram history
+my (@dict,$code); #sorted dictionary of history successors
+my ($diff,$singlediff,$diff1,$diff2,$diff3); #different successors of history
+my (@n1,@n2,@n3,@n4,@uno3); #IKN: n-grams occurring once or twice ...
+my (@beta,$beta); #IKN: n-grams occurring once or twice ...
+my $locfreq;
+
+#collect global statistics for (Improved) Shift-Beta smoothing
+if ($shift_beta_flag || $improved_shift_beta_flag) {
+ my $statfile=$shift_beta || $improved_shift_beta;
+ print STDERR "load \& merge IKN statistics from $statfile \n" if $verbose;
+ open(IKN,"$statfile") || open(IKN,"$statfile|") || die "cannot open $statfile\n";
+ while (<IKN>) {
+ my($lev,$n1,$n2,$n3,$n4,$uno3)=$_=~/level: (\d+) n1: (\d+) n2: (\d+) n3: (\d+) n4: (\d+) unover3: (\d+)/;
+ $n1[$lev]+=$n1;$n2[$lev]+=$n2;$n3[$lev]+=$n3;$n4[$lev]+=$n4;$uno3[$lev]+=$uno3;
+ print STDERR "from $statfile level $lev: n1:$n1 n2:$n2 n3:$n3 n4:$n4 uno3:$uno3\n";
+ print STDERR "level $lev: n1[$lev]:$n1[$lev] n3[$lev]:$n2[$lev] n3[$lev]:$n3[$lev] n4[$lev]:$n4[$lev] uno3[$lev]:$uno3[$lev]\n";
+ }
+ if ($verbose){
+ for (my $lev=1;$lev<=$#n1;$lev++) {
+ print STDERR "level $lev: n1[$lev]:$n1[$lev] n3[$lev]:$n2[$lev] n3[$lev]:$n3[$lev] n4[$lev]:$n4[$lev] uno3[$lev]:$uno3[$lev]\n";
+ }
+ }
+ close(IKN);
+}
+
+print STDERR "Computing n-gram probabilities:\n" if $verbose;
+
+foreach ($n=2;$n<=$size;$n++) {
+
+ $code=-1;@cnt=(); @dict=(); $totcnt=0;$diff=0; $singlediff=1; $diff1=0; $diff2=0; $diff3=0; $oldwrd="";
+
+ #compute smothing statistics
+ my (@beta,$beta);
+
+ if ($stupid_backoff_flag) {
+ $beta=0.4;
+ print STDERR "Stupid-Backoff smoothing: beta $n: $beta\n" if $verbose;
+ }
+
+ if ($shift_beta_flag) {
+ if ($n1[$n]==0 || $n2[$n]==0) {
+ print STDERR "Error in Shift-Beta smoothing statistics: resorting to Witten-Bell\n" if $verbose;
+ $beta=0;
+ } else {
+ $beta=$n1[$n]/($n1[$n] + 2 * $n2[$n]);
+ print STDERR "Shift-Beta smoothing: beta $n: $beta\n" if $verbose;
+ }
+ }
+
+ if ($improved_shift_beta_flag) {
+
+ my $Y=$n1[$n]/($n1[$n] + 2 * $n2[$n]);
+
+ if ($n3[$n] == 0 || $n4[$n] == 0 || $n2[$n] <= $n3[$n] || $n3[$n] <= $n4[$n]) {
+ print STDERR "Warning: higher order count-of-counts are wrong\n" if $verbose;
+ print STDERR "Fixing this problem by resorting only on the lower order count-of-counts\n" if $verbose;
+ $beta[1] = $Y;
+ $beta[2] = $Y;
+ $beta[3] = $Y;
+ } else {
+ $beta[1] = 1 - 2 * $Y * $n2[$n] / $n1[$n];
+ $beta[2] = 2 - 3 * $Y * $n3[$n] / $n2[$n];
+ $beta[3] = 3 - 4 * $Y * $n4[$n] / $n3[$n];
+ }
+ print STDERR "Improved-Shift-Beta smoothing: level:$n beta[1]:$beta[1] beta[2]:$beta[2] beta[3]:$beta[3]\n" if $verbose;
+ }
+
+ open(HGR,"$gunzip -c ${sublm}.".($n-1)."gr.gz |") || die "cannot open ${sublm}.".($n-1)."gr.gz\n";
+ open(INP,"$ngrams") || open(INP,"$ngrams |") || die "cannot open $ngrams\n";
+ open(GR,"| $gzip -c >${sublm}.${n}gr.gz");
+ open(NHGR,"| $gzip -c > ${sublm}.".($n-1)."ngr.gz") || die "cannot open ${sublm}.".($n-1)."ngr.gz";
+
+ my $ngram;
+ my ($reduced_h, $reduced_ng) = ("", "");
+
+ $ng=<INP>; chomp($ng); @ng=split(/[ \t]+/,$ng); $ngcnt=(pop @ng) - $freqshift;
+ $h=<HGR>; chomp($h); @h=split(/[ \t]+/,$h); $hpr=shift @h;
+ $reduced_ng=join(" ", at ng[0..$n-2]);
+ $reduced_h=join(" ", at h[0..$n-2]);
+
+ @cnt=(); @dict=();
+ $code=-1; $totcnt=0; $diff=0; $singlediff=0; $diff1=0; $diff2=0; $diff3=0; $oldwrd="";
+ do{
+
+ #load all n-grams starting with history h, and collect useful statistics
+
+ while ($reduced_h eq $reduced_ng){ #must be true the first time!
+
+ if ($oldwrd ne $ng[$n-1]) { #could this be otherwise? [Marcello 22/5/09]
+ $oldwrd=$ng[$n-1];
+ ++$code;
+ }
+
+ $dict[$code]=$ng[$n-1];
+ $cnt[$code]+=$ngcnt;
+ $totcnt+=$ngcnt;
+
+ $ng=<INP>;
+
+ if (defined($ng)){
+ chomp($ng);
+ @ng=split(/[ \t]+/,$ng);$ngcnt=(pop @ng) - $freqshift;
+ $reduced_ng=join(" ", at ng[0..$n-2]);
+ }
+ else{
+ last;
+ }
+ }
+
+ $diff=scalar(@cnt);
+ for (my $c=0;$c<scalar(@cnt);++$c){
+ $singlediff++ if $cnt[$c]==1;
+
+ if ($diff>1 && $dict[$c] eq $cutoffword) { # in google n-grams
+ #find estimates for remaining diff and singlediff
+ #proportional estimate
+ $diff--; #remove cutoffword
+ my $concentration=1.0-($diff-1)/$totcnt;
+ my $mass=1; #$totcnt/($totcnt+$ngcnt);
+ my $index=(1-($concentration * $mass))/(1-1/$cutoffvalue) + (1/$cutoffvalue);
+ my $cutoffdiff=int($ngcnt * $index);
+ $cutoffdiff=1 if $cutoffdiff==0;
+ print STDERR "diff $diff $totcnt cutofffreq $ngcnt -- cutoffdiff: $cutoffdiff\n";
+ print STDERR "concentration:",$concentration," mass:", $mass,"\n";
+ $diff+=$cutoffdiff;
+ }
+ }
+
+
+ if ($improved_shift_beta) {
+ for (my $c=0;$c<=$code;$c++) {
+ $diff1++ if $cnt[$c]==1;
+ $diff2++ if $cnt[$c]==2;
+ $diff3++ if $cnt[$c]>=3;
+ }
+ }
+
+ #print smoothed probabilities
+ my $boprob=0; #accumulate pruned probabilities
+ my $prob=0;
+ my $boprob_correction=0; #prob for the correction due to singleton pruning
+
+ if ($totcnt>0){
+ for (my $c=0;$c<=$code;$c++) {
+
+ $ngram=join(" ",$reduced_h,$dict[$c]);
+
+ print STDERR "totcnt:$totcnt diff:$diff singlediff:$singlediff\n" if $totcnt+$diff+$singlediff==0;
+
+ if ($shift_beta && $beta>0) {
+ $prob=($cnt[$c]-$beta)/$totcnt;
+ } elsif ($improved_shift_beta) {
+ my $b=($cnt[$c]>= 3? $beta[3]:$beta[$cnt[$c]]);
+ $prob=($cnt[$c] - $b)/$totcnt;
+ } elsif ($stupid_backoff) {
+ $prob=$cnt[$c]/$totcnt;
+ } else { ### other smoothing types, like Witten-Bell
+ $prob=$cnt[$c]/($totcnt+$diff);
+ }
+
+ ## skip n-grams containing OOV
+ ## if (&containsOOV($ngram)){ print STDERR "ngram:|$ngram| contains OOV --> hence skip\n"; next; }
+
+ ## skip also n-grams containing eos symbols not at the final
+ ## if (&CrossSentence($ngram)){ print STDERR "ngram:|$ngram| is Cross Sentence --> hence skip\n"; next; }
+
+
+ #rm singleton n-grams for (n>=3), if flag is active
+ #rm n-grams (n>=2) containing cross-sentence boundaries, if flag is not active
+ #rm n-grams containing <unk> or <cutoff> except for 1-grams
+
+ #warn "considering $size $n |$ngram|\n";
+ if (($prune_singletons && $n>=3 && $cnt[$c]==1) ||
+ (!$cross_sentence && &CrossSentence($ngram)) ||
+ (&containsOOV($dict[$c])) ||
+ ($n>=2 && &containsOOV($h)) ||
+ ($dict[$c] eq $cutoffword)
+ )
+ {
+ $boprob+=$prob;
+
+ if ($n<$size) { #output this anyway because it will be an history for n+1
+ printf GR "%f\t%s %s\n",-10000,$reduced_h,$dict[$c];
+ }
+ } else {
+ if ($cnt[$c] > $pruneFreqThr[$n]){
+ # print unpruned n-1 gram
+ my $logp=log($prob)/$log10;
+ printf(GR "%f\t%s %s\n",($logp>0?0:$logp),$reduced_h,$dict[$c]);
+ }else{
+ if ($n<$size) { #output this anyway because it will be an history for n+1
+ printf GR "%f\t%s %s\n",-10000,$reduced_h,$dict[$c];
+ }
+ }
+ }
+ }
+ }else{
+ $boprob=0;
+ }
+
+ if (($prune_singletons && $n>=3)){
+ if ($shift_beta && $beta>0) { # correction due to singleton pruning
+ $boprob_correction += (1.0-$beta) * $singlediff / $totcnt;
+ } elsif ($improved_shift_beta) { # correction due to singleton pruning
+ $boprob_correction += (1-$beta[1]) * $singlediff / $totcnt;
+ } elsif ($stupid_backoff) { # correction due to singleton pruning
+ $boprob_correction += $singlediff/($totcnt);
+ } else { # correction due to singleton pruning
+ $boprob_correction += $singlediff/($totcnt+$diff);
+ }
+ }
+ else{
+ $boprob_correction = 0;
+ }
+
+ $boprob=$boprob_correction;
+
+ #rewrite history including back-off weight
+
+ #check if history has to be pruned out
+ if ($hpr==-10000) {
+ #skip this history
+ } elsif ($shift_beta && $beta>0) {
+ print STDERR "wrong division: considering rewriting history --- h:|$h| --- hpr=$hpr --- totcnt:$totcnt -- denumerator:",($totcnt),"\n" if $totcnt==0;
+ my $lambda=$beta * $diff/$totcnt;
+ my $logp=log($boprob+$lambda)/$log10;
+ printf NHGR "%s\t%f\n",$h,($logp>0?0:$logp);
+ } elsif ($improved_shift_beta) {
+ print STDERR "wrong division: considering rewriting history --- h:|$h| --- hpr=$hpr --- totcnt:$totcnt -- denumerator:",($totcnt),"\n" if $totcnt==0;
+ my $lambda=($beta[1] * $diff1 + $beta[2] * $diff2 + $beta[3] * $diff3)/$totcnt;
+ my $logp=log($boprob+$lambda)/$log10;
+ printf NHGR "%s\t%f\n",$h,($logp>0?0:$logp);
+ } elsif ($stupid_backoff) {
+ print STDERR "wrong division: considering rewriting history --- h:|$h| --- hpr=$hpr --- totcnt:$totcnt -- denumerator:",($totcnt),"\n" if $totcnt==0;
+ my $lambda=$beta;
+ my $logp=log($lambda)/$log10;
+ printf NHGR "%s\t%f\n",$h,($logp>0?0:$logp);
+ } else {
+ print STDERR "wrong division: considering rewriting history --- h:|$h| --- hpr=$hpr --- totcnt:$totcnt diff:$diff -- denumerator:",($totcnt+$diff),"\n" if $totcnt+$diff==0;
+ my $lambda=$diff/($totcnt+$diff);
+ my $logp=log($boprob+$lambda)/$log10;
+ printf NHGR "%s\t%f\n",$h,($logp>0?0:$logp);
+ }
+
+ #reset smoothing statistics
+ $code=-1;@cnt=(); @dict=(); $totcnt=0;$diff=0;$singlediff=0;$oldwrd="";$diff1=0;$diff2=0;$diff3=0;$locfreq=0;
+
+ #read next history
+ $h=<HGR>;
+
+ if (defined($h)){
+ chomp($h); @h=split(/[ \t]+/,$h); $hpr=shift @h;
+ $reduced_h=join(" ", at h[0..$n-2]);
+ }else{
+ die "ERROR: Something could be wrong: history are terminated before ngrams!" if defined($ng);
+ }
+ }until (!defined($ng)); #n-grams are over
+
+ close(HGR); close(INP); close(GR); close(NHGR);
+
+ rename("${sublm}.".($n-1)."ngr.gz","${sublm}.".($n-1)."gr.gz");
+}
+
+
#returns 1 if the n-gram contains an end-of-sentence tag </s> in any
#position other than the last one (case-insensitive), 0 otherwise
sub CrossSentence(){
  my ($ngram) = @_;
  my $crosses = ($ngram =~ /<\/s> /i) ? 1 : 0;
  print STDERR "check CrossSentence ngram:|$ngram| is CrossSentence\n" if $verbose && $crosses;
  return $crosses;
}
+
#returns 1 if the n-gram contains the OOV marker <UNK>
#(matched case-insensitively), 0 otherwise
sub containsOOV(){
  my ($ngram) = @_;
  my $hasoov = ($ngram =~ /<UNK>/i) ? 1 : 0;
  print STDERR "check containsOOV ngram:|$ngram| contains OOV\n" if $verbose && $hasoov;
  return $hasoov;
}
diff --git a/scripts/cmake_install.cmake b/scripts/cmake_install.cmake
new file mode 100644
index 0000000..05bbaa7
--- /dev/null
+++ b/scripts/cmake_install.cmake
@@ -0,0 +1,48 @@
# Install script for directory: /Users/marcello/Workspace/software/irstlm/trunk/scripts
#
# NOTE(review): this file is generated by CMake at configure time and
# hard-codes absolute paths from one developer's machine; generated
# install scripts are build artifacts and are normally not committed
# to version control — confirm this was intended.

# Set the install prefix
if(NOT DEFINED CMAKE_INSTALL_PREFIX)
  set(CMAKE_INSTALL_PREFIX "/Users/marcello/Workspace/software/irstlm")
endif()
string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")

# Set the install configuration name.
if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
  if(BUILD_TYPE)
    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
      CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
  else()
    set(CMAKE_INSTALL_CONFIG_NAME "")
  endif()
  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
endif()

# Set the component getting installed.
if(NOT CMAKE_INSTALL_COMPONENT)
  if(COMPONENT)
    message(STATUS "Install component: \"${COMPONENT}\"")
    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
  else()
    set(CMAKE_INSTALL_COMPONENT)
  endif()
endif()

# Copy the toolkit scripts into <prefix>/bin as executable programs
if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE PROGRAM PERMISSIONS OWNER_EXECUTE OWNER_READ OWNER_WRITE FILES
    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/add-start-end.sh"
    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/build-lm-qsub.sh"
    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/build-lm.sh"
    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/build-sublm.pl"
    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/goograms2ngrams.pl"
    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/lm-stat.pl"
    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/mdtsel.sh"
    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/merge-sublm.pl"
    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/ngram-split.pl"
    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/rm-start-end.sh"
    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/sort-lm.pl"
    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/split-dict.pl"
    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/split-ngt.sh"
    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/wrapper"
    )
endif()

diff --git a/scripts/goograms2ngrams.pl b/scripts/goograms2ngrams.pl
new file mode 100755
index 0000000..9232b84
--- /dev/null
+++ b/scripts/goograms2ngrams.pl
@@ -0,0 +1,145 @@
+#! /usr/bin/perl
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+#******************************************************************************
+
+
+
+#transforms google n-grams into real n-grams so that counts are
+#consistent with respect to lower order n-grams
+
+use strict;
+use Getopt::Long "GetOptions";
+
+my $gzip=`which gzip 2> /dev/null`;
+my $gunzip=`which gunzip 2> /dev/null`;
+chomp($gzip);
+chomp($gunzip);
+
+my $cutoffword="<CUTOFF>"; #special word for Google 1T-ngram cut-offs
+my $blocksize=10000000; #this is the blocksize of produced n-grams
+my $from=2; #starting n-gram level
+
+my($help,$verbose,$maxsize,$googledir,$ngramdir)=();
+
+$help=1 unless
+&GetOptions('maxsize=i' => \$maxsize,
+ 'startfrom=i' => \$from,
+ 'googledir=s' => \$googledir,
+ 'ngramdir=s' => \$ngramdir,
+ 'h|help' => \$help,
+ 'verbose' => \$verbose);
+
+
+if ($help || !$maxsize || !$googledir || !$ngramdir ) {
+ my $cmnd = "goograms2ngrams.pl";
+ print "\n$cmnd - transforms google n-grams into real n-grams so that\n",
+ " counts are consistent with respect to lower order n-grams\n",
+ "\nUSAGE:\n",
+ " $cmnd [options]\n",
+ "\nOPTIONS:\n",
+ " --maxsize <int> maximum n-gram level of conversion\n",
+ " --startfrom <int> skip initial levels if already available (default 2)\n",
+ " --googledir <string> directory containing the google-grams dirs (1gms,2gms,...)\n",
+ " --ngramdir <string> directory where to write the n-grams \n",
+ " --verbose (optional) very talktive output\n",
+ " -h, --help (optional) print these instructions\n",
+ "\n";
+
+ exit(1);
+}
+
+warn "goograms2ngrams: maxsize $maxsize from $from googledir $googledir ngramdir $ngramdir \n"
+if $verbose;
+
+die "goograms2ngrams: value of --maxsize must be between 2 and 5\n" if $maxsize<2 || $maxsize>5;
+die "goograms2ngrams: cannot find --googledir $googledir \n" if ! -d $googledir;
+die "goograms2ngrams: cannot find --ngramdir $ngramdir \n" if ! -d $ngramdir;
+
+
+my ($n,$hgrams,$ggrams,$ngrams)=();
+my ($ggr,$hgr,$hgrcnt,$ggrcnt,$totggrcnt)=();
+my (@ggr, at hgr)=();
+
#process one n-gram level at a time: stream-merge the (n-1)-gram
#histories with the google n-grams and emit counts that are consistent
#with the lower-order level
foreach ($n=$from;$n<=$maxsize;$n++){

  my $counter=0;

  warn "Converting google-$n-grams into $n-gram\n";

  #histories: the 1-gram vocabulary for n==2, otherwise the (n-1)-grams
  #produced by the previous iteration of this loop
  $hgrams=($n==2?"${googledir}/1gms/vocab.gz":"${ngramdir}/".($n-1)."grams-*.gz");
  open(HGR,"$gunzip -c $hgrams |") || die "cannot open $hgrams\n";

  $ggrams="${googledir}/".($n)."gms/".($n)."gm-*";
  open(GGR,"$gunzip -c $ggrams |") || die "cannot open $ggrams\n";

  my $id = sprintf("%04d", 0);
  $ngrams="${ngramdir}/".($n)."grams-${id}.gz";

  next if -e $ngrams; #go to next step if file exists already;
  open(NGR,"|$gzip -c > $ngrams ") || die "cannot open $ngrams\n";

  #read the first google n-gram; the count is the last field
  #NOTE(review): chop assumes every input line ends with a newline — confirm
  chop($ggr=<GGR>); @ggr=split(/[ \t]/,$ggr);$ggrcnt=(pop @ggr);
  #warn "ggr: ",$ggrcnt," ",join(" ",@ggr[0..$n-1]),"\n";

  while ($hgr=<HGR>){

    $counter++;
    printf(STDERR ".") if ($counter % 1000000)==0; #progress indicator

    chop($hgr); @hgr=split(/[ \t]/,$hgr); $hgrcnt=(pop @hgr);
    #warn "hgr: ",$hgrcnt," ",join(" ",@hgr[0..$n-2]),"\n";

    #does the current history match the context of the current google n-gram?
    if (join(" ",@hgr[0..$n-2]) eq join(" ",@ggr[0..$n-2])){

      #copy all google n-grams sharing this context, accumulating their counts
      $totggrcnt=0;
      do{
        $totggrcnt+=$ggrcnt;
        print NGR join(" ",@ggr[0..$n-1])," ",$ggrcnt,"\n";
        chop($ggr=<GGR>);@ggr=split(/[ \t]/,$ggr);$ggrcnt=(pop @ggr);
      }until (join(" ",@hgr[0..$n-2]) ne join(" ",@ggr[0..$n-2]));

      #the history count exceeds the sum of its continuations: assign the
      #missing mass to the special <CUTOFF> word
      if ($hgrcnt > $totggrcnt){
        #warn "difference: $hgrcnt $totggrcnt =",$hgrcnt-$totggrcnt,"\n";
        #NOTE(review): @hgr[0..$n-1] indexes one element past the history
        #words (@hgr holds $n-1 words here), yielding a doubled separator;
        #downstream split(/[ \t]+/) collapses it, but confirm this is benign
        print NGR join(" ",@hgr[0..$n-1])," ",$cutoffword," ",$hgrcnt-$totggrcnt,"\n";
      }
    }
    else{
      #history with no surviving google n-grams: all its mass goes to <CUTOFF>
      #warn "fully pruned context: $hgr\n";
      print NGR join(" ",@hgr[0..$n-1])," ",$cutoffword," ",$hgrcnt,"\n";
    }

    #rotate to a new output block file every $blocksize histories
    if (($counter % $blocksize)==0){
      close(NGR);
      my $id = sprintf("%04d", int($counter / $blocksize));
      $ngrams="${ngramdir}/".($n)."grams-${id}.gz";
      open(NGR,"|$gzip -c > $ngrams ") || die "cannot open $ngrams\n";
    }

  }

  close(HGR);close(NGR);close(GGR);

}
+
+
+
+
+
diff --git a/scripts/lm-stat.pl b/scripts/lm-stat.pl
new file mode 100755
index 0000000..ac2558d
--- /dev/null
+++ b/scripts/lm-stat.pl
@@ -0,0 +1,63 @@
+#! /usr/bin/perl
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+#******************************************************************************
+#computes LM statistics over a string
+
use strict;
use Getopt::Long "GetOptions";
use File::Basename;

#command-line arguments: --lm <file>, --txt <file>, -h|--help
my ($help,$lm,$txt)=();
$help=1 unless
&GetOptions('lm=s' => \$lm,
            'txt=s' => \$txt,
            'h|help' => \$help,);

if ($help || !$lm || !$txt) {
  my $cmnd = basename($0);
  print "\n$cmnd - computes LM statistics over a string\n",
        "\nUSAGE:\n",
        "       $cmnd [options]\n",
        "\nOPTIONS:\n",
        "       --lm  <string>        language model file \n",
        "       --txt <string>        text file\n",
        "       -h, --help            (optional) print these instructions\n",
        "\n";

  exit(1);
}

#compile-lm is located through the IRSTLM environment variable
if (!$ENV{IRSTLM}){
  print "Set environment variable IRSTLM with path to the irstlm directory\n";
  exit(1);
}

my $clm="$ENV{IRSTLM}/bin/compile-lm";

#run compile-lm in evaluation mode and relay its output to STDOUT
#BUGFIX: the pipe open was previously unchecked, silently producing no
#output when compile-lm could not be started
open (OUT,"$clm $lm --eval $txt --debug 1|") || die "cannot run $clm: $!\n";
while (<OUT>){
  print;
}

#a non-zero child exit status is only visible at close time for pipes
close(OUT) || warn "compile-lm pipeline failed: $? $!\n";
diff --git a/scripts/mdtsel.sh b/scripts/mdtsel.sh
new file mode 100755
index 0000000..164d4a5
--- /dev/null
+++ b/scripts/mdtsel.sh
@@ -0,0 +1,219 @@
+#! /bin/bash
+
+#/******************************************************************************
+#IrstLM: IRST Language Model Toolkit
+#Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+#
+#This library is free software; you can redistribute it and/or
+#modify it under the terms of the GNU Lesser General Public
+#License as published by the Free Software Foundation; either
+#version 2.1 of the License, or (at your option) any later version.
+#
+#This library is distributed in the hope that it will be useful,
+#
+#
+#but WITHOUT ANY WARRANTY; without even the implied warranty of
+#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+#Lesser General Public License for more details.
+#
+#You should have received a copy of the GNU Lesser General Public
+#License along with this library; if not, write to the Free Software
+#Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+#
+#******************************************************************************/
+
+# mdtsel.sh
+# by M. Federico
+# Copyright Marcello Federico, Fondazione Bruno Kessler, 2012
+
+
+set -m #enable job control
+
+usage()
+{
+ cmnd=$(basename $0);
+ cat << EOF
+
+$cmnd - performs data selection assuming an indomain corpus and
+ a very large out of domain corpus.
+
+USAGE:
+ $cmnd [options]
+
+DESCRIPTION.
+ This command performs data selection assuming an indomain
+ corpus and a very large out of domain corpus.
+ Both corpora must contain one sentence in each line delimited
+ with <s> and </s>. The process produces a file of scores.
+
+
+OPTIONS:
+ -h Show this message
+ -v Verbose
+ -i In-domain corpus
+ -o Out-domain corpus
+ -s Scores output file
+ -x Out-domain lines are indexed
+ -w Temporary work directory (default /tmp)
+ -j Number of jobs (default 6)
+ -m Data selection model (1 or 2, default 2)
+ -f Word frequency threshold (default 2)
+ -n Ngram order to use (n>=1 default 3)
+ -d Vocabulary size upper bound (default 10000000)
+ -c Cross-validation parameter (cv>=1, default 1)
+
+EOF
+}
+
+
+if [ ! $IRSTLM ]; then
+ echo "Set IRSTLM environment variable with path to irstlm"
+ exit 2
+fi
+
+#paths to scripts and commands in irstlm
+scr=$IRSTLM/bin
+bin=$IRSTLM/bin
+
+#check irstlm installation
+if [ ! -e $bin/dtsel ]; then
+ echo "$IRSTLM does not contain a proper installation of IRSTLM"
+ exit 3
+fi
+
+#default parameters
+indomfile="";
+outdomfile="";
+scoresfile="";
+workdir=/tmp
+logfile="/dev/null"
+jobs=6
+model=2
+minfreq=2
+ngramorder=3
+cv=1
+dub=10000000
+
verbose="";
useindex=0;

# parse command-line options
# BUGFIX: the optstring was delimited with typographic quotes (“…”),
# which made the quote characters themselves part of the optstring;
# plain ASCII double quotes are required here.
# NOTE(review): "c:" is declared (cross-validation, see usage) but has
# no case arm, so -c is accepted and silently ignored — confirm.
while getopts "hvi:o:s:l:w:j:m:f:n:c:d:x:" OPTION
do
  case $OPTION in
    h)
      usage
      exit 1
      ;;
    v)
      verbose="--verbose";
      ;;
    i)
      indfile=$OPTARG
      ;;
    o)
      outdfile=$OPTARG
      ;;
    s)
      scorefile=$OPTARG
      ;;
    l)
      logfile=$OPTARG
      ;;
    w)
      workdir=$OPTARG
      ;;
    j)
      jobs=$OPTARG
      ;;
    m)
      model=$OPTARG
      ;;
    n)
      ngramorder=$OPTARG
      ;;
    f)
      minfreq=$OPTARG;
      ;;
    d)
      dub=$OPTARG;
      ;;
    x)
      useindex=$OPTARG;
      ;;
    ?)
      usage
      exit 1
      ;;
  esac
done
+
+
if [ $verbose ];then
  echo indfile= $indfile outdfile= $outdfile scorefile= $scorefile useindex= $useindex
  echo logfile= $logfile workdir= $workdir
  echo jobs= $jobs model= $model ngramorder= $ngramorder minfreq= $minfreq dub=$dub
fi

# -i, -o and -s are all mandatory
if [ ! $indfile -o ! $outdfile -o ! $scorefile ]; then
  usage
  exit 5
fi

# refuse to overwrite an existing score file
# BUGFIX: the message interpolated the undefined variable $outfile,
# printing an empty file name; use $scorefile, the variable being tested.
if [ -e $scorefile ]; then
  echo "Output score file $scorefile already exists! either remove or rename it."
  exit 6
fi

# refuse to append to an existing log file (except the pseudo-devices)
if [ $logfile != "/dev/null" -a $logfile != "/dev/stdout" -a -e $logfile ]; then
  echo "Logfile $logfile already exists! either remove or rename it."
  exit 7
fi
+
+workdir_created=0
+
+if [ ! -d $workdir ]; then
+ echo "Temporary work directory $workdir does not exist";
+ echo "creating $workdir";
+ mkdir -p $workdir;
+ workdir_created=1;
+fi
+
+
#get process id to name process specific temporary files
pid=$$

#compute size of out domain corpus and block size of split
#(the +1000 margin avoids producing a tiny trailing block)
lines=`wc -l < $outdfile`
size=`echo "( $lines + 1000 )" / $jobs | bc`

#split the out-of-domain corpus into one block per job
split -l $size $outdfile $workdir/dtsel${pid}-files-

#score every block in parallel; NaN scores are rewritten to a large
#penalty (1000) so that "sort -g" ranks them last
#BUGFIX: the perl one-liner was '/^nan /1000 /g;' (missing the leading
#"s" of the substitution operator) — a perl compile error, so the filter
#emitted nothing and every score file was truncated to empty.
for file in $workdir/dtsel${pid}-files-*
do
echo $file
( \
$bin/dtsel -x=$useindex -i=$indfile -o=$file -s=${file}.scores -n=$ngramorder -dub=$dub -f=$minfreq -m=$model ; \
perl -pe 's/^nan /1000 /g;' ${file}.scores | sort -g > ${file}.scores.tmp ; \
mv ${file}.scores.tmp ${file}.scores \
) >>$logfile 2>&1 &

done

# Wait for all parallel jobs to finish
while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done

#merge the per-block sorted score files into the final score file
sort -g -m $workdir/dtsel${pid}-files-*.scores > $scorefile
rm $workdir/dtsel${pid}-files-*
if [ $workdir_created == 1 ]
then
rmdir $workdir
fi
diff --git a/scripts/merge-sublm.pl b/scripts/merge-sublm.pl
new file mode 100755
index 0000000..730aa28
--- /dev/null
+++ b/scripts/merge-sublm.pl
@@ -0,0 +1,208 @@
+#! /usr/bin/perl
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+#******************************************************************************
+#merge prefix LMs into one single file
+
+use strict;
+use Getopt::Long "GetOptions";
+use File::Basename;
+
+my ($help,$lm,$size,$sublm,$backoff)=();
+$help=0;
+$backoff=0;
+
+&GetOptions('size=i' => \$size,
+'lm=s' => \$lm,
+'sublm=s' => \$sublm,
+'backoff' => \$backoff,
+'h|help' => \$help);
+
+if ($help || !$size || !$lm || !$sublm) {
+ my $cmnd = basename($0);
+ print "\n$cmnd - merge single LMs\n",
+ "\nUSAGE:\n",
+ " $cmnd [options]\n",
+ "\nOPTIONS:\n",
+ " --size <int> maximum n-gram size for the language model\n",
+ " --sublm <string> path identifying all input prefix sub LMs\n",
+ " --lm <string> name of the output LM file (will be gzipped)\n",
+ " --backoff (optional) create a backoff LM, output is directly in ARPA format (default is false, i.e. iARPA format) \n",
+ " -h, --help (optional) print these instructions\n",
+ "\n";
+
+ exit(1);
+}
+
+
+my $gzip=`which gzip 2> /dev/null`;
+my $gunzip=`which gunzip 2> /dev/null`;
+chomp($gzip);
+chomp($gunzip);
+
+warn "merge-sublm.pl --size $size --sublm $sublm --lm $lm --backoff $backoff\n";
+
+warn "Compute total sizes of n-grams\n";
+my @size=(); #number of n-grams for each level
+my $tot1gr=0; #total frequency of 1-grams
+my $unk=0; #frequency of <unk>
+my $pr; #probability of 1-grams
+my (@files,$files); #sublm files for a given n-gram size
+
+for (my $n=1;$n<=$size;$n++){
+
+ @files=map { glob($_) } "${sublm}*.${n}gr*";
+ $files=join(" ", at files);
+ $files || die "cannot find sublm files\n";
+ warn "join files $files\n";
+
+ if ($n==1){
+ open(INP,"$gunzip -c $files|") || die "cannot open $files\n";
+ while(my $line = <INP>){
+ $size[$n]++;
+ chomp($line);
+ warn "there is an empty line in any of these files ($files); this should not happen\n" if $line =~ /^$/;
+ my @words = split(/[ \t]+/,$line);
+ #cut down counts for sentence initial
+ $words[0]=1 if $words[1]=~/<s>/;
+ #there could be more independent <unk> words
+ #generated by ngt with -sd option
+ $size[$n]-- if $unk && $words[1] eq "<unk>";
+ $unk+=$words[0] if $words[1]=~/<unk>/i;
+ $tot1gr+=$words[0];
+ }
+ close(INP);
+ if ($unk==0){
+ warn "implicitely add <unk> word to counters\n";
+ $tot1gr+=$size[$n]; #equivalent to WB smoothing
+ $size[$n]++;
+ }
+ }else{
+ for (my $j=0;$j<scalar(@files);$j++){
+ safesystem("$gunzip -c $files[$j] | grep -v '10000.000' | wc -l > wc$$") or die;
+ open(INP,"wc$$") || die "cannot open wc$$\n";
+ my $wc = <INP>;
+ chomp($wc);
+ $size[$n] += $wc;
+ close(INP);
+ unlink("wc$$");
+ }
+ }
+ warn "n:$n size:$size[$n] unk:$unk\n";
+}
+
+warn "Merge all sub LMs\n";
+
+$lm.=".gz" if $lm!~/.gz$/;
+open(LM,"|$gzip -c > $lm") || die "Cannot open $lm\n";
+
+warn "Write LM Header\n";
+if ($backoff){
+ printf LM "ARPA\n\n";
+} else{
+ printf LM "iARPA\n\n";
+}
+
+printf LM "\\data\\\n";
+for (my $n=1;$n<=$size;$n++){
+ printf LM "ngram $n=\t$size[$n]\n";
+}
+printf LM "\n";
+close(LM);
+
+warn "Writing LM Tables\n";
+for (my $n=1;$n<=$size;$n++){
+
+ warn "Level $n\n";
+
+ @files=map { glob($_) } "${sublm}*.${n}gr*";
+ $files=join(" ", at files);
+ warn "input from: $files\n";
+ if ($n==1){
+ open(INP,"$gunzip -c $files|") || die "cannot open $files\n";
+ open(LM,"|$gzip -c >> $lm");
+ printf LM "\\$n-grams:\n";
+ while(my $line = <INP>){
+ chomp($line);
+ warn "there is an empty line in any of these files ($files); this should not happen\n" if $line =~ /^$/;
+ #lowercase some expressions of google n-grams
+ $line=~s/<S>/<s>/g;
+ $line=~s/<\/S>/<\/s>/g;
+ $line=~s/<UNK>/<unk>/g;
+
+ my @words = split(/[ \t]+/,$line);
+
+ #always print unk at the end
+ next if $words[1]=~/<unk>/i;
+
+ #cut down counts for sentence initial
+ $words[0]=1 if $words[1]=~/<s>/i;
+
+ #apply witten-bell smoothing on 1-grams
+ $pr=(log($words[0]+1)-log($tot1gr+$size[1]))/log(10.0);
+ shift @words;
+ printf LM "%f\t%s\t%f\n",$pr,$words[0],$words[1];
+ }
+ close(INP);
+
+ #print final <unk>
+ #witten-bell smoothing of <unk> probability
+ if ($unk){
+ $pr=(log($unk+1)-log($tot1gr+$size[1]))/log(10.0);
+ }else{
+ $pr=(log($size[1]-1+1)-log($tot1gr+$size[1]))/log(10.0);
+ }
+
+ printf LM "%f <unk>\n",$pr;
+ close(LM);
+ }else{
+ open(LM,"|$gzip -c >> $lm");
+ printf LM "\\$n-grams:\n";
+ close(LM);
+ for (my $j=0;$j<scalar(@files);$j++){
+ safesystem("$gunzip -c $files[$j] | grep -v '10000.000' | gzip -c >> $lm") or die;
+ }
+ }
+
+}
+
+open(LM,"|$gzip -c >> $lm") || die "Cannot open $lm\n";
+printf LM "\\end\\\n";
+close(LM);
+
#######################################
# Run a shell command via system(), echoing it to STDERR first.
# Exits the script if the command could not be started or was killed
# by a signal; otherwise returns true iff the command exited with 0
# (a non-zero exit code is reported on STDERR).
#######################################
sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  my $status = $?;
  if ($status == -1) {
    print STDERR "Failed to execute: @_\n $!\n";
    exit(1);
  }
  if ($status & 127) {
    printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
      ($status & 127), ($status & 128) ? 'with' : 'without';
    exit(1);
  }
  my $exitcode = $status >> 8;
  print STDERR "Exit code: $exitcode\n" if $exitcode;
  return ! $exitcode;
}
+
diff --git a/scripts/ngram-split.pl b/scripts/ngram-split.pl
new file mode 100755
index 0000000..27700d3
--- /dev/null
+++ b/scripts/ngram-split.pl
@@ -0,0 +1,84 @@
+#! /usr/bin/perl
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+#******************************************************************************
+
+#re-segment google n-gram count files into files so that
+#n-grams starting with a given word (prefix) are all
+#contained in one file.
+
+use Getopt::Long "GetOptions";
+use File::Basename;
+
+my ($help,$lm,$size,$sublm)=();
+$help=1 unless
+&GetOptions('h|help' => \$help);
+
+if ($help) {
+ my $cmnd = basename($0);
+ print "\n$cmnd - re-segment google n-gram count files so that n-grams\n",
+ " starting with a given word (prefix) are all contained in one file\n",
+ "\nUSAGE:\n",
+ " $cmnd [options] [<output_prefix>]\n",
+ "\nDESCRIPTION:\n",
+ " Input is expected on STDIN.\n",
+ " <output_prefix> prefix of files to be created\n",
+ "\nOPTIONS:\n",
+ " -h, --help (optional) print these instructions\n",
+ "\n";
+
+ exit(1);
+}
+
+
$max_pref=10000;   #number of prefixes to be put in one file
$max_ngram=5000000;#number of n-grams to be put in one file
$file_cnt=0;       #counter of created files
$pref_cnt=0;       #counter of prefixes in the current file
$ngram_cnt=0;      #counter of n-grams in the current file

$path=($ARGV[0]?$ARGV[0]:"goong"); #prefix of files to be created

$gzip=`which gzip`;
chomp($gzip);

$pwrd="";
open(OUT,sprintf("|$gzip -c > %s.%04d.gz",$path,++$file_cnt));

#stream n-grams from STDIN; all n-grams sharing a first word (prefix)
#are kept in the same output file, rotating files when limits are hit
while ($ng=<STDIN>){
  ($wrd)=$ng=~/^([^ ]+)/;
  #warn "$wrd\n";
  if ($pwrd ne $wrd){  #first n-gram of a new prefix word
    $pwrd=$wrd;
    #BUGFIX: the original tested the never-assigned variable $file_pref,
    #so the per-file prefix limit was never enforced; test $pref_cnt.
    if ($pref_cnt>$max_pref || $ngram_cnt>$max_ngram){
      warn "it's time to change file\n";
      close(OUT);
      open(OUT,sprintf("|$gzip -c > %s.%04d.gz",$path,++$file_cnt));
      $pref_cnt=$ngram_cnt=0;
    }
    else{
      $pref_cnt++;
    }
  }
  print OUT $ng;
  $ngram_cnt++;
}
close(OUT);
+
diff --git a/scripts/other/beautify.perl b/scripts/other/beautify.perl
new file mode 100755
index 0000000..eafaeeb
--- /dev/null
+++ b/scripts/other/beautify.perl
@@ -0,0 +1,22 @@
#!/usr/bin/perl

# Reformat the project sources: run the bundled astyle binary
# (astyle_<OS>, shipped next to this script) over every subdirectory
# of the current directory, rewriting *.h and *.cpp files in K&R style.

my $os=`uname | tr -d '\012'`;       # OS name with trailing newline stripped
my $dir=`dirname $0 | tr -d '\012'`; # directory containing this script
my $astyle="$dir/astyle_$os";        # per-OS astyle executable

opendir(DIR,".") or die "Can't open the current directory: $!\n";

# read file/directory names in that directory into @names
@names = readdir(DIR) or die "Unable to read current dir:$!\n";

foreach $name (@names) {
  next if ($name eq "."); # skip the current directory entry
  next if ($name eq ".."); # skip the parent directory entry

  if (-d $name){ # is this a directory?
    # NOTE(review): this also descends into non-source folders such as
    # .git or build directories — confirm that is intended
    `$astyle --style="k&r" -s2 --recursive -v "$name/*.h" "$name/*.cpp"`;
    next; # can skip to the next name in the for loop
  }
}

closedir(DIR);
diff --git a/scripts/plsa.sh b/scripts/plsa.sh
new file mode 100755
index 0000000..d59e3a5
--- /dev/null
+++ b/scripts/plsa.sh
@@ -0,0 +1,346 @@
+#! /bin/bash
+
+set -m # Enable Job Control
+
+
+
+function usage()
+{
+cmnd=$(basename $0);
+cat<<EOF
+
+$cmnd - train and/or test a probabilistic latent semantic model
+
+USAGE:
+$cmnd [options]
+
+TRAINING OPTIONS:
+
+-c file Collection of training documents e.g. 'gunzip -c docs.gz'
+-d file Dictionary file (default dictionary)
+-f Force to use existing dictionary
+-m fle Output model file e.g. model
+-n count Number of topics (default 100)
+-i count Number of training iterations (default 20)
+-t folder Temporary working directory (default ./stat_PID)
+-p count Prune words with counts < arg (default 2)
+-k count Number of processes (default 5)
+
+-r file Model output file in readable format
+-s count Put top arg frequent words in special topic 0
+-l file Log file (optional)
+-v Verbose
+-h Show this message
+
+
+TESTING OPTIONS
+
+-c file Testing documents e.g. test
+-d file Dictionary file (default dictionary)
+-m file Model file
+-n number Number of topics (default 100)
+-u file Output document unigram distribution
+-o file Output document topic distributions
+-i counts Number of training iterations (default 20)
+-t folder Temporary working directory (default ./stat_PID)
+-l file Log file (optional)
+-k count Number of processes (default 5)
+-v Verbose
+-h Show this message
+
+
+EOF
+}
+
+
+
+if [ ! $IRSTLM ]; then
+echo "Set IRSTLM environment variable with path to irstlm"
+exit 2
+fi
+
+#paths to scripts and commands in irstlm
+scr=$IRSTLM/bin
+bin=$IRSTLM/bin
+gzip=`which gzip 2> /dev/null`;
+gunzip=`which gunzip 2> /dev/null`;
+
+#default parameters
+tmpdir=stat_$$
+data=""
+topics=100
+splits=5
+iter=20
+prunefreq=2
+spectopics=0
+logfile="/dev/null"
+verbose=""
+unigram=""
+outtopic=""
+dict="dictionary"
+forcedict=""
+model=""
+txtfile="/dev/null"
+
+while getopts "hvfc:m:r:k:i:n:t:d:p:s:l:u:o:" OPTION
+do
+case $OPTION in
+h)
+usage
+exit 0
+;;
+v)
+verbose="--verbose";
+;;
+c)
+data=$OPTARG
+;;
+m)
+model=$OPTARG
+;;
+r)
+txtfile=$OPTARG
+;;
+k)
+splits=$OPTARG
+;;
+i)
+iter=$OPTARG
+;;
+t)
+tmpdir=$OPTARG
+;;
+d)
+dict=$OPTARG
+;;
+f)
+forcedict="TRUE"
+;;
+p)
+prunefreq=$OPTARG
+;;
+s)
+spectopics=$OPTARG
+;;
+n)
+topics=$OPTARG
+;;
+l)
+logfile=$OPTARG
+;;
+u)
+unigram=$OPTARG
+;;
+o)
+outtopic=$OPTARG
+;;
+
+?)
+usage
+exit 1
+;;
+esac
+done
+
+if [ $verbose ]; then
+echo data=$data model=$model topics=$topics iter=$iter dict=$dict
+logfile="/dev/stdout"
+fi
+
+if [ "$unigram" == "" -a "$outtopic" == "" ]; then
+
+#training branch
+
+if [ ! "$data" -o ! "$model" ]; then
+usage
+exit 1
+fi
+
+if [ -e $logfile -a $logfile != "/dev/null" -a $logfile != "/dev/stdout" ]; then
+echo "Logfile $logfile already exists! either remove or rename it."
+exit 1
+fi
+
+if [ -e $model ]; then
+echo "Output file $model already exists! either remove or rename it." >> $logfile 2>&1
+exit 1
+fi
+
+if [ -e $txtfile -a $txtfile != "/dev/null" ]; then
+echo "Output file $txtfile already exists! either remove or rename it." >> $logfile 2>&1
+exit 1
+fi
+
+
+if [ -e $logfile -a $logfile != "/dev/null" -a $logfile != "/dev/stdout" ]; then
+echo "Logfile $logfile already exists! either remove or rename it." >> $logfile 2>&1
+exit 1
+fi
+
+#if [ ! -e "$data" ]; then
+#echo "Cannot find data $data." >> $logfile 2>&1
+#exit 1;
+#fi
+
+if [ ! -e $dict ]; then
+echo extract dictionary >> $logfile
+$bin/dict -i="$data" -o=$dict -PruneFreq=$prunefreq -f=y >> $logfile 2>&1
+if [ `head -n 1 $dict| cut -d " " -f 3` -lt 10 ]; then
+echo "Dictionary contains errors"
+exit 2;
+fi
+else
+echo "Warning: dictionary file already exists." >> $logfile 2>&1
+if [ $forcedict ]; then
+echo "Warning: authorization to use it." >> $logfile 2>&1
+else
+echo "No authorization to use it (see option -f)." >> $logfile 2>&1
+exit 1
+fi
+fi
+
+
+
+#check tmpdir
+tmpdir_created=0;
+if [ ! -d $tmpdir ]; then
+echo "Creating temporary working directory $tmpdir" >> $logfile 2>&1
+mkdir -p $tmpdir;
+tmpdir_created=1;
+else
+echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1
+rm $tmpdir/* 2> /dev/null
+if [ $? != 0 ]; then
+echo "Warning: some temporary files could not be removed" >> $logfile 2>&1
+fi
+fi
+
+#####
+echo split documents >> $logfile 2>&1
+$bin/plsa -c="$data" -d=$dict -b=$tmpdir/data -sd=$splits >> $logfile 2>&1
+
+machine=`uname -s`
+if [ $machine == "Darwin" ] ; then
+splitlist=`jot - 1 $splits`
+iterlist=`jot - 1 $iter`
+else
+splitlist=`seq 1 1 $splits`
+iterlist=`seq 1 1 $iter`
+fi
+
+#rm $tmpdir/Tlist
+for sp in $splitlist ; do echo $tmpdir/data.T.$sp >> $tmpdir/Tlist 2>&1; done
+#rm $model
+for it in $iterlist ; do
+echo "ITERATION $it" >> $logfile 2>&1
+for sp in $splitlist ; do
+(date; echo it $it split $sp )>> $logfile 2>&1
+$bin/plsa -c=$tmpdir/data.$sp -d=$dict -st=$spectopics -hf=$tmpdir/data.H.$sp -tf=$tmpdir/data.T.$sp -wf=$model -m=$model -t=$topics -it=1 -tit=$it >> $logfile 2>&1 &
+done
+while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done
+
+(date; echo recombination ) >> $logfile 2>&1
+
+$bin/plsa -ct=$tmpdir/Tlist -c="$data" -d=$dict -hf=$tmpdir/data.H -m=$model -t=$topics -it=1 -txt=$txtfile >> $logfile 2>&1
+
+done
+(date; echo End of training) >> $logfile 2>&1
+
+echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1
+rm $tmpdir/* 2> /dev/null
+
+if [ $tmpdir_created -eq 1 ]; then
+echo "Removing temporary directory $tmpdir" >> $logfile 2>&1
+rmdir $tmpdir 2> /dev/null
+if [ $? != 0 ]; then
+echo "Warning: the temporary directory could not be removed." >> $logfile 2>&1
+fi
+fi
+exit 0
+
+
+
+#testing branch
+else
+
+if [ ! $model -o ! -e $model ]; then
+echo "Need to specify existing model" >> $logfile 2>&1
+exit 1;
+fi
+
+
+if [ ! $dict -o ! -e $dict ]; then
+echo "Need to specify dictionary file of the model" >> $logfile 2>&1
+exit 1;
+fi
+
+if [ $unigram ]; then
+$bin/plsa -inf="$data" -d=$dict -m=$model -hf=hfff.out$$ -t=$topics -it=$iter -wof=$unigram >> $logfile 2>&1
+rm hfff.out$$
+
+else #topic distribution
+
+#check tmpdir
+tmpdir_created=0;
+if [ ! -d $tmpdir ]; then
+echo "Creating temporary working directory $tmpdir" >> $logfile 2>&1
+mkdir -p $tmpdir;
+tmpdir_created=1;
+else
+echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1
+rm $tmpdir/* 2> /dev/null
+if [ $? != 0 ]; then
+echo "Warning: some temporary files could not be removed" >> $logfile 2>&1
+fi
+fi
+
+#####
+echo split documents >> $logfile 2>&1
+$bin/plsa -c="$data" -d=$dict -b=$tmpdir/data -sd=$splits >> $logfile 2>&1
+
+machine=`uname -s`
+if [ $machine == "Darwin" ] ; then
+splitlist=`jot - 1 $splits`
+else
+splitlist=`seq 1 1 $splits`
+fi
+
+#rm $tmpdir/Tlist
+for sp in $splitlist ; do echo $tmpdir/data.T.$sp >> $tmpdir/Tlist 2>&1; done
+#rm $model
+
+for sp in $splitlist ; do
+(date; echo split $sp )>> $logfile 2>&1
+
+$bin/plsa -inf=$tmpdir/data.$sp -d=$dict -hf=$tmpdir/data.H.$sp -m=$model -t=$topics -it=$iter -tof=$tmpdir/topic.$sp >> $logfile 2>&1 &
+
+done
+while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done
+
(date; echo recombination ) >> $logfile 2>&1

# concatenate the per-split topic distributions in split order
# BUGFIX(review): "echo > $outtopic" seeded the output with a spurious
# empty first line; truncate the file instead — confirm no downstream
# consumer relied on that blank line.
: > $outtopic
for sp in $splitlist ; do #makes sure that 1 < 2 < ... < 11 ...
cat $tmpdir/topic.$sp >> $outtopic
done
+
+(date; echo End of training) >> $logfile 2>&1
+
+echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1
+rm $tmpdir/* 2> /dev/null
+
+if [ $tmpdir_created -eq 1 ]; then
+echo "Removing temporary directory $tmpdir" >> $logfile 2>&1
+rmdir $tmpdir 2> /dev/null
+if [ $? != 0 ]; then
+echo "Warning: the temporary directory could not be removed." >> $logfile 2>&1
+fi
+fi
+
+fi
+fi
+
+
+exit 0
+
+
diff --git a/scripts/qplsa.sh b/scripts/qplsa.sh
new file mode 100755
index 0000000..fad8765
--- /dev/null
+++ b/scripts/qplsa.sh
@@ -0,0 +1,183 @@
+#! /bin/bash
+
+sDir=$(cd $(dirname $0) ; /bin/pwd)
+
+#Task data
+bin=/hltsrv0/federico/plsa/bin
+wdir=/panfs/panfem/test-hlt/federico/plsa/CC
+#/hltsrv0/federico/plsa/ted
+ldir=/scratch/federico
+
+data=doc_en.00.bin
+dict=ted.dict
+
+#ted-en
+topics=150
+iter=2
+prunefreq=5
+spectopics=500
+Tlist=$wdir/tlist
+splits=2
+model=model.$splits
+txtfile=Wfile.$splits
+
+#parameters
+numSlots=1-3
+ram=10G
+qL=bld.q,bld-ib.q
+
+#Preparation phase
+jName=PLSA.PRE
+
+#preparation ends when tlist is prepared
+rm $Tlist
+jName=PLSA.TRAIN
+
+range=`yes | head -n $splits | awk '{printf("%02d ",a);a++}'`
+iter=`seq 1 1 $iter| tr "\012" " "`
+
+qsub -cwd -N $jName -j y -q $qL -l mf=$ram -t $numSlots -o $wdir/log -S /bin/bash <<EOF
+
+
+me=\`echo \$SGE_TASK_ID | awk '{printf("%02d",\$1-1)}'\`
+lastid=\`echo \$SGE_TASK_LAST | awk '{printf("%02d",\$1-1)}'\`
+
+(echo start ; date) > $wdir/monitor.\$SGE_TASK_ID
+echo
+
+if [[ ! -d $ldir ]]; then mkdir $ldir; fi
+
+#################################
+if [ \$me -eq \$lastid ]
+then
+(echo master starts ; uname -n ; date) > $wdir/monitor.\$SGE_TASK_ID
+
+#prepare Tlist file
+rm $Tlist
+for sp in $range; do
+echo $wdir/$data.T.\$sp >> $Tlist
+done
+
+#tell slaves to copy and binarize data
+
+for sp in $range; do
+
+(echo cp $wdir/$data.\$sp.gz $wdir/$dict $ldir \; ;\
+echo $bin/plsa -c=\"gunzip -c $ldir/$data.\$sp.gz\" -d=$ldir/$dict -b=$ldir/$data.\$sp \; ;\
+echo rm $ldir/$data.\$sp.gz ) > $wdir/taskfor_\$sp
+touch $wdir/doit_\$sp
+done
+
+(echo master prepare ; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+#wait that all have finished
+while ls $wdir/doit_* &> /dev/null; do sleep 1; done
+
+(echo master start iteration ; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+for it in $iter; do
+for sp in $range; do
+
+(echo master iteration \$it split \$sp; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+echo tell slave to run an iteration
+(echo if [[ -e $wdir/$model ]] \; then cp $wdir/$model $ldir/$model \; fi ;
+ echo $bin/plsa -c=$ldir/$data.\$sp -d=$ldir/$dict -st=$spectopics -hf=$ldir/$data.H.\$sp -tf=$ldir/$data.T.\$sp -wf=$ldir/$model -m=$ldir/$model -t=$topics -it=1 -tit=\$it ;\
+echo cp $ldir/$data.T.\$sp $wdir ) > $wdir/taskfor_\$sp
+touch $wdir/taskfor_\$sp
+touch $wdir/doit_\$sp
+done
+
+(echo master start waiting \$it ; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+
+#echo wait that all have finished
+while ls $wdir/doit_* &> /dev/null; do
+(echo master waiting \$it ; date) >> $wdir/monitor.\$SGE_TASK_ID
+ls $wdir/doit_*
+sleep 1;
+done
+
+(echo master start recombination \$it ; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+echo recombine
+$bin/plsa -ct=$Tlist -c=dummy -d=$wdir/$dict -m=$wdir/$model -t=$topics -it=1 -txt=$wdir/$txtfile
+
+done
+
+
+(echo master tells slaves to remove data; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+echo tell slaves to remove their local data
+for sp in $range; do
+echo rm $ldir/$dict $ldir/$data.\$sp $ldir/$model > $wdir/taskfor_\$sp
+touch $wdir/taskfor_\$sp
+touch $wdir/doit_\$sp
+done
+echo wait that all have finished
+
+(echo master waits for slaves; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+while ls $wdir/doit_* &> /dev/null; do sleep 1; done
+
+echo tell slaves to exit
+
+(echo master tells slaves to exit; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+for sp in $range; do
+echo exit > $wdir/taskfor_\$sp
+touch $wdir/taskfor_\$sp
+touch $wdir/doit_\$sp
+done
+
+(echo master waits for slaves; date) >> $wdir/monitor.\$SGE_TASK_ID
+while ls $wdir/doit_* &> /dev/null; do sleep 1; done
+
+(echo master ends; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+rm $wdir/$data.H* $wdir/$model $wdir/$data.T* $wdir/taskfor_*
+
+#############################
+else
+
+(echo slave starts ; uname -n ; date) > $wdir/monitor.\$SGE_TASK_ID
+
+while :
+do
+
+(echo slave \$me iteration \$it waits for job; echo \$cmd; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+touch $wdir
+
+if [[ -e $wdir/doit_\$me ]]; then
+
+cmd=\`cat $wdir/taskfor_\$me\`
+
+(echo slave \$me starts executing; echo \$cmd; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+if [[ \$cmd == *exit* ]]; then
+ #rm before cmd execution
+ rm $wdir/doit_\$me >& /dev/null
+ exit 0
+else
+ /bin/sh $wdir/taskfor_\$me
+ #rm after cmd execution
+ rm $wdir/doit_\$me >& /dev/null
+fi
+
+(echo slave ended executing; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+fi
+
+sleep 1
+
+done
+
+fi
+
+(echo end;uname -a; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+exit 0
+
+EOF
+
diff --git a/scripts/rm-start-end.sh b/scripts/rm-start-end.sh
new file mode 100644
index 0000000..015e2ac
--- /dev/null
+++ b/scripts/rm-start-end.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+
+# Removes IRSTLM sentence boundary symbols (<s>, </s>) from the text
+# stream on stdin, trims leading/trailing blanks, and drops lines that
+# become empty. The cleaned text is written to stdout.
+
+function usage()
+{
+ cmnd=$(basename $0);
+ cat<<EOF
+
+$cmnd - removes sentence start/end symbols
+
+USAGE:
+ $cmnd [options]
+
+OPTIONS:
+ -h Show this message
+
+EOF
+}
+
+# Parse options
+while getopts h OPT; do
+ case "$OPT" in
+ h)
+ usage >&2;
+ exit 0;
+ ;;
+ * ) usage >&2;
+ exit 1;
+ ;;
+ esac
+done
+
+# One sed process with multiple -e expressions replaces the original
+# five-stage pipeline; the expressions are applied in order to each line,
+# so the behavior is identical while spawning a single process.
+sed -e 's/<s>//g' -e 's/<\/s>//g' -e 's/^ *//' -e 's/ *$//' -e '/^$/d'
+
diff --git a/scripts/sort-lm.pl b/scripts/sort-lm.pl
new file mode 100755
index 0000000..63d7c23
--- /dev/null
+++ b/scripts/sort-lm.pl
@@ -0,0 +1,124 @@
+#! /usr/bin/perl
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2010 Marcello Federico, FBK-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+#******************************************************************************
+#Sorts n-grams of an ARPA file according to lexicographic order.
+#Inverted sorting option is propedeutic to building a binary
+#lmtable with compile-lm with n-grams stored in reverted order.
+
+use strict;
+use Getopt::Long "GetOptions";
+
+my ($help,$ilm,$olm,$inv,$tmpdir)=();
+$help=0;
+
+$ilm="/dev/stdin";
+$olm="/dev/stdout";
+# bugfix: this was "my $tmpdir=...", which declared a second $tmpdir
+# masking the one bound to the --tmpdir option above.
+$tmpdir="$ENV{TMP}";
+
+&GetOptions('ilm=s' => \$ilm,
+ 'olm=s' => \$olm,
+ 'tmpdir=s' => \$tmpdir,
+ 'inv' => \$inv,
+ 'help' => \$help,);
+
+if ($help || !$ilm || !$olm){
+ print "sort-lm.pl [--ilm <fname>] [--olm <fname>] [--inv]\n",
+ "-ilm <fname> input ARPA LM filename (default /dev/stdin)\n",
+ "-olm <fname> output ARPA LM filename (default /dev/stdout)\n",
+ "-tmpdir temporary directory for sorting (default is the environment variable TMP)\n",
+ "-inv inverted n-gram sort for compile-lm\n",
+ "-help print these instructions\n";
+ exit(1);
+}
+
+warn "temporary directory for sorting is $tmpdir\n";
+
+my $order=0;
+my $sortcmd="";
+
+# Byte-wise, locale-independent collation for reproducible sort output.
+$ENV{'LC_ALL'}='C';
+
+open (INP, "< $ilm") || die "cannot open input LM file: $ilm\n";
+open (OUT, "> $olm") || die "cannot open output LM file: $olm\n";
+
+
+warn "reading from standard input\n" if $ilm eq "/dev/stdin";
+warn "writing to standard output\n" if $olm eq "/dev/stdout";
+
+$_=<INP>;
+
+#sanity check
+die "Error: input cannot be an intermediate iARPA file. First convert it to ARPA format with compile-lm.\n" if
+$_=~/^iARPA/;
+
+my $isQuantized=0;
+$isQuantized=1 if $_=~/^qARPA/;
+
+# Copy the file through, re-sorting each n-gram section; header and
+# non-n-gram lines are passed through unchanged.
+while(!/^\\end\\/){
+
+
+ if (($order)=$_=~/^\\(\d+)-grams:/){
+ print(OUT $_);$_=<INP>;
+ if ($isQuantized){
+ print(OUT $_); chop $_;#print centers
+ my $centers=$_; $_=<INP>;
+ warn "skip $centers centers\n";
+ for (my $c=1;$c<=$centers;$c++){
+ print(OUT $_);$_=<INP>;
+ }
+
+ }
+ #sort command
+ #$sortcmd="sort -b"; #does not seem to work properly
+ $sortcmd="sort --temporary-directory=$tmpdir";
+ # Sort keys start at field 2 because field 1 holds the probability;
+ # --inv sorts on the words in reverse position order.
+ if ($inv){
+ warn "inverted sorting of $order-grams\n";
+ for (my $n=$order;$n>0;$n--){
+ $sortcmd.=" -k ".($n+1).",".($n+1);
+ }
+ }else{
+ warn "direct sorting of $order-grams\n";
+ for (my $n=1;$n<=$order;$n++){
+ $sortcmd.=" -k ".($n+1).",".($n+1);
+ }
+ }
+
+ close(OUT);open (OUT,"|$sortcmd >> $olm");
+
+
+ do{
+ print(OUT $_);$_=<INP>;
+
+ }until (/^\\/ || /^\n/);
+
+ close(OUT); open(OUT, ">> $olm");
+
+ }
+ else{
+ print(OUT $_);$_=<INP>;
+ }
+
+}
+
+print(OUT $_);
+
+close(INP);
+close(OUT);
diff --git a/scripts/split-dict.pl b/scripts/split-dict.pl
new file mode 100755
index 0000000..942d66e
--- /dev/null
+++ b/scripts/split-dict.pl
@@ -0,0 +1,157 @@
+#! /usr/bin/perl
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+#******************************************************************************
+
+#usage:
+#split-dict.pl <input> <output> <parts>
+#It splits the <input> dictionary into <parts> dictionaries
+#(named <output000>, ... <output999>)
+#splitting is balanced wrt to frequency of the <input> dictionary
+#if not available a frequency of 1 is considered
+
+use strict;
+use Getopt::Long "GetOptions";
+use File::Basename;
+
+my ($help,$input,$output,$parts)=();
+
+$help=1 unless
+&GetOptions('input=s' => \$input,
+ 'output=s' => \$output,
+ 'parts=i' => \$parts,
+ 'h|help' => \$help,);
+
+if ($help || !$input || !$output || !$parts) {
+ my $cmnd = basename($0);
+ print "\n$cmnd - splits a dictionary into frequency-balanced partitions\n",
+ "\nUSAGE:\n",
+ " $cmnd [options]\n",
+ "\nDESCRIPTION:\n",
+ " $cmnd splits a dictionary into frequency-balanced partitions.\n",
+ " The dictionary must be generated with IRSTLM command dict.\n",
+ " If dictionary does not contain frequencies, then a frequency 1 is\n",
+ " assumed for all words.\n",
+ "\nOPTIONS:\n",
+ " --input <string> input dictionary with frequencies\n",
+ " --output <string> prefix of output dictionaries\n",
+ " --parts <int> number of partitions to create\n",
+ " -h, --help (optional) print these instructions\n",
+ "\n";
+
+ exit(1);
+}
+
+
+
+my $freqflag=0;
+my ($w,$f,$globf,$thr);
+my (@D, at F,%S, at C);
+open(IN,"$input");
+
+chomp($_=<IN>);
+#if input is a dictionary.
+if (/^dictionary[ \t]+\d+[ \t]+\d+$/i){
+ my ($dummy,$size);
+ ($dummy,$dummy,$size)=split(/[ \t]+/,$_);
+ $freqflag=1 if /DICTIONARY/;
+}
+
+$globf=0;
+while(chomp($_=<IN>)){
+ if ($freqflag){
+ ($w,$f)=split(/[ \t]+/,$_);
+ }
+ else{
+ $w=$_;
+ $f=1;
+ }
+ push @D, $w;
+ push @F, $f;
+ $globf+=$f;
+}
+close (IN);
+
+$thr=$globf/$parts;
+my $totf=0;
+print STDERR "Dictionary 0: (thr: $thr , $globf, $totf , $parts)\n";
+
+my $sfx=0;
+my $w;
+for (my $i=0;$i<=$#D;$i++){
+
+# if the remaining words are less than or equal to
+# the number of remaining sub-dictionaries to create
+# put only one word per each sub-dictionary.
+ if (($totf>0) && ($#D+1-$i) <= ($parts-1-$sfx)){
+# recompute threshold on the remaining global frequency
+# according to the number of remaining parts
+ $sfx++;
+ $globf-=$totf;
+ $thr=($globf)/($parts-$sfx);
+ print STDERR "Dictionary $sfx: (thr: $thr , $globf , $totf , ",($parts-$sfx),")\n";
+ $totf=0;
+ }
+
+ $totf+=$F[$i];
+ $w=$D[$i];
+ $S{$w}=$sfx;
+ $C[$sfx]++;
+ if ($totf>$thr){
+# recompute threshold on the remaining global frequency
+# according to the number of remaining parts
+ $sfx++;
+ $globf-=$totf;
+ $thr=($globf)/($parts-$sfx);
+ print STDERR "Dictionary $sfx: (thr: $thr , $globf , $totf , ",($parts-$sfx),")\n";
+ $totf=0;
+ }
+}
+
+
+my $oldsfx=-1;
+for (my $i=0;$i<=$#D;$i++){
+ $w=$D[$i];
+ $sfx="0000$S{$w}";
+ $sfx=~s/.+(\d{3})/$1/;
+ if ($sfx != $oldsfx){
+#print STDERR "opening $output$sfx\n";
+ close (OUT) if $oldsfx!= -1;
+ open(OUT,">$output$sfx");
+ if ($freqflag){
+ print OUT "DICTIONARY 0 $C[$sfx]\n";
+ }
+ else{
+ print OUT "dictionary 0 $C[$sfx]\n";
+ }
+ $oldsfx=$sfx;
+ }
+ if ($freqflag){
+ print OUT "$w $F[$i]\n";
+ }
+ else{
+ print OUT "$w\n";
+ }
+}
+close (OUT) if $oldsfx!= -1;
+
+my $numdict=$S{$D[$#D]}+1;
+die "Only $numdict dictionaries were crested instead of $parts!" if ($numdict != $parts);
+
diff --git a/scripts/split-ngt.sh b/scripts/split-ngt.sh
new file mode 100755
index 0000000..1072cdb
--- /dev/null
+++ b/scripts/split-ngt.sh
@@ -0,0 +1,89 @@
+#! /bin/bash
+
+function usage()
+{
+ cmnd=$(basename $0);
+ cat<<EOF
+
+$cmnd - creates partition files with ngram statistics in Google format
+
+USAGE:
+ $cmnd [options] <input> <output> <order> <parts>
+
+DESCRIPTION:
+ <input> Input file name
+ <output> Partition files name prefix
+ <order> Order of the ngrams
+ <parts> Number of partitions
+
+OPTIONS:
+ -h Show this message
+
+EOF
+}
+
+# Parse options
+while getopts h OPT; do
+ case "$OPT" in
+ h)
+ usage >&2;
+ exit 0;
+ ;;
+ * ) usage;
+ exit 1;
+ ;;
+ esac
+done
+
+#usage:
+#ngt-split.sh [options] <input> <output> <size> <parts>
+#It creates <parts> files (named <output.000>, ... <output.999>)
+#containing ngram statistics (of <order> length) in Google format
+#These files are a partition of the whole set of ngrams
+
+basedir=$IRSTLM
+bindir=$basedir/bin
+scriptdir=$basedir/scripts
+
+# Collect positional arguments into an array.
+unset par
+while [ $# -gt 0 ]
+do
+ echo "$0: arg $1"
+ par[${#par[@]}]="$1"
+ shift
+done
+
+inputfile=${par[0]}
+outputfile=${par[1]}
+order=${par[2]}
+parts=${par[3]}
+
+# Temporary dictionary name made unique with the shell PID.
+dictfile=dict$$
+
+
+echo "Extracting dictionary from training corpus"
+$bindir/dict -i="$inputfile" -o=$dictfile -f=y -sort=n
+
+echo "Splitting dictionary into $parts lists"
+$scriptdir/split-dict.pl --input $dictfile --output ${dictfile}. --parts $parts
+
+rm $dictfile
+
+
+echo "Extracting n-gram statistics for each word list"
+echo "Important: dictionary must be ordered according to order of appearance of words in data"
+echo "used to generate n-gram blocks, so that sub language model blocks results ordered too"
+
+for d in `ls ${dictfile}.*` ; do
+w=`echo $d | perl -pe 's/.+(\.[0-9]+)$/$1/i'`
+w="$outputfile$w"
+
+# bugfix: this line read "basename \$sdict", but \$sdict was never assigned,
+# so the progress message always printed an empty name; the intended
+# argument is the current sub-dictionary file $d.
+sdict=`basename $d`
+echo "Extracting n-gram statistics for $sdict"
+
+echo "$bindir/ngt -i="$inputfile" -n=$order -gooout=y -o=$w -fd=$d > /dev/null"
+$bindir/ngt -n=$order -gooout=y -o=$w -fd=$d -i="$inputfile" > /dev/null
+rm $d
+done
+
+exit 0
diff --git a/scripts/wrapper b/scripts/wrapper
new file mode 100644
index 0000000..2b2754c
--- /dev/null
+++ b/scripts/wrapper
@@ -0,0 +1,10 @@
+#! /bin/sh
+
+#set machine type for compilation
+MY_ARCH=`uname -m`
+
+name=`basename $0`
+dir=`dirname $0`"/$MY_ARCH"
+
+$dir/$name "$@"
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list