[irstlm] 09/126: added scripts folder

Giulio Paci giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:39 UTC 2016


This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.

commit a78296cb54f7febd2c8afb11f84ce9b7e7108be7
Author: Marcello Federico <mrcfdr at gmail.com>
Date:   Mon Jul 20 09:38:02 2015 +0200

    added scripts folder
---
 scripts/CMakeLists.txt      |   5 +
 scripts/add-start-end.sh    |  72 +++++++
 scripts/build-lm-qsub.sh    | 318 ++++++++++++++++++++++++++++++
 scripts/build-lm.sh         | 254 ++++++++++++++++++++++++
 scripts/build-sublm.pl      | 467 ++++++++++++++++++++++++++++++++++++++++++++
 scripts/cmake_install.cmake |  48 +++++
 scripts/goograms2ngrams.pl  | 145 ++++++++++++++
 scripts/lm-stat.pl          |  63 ++++++
 scripts/mdtsel.sh           | 219 +++++++++++++++++++++
 scripts/merge-sublm.pl      | 208 ++++++++++++++++++++
 scripts/ngram-split.pl      |  84 ++++++++
 scripts/other/beautify.perl |  22 +++
 scripts/plsa.sh             | 346 ++++++++++++++++++++++++++++++++
 scripts/qplsa.sh            | 183 +++++++++++++++++
 scripts/rm-start-end.sh     |  30 +++
 scripts/sort-lm.pl          | 124 ++++++++++++
 scripts/split-dict.pl       | 157 +++++++++++++++
 scripts/split-ngt.sh        |  89 +++++++++
 scripts/wrapper             |  10 +
 19 files changed, 2844 insertions(+)

diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt
new file mode 100755
index 0000000..f47cddf
--- /dev/null
+++ b/scripts/CMakeLists.txt
@@ -0,0 +1,5 @@
+INSTALL(PROGRAMS
+    add-start-end.sh build-lm-qsub.sh build-lm.sh build-sublm.pl goograms2ngrams.pl lm-stat.pl mdtsel.sh merge-sublm.pl ngram-split.pl rm-start-end.sh sort-lm.pl split-dict.pl split-ngt.sh wrapper 
+    DESTINATION bin
+    PERMISSIONS OWNER_EXECUTE OWNER_READ OWNER_WRITE
+    )
diff --git a/scripts/add-start-end.sh b/scripts/add-start-end.sh
new file mode 100755
index 0000000..393e30e
--- /dev/null
+++ b/scripts/add-start-end.sh
@@ -0,0 +1,72 @@
+#! /bin/bash
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+
+#******************************************************************************
+
+function usage()
+{
+    cmnd=$(basename $0);
+    cat<<EOF
+
+$cmnd - adds sentence start/end symbols to each line and trims very long words
+
+USAGE:
+       $cmnd [options]
+
+OPTIONS:
+       -h        Show this message
+       -r count  Specify symbol repetitions (default 1)
+       -t length Trim words to at most _length_ chars (default 80)
+       -s char   Specify symbol (default s)
+
+EOF
+}
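+
+# Example (hypothetical invocation): wrap each line once in <s>...</s> and
+# trim tokens to at most 80 characters:
+#   cat corpus.txt | add-start-end.sh -r 1 -t 80 -s s > corpus.tagged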
+
+#default setting
+repeat=1; 
+maxwordlen=80;
+symbol="s"
+
+# Parse options
+while getopts "hr:t:s:" OPT; do
+    case "$OPT" in
+        h)
+            usage >&2;
+            exit 0;
+            ;;
+        r)  repeat=$OPTARG
+            ;; 
+        t)  maxwordlen=$OPTARG
+            ;; 
+        s)  symbol=$OPTARG
+            ;; 
+    esac
+done
+
+#adds start/end symbols to standard input and
+#trims words longer than $maxwordlen characters
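+# For instance (hypothetical input), with the defaults the line "hello world"
+# becomes "<s> hello world </s>", and any token longer than $maxwordlen
+# characters is cut back to its first $maxwordlen characters.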
+eos="";
+bos="";
+
+for i in `seq $repeat`; do bos="$bos<${symbol}> "; eos="$eos <\/${symbol}>";done
+
+(sed "s/^/$bos/" | sed "s/\$/ $eos/";) |\
+sed "s/\([^ ]\{$maxwordlen\}\)\([^ ]\{1,\}\)/\1/g"
+
diff --git a/scripts/build-lm-qsub.sh b/scripts/build-lm-qsub.sh
new file mode 100755
index 0000000..36100f4
--- /dev/null
+++ b/scripts/build-lm-qsub.sh
@@ -0,0 +1,318 @@
+#! /bin/bash
+
+function usage()
+{
+    cmnd=$(basename $0);
+    cat<<EOF
+
+$cmnd - estimates a language model file
+
+USAGE:
+       $cmnd [options]
+
+OPTIONS:
+       -h        Show this message
+       -i        Input training file e.g. 'gunzip -c train.gz'
+       -o        Output gzipped LM, e.g. lm.gz
+       -k        Number of splits (default 5)
+       -n        Order of language model (default 3)
+       -t        Directory for temporary files (default ./stat_PID)
+       -p        Prune singleton n-grams (default false)
+       -u        Use uniform word frequency for dictionary splitting (default false)
+       -q        Parameters for qsub ("-q <queue>", and any other)
+       -s        Smoothing methods: witten-bell (default), kneser-ney (approximated kneser-ney), improved-kneser-ney
+       -b        Include sentence boundary n-grams (optional)
+       -d        Define subdictionary for n-grams (optional)
+       -v        Verbose
+
+EOF
+}
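+
+# Example (assumed invocation; file names and queue settings are hypothetical):
+#   build-lm-qsub.sh -i "gunzip -c train.gz" -o train.lm.gz -k 5 -n 3 -q "-q long"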
+
+hostname=`uname -n`
+if [ $hostname == "voxgate" ] ; then
+   echo "voxgate cannot be used as submission host"
+   echo "use any other cluster machine"
+   exit 1
+fi
+
+if [ ! $IRSTLM ]; then
+   echo "Set IRSTLM environment variable with path to irstlm"
+   exit 2;
+fi
+
+#paths to scripts and commands in irstlm
+scr=$IRSTLM/bin
+bin=$IRSTLM/bin
+gzip=`which gzip 2> /dev/null`;
+gunzip=`which gunzip 2> /dev/null`;
+
+#check irstlm installation
+if [ ! -e $bin/dict -o  ! -e $scr/split-dict.pl ]; then
+   echo "$IRSTLM does not contain a proper installation of IRSTLM"
+   exit 3;
+fi
+
+#default parameters
+logfile=/dev/null
+tmpdir=stat_$$
+order=3
+parts=3
+inpfile="";
+outfile=""
+verbose="";
+smoothing="--witten-bell";
+prune="";
+boundaries="";
+dictionary="";
+uniform="-f=y";
+queueparameters=""
+
+while getopts "hvi:o:n:k:t:s:q:pbl:d:u" OPTION
+do
+     case $OPTION in
+         h)
+             usage
+             exit 0
+             ;;
+         v)
+             verbose="--verbose";
+             ;;
+         i)
+             inpfile=$OPTARG
+             ;;
+         d)
+             dictionary="-sd=$OPTARG"
+             ;;
+
+         u)
+             uniform=" "
+             ;;
+
+         o)
+             outfile=$OPTARG
+             ;;
+         n)
+             order=$OPTARG
+             ;;
+         k)
+             parts=$OPTARG
+             ;;
+         t)
+             tmpdir=$OPTARG
+             ;;
+         s)
+             smoothing=$OPTARG
+	     case $smoothing in
+	     witten-bell) 
+		     smoothing="--witten-bell"
+		     ;; 
+	     kneser-ney)
+		     smoothing="--kneser-ney"
+		     ;;
+             improved-kneser-ney)
+                     smoothing="--improved-kneser-ney"
+                     ;;
+	     *) 
+		 echo "wrong smoothing setting";
+		 exit 4;
+	     esac
+             ;;
+         p)
+             prune='--prune-singletons';
+             ;;
+         q)
+             queueparameters=$OPTARG;
+             ;;
+         b)
+             boundaries='--cross-sentence';
+             ;;
+	 l)
+             logfile=$OPTARG
+             ;;
+         ?)
+             usage
+             exit
+             ;;
+     esac
+done
+
+
+if [ $verbose ]; then
+echo inpfile=\"$inpfile\" outfile=$outfile order=$order parts=$parts tmpdir=$tmpdir prune=$prune smoothing=$smoothing dictionary=$dictionary verbose=$verbose
+fi
+
+if [ ! "$inpfile" -o ! "$outfile" ]; then
+    usage
+    exit 5 
+fi
+ 
+if [ -e $outfile ]; then
+   echo "Output file $outfile already exists! Either remove or rename it."
+   exit 6;
+fi
+
+if [ -e $logfile -a $logfile != "/dev/null" -a $logfile != "/dev/stdout" ]; then
+   echo "Logfile $logfile already exists! Either remove or rename it."
+   exit 7;
+fi
+
+#check tmpdir
+tmpdir_created=0;
+if [ ! -d $tmpdir ]; then
+   echo "Temporary directory $tmpdir does not exist";
+   echo "creating $tmpdir";
+   mkdir -p $tmpdir;
+   tmpdir_created=1;
+else
+   echo "Cleaning temporary directory $tmpdir";
+   rm $tmpdir/* 2> /dev/null
+   if [ $? != 0 ]; then
+      echo "Warning: some temporary files could not be removed"
+   fi
+fi
+
+workingdir=`pwd | perl -pe 's/\/nfsmnt//g'`
+cd $workingdir
+
+qsubout="$workingdir/DICT-OUT$$"
+qsuberr="$workingdir/DICT-ERR$$"
+qsublog="$workingdir/DICT-LOG$$"
+qsubname="DICT"
+
+(\
+qsub $queueparameters -b no -sync yes -o $qsubout -e $qsuberr -N $qsubname << EOF
+cd $workingdir
+echo exit status $?
+echo "Extracting dictionary from training corpus"
+$bin/dict -i="$inpfile" -o=$tmpdir/dictionary $uniform -sort=no
+echo exit status $?
+echo "Splitting dictionary into $parts lists"
+$scr/split-dict.pl --input $tmpdir/dictionary --output $tmpdir/dict. --parts $parts
+echo exit status $?
+EOF
+) 2>&1 > $qsublog
+
+unset suffix
+#getting list of suffixes
+for file in `ls $tmpdir/dict.*` ; do
+sfx=`echo $file | perl -pe 's/^.+\.(\d+)$/$1/'`
+suffix[${#suffix[@]}]=$sfx
+done
+
+qsubout="$workingdir/NGT-OUT$$"
+qsuberr="$workingdir/NGT-ERR$$"
+qsublog="$workingdir/NGT-LOG$$"
+qsubname="NGT"
+
+unset getpids
+echo "Extracting n-gram statistics for each word list"
+echo "Important: the dictionary must be ordered according to the order of appearance of words in the data"
+echo "used to generate the n-gram blocks, so that the sub language model blocks come out ordered too"
+
+for sfx in ${suffix[@]} ; do
+
+(\
+qsub $queueparameters -b no -j yes -sync no -o $qsubout.$sfx -e $qsuberr.$sfx -N $qsubname-$sfx << EOF
+cd $workingdir
+echo exit status $?
+$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.dict.${sfx}.gz" -fd="$tmpdir/dict.${sfx}" $dictionary -iknstat="$tmpdir/ikn.stat.dict.${sfx}" 
+echo exit status $?
+echo
+EOF
+) 2>&1 > $qsublog.$sfx
+
+id=`cat $qsublog.$sfx | grep 'Your job' | awk '{print $3}'`
+sgepid[${#sgepid[@]}]=$id
+
+done
+
+waiting=""
+for id in ${sgepid[@]} ; do waiting="$waiting -hold_jid $id" ; done
+
+qsub $queueparameters -sync yes $waiting -j y -o /dev/null -e /dev/null -N $qsubname.W -b y /bin/ls 2>&1 > $qsubname.W.log
+rm $qsubname.W.log
+
+qsubout="$workingdir/SUBLM-OUT$$"
+qsuberr="$workingdir/SUBLM-ERR$$"
+qsublog="$workingdir/SUBLM-LOG$$"
+qsubname="SUBLM"
+
+unset getpids
+echo "Estimating language models for each word list"
+
+if [ $smoothing = "--kneser-ney" -o $smoothing = "--improved-kneser-ney" ]; then
+
+for sfx in ${suffix[@]} ; do
+(\
+qsub $queueparameters -b no -j yes -sync no -o $qsubout.$sfx -e $qsuberr.$sfx -N $qsubname-$sfx << EOF
+cd $workingdir
+echo exit status $?
+
+$scr/build-sublm.pl $verbose $prune $smoothing "cat $tmpdir/ikn.stat.dict*" --size $order --ngrams "$gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx}  
+echo exit status $?
+
+echo
+EOF
+) 2>&1 > $qsublog.$sfx
+
+id=`cat $qsublog.$sfx | grep 'Your job' | awk '{print $3}'`
+sgepid[${#sgepid[@]}]=$id
+
+done
+
+else
+
+
+for sfx in ${suffix[@]} ; do
+(\
+qsub $queueparameters -b no -j yes -sync no -o $qsubout.$sfx -e $qsuberr.$sfx -N $qsubname-$sfx << EOF
+cd $workingdir
+echo exit status $?
+
+$scr/build-sublm.pl $verbose $prune $smoothing --size $order --ngrams "$gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx}  
+
+echo
+EOF
+) 2>&1 > $qsublog.$sfx
+
+id=`cat $qsublog.$sfx | grep 'Your job' | awk '{print $3}'`
+sgepid[${#sgepid[@]}]=$id
+
+done
+
+fi
+
+
+waiting=""
+for id in ${sgepid[@]} ; do waiting="$waiting -hold_jid $id" ; done
+
+
+qsub $queueparameters -sync yes $waiting -o /dev/null -e /dev/null -N $qsubname.W -b yes /bin/ls 2>&1 > $qsubname.W.log
+rm $qsubname.W.log
+
+echo "Merging language models into $outfile"
+qsubout="$workingdir/MERGE-OUT$$"
+qsuberr="$workingdir/MERGE-ERR$$"
+qsublog="$workingdir/MERGE-LOG$$"
+qsubname="MERGE"
+(\
+qsub $queueparameters -b no -j yes -sync yes -o $qsubout -e $qsuberr -N $qsubname << EOF
+cd $workingdir
+$scr/merge-sublm.pl --size $order --sublm $tmpdir/lm.dict -lm $outfile
+EOF
+) 2>&1 > $qsublog
+
+echo "Cleaning temporary directory $tmpdir";
+rm $tmpdir/* 2> /dev/null
+rm $qsubout* $qsuberr* $qsublog* 2> /dev/null
+
+if [ $tmpdir_created -eq 1 ]; then
+    echo "Removing temporary directory $tmpdir";
+    rmdir $tmpdir 2> /dev/null
+    if [ $? != 0 ]; then
+        echo "Warning: the temporary directory could not be removed."
+    fi
+fi
+
+exit 0
+
diff --git a/scripts/build-lm.sh b/scripts/build-lm.sh
new file mode 100755
index 0000000..82ef71b
--- /dev/null
+++ b/scripts/build-lm.sh
@@ -0,0 +1,254 @@
+#! /bin/bash
+
+set -m # Enable Job Control
+
+function usage()
+{
+    cmnd=$(basename $0);
+    cat<<EOF
+
+$cmnd - estimates a language model file and saves it in intermediate ARPA format
+
+USAGE:
+       $cmnd [options]
+
+OPTIONS:
+       -i|--InputFile          Input training file e.g. 'gunzip -c train.gz'
+       -o|--OutputFile         Output gzipped LM, e.g. lm.gz
+       -k|--Parts              Number of splits (default 5)
+       -n|--NgramSize          Order of language model (default 3)
+       -d|--Dictionary         Define subdictionary for n-grams (optional, default is without any subdictionary)
+       -s|--LanguageModelType  Smoothing methods: witten-bell (default), shift-beta, improved-shift-beta, stupid-backoff; kneser-ney and improved-kneser-ney still accepted for back-compatibility, but mapped into shift-beta and improved-shift-beta, respectively
+       -p|--PruneSingletons    Prune singleton n-grams (default false)
+       -f|--PruneFrequencyThreshold      Pruning frequency threshold for each level; comma-separated list of values; (default is '0,0,...,0', for all levels)
+       -t|--TmpDir             Directory for temporary files (default ./stat_PID)
+       -l|--LogFile            File to store logging info (default /dev/null)
+       -u|--uniform            Use uniform word frequency for dictionary splitting (default false)
+       -b|--boundaries         Include sentence boundary n-grams (optional, default false)
+       -v|--verbose            Verbose
+       -h|-?|--help            Show this message
+
+EOF
+}
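+
+# Example (assumed invocation; file names are hypothetical):
+#   build-lm.sh -i "gunzip -c train.gz" -n 3 -o train.ilm.gz -k 5 -s witten-bell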
+
+if [ ! $IRSTLM ]; then
+   echo "Set IRSTLM environment variable with path to irstlm"
+   exit 2
+fi
+
+#paths to scripts and commands in irstlm
+scr=$IRSTLM/bin
+bin=$IRSTLM/bin
+gzip=`which gzip 2> /dev/null`;
+gunzip=`which gunzip 2> /dev/null`;
+
+#check irstlm installation
+if [ ! -e $bin/dict -o  ! -e $scr/split-dict.pl ]; then
+   echo "$IRSTLM does not contain a proper installation of IRSTLM"
+   exit 3
+fi
+
+#default parameters
+logfile=/dev/null
+tmpdir=stat_$$
+order=3
+parts=3
+inpfile="";
+outfile=""
+verbose="";
+smoothing="witten-bell";
+prune="";
+prune_thr_str="";
+boundaries="";
+dictionary="";
+uniform="-f=y";
+backoff=""
+
+while [ "$1" != "" ]; do
+    case $1 in
+        -i | --InputFile )      shift;
+                                inpfile=$1;
+                                ;;
+        -o | --OutputFile )     shift;
+                                outfile=$1;
+                                ;;
+        -n | --NgramSize )      shift;
+                                order=$1;
+                                ;;
+        -k | --Parts )          shift;
+                                parts=$1;
+                                ;;
+        -d | --Dictionary )     shift;
+                                dictionary="-sd=$1";
+                                ;;
+        -s | --LanguageModelType )  shift;
+                                smoothing=$1;
+                                ;;
+        -f | --PruneFrequencyThreshold )  shift;
+                                prune_thr_str="--PruneFrequencyThreshold=$1";
+                                ;;
+        -p | --PruneSingletons )  prune='--prune-singletons';
+                                ;;
+        -l | --LogFile )        shift;
+                                logfile=$1;
+                                ;;
+        -t | --TmpDir )         shift;
+                                tmpdir=$1;
+                                ;;
+        -u | --uniform )        uniform=' ';
+                                ;;
+        -b | --boundaries )     boundaries='--cross-sentence';
+                                ;;
+        -v | --verbose )        verbose='--verbose';
+                                ;;
+        -h | -? | --help )      usage;
+                                exit 0;
+                                ;;
+        * )                     usage;
+                                exit 1;
+                                ;;
+    esac
+    shift
+done
+
+case $smoothing in
+witten-bell) 
+smoothing="--witten-bell";
+;; 
+kneser-ney)
+## kneser-ney still accepted for back-compatibility, but mapped into shift-beta
+smoothing="--shift-beta";
+;;
+improved-kneser-ney)
+## improved-kneser-ney still accepted for back-compatibility, but mapped into improved-shift-beta
+smoothing="--improved-shift-beta"; 
+;;
+shift-beta)
+smoothing="--shift-beta";
+;;
+improved-shift-beta)
+smoothing="--improved-shift-beta";
+;;
+stupid-backoff)
+smoothing="--stupid-backoff";
+backoff="--backoff"
+;;
+*) 
+echo "wrong smoothing setting; '$smoothing' does not exist";
+exit 4
+esac
+			
+
+echo "LOGFILE:$logfile"
+			 
+
+if [ $verbose ] ; then
+echo inpfile='"'$inpfile'"' outfile=$outfile order=$order parts=$parts tmpdir=$tmpdir prune=$prune smoothing=$smoothing dictionary=$dictionary verbose=$verbose prune_thr_str=$prune_thr_str  >> $logfile 2>&1
+fi
+
+if [ ! "$inpfile" -o ! "$outfile" ] ; then
+    usage
+    exit 5
+fi
+ 
+if [ -e $outfile ]; then
+   echo "Output file $outfile already exists! Either remove or rename it."
+   exit 6
+fi
+
+if [ -e $logfile -a $logfile != "/dev/null" -a $logfile != "/dev/stdout" ]; then
+   echo "Logfile $logfile already exists! Either remove or rename it."
+   exit 7
+fi
+
+echo "BIS LOGFILE:$logfile" >> $logfile 2>&1
+
+#check tmpdir
+tmpdir_created=0;
+if [ ! -d $tmpdir ]; then
+   echo "Temporary directory $tmpdir does not exist"  >> $logfile 2>&1
+   echo "creating $tmpdir"  >> $logfile 2>&1
+   mkdir -p $tmpdir
+   tmpdir_created=1
+else
+   echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1
+    rm $tmpdir/* 2> /dev/null
+    if [ $? != 0 ]; then
+        echo "Warning: some temporary files could not be removed" >> $logfile 2>&1
+    fi
+fi
+
+
+echo "Extracting dictionary from training corpus" >> $logfile 2>&1
+$bin/dict -i="$inpfile" -o=$tmpdir/dictionary $uniform -sort=no 2> $logfile
+
+echo "Splitting dictionary into $parts lists" >> $logfile 2>&1
+$scr/split-dict.pl --input $tmpdir/dictionary --output $tmpdir/dict. --parts $parts >> $logfile 2>&1
+
+echo "Extracting n-gram statistics for each word list" >> $logfile 2>&1
+echo "Important: the dictionary must be ordered according to the order of appearance of words in the data" >> $logfile 2>&1
+echo "used to generate the n-gram blocks, so that the sub language model blocks come out ordered too" >> $logfile 2>&1
+
+for sdict in $tmpdir/dict.*;do
+sdict=`basename $sdict`
+echo "Extracting n-gram statistics for $sdict" >> $logfile 2>&1
+if [ $smoothing = "--shift-beta" -o $smoothing = "--improved-shift-beta" ]; then
+additional_parameters="-iknstat=$tmpdir/ikn.stat.$sdict"
+else
+additional_parameters=""
+fi
+
+$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.${sdict}.gz" -fd="$tmpdir/$sdict" $dictionary $additional_parameters >> $logfile 2>&1 &
+
+#$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.${sdict}.gz" -fd="$tmpdir/$sdict" $dictionary -iknstat="$tmpdir/ikn.stat.$sdict" >> $logfile 2>&1 &
+#else
+#$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.${sdict}.gz" -fd="$tmpdir/$sdict" $dictionary >> $logfile 2>&1 &
+#fi
+done
+
+# Wait for all parallel jobs to finish
+while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done
+
+echo "Estimating language models for each word list" >> $logfile 2>&1
+for sdict in `ls $tmpdir/dict.*` ; do
+sdict=`basename $sdict`
+echo "Estimating language models for $sdict" >> $logfile 2>&1
+
+if [ $smoothing = "--shift-beta" -o $smoothing = "--improved-shift-beta" ]; then
+additional_smoothing_parameters="cat $tmpdir/ikn.stat.dict.*"
+additional_parameters="$backoff"
+else
+additional_smoothing_parameters=""
+additional_parameters=""
+fi
+$scr/build-sublm.pl $verbose $prune $prune_thr_str $smoothing "$additional_smoothing_parameters" --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict $additional_parameters >> $logfile 2>&1 &
+
+#if [ $smoothing = "--shift-beta" -o $smoothing = "--improved-shift-beta" ]; then
+#$scr/build-sublm.pl $verbose $prune $prune_thr_str $smoothing "cat $tmpdir/ikn.stat.dict.*" --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict $backoff >> $logfile 2>&1 &
+#else
+#$scr/build-sublm.pl $verbose $prune $prune_thr_str $smoothing --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict >> $logfile 2>&1 &
+#fi
+
+done
+
+# Wait for all parallel jobs to finish
+while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done
+
+echo "Merging language models into $outfile" >> $logfile 2>&1
+$scr/merge-sublm.pl --size $order --sublm $tmpdir/lm.dict -lm $outfile $backoff  >> $logfile 2>&1
+
+echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1
+rm $tmpdir/* 2> /dev/null
+
+if [ $tmpdir_created -eq 1 ]; then
+    echo "Removing temporary directory $tmpdir" >> $logfile 2>&1
+    rmdir $tmpdir 2> /dev/null
+    if [ $? != 0 ]; then
+        echo "Warning: the temporary directory could not be removed." >> $logfile 2>&1
+    fi
+fi
+ 
+exit 0
+
+
+
+
diff --git a/scripts/build-sublm.pl b/scripts/build-sublm.pl
new file mode 100755
index 0000000..0bbe875
--- /dev/null
+++ b/scripts/build-sublm.pl
@@ -0,0 +1,467 @@
+#! /usr/bin/perl
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+
+#******************************************************************************
+
+
+
+#first pass: read dictionary and generate 1-grams
+#second pass: 
+#for n=2 to N
+#  foreach n-1-grams
+#      foreach  n-grams with history n-1
+#          compute smoothing statistics
+#          store successors
+#      compute back-off probability
+#      compute smoothing probability
+#      write n-1 gram with back-off prob 
+#      write all n-grams with smoothed probability
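+
+#as a concrete instance, with the default Witten-Bell smoothing the main loop
+#below estimates, for each history h with total count c(h) and diff(h)
+#distinct successors:
+#  p(w|h)    = c(hw) / (c(h) + diff(h))
+#  lambda(h) = diff(h) / (c(h) + diff(h))
+#where lambda(h) is the back-off mass written for the n-1 gram h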
+
+use strict;
+use Getopt::Long "GetOptions";
+use File::Basename;
+
+my $gzip=`which gzip 2> /dev/null`;
+my $gunzip=`which gunzip 2> /dev/null`;
+chomp($gzip);
+chomp($gunzip);
+my $cutoffword="<CUTOFF>"; #special word for Google 1T-ngram cut-offs 
+my $cutoffvalue=39;   #cut-off threshold for Google 1T-ngram cut-offs 
+
+#set defaults for optional parameters
+my ($verbose,$size,$ngrams,$sublm)=(0, 0, undef, undef);
+my ($witten_bell,$good_turing,$shift_beta,$improved_shift_beta,$stupid_backoff)=(0, 0, "", "", "");
+my ($witten_bell_flag,$good_turing_flag,$shift_beta_flag,$improved_shift_beta_flag,$stupid_backoff_flag)=(0, 0, 0, 0, 0);
+my ($freqshift,$prune_singletons,$prune_thr_str,$cross_sentence)=(0, 0, "", 0);
+
+my $help = 0;
+$help = 1 unless
+&GetOptions('size=i' => \$size,
+'freq-shift=i' => \$freqshift, 
+'ngrams=s' => \$ngrams,
+'sublm=s' => \$sublm,
+'witten-bell' => \$witten_bell,
+'good-turing' => \$good_turing,
+'shift-beta=s' => \$shift_beta,
+'improved-shift-beta=s' => \$improved_shift_beta,
+'stupid-backoff' => \$stupid_backoff,
+'prune-singletons' => \$prune_singletons,
+'pft|PruneFrequencyThreshold=s' => \$prune_thr_str,
+'cross-sentence' => \$cross_sentence,
+'h|help' => \$help,
+'verbose' => \$verbose);
+
+
+if ($help || !$size || !$ngrams || !$sublm) {
+	my $cmnd = basename($0);
+  print "\n$cmnd - estimates single LMs\n",
+	"\nUSAGE:\n",
+	"       $cmnd [options]\n",
+	"\nOPTIONS:\n",
+	"       --size <int>          maximum n-gram size for the language model\n",
+	"       --ngrams <string>     input file or command to read the ngram table\n",
+	"       --sublm <string>      output file prefix to write the sublm statistics \n",
+	"       --freq-shift <int>    (optional) value to be subtracted from all frequencies\n",
+	"       --witten-bell         (optional) use Witten-Bell linear smoothing (default) \n",
+	"       --shift-beta <string> (optional) use Shift-Beta smoothing with statistics in <string>\n",
+	"       --improved-shift-beta <string> (optional) use Improved Shift-Beta smoothing with statistics in <string>, similar to Improved Kneser Ney but without corrected counts\n",
+	"       --good-turing         (optional) use Good-Turing linear smoothing\n",
+	"       --stupid-backoff      (optional) use Stupid-Backoff smoothing\n",
+	"       --prune-singletons    (optional) remove n-grams occurring once, for n=3,4,5,... (disabled by default)\n",
+	"       -pft, --PruneFrequencyThreshold <string>	(optional) pruning frequency threshold for each level; comma-separated list of values; (default is \"0,0,...,0\", for all levels)\n",
+	"       --cross-sentence      (optional) include cross-sentence bounds (disabled by default)\n",
+	"       --verbose             (optional) print debugging info\n",
+	"       -h, --help            (optional) print these instructions\n",
+	"\n";
+	
+  exit(1);
+}
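+
+# Example (assumed invocation, mirroring how build-lm.sh calls this script;
+# file names are hypothetical):
+#   build-sublm.pl --size 3 --witten-bell \
+#     --ngrams "gunzip -c stat_123/ngram.dict.0000.gz" --sublm stat_123/lm.dict.0000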
+
+$good_turing_flag = 1 if ($good_turing);
+die "build-sublm: Good-Turing smoothing is no longer supported\n\n" if ($good_turing_flag==1);
+
+$witten_bell_flag = 1 if ($witten_bell);
+$shift_beta_flag = 1 if ($shift_beta);
+$stupid_backoff_flag = 1 if ($stupid_backoff);
+$improved_shift_beta_flag = 1 if ($improved_shift_beta);
+$witten_bell = $witten_bell_flag = 1 if ($witten_bell_flag + $shift_beta_flag + $improved_shift_beta_flag + $stupid_backoff_flag) == 0;
+
+print STDERR  "build-sublm: size=$size ngrams=$ngrams sublm=$sublm witten-bell=$witten_bell shift-beta=$shift_beta improved-shift-beta=$improved_shift_beta stupid-backoff=$stupid_backoff prune-singletons=$prune_singletons cross-sentence=$cross_sentence PruneFrequencyThreshold=$prune_thr_str\n" if $verbose;
+
+
+die "build-sublm: choose only one smoothing method\n" if ($witten_bell_flag + $shift_beta_flag + $improved_shift_beta_flag + $stupid_backoff_flag) > 1;
+
+die "build-sublm: value of --size must be larger than 0\n" if $size<1;
+
+
+
+my @pruneFreqThr=();
+my $i=0;
+while ($i<=$size){
+	$pruneFreqThr[$i++]=0;
+}
+
+print STDERR "Pruning frequency threshold values:$prune_thr_str\n" if ($verbose);
+
+my @v=split(/,/,$prune_thr_str);
+$i=0;
+while ($i<scalar(@v)){
+	$pruneFreqThr[$i+1]=$v[$i];
+	$i++;
+	if ($i>=$size){
+		print STDERR "too many pruning frequency threshold values; keeping the first $size values and skipping the rest\n" if ($verbose);
+		last;	
+	};
+}
+
+$i=1;
+while ($i<=$size){
+	if ($pruneFreqThr[$i] < $pruneFreqThr[$i-1]){
+		$pruneFreqThr[$i]=$pruneFreqThr[$i-1];
+		print STDERR "the value of the pruning frequency threshold for level $i has been adjusted to value $pruneFreqThr[$i]\n" if ($verbose);
+	}
+	$i++;
+}
+
+if ($verbose){
+	$i=0;
+	while ($i<=$size){
+		print STDERR "pruneFreqThr[$i]=$pruneFreqThr[$i]\n";
+		$i++;
+	}
+}
+
+my $log10=log(10.0);	   #service variable to convert log into log10
+my $oldwrd="";		   #variable to check if 1-gram changed 
+my @cnt=();		   #counter of n-grams
+my $totcnt=0;		   #total counter of n-grams
+my ($ng,@ng);		   #read ngrams
+my $ngcnt=0;		   #store ngram frequency
+my $n;
+
+print STDERR  "Collecting 1-gram counts\n" if $verbose;
+
+open(INP,"$ngrams") || open(INP,"$ngrams|")  || die "cannot open $ngrams\n";
+open(GR,"|$gzip -c >${sublm}.1gr.gz") || die "cannot create ${sublm}.1gr.gz\n";
+
+while ($ng=<INP>) {
+  
+  chomp($ng);  @ng=split(/[ \t]+/,$ng);  $ngcnt=(pop @ng) - $freqshift;
+  
+	#	warn "ng: |@ng| ngcnt:$ngcnt\n";
+	
+  if ($oldwrd ne $ng[0]) {
+		#    warn "$totcnt,$oldwrd,$ng[0]\n" if $oldwrd ne '';
+    printf (GR "%s\t%s\n",$totcnt,$oldwrd) if $oldwrd ne '';
+    $totcnt=0;$oldwrd=$ng[0];
+  }
+  
+  #update counter
+  $totcnt+=$ngcnt;
+}
+
+printf GR "%s\t%s\n",$totcnt,$oldwrd;
+close(INP);
+close(GR);
+
+my (@h,$h,$hpr);	      #n-gram history 
+my (@dict,$code);	      #sorted dictionary of history successors
+my ($diff,$singlediff,$diff1,$diff2,$diff3); #different successors of history
+my (@n1,@n2,@n3,@n4,@uno3);  #IKN: counts of n-grams occurring once, twice, ...
+my (@beta,$beta);	     #IKN: discounting parameters
+my $locfreq;
+
+#collect global statistics for (Improved) Shift-Beta smoothing
+if ($shift_beta_flag || $improved_shift_beta_flag) {
+  my $statfile=$shift_beta || $improved_shift_beta;
+  print STDERR  "load \& merge IKN statistics from $statfile \n" if $verbose;
+  open(IKN,"$statfile") || open(IKN,"$statfile|")  || die "cannot open $statfile\n";
+  while (<IKN>) {
+    my($lev,$n1,$n2,$n3,$n4,$uno3)=$_=~/level: (\d+)  n1: (\d+) n2: (\d+) n3: (\d+) n4: (\d+) unover3: (\d+)/;
+    $n1[$lev]+=$n1;$n2[$lev]+=$n2;$n3[$lev]+=$n3;$n4[$lev]+=$n4;$uno3[$lev]+=$uno3;
+		print STDERR  "from $statfile level $lev: n1:$n1 n2:$n2 n3:$n3 n4:$n4 uno3:$uno3\n";
+		print STDERR  "level $lev: n1[$lev]:$n1[$lev] n2[$lev]:$n2[$lev] n3[$lev]:$n3[$lev] n4[$lev]:$n4[$lev] uno3[$lev]:$uno3[$lev]\n";
+  }
+	if ($verbose){
+		for (my $lev=1;$lev<=$#n1;$lev++) {
+			print STDERR  "level $lev: n1[$lev]:$n1[$lev] n2[$lev]:$n2[$lev] n3[$lev]:$n3[$lev] n4[$lev]:$n4[$lev] uno3[$lev]:$uno3[$lev]\n";
+		}
+	}
+  close(IKN);
+}
+
+print STDERR  "Computing n-gram probabilities:\n" if $verbose;
+
+foreach ($n=2;$n<=$size;$n++) {
+	
+  $code=-1;@cnt=(); @dict=(); $totcnt=0;$diff=0; $singlediff=1; $diff1=0; $diff2=0; $diff3=0; $oldwrd=""; 
+	
+  #compute smoothing statistics
+  my (@beta,$beta);               
+	
+  if ($stupid_backoff_flag) {
+		$beta=0.4;
+		print STDERR  "Stupid-Backoff smoothing: beta $n: $beta\n" if $verbose;
+	}
+	
+  if ($shift_beta_flag) {
+    if ($n1[$n]==0 || $n2[$n]==0) {
+      print STDERR  "Error in Shift-Beta smoothing statistics: resorting to Witten-Bell\n" if $verbose;  
+      $beta=0;  
+    } else {
+      $beta=$n1[$n]/($n1[$n] + 2 * $n2[$n]); 
+      print STDERR  "Shift-Beta smoothing: beta $n: $beta\n" if $verbose;  
+    }
+  }
+	
+  if ($improved_shift_beta_flag) {
+		
+    my $Y=$n1[$n]/($n1[$n] + 2 * $n2[$n]);
+		
+    if ($n3[$n] == 0 || $n4[$n] == 0 || $n2[$n] <= $n3[$n] || $n3[$n] <= $n4[$n]) {
+      print STDERR  "Warning: higher order count-of-counts are wrong\n" if $verbose;
+      print STDERR  "Fixing this problem by relying only on the lower order count-of-counts\n" if $verbose;
+      $beta[1] = $Y;
+      $beta[2] = $Y;
+      $beta[3] = $Y;
+    } else {
+      $beta[1] = 1 - 2 * $Y * $n2[$n] / $n1[$n];
+      $beta[2] = 2 - 3 * $Y * $n3[$n] / $n2[$n];
+      $beta[3] = 3 - 4 * $Y * $n4[$n] / $n3[$n];
+    }
+		print STDERR  "Improved-Shift-Beta  smoothing: level:$n beta[1]:$beta[1] beta[2]:$beta[2] beta[3]:$beta[3]\n" if $verbose; 
+  }
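+  # The three discounts above are the modified Kneser-Ney estimates
+  # beta1 = 1-2Y*n2/n1, beta2 = 2-3Y*n3/n2, beta3 = 3-4Y*n4/n3,
+  # with Y = n1/(n1+2*n2), computed from the merged count-of-counts of level $n.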
+	
+  open(HGR,"$gunzip -c ${sublm}.".($n-1)."gr.gz |") || die "cannot open ${sublm}.".($n-1)."gr.gz\n";
+  open(INP,"$ngrams") || open(INP,"$ngrams |")  || die "cannot open $ngrams\n";
+  open(GR,"| $gzip -c >${sublm}.${n}gr.gz") || die "cannot create ${sublm}.${n}gr.gz\n";
+  open(NHGR,"| $gzip -c > ${sublm}.".($n-1)."ngr.gz") || die "cannot open ${sublm}.".($n-1)."ngr.gz";
+	
+  my $ngram;
+  my ($reduced_h, $reduced_ng) = ("", "");
+	
+  $ng=<INP>; chomp($ng); @ng=split(/[ \t]+/,$ng); $ngcnt=(pop @ng) - $freqshift;
+  $h=<HGR>; chomp($h); @h=split(/[ \t]+/,$h); $hpr=shift @h;
+  $reduced_ng=join(" ",@ng[0..$n-2]);
+  $reduced_h=join(" ",@h[0..$n-2]);
+	
+  @cnt=(); @dict=();
+  $code=-1; $totcnt=0; $diff=0; $singlediff=0; $diff1=0; $diff2=0; $diff3=0; $oldwrd="";
+  do{
+		
+    #load all n-grams starting with history h, and collect useful statistics 
+		
+    while ($reduced_h eq $reduced_ng){ #must be true the first time!
+
+      if ($oldwrd ne $ng[$n-1]) { #could this be otherwise? [Marcello 22/5/09]
+				$oldwrd=$ng[$n-1];
+				++$code;
+			}
+			
+			$dict[$code]=$ng[$n-1];
+      $cnt[$code]+=$ngcnt;
+			$totcnt+=$ngcnt;
+			
+      $ng=<INP>;
+
+			if (defined($ng)){
+				chomp($ng);
+				@ng=split(/[ \t]+/,$ng);$ngcnt=(pop @ng) - $freqshift;  
+				$reduced_ng=join(" ",@ng[0..$n-2]);
+			}
+			else{
+				last;
+			}
+    }	
+		
+		$diff=scalar(@cnt);	
+		for (my $c=0;$c<scalar(@cnt);++$c){
+			$singlediff++ if $cnt[$c]==1;
+			
+      if ($diff>1 && $dict[$c] eq $cutoffword) { # in google n-grams
+				#find estimates for remaining diff and singlediff
+				#proportional estimate
+				$diff--;		#remove cutoffword
+				my $concentration=1.0-($diff-1)/$totcnt;
+				my $mass=1;		#$totcnt/($totcnt+$ngcnt);
+				my $index=(1-($concentration * $mass))/(1-1/$cutoffvalue) + (1/$cutoffvalue);
+				my $cutoffdiff=int($ngcnt * $index);
+				$cutoffdiff=1 if $cutoffdiff==0;
+				print STDERR "diff $diff $totcnt cutofffreq $ngcnt -- cutoffdiff: $cutoffdiff\n";
+				print STDERR "concentration:",$concentration," mass:", $mass,"\n";
+				$diff+=$cutoffdiff;
+      }
+		}
+
+		
+    if ($improved_shift_beta) { 
+      for (my $c=0;$c<=$code;$c++) {
+				$diff1++ if $cnt[$c]==1;
+				$diff2++ if $cnt[$c]==2;
+				$diff3++ if $cnt[$c]>=3;
+      }
+    }
+		
+    #print smoothed probabilities
+    my $boprob=0;		#accumulate pruned probabilities 
+    my $prob=0;
+		my $boprob_correction=0; #prob for the correction due to singleton pruning
+		
+		if ($totcnt>0){	
+			for (my $c=0;$c<=$code;$c++) {
+				
+				$ngram=join(" ",$reduced_h,$dict[$c]);
+
+				print STDERR "totcnt:$totcnt diff:$diff singlediff:$singlediff\n" if $totcnt+$diff+$singlediff==0;
+				
+				if ($shift_beta && $beta>0) {
+					$prob=($cnt[$c]-$beta)/$totcnt;
+				} elsif ($improved_shift_beta) {
+					my $b=($cnt[$c]>= 3? $beta[3]:$beta[$cnt[$c]]);
+					$prob=($cnt[$c] - $b)/$totcnt;
+				} elsif ($stupid_backoff) {
+					$prob=$cnt[$c]/$totcnt;
+				} else { ### other smoothing types, like Witten-Bell
+					$prob=$cnt[$c]/($totcnt+$diff);
+				}
+				
+				## skip n-grams containing OOV
+				##		  if (&containsOOV($ngram)){ print STDERR "ngram:|$ngram| contains OOV --> hence skip\n";  next; }
+				
+				## skip also n-grams containing eos symbols not at the final
+				##			if (&CrossSentence($ngram)){ print STDERR "ngram:|$ngram| is Cross Sentence --> hence skip\n";  next; }
+				
+				
+				#rm singleton n-grams for (n>=3), if flag is active
+				#rm n-grams (n>=2) containing cross-sentence boundaries, if flag is not active
+				#rm n-grams containing <unk> or <cutoff> except for 1-grams
+				
+				#warn "considering $size $n |$ngram|\n";				
+				if (($prune_singletons && $n>=3 && $cnt[$c]==1) ||
+					(!$cross_sentence && &CrossSentence($ngram)) || 
+					(&containsOOV($dict[$c])) ||
+					($n>=2 && &containsOOV($h)) ||	
+					($dict[$c] eq $cutoffword) 
+					)
+				{						
+					$boprob+=$prob;
+					
+					if ($n<$size) {	#output this anyway because it will be an history for n+1 
+						printf GR "%f\t%s %s\n",-10000,$reduced_h,$dict[$c];
+					}
+				} else {
+					if ($cnt[$c] > $pruneFreqThr[$n]){
+						# print unpruned n-1 gram
+						my $logp=log($prob)/$log10;
+						printf(GR "%f\t%s %s\n",($logp>0?0:$logp),$reduced_h,$dict[$c]);
+					}else{
+						if ($n<$size) {	#output this anyway because it will be an history for n+1 
+							printf GR "%f\t%s %s\n",-10000,$reduced_h,$dict[$c];
+						}
+					}
+				}
+			}
+		}else{
+			$boprob=0;
+		}
+		
+		if (($prune_singletons && $n>=3)){
+			if ($shift_beta && $beta>0) { # correction due to singleton pruning
+				$boprob_correction += (1.0-$beta) * $singlediff / $totcnt;
+			} elsif ($improved_shift_beta) { # correction due to singleton pruning
+				$boprob_correction += (1-$beta[1]) * $singlediff / $totcnt;
+			} elsif ($stupid_backoff) { # correction due to singleton pruning
+				$boprob_correction += $singlediff/($totcnt);
+			} else { # correction due to singleton pruning
+				$boprob_correction += $singlediff/($totcnt+$diff);
+			} 
+		}
+		else{
+			$boprob_correction = 0;
+		}
+
+		$boprob=$boprob_correction;
+			
+    #rewrite history including back-off weight
+		
+    #check if history has to be pruned out
+    if ($hpr==-10000) {
+      #skip this history
+    } elsif ($shift_beta && $beta>0) {
+			print STDERR "wrong division: considering rewriting history --- h:|$h| --- hpr=$hpr --- totcnt:$totcnt -- denominator:",($totcnt),"\n" if $totcnt==0;
+      my $lambda=$beta * $diff/$totcnt; 	
+      my $logp=log($boprob+$lambda)/$log10;
+      printf NHGR "%s\t%f\n",$h,($logp>0?0:$logp);
+    } elsif ($improved_shift_beta) {
+			print STDERR "wrong division: considering rewriting history --- h:|$h| --- hpr=$hpr --- totcnt:$totcnt -- denominator:",($totcnt),"\n" if $totcnt==0;
+      my $lambda=($beta[1] * $diff1 + $beta[2] * $diff2 + $beta[3] * $diff3)/$totcnt; 	  
+      my $logp=log($boprob+$lambda)/$log10;
+      printf NHGR "%s\t%f\n",$h,($logp>0?0:$logp);
+    } elsif ($stupid_backoff) {
+			print STDERR "wrong division: considering rewriting history --- h:|$h| --- hpr=$hpr --- totcnt:$totcnt -- denominator:",($totcnt),"\n" if $totcnt==0;
+      my $lambda=$beta;
+			my $logp=log($lambda)/$log10;
+      printf NHGR "%s\t%f\n",$h,($logp>0?0:$logp);
+    } else {
+			print STDERR "wrong division: considering rewriting history --- h:|$h| --- hpr=$hpr --- totcnt:$totcnt diff:$diff -- denominator:",($totcnt+$diff),"\n" if $totcnt+$diff==0;
+      my $lambda=$diff/($totcnt+$diff); 
+      my $logp=log($boprob+$lambda)/$log10;
+      printf NHGR "%s\t%f\n",$h,($logp>0?0:$logp);
+    }     
+		
+    #reset smoothing statistics
+    $code=-1;@cnt=(); @dict=(); $totcnt=0;$diff=0;$singlediff=0;$oldwrd="";$diff1=0;$diff2=0;$diff3=0;$locfreq=0;
+		
+    #read next history
+    $h=<HGR>;
+		
+    if (defined($h)){
+      chomp($h); @h=split(/[ \t]+/,$h); $hpr=shift @h;
+      $reduced_h=join(" ",@h[0..$n-2]);
+    }else{
+      die "ERROR: Something could be wrong: histories are terminated before n-grams!" if defined($ng);
+    }
+  }until (!defined($ng));		#n-grams are over
+	
+  close(HGR); close(INP); close(GR); close(NHGR);
+	
+  rename("${sublm}.".($n-1)."ngr.gz","${sublm}.".($n-1)."gr.gz");
+}   
+
+
+#check if n-gram contains cross-sentence boundaries
+sub CrossSentence(){
+  my ($ngram) = @_;
+  if ($ngram=~/<\/s> /i) { #matches </s> followed by something else, i.e. not in final position
+		print STDERR  "check CrossSentence ngram:|$ngram| is CrossSentence\n" if $verbose;
+    return 1;
+  }
+  return 0;
+}
+
+#check if n-gram contains OOV
+sub containsOOV(){
+  my ($ngram) = @_;
+  if ($ngram=~/<UNK>/i){
+		print STDERR  "check containsOOV ngram:|$ngram| contains OOV\n" if $verbose;
+    return 1;
+  }
+  return 0;
+}
diff --git a/scripts/cmake_install.cmake b/scripts/cmake_install.cmake
new file mode 100644
index 0000000..05bbaa7
--- /dev/null
+++ b/scripts/cmake_install.cmake
@@ -0,0 +1,48 @@
+# Install script for directory: /Users/marcello/Workspace/software/irstlm/trunk/scripts
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/Users/marcello/Workspace/software/irstlm")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE PROGRAM PERMISSIONS OWNER_EXECUTE OWNER_READ OWNER_WRITE FILES
+    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/add-start-end.sh"
+    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/build-lm-qsub.sh"
+    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/build-lm.sh"
+    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/build-sublm.pl"
+    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/goograms2ngrams.pl"
+    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/lm-stat.pl"
+    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/mdtsel.sh"
+    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/merge-sublm.pl"
+    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/ngram-split.pl"
+    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/rm-start-end.sh"
+    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/sort-lm.pl"
+    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/split-dict.pl"
+    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/split-ngt.sh"
+    "/Users/marcello/Workspace/software/irstlm/trunk/scripts/wrapper"
+    )
+endif()
+
diff --git a/scripts/goograms2ngrams.pl b/scripts/goograms2ngrams.pl
new file mode 100755
index 0000000..9232b84
--- /dev/null
+++ b/scripts/goograms2ngrams.pl
@@ -0,0 +1,145 @@
+#! /usr/bin/perl
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+
+#******************************************************************************
+
+
+
+#transforms google n-grams into real n-grams so that  counts are
+#consistent with respect to lower order n-grams
+
+use strict;
+use Getopt::Long "GetOptions";
+
+my $gzip=`which gzip 2> /dev/null`;
+my $gunzip=`which gunzip 2> /dev/null`;
+chomp($gzip);
+chomp($gunzip);
+
+my $cutoffword="<CUTOFF>"; #special word for Google 1T-ngram cut-offs
+my $blocksize=10000000; #this is the blocksize of produced n-grams
+my $from=2;             #starting n-gram level
+
+my($help,$verbose,$maxsize,$googledir,$ngramdir)=();
+
+$help=1 unless
+&GetOptions('maxsize=i' => \$maxsize,
+			'startfrom=i' => \$from,
+			'googledir=s' => \$googledir,
+			'ngramdir=s' => \$ngramdir,
+			'h|help' => \$help,
+			'verbose' => \$verbose);
+
+
+if ($help || !$maxsize || !$googledir || !$ngramdir ) {
+	my $cmnd = "goograms2ngrams.pl";
+  print "\n$cmnd - transforms google n-grams into real n-grams so that\n",
+	"       counts are consistent with respect to lower order n-grams\n",
+	"\nUSAGE:\n",
+	"       $cmnd [options]\n",
+	"\nOPTIONS:\n",
+    "       --maxsize <int>       maximum n-gram level of conversion\n",
+    "       --startfrom <int>     skip initial levels if already available (default 2)\n",
+    "       --googledir <string>  directory containing the google-grams dirs (1gms,2gms,...)\n",
+    "       --ngramdir <string>   directory where to write the n-grams \n",
+    "       --verbose             (optional) very talkative output\n",
+    "       -h, --help            (optional) print these instructions\n",
+    "\n";
+
+  exit(1);
+}
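+
+# Example (assumed invocation; directory names are hypothetical):
+#   goograms2ngrams.pl --maxsize 5 --googledir google-1T --ngramdir ngrams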
+
+warn "goograms2ngrams: maxsize $maxsize from $from googledir $googledir ngramdir $ngramdir \n"
+if $verbose;
+
+die "goograms2ngrams: value of --maxsize must be between 2 and 5\n" if $maxsize<2 || $maxsize>5;
+die "goograms2ngrams: cannot find --googledir $googledir \n" if ! -d $googledir;
+die "goograms2ngrams: cannot find --ngramdir  $ngramdir \n" if ! -d $ngramdir;
+
+
+my ($n,$hgrams,$ggrams,$ngrams)=();
+my ($ggr,$hgr,$hgrcnt,$ggrcnt,$totggrcnt)=();
+my (@ggr,@hgr)=();
+
+foreach ($n=$from;$n<=$maxsize;$n++){
+  
+  my $counter=0;
+  	
+  warn "Converting google-$n-grams into $n-grams\n";
+
+  $hgrams=($n==2?"${googledir}/1gms/vocab.gz":"${ngramdir}/".($n-1)."grams-*.gz");
+  open(HGR,"$gunzip -c $hgrams |") || die "cannot open $hgrams\n";
+  
+  $ggrams="${googledir}/".($n)."gms/".($n)."gm-*";
+  open(GGR,"$gunzip -c $ggrams |") || die "cannot open $ggrams\n";
+   
+  my $id = sprintf("%04d", 0);
+  $ngrams="${ngramdir}/".($n)."grams-${id}.gz";
+
+  next if -e $ngrams; #go to next step if file exists already;
+  open(NGR,"|$gzip -c > $ngrams ")  || die "cannot open $ngrams\n";
+
+  chop($ggr=<GGR>); @ggr=split(/[ \t]/,$ggr);$ggrcnt=(pop @ggr);
+  #warn "ggr: ",$ggrcnt," ",join(" ",@ggr[0..$n-1]),"\n";
+
+  while ($hgr=<HGR>){	
+
+	$counter++;
+	printf(STDERR ".") if ($counter % 1000000)==0;
+	  
+	chop($hgr); @hgr=split(/[ \t]/,$hgr); $hgrcnt=(pop @hgr);
+    #warn "hgr: ",$hgrcnt," ",join(" ",@hgr[0..$n-2]),"\n";
+
+	if (join(" ",@hgr[0..$n-2]) eq join(" ",@ggr[0..$n-2])){
+
+		$totggrcnt=0;
+		do{
+			$totggrcnt+=$ggrcnt;
+			print NGR join(" ",@ggr[0..$n-1])," ",$ggrcnt,"\n";
+			chop($ggr=<GGR>);@ggr=split(/[ \t]/,$ggr);$ggrcnt=(pop @ggr);
+		}until (join(" ",@hgr[0..$n-2]) ne join(" ",@ggr[0..$n-2]));
+
+		if ($hgrcnt > $totggrcnt){
+			#warn "difference: $hgrcnt $totggrcnt =",$hgrcnt-$totggrcnt,"\n";
+			print NGR join(" ",@hgr[0..$n-1])," ",$cutoffword," ",$hgrcnt-$totggrcnt,"\n";
+		}
+	}
+	else{ 
+		#warn "fully pruned context: $hgr\n";
+		print NGR join(" ",@hgr[0..$n-1])," ",$cutoffword," ",$hgrcnt,"\n";
+	}	
+	
+	if (($counter % $blocksize)==0){ 
+		close(NGR);
+		my $id = sprintf("%04d", int($counter / $blocksize));
+		$ngrams="${ngramdir}/".($n)."grams-${id}.gz";
+		open(NGR,"|$gzip -c > $ngrams ")  || die "cannot open $ngrams\n";	
+	}
+	
+  }
+
+  close(HGR);close(NGR);close(GGR);
+  
+}
+  
+  
+  
+  
+  
diff --git a/scripts/lm-stat.pl b/scripts/lm-stat.pl
new file mode 100755
index 0000000..ac2558d
--- /dev/null
+++ b/scripts/lm-stat.pl
@@ -0,0 +1,63 @@
+#! /usr/bin/perl
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+
+#******************************************************************************
+#computes LM statistics over a string
+
+use strict;
+use Getopt::Long "GetOptions";
+use File::Basename;
+
+my ($help,$lm,$txt)=();
+$help=1 unless
+
+&GetOptions('lm=s' => \$lm,
+            'txt=s' => \$txt,
+            'h|help' => \$help,);
+
+if ($help || !$lm || !$txt) {
+	my $cmnd = basename($0);
+  print "\n$cmnd - computes LM statistics over a string\n",
+	"\nUSAGE:\n",
+	"       $cmnd [options]\n",
+	"\nOPTIONS:\n",
+    "       --lm  <string>        language model file \n",
+    "       --txt <string>        text file\n",
+    "       -h, --help            (optional) print these instructions\n",
+    "\n";
+
+  exit(1);
+}
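+
+# Example (assumed invocation; file names are hypothetical):
+#   lm-stat.pl --lm train.lm.gz --txt test.txt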
+
+if (!$ENV{IRSTLM}){
+  print "Set environment variable IRSTLM with path to the irstlm directory\n";
+  exit(1);
+}
+
+
+
+my $clm="$ENV{IRSTLM}/bin/compile-lm";
+
+open (OUT,"$clm $lm --eval $txt --debug 1|");
+while (<OUT>){
+  print;
+}
+
+close(OUT);
diff --git a/scripts/mdtsel.sh b/scripts/mdtsel.sh
new file mode 100755
index 0000000..164d4a5
--- /dev/null
+++ b/scripts/mdtsel.sh
@@ -0,0 +1,219 @@
+#! /bin/bash 
+
+#/******************************************************************************
+#IrstLM: IRST Language Model Toolkit
+#Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+#
+#This library is free software; you can redistribute it and/or
+#modify it under the terms of the GNU Lesser General Public
+#License as published by the Free Software Foundation; either
+#version 2.1 of the License, or (at your option) any later version.
+#
+#This library is distributed in the hope that it will be useful,
+#but WITHOUT ANY WARRANTY; without even the implied warranty of
+#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#Lesser General Public License for more details.
+#
+#You should have received a copy of the GNU Lesser General Public
+#License along with this library; if not, write to the Free Software
+#Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+#
+#******************************************************************************/
+
+# mdtsel.sh
+# by M. Federico
+# Copyright Marcello Federico, Fondazione Bruno Kessler, 2012
+
+
+set -m #enable job control
+
+usage()
+{
+    cmnd=$(basename $0);
+    cat << EOF
+
+$cmnd - performs data selection assuming an in-domain corpus and
+        a very large out-of-domain corpus.
+
+USAGE:
+       $cmnd [options]
+
+DESCRIPTION:
+       This command performs data selection assuming an in-domain
+       corpus and a very large out-of-domain corpus.
+       Both corpora must contain one sentence per line, delimited
+       by <s> and </s>. The process produces a file of scores.
+
+
+OPTIONS:
+       -h        Show this message
+       -v        Verbose
+       -i        In-domain corpus 
+       -o        Out-of-domain corpus
+       -s        Scores output file
+       -x        Out-of-domain lines are indexed
+       -w        Temporary work directory (default /tmp)
+       -j        Number of jobs (default 6)
+       -m        Data selection model (1 or 2, default 2)
+       -f        Word frequency threshold (default 2)
+       -n        Ngram order to use (n>=1 default 3)
+       -d        Vocabulary size upper bound (default 10000000)   
+       -c        Cross-validation parameter (cv>=1, default 1)
+
+EOF
+}
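+
+# Example (assumed invocation; file names are hypothetical):
+#   mdtsel.sh -i indomain.txt -o outdomain.txt -s scores.out -j 6 -m 2 -n 3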
+
+
+if [ ! $IRSTLM ]; then
+   echo "Set IRSTLM environment variable with path to irstlm"
+   exit 2
+fi
+
+#paths to scripts and commands in irstlm
+scr=$IRSTLM/bin
+bin=$IRSTLM/bin
+
+#check irstlm installation
+if [ ! -e $bin/dtsel ]; then
+   echo "$IRSTLM does not contain a proper installation of IRSTLM"
+   exit 3
+fi
+
+#default parameters
+indfile="";
+outdfile="";
+scorefile="";
+workdir=/tmp
+logfile="/dev/null"
+jobs=6
+model=2
+minfreq=2
+ngramorder=3
+cv=1
+dub=10000000
+
+verbose="";
+useindex=0;
+
+while getopts "hvi:o:s:l:w:j:m:f:n:c:d:x:" OPTION
+do
+     case $OPTION in
+         h)
+             usage
+             exit 1
+             ;;
+         v)
+             verbose="--verbose";
+             ;;
+         i)
+             indfile=$OPTARG
+             ;;
+         o)
+             outdfile=$OPTARG
+             ;;
+         s)
+             scorefile=$OPTARG
+             ;;
+         l)
+             logfile=$OPTARG
+             ;;
+         w)
+             workdir=$OPTARG
+             ;;
+         j)
+             jobs=$OPTARG
+             ;;
+         m)
+             model=$OPTARG
+             ;;
+         n)
+             ngramorder=$OPTARG
+             ;;
+         f)
+             minfreq=$OPTARG
+             ;;
+         c)
+             cv=$OPTARG
+             ;;
+         d)
+             dub=$OPTARG
+             ;;
+         x)
+             useindex=$OPTARG
+             ;;
+         ?)
+             usage
+             exit 1
+             ;;
+     esac
+done
+
+
+if [ $verbose ];then
+echo indfile= $indfile outdfile= $outdfile scorefile= $scorefile useindex= $useindex 
+echo logfile= $logfile workdir= $workdir 
+echo jobs= $jobs model= $model ngramorder= $ngramorder minfreq= $minfreq dub=$dub
+fi
+
+if [ ! "$indfile" -o ! "$outdfile" -o ! "$scorefile" ]; then
+    usage
+    exit 5
+fi
+ 
+if [ -e $scorefile ]; then
+   echo "Output score file $scorefile already exists! Either remove or rename it."
+   exit 6
+fi
+
+if [ $logfile != "/dev/null" -a $logfile != "/dev/stdout" -a -e $logfile ]; then
+   echo "Logfile $logfile already exists! Either remove or rename it."
+   exit 7
+fi
+
+workdir_created=0
+
+if [ ! -d $workdir ]; then
+   echo "Temporary work directory $workdir does not exist";
+   echo "creating $workdir";
+   mkdir -p $workdir;
+   workdir_created=1;
+fi
+
+
+#get process id to name process specific temporary files
+pid=$$
+
+#compute size of out domain corpus and block size of split
+lines=`wc -l < $outdfile`
+size=`echo "( $lines + 1000 )" / $jobs | bc` #add some slack to avoid a tiny last block
+
+#perform split 
+split -l $size $outdfile $workdir/dtsel${pid}-files-
+
+for file in $workdir/dtsel${pid}-files-*
+do
+echo $file  
+( \
+$bin/dtsel -x=$useindex -i=$indfile -o=$file -s=${file}.scores -n=$ngramorder -dub=$dub -f=$minfreq -m=$model ; \
+cat ${file}.scores | perl -pe 's/^nan /1000 /g;' | sort -g > ${file}.scores.tmp ; \
+mv ${file}.scores.tmp ${file}.scores \
+) >>$logfile 2>&1 &
+
+done
+
+# Wait for all parallel jobs to finish
+while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done
+
+sort -g -m $workdir/dtsel${pid}-files-*.scores > $scorefile
+rm $workdir/dtsel${pid}-files-*
+if [ $workdir_created == 1 ]
+then
+rmdir $workdir
+fi
+
+
+
diff --git a/scripts/merge-sublm.pl b/scripts/merge-sublm.pl
new file mode 100755
index 0000000..730aa28
--- /dev/null
+++ b/scripts/merge-sublm.pl
@@ -0,0 +1,208 @@
+#! /usr/bin/perl
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+
+#******************************************************************************
+#merge prefix LMs into one single file
+
+use strict;
+use Getopt::Long "GetOptions";
+use File::Basename;
+
+my ($help,$lm,$size,$sublm,$backoff)=();
+$help=0;
+$backoff=0;
+
+&GetOptions('size=i' => \$size,
+'lm=s' => \$lm,
+'sublm=s' => \$sublm,
+'backoff' => \$backoff,
+'h|help' => \$help);
+
+if ($help || !$size || !$lm || !$sublm) {
+	my $cmnd = basename($0);
+  print "\n$cmnd - merge single LMs\n",
+	"\nUSAGE:\n",
+	"       $cmnd [options]\n",
+	"\nOPTIONS:\n",
+	"       --size <int>          maximum n-gram size for the language model\n",
+	"       --sublm <string>      path identifying all input prefix sub LMs\n",
+	"       --lm <string>         name of the output LM file (will be gzipped)\n",
+	"       --backoff             (optional) create a backoff LM, output is directly in ARPA format (default is false, i.e. iARPA format)\n",
+	"       -h, --help            (optional) print these instructions\n",
+	"\n";
+
+  exit(1);
+}
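+
+# Example (assumed invocation, mirroring how build-lm.sh calls this script;
+# file names are hypothetical):
+#   merge-sublm.pl --size 3 --sublm stat_123/lm.dict --lm train.ilm.gz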
+
+
+my $gzip=`which gzip 2> /dev/null`;
+my $gunzip=`which gunzip 2> /dev/null`;
+chomp($gzip);
+chomp($gunzip);
+
+warn "merge-sublm.pl --size $size --sublm $sublm --lm $lm --backoff $backoff\n";
+
+warn "Compute total sizes of n-grams\n";
+my @size=();          #number of n-grams for each level
+my $tot1gr=0;         #total frequency of 1-grams
+my $unk=0;            #frequency of <unk>
+my $pr;               #probability of 1-grams
+my (@files,$files);   #sublm files for a given n-gram size  
+
+for (my $n=1;$n<=$size;$n++){
+
+  @files=map { glob($_) } "${sublm}*.${n}gr*";
+  $files=join(" ",@files);
+  $files || die "cannot find sublm files\n";
+  warn "join files $files\n";
+  
+  if ($n==1){
+    open(INP,"$gunzip -c $files|") || die "cannot open $files\n";
+    while(my $line = <INP>){
+      $size[$n]++;
+      chomp($line);
+      warn "there is an empty line in any of these files ($files); this should not happen\n" if $line =~ /^$/;
+      my @words = split(/[ \t]+/,$line);
+      #cut down counts for sentence initial
+      $words[0]=1 if $words[1]=~/<s>/;
+      #there could be more independent <unk> words
+      #generated by ngt with -sd option
+      $size[$n]-- if $unk && $words[1] eq "<unk>";
+      $unk+=$words[0] if $words[1]=~/<unk>/i;
+      $tot1gr+=$words[0];
+    }
+    close(INP);
+    if ($unk==0){
+      warn "implicitely add <unk> word to counters\n";
+      $tot1gr+=$size[$n]; #equivalent to WB smoothing
+      $size[$n]++; 
+    }
+  }else{
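+    #lines containing 10000.000 (apparently a marker for pruned entries) are
+    #excluded from the count and filtered out again during the merge below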
+    for (my $j=0;$j<scalar(@files);$j++){
+      safesystem("$gunzip -c $files[$j] | grep -v '10000.000' | wc -l > wc$$") or die;
+      open(INP,"wc$$") || die "cannot open wc$$\n";
+      my $wc = <INP>;
+      chomp($wc);
+      $size[$n] += $wc;
+      close(INP);
+      unlink("wc$$");
+    }
+  }
+  warn "n:$n size:$size[$n] unk:$unk\n";
+}
+
+warn "Merge all sub LMs\n";
+
+$lm.=".gz" if $lm!~/.gz$/;
+open(LM,"|$gzip -c > $lm") || die "Cannot open $lm\n";
+
+warn "Write LM Header\n";
+if ($backoff){
+	printf LM "ARPA\n\n";
+} else{
+	printf LM "iARPA\n\n";
+}
+
+printf LM "\\data\\\n";
+for (my $n=1;$n<=$size;$n++){
+    printf LM "ngram $n=\t$size[$n]\n";
+}
+printf LM "\n";
+close(LM);
+
+warn "Writing LM Tables\n";
+for (my $n=1;$n<=$size;$n++){
+  
+  warn "Level $n\n";
+  
+  @files=map { glob($_) } "${sublm}*.${n}gr*";
+  $files=join(" ",@files);
+  warn "input from: $files\n";
+  if ($n==1){         
+    open(INP,"$gunzip -c $files|") || die "cannot open $files\n";
+    open(LM,"|$gzip -c >> $lm");
+    printf LM "\\$n-grams:\n";
+    while(my $line = <INP>){   
+      chomp($line);
+      warn "there is an empty line in any of these files ($files); this should not happen\n" if $line =~ /^$/;
+	 #lowercase some expressions of google n-grams
+      $line=~s/<S>/<s>/g;
+      $line=~s/<\/S>/<\/s>/g;
+      $line=~s/<UNK>/<unk>/g;
+
+      my @words = split(/[ \t]+/,$line);
+
+      #skip <unk> here; it is always printed at the end
+      next if $words[1]=~/<unk>/i;
+
+      #cut down counts for sentence initial
+      $words[0]=1 if $words[1]=~/<s>/i;
+
+      #apply witten-bell smoothing on 1-grams
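+      #i.e. log10 p(w) = log10( (c(w)+1) / (tot1gr + |V|) ), with |V| = $size[1]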
+      $pr=(log($words[0]+1)-log($tot1gr+$size[1]))/log(10.0);
+      shift @words;
+      printf LM "%f\t%s\t%f\n",$pr,$words[0],$words[1];
+    }
+    close(INP);
+
+    #print final <unk>
+    #witten-bell smoothing of <unk> probability
+    if ($unk){
+      $pr=(log($unk+1)-log($tot1gr+$size[1]))/log(10.0);
+    }else{
+      $pr=(log($size[1]-1+1)-log($tot1gr+$size[1]))/log(10.0);
+    }
+
+    printf LM "%f <unk>\n",$pr;
+    close(LM);
+  }else{
+    open(LM,"|$gzip -c >> $lm");
+    printf LM "\\$n-grams:\n";
+    close(LM);
+    for (my $j=0;$j<scalar(@files);$j++){
+      safesystem("$gunzip -c $files[$j] | grep -v '10000.000' | gzip -c >> $lm") or die;
+    }
+  }
+
+}
+
+open(LM,"|$gzip -c >> $lm") || die "Cannot open $lm\n";
+printf LM "\\end\\\n";
+close(LM);
+
+sub safesystem {
+  print STDERR "Executing: @_\n";
+  system(@_);
+  if ($? == -1) {
+      print STDERR "Failed to execute: @_\n  $!\n";
+      exit(1);
+  }
+  elsif ($? & 127) {
+      printf STDERR "Execution of: @_\n  died with signal %d, %s coredump\n",
+          ($? & 127),  ($? & 128) ? 'with' : 'without';
+      exit(1);
+  }
+  else {
+    my $exitcode = $? >> 8;
+    print STDERR "Exit code: $exitcode\n" if $exitcode;
+    return ! $exitcode;
+  }
+}
+
diff --git a/scripts/ngram-split.pl b/scripts/ngram-split.pl
new file mode 100755
index 0000000..27700d3
--- /dev/null
+++ b/scripts/ngram-split.pl
@@ -0,0 +1,84 @@
+#! /usr/bin/perl
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+
+#******************************************************************************
+
+#re-segment google n-gram count files into files so that
+#n-grams starting with a given word (prefix) are all 
+#contained in one file.
+
+use Getopt::Long "GetOptions";
+use File::Basename;
+
+my ($help,$lm,$size,$sublm)=();
+$help=1 unless
+&GetOptions('h|help' => \$help);
+
+if ($help) {
+	my $cmnd = basename($0);
+  print "\n$cmnd - re-segment google n-gram count files so that n-grams\n",
+    "       starting with a given word (prefix) are all contained in one file\n",
+	"\nUSAGE:\n",
+	"       $cmnd [options] [<output_prefix>]\n",
+	"\nDESCRIPTION:\n",
+	"       Input is expected on STDIN.\n",
+	"       <output_prefix>       prefix of files to be created\n",
+	"\nOPTIONS:\n",
+    "       -h, --help            (optional) print these instructions\n",
+    "\n";
+
+  exit(1);
+}
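+
+# Example (hypothetical file names): re-segment gzipped google 3-gram counts
+#   gunzip -c goog-3grams.gz | ngram-split.pl goog3gr
+# creates goog3gr.0001.gz, goog3gr.0002.gz, ...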
+
+
+$max_pref=10000;   #number of prefixes to be put in one file 
+$max_ngram=5000000;#number of n-grams to be put in one file
+$file_cnt=0;       #counter of files 
+$pref_cnt=0;       #counter of prefixes in the current file
+$ngram_cnt=0;      #counter of n-gram in the current file   
+
+$path=($ARGV[0]?$ARGV[0]:"goong");     #path of files to be created
+
+$gzip=`which gzip`; 
+chomp($gzip);
+
+$pwrd="";
+open(OUT,sprintf("|$gzip -c > %s.%04d.gz",$path,++$file_cnt));
+
+while ($ng=<STDIN>){
+  ($wrd)=$ng=~/^([^ ]+)/;
+  #warn "$wrd\n";
+  if ($pwrd ne $wrd){
+    $pwrd=$wrd;
+    if ($pref_cnt>$max_pref || $ngram_cnt>$max_ngram){
+      warn "it's time to change file\n";
+      close(OUT);
+      open(OUT,sprintf("|$gzip -c > %s.%04d.gz",$path,++$file_cnt));
+      $pref_cnt=$ngram_cnt=0;
+    }
+    else{
+      $pref_cnt++;
+    }
+  }
+  print OUT $ng;
+  $ngram_cnt++;
+}
+close(OUT);
+
diff --git a/scripts/other/beautify.perl b/scripts/other/beautify.perl
new file mode 100755
index 0000000..eafaeeb
--- /dev/null
+++ b/scripts/other/beautify.perl
@@ -0,0 +1,22 @@
+#!/usr/bin/perl
+
+my $os=`uname | tr -d '\012'`;
+my $dir=`dirname $0 | tr -d '\012'`;
+my $astyle="$dir/astyle_$os";
+
+opendir(DIR,".") or die "Can't open the current directory: $!\n";
+
+# read file/directory names in that directory into @names 
+@names = readdir(DIR) or die "Unable to read current dir:$!\n";
+
+foreach $name (@names) {
+   next if ($name eq ".");   # skip the current directory entry
+   next if ($name eq "..");  # skip the parent  directory entry
+
+   if (-d $name){            # is this a directory?
+      `$astyle --style="k&r" -s2 --recursive -v "$name/*.h" "$name/*.cpp"`;
+      next;                  # can skip to the next name in the for loop 
+   }
+}
+
+closedir(DIR);
diff --git a/scripts/plsa.sh b/scripts/plsa.sh
new file mode 100755
index 0000000..d59e3a5
--- /dev/null
+++ b/scripts/plsa.sh
@@ -0,0 +1,346 @@
+#! /bin/bash
+
+set -m # Enable Job Control
+
+
+
+function usage()
+{
+cmnd=$(basename $0);
+cat<<EOF
+
+$cmnd - train and/or test a probabilistic latent semantic model
+
+USAGE:
+$cmnd [options]
+
+TRAINING OPTIONS:
+
+-c file     Collection of training documents e.g. 'gunzip -c docs.gz'
+-d file     Dictionary file (default dictionary)
+-f          Force to use existing dictionary
+-m file     Output model file e.g. model
+-n count    Number of topics (default 100)
+-i count    Number of training iterations (default 20)
+-t folder   Temporary working directory (default ./stat_PID)
+-p count    Prune words with counts < arg (default 2)
+-k count    Number of processes (default 5)
+
+-r file     Model output file in readable format
+-s count    Put top arg frequent words in special topic 0
+-l file     Log file (optional)
+-v          Verbose
+-h          Show this message
+
+
+TESTING OPTIONS
+
+-c file     Testing documents e.g. test
+-d file     Dictionary file (default dictionary)
+-m file     Model file
+-n number   Number of topics (default 100)
+-u file     Output document unigram distribution
+-o file     Output document topic distributions
+-i count    Number of inference iterations (default 20)
+-t folder   Temporary working directory (default ./stat_PID)
+-l file     Log file (optional)
+-k count    Number of processes (default 5)
+-v          Verbose
+-h          Show this message
+
+
+EOF
+}
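+
+# Example invocations (hypothetical file names):
+#   training:  plsa.sh -c "gunzip -c docs.gz" -d dict -m model -n 100 -i 20 -k 5
+#   inference: plsa.sh -c test -d dict -m model -n 100 -i 20 -o topics.out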
+
+
+
+if [ ! $IRSTLM ]; then
+echo "Set IRSTLM environment variable with path to irstlm"
+exit 2
+fi
+
+#paths to scripts and commands in irstlm
+scr=$IRSTLM/bin
+bin=$IRSTLM/bin
+gzip=`which gzip 2> /dev/null`;
+gunzip=`which gunzip 2> /dev/null`;
+
+#default parameters
+tmpdir=stat_$$
+data=""
+topics=100
+splits=5
+iter=20
+prunefreq=2
+spectopics=0
+logfile="/dev/null"
+verbose=""
+unigram=""
+outtopic=""
+dict="dictionary"
+forcedict=""
+model=""
+txtfile="/dev/null"
+
+while getopts "hvfc:m:r:k:i:n:t:d:p:s:l:u:o:" OPTION
+do
+case $OPTION in
+h)
+usage
+exit 0
+;;
+v)
+verbose="--verbose";
+;;
+c)
+data=$OPTARG
+;;
+m)
+model=$OPTARG
+;;
+r)
+txtfile=$OPTARG
+;;
+k)
+splits=$OPTARG
+;;
+i)
+iter=$OPTARG
+;;
+t)
+tmpdir=$OPTARG
+;;
+d)
+dict=$OPTARG
+;;
+f)
+forcedict="TRUE"
+;;
+p)
+prunefreq=$OPTARG
+;;
+s)
+spectopics=$OPTARG
+;;
+n)
+topics=$OPTARG
+;;
+l)
+logfile=$OPTARG
+;;
+u)
+unigram=$OPTARG
+;;
+o)
+outtopic=$OPTARG
+;;
+
+?)
+usage
+exit 1
+;;
+esac
+done
+
+if [ $verbose ]; then
+echo data=$data  model=$model  topics=$topics iter=$iter dict=$dict
+logfile="/dev/stdout"
+fi
+
+if [ "$unigram" == "" -a "$outtopic" == "" ]; then
+
+#training branch
+
+if [ ! "$data" -o  ! "$model" ]; then
+usage
+exit 1
+fi
+
+if [ -e $logfile -a $logfile != "/dev/null" -a $logfile != "/dev/stdout" ]; then
+echo "Logfile $logfile already exists! either remove or rename it."
+exit 1
+fi
+
+if [ -e $model ]; then
+echo "Output file $model already exists! either remove or rename it." >> $logfile 2>&1
+exit 1
+fi
+
+if [ -e $txtfile -a $txtfile != "/dev/null" ]; then
+echo "Output file $txtfile already exists! either remove or rename it." >> $logfile 2>&1
+exit 1
+fi
+
+
+
+#if [ ! -e "$data" ]; then
+#echo "Cannot find data $data." >> $logfile 2>&1
+#exit 1;
+#fi
+
+if [ ! -e $dict ]; then
+echo extract dictionary >> $logfile
+$bin/dict -i="$data" -o=$dict -PruneFreq=$prunefreq -f=y >> $logfile 2>&1
+if [ `head -n 1 $dict| cut -d " " -f 3` -lt 10 ]; then
+echo "Dictionary contains errors"
+exit 2;
+fi
+else
+echo "Warning: dictionary file already exists." >> $logfile 2>&1
+if [ $forcedict ]; then
+echo "Warning: authorization to use it." >> $logfile 2>&1
+else
+echo "No authorization to use it (see option -f)." >> $logfile 2>&1
+exit 1
+fi
+fi
+
+
+
+#check tmpdir
+tmpdir_created=0;
+if [ ! -d $tmpdir ]; then
+echo "Creating temporary working directory $tmpdir" >> $logfile 2>&1
+mkdir -p $tmpdir;
+tmpdir_created=1;
+else
+echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1
+rm $tmpdir/* 2> /dev/null
+if [ $? != 0 ]; then
+echo "Warning: some temporary files could not be removed" >> $logfile 2>&1
+fi
+fi
+
+#####
+echo split documents >> $logfile 2>&1
+$bin/plsa -c="$data" -d=$dict -b=$tmpdir/data -sd=$splits >> $logfile 2>&1
+
+machine=`uname -s` 
+if [ $machine == "Darwin" ] ; then
+splitlist=`jot - 1 $splits`
+iterlist=`jot - 1 $iter`
+else
+splitlist=`seq  1 1 $splits`
+iterlist=`seq 1 1 $iter`
+fi
+
+#rm $tmpdir/Tlist
+for sp in $splitlist ; do echo $tmpdir/data.T.$sp >> $tmpdir/Tlist 2>&1; done
+#rm $model
+for it in $iterlist ; do
+echo "ITERATION $it" >> $logfile 2>&1
+for sp in $splitlist ; do
+(date; echo it $it split $sp )>> $logfile 2>&1
+$bin/plsa -c=$tmpdir/data.$sp -d=$dict -st=$spectopics -hf=$tmpdir/data.H.$sp -tf=$tmpdir/data.T.$sp -wf=$model -m=$model -t=$topics -it=1 -tit=$it >> $logfile 2>&1 &
+done
+while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done
+
+(date; echo recombination ) >> $logfile 2>&1
+
+$bin/plsa -ct=$tmpdir/Tlist -c="$data" -d=$dict -hf=$tmpdir/data.H -m=$model -t=$topics -it=1 -txt=$txtfile >> $logfile 2>&1
+
+done
+(date; echo End of training) >> $logfile 2>&1
+
+echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1 
+rm $tmpdir/* 2> /dev/null
+
+if [ $tmpdir_created -eq 1 ]; then
+echo "Removing temporary directory $tmpdir" >> $logfile 2>&1
+rmdir $tmpdir 2> /dev/null
+if [ $? != 0 ]; then
+echo "Warning: the temporary directory could not be removed." >> $logfile 2>&1
+fi
+fi
+exit 0
+
+#testing branch
+else
+
+if [ ! $model -o ! -e $model ]; then
+echo "Need to specify existing model" >> $logfile 2>&1
+exit 1;
+fi
+
+
+if [ ! $dict  -o ! -e $dict  ]; then
+echo "Need to specify dictionary file of the model" >> $logfile 2>&1
+exit 1;
+fi
+
+if [ $unigram ]; then
+$bin/plsa -inf="$data" -d=$dict -m=$model -hf=hfff.out$$ -t=$topics -it=$iter -wof=$unigram >> $logfile 2>&1
+rm hfff.out$$
+
+else  #topic distribution
+
+#check tmpdir
+tmpdir_created=0;
+if [ ! -d $tmpdir ]; then
+echo "Creating temporary working directory $tmpdir" >> $logfile 2>&1
+mkdir -p $tmpdir;
+tmpdir_created=1;
+else
+echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1
+rm $tmpdir/* 2> /dev/null
+if [ $? != 0 ]; then
+echo "Warning: some temporary files could not be removed" >> $logfile 2>&1
+fi
+fi
+
+#####
+echo split documents >> $logfile 2>&1
+$bin/plsa -c="$data" -d=$dict -b=$tmpdir/data -sd=$splits >> $logfile 2>&1
+
+machine=`uname -s`
+if [ $machine == "Darwin" ] ; then
+splitlist=`jot - 1 $splits`
+else
+splitlist=`seq 1 1 $splits`
+fi
+
+#rm $tmpdir/Tlist
+for sp in $splitlist ; do echo $tmpdir/data.T.$sp >> $tmpdir/Tlist 2>&1; done
+#rm $model
+
+for sp in $splitlist ; do
+(date; echo split $sp )>> $logfile 2>&1
+
+$bin/plsa -inf=$tmpdir/data.$sp -d=$dict -hf=$tmpdir/data.H.$sp -m=$model -t=$topics -it=$iter -tof=$tmpdir/topic.$sp >> $logfile 2>&1 &
+
+done
+while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done
+
+(date; echo recombination ) >> $logfile 2>&1
+
+> $outtopic #truncate/create output file (a plain "echo" would add a spurious empty line)
+for sp in $splitlist ; do  #makes sure that 1 < 2 < ... < 11 ...
+cat $tmpdir/topic.$sp >> $outtopic
+done
+
+(date; echo End of inference) >> $logfile 2>&1
+
+echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1
+rm $tmpdir/* 2> /dev/null
+
+if [ $tmpdir_created -eq 1 ]; then
+echo "Removing temporary directory $tmpdir" >> $logfile 2>&1
+rmdir $tmpdir 2> /dev/null
+if [ $? != 0 ]; then
+echo "Warning: the temporary directory could not be removed." >> $logfile 2>&1
+fi
+fi
+
+fi
+fi
+
+
+exit 0
+
+
diff --git a/scripts/qplsa.sh b/scripts/qplsa.sh
new file mode 100755
index 0000000..fad8765
--- /dev/null
+++ b/scripts/qplsa.sh
@@ -0,0 +1,183 @@
+#! /bin/bash
+
+sDir=$(cd $(dirname $0) ; /bin/pwd)
+
+#Task data
+bin=/hltsrv0/federico/plsa/bin
+wdir=/panfs/panfem/test-hlt/federico/plsa/CC
+#/hltsrv0/federico/plsa/ted
+ldir=/scratch/federico
+
+data=doc_en.00.bin
+dict=ted.dict
+
+#ted-en
+topics=150
+iter=2
+prunefreq=5
+spectopics=500
+Tlist=$wdir/tlist
+splits=2
+model=model.$splits
+txtfile=Wfile.$splits
+
+#parameters
+numSlots=1-3
+ram=10G
+qL=bld.q,bld-ib.q
+
+#Preparation phase
+jName=PLSA.PRE
+
+#preparation ends when tlist is prepared
+rm -f $Tlist
+jName=PLSA.TRAIN
+
+range=`yes | head -n $splits | awk '{printf("%02d ",a);a++}'`
+iter=`seq 1 1 $iter| tr "\012" " "`
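+
+# Master/slave handshake (via files in the shared $wdir): the last SGE task
+# acts as master; for each split it writes a command list to taskfor_<sp> and
+# touches doit_<sp>. Each slave polls for its doit_ file, executes its
+# taskfor_ file, then removes the doit_ file; an "exit" command terminates it.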
+
+qsub -cwd -N $jName -j y -q $qL -l mf=$ram -t $numSlots -o $wdir/log -S /bin/bash <<EOF
+
+
+me=\`echo \$SGE_TASK_ID | awk '{printf("%02d",\$1-1)}'\`
+lastid=\`echo \$SGE_TASK_LAST | awk '{printf("%02d",\$1-1)}'\`
+
+(echo start ; date) > $wdir/monitor.\$SGE_TASK_ID
+echo
+
+if [[ ! -d $ldir ]]; then mkdir $ldir; fi
+
+#################################
+if [ \$me -eq \$lastid ]
+then
+(echo master starts ; uname -n ; date) > $wdir/monitor.\$SGE_TASK_ID
+
+#prepare Tlist file
+rm -f $Tlist
+for sp in $range; do 
+echo $wdir/$data.T.\$sp >> $Tlist
+done
+
+#tell slaves to copy and binarize data
+
+for sp in $range; do
+
+(echo cp $wdir/$data.\$sp.gz $wdir/$dict $ldir \; ;\
+echo $bin/plsa -c=\"gunzip -c $ldir/$data.\$sp.gz\" -d=$ldir/$dict -b=$ldir/$data.\$sp \; ;\
+echo rm $ldir/$data.\$sp.gz ) > $wdir/taskfor_\$sp
+touch $wdir/doit_\$sp
+done
+
+(echo master prepare ; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+#wait that all have finished
+while ls $wdir/doit_* &> /dev/null; do sleep 1; done
+
+(echo master start iteration ; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+for it in $iter; do
+for sp in $range; do
+
+(echo master iteration \$it split \$sp; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+echo tell slave to run an iteration
+(echo if [[ -e $wdir/$model ]] \; then cp $wdir/$model $ldir/$model \; fi ;
+ echo $bin/plsa -c=$ldir/$data.\$sp -d=$ldir/$dict -st=$spectopics -hf=$ldir/$data.H.\$sp -tf=$ldir/$data.T.\$sp -wf=$ldir/$model -m=$ldir/$model -t=$topics -it=1 -tit=\$it ;\
+echo cp $ldir/$data.T.\$sp $wdir ) > $wdir/taskfor_\$sp
+touch $wdir/taskfor_\$sp
+touch $wdir/doit_\$sp
+done
+
+(echo master start waiting \$it ; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+
+#echo wait that all have finished
+while ls $wdir/doit_* &> /dev/null; do
+(echo master waiting \$it ; date) >> $wdir/monitor.\$SGE_TASK_ID
+ls $wdir/doit_*
+sleep 1;
+done
+
+(echo master start recombination \$it ; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+echo recombine
+$bin/plsa -ct=$Tlist -c=dummy -d=$wdir/$dict -m=$wdir/$model -t=$topics -it=1 -txt=$wdir/$txtfile
+
+done
+
+
+(echo master tells slaves to remove data; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+echo tell slaves to remove their local data
+for sp in $range; do
+echo rm $ldir/$dict $ldir/$data.\$sp $ldir/$model > $wdir/taskfor_\$sp
+touch $wdir/taskfor_\$sp
+touch $wdir/doit_\$sp
+done
+echo wait that all have finished
+
+(echo master waits for slaves; date) >> $wdir/monitor.\$SGE_TASK_ID
+ 
+while ls $wdir/doit_* &> /dev/null; do sleep 1; done
+
+echo tell slaves to exit
+
+(echo master tells slaves to exit; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+for sp in $range; do
+echo exit > $wdir/taskfor_\$sp
+touch $wdir/taskfor_\$sp
+touch $wdir/doit_\$sp
+done
+
+(echo master waits for slaves; date) >> $wdir/monitor.\$SGE_TASK_ID
+while ls $wdir/doit_* &> /dev/null; do sleep 1; done
+
+(echo master ends; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+rm $wdir/$data.H* $wdir/$model $wdir/$data.T* $wdir/taskfor_*
+
+#############################
+else
+
+(echo slave starts ; uname -n ; date) > $wdir/monitor.\$SGE_TASK_ID
+
+while :
+do
+
+(echo slave \$me waits for job; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+touch $wdir
+
+if [[ -e $wdir/doit_\$me ]]; then
+
+cmd=\`cat $wdir/taskfor_\$me\`
+
+(echo slave \$me starts executing; echo \$cmd; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+if [[ \$cmd == *exit* ]]; then
+    #rm before cmd execution
+    rm $wdir/doit_\$me >& /dev/null
+    exit 0
+else
+    /bin/sh $wdir/taskfor_\$me
+    #rm after cmd execution
+    rm $wdir/doit_\$me >& /dev/null
+fi
+
+(echo slave ended executing; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+fi
+
+sleep 1
+
+done
+
+fi
+
+(echo end;uname -a; date) >> $wdir/monitor.\$SGE_TASK_ID
+
+exit 0
+
+EOF
+
diff --git a/scripts/rm-start-end.sh b/scripts/rm-start-end.sh
new file mode 100644
index 0000000..015e2ac
--- /dev/null
+++ b/scripts/rm-start-end.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+
+function usage()
+{
+    cmnd=$(basename $0);
+    cat<<EOF
+
+$cmnd - removes sentence start/end symbols
+
+USAGE:
+       $cmnd [options]
+
+OPTIONS:
+       -h        Show this message
+
+EOF
+}
+
+# Parse options
+while getopts h OPT; do
+    case "$OPT" in
+        h)
+            usage >&2;
+            exit 0;
+            ;;
+    esac
+done
+
+sed 's/<s>//g' | sed 's/<\/s>//g' | sed 's/^ *//' | sed 's/ *$//' | sed '/^$/d'
+
diff --git a/scripts/sort-lm.pl b/scripts/sort-lm.pl
new file mode 100755
index 0000000..63d7c23
--- /dev/null
+++ b/scripts/sort-lm.pl
@@ -0,0 +1,124 @@
+#! /usr/bin/perl
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2010 Marcello Federico, FBK-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+
+#******************************************************************************
+#Sorts the n-grams of an ARPA file in lexicographic order.
+#The inverted sorting option is a preliminary step for building a binary
+#lmtable with compile-lm with n-grams stored in reverse order.
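+#
+#E.g. for a 3-gram section (fields: prob w1 w2 w3 [bow]) the sort keys are
+#  direct:   -k 2,2 -k 3,3 -k 4,4
+#  inverted: -k 4,4 -k 3,3 -k 2,2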
+
+use strict;
+use Getopt::Long "GetOptions";
+
+my ($help,$ilm,$olm,$inv,$tmpdir)=();
+$help=0;
+
+$ilm="/dev/stdin";
+$olm="/dev/stdout";
+$tmpdir="$ENV{TMP}";
+
+&GetOptions('ilm=s' => \$ilm,
+			'olm=s' => \$olm,
+			'tmpdir=s' => \$tmpdir,
+            'inv' => \$inv,
+            'help' => \$help,);
+
+if ($help || !$ilm || !$olm){
+  print "sort-lm.pl [--ilm <fname>]  [--olm <fname>] [--inv]\n",
+  "-ilm  <fname>   input ARPA LM filename (default /dev/stdin)\n",
+  "-olm <fname>    output ARPA LM filename (default /dev/stdout)\n",
+  "-tmpdir         temporary directory for sorting (default is the enivronment variable TMP\n",
+  "-inv            inverted n-gram sort for compile-lm\n",
+  "-help           print these instructions\n";    
+  exit(1);
+}
+
+warn "temporary directory for sorting is $tmpdir\n";
+
+my $order=0;
+my $sortcmd="";
+
+$ENV{'LC_ALL'}='C';
+
+open (INP, "< $ilm") || die "cannot open input LM file: $ilm\n";
+open (OUT, "> $olm") || die "cannot open output LM file: $olm\n";
+
+
+warn "reading from standard input\n" if $ilm eq "/dev/stdin";
+warn "writing to standard output\n" if $olm eq "/dev/stdout";
+
+$_=<INP>;
+
+#sanity check
+die "Error: input cannot be an intermediate iARPA file. First convert it to ARPA format with compile-lm.\n" if 
+$_=~/^iARPA/;
+
+my $isQuantized=0;
+$isQuantized=1 if $_=~/^qARPA/;
+
+while(!/^\\end\\/){
+
+	
+	if (($order)=$_=~/^\\(\d+)-grams:/){
+		print(OUT $_);$_=<INP>;	
+		if ($isQuantized){
+			print(OUT $_); chop $_;#print centers
+			my $centers=$_; $_=<INP>;
+			warn "skip $centers centers\n";		
+			for (my $c=1;$c<=$centers;$c++){
+				print(OUT $_);$_=<INP>; 
+			}
+			
+		}
+		#sort command
+		#$sortcmd="sort -b"; #does not seem to work properly
+		$sortcmd="sort --temporary-directory=$tmpdir";
+		if ($inv){
+			warn "inverted sorting of $order-grams\n";
+			for (my $n=$order;$n>0;$n--){
+				$sortcmd.=" -k ".($n+1).",".($n+1);
+			}
+		}else{
+			warn "direct sorting of $order-grams\n";
+			for (my $n=1;$n<=$order;$n++){
+				$sortcmd.=" -k ".($n+1).",".($n+1);
+			}
+		}
+				
+		close(OUT);open (OUT,"|$sortcmd >> $olm");
+		
+		
+		do{ 
+			print(OUT $_);$_=<INP>;			
+				
+		}until (/^\\/ || /^\n/);
+		
+		close(OUT); open(OUT, ">> $olm");	
+		
+	}
+	else{
+		print(OUT $_);$_=<INP>;	
+	}
+	
+}
+
+print(OUT $_);
+
+close(INP);
+close(OUT);
diff --git a/scripts/split-dict.pl b/scripts/split-dict.pl
new file mode 100755
index 0000000..942d66e
--- /dev/null
+++ b/scripts/split-dict.pl
@@ -0,0 +1,157 @@
+#! /usr/bin/perl
+
+#*****************************************************************************
+# IrstLM: IRST Language Model Toolkit
+# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+
+#******************************************************************************
+
+#usage:
+#split-dict.pl <input> <output> <parts>
+#It splits the <input> dictionary into <parts> dictionaries
+#(named <output000>, ... <output999>)
+#splitting is balanced with respect to the frequencies in the <input> dictionary;
+#if frequencies are not available, a frequency of 1 is assumed for each word
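+#
+#Illustrative example: frequencies a:6 b:3 c:2 d:1 split into 2 parts
+#(threshold 12/2=6) give part 000={a,b} (freq 9) and part 001={c,d} (freq 3):
+#a part is closed only once its cumulated frequency exceeds the threshold.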
+
+use strict;
+use Getopt::Long "GetOptions";
+use File::Basename;
+
+my ($help,$input,$output,$parts)=();
+
+$help=1 unless
+&GetOptions('input=s' => \$input,
+            'output=s' => \$output, 
+             'parts=i' => \$parts,           
+             'h|help' => \$help,);
+
+if ($help || !$input || !$output || !$parts) {
+	my $cmnd = basename($0);
+  print "\n$cmnd - splits a dictionary into frequency-balanced partitions\n",
+	"\nUSAGE:\n",
+	"       $cmnd [options]\n",
+	"\nDESCRIPTION:\n",
+	"       $cmnd splits a dictionary into frequency-balanced partitions.\n",
+	"       The dictionary must be generated with IRSTLM command dict.\n",
+	"       If dictionary does not contain frequencies, then a frequency 1 is\n",
+	"       assumed for all words.\n",
+	"\nOPTIONS:\n",
+    "       --input <string>      input dictionary with frequencies\n",
+    "       --output <string>     prefix of output dictionaries\n",
+    "       --parts <int>         number of partitions to create\n",
+    "       -h, --help            (optional) print these instructions\n",
+    "\n";
+
+  exit(1);
+}
+
+
+
+my $freqflag=0;
+my ($w,$f,$globf,$thr);
+my (@D, at F,%S, at C);
+open(IN,"$input");
+
+chomp($_=<IN>);
+#if input is a dictionary.
+if (/^dictionary[ \t]+\d+[ \t]+\d+$/i){
+  my ($dummy,$size);
+  ($dummy,$dummy,$size)=split(/[ \t]+/,$_);
+  $freqflag=1 if /DICTIONARY/;
+}
+
+$globf=0;
+while(chomp($_=<IN>)){
+	if ($freqflag){
+		($w,$f)=split(/[ \t]+/,$_);
+	}
+	else{
+		$w=$_;
+		$f=1;
+	}
+	push @D, $w;
+	push @F, $f;
+  $globf+=$f;
+}
+close (IN);
+
+$thr=$globf/$parts;
+my $totf=0;
+print STDERR "Dictionary 0: (thr: $thr , $globf, $totf , $parts)\n";
+
+my $sfx=0;
+for (my $i=0;$i<=$#D;$i++){
+	
+# if the remaining words are less than or equal to 
+# the number of remaining sub-dictionaries to create
+# put only one word per each sub-dictionary.
+	if (($totf>0) && ($#D+1-$i) <= ($parts-1-$sfx)){
+# recompute threshold on the remaining global frequency
+# according to the number of remaining parts
+		$sfx++;
+		$globf-=$totf;
+		$thr=($globf)/($parts-$sfx);
+		print STDERR "Dictionary $sfx: (thr: $thr , $globf , $totf , ",($parts-$sfx),")\n";
+		$totf=0;
+	}
+
+	$totf+=$F[$i];
+	$w=$D[$i];
+	$S{$w}=$sfx;
+	$C[$sfx]++;
+	if ($totf>$thr){
+# recompute threshold on the remaining global frequency
+# according to the number of remaining parts
+		$sfx++;
+		$globf-=$totf;
+		$thr=($globf)/($parts-$sfx);
+		print STDERR "Dictionary $sfx: (thr: $thr , $globf , $totf , ",($parts-$sfx),")\n";
+		$totf=0;
+	}
+}
+
+
+my $oldsfx=-1;
+for (my $i=0;$i<=$#D;$i++){
+	$w=$D[$i];
+	$sfx="0000$S{$w}";
+	$sfx=~s/.+(\d{3})/$1/;
+	if ($sfx != $oldsfx){
+#print STDERR "opening $output$sfx\n";
+		close (OUT) if $oldsfx!= -1;
+		open(OUT,">$output$sfx");
+		if ($freqflag){
+			print OUT "DICTIONARY 0 $C[$sfx]\n";
+		}
+		else{
+			print OUT "dictionary 0 $C[$sfx]\n";
+		}
+		$oldsfx=$sfx;
+	}
+	if ($freqflag){
+		print OUT "$w $F[$i]\n";
+	}
+	else{
+		print OUT "$w\n";
+	}
+}
+close (OUT) if $oldsfx!= -1;
+
+my $numdict=$S{$D[$#D]}+1;
+die "Only $numdict dictionaries were crested instead of $parts!" if ($numdict != $parts);
+
diff --git a/scripts/split-ngt.sh b/scripts/split-ngt.sh
new file mode 100755
index 0000000..1072cdb
--- /dev/null
+++ b/scripts/split-ngt.sh
@@ -0,0 +1,89 @@
+#! /bin/bash
+
+function usage()
+{
+    cmnd=$(basename $0);
+    cat<<EOF
+
+$cmnd - creates partition files with ngram statistics in Google format
+
+USAGE:
+       $cmnd [options] <input> <output> <order> <parts>
+
+DESCRIPTION:
+       <input>   Input file name
+       <output>  Partition files name prefix
+       <order>   Order of the ngrams
+       <parts>   Number of partitions
+
+OPTIONS:
+       -h        Show this message
+
+EOF
+}
+
+# Parse options
+while getopts h OPT; do
+    case "$OPT" in
+        h)
+            usage >&2;
+            exit 0;
+            ;;
+        * ) usage;
+            exit 1;
+						;;
+    esac
+done
+
+#usage:
+#split-ngt.sh [options] <input> <output> <order> <parts>
+#It creates <parts> files (named <output.000>, ... <output.999>)
+#containing ngram statistics (of <order> length) in Google format
+#These files are a partition of the whole set of ngrams
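+#
+#Example (hypothetical file names):
+#  split-ngt.sh corpus.txt goog3 3 4
+#creates goog3.000 ... goog3.003 with 3-gram statistics in Google format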
+
+basedir=$IRSTLM
+bindir=$basedir/bin
+scriptdir=$basedir/scripts
+
+unset par
+while [ $# -gt 0 ]
+do
+   echo "$0: arg $1"
+   par[${#par[@]}]="$1"
+   shift
+done
+
+inputfile=${par[0]}
+outputfile=${par[1]}
+order=${par[2]}
+parts=${par[3]}
+
+dictfile=dict$$
+
+
+echo "Extracting dictionary from training corpus"
+$bindir/dict -i="$inputfile" -o=$dictfile -f=y -sort=n
+
+echo "Splitting dictionary into $parts lists"
+$scriptdir/split-dict.pl --input $dictfile --output ${dictfile}. --parts $parts
+
+rm $dictfile
+
+
+echo "Extracting n-gram statistics for each word list"
+echo "Important: dictionary must be ordered according to order of appearance of words in data"
+echo "used to generate n-gram blocks,  so that sub language model blocks results ordered too"
+
+for d in `ls ${dictfile}.*` ; do
+w=`echo $d | perl -pe 's/.+(\.[0-9]+)$/$1/i'`
+w="$outputfile$w"
+
+sdict=`basename $d`
+echo "Extracting n-gram statistics for $sdict"
+
+echo "$bindir/ngt -i="$inputfile"  -n=$order -gooout=y -o=$w -fd=$d  > /dev/null"
+$bindir/ngt -n=$order -gooout=y -o=$w -fd=$d -i="$inputfile"  > /dev/null
+rm $d
+done
+
+exit 0
diff --git a/scripts/wrapper b/scripts/wrapper
new file mode 100644
index 0000000..2b2754c
--- /dev/null
+++ b/scripts/wrapper
@@ -0,0 +1,10 @@
+#! /bin/sh
+
+#set machine type for compilation
+MY_ARCH=`uname -m`
+
+name=`basename $0`
+dir=`dirname $0`"/$MY_ARCH"
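+
+#e.g. when installed as bin/ngt on an x86_64 machine, this dispatches
+#to bin/x86_64/ngt with the original arguments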
+
+$dir/$name "$@" 
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git