[irstlm] 23/78: enabled a new context-dependent LM model based on a correction factor of basic LM probabilities

Giulio Paci giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:47:02 UTC 2016


This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to tag adaptiveLM.v0.10
in repository irstlm.

commit 3d48f1353cceebade3677102ffc0cc218528f02c
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date:   Mon Nov 9 00:45:36 2015 +0100

    enabled a new context-dependent LM model based on a correction factor of basic LM probabilities
---
 src/CMakeLists.txt                   |    5 +-
 src/Makefile.am                      |   13 +-
 src/context-dependent-evaluation.cpp | 1098 ++++++++++++++++++++++++++++++++++
 src/context-similarity.cpp           |  559 +++++++++++++++++
 src/context-similarity.h             |  138 +++++
 src/lmContainer.cpp                  |    8 +
 6 files changed, 1817 insertions(+), 4 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cef6d31..2717ce9 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -6,6 +6,7 @@ ADD_DEFINITIONS("-D_LARGE_FILES")
 ADD_DEFINITIONS("-D_FILE_OFFSET_BITS=64")
 ADD_DEFINITIONS("-DMYCODESIZE=3")
 ADD_DEFINITIONS("-DDEBUG")
+#ADD_DEFINITIONS("-DTRACE_LEVEL=1")
 
 if (TRACE_LEVEL)
   ADD_DEFINITIONS("-DTRACE_LEVEL=${TRACE_LEVEL}")
@@ -62,12 +63,14 @@ SET( LIB_IRSTLM_SRC
         cplsa.h cplsa.cpp 
         cswam.h cswam.cpp 
         doc.h doc.cpp
+        lmContextDependent.h lmContextDependent.cpp
+        context-similarity.h context-similarity.cpp
 )
 
 ADD_LIBRARY(irstlm STATIC ${LIB_IRSTLM_SRC})
 LINK_DIRECTORIES (${LIBRARY_OUTPUT_PATH})
 
-FOREACH(CMD dict ngt tlm dtsel plsa cswa compile-lm interpolate-lm prune-lm quantize-lm score-lm verify-caching)
+FOREACH(CMD dict ngt tlm dtsel plsa cswa compile-lm interpolate-lm prune-lm quantize-lm score-lm verify-caching context-dependent-evaluation)
 
 ADD_EXECUTABLE(${CMD} ${CMD}.cpp)
 TARGET_LINK_LIBRARIES (${CMD} irstlm -lm -lz -lpthread)
diff --git a/src/Makefile.am b/src/Makefile.am
index 6169e64..6f54075 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -32,7 +32,9 @@ libirstlm_la_HEADERS = \
   shiftlm.h \
   cplsa.h \
   cswam.h \
-  doc.h
+  doc.h \
+  lmContextDependent.h \
+  context-similarity.h
 
 libirstlm_la_SOURCES = \
   cmd.c \
@@ -60,7 +62,9 @@ libirstlm_la_SOURCES = \
   shiftlm.cpp \
   cplsa.cpp \
   cswam.cpp \
-  doc.cpp
+  doc.cpp \
+  lmContextDependent.cpp \
+  context-similarity.cpp
 
 CLEANFILES = $(BUILT_SOURCES)
 
@@ -69,7 +73,8 @@ libirstlm_la_LIBADD = $(BOOST_LDFLAGS) $(BOOST_THREAD_LIB)
 LDADD = -lirstlm -lpthread
 DEPENDENCIES = libirstlm.la
 
-bin_PROGRAMS = dict ngt dtsel compile-lm interpolate-lm prune-lm quantize-lm prune-lm score-lm tlm plsa verify-caching cswa
+bin_PROGRAMS = dict ngt dtsel compile-lm interpolate-lm prune-lm quantize-lm score-lm tlm plsa verify-caching cswa context-dependent-evaluation
+
 dict_SOURCES = dict.cpp
 dict_DEPENDENCIES = $(DEPENDENCIES)
 ngt_SOURCES = ngt.cpp
@@ -94,3 +99,5 @@ verify_caching_SOURCES = verify-caching.cpp
 verify_caching_DEPENDENCIES = $(DEPENDENCIES)
 cswa_SOURCES = cswa.cpp
 cswa_DEPENDENCIES = $(DEPENDENCIES)
+context_dependent_evaluation_SOURCES = context-dependent-evaluation.cpp
+context_dependent_evaluation_DEPENDENCIES = $(DEPENDENCIES)
diff --git a/src/context-dependent-evaluation.cpp b/src/context-dependent-evaluation.cpp
new file mode 100644
index 0000000..0b32f27
--- /dev/null
+++ b/src/context-dependent-evaluation.cpp
@@ -0,0 +1,1098 @@
+// $Id: compile-lm.cpp 3677 2010-10-13 09:06:51Z bertoldi $
+
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit, context-dependent evaluation
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+ 
+ ******************************************************************************/
+
+
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <string>
+#include <stdlib.h>
+#include "cmd.h"
+#include "util.h"
+#include "math.h"
+#include "lmContainer.h"
+#include "lmContextDependent.h"
+
+using namespace std;
+using namespace irstlm;
+
+typedef std::pair<double,int> double_and_int_pair;
+
+struct cmp_double_and_int_pair {
+	//order first by the first field (double), and in case of equality by the second field (int)
+	bool operator()(const double_and_int_pair& a, const double_and_int_pair& b) const {
+		if (a.first < b.first){
+			return true;
+		}else if (a.first > b.first){
+			return false;
+		}else{
+			if (a.second<b.second){
+				return true;
+			}else{
+				return false;
+			}
+		}
+	}
+};
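+
+// Note: this ordering is the same as the default lexicographic operator< of
+// std::pair<double,int>; an equivalent C++11 one-liner would be:
+//   return std::tie(a.first, a.second) < std::tie(b.first, b.second);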
+
+typedef std::map<int, double_and_int_pair> int_to_double_and_int_map;
+//typedef std::map<double_and_int_pair,int,cmp_double_and_int_pair> double_and_int_to_int_map;
+typedef std::map<double_and_int_pair,double_and_int_pair,cmp_double_and_int_pair> double_and_int_to_double_and_int_map;
+
+/********************************/
+void print_help(int TypeFlag=0){
+  std::cerr << std::endl << "context-dependent-evaluation - compute ngram probabilities and text perplexity given a LM" << std::endl;
+  std::cerr << std::endl << "USAGE:"  << std::endl;
+	std::cerr << "       context-dependent-evaluation [options] lm=<input-file.lm>" << std::endl;
+	std::cerr << std::endl << "DESCRIPTION:" << std::endl;
+	std::cerr << "       context-dependent-evaluation uses the given LM to compute ngram probabilities and text perplexity of the input" << std::endl;
+	std::cerr << "       The LM must be in an IRSTLM-compliant format" << std::endl;
+	std::cerr << std::endl << "OPTIONS:" << std::endl;
+	
+	FullPrintParams(TypeFlag, 0, 1, stderr);
+}
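+
+// Example invocation (hypothetical file names; the key=value syntax follows
+// the USAGE line above, and the option names come from DeclareParams below):
+//   context-dependent-evaluation lm=model.lm test=eval.txt contextbasedscore=true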
+
+void usage(const char *msg = 0){
+  if (msg) {
+    std::cerr << msg << std::endl;
+  }
+	if (!msg){
+		print_help();
+	}
+}
+
+void load_lexicon(const char* lexfile, std::multimap< std::string, std::string >& lexicon){
+	if (lexfile!= NULL) {
+		fstream inp(lexfile,ios::in|ios::binary);
+		std::string w1, w2;
+		while (inp >> w1 >> w2){
+			lexicon.insert(make_pair(w1,w2));
+		}
+	}
+}
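+
+// Assumed lexicon file format, inferred from the parsing loop above: one
+// whitespace-separated word pair per line, e.g.
+//   casa house
+//   casa home
+// The same source word may occur on several lines (hence the multimap).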
+
+int main(int argc, char **argv)
+{	
+  char *testfile=NULL;
+  char *lmfile=NULL;
+	
+	bool sent_flag = false;
+	bool contextbasedscore = false;
+	bool topicscore = false;
+	bool rankscore = false;
+	bool context_model_active = true;
+	bool context_model_normalization = false;
+  char *lexiconfile=NULL;
+	
+	bool add_lexicon_words = false;
+	bool add_lm_words = false;
+	bool add_sentence_words = false;
+	bool add_full_dictionary = false;
+	int successor_limit=100;
+	
+	int debug = 0;
+  int requiredMaxlev = 1000;
+  int dub = 10000000;
+  int randcalls = 0;
+  float ngramcache_load_factor = 0.0;
+  float dictionary_load_factor = 0.0;
+	
+  bool help=false;
+	
+	DeclareParams((char*)
+								"lm", CMDSTRINGTYPE|CMDMSG, &lmfile, "LM to load",
+								"test", CMDSTRINGTYPE|CMDMSG, &testfile, "computes scores of the specified text file",
+								"lexicon", CMDSTRINGTYPE|CMDMSG, &lexiconfile, "lexicon file containing associated words (required by rankscore)",
+                "randcalls", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file",
+								"r", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file",
+                "contextbasedscore", CMDBOOLTYPE|CMDMSG, &contextbasedscore, "computes context-dependent probabilities and pseudo-perplexity of the text from standard input",
+                "topicscore", CMDBOOLTYPE|CMDMSG, &topicscore, "computes the topic scores of the text from standard input",
+                "rankscore", CMDBOOLTYPE|CMDMSG, &rankscore, "computes the average rank position of the text from standard input",
+								"debug", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
+								"d", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
+                "level", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken",
+								"l", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken",
+                "dub", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7",
+                "sentence", CMDBOOLTYPE|CMDMSG, &sent_flag, "computes perplexity at sentence level (identified through the end symbol)",
+                "dict_load_factor", CMDFLOATTYPE|CMDMSG, &dictionary_load_factor, "sets the load factor for the dictionary; it should be a positive real value; default is 0",
+                "ngram_load_factor", CMDFLOATTYPE|CMDMSG, &ngramcache_load_factor, "sets the load factor for the ngram cache; it should be a positive real value; default is 0",
+                "context_model_active", CMDBOOLTYPE|CMDMSG, &context_model_active, "enable/disable context-dependent model (default is true)",
+                "context_model_normalization", CMDBOOLTYPE|CMDMSG, &context_model_normalization, "enable/disable normalization of context-dependent model (default is false)",
+                "add_lexicon_words", CMDBOOLTYPE|CMDMSG, &add_lexicon_words, "enable/disable addition of the words in the lexicon into the alternatives (default is false)",
+                "add_lm_words", CMDBOOLTYPE|CMDMSG, &add_lm_words, "enable/disable addition of the unigram/bigram successors into the alternatives (default is false)",
+                "add_sentence_words", CMDBOOLTYPE|CMDMSG, &add_sentence_words, "enable/disable addition of the words of the current sentence into the alternatives (default is false)",
+                "add_full_dictionary", CMDBOOLTYPE|CMDMSG, &add_full_dictionary, "enable/disable addition of all words of the LM dictionary into the alternatives (default is false)",
+								"successor_limit", CMDINTTYPE|CMDMSG, &successor_limit, "threshold to decide whether to add the unigram/bigram successors into the alternatives (default is 100)",
+								
+								"Help", CMDBOOLTYPE|CMDMSG, &help, "print this help",
+								"h", CMDBOOLTYPE|CMDMSG, &help, "print this help",
+								
+                (char*)NULL
+								);
+	
+	if (argc == 1){
+		usage();
+		exit_error(IRSTLM_NO_ERROR);
+	}
+	
+	GetParams(&argc, &argv, (char*) NULL);
+	
+	if (help){
+		usage();
+		exit_error(IRSTLM_NO_ERROR);
+	}	
+	
+	if (lmfile == NULL) {
+		usage();
+		exit_error(IRSTLM_ERROR_DATA,"Warning: Please specify a LM file to read from");
+	}
+	
+	if (testfile == NULL) {
+		usage();
+		exit_error(IRSTLM_NO_ERROR);
+	}
+	
+	if (lmfile!=NULL) VERBOSE(1, "lmfile: " << lmfile << std::endl);
+  if (testfile!=NULL) VERBOSE(1, "testfile: " << testfile << std::endl);
+	if (lexiconfile != NULL){
+		VERBOSE(1, "lexicon: " << lexiconfile << std::endl);
+	}
+  VERBOSE(1, "contextbasedscore: " << contextbasedscore << std::endl);
+  VERBOSE(1, "topicscore: " << topicscore << std::endl);
+  VERBOSE(1, "rankscore: " << rankscore << std::endl);
+	
+	VERBOSE(1,"add_lexicon_words: " << add_lexicon_words << std::endl);
+	VERBOSE(1,"add_lm_words: " << add_lm_words << " successor_limit:" << successor_limit<< std::endl);
+	VERBOSE(1,"add_sentence_words: " << add_sentence_words << std::endl);
+	
+  std::cerr << "loading up to the LM level " << requiredMaxlev << " (if any)" << std::endl;
+  std::cerr << "dub: " << dub<< std::endl;
+	
+	
+	if (topicscore == true) {
+		VERBOSE(0, "NOT SUPPORTED" << std::endl);
+		return 0;
+	}
+	
+  //checking the language model type
+  std::string infile(lmfile);
+	
+  lmContainer* lmt = lmContainer::CreateLanguageModel(infile,ngramcache_load_factor,dictionary_load_factor);
+	
+  lmt->setMaxLoadedLevel(requiredMaxlev);
+	
+  lmt->load(infile);
+	((lmContextDependent*) lmt)->set_Active(context_model_active);
+	((lmContextDependent*) lmt)->set_Normalized(context_model_normalization);
+	
+  if (dub) lmt->setlogOOVpenalty((int)dub);
+	
+  //use caches to save time (only if PS_CACHE_ENABLE is defined through compilation flags)
+  lmt->init_caches(lmt->maxlevel());
+	
+	//read lexicon from file
+	
+	std::multimap< std::string, std::string > lexicon;
+	if (add_lexicon_words){
+		if (lexiconfile != NULL) {
+			load_lexicon(lexiconfile, lexicon);
+		}else{
+			VERBOSE(1, "You did not set any lexicon, but you activated parameter \"--add_lexicon_words\". This is formally correct; perhaps you intend to pass the lexicon through the input. Please check that your settings are correct." << std::endl);
+		}
+	}else{
+		if (lexiconfile != NULL){
+			VERBOSE(1, "You set a lexicon, but you did not activate parameter \"--add_lexicon_words\"; hence, words in the lexicon are not used as alternatives." << std::endl);
+		}
+	}
+	/*
+	if (std::string lexiconfile!= NULL) {
+		fstream inp(lexiconfile,ios::in|ios::binary);
+		std::string w1, w2;
+		while (inp >> w1 >> w2){
+			lexicon.insert(make_pair(w1,w2));
+		}
+		add_lexicon_words=true;
+	}
+	*/
+	
+	if (topicscore == true) {
+		if (lmt->getLanguageModelType() != _IRSTLM_LMCONTEXTDEPENDENT) {
+			exit_error(IRSTLM_ERROR_DATA, "This type of score is not available for the LM loaded");
+		}
+		if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) {
+			debug = (debug>4)?4:debug;
+			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+		}
+		if (lmt->getLanguageModelType() == _IRSTLM_LMMACRO) {
+			debug = (debug>4)?4:debug;
+			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+		}
+		if (lmt->getLanguageModelType() == _IRSTLM_LMCLASS) {
+			debug = (debug>4)?4:debug;
+			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+		}
+		
+		std::cerr << "Start Topic Score generation " << std::endl;
+		std::cerr << "OOV code: " << lmt->getDict()->oovcode() << std::endl;
+		std::cout.setf(ios::fixed);
+		std::cout.precision(2);
+		
+		std::fstream inptxt(testfile,std::ios::in);
+		
+		// loop over input lines
+		char line[MAX_LINE];
+		while (inptxt.getline(line,MAX_LINE)) {
+			
+			std::string line_str = line;
+			
+			VERBOSE(2,"input_line:|" << line_str << "|" << std::endl);
+			
+			//getting sentence string;
+			std::string tmp_sentence;
+			std::string sentence;
+			std::string context;
+			std::string sentence_lexiconfile;
+			
+			//remove lexicon string from the input, even if it is not used at all for this type of score
+			((lmContextDependent*) lmt)->GetSentenceAndLexicon(tmp_sentence,sentence_lexiconfile,line_str);
+			bool withContext = lmt->GetSentenceAndContext(sentence,context,tmp_sentence);
+			
+			//getting apriori topic weights
+			topic_map_t apriori_topic_map;
+			if (withContext){
+				lmt->setContextMap(apriori_topic_map,context);
+			}
+			// computation using std::string
+			// loop over ngrams of the sentence
+			string_vec_t word_vec;
+			split(sentence, ' ', word_vec);
+			
+			//first points to the least recent term to take into account
+			//last points to the position after the most recent term to take into account
+			//last could point outside the vector of strings; do NOT use word_vec.at(last)
+			size_t last, first;
+			size_t size=0;
+			size_t order = lmt->maxlevel();
+			
+			topic_map_t sentence_topic_map;
+			for (size_t i=0; i<word_vec.size(); ++i){
+				++size;
+				size=(size<order)?size:order;
+				last=i+1;
+				
+				// reset ngram at begin of sentence
+				if (word_vec.at(i) == lmt->getDict()->BoS()) {
+					size=1;
+					continue;
+				}
+				first = last - size;
+				
+				string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
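+				// Example: with order==3 and i==4 (and no BoS reset), size==3,
+				// last==5, first==2, so tmp_word_vec spans positions 2..4.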
+				
+				if (size>=1) {
+					VERBOSE(2,"computing topic_scores for first:|" << first << "| and last:|" << last << "|" << std::endl);	
+					
+					topic_map_t tmp_topic_map;
+					((lmContextDependent*) lmt)->getContextSimilarity()->get_topic_scores(tmp_word_vec, tmp_topic_map);
+					IFVERBOSE(2){
+						VERBOSE(2,"before normalization word-based topic-distribution:");
+						((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map);
+					}
+					((lmContextDependent*) lmt)->getContextSimilarity()->normalize_topic_scores(tmp_topic_map);
+					IFVERBOSE(2){
+						VERBOSE(2,"after normalization word-based topic-distribution:");
+						((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map);
+					}
+					VERBOSE(2,"first:" << first << " last:" << last <<  " tmp_topic_map.size:" << tmp_topic_map.size() << std::endl);
+					
+					((lmContextDependent*) lmt)->getContextSimilarity()->add_topic_scores(sentence_topic_map, tmp_topic_map);
+					IFVERBOSE(2){
+						VERBOSE(2,"word-based topic-distribution:");
+						((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map,apriori_topic_map,1);
+					}
+					tmp_topic_map.clear();
+				}
+			}
+			IFVERBOSE(2){
+				VERBOSE(2,"sentence-based topic-distribution:");
+				((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map,apriori_topic_map,last);
+			}
+			std::cout << sentence << ((lmContextDependent*) lmt)->getContextDelimiter();
+			((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map);	
+			apriori_topic_map.clear();
+		}
+		
+		delete lmt;
+		return 0;
+	}
+  if (contextbasedscore == true) {
+		
+		if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) {
+			debug = (debug>4)?4:debug;
+			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+		}
+		if (lmt->getLanguageModelType() == _IRSTLM_LMMACRO) {
+			debug = (debug>4)?4:debug;
+			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+		}
+		if (lmt->getLanguageModelType() == _IRSTLM_LMCLASS) {
+			debug = (debug>4)?4:debug;
+			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+		}
+		std::cerr << "Start ContextBased Evaluation" << std::endl;
+		std::cerr << "OOV code: " << lmt->getDict()->oovcode() << std::endl;
+		std::cout.setf(ios::fixed);
+		std::cout.precision(2);
+		
+		int Nw=0,Noov=0;
+		double logPr=0,PP=0,PPwp=0,current_Pr;
+		double norm_logPr=0,norm_PP=0,norm_PPwp=0,norm_Pr;
+		double model_logPr=0,model_PP=0,model_PPwp=0,model_Pr;
+		double model_norm_logPr=0,model_norm_PP=0,model_norm_PPwp=0,model_norm_Pr;
+		int current_dict_alternatives = 0;
+		
+		double bow;
+		int bol=0;
+		char *msp;
+		ngram_state_t msidx;
+		unsigned int statesize;
+		
+		// variables for storing sentence-based Perplexity
+		int sent_Nw=0,sent_Noov=0;
+		double sent_logPr=0,sent_PP=0,sent_PPwp=0;	
+		double sent_norm_logPr=0,sent_norm_PP=0,sent_norm_PPwp=0;		
+		double sent_model_logPr=0,sent_model_PP=0,sent_model_PPwp=0;		
+		double sent_model_norm_logPr=0,sent_model_norm_PP=0,sent_model_norm_PPwp=0;		
+		int sent_current_dict_alternatives = 0;
+		
+		double oovpenalty = lmt->getlogOOVpenalty();
+		double norm_oovpenalty = oovpenalty;
+		
+		VERBOSE(1,"oovpenalty:" << oovpenalty  << std::endl);	
+		
+		std::fstream inptxt(testfile,std::ios::in);
+		
+		// loop over input lines
+		char line[MAX_LINE];
+		while (inptxt.getline(line,MAX_LINE)) {
+			
+			std::string line_str = line;
+			
+			VERBOSE(1,"input_line:|" << line_str << "|" << std::endl);	
+			
+			//getting sentence string;
+			std::string tmp_sentence;
+			std::string sentence;
+			std::string context;
+			std::string sentence_lexiconfile;
+			
+			bool withLexicon = ((lmContextDependent*) lmt)->GetSentenceAndLexicon(tmp_sentence,sentence_lexiconfile,line_str);
+			bool withContext = lmt->GetSentenceAndContext(sentence,context,tmp_sentence);
+			
+			//getting apriori topic weights
+			topic_map_t apriori_topic_map;
+			if (withContext){
+				((lmContextDependent*) lmt)->setContextMap(apriori_topic_map,context);
+			}
+			// computation using std::string
+			// loop over ngrams of the sentence
+			string_vec_t word_vec;
+			split(sentence, ' ', word_vec);
+			
+			//first points to the least recent term to take into account
+			//last points to the position after the most recent term to take into account
+			//last could point outside the vector of strings; do NOT use word_vec.at(last)
+			size_t last, first;
+			size_t order = lmt->maxlevel();
+			
+			//start the computation from the second word because the first is the BoS symbol, but include BoS in the ngrams
+			size_t size=0;
+			for (size_t i=0; i< word_vec.size(); ++i){
+				++size;
+				size=(size<order)?size:order;
+				last=i+1;
+				
+				// reset ngram at begin of sentence
+				if (word_vec.at(i) == lmt->getDict()->BoS()) {
+					size=1;
+					continue;
+				}
+				first = last - size;
+				
+				string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
+			
+				if (size>=1) {
+					VERBOSE(2,"computing prob for first:|" << first << "| and last:|" << last << "|" << std::endl);
+					
+					VERBOSE(2,"word_vec.at(i):|" << word_vec.at(i) << "| lmt->getDict()->OOV():|" << lmt->getDict()->OOV() << "|" << std::endl);
+					if (lmt->getDict()->encode(word_vec.at(i).c_str()) == lmt->getDict()->oovcode()) {
+						Noov++;
+						sent_Noov++;
+					}
+					Nw++;
+					sent_Nw++;
+					
+					if ((Nw % 100000)==0) {
+						std::cerr << ".";
+						lmt->check_caches_levels();
+					}
+					
+					
+					VERBOSE(2,"tmp_word_vec.size:|" << tmp_word_vec.size() << "|" << std::endl);	
+					VERBOSE(2,"dict.size:|" << lmt->getDict()->size() << "|" << std::endl);	
+						
+					if (withContext){
+						current_Pr = lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msidx, &msp, &statesize);
+					}else{
+						current_Pr = lmt->clprob(tmp_word_vec, &bow, &bol, &msidx, &msp, &statesize);
+					}
+					/*
+					 double tot_pr = 0.0;
+					 if (context_model_normalization){
+					 tot_pr = ((lmContextDependent*) lmt)->total_clprob(tmp_word_vec, apriori_topic_map);
+					 }
+					 */
+					
+					//					string_vec_t::iterator it=tmp_word_vec.end()-1;
+					int current_pos = tmp_word_vec.size()-1;
+					std::string current_word = tmp_word_vec.at(current_pos);
+					
+					//loop over a set of selected alternative words
+					//populate the dictionary with all words associated with the current word
+					
+					dictionary* current_dict;
+					if (add_full_dictionary){
+						//loop over all words in the LM
+						current_dict = lmt->getDict();
+					}else{
+						current_dict = new dictionary((char *)NULL,1000000);
+					}
+					current_dict->incflag(1);
+					
+					current_dict->encode(current_word.c_str());
+					
+					VERBOSE(2,"after current word current_dict->size:" << current_dict->size() << std::endl);
+					
+					//add words from the lexicon
+					if (add_lexicon_words){
+						
+						if (withLexicon){
+							lexicon.clear();
+							load_lexicon(sentence_lexiconfile.c_str(), lexicon);
+						}
+												
+						std::pair <std::multimap< std::string, std::string>::iterator, std::multimap< std::string, std::string>::iterator> ret = lexicon.equal_range(current_word);
+						for (std::multimap<std::string, std::string>::const_iterator it=ret.first; it!=ret.second; ++it)
+						{
+							current_dict->encode((it->second).c_str());
+							/*
+							 //exclude the current word from the selected alternative words
+							 if (current_word != (it->second).c_str()){
+							 current_dict->encode((it->second).c_str());
+							 }
+							 */
+						}
+					}
+					VERBOSE(2,"after add_lexicon_words current_dict->size:" << current_dict->size() << std::endl);
+					
+					if (add_lm_words){
+						bool succ_flag=false;
+						ngram hg(lmt->getDict());
+						
+						if (size==1) {
+							hg.pushw(lmt->getDict()->BoS());
+							hg.pushc(0);
+							
+							lmt->get(hg,hg.size,hg.size-1);
+							VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+							if (hg.succ < successor_limit){
+								succ_flag=true;
+							}else{
+								VERBOSE(3,"successors are not added into the alternatives because they are too many" << std::endl);	
+							}
+						}else if (size>=2) {
+							hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-2));
+							hg.pushc(0);
+							
+							lmt->get(hg,hg.size,hg.size-1);
+							VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+							if (hg.succ < successor_limit){
+								succ_flag=true;
+							}else{
+								VERBOSE(3,"successors are not added into the alternatives because they are too many" << std::endl);	
+							}
+							
+							if (!succ_flag && size>=3){
+								hg.size=0;
+								hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-3));
+								hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-2));
+								hg.pushc(0);
+								
+								lmt->get(hg,hg.size,hg.size-1);
+								VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+								
+								if (hg.succ < successor_limit){
+									succ_flag=true;
+								}else{
+									VERBOSE(3,"successors are not added into the alternatives because they are too many" << std::endl);	
+								}
+							}
+						}
+						
+						
+						if (succ_flag){
+							ngram ng=hg;
+							lmt->succscan(hg,ng,LMT_INIT,ng.size);	
+							while(lmt->succscan(hg,ng,LMT_CONT,ng.size)) {
+								current_dict->encode(ng.dict->decode(*ng.wordp(1)));
+							}
+						}
+						
+					}
+					VERBOSE(2,"after add_lm_words current_dict->size:" << current_dict->size() << std::endl);
+					
+					if (add_sentence_words){
+						for (string_vec_t::const_iterator it=word_vec.begin(); it!=word_vec.end(); ++it)
+						{
+							current_dict->encode(it->c_str());
+						}
+					}
+					current_dict->incflag(0);
+					VERBOSE(2,"after add_sentence_words current_dict->size:" << current_dict->size() << std::endl);
+					
+					sent_current_dict_alternatives += current_dict->size();
+					current_dict_alternatives += current_dict->size();
+					
+					VERBOSE(2,"current_dict->size:" << current_dict->size() << std::endl);
+					for (int h=0;h<current_dict->size();++h){
+						VERBOSE(3,"h:" << h << " w:|" << current_dict->decode(h) << "|" << std::endl);
+					}
+					
+					//the first word in current_dict is always the current_word; hence we can skip it during the scan 
+					//variables for the computation of the oracle probability, i.e. the maximum prob
+					//double best_pr = -1000000.0;
+					//int best_code = lmt->getlogOOVpenalty();
+					double best_pr = current_Pr;
+					int best_code = 0;
+					//variables for the computation of the mass probability related to the current word, i.e. the sum of the probs for all words associated with the current word
+					double current_tot_pr = pow(10.0,current_Pr);
+					//					for (int j=0; j<current_dict->size(); ++j){
+					for (int j=1; j<current_dict->size(); ++j)
+					{
+						//loop over all words in the LM
+					  tmp_word_vec.at(current_pos) = current_dict->decode(j);
+						IFVERBOSE(3){
+							std::cout << "tmp_word_vec j:" << j;
+							for (string_vec_t::const_iterator it2=tmp_word_vec.begin(); it2!=tmp_word_vec.end(); ++it2) {
+								std::cout << " |" << (*it2) << "|";
+							}
+							std::cout << std::endl;
+						}				
+						
+						double pr;
+						if (withContext){
+							pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msidx, &msp, &statesize, NULL, NULL);
+						}else{
+							pr=lmt->clprob(tmp_word_vec, &bow, &bol, &msidx, &msp, &statesize, NULL, NULL);
+						}
+						current_tot_pr += pow(10.0,pr);
+						if (best_pr < pr){
+							best_pr = pr;
+							best_code = j;
+							VERBOSE(3,"current_best:" << best_code << " current_word:|" << current_dict->decode(best_code) << "| best_prob:" << pow(10.0,best_pr) << " norm_best_prob:" << pow(10.0,best_pr - current_tot_pr) << std::endl);
+						}
+						VERBOSE(3,"current_Pr:" << current_Pr << " current_word:" << current_word << "| ===> code:" << j << " word:|" << tmp_word_vec.at(current_pos) << "| pr:" << pr << " versus best_code:" << best_code << " best_word:|" << current_dict->decode(best_code) << "| best_pr:" << best_pr << std::endl);
+					}
+					
+					current_tot_pr=log10(current_tot_pr);
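+					// current_tot_pr now holds log10 of the summed probabilities of
+					// the current word and all alternatives; subtracting it from a
+					// log10 prob below yields a normalized score over that set.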
+					
+					model_Pr = best_pr;
+					VERBOSE(2,"model_best_code:" << best_code << " model_best_word:|" << current_dict->decode(best_code) << "| model_best_prob:" << pow(10.0,best_pr) << " current_tot_pr:" << current_tot_pr << std::endl);
+					
+					norm_oovpenalty = oovpenalty;
+					VERBOSE(2,"current_tot_pr:" << current_tot_pr << " oovpenalty:" << oovpenalty << " norm_oovpenalty:" << norm_oovpenalty << std::endl);	
+					
+					norm_Pr = current_Pr - current_tot_pr;
+					model_norm_Pr = model_Pr - current_tot_pr;
+					VERBOSE(1,"current_Pr:" << current_Pr << " norm_Pr:" << norm_Pr << " model_Pr:" << model_Pr << " model_norm_Pr:" << model_norm_Pr << " current_code:" << lmt->getDict()->encode(word_vec.at(i).c_str()) << " current_word:|" << word_vec.at(i) << "| model_best_code:" << best_code << " model_best_word:|" << current_dict->decode(best_code) << "|" << std::endl);
+					
+					model_norm_logPr+=model_norm_Pr;
+					sent_model_norm_logPr+=model_norm_Pr;
+					norm_logPr+=norm_Pr;
+					sent_norm_logPr+=norm_Pr;
+					VERBOSE(2,"sent_model_norm_logPr:" << sent_model_norm_logPr << " model_norm_logPr:" << model_norm_logPr << std::endl);	
+					VERBOSE(2,"sent_norm_logPr:" << sent_norm_logPr << " norm_logPr:" << norm_logPr << std::endl);	
+					
+					model_logPr+=model_Pr;
+					sent_model_logPr+=model_Pr;
+					logPr+=current_Pr;
+					sent_logPr+=current_Pr;
+					VERBOSE(2,"sent_model_logPr:" << sent_model_logPr << " model_logPr:" << model_logPr << std::endl);	
+					VERBOSE(2,"sent_logPr:" << sent_logPr << " current_Pr:" << current_Pr << std::endl);
+					delete current_dict;
+				}
+			}
+			
+			if (sent_flag) {
+				sent_model_norm_PP = exp((-sent_model_norm_logPr * M_LN10) / sent_Nw);
+				sent_model_norm_PPwp = sent_model_norm_PP * (1 - 1/exp(sent_Noov *  norm_oovpenalty * M_LN10 / sent_Nw));
+				sent_norm_PP = exp((-sent_norm_logPr * M_LN10) / sent_Nw);
+				sent_norm_PPwp = sent_norm_PP * (1 - 1/exp(sent_Noov * norm_oovpenalty * M_LN10 / sent_Nw));
+				
+				
+				std::cout << "%% sent_Nw=" << sent_Nw
+				<< " sent_norm_logPr=" << sent_norm_logPr
+				<< " sent_norm_PP=" << sent_norm_PP
+				<< " sent_norm_PPwp=" << sent_norm_PPwp
+				<< " sent_norm_PP_noOOV=" << (sent_norm_PP-sent_norm_PPwp)
+				<< " sent_Noov=" << sent_Noov
+				<< " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" 
+				<< " sent_avg_alternatives=" << (float) sent_current_dict_alternatives/sent_Nw
+				<< std::endl;
+				
+				std::cout << "%% sent_Nw=" << sent_Nw
+				<< " sent_model_norm_logPr=" << sent_model_norm_logPr
+				<< " sent_model_norm_PP=" << sent_model_norm_PP
+				<< " sent_model_norm_PPwp=" << sent_model_norm_PPwp
+				<< " sent_model_norm_PP_noOOV=" << (sent_model_norm_PP-sent_model_norm_PPwp)
+				<< " sent_Noov=" << sent_Noov
+				<< " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" 
+				<< " sent_avg_alternatives=" << (float) sent_current_dict_alternatives/sent_Nw
+				<< std::endl;
+				
+				sent_model_PP = exp((-sent_model_logPr * M_LN10) / sent_Nw);
+				sent_model_PPwp = sent_model_PP * (1 - 1/exp(sent_Noov * oovpenalty * M_LN10 / sent_Nw));
+				sent_PP = exp((-sent_logPr * M_LN10) / sent_Nw);
+				sent_PPwp = sent_PP * (1 - 1/exp(sent_Noov * oovpenalty * M_LN10 / sent_Nw));
+				std::cout << "%% sent_Nw=" << sent_Nw
+				<< " sent_logPr=" << sent_logPr
+				<< " sent_PP=" << sent_PP
+				<< " sent_PPwp=" << sent_PPwp
+				<< " sent_PP_noOOV=" << (sent_PP-sent_PPwp)
+				<< " sent_Noov=" << sent_Noov
+				<< " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" 
+				<< " sent_avg_alternatives=" << (float) sent_current_dict_alternatives/sent_Nw
+				<< std::endl;
+			
+				std::cout << "%% sent_Nw=" << sent_Nw
+				<< " sent_model_logPr=" << sent_model_logPr
+				<< " sent_model_PP=" << sent_model_PP
+				<< " sent_model_PPwp=" << sent_model_PPwp
+				<< " sent_model_PP_noOOV=" << (sent_model_PP-sent_model_PPwp)
+				<< " sent_Noov=" << sent_Noov
+				<< " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" 
+				<< " sent_avg_alternatives=" << (float) sent_current_dict_alternatives/sent_Nw
+				<< std::endl;
+				std::cout.flush();
+				//reset statistics for sentence based Perplexity
+				sent_Noov = 0;
+				sent_Nw = 0;
+				sent_model_norm_logPr = 0.0;
+				sent_model_logPr = 0.0;
+				sent_norm_logPr = 0.0;
+				sent_logPr = 0.0;
+				sent_current_dict_alternatives = 0;
+			}
+			
+			apriori_topic_map.clear();
+		}
+
+		model_norm_PP = exp((-model_norm_logPr * M_LN10) / Nw);
+		model_norm_PPwp = model_norm_PP * (1 - 1/exp(Noov *  norm_oovpenalty * M_LN10 / Nw));
+		model_PP = exp((-model_logPr * M_LN10) / Nw);
+		model_PPwp = model_PP * (1 - 1/exp(Noov * oovpenalty * M_LN10 / Nw));
+		norm_PP = exp((-norm_logPr * M_LN10) / Nw);
+		norm_PPwp = norm_PP * (1 - 1/exp(Noov *  norm_oovpenalty * M_LN10 / Nw));
+		PP = exp((-logPr * M_LN10) / Nw);
+		PPwp = PP * (1 - 1/exp(Noov * oovpenalty * M_LN10 / Nw));
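+		// Perplexity identity used above, with base-10 log probs:
+		//   PP = 10^(-logPr/Nw) = exp(-logPr * M_LN10 / Nw), where M_LN10 = ln(10);
+		// PPwp isolates the share of perplexity attributable to OOV words.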
+		
+		std::cout << "%% Nw=" << Nw
+		<< " model_logPr=" << model_logPr
+		<< " model_PP=" << model_PP
+		<< " model_PPwp=" << model_PPwp
+		<< " model_PP_noOOV=" << (model_PP-model_PPwp)
+		<< " Noov=" << Noov
+		<< " OOVrate=" << (float)Noov/Nw * 100.0 << "%" 
+		<< " avg_alternatives=" << (float) current_dict_alternatives/Nw
+		<< std::endl;
+		std::cout.flush();
+		
+		std::cout << "%% Nw=" << Nw
+		<< " model_norm_logPr=" << model_norm_logPr
+		<< " model_norm_PP=" << model_norm_PP
+		<< " model_norm_PPwp=" << model_norm_PPwp
+		<< " model_norm_PP_noOOV=" << (model_norm_PP-model_norm_PPwp)
+		<< " Noov=" << Noov
+		<< " OOVrate=" << (float)Noov/Nw * 100.0 << "%"
+		<< " avg_alternatives=" << (float) current_dict_alternatives/Nw
+		<< std::endl;
+		std::cout.flush();
+		
+		std::cout << "%% Nw=" << Nw
+		<< " logPr=" << logPr
+		<< " PP=" << PP
+		<< " PPwp=" << PPwp
+		<< " PP_noOOV=" << (PP-PPwp)
+		<< " Noov=" << Noov
+		<< " OOVrate=" << (float)Noov/Nw * 100.0 << "%"
+		<< " avg_alternatives=" << (float) current_dict_alternatives/Nw
+		<< std::endl;
+		std::cout.flush();
+		
+		std::cout << "%% Nw=" << Nw
+		<< " norm_logPr=" << norm_logPr
+		<< " norm_PP=" << norm_PP
+		<< " norm_PPwp=" << norm_PPwp
+		<< " norm_PP_noOOV=" << (norm_PP-norm_PPwp)
+		<< " Noov=" << Noov
+		<< " OOVrate=" << (float)Noov/Nw * 100.0 << "%"
+		<< " avg_alternatives=" << (float) current_dict_alternatives/Nw
+		<< std::endl;
+		std::cout.flush();
+		
+		if (debug>1) lmt->used_caches();
+		
+		if (debug>1) lmt->stat();
+		
+		delete lmt;
+		return 0;
+	}
+  if (rankscore == true) {
+		
+		if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) {
+			debug = (debug>4)?4:debug;
+			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+		}
+		if (lmt->getLanguageModelType() == _IRSTLM_LMMACRO) {
+			debug = (debug>4)?4:debug;
+			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+		}
+		if (lmt->getLanguageModelType() == _IRSTLM_LMCLASS) {
+			debug = (debug>4)?4:debug;
+			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+		}
+		std::cerr << "Start RankBased Evaluation" << std::endl;
+		std::cerr << "OOV code: " << lmt->getDict()->oovcode() << std::endl;
+		std::cout.setf(ios::fixed);
+		std::cout.precision(2);
+		
+		int Nw=0,Noov=0;
+		double avgRank;
+		int tot_rank = 0;
+		int max_rank = 0;	
+		int current_dict_alternatives = 0;
+		
+		double bow;
+		int bol=0;
+		char *msp;
+		ngram_state_t msidx;
+		unsigned int statesize;
+		
+		// variables for storing sentence-based Rank Statistics
+		int sent_Nw=0,sent_Noov=0;
+		double sent_avgRank;
+		int sent_tot_rank = 0;
+		int sent_id = 0;	
+		int sent_current_dict_alternatives = 0;
+		
+		std::fstream inptxt(testfile,std::ios::in);
+		
+		// loop over input lines
+		char line[MAX_LINE];
+		while (inptxt.getline(line,MAX_LINE)) {
+			
+			std::string line_str = line;
+			
+			VERBOSE(1,"input_line:|" << line_str << "|" << std::endl);	
+			
+			//getting sentence string;
+			std::string tmp_sentence;
+			std::string sentence;
+			std::string context;
+			std::string sentence_lexiconfile;
+			
+			bool withLexicon = ((lmContextDependent*) lmt)->GetSentenceAndLexicon(tmp_sentence,sentence_lexiconfile,line_str);
+			bool withContext = lmt->GetSentenceAndContext(sentence,context,tmp_sentence);
+			
+			//getting apriori topic weights
+			topic_map_t apriori_topic_map;
+			if (withContext){
+				((lmContextDependent*) lmt)->setContextMap(apriori_topic_map,context);
+			}
+			
+			// computation using std::string
+			// loop over ngrams of the sentence
+			string_vec_t word_vec;
+			split(sentence, ' ', word_vec);
+			
+			//first points to the least recent term to take into account
+			//last points to the position after the most recent term to take into account
+			//last could point outside the vector of strings; do NOT use word_vec.at(last)
+			size_t last, first;
+			size_t size=0;
+			size_t order = lmt->maxlevel();
+
+			std::stringstream rank_outstr;
+			
+			for (size_t word_pos=0; word_pos<word_vec.size(); ++word_pos){
+				++size;
+				size=(size<order)?size:order;
+				last=word_pos+1;
+				
+				// reset ngram at begin of sentence
+				if (word_vec.at(word_pos) == lmt->getDict()->BoS()) {
+					size=1;
+					continue;
+				}
+				first = last - size;
+				
+				string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
+				
+				if (size>=1) {
+					
+					VERBOSE(2,"computing rank for first:|" << first << "| and last:|" << last << "|" << std::endl);	
+					
+					VERBOSE(2,"word_vec.at(word_pos):|" << word_vec.at(word_pos) << "| lmt->getDict()->OOV():|" << lmt->getDict()->OOV() << "|" << std::endl);
+					
+					if (lmt->getDict()->encode(word_vec.at(word_pos).c_str()) == lmt->getDict()->oovcode()) {
+						Noov++;
+						sent_Noov++;
+					}
+					Nw++;
+					sent_Nw++;
+					
+					if ((Nw % 100000)==0) {
+						std::cerr << ".";
+						lmt->check_caches_levels();
+					}
+					
+					VERBOSE(2,"tmp_word_vec.size:|" << tmp_word_vec.size() << "|" << std::endl);	
+					VERBOSE(2,"dict.size:|" << lmt->getDict()->size() << "|" << std::endl);	
+					//string_vec_t::iterator it=tmp_word_vec.end()-1; //unused
+					
+					int current_pos = tmp_word_vec.size()-1;
+					std::string current_word = tmp_word_vec.at(current_pos);
+					
+					double current_Pr;
+					if (withContext){
+						current_Pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msidx, &msp, &statesize);				
+					}else{
+						current_Pr=lmt->clprob(tmp_word_vec, &bow, &bol, &msidx, &msp, &statesize);				
+					}
+					
+					//loop over a set of selected alternative words
+					//populate the dictionary with all words associated with the current word
+					
+					dictionary* current_dict;
+					if (add_full_dictionary){
+						//loop over all words in the LM
+						current_dict = lmt->getDict();
+					}else{
+						current_dict = new dictionary((char *)NULL,1000000);
+					}
+					current_dict->incflag(1);
+					
+					current_dict->encode(current_word.c_str());
+					
+					VERBOSE(2,"after current word current_dict->size:" << current_dict->size() << std::endl);
+					
+					//add words from the lexicon
+					if (add_lexicon_words){
+						
+						if (withLexicon){
+							lexicon.clear();
+							load_lexicon(sentence_lexiconfile.c_str(), lexicon);
+						}
+						
+						std::pair <std::multimap< std::string, std::string>::iterator, std::multimap< std::string, std::string>::iterator> ret = lexicon.equal_range(current_word);
+						for (std::multimap<std::string, std::string>::const_iterator it=ret.first; it!=ret.second; ++it)
+						{
+							current_dict->encode((it->second).c_str());
+							/*
+							 //exclude the current word from the selected alternative words
+							 if (current_word != (it->second).c_str()){
+							 current_dict->encode((it->second).c_str());
+							 }
+							 */
+						}
+					}
+					VERBOSE(2,"after add_lexicon_words current_dict->size:" << current_dict->size() << std::endl);
+					
+					if (add_lm_words){
+						bool succ_flag=false;
+						ngram hg(lmt->getDict());
+						
+						if (size==1) {
+							hg.pushw(lmt->getDict()->BoS());
+							hg.pushc(0);
+							
+							lmt->get(hg,hg.size,hg.size-1);
+							VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+							if (hg.succ < successor_limit){
+								succ_flag=true;
+							}else{
+								VERBOSE(3,"successors are not added into the alternatives because they are too many" << std::endl);	
+							}
+						}else if (size>=2) {
+							hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-2));
+							hg.pushc(0);
+							
+							lmt->get(hg,hg.size,hg.size-1);
+							VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+							if (hg.succ < successor_limit){
+								succ_flag=true;
+							}else{
+								VERBOSE(3,"successors are not added into the alternatives because they are too many" << std::endl);	
+							}
+							
+							if (!succ_flag && size>=3){
+								hg.size=0;
+								hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-3));
+								hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-2));
+								hg.pushc(0);
+								
+								lmt->get(hg,hg.size,hg.size-1);
+								VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+								
+								if (hg.succ < successor_limit){
+									succ_flag=true;
+								}else{
+									VERBOSE(3,"successors are not added into the alternatives because they are too many" << std::endl);	
+								}
+							}
+						}
+						
+						
+						if (succ_flag){
+							ngram ng=hg;
+							lmt->succscan(hg,ng,LMT_INIT,ng.size);	
+							while(lmt->succscan(hg,ng,LMT_CONT,ng.size)) {
+								current_dict->encode(ng.dict->decode(*ng.wordp(1)));
+							}
+						}
+						
+					}
+					
+					VERBOSE(2,"after add_lm_words current_dict->size:" << current_dict->size() << std::endl);
+					
+					if (add_sentence_words){
+						for (string_vec_t::const_iterator it=word_vec.begin(); it!=word_vec.end(); ++it)
+						{
+							current_dict->encode(it->c_str());
+						}
+					}
+					current_dict->incflag(0);
+					VERBOSE(2,"after add_sentence_words current_dict->size:" << current_dict->size() << std::endl);
+					
+					sent_current_dict_alternatives += current_dict->size();
+					current_dict_alternatives += current_dict->size();
+					
+					VERBOSE(2,"current_dict->size:" << current_dict->size() << std::endl);
+					for (int h=0;h<current_dict->size();++h){
+						VERBOSE(2,"h:" << h << " w:|" << current_dict->decode(h) << "|" << std::endl);
+					}
+
+				  //the first word in current_dict is always the current_word; hence we can skip it during the scan 
+					//variables for the computation of the ranking
+					max_rank = current_dict->size(); //the current word is included in the selected alternative words
+					int current_rank = 1;
+					//					for (int j=0; j<current_dict->size(); ++j){
+					for (int j=1; j<current_dict->size(); j++)
+					{
+					  tmp_word_vec.at(current_pos) = current_dict->decode(j);
+						IFVERBOSE(3){
+							std::cout << "tmp_word_vec j:" << j;
+							for (string_vec_t::const_iterator it2=tmp_word_vec.begin(); it2!=tmp_word_vec.end(); ++it2) {
+								std::cout << " |" << (*it2) << "|";
+							}
+							std::cout << std::endl;
+						}
+						
+						double pr;
+						if (withContext){
+							pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msidx, &msp, &statesize);
+						}else{
+							pr=lmt->clprob(tmp_word_vec, &bow, &bol, &msidx, &msp, &statesize);
+						}
+						if (pr > current_Pr){
+							++current_rank;	
+						}
+						
+						VERBOSE(3," current_pos:" << current_pos << " word:|" << tmp_word_vec.at(current_pos) << "| current_Pr:" << current_Pr << " pr:" << pr << " current_rank:" << current_rank <<std::endl);
+					}
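+					// current_rank == 1 + number of alternatives scoring strictly
+					// higher than the observed word, so rank 1 means the model
+					// prefers the observed word over every alternative.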
+					
+					sent_tot_rank += current_rank;
+					tot_rank += current_rank;
+					
+					if (debug>1){
+						//output format:
+						//word_pos:current_rank:max_rank
+						rank_outstr << " " << word_pos << ":" << current_rank << ":" << max_rank;
+					}
+					delete current_dict;
+				}
+			}
+			
+			if (sent_flag) {
+				if (debug>1){
+					VERBOSE(1," sent_tot_rank:" << sent_tot_rank << " sent_Nw:" << sent_Nw << std::endl);
+					//output format: a blank-separated list of triplets
+					//current_pos:current_rank:max_rank
+					std::cout << "sent_id=" << sent_id << " ranking= " << rank_outstr.str() << std::endl;
+				}
+				
+				sent_avgRank = ((double) sent_tot_rank)  / sent_Nw;
+				
+				std::cout << "%% sent_Nw=" << sent_Nw
+				<< " sent_avgRank=" << sent_avgRank
+				<< " sent_Noov=" << sent_Noov
+				<< " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%"
+				<< " sent_avg_alternatives=" << (float) sent_current_dict_alternatives/sent_Nw
+				<< std::endl;
+				std::cout.flush();
+				
+				//reset statistics for sentence based avg Ranking
+				sent_Nw = 0;
+				sent_Noov = 0;
+				sent_tot_rank = 0;
+				++sent_id;
+				sent_current_dict_alternatives = 0;
+			}
+		}
+		
+		avgRank = ((double) tot_rank) / Nw;
+		
+		std::cout << "%% Nw=" << Nw
+		<< " avgRank=" << avgRank
+		<< " Noov=" << Noov
+		<< " OOVrate=" << (float)Noov/Nw * 100.0 << "%"
+		<< " avg_alternatives=" << (float) current_dict_alternatives/Nw
+		<< std::endl;
+		std::cout.flush();
+		
+		if (debug>1) lmt->used_caches();
+		
+		if (debug>1) lmt->stat();
+		
+		delete lmt;
+		return 0;
+	}
+	delete lmt;
+	return 0;
+}
+
diff --git a/src/context-similarity.cpp b/src/context-similarity.cpp
new file mode 100644
index 0000000..0f2af82
--- /dev/null
+++ b/src/context-similarity.cpp
@@ -0,0 +1,559 @@
+// $Id: lmContextDependent.cpp 3686 2010-10-15 11:55:32Z bertoldi $
+
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+ 
+ ******************************************************************************/
+#include <stdio.h>
+#include <cstdlib>
+#include <stdlib.h>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include "ngramtable.h"
+#include "lmContainer.h"
+#include "context-similarity.h"
+#include "util.h"
+#include "mfstream.h"
+
+using namespace std;
+
+inline void error(const char* message)
+{
+  std::cerr << message << "\n";
+  throw std::runtime_error(message);
+}
+
+namespace irstlm {
+	
+	ContextSimilarity::ContextSimilarity(const std::string &k_modelfile, const std::string &hk_modelfile, const std::string &hwk_modelfile)
+	{
+		m_hwk_order=3;
+		m_hk_order=2;
+		m_wk_order=m_hk_order;
+		m_k_order=1;
+		m_hwk_ngt=new ngramtable((char*) hwk_modelfile.c_str(), m_hwk_order, NULL,NULL,NULL);
+		m_hk_ngt=new ngramtable((char*) hk_modelfile.c_str(), m_hk_order, NULL,NULL,NULL);
+		m_wk_ngt=m_hk_ngt; //just a link to m_hk_ngt
+		m_k_ngt=new ngramtable((char*) k_modelfile.c_str(), m_k_order, NULL,NULL,NULL);
+		
+		m_smoothing = 0.001;
+		m_threshold_on_h = 0;
+		m_active=true;
+		
+		m_topic_size = m_k_ngt->getDict()->size();
+		VERBOSE(1, "There are " << m_topic_size << " topics in the model" << std::endl);
+		
+#ifdef MY_ASSERT_FLAG
+		VERBOSE(0, "MY_ASSERT is active" << std::endl);
+#else
+		VERBOSE(0, "MY_ASSERT is NOT active" << std::endl);
+#endif
+		
+	}
+	
+	
+	ContextSimilarity::~ContextSimilarity()
+	{
+		delete m_hwk_ngt;
+		delete m_hk_ngt;
+		//delete m_wk_ngt;  //it is just a link to m_hk_ngt
+		delete m_k_ngt;
+	}
+	
+	void ContextSimilarity::normalize_topic_scores(topic_map_t& map)
+	{	
+		UNUSED(map);
+		/* normalization type 1
+		 double max = -1000000.0;
+		 double min =  1000000.0;
+		 for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it){
+		 min = (map[it->first]<min)?map[it->first]:min;
+		 max = (map[it->first]>max)?map[it->first]:max;
+		 }
+		 for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it){
+		 map[it->first] = (map[it->first]-min)/(max-min);
+		 }
+		 VERBOSE(2,"min:"<<min << " max:" << max << std::endl);
+		 */
+		/*
+		 //normalization type 2
+		 double norm =  0.0;
+     for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it){
+		 norm += fabs(map[it->first]);
+		 }
+		 for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it){
+		 map[it->first] = map[it->first]/norm;
+		 }
+		 VERBOSE(2,"norm:" << norm << std::endl);
+		 */
+	}
+	
+	double ContextSimilarity::DeltaCrossEntropy(topic_map_t& topic_map, topic_map_t& tmp_map, double len)
+	{
+		double xDeltaEntropy = 0.0;
+		for (topic_map_t::iterator it=tmp_map.begin(); it != tmp_map.end(); ++it){
+			xDeltaEntropy += topic_map[it->first] * tmp_map[it->first];
+			//			VERBOSE(2,"topic_map[it->first]:" << topic_map[it->first] << " tmp_map[it->first]:" << tmp_map[it->first] << " product:" << topic_map[it->first] * tmp_map[it->first] << std::endl);
+		}
+		//		VERBOSE(2," xDeltaEntropy:" << xDeltaEntropy << " len:" << len << " xDeltaEntropy/len:" << xDeltaEntropy/len << std::endl);
+		return xDeltaEntropy/len;
+	}
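+	// Note: this computes (1/len) * sum_k topic_map[k] * tmp_map[k] over the
+	// topics present in tmp_map; topics absent from topic_map contribute 0,
+	// since std::map::operator[] value-initializes missing entries to 0.0.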
+	
+	void ContextSimilarity::add_topic_scores(topic_map_t& topic_map, topic_map_t& tmp_map)
+	{
+		for (topic_map_t::iterator it=tmp_map.begin(); it != tmp_map.end(); ++it){
+			topic_map[it->first] += tmp_map[it->first];
+		}
+	}
+	
+	//prints the scores for all topics in the topic model (without apriori topic prob)
+	void ContextSimilarity::print_topic_scores(topic_map_t& map)
+	{
+		for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it)
+		{
+			if (it!=map.begin()) { std::cout << topic_map_delimiter1; }
+			std::cout << it->first << topic_map_delimiter2 << it->second;
+			//			std::cout << it->first << topic_map_delimiter2 << exp(it->second * M_LN10);
+		}
+		
+		std::cout << std::endl;
+	}
+	
+	void ContextSimilarity::print_topic_scores(topic_map_t& map, topic_map_t& refmap, double len)
+	{
+		for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it)
+		{
+			if (it!=map.begin()) { std::cout << topic_map_delimiter1; }
+			std::cout << it->first << topic_map_delimiter2 << it->second;
+		}
+		std::cout << " DeltaCrossEntropy:" << DeltaCrossEntropy(refmap,map,len);
+		std::cout << std::endl;
+	}
+	
+	void ContextSimilarity::create_ngram(const string_vec_t& text, ngram& ng)
+	{
+		//text is a vector of strings with w in the last position and the history in the previous positions
+		//text must have at least one word
+		//if text has at least two words, further computation will rely on regular counts, i.e. counts(h,w,k), counts(h,w), counts(h,k), counts(h)
+		//if text has only one word, further computation will rely on lower-order counts, i.e. counts(w,k), counts(w), counts(k), counts()
+		VERBOSE(2,"void ContextSimilarity::create_ngram" << std::endl);
+		VERBOSE(2,"text.size:" << text.size() << std::endl);
+		
+		MY_ASSERT(text.size()>0);
+		
+		if (text.size()==1){
+			//all further computation will rely on lower-order counts
+			ng.pushw(text.at(text.size()-1));
+		}else {
+			ng.pushw(text.at(text.size()-2));
+			ng.pushw(text.at(text.size()-1));
+		}
+		VERBOSE(2,"output of create_ngram ng:|" << ng << "| ng.size:" << ng.size << std::endl);
+	}
+	
+	void ContextSimilarity::create_topic_ngram(const string_vec_t& text, const std::string& topic, ngram& ng)
+	{
+		//text is a vector of strings with w in the last position and the history in the previous positions
+		//text must have at least one word
+		//topic is added in the most recent position of the ngram
+		create_ngram(text, ng);
+		add_topic(topic, ng);
+		VERBOSE(2,"output of create_topic_ngram ng:|" << ng << "| ng.size:" << ng.size << std::endl);
+	}
+	
+	void ContextSimilarity::add_topic(const std::string& topic, ngram& ng)
+	{		
+		ng.pushw(topic);
+	}
+	
+	void ContextSimilarity::modify_topic(const std::string& topic, ngram& ng)
+	{		
+		*ng.wordp(1) = ng.dict->encode(topic.c_str());
+	}
+	
+	void ContextSimilarity::get_counts(ngram& ng, ngramtable& ngt, double& c_xk, double& c_x)
+	{			
+		VERBOSE(2, "double ContextSimilarity::get_counts(ngram& ng, double& c_xk, double& c_x) with ng:|" << ng << "|" << std::endl);
+		//counts taken from the tables are modified to avoid zero values for the probs
+		//a constant epsilon (smoothing) is added
+		//we also assume that c(x) = sum_k c(xk)
+		
+		//we assume that ng ends with a correct topic 
+		//we assume that ng is compliant with ngt, and has the correct size
+		
+		c_xk = m_smoothing;
+		c_x  = m_smoothing * m_topic_size;
+		
+		if (ngt.get(ng)) { c_xk += ng.freq; }
+		if (ngt.get(ng,ng.size,ng.size-1)) { c_x += ng.freq; }
+		
+		VERBOSE(3, "c_xk:" << c_xk << " c_x:" << c_x << std::endl);
+	}
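+	// With this additive smoothing the derived probability stays well-defined:
+	//   P(k|x) = (c(xk) + m_smoothing) / (c(x) + m_smoothing * m_topic_size) > 0,
+	// consistent with the assumption that c(x) = sum_k c(xk).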
+	
+	double ContextSimilarity::topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2)
+	{
+		ngram ng(ngt.getDict());
+		
+		create_topic_ngram(text, topic, ng);
+		
+		return topic_score(ng, ngt, ngt2);
+	}
+	
+	double ContextSimilarity::topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2){
+#ifdef OPTION_1
+		return topic_score_option1(ng, ngt, ngt2);
+#elif OPTION_2
+		return topic_score_option2(ng, ngt, ngt2);
+#elif OPTION_3
+		return topic_score_option3(ng, ngt, ngt2);
+#else
+		return topic_score_option0(ng, ngt, ngt2);
+#endif
+	}
+	
+	double ContextSimilarity::topic_score_option0(ngram& ng, ngramtable& ngt, ngramtable& ngt2)
+	{
+		UNUSED(ngt);
+		UNUSED(ngt2);
+		VERBOSE(2, "double ContextSimilarity::topic_score_option0(ngram& ng, ngramtable& ngt) with ng:|" << ng << "|" << std::endl);
+		
+		//option 0: uniform (not considering log function) 
+		//P(k|hw) = 1/number_of_topics
+		double log_pr = -log(m_topic_size)/M_LN10;
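+		//note that -log(m_topic_size)/M_LN10 == -log10(m_topic_size),
+		//i.e. the base-10 log of the uniform probability 1/number_of_topics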
+		
+		VERBOSE(3, "option0: return: " << log_pr<< std::endl);	
+		return log_pr;
+	}
+	
+	double ContextSimilarity::topic_score_option1(ngram& ng, ngramtable& ngt, ngramtable& ngt2)
+	{
+		VERBOSE(2, "double ContextSimilarity::topic_score_option1(ngram& ng, ngramtable& ngt) with ng:|" << ng << "|" << std::endl);
+		
+		//ngt provides counts c(hwk) and c(hw) (or c(wk) and c(w))
+		double c_xk, c_x;
+		get_counts(ng, ngt, c_xk, c_x);
+		
+		//copy and transform codes
+		// shift all terms, but the topic
+		// ng2[3]=ng[4];
+		// ng2[2]=ng[3];
+		// ng2[1]=ng[1];
+		ngram ng2(ngt2.getDict());
+		ng2.trans(ng);
+		ng2.shift();
+		*ng2.wordp(1)=ng2.dict->encode(ng.dict->decode(*ng.wordp(1)));
+		
+		//ngt2 provides counts c(hk) and c(h) (or c(k) and c())
+		double c_xk2, c_x2;
+		get_counts(ng2, ngt2, c_xk2, c_x2);
+		
+		//option 1: (not considering log function) 
+		//P(k|hw)/sum_v P(k|hv) ~approx~ P(k|hw)/P(k|h) ~approx~ num_pr/den_pr
+		//num_pr = c'(hwk)/c'(hw)
+		//den_pr = c'(hk)/c'(h)
+		double den_log_pr = log10(c_xk2) - log10(c_x2);
+		double num_log_pr = log10(c_xk) - log10(c_x);
+		double log_pr = num_log_pr - den_log_pr;
+		VERBOSE(3, "option1: num_log_pr:" << num_log_pr << " den_log_pr:" << den_log_pr << " return: " << log_pr << std::endl);
+		return log_pr;
+	}
+	
+	double ContextSimilarity::topic_score_option2(ngram& ng, ngramtable& ngt, ngramtable& ngt2)
+	{
+		UNUSED(ngt2);
+		VERBOSE(2, "double ContextSimilarity::topic_score_option2(ngram& ng, ngramtable& ngt) with ng:|" << ng << "|" << std::endl);
+		
+		//ngt provides counts c(hwk) and c(hw) (or c(wk) and c(w))
+		double c_xk, c_x;
+		get_counts(ng, ngt, c_xk, c_x);
+		
+		//option 2: (not considering log function)
+		//P(k|hw)/sum_v P(k|hv) ~approx~ P(k|hw)/P(k|h) ~approx~ c'(hwk)/c'(hw)
+		double log_pr = log10(c_xk) - log10(c_x);
+		VERBOSE(3, "option2: log_pr:" << log_pr << " return: " << log_pr << std::endl);
+		return log_pr;
+	}
+	
+	double ContextSimilarity::topic_score_option3(ngram& ng, ngramtable& ngt, ngramtable& ngt2)
+	{
+		VERBOSE(2, "double ContextSimilarity::topic_score_option3(ngram& ng, ngramtable& ngt) with ng:|" << ng << "|" << std::endl);
+		
+		//ngt provides counts c(hwk) and c(hw) (or c(wk) and c(w))
+		double c_xk, c_x;
+		get_counts(ng, ngt, c_xk, c_x);
+		
+		//copy and transform codes:
+		// shift all terms except the topic (the word is dropped), e.g. for a 4-gram:
+		// ng2[3]=ng[4];
+		// ng2[2]=ng[3];
+		// ng2[1]=ng[1];
+		ngram ng2(ngt2.getDict());
+		ng2.trans(ng);
+		ng2.shift();
+		*ng2.wordp(1)=ng2.dict->encode(ng.dict->decode(*ng.wordp(1)));
+		
+		//ngt2 provides counts c(hk) and c(h) (or c(k) and c())
+		double c_xk2, c_x2;
+		get_counts(ng2, ngt2, c_xk2, c_x2);
+		
+		/*
+		 //option 3 (formula stated before taking logs):
+		 //P(k|hw)/sum_v P(k|hv) ~approx~ logistic_function(P(k|hw)/P(k|h))
+		 // = logistic_function(num_pr/den_pr)
+		 // = logistic_function((c'(hwk)/c'(hw))/(c'(hk)/c'(h)))
+		 // = logistic_function((c'(hwk)*c'(h))/(c'(hw)*c'(hk)))
+		 
+		 return logistic_function((c'(hwk)*c'(h))/(c'(hw)*c'(hk)),1.0,1.0)
+		 */
+		
+		double log_pr = logistic_function((c_xk*c_x2)/(c_x*c_xk2),1.0,1.0); //note: despite its name, log_pr holds the logistic output here, not a log10 prob
+		
+		VERBOSE(3, "option3: return: " << log_pr << std::endl);
+		return log_pr;
+	}
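+	
+	// A minimal sketch of what a two-parameter logistic squashing could look
+	// like, assuming the two extra arguments are a steepness and a maximum
+	// value; the actual logistic_function() provided by the toolkit may differ:
+	//
+	//   double logistic_function(double x, double steepness, double max_val)
+	//   {
+	//     //maps (-inf,+inf) monotonically into (0, max_val)
+	//     return max_val / (1.0 + exp(-steepness * x));
+	//   }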
+	
+	double ContextSimilarity::total_topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2, dictionary& dict)
+	{
+		ngram ng(ngt.getDict());
+		create_topic_ngram(text, topic, ng);
+		return total_topic_score(ng, ngt, ngt2, dict);
+	}
+	
+	double ContextSimilarity::total_topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict)
+	{		
+		double tot_pr = 0.0;
+		double v_topic_pr;
+		for (int v=0; v<dict.size(); ++v){
+			//replace last word, which is in position 2, keeping topic in position 1 unchanged
+			*ng.wordp(2) = ng.dict->encode(dict.decode(v));
+			v_topic_pr = topic_score(ng, ngt, ngt2);
+			tot_pr += pow(10.0,v_topic_pr); //v_topic_pr is a log10 prob
+		}
+		return log10(tot_pr);
+	}
+	
+	double ContextSimilarity::total_topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight)
+	{
+		ngram ng(ngt.getDict());
+		create_topic_ngram(text, topic, ng);
+		return total_topic_score(ng, ngt, ngt2, dict, lm, weight);
+	}
+	
+	double ContextSimilarity::total_topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight)
+	{		
+		double tot_pr = 0.0;
+		double v_pr, v_topic_pr, v_lm_pr;
+		for (int v=0; v<dict.size(); ++v){
+			//replace last word, which is in position 2, keeping topic in position 1 unchanged
+			*ng.wordp(2) = ng.dict->encode(dict.decode(v));
+			v_topic_pr = topic_score(ng, ngt, ngt2);
+			v_lm_pr = lm.clprob(ng);
+			v_pr = v_lm_pr + weight * v_topic_pr;
+			tot_pr += pow(10.0,v_pr); //v_pr is a log10 prob
+		}
+		return log10(tot_pr);
+	}
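+	
+	// In log10 space the loop above computes the normalization term
+	//   Z(h,k) = sum_v P_lm(v|h) * S(k|h,v)^weight
+	// where S is the (option-dependent) topic score: each addend is
+	// re-exponentiated with pow(10.0, .) before summing, and log10(Z) is returned.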
+	
+	void ContextSimilarity::modify_context_map(string_vec_t& text, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, topic_map_t& topic_weights, topic_map_t& mod_topic_weights)
+	{
+		ngram ng(ngt.getDict());
+		create_topic_ngram(text, "dummy_topic", ng); //just for initialization
+		
+		modify_context_map(ng, ngt, ngt2, dict, topic_weights, mod_topic_weights);
+	}
+	
+	void ContextSimilarity::modify_context_map(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, topic_map_t& topic_weights, topic_map_t& mod_topic_weights)
+	{
+		double global_score;
+		double mod_topic_pr;
+		for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
+		{
+			modify_topic(it->first, ng);
+			global_score = total_topic_score(ng, ngt, ngt2, dict);
+			global_score = pow(10.0,global_score);
+			mod_topic_pr = it->second/global_score;
+			mod_topic_weights.insert(make_pair(it->first,mod_topic_pr));
+		}
+	}
+	
+	void ContextSimilarity::modify_context_map(string_vec_t& text, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight, topic_map_t& topic_weights, topic_map_t& mod_topic_weights)
+	{
+		ngram ng(ngt.getDict());
+		create_topic_ngram(text, "dummy_topic", ng); //just for initialization
+		
+		modify_context_map(ng, ngt, ngt2, dict, lm, weight, topic_weights, mod_topic_weights);
+	}
+	
+	void ContextSimilarity::modify_context_map(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight, topic_map_t& topic_weights, topic_map_t& mod_topic_weights)
+	{
+		double global_score;
+		double mod_topic_pr;
+		for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
+		{
+			modify_topic(it->first, ng);
+			global_score = total_topic_score(ng, ngt, ngt2, dict, lm, weight);
+			global_score = pow(10.0,global_score);
+			mod_topic_pr = it->second/global_score;
+			mod_topic_weights.insert(make_pair(it->first,mod_topic_pr));
+		}
+	}
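+	
+	// Both modify_context_map variants rescale each a-priori topic weight by
+	// the total score mass computed above, i.e. w'(k) = w(k) / Z(h,k);
+	// intuitively, topics whose scores are uniformly inflated across the
+	// whole vocabulary are scaled down accordingly.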
+	
+	
+	double ContextSimilarity::context_similarity(string_vec_t& text, topic_map_t& topic_weights)
+	{
+#ifdef SOLUTION_1
+		return context_similarity_solution1(text, topic_weights);
+#elif SOLUTION_2
+		return context_similarity_solution2(text, topic_weights);
+#else
+		UNUSED(text);
+		UNUSED(topic_weights);
+		VERBOSE(3, "This solution type is not defined; forced to default solution 1" << std::endl);
+		return context_similarity_solution1(text, topic_weights);
+//		exit(IRSTLM_CMD_ERROR_GENERIC);
+#endif
+	}
+	
+	//return the log10 of the similarity score
+	double ContextSimilarity::context_similarity_solution1(string_vec_t& text, topic_map_t& topic_weights)
+	{
+		VERBOSE(2, "double ContextSimilarity::context_similarity_solution1(string_vec_t& text, topic_map_t& topic_weights)" << std::endl);
+		double ret_log10_pr = 0.0;
+		
+		if (!m_active){
+			//similarity scores are disabled
+			//return an uninformative score (log(1.0) = 0.0)
+			ret_log10_pr = 0.0;
+		}
+		else if (topic_weights.size() == 0){
+			//the a-priori topic distribution is empty, i.e. there is no score for any topic
+			//return an uninformative score (log(1.0) = 0.0)
+			ret_log10_pr = 0.0;
+		}
+		else{
+			VERBOSE(3, "topic_weights.size():" << topic_weights.size() << std::endl);
+				
+			ngramtable* current_ngt;
+			ngramtable* current_ngt2;
+			
+			if (text.size()==1){
+				current_ngt = m_wk_ngt;
+				current_ngt2 = m_k_ngt;
+			}
+			else{
+				current_ngt = m_hwk_ngt;
+				current_ngt2 = m_hk_ngt;
+			}
+			
+			ngram ng(current_ngt->getDict());
+			create_topic_ngram(text, "dummy_topic", ng); //just for initialization
+			
+			
+			if (reliable(ng, current_ngt)){
+				//this word sequence is reliable
+				
+				double ret_pr = 0.0;
+				for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
+				{
+					ngram current_ng = ng;
+					modify_topic(it->first, current_ng);
+					
+					double apriori_topic_score = it->second; //prob
+					double current_topic_score = exp(topic_score(current_ng, *current_ngt, *current_ngt2) * M_LN10); //topic_score(...) returns a log10 prob; hence exp is applied to (score * M_LN10)
+					
+					VERBOSE(3, "current_ng:|" << current_ng << "| topic:|" << it->first << "| apriori_topic_score:" << apriori_topic_score << " topic_score:" << current_topic_score << " running ret_pr:" << ret_pr << std::endl);
+					ret_pr += apriori_topic_score * current_topic_score;
+					VERBOSE(3, "CURRENT ret_pr:" << ret_pr << std::endl);
+				}
+				ret_log10_pr = log10(ret_pr);
+			}
+			else{
+				//this word sequence is not reliable enough, because the occurrences of ng are too few
+				//return an uninformative score (log10(1/K) = -log10(K))
+				ret_log10_pr = -log(m_topic_size)/M_LN10;
+//				ret_log10_pr = 0.0;
+				VERBOSE(3, "CURRENT ret_pr:" << pow(10.0,ret_log10_pr) << std::endl);
+			}
+			
+		}
+		VERBOSE(2, "ret_log10_pr:" << ret_log10_pr << std::endl);
+		return ret_log10_pr;
+	}
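+	
+	// Illustrative example for solution 1 (hypothetical topics and values):
+	// with a-priori weights {sport: 0.7, news: 0.3} and per-topic scores
+	// S(sport) = 0.4 and S(news) = 0.1, the mixture is
+	//   ret_pr = 0.7*0.4 + 0.3*0.1 = 0.31
+	// and the function returns log10(0.31) ~ -0.51.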
+	
+	//return the log10 of the similarity score
+	double ContextSimilarity::context_similarity_solution2(string_vec_t& text, topic_map_t& topic_weights)
+	{
+		return context_similarity_solution1(text, topic_weights);
+	}
+	
+	bool ContextSimilarity::reliable(ngram& ng, ngramtable* ngt)
+	{
+		VERBOSE(2, "ContextSimilarity::reliable(ngram& ng, ngramtable* ngt) ng:|" << ng << "| ng.size:" << ng.size<< "| thr:" << m_threshold_on_h << std::endl);	
+		
+		bool ret = (ngt->get(ng,ng.size,ng.size-1) && (ng.freq > m_threshold_on_h));
+		VERBOSE(3, "ng:|" << ng << "| thr:" << m_threshold_on_h << " reliable:" << ret << std::endl);
+		return ret;
+	}
+	
+	
+	//returns the scores for all topics in the topic model (without the a-priori topic prob)
+	void ContextSimilarity::get_topic_scores(string_vec_t& text, topic_map_t& topic_map)
+	{
+		if (m_active){ //do nothing if similarity scores are disabled
+			ngramtable* current_ngt;
+			ngramtable* current_ngt2;
+			
+			if (text.size()==1){
+				current_ngt = m_wk_ngt;
+				current_ngt2 = m_k_ngt;
+			}
+			else{
+				current_ngt = m_hwk_ngt;
+				current_ngt2 = m_hk_ngt;
+			}
+			
+			ngram ng(current_ngt->getDict());
+			create_topic_ngram(text, "dummy_topic", ng); //just for initialization
+			
+			get_topic_scores(ng, *current_ngt, *current_ngt2, topic_map);
+		}
+	}	
+	
+	
+	//returns the scores for all topics in the topic model (without the a-priori topic prob)
+	void ContextSimilarity::get_topic_scores(ngram& ng, ngramtable& ngt, ngramtable& ngt2, topic_map_t& topic_map)
+	{		
+		if (m_active){ //do nothing if similarity scores are disabled
+			for (int i=0; i<m_k_ngt->getDict()->size();++i)
+			{
+				std::string _topic = m_k_ngt->getDict()->decode(i);
+				modify_topic(_topic, ng);
+				topic_map[_topic] = pow(10.0,topic_score(ng, ngt, ngt2));
+			}
+		}
+	}	
+	
+}//namespace irstlm
diff --git a/src/context-similarity.h b/src/context-similarity.h
new file mode 100644
index 0000000..5e67553
--- /dev/null
+++ b/src/context-similarity.h
@@ -0,0 +1,138 @@
+// $Id: context-similarity.h 3686 2010-10-15 11:55:32Z bertoldi $
+
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+ 
+ ******************************************************************************/
+
+#ifndef MF_CONTEXTSIMILARITY_H
+#define MF_CONTEXTSIMILARITY_H
+
+#include <stdio.h>
+#include <cstdlib>
+#include <stdlib.h>
+#include <string>
+#include <math.h>
+#include "cmd.h"
+#include "util.h"
+#include "dictionary.h"
+#include "n_gram.h"
+#include "ngramtable.h"
+#include "lmContainer.h"
+
+class ngram;
+
+namespace irstlm {
+#define topic_map_delimiter1 ':'
+#define topic_map_delimiter2 ','
+#define SIMILARITY_LOWER_BOUND -10000
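+
+// The two delimiters above suggest a serialized topic distribution of the
+// form topic1:weight1,topic2:weight2,... (e.g. "sport:0.7,news:0.3";
+// hypothetical names and values, shown only to illustrate the format).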
+	
+	class ContextSimilarity
+	{
+	private:
+		ngramtable* m_hwk_ngt; // counts(h, w, topic)
+		ngramtable* m_hk_ngt; // counts(h, topic)
+		ngramtable* m_wk_ngt; // counts(w, topic)
+		ngramtable* m_k_ngt; // counts(topic)
+		int m_k_order; //order of m_k_ngt
+		int m_hk_order; //order of m_hk_ngt
+		int m_wk_order; //order of m_wk_ngt
+		int m_hwk_order; //order of m_hwk_ngt
+	
+		int m_topic_size; //number of topics in the model
+		
+		topic_map_t topic_map; 
+		int m_threshold_on_h; //frequency threshold on h to allow computation of similarity scores
+		double m_smoothing; //smoothing value added to the counts to avoid zero probabilities; implements a sort of shift-beta smoothing
+
+		//flag for enabling/disabling context_similarity scores
+		// if disabled, context_similarity is 0.0 and topic_scores distribution is empty
+		bool m_active;
+		
+		void create_ngram(const string_vec_t& text, ngram& ng);
+		void add_topic(const std::string& topic, ngram& ng);
+		void modify_topic(const std::string& topic, ngram& ng);
+		void create_topic_ngram(const string_vec_t& text, const std::string& topic, ngram& ng);
+	
+		void get_counts(ngram& ng, ngramtable& ngt, double& c_xk, double& c_x);
+		
+		double topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2);
+		double topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2);
+		double topic_score_option0(ngram& ng, ngramtable& ngt, ngramtable& ngt2);
+		double topic_score_option1(ngram& ng, ngramtable& ngt, ngramtable& ngt2);
+		double topic_score_option2(ngram& ng, ngramtable& ngt, ngramtable& ngt2);
+		double topic_score_option3(ngram& ng, ngramtable& ngt, ngramtable& ngt2);
+		
+		double total_topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2, dictionary& dict);
+		double total_topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight);
+		double total_topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict);
+		double total_topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight);
+		
+		
+		
+		void modify_context_map(string_vec_t& text, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, topic_map_t& topic_weights, topic_map_t& mod_topic_weights);
+		void modify_context_map(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, topic_map_t& topic_weights, topic_map_t& mod_topic_weights);
+		void modify_context_map(string_vec_t& text, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight, topic_map_t& topic_weights, topic_map_t& mod_topic_weights);
+		void modify_context_map(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight, topic_map_t& topic_weights, topic_map_t& mod_topic_weights);
+		
+		double context_similarity_solution1(string_vec_t& text, topic_map_t& topic_weights);
+		double context_similarity_solution2(string_vec_t& text, topic_map_t& topic_weights);
+		
+		bool reliable(ngram& ng, ngramtable* ngt);
+		
+	public:
+		ContextSimilarity(const std::string &dictfile, const std::string &num_modelfile, const std::string &den_modelfile);
+		~ContextSimilarity();
+		
+		void get_topic_scores(string_vec_t& text, topic_map_t& topic_map);
+		void get_topic_scores(ngram& ng, ngramtable& ngt, ngramtable& ngt2, topic_map_t& topic_map);
+		
+		void add_topic_scores(topic_map_t& map, topic_map_t& tmp_map);
+		void print_topic_scores(topic_map_t& map);
+		void print_topic_scores(topic_map_t& map, topic_map_t& refmap, double len);
+		double DeltaCrossEntropy(topic_map_t& topic_map, topic_map_t& tmp_map, double len);
+
+		void normalize_topic_scores(topic_map_t& map);
+		
+		double context_similarity(string_vec_t& text, topic_map_t& topic_weights);
+		
+		int get_Threshold_on_H(){
+			return  m_threshold_on_h;
+		}
+		void set_Threshold_on_H(int val){
+			m_threshold_on_h = val;
+		}
+		double get_SmoothingValue(){
+			return  m_smoothing;
+		}
+		void set_SmoothingValue(double val){
+			m_smoothing = val;
+		}
+		bool is_Active(){
+			return  m_active;
+		}
+		void set_Active(bool val){
+			m_active = val;
+		}
+		
+	};
+}
+
+
+#endif
+
diff --git a/src/lmContainer.cpp b/src/lmContainer.cpp
index afdd77c..654a064 100644
--- a/src/lmContainer.cpp
+++ b/src/lmContainer.cpp
@@ -32,6 +32,7 @@
 #include "lmmacro.h"
 #include "lmclass.h"
 #include "lmInterpolation.h"
+#include "lmContextDependent.h"
 
 using namespace std;
 
@@ -94,6 +95,8 @@ namespace irstlm {
 		VERBOSE(1,"type: " << type << std::endl);
 		if (header == "lmminterpolation" || header == "LMINTERPOLATION") {
 			type = _IRSTLM_LMINTERPOLATION;
+		} else if (header == "lmcontextdependent" || header == "LMCONTEXTDEPENDENT") {
+			type = _IRSTLM_LMCONTEXTDEPENDENT;
 		} else if (header == "lmmacro" || header == "LMMACRO") {
 			type = _IRSTLM_LMMACRO;
 		} else if (header == "lmclass" || header == "LMCLASS") {
@@ -142,6 +145,11 @@ namespace irstlm {
 				VERBOSE(1,"_IRSTLM_LMINTERPOLATION" << std::endl);
 				lm = new lmInterpolation(nlf, dlf);
 				break;
+
+			case _IRSTLM_LMCONTEXTDEPENDENT:
+				VERBOSE(1,"_IRSTLM_LMCONTEXTDEPENDENT" << std::endl);
+				lm = new lmContextDependent(nlf, dlf);
+				break;
 				
 			default:
 				VERBOSE(1,"UNKNOWN" << std::endl);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git


