[irstlm] 48/126: restructuring of code using ngramtable instead of lmtable for computing context-dependent scores

Giulio Paci giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:44 UTC 2016


This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.

commit 3d9b78b2b585176a020a23d5f32f83ae5b2bfd67
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date:   Fri Aug 7 13:43:22 2015 +0200

    restructuring of code using ngramtable instead of lmtable for computing context-dependent scores
---
 src/context-similarity.cpp | 242 +++++++++++++++++++++++++++++----------------
 src/context-similarity.h   |  54 ++++++++--
 src/lmContextDependent.cpp |  75 +++++++-------
 src/lmContextDependent.h   |   8 +-
 4 files changed, 249 insertions(+), 130 deletions(-)

diff --git a/src/context-similarity.cpp b/src/context-similarity.cpp
index 0c84501..a474d72 100644
--- a/src/context-similarity.cpp
+++ b/src/context-similarity.cpp
@@ -26,6 +26,7 @@
 #include <sstream>
 #include <stdexcept>
 #include <string>
+#include "ngramtable.h"
 #include "lmContainer.h"
 #include "context-similarity.h"
 #include "util.h"
@@ -40,92 +41,99 @@ inline void error(const char* message)
 }
 
 namespace irstlm {
-	ContextSimilarity::ContextSimilarity(const std::string &dictfile, const std::string &num_modelfile, const std::string &den_modelfile)
+	ContextSimilarity::ContextSimilarity(const std::string &k_modelfile, const std::string &hk_modelfile, const std::string &hwk_modelfile)
 	{
-		m_num_lm=lmContainer::CreateLanguageModel(num_modelfile);
-		m_den_lm=lmContainer::CreateLanguageModel(num_modelfile);
+		m_hwk_order=3;
+		m_hk_order=2;
+		m_k_order=1;
+		m_hwk_ngt=new ngramtable((char*) hwk_modelfile.c_str(), m_hwk_order, NULL,NULL,NULL);
+		m_hk_ngt=new ngramtable((char*) hk_modelfile.c_str(), m_hk_order, NULL,NULL,NULL);
+		m_k_ngt=new ngramtable((char*) k_modelfile.c_str(), m_k_order, NULL,NULL,NULL);
 		
-		m_num_lm->load(num_modelfile);
-		m_den_lm->load(den_modelfile);
+		m_smoothing = 0.001;
+		m_threshold_on_h = 0;
+		m_active=true;
 		
-		m_num_lm->getDict()->genoovcode();
-		m_den_lm->getDict()->genoovcode();
-		
-		//loading form file		
-		std::string str;
-		
-		mfstream inp(dictfile.c_str(),ios::in);
-		
-		if (!inp) {
-			std::stringstream ss_msg;
-			ss_msg << "cannot open " << dictfile << "\n";
-			exit_error(IRSTLM_ERROR_IO, ss_msg.str());
-		}
-		VERBOSE(0, "Loading the list of topic" << std::endl);
-		
-		while (inp >> str)
-		{
-			m_lm_topic_dict.insert(str);
-		}
-		VERBOSE(0, "There are " << m_lm_topic_dict.size() << " topic" << std::endl);
+		m_topic_size = m_k_ngt->getDict()->size();
+		VERBOSE(1, "There are " << m_topic_size << " topics in the model" << std::endl);
 	}
 
 	
 	ContextSimilarity::~ContextSimilarity()
-	{}
+	{
+		delete m_hwk_ngt;
+		delete m_hk_ngt;
+	}
 	
 	//return the log10 of the similarity score
-	double ContextSimilarity::score(string_vec_t& text, topic_map_t& topic_weights)
+	double ContextSimilarity::get_context_similarity(string_vec_t& text, topic_map_t& topic_weights)
 	{
-		VERBOSE(4, "double ContextSimilarity::score(string_vec_t& text, topic_map_t& topic_weights)" << std::endl);
+		VERBOSE(2, "double ContextSimilarity::get_context_similarity(string_vec_t& text, topic_map_t& topic_weights)" << std::endl);
 		double ret_log10_pr;
 		
-		if (topic_weights.size() > 0){
+		if (!m_active){ //similarity score is disabled
+			ret_log10_pr = 0.0;
+		}else if (m_topic_size == 0){
+			//a-priori topic distribution is "empty", i.e. there is no score for any topic
+			//return an uninformative score (0.0)
+			ret_log10_pr = 0.0;
+		} else{
+			VERBOSE(3, "topic_weights.size():" << topic_weights.size() << std::endl);
+			ngram base_num_ng(m_hwk_ngt->getDict());
+			ngram base_den_ng(m_hk_ngt->getDict());
 			
-			ngram base_num_ng(m_num_lm->getDict());
-			ngram base_den_ng(m_den_lm->getDict());
-			create_ngram(text, base_num_ng, base_den_ng);
 			
-			for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
-			{
-				ngram num_ng = base_num_ng;
-				ngram den_ng = base_den_ng;
-				add_topic(it->first, num_ng, den_ng);
-				double apriori_topic_score = log10(it->second);
-				double topic_score = get_topic_similarity(num_ng, den_ng); //log10-prob
+			create_ngram(text, base_num_ng, base_den_ng);
+			if (den_reliable(base_den_ng)){ //we do not know about the reliability of the denominator
 				
-				VERBOSE(3, "topic:|" << it->first  << "apriori_topic_score:" << apriori_topic_score << " topic_score:" << topic_score << std::endl);
-				if (it == topic_weights.begin()){
-					ret_log10_pr = apriori_topic_score + topic_score;
-				}else{
-					ret_log10_pr = logsum(ret_log10_pr, apriori_topic_score + topic_score)/M_LN10;
+				for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
+				{
+					ngram num_ng = base_num_ng;
+					ngram den_ng = base_den_ng;
+					add_topic(it->first, num_ng, den_ng);
+					
+					double apriori_topic_score = log10(it->second); //log10-prob
+					double topic_score = get_topic_similarity(num_ng, den_ng); //log10-prob
+					
+					VERBOSE(3, "topic:|" << it->first  << "| apriori_topic_score:" << apriori_topic_score << " topic_score:" << topic_score << std::endl);
+					if (it == topic_weights.begin()){
+						ret_log10_pr = apriori_topic_score + topic_score;
+					}else{
+						ret_log10_pr = logsum(ret_log10_pr, apriori_topic_score + topic_score)/M_LN10;
+					}
+					VERBOSE(3, "CURRENT ret_log10_pr:" << ret_log10_pr << std::endl);
 				}
-				VERBOSE(4, "CURRENT ret_log10_pr:" << ret_log10_pr << std::endl);
+			}else{
+				//the similarity score is not reliable enough, because occurrences of base_den_ng are too few 
+				//we assume that counts for base_num_ng are also unreliable
+				//return an uninformative score (0.0)
+				ret_log10_pr = 0.0;
 			}
-		}else{
-			//a-priori topic distribution is "empty", i.e. there is nore score for any topic
-			//return a "constant" lower-bound score,  SIMILARITY_LOWER_BOUND = log(0.0)
-			ret_log10_pr = SIMILARITY_LOWER_BOUND;
 		}
 		
-		VERBOSE(3, "ret_log10_pr:" << ret_log10_pr << std::endl);
+		VERBOSE(2, "ret_log10_pr:" << ret_log10_pr << std::endl);
 		return ret_log10_pr;
 	}
-	
+
 	//returns the scores for all topics in the topic models (without apriori topic prob)
 	void ContextSimilarity::get_topic_scores(topic_map_t& topic_map, string_vec_t& text)
-	{		
-		ngram base_num_ng(m_num_lm->getDict());
-		ngram base_den_ng(m_den_lm->getDict());
+	{				
+		ngram base_num_ng(m_hwk_ngt->getDict());
+		ngram base_den_ng(m_hk_ngt->getDict());
 		create_ngram(text, base_num_ng, base_den_ng);
 		
-		for (topic_dict_t::iterator it=m_lm_topic_dict.begin(); it != m_lm_topic_dict.end(); ++it)
-		{
-			ngram num_ng = base_num_ng;
-			ngram den_ng = base_den_ng;
-			add_topic(*it, num_ng, den_ng);
-			topic_map[*it] = get_topic_similarity(num_ng, den_ng);
+		
+		if (m_active){ //similarity score is enabled
+			for (int i=0; i<m_k_ngt->getDict()->size();++i)
+			{
+				ngram num_ng = base_num_ng;
+				ngram den_ng = base_den_ng;
+				std::string _topic = m_k_ngt->getDict()->decode(i);
+				add_topic(_topic, num_ng, den_ng);
+				topic_map[_topic] = get_topic_similarity(num_ng, den_ng);
+			}
 		}
+			
 	}
 	
 	
@@ -168,26 +176,21 @@ namespace irstlm {
 	void ContextSimilarity::create_ngram(const string_vec_t& text, ngram& num_ng, ngram& den_ng)
 	{
 		//text is a vector of strings with w in the last position and the history in the previous positions
-		//text must have at least two words
-		VERBOSE(3,"void ContextSimilarity::create_ngram" << std::endl);
+		//text must have at least one word
+		//if text has two word, further computation will rely on normal counts, i.e. counts(h,w,k), counts(h,w), counts(h,k), counts(k)
+		//if text has only one word, further computation will rely on lower-order counts, i.e. (w,k), counts(w), counts(k), counts()
+		VERBOSE(2,"void ContextSimilarity::create_ngram" << std::endl);
 
-		//TO_CHECK: what happens when text has zero element
-		//		if (text.size()==0)
-		
-		//TO_CHECK: what happens when text has just one element
+		MY_ASSERT(text.size()==0);
 		
-		
-		
-		// lm model for the numerator is assumed to be a 3-gram lm, hence num_gr have only size 3 (two words and one topic); here we insert two words
 		if (text.size()==1){
-			num_ng.pushw(num_ng.dict->OOV());
+			//all further computation will rely on lower-order counts
+			num_ng.pushw(text.at(text.size()-1));
 		}else {
 			num_ng.pushw(text.at(text.size()-2));
+			num_ng.pushw(text.at(text.size()-1));
+			den_ng.pushw(text.at(text.size()-2));
 		}
-		num_ng.pushw(text.at(text.size()-1));
-		
-		// lm model for the denominator is assumed to be a 2-gram lm, hence den_gr have only size 2 (one word and one topic); here we insert one word
-		den_ng.pushw(text.at(text.size()-1));
 	}
 	
 	void ContextSimilarity::create_topic_ngram(const string_vec_t& text, const std::string& topic, ngram& num_ng, ngram& den_ng)
@@ -206,8 +209,8 @@ namespace irstlm {
 	
 	double ContextSimilarity::get_topic_similarity(string_vec_t text, const std::string& topic)
 	{
-		ngram num_ng(m_num_lm->getDict());
-		ngram den_ng(m_den_lm->getDict());
+		ngram num_ng(m_hwk_ngt->getDict());
+		ngram den_ng(m_hk_ngt->getDict());
 		
 		create_topic_ngram(text, topic, num_ng, den_ng);
 		
@@ -215,13 +218,86 @@ namespace irstlm {
 	}
 	
 	double ContextSimilarity::get_topic_similarity(ngram& num_ng, ngram& den_ng)
-	{	
-		double num_pr=m_num_lm->clprob(num_ng);
-		double den_pr=m_den_lm->clprob(den_ng);
-	 VERBOSE(4, "num_ng:|" << num_ng << "| pr:" << num_pr << std::endl);
-	 VERBOSE(4, "den_ng:|" << den_ng << "| pr:" << den_pr << std::endl);
-		return num_pr - den_pr;
-		//		return m_lm->clprob(num_ng) - m_lm->clprob(den_ng);
+	{			
+		VERBOSE(2, "double ContextSimilarity::get_topic_similarity(ngram& num_ng, ngram& den_ng) with  num_ng:|" << num_ng << "| den_ng:|" << den_ng << "|" << std::endl);
+		
+		double num_log_pr, den_log_pr;
+		
+		double c_hk=m_smoothing, c_h=m_smoothing * m_topic_size;
+		double c_hwk=m_smoothing, c_hw=m_smoothing * m_topic_size;
+		
+		if (den_ng.size == m_hk_order){//we rely on counts(h,k) and counts(h)
+			if (m_hk_ngt->get(den_ng)) {	c_hk += den_ng.freq; }
+			if (m_hk_ngt->get(den_ng,2,1)) { c_h += den_ng.freq; }
+		}else{//we actually rely on counts(k) and counts()
+			/*
+			 if (m_k_ngt->get(den_ng)) {	c_hk += den_ng.freq; }
+			 c_h += m_hk_ngt->getDict()->totfreq();
+			 */
+			c_hk += m_hk_ngt->getDict()->freq(*(den_ng.wordp(1)));
+			c_h += m_k_ngt->getDict()->totfreq();
+		}
+		den_log_pr = log10(c_hk) - log10(c_h);
+		VERBOSE(3, "c_hk:" << c_hk << " c_h:" << c_h << std::endl);
+		
+		
+		if (num_reliable(num_ng)){
+			if (num_ng.size == m_hwk_order){ //we rely on counts(h,w,k) and counts(h,w)
+				if (m_hwk_ngt->get(num_ng)) {	c_hwk += num_ng.freq; }
+				if (m_hwk_ngt->get(num_ng,3,2)) { c_hw += num_ng.freq; }
+			}else{ //we actually rely on counts(h,k) and counts(h)
+				if (m_hk_ngt->get(num_ng)) {	c_hwk += num_ng.freq; }
+				if (m_hk_ngt->get(num_ng,3,2)) { c_hw += num_ng.freq; }
+			}
+			num_log_pr = log10(c_hwk) - log10(c_hw);
+			VERBOSE(3, "c_hwk:" << c_hwk << " c_hw:" << c_hw << std::endl);
+		}else{
+			num_log_pr = -log10(m_topic_size);
+		}
+		
+		VERBOSE(3, "num_log_pr:" << num_log_pr << " den_log_pr:" << den_log_pr << std::endl);
+		return num_log_pr - den_log_pr;
+	}
+	
+	bool ContextSimilarity::num_reliable(ngram& num_ng)
+	{
+		VERBOSE(2, "ContextSimilarity::num_reliable(ngram& num_ng) num_ng:|" << num_ng << "| thr:" << m_threshold_on_h << "|" << std::endl);		
+		if (num_ng.size < 2){
+			//num_ng has size lower than expected (2)
+			//in this case we will rely on counts(h, topic) instead of counts(h, w, topic)
+			VERBOSE(3, "num_ng:|" << num_ng << "| has size lower than expected (2) TRUE" << std::endl);
+			return true;
+		}
+		if (m_hwk_ngt->get(num_ng,3,2) && (num_ng.freq > m_threshold_on_h)){
+			VERBOSE(3, "num_ng:|" << num_ng << "| thr:" << m_threshold_on_h << " TRUE" << std::endl);
+			return true;
+		}else{
+			VERBOSE(3, "num_ng:|" << num_ng << "| thr:" << m_threshold_on_h << " FALSE" << std::endl);
+			return false;
+		}
+	}
+	
+	
+	bool ContextSimilarity::den_reliable(ngram& den_ng)
+	{
+		VERBOSE(2, "ContextSimilarity::den_reliable(ngram& den_ng) den_ng:|" << den_ng << "| thr:" << m_threshold_on_h << "|" << std::endl);
+
+		if (den_ng.size < 1){
+			//den_ng has size lower than expected (1)
+			//in this case we will rely on counts(topic) instead of counts(h, topic)
+			VERBOSE(3, "den_ng:|" << den_ng << "| has size lower than expected (1) TRUE" << std::endl);
+			return true;
+		}
+		den_ng.pushc(0);
+		if (m_hk_ngt->get(den_ng,2,1) && (den_ng.freq > m_threshold_on_h)){
+			den_ng.shift();
+			VERBOSE(3, "den_ng:|" << den_ng << "| thr:" << m_threshold_on_h << " TRUE" << std::endl);
+			return true;
+		}else{
+			den_ng.shift();
+			VERBOSE(3, "den_ng:|" << den_ng << "| thr:" << m_threshold_on_h << " FALSE" << std::endl);
+			return false;
+		}
 	}
 	
 }//namespace irstlm
diff --git a/src/context-similarity.h b/src/context-similarity.h
index d646fb6..32324c6 100644
--- a/src/context-similarity.h
+++ b/src/context-similarity.h
@@ -28,30 +28,41 @@
 #include <stdlib.h>
 #include <string>
 #include <math.h>
-#include <set>
 #include "util.h"
 #include "dictionary.h"
 #include "n_gram.h"
+#include "ngramtable.h"
 #include "lmContainer.h"
 
 class ngram;
 
 namespace irstlm {
-	#define topic_map_delimiter1 ':'
-	#define topic_map_delimiter2 ','
-	#define SIMILARITY_LOWER_BOUND -10000
+#define topic_map_delimiter1 ':'
+#define topic_map_delimiter2 ','
+#define SIMILARITY_LOWER_BOUND -10000
 	
 	typedef std::map< std::string, float > topic_map_t;
-	typedef std::set< std::string > topic_dict_t;
 	
-
+	
 	class ContextSimilarity
 	{
 	private:
-		lmContainer* m_num_lm; // P(topic | h' w)
-		lmContainer* m_den_lm; // P(topic | h')
-		topic_dict_t m_lm_topic_dict; //the dictionary of the topics seen in the language model
+		ngramtable* m_hwk_ngt; // counts(h, w, topic)
+		ngramtable* m_hk_ngt; // counts(h, topic)
+		ngramtable* m_k_ngt; // counts(topic)
+		int m_k_order; //order of m_k_ngt
+		int m_hk_order; //order of m_hk_ngt
+		int m_hwk_order; //order of m_hwk_ngt
+	
+		int m_topic_size; //number of topics in the model
+		
 		topic_map_t topic_map; 
+		int m_threshold_on_h; //frequency threshold on h to allow computation of similarity scores
+		double m_smoothing; //smoothing value to sum to the counts to avoid zero-prob; implements a sort of shift-beta smoothing
+
+		//flag for enabling/disabling context_similarity scores
+		// if disabled, context_similarity is 0.0 and topic_scores distribution is empty
+		bool m_active;
 		
 		void create_ngram(const string_vec_t& text, ngram& num_ng, ngram& den_ng);
 		void add_topic(const std::string& topic, ngram& num_ng, ngram& den_ng);
@@ -60,6 +71,9 @@ namespace irstlm {
 		double get_topic_similarity(string_vec_t text, const std::string& topic);
 		double get_topic_similarity(ngram& num_ng, ngram& den_ng);
 		
+		bool num_reliable(ngram& num_ng);
+		bool den_reliable(ngram& den_ng);
+		
 	public:
 		ContextSimilarity(const std::string &dictfile, const std::string &num_modelfile, const std::string &den_modelfile);
 		~ContextSimilarity();
@@ -69,7 +83,27 @@ namespace irstlm {
 		void add_topic_scores(topic_map_t& map, topic_map_t& tmp_map);
 		void print_topic_scores(topic_map_t& map);
 		
-		double score(string_vec_t& text, topic_map_t& topic_weights);
+		double get_context_similarity(string_vec_t& text, topic_map_t& topic_weights);
+		
+		int get_Threshold_on_H(){
+			return  m_threshold_on_h;
+		}
+		void set_Threshold_on_H(int val){
+			m_threshold_on_h = val;
+		}
+		double get_SmoothingValue(){
+			return  m_smoothing;
+		}
+		void set_SmoothingValue(double val){
+			m_smoothing = val;
+		}
+		bool is_Active(){
+			return  m_active;
+		}
+		void set_Active(bool val){
+			m_active = val;
+		}
+		
 	};
 }
 
diff --git a/src/lmContextDependent.cpp b/src/lmContextDependent.cpp
index ebd6ed7..83d23a0 100644
--- a/src/lmContextDependent.cpp
+++ b/src/lmContextDependent.cpp
@@ -61,7 +61,7 @@ namespace irstlm {
 	void lmContextDependent::load(const std::string &filename,int mmap)
 	{
 		VERBOSE(2,"lmContextDependent::load(const std::string &filename,int memmap)" << std::endl);
-		VERBOSE(2," filename:|" << filename << "|" << std::endl);
+		VERBOSE(2,"configuration file:|" << filename << "|" << std::endl);
 		
 		dictionary_upperbound=1000000;
 		int memmap=mmap;
@@ -71,22 +71,22 @@ namespace irstlm {
 		VERBOSE(0, "filename:|" << filename << "|" << std::endl);
 		
 		char line[MAX_LINE];
-		const char* words[LMCONFIGURE_MAX_TOKEN];
+		const char* words[LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN];
 		int tokenN;
 		inp.getline(line,MAX_LINE,'\n');
-		tokenN = parseWords(line,words,LMCONFIGURE_MAX_TOKEN);
+		tokenN = parseWords(line,words,LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN);
 		
 		if (tokenN != 1 || ((strcmp(words[0],"LMCONTEXTDEPENDENT") != 0) && (strcmp(words[0],"lmcontextdependent")!=0)))
-			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_num_model topic_nden_model");
+			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold");
 		
 		//reading ngram-based LM
 		inp.getline(line,BUFSIZ,'\n');
 		tokenN = parseWords(line,words,1);
 		if(tokenN < 1 || tokenN > 1) {
-			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict  topic_num_model topic_nden_model");
+			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold");
 		}
 		
-		VERBOSE(0, "modelfile:|" << words[0] << "|" << std::endl);
+		VERBOSE(0, "model_w:|" << words[0] << "|" << std::endl);
 		//checking the language model type
 		m_lm=lmContainer::CreateLanguageModel(words[0],ngramcache_load_factor, dictionary_load_factor);
 		
@@ -101,24 +101,31 @@ namespace irstlm {
 		
 		//reading topic model
 		inp.getline(line,BUFSIZ,'\n');
-		tokenN = parseWords(line,words,4);
+		tokenN = parseWords(line,words,LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN);
 		
-		if(tokenN < 4 || tokenN > 4) {
-			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_num_model topic_nden_model");
+		if(tokenN < 5 || tokenN > LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN) {
+			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold");
 		}
 		
 		//loading topic model and initialization
 		m_similaritymodel_weight = (float) atof(words[0]);
-		std::string _dict = words[1];
-		std::string _num_lm = words[2];
-		std::string _den_lm = words[3];
-		m_similaritymodel = new ContextSimilarity(_dict, _num_lm, _den_lm);
+		std::string _k_ngt = words[1];
+		std::string _hk_ngt = words[2];
+		std::string _hwk_ngt = words[3];
+		int _thr = atoi(words[4]);
+		double _smoothing = 0.1;
+		if (tokenN == 6){ _smoothing = atof(words[5]); }
+		m_similaritymodel = new ContextSimilarity(_k_ngt, _hk_ngt, _hwk_ngt);
+		m_similaritymodel->set_Threshold_on_H(_thr);
+		m_similaritymodel->set_SmoothingValue(_smoothing);
 		
 		inp.close();
 		
-		VERBOSE(0, "topic_dict:|" << _dict << "|" << std::endl);
-		VERBOSE(0, "topic_num_model:|" << _num_lm << "|" << std::endl);
-		VERBOSE(0, "topic_den_model:|" << _den_lm << "|" << std::endl);
+		VERBOSE(0, "model_k:|" << _k_ngt << "|" << std::endl);
+		VERBOSE(0, "model_hk:|" << _hk_ngt << "|" << std::endl);
+		VERBOSE(0, "model_hwk:|" << _hwk_ngt << "|" << std::endl);
+		VERBOSE(0, "topic_threshold_on_h:|" << m_similaritymodel->get_Threshold_on_H() << "|" << std::endl);
+		VERBOSE(0, "shift-beta smoothing on counts:|" << m_similaritymodel->get_SmoothingValue() << "|" << std::endl);
 	}
 
 	void lmContextDependent::GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line)
@@ -127,9 +134,6 @@ namespace irstlm {
 		if (pos != std::string::npos){ // context_delimiter is found
 			sentence = line.substr(0, pos);
 			line.erase(0, pos + context_delimiter.length());
-			VERBOSE(0,"pos:|" << pos << "|" << std::endl);	
-			VERBOSE(0,"sentence:|" << sentence << "|" << std::endl);	
-			VERBOSE(0,"line:|" << line << "|" << std::endl);	
 			
 			//getting context string;
 			context = line;
@@ -137,38 +141,41 @@ namespace irstlm {
 			sentence = line;
 			context = "";
 		}	
+		VERBOSE(1,"line:|" << line << "|" << std::endl);
+		VERBOSE(1,"sentence:|" << sentence << "|" << std::endl);	
+		VERBOSE(1,"context:|" << context << "|" << std::endl);	
 	}
 
 	double lmContextDependent::lprob(ngram ng, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
 	{
-		string_vec_t text;   // replace with the text passed as parameter
-		double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
-		double similarity_score = m_similaritymodel->score(text, topic_weights);
-		double ret_logprob = lm_logprob;
-		if (similarity_score != SIMILARITY_LOWER_BOUND){
-			ret_logprob += m_similaritymodel_weight * similarity_score;
+		VERBOSE(2,"lmContextDependent::lprob(ngram ng, topic_map_t& topic_weights, ...)" << std::endl);
+		string_vec_t text;
+		if (ng.size>1){
+			text.push_back(ng.dict->decode(*ng.wordp(2)));
 		}
-		VERBOSE(0, "lm_log10_pr:" << lm_logprob << " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_log10_pr:" << ret_logprob << std::endl);
+		text.push_back(ng.dict->decode(*ng.wordp(1)));
+		double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
+		double similarity_score = m_similaritymodel->get_context_similarity(text, topic_weights);
+		double ret_logprob = lm_logprob + m_similaritymodel_weight * similarity_score;
+		VERBOSE(3, "lm_log10_pr:" << lm_logprob << " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_log10_pr:" << ret_logprob << std::endl);
 		
 		return ret_logprob;
 	}
 	
 	double lmContextDependent::lprob(string_vec_t& text, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
 	{
-		VERBOSE(0,"lmContextDependent::lprob(string_vec_t& text, topic_map_t& topic_weights, " << std::endl);
+		VERBOSE(2,"lmContextDependent::lprob(string_vec_t& text, topic_map_t& topic_weights, ...)" << std::endl);
+
 		//create the actual ngram
 		ngram ng(dict);
 		ng.pushw(text);
-		VERBOSE(0,"ng:|" << ng << "|" << std::endl);		
+		VERBOSE(3,"ng:|" << ng << "|" << std::endl);		
 		
 		MY_ASSERT (ng.size == (int) text.size());
 		double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
-		double similarity_score = m_similaritymodel->score(text, topic_weights);
-		double ret_logprob = lm_logprob;
-		if (similarity_score != SIMILARITY_LOWER_BOUND){
-			ret_logprob += m_similaritymodel_weight * similarity_score;
-		}
-		VERBOSE(0, "lm_log10_pr:" << lm_logprob << " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_log10_pr:" << ret_logprob << std::endl);
+		double similarity_score = m_similaritymodel->get_context_similarity(text, topic_weights);
+		double ret_logprob = lm_logprob + m_similaritymodel_weight * similarity_score;
+		VERBOSE(3, "lm_log10_pr:" << lm_logprob << " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_log10_pr:" << ret_logprob << std::endl);
 		
 		return ret_logprob;
 	}
diff --git a/src/lmContextDependent.h b/src/lmContextDependent.h
index 7b0c7a5..f6b6e85 100644
--- a/src/lmContextDependent.h
+++ b/src/lmContextDependent.h
@@ -60,7 +60,7 @@ namespace irstlm {
 	 and a bigram-based topic model 
 	 */
 	
-#define LMCONFIGURE_MAX_TOKEN 3
+#define LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN 6
 	
 	static const std::string context_delimiter="___CONTEXT___";
 	
@@ -142,8 +142,6 @@ namespace irstlm {
 			return lprob(ng, topic_weights, bow, bol, maxsuffptr, statesize, extendible);
 		};
 		virtual double clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL){
-			
-			VERBOSE(0,"lmContainer::clprob(string_vec_t& text,...." << std::endl);
 			return lprob(text, topic_weights, bow, bol, maxsuffptr, statesize, extendible);
 		};
 		
@@ -202,6 +200,10 @@ namespace irstlm {
 		inline virtual bool is_OOV(int code) { //returns true if the word is OOV for each subLM
 			return m_lm->is_OOV(code);
 		}
+		
+		inline void set_Active(bool value){
+			m_similaritymodel->set_Active(value);
+		}
 	};
 }//namespace irstlm
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git



More information about the debian-science-commits mailing list