[irstlm] 58/126: changes to make code more modular

Giulio Paci giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:45 UTC 2016


This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.

commit 9c820729c4062fa204ecb6f4fae643217e582933
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date:   Wed Sep 9 17:06:53 2015 +0200

    changes to make code more modular
---
 src/context-similarity.cpp | 565 +++++++++++++++++++++++++++++++++------------
 src/context-similarity.h   |  50 +++-
 src/lmContextDependent.cpp |  56 ++++-
 src/lmContextDependent.h   |  18 +-
 4 files changed, 514 insertions(+), 175 deletions(-)

diff --git a/src/context-similarity.cpp b/src/context-similarity.cpp
index 6a72b6f..e0b0ea1 100644
--- a/src/context-similarity.cpp
+++ b/src/context-similarity.cpp
@@ -45,9 +45,11 @@ namespace irstlm {
 	{
 		m_hwk_order=3;
 		m_hk_order=2;
+		m_wk_order=m_hk_order;
 		m_k_order=1;
 		m_hwk_ngt=new ngramtable((char*) hwk_modelfile.c_str(), m_hwk_order, NULL,NULL,NULL);
 		m_hk_ngt=new ngramtable((char*) hk_modelfile.c_str(), m_hk_order, NULL,NULL,NULL);
+		m_wk_ngt=m_hk_ngt; //just a link to m_hk_ngt
 		m_k_ngt=new ngramtable((char*) k_modelfile.c_str(), m_k_order, NULL,NULL,NULL);
 		
 		m_smoothing = 0.001;
@@ -64,86 +66,55 @@ namespace irstlm {
 #endif
 		
 	}
-
+	
 	
 	ContextSimilarity::~ContextSimilarity()
 	{
 		delete m_hwk_ngt;
 		delete m_hk_ngt;
+		//delete m_wk_ngt;  //it is just a link to m_hk_ngt
+		delete m_k_ngt;
 	}
 	
-	//return the log10 of the similarity score
-	double ContextSimilarity::get_context_similarity(string_vec_t& text, topic_map_t& topic_weights)
-	{
-		VERBOSE(2, "double ContextSimilarity::get_context_similarity(string_vec_t& text, topic_map_t& topic_weights)" << std::endl);
-		double ret_log10_pr;
-		
-		if (!m_active){ //similarity score is disable
-			ret_log10_pr = 0.0;
-		}else if (m_topic_size == 0){
-			//a-priori topic distribution is "empty", i.e. there is no score for any topic
-			//return an uninforming score (0.0)
-			ret_log10_pr = 0.0;
-		} else{
-			VERBOSE(3, "topic_weights.size():" << topic_weights.size() << std::endl);
-			ngram base_num_ng(m_hwk_ngt->getDict());
-			ngram base_den_ng(m_hk_ngt->getDict());
-			
-			
-			create_ngram(text, base_num_ng, base_den_ng);
-			if (base_reliable(base_den_ng, 2, m_hk_ngt)){ //we do not know about the reliability of the denominator
-				
-				for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
-				{
-					ngram num_ng = base_num_ng;
-					ngram den_ng = base_den_ng;
-					add_topic(it->first, num_ng, den_ng);
-					
-					double apriori_topic_score = log10(it->second); //log10-prob
-					double topic_score = get_topic_similarity(num_ng, den_ng); //log10-prob
-					
-					VERBOSE(3, "topic:|" << it->first  << "| apriori_topic_score:" << apriori_topic_score << " topic_score:" << topic_score << std::endl);
-					if (it == topic_weights.begin()){
-						ret_log10_pr = apriori_topic_score + topic_score;
-					}else{
-						ret_log10_pr = logsum(ret_log10_pr, apriori_topic_score + topic_score)/M_LN10;
-					}
-					VERBOSE(3, "CURRENT ret_log10_pr:" << ret_log10_pr << std::endl);
-				}
-			}else{
-				//the similarity score is not reliable enough, because occurrences of base_den_ng are too little 
-				//we also assume that also counts for base_num_ng are unreliable
-				//return an uninforming score (0.0)
-				ret_log10_pr = 0.0;
-			}
-		}
-		
-		VERBOSE(2, "ret_log10_pr:" << ret_log10_pr << std::endl);
-		return ret_log10_pr;
+	void ContextSimilarity::normalize_topic_scores(topic_map_t& map)
+	{	
+		UNUSED(map);
+		/* normalization type 1
+		 double max = -1000000.0;
+		 double min =  1000000.0;
+		 for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it){
+		 min = (map[it->first]<min)?map[it->first]:min;
+		 max = (map[it->first]>max)?map[it->first]:max;
+		 }
+		 for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it){
+		 map[it->first] = (map[it->first]-min)/(max-min);
+		 }
+		 VERBOSE(2,"min:"<<min << " max:" << max << std::endl);
+		 */
+		/*
+		 //normalization type 2
+		 double norm =  0.0;
+		 for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it){
+		 norm += fabs(map[it->first]);
+		 }
+		 for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it){
+		 map[it->first] = map[it->first]/norm;
+		 }
+		 VERBOSE(2,"norm:" << norm << std::endl);
+		 */
 	}
-
-	//returns the scores for all topics in the topic models (without apriori topic prob)
-	void ContextSimilarity::get_topic_scores(topic_map_t& topic_map, string_vec_t& text)
-	{				
-		ngram base_num_ng(m_hwk_ngt->getDict());
-		ngram base_den_ng(m_hk_ngt->getDict());
-		create_ngram(text, base_num_ng, base_den_ng);
-		
-		
-		if (m_active){ //similarity score is disable
-			for (int i=0; i<m_k_ngt->getDict()->size();++i)
-			{
-				ngram num_ng = base_num_ng;
-				ngram den_ng = base_den_ng;
-				std::string _topic = m_k_ngt->getDict()->decode(i);
-				add_topic(_topic, num_ng, den_ng);
-				topic_map[_topic] = get_topic_similarity(num_ng, den_ng);
-			}
+	
+	double ContextSimilarity::DeltaCrossEntropy(topic_map_t& topic_map, topic_map_t& tmp_map, double len)
+	{
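+		//computes ( sum_k topic_map[k] * tmp_map[k] ) / len, i.e. the length-normalized
+		//dot product between the reference and the current topic distributions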
+		double xDeltaEntropy = 0.0;
+		for (topic_map_t::iterator it=tmp_map.begin(); it != tmp_map.end(); ++it){
+			xDeltaEntropy += topic_map[it->first] * tmp_map[it->first];
+			//			VERBOSE(2,"topic_map[it->first]:" << topic_map[it->first] << " tmp_map[it->first]:" << tmp_map[it->first] << " product:" << topic_map[it->first] * tmp_map[it->first] << std::endl);
 		}
-			
+		//		VERBOSE(2," xDeltaEntropy:" << xDeltaEntropy << " len:" << len << " xDeltaEntropy/len:" << xDeltaEntropy/len << std::endl);
+		return xDeltaEntropy/len;
 	}
 	
-	
 	void ContextSimilarity::add_topic_scores(topic_map_t& topic_map, topic_map_t& tmp_map)
 	{
 		for (topic_map_t::iterator it=tmp_map.begin(); it != tmp_map.end(); ++it){
@@ -158,7 +129,20 @@ namespace irstlm {
 		{
 			if (it!=map.begin()) { std::cout << topic_map_delimiter1; }
 			std::cout << it->first << topic_map_delimiter2 << it->second;
+			//			std::cout << it->first << topic_map_delimiter2 << exp(it->second * M_LN10);
+		}
+		
+		std::cout << std::endl;
+	}
+	
+	void ContextSimilarity::print_topic_scores(topic_map_t& map, topic_map_t& refmap, double len)
+	{
+		for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it)
+		{
+			if (it!=map.begin()) { std::cout << topic_map_delimiter1; }
+			std::cout << it->first << topic_map_delimiter2 << it->second;
 		}
+		std::cout << " DeltaCrossEntropy:" << DeltaCrossEntropy(refmap,map,len);
 		std::cout << std::endl;
 	}
 	
@@ -180,108 +164,368 @@ namespace irstlm {
 		}
 	}
 	
-	void ContextSimilarity::create_ngram(const string_vec_t& text, ngram& num_ng, ngram& den_ng)
+	void ContextSimilarity::create_ngram(const string_vec_t& text, ngram& ng)
 	{
 		//text is a vector of strings with w in the last position and the history in the previous positions
 		//text must have at least one word
-		//if text has two word, further computation will rely on normal counts, i.e. counts(h,w,k), counts(h,w), counts(h,k), counts(k)
-		//if text has only one word, further computation will rely on lower-order counts, i.e. (w,k), counts(w), counts(k), counts()
+		//if text has two words, further computation will rely on normal counts, i.e. counts(h,w,k), counts(h,w), counts(w,k), counts(h,k), counts(k)
+		//if text has only one word, further computation will rely on lower-order counts, i.e. counts(w,k), counts(w), counts(k), counts()
 		VERBOSE(2,"void ContextSimilarity::create_ngram" << std::endl);
 		VERBOSE(2,"text.size:" << text.size() << std::endl);
-
+		
 		MY_ASSERT(text.size()>0);
 		
 		if (text.size()==1){
 			//all further computation will rely on lower-order counts
-			num_ng.pushw(text.at(text.size()-1));
+			ng.pushw(text.at(text.size()-1));
 		}else {
-			num_ng.pushw(text.at(text.size()-2));
-			num_ng.pushw(text.at(text.size()-1));
-			den_ng.pushw(text.at(text.size()-2));
+			ng.pushw(text.at(text.size()-2));
+			ng.pushw(text.at(text.size()-1));
 		}
+		VERBOSE(2,"output of create_ngram ng:|" << ng << "| ng.size:" << ng.size << std::endl);
 	}
 	
-	void ContextSimilarity::create_topic_ngram(const string_vec_t& text, const std::string& topic, ngram& num_ng, ngram& den_ng)
+	void ContextSimilarity::create_topic_ngram(const string_vec_t& text, const std::string& topic, ngram& ng)
 	{
-		//text is  a vector of string with w in the last position and the history in the previous positions
-		//text must have at least two words
-		create_ngram(text, num_ng, den_ng);
-		add_topic(topic, num_ng, den_ng);
+		//text is a vector of strings with w in the last position and the history in the previous positions
+		//text must have at least one word
+		//topic is added in the most recent position of the ngram
+		create_ngram(text, ng);
+		add_topic(topic, ng);
+		VERBOSE(2,"output of create_topic_ngram ng:|" << ng << "| ng.size:" << ng.size << std::endl);
+	}
+	
+	void ContextSimilarity::add_topic(const std::string& topic, ngram& ng)
+	{		
+		ng.pushw(topic);
 	}
 	
-	void ContextSimilarity::add_topic(const std::string& topic, ngram& num_ng, ngram& den_ng)
+	void ContextSimilarity::modify_topic(const std::string& topic, ngram& ng)
 	{		
-		num_ng.pushw(topic);
-		den_ng.pushw(topic);
+		*ng.wordp(1) = ng.dict->encode(topic.c_str());
 	}
 	
-	double ContextSimilarity::get_topic_similarity(string_vec_t text, const std::string& topic)
+	void ContextSimilarity::get_counts(ngram& ng, ngramtable& ngt, double& c_xk, double& c_x)
+	{			
+		VERBOSE(2, "double ContextSimilarity::get_counts(ngram& ng, double& c_xk, double& c_x) with ng:|" << ng << "|" << std::endl);
+		//counts taken from the tables are modified to avoid zero values for the probs
+		//a constant epsilon (smoothing) is added
+		//we also assume that c(x) = sum_k c(xk)
+		
+		//we assume that ng ends with a correct topic 
+		//we assume that ng is compliant with ngt, and has the correct size
+		
+		c_xk = m_smoothing;
+		c_x  = m_smoothing * m_topic_size;
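+		//worked example (illustrative, with m_smoothing=0.001 and m_topic_size=100):
+		//raw counts c(xk)=7 and c(x)=20 become c_xk=7.001 and c_x=20.1,
+		//so the log10 ratios computed by the callers stay finite even for unseen events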
+		
+		if (ngt.get(ng)) { c_xk += ng.freq; }
+		if (ngt.get(ng,ng.size,ng.size-1)) { c_x += ng.freq; }
+		
+		VERBOSE(3, "c_xk:" << c_xk << " c_x:" << c_x << std::endl);
+	}
+	
+	double ContextSimilarity::topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2)
 	{
-		ngram num_ng(m_hwk_ngt->getDict());
-		ngram den_ng(m_hk_ngt->getDict());
+		ngram ng(ngt.getDict());
 		
-		create_topic_ngram(text, topic, num_ng, den_ng);
+		create_topic_ngram(text, topic, ng);
 		
-		return get_topic_similarity(num_ng, den_ng);
+		return topic_score(ng, ngt, ngt2);
 	}
 	
-	double ContextSimilarity::get_topic_similarity(ngram& num_ng, ngram& den_ng)
-	{			
-		VERBOSE(2, "double ContextSimilarity::get_topic_similarity(ngram& num_ng, ngram& den_ng) with  num_ng:|" << num_ng << "| den_ng:|" << den_ng << "|" << std::endl);
+	double ContextSimilarity::topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2){
+#ifdef OPTION_1
+		return topic_score_option1(ng, ngt, ngt2);
+#elif OPTION_2
+		return topic_score_option2(ng, ngt, ngt2);
+#elif OPTION_3
+		return topic_score_option3(ng, ngt, ngt2);
+#else
+		return topic_score_option0(ng, ngt, ngt2);
+#endif
+	}
+	
+	double ContextSimilarity::topic_score_option0(ngram& ng, ngramtable& ngt, ngramtable& ngt2)
+	{
+		UNUSED(ngt);
+		UNUSED(ngt2);
+		VERBOSE(2, "double ContextSimilarity::topic_score_option0(ngram& ng, ngramtable& ngt) with ng:|" << ng << "|" << std::endl);
+		
+		//option 0: uniform (not considering log function) 
+		//P(k|hw) = 1/number_of_topics
+		double log_pr = -log(m_topic_size)/M_LN10;
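+		//note: -log(K)/M_LN10 == log10(1.0/K); e.g. with K=100 topics log_pr = -2.0 (illustrative)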
+		
+		VERBOSE(3, "option0: return: " << log_pr<< std::endl);	
+		return log_pr;
+	}
+	
+	double ContextSimilarity::topic_score_option1(ngram& ng, ngramtable& ngt, ngramtable& ngt2)
+	{
+		VERBOSE(2, "double ContextSimilarity::topic_score_option1(ngram& ng, ngramtable& ngt) with ng:|" << ng << "|" << std::endl);
+		
+		//ngt provides counts c(hwk) and c(hw) (or c(wk) and c(w))
+		double c_xk, c_x;
+		get_counts(ng, ngt, c_xk, c_x);
+		
+		//copy and transform codes
+		// shift all terms, but the topic
+		// ng2[3]=ng[4];
+		// ng2[2]=ng[3];
+		// ng2[1]=ng[1];
+		ngram ng2(ngt2.getDict());
+		ng2.trans(ng);
+		int topic=*ng.wordp(1);
+		ng2.shift();
+		*ng2.wordp(2)=topic;
+		
+		//ngt2 provides counts c(hk) and c(h) (or c(k) and c())
+		double c_xk2, c_x2;
+		get_counts(ng2, ngt2, c_xk2, c_x2);
+		
+		//option 1: (not considering log function) 
+		//P(k|hw)/sum_v P(k|hv) ~approx~ P(k|hw)/P(k|h) ~approx~ num_pr/den_pr
+		//num_pr = c'(hwk)/c'(hw)
+		//den_pr = c'(hk)/c'(h)
+		double den_log_pr = log10(c_xk2) - log10(c_x2);
+		double num_log_pr = log10(c_xk) - log10(c_x);
+		double log_pr = num_log_pr - den_log_pr;
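+		//worked example (illustrative): c'(hwk)=5, c'(hw)=50, c'(hk)=100, c'(h)=1000
+		//give num_log_pr = log10(0.1) = -1 and den_log_pr = log10(0.1) = -1, hence log_pr = 0,
+		//i.e. topic k is neither favoured nor penalized by observing w after h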
+		VERBOSE(3, "option1: num_log_pr:" << num_log_pr << " den_log_pr:" << den_log_pr << " return: " << log_pr << std::endl);
+		return log_pr;
+	}
+	
+	double ContextSimilarity::topic_score_option2(ngram& ng, ngramtable& ngt, ngramtable& ngt2)
+	{
+		UNUSED(ngt2);
+		VERBOSE(2, "double ContextSimilarity::topic_score_option2(ngram& ng, ngramtable& ngt) with ng:|" << ng << "|" << std::endl);
+		
+		//ngt provides counts c(hwk) and c(hw) (or c(wk) and c(w))
+		double c_xk, c_x;
+		get_counts(ng, ngt, c_xk, c_x);
+		
+		//option 2: (not considering log function)
+		//P(k|hw)/sum_v P(k|hv) ~approx~ P(k|hw)/P(k|h) ~approx~ c'(hwk)/c'(hw)
+		double log_pr = log10(c_xk) - log10(c_x);
+		VERBOSE(3, "option2: log_pr:" << log_pr << " return: " << log_pr << std::endl);
+		return log_pr;
+	}
+	
+	double ContextSimilarity::topic_score_option3(ngram& ng, ngramtable& ngt, ngramtable& ngt2)
+	{
+		VERBOSE(2, "double ContextSimilarity::topic_score_option3(ngram& ng, ngramtable& ngt) with ng:|" << ng << "|" << std::endl);
+		
+		//ngt provides counts c(hwk) and c(hw) (or c(wk) and c(w))
+		double c_xk, c_x;
+		get_counts(ng, ngt, c_xk, c_x);
+		
+		//copy and transform codes
+		// shift all terms, but the topic
+		// ng2[3]=ng[4];
+		// ng2[2]=ng[3];
+		// ng2[1]=ng[1];
+		ngram ng2(ngt2.getDict());
+		ng2.trans(ng);
+		int topic=*ng.wordp(1);
+		ng2.shift();
+		*ng2.wordp(2)=topic;
+		
+		//ngt2 provides counts c(hk) and c(h) (or c(k) and c())
+		double c_xk2, c_x2;
+		get_counts(ng2, ngt2, c_xk2, c_x2);
+		
+		/*
+		 //approximation 3: (not considering log function) 
+		 //P(k|hw)/sum_v P(k|hv) ~approx~ logistic_function(P(k|hw)/P(k|h))
+		 // ~approx~ logistic_function(num_pr/den_pr)
+		 // ~approx~ logistic_function(c'(hwk)/c'(hw)/c'(hk)/c'(h))
+		 // ~approx~ logistic_function((c'(hwk)*c'(h))/(c'(hw)*c'(hk)))
+		 
+		 return logistic_function((c'(hwk)*c'(h))/(c'(hw)*c'(hk)),1.0,1.0)
+		 */
+		
+		double log_pr = logistic_function((c_xk*c_x2)/(c_x*c_xk2),1.0,1.0);
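+		//note: logistic_function presumably squashes the count ratio into (0,1), so, unlike
+		//options 0-2, this value is not on a log10 scale despite the name log_pr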
+		
+		VERBOSE(3, "option3: return: " << log_pr << std::endl);
+		return log_pr;
+	}
+	
+	double ContextSimilarity::total_topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2, dictionary& dict)
+	{
+		ngram ng(ngt.getDict());
+		create_topic_ngram(text, topic, ng);
+		return total_topic_score(ng, ngt, ngt2, dict);
+	}
+	
+	double ContextSimilarity::total_topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict)
+	{		
+		double tot_pr = 0.0;
+		double v_topic_pr;
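+		//in formula (illustrative restatement): returns log10( sum_v 10^{topic_score(h,v,k)} ),
+		//i.e. the log10 mass of the topic score summed over the whole dictionary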
+		for (int v=0; v<dict.size(); ++v){
+			//replace last word, which is in position 2, keeping topic in position 1 unchanged
+			*ng.wordp(2) = ng.dict->encode(dict.decode(v));
+			v_topic_pr = topic_score(ng, ngt, ngt2);
+			tot_pr += pow(10.0,v_topic_pr); //v_topic_pr is a log10 prob
+		}
+		return log10(tot_pr);
+	}
+	
+	double ContextSimilarity::total_topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight)
+	{
+		ngram ng(ngt.getDict());
+		create_topic_ngram(text, topic, ng);
+		return total_topic_score(ng, ngt, ngt2, dict, lm, weight);
+	}
+	
+	double ContextSimilarity::total_topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight)
+	{		
+		double tot_pr = 0.0;
+		double v_pr, v_topic_pr, v_lm_pr;
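+		//in formula (illustrative restatement): returns log10( sum_v 10^{ lm.clprob(h,v) + weight * topic_score(h,v,k) } )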
+		for (int v=0; v<dict.size(); ++v){
+			//replace last word, which is in position 2, keeping topic in position 1 unchanged
+			*ng.wordp(2) = ng.dict->encode(dict.decode(v));
+			v_topic_pr = topic_score(ng, ngt, ngt2);
+			v_lm_pr = lm.clprob(ng);
+			v_pr = v_lm_pr + weight * v_topic_pr;
+			tot_pr += pow(10.0,v_pr); //v_pr is a log10 prob
+		}
+		return log10(tot_pr);
+	}
+	
+	void ContextSimilarity::modify_context_map(string_vec_t& text, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, topic_map_t& topic_weights, topic_map_t& mod_topic_weights)
+	{
+		ngram ng(ngt.getDict());
+		create_topic_ngram(text, "dummy_topic", ng); //just for initialization
 		
-		double num_log_pr, den_log_pr;
+		modify_context_map(ng, ngt, ngt2, dict, topic_weights, mod_topic_weights);
+	}
+	
+	void ContextSimilarity::modify_context_map(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, topic_map_t& topic_weights, topic_map_t& mod_topic_weights)
+	{
+		double global_score;
+		double mod_topic_pr;
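+		//net effect (illustrative restatement): mod_topic_weights[k] = topic_weights[k] / 10^{total_topic_score(k)}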
+		for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
+		{
+			modify_topic(it->first, ng);
+			global_score = total_topic_score(ng, ngt, ngt2, dict);
+			global_score = pow(10.0,global_score);
+			mod_topic_pr = it->second/global_score;
+			mod_topic_weights.insert(make_pair(it->first,mod_topic_pr));
+		}
+	}
+	
+	void ContextSimilarity::modify_context_map(string_vec_t& text, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight, topic_map_t& topic_weights, topic_map_t& mod_topic_weights)
+	{
+		ngram ng(ngt.getDict());
+		create_topic_ngram(text, "dummy_topic", ng); //just for initialization
 		
-		double c_hk=m_smoothing, c_h=m_smoothing * m_topic_size;
-		double c_hwk=m_smoothing, c_hw=m_smoothing * m_topic_size;
+		modify_context_map(ng, ngt, ngt2, dict, lm, weight, topic_weights, mod_topic_weights);
+	}
+	
+	void ContextSimilarity::modify_context_map(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight, topic_map_t& topic_weights, topic_map_t& mod_topic_weights)
+	{
+		double global_score;
+		double mod_topic_pr;
+		for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
+		{
+			modify_topic(it->first, ng);
+			global_score = total_topic_score(ng, ngt, ngt2, dict, lm, weight);
+			global_score = pow(10.0,global_score);
+			mod_topic_pr = it->second/global_score;
+			mod_topic_weights.insert(make_pair(it->first,mod_topic_pr));
+		}
+	}
+	
+	
+	double ContextSimilarity::context_similarity(string_vec_t& text, topic_map_t& topic_weights)
+	{
+#ifdef SOLUTION_1
+		return context_similarity_solution1(text, topic_weights);
+#elif SOLUTION_2
+		return context_similarity_solution2(text, topic_weights);
+#else
+		UNUSED(text);
+		UNUSED(topic_weights);
+		exit(IRSTLM_CMD_ERROR_GENERIC);
+#endif
+	}
+	
+	//return the log10 of the similarity score
+	double ContextSimilarity::context_similarity_solution1(string_vec_t& text, topic_map_t& topic_weights)
+	{
+		VERBOSE(2, "double ContextSimilarity::context_similarity_solution1(string_vec_t& text, topic_map_t& topic_weights)" << std::endl);
+		double ret_log10_pr = 0.0;
 		
-		if (den_ng.size == m_hk_order){//we rely on counts(h,k) and counts(h)
-			if (m_hk_ngt->get(den_ng)) {	c_hk += den_ng.freq; }
-			if (m_hk_ngt->get(den_ng,den_ng.size,den_ng.size-1)) { c_h += den_ng.freq; }
-		}else{//we actually rely on counts(k) and counts()
-			c_hk += m_hk_ngt->getDict()->freq(*(den_ng.wordp(1)));
-			c_h += m_k_ngt->getDict()->totfreq();
+		if (!m_active){
+			//similarity score is disabled
+			//return an uninformative score (log(1.0) = 0.0)
+			ret_log10_pr = 0.0;
+		}
+		else if (m_topic_size == 0){
+			//a-priori topic distribution is "empty", i.e. there is no score for any topic
+			//return an uninformative score (log(1.0) = 0.0)
+			ret_log10_pr = 0.0;
 		}
-		den_log_pr = log10(c_hk) - log10(c_h);
-		VERBOSE(3, "c_hk:" << c_hk << " c_h:" << c_h << std::endl);
-		
-		if (num_ng.size == m_hwk_order){ //we rely on counts(h,w,k) and counts(h,w)
-			if (reliable(num_ng, num_ng.size, m_hwk_ngt)){
-				if (m_hwk_ngt->get(num_ng)) {	c_hwk += num_ng.freq; }
-				if (m_hwk_ngt->get(num_ng,num_ng.size,num_ng.size-1)) { c_hw += num_ng.freq; }
-			}else{
-				c_hwk=1;
-				c_hk=m_topic_size;
+		else{
+			VERBOSE(3, "topic_weights.size():" << topic_weights.size() << std::endl);
+				
+			ngramtable* current_ngt;
+			ngramtable* current_ngt2;
+			
+			if (text.size()==1){
+				current_ngt = m_wk_ngt;
+				current_ngt2 = m_k_ngt;
+			}
+			else{
+				current_ngt = m_hwk_ngt;
+				current_ngt2 = m_hk_ngt;
 			}
-		}else{ //hence num_ng.size=m_hwk_order-1, we actually rely on counts(h,k) and counts(h)
-			if (reliable(num_ng, num_ng.size, m_hk_ngt)){
-				if (m_hk_ngt->get(num_ng)) {	c_hwk += num_ng.freq; }
-				if (m_hk_ngt->get(num_ng,num_ng.size,num_ng.size-1)) { c_hw += num_ng.freq; }
-			}else{
-				c_hwk=1;
-				c_hk=m_topic_size;
+			
+			ngram ng(current_ngt->getDict());
+			create_topic_ngram(text, "dummy_topic", ng); //just for initialization
+			
+			
+			if (reliable(ng, current_ngt)){
+				//this word sequence is reliable
+				
+				double ret_pr = 0.0;
+				for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
+				{
+					ngram current_ng = ng;
+					modify_topic(it->first, current_ng);
+					
+					double apriori_topic_score = it->second; //prob
+					double current_topic_score = exp(topic_score(current_ng, *current_ngt, *current_ngt2) * M_LN10); //topic_score(...) returns a log10 prob; hence exp is applied to (score * M_LN10)
+					
+					VERBOSE(3, "current_ng:|" << current_ng << "| topic:|" << it->first  << "| apriori_topic_score:" << apriori_topic_score << " topic_score:" << current_topic_score << " score_toadd:" << ret_pr << std::endl);
+					ret_pr += apriori_topic_score * current_topic_score;
+					VERBOSE(3, "CURRENT ret_pr:" << ret_pr << std::endl);
+				}
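+				//i.e. ret_pr = sum_k P_apriori(k) * 10^{topic_score(h,w,k)}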
+				ret_log10_pr = log10(ret_pr);
 			}
+			else{
+				//this word sequence is not reliable enough, because its occurrences are too few
+				//return an uninformative score (log(1.0) = 0.0)
+				ret_log10_pr = 0.0;
+				VERBOSE(3, "CURRENT ret_pr:" << 1.0 << std::endl);
+			}
+			
 		}
-		VERBOSE(3, "c_hwk:" << c_hwk << " c_hw:" << c_hw << std::endl);
-		num_log_pr = log10(c_hwk) - log10(c_hw);
-		
-		VERBOSE(3, "num_log_pr:" << num_log_pr << " den_log_pr:" << den_log_pr << std::endl);
-		return num_log_pr - den_log_pr;
+		VERBOSE(2, "ret_log10_pr:" << ret_log10_pr << std::endl);
+		return ret_log10_pr;
 	}
 	
-	bool ContextSimilarity::reliable(ngram& ng, int size, ngramtable* ngt)
+	//return the log10 of the similarity score
+	double ContextSimilarity::context_similarity_solution2(string_vec_t& text, topic_map_t& topic_weights)
 	{
-		VERBOSE(2, "ContextSimilarity::reliable(ngram& ng, int size, ngramtable* ngt) ng:|" << ng << "| thr:" << m_threshold_on_h << "| ng.size:" << ng.size << " size:" << size << std::endl);	
+		return context_similarity_solution1(text, topic_weights);
+	}
+	
+	bool ContextSimilarity::reliable(ngram& ng, ngramtable* ngt)
+	{
+		VERBOSE(2, "ContextSimilarity::reliable(ngram& ng, ngramtable* ngt) ng:|" << ng << "| ng.size:" << ng.size<< "| thr:" << m_threshold_on_h << std::endl);	
 		
 		bool ret=false;
 		
-		if (ng.size < size){
-			//num_ng has size lower than expected (2)
-			//in this case we will rely on counts(h, topic) instead of counts(h, w, topic)
-			VERBOSE(3, "ng:|" << ng << "| has size (" << ng.size<< " ) lower than expected (" << size << ")" << std::endl);
-			ret=true;
-		}
-		
-		if (ngt->get(ng,size,size-1) && (ng.freq > m_threshold_on_h)){
+		if (ngt->get(ng,ng.size,ng.size-1) && (ng.freq > m_threshold_on_h)){
 			ret=true;
 		}else{
 			ret=false;
@@ -290,29 +534,42 @@ namespace irstlm {
 		return ret;
 	}
 	
-	bool ContextSimilarity::base_reliable(ngram& ng, int size, ngramtable* ngt)
+	
+	//returns the scores for all topics in the topic models (without apriori topic prob)
+	void ContextSimilarity::get_topic_scores(string_vec_t& text, topic_map_t& topic_map)
 	{
-		VERBOSE(2, "ContextSimilarity::base_reliable(ngram& ng, int size, ngramtable* ngt) ng:|" << ng << "| thr:" << m_threshold_on_h << "|" << std::endl);
-
-		bool ret=false;
-		
-		if (ng.size < size){
-			//den_ng has size lower than expected (1)
-			//in this case we will rely on counts(topic) instead of counts(h, topic)
-			VERBOSE(3, "ng:|" << ng << "| has size (" << ng.size<< " ) lower than expected (" << size << ")" << std::endl);
-			ret=true;
+		if (m_active){ //similarity score is enabled
+			ngramtable* current_ngt;
+			ngramtable* current_ngt2;
+			
+			if (text.size()==1){
+				current_ngt = m_wk_ngt;
+				current_ngt2 = m_k_ngt;
+			}
+			else{
+				current_ngt = m_hwk_ngt;
+				current_ngt2 = m_hk_ngt;
+			}
+			
+			ngram ng(current_ngt->getDict());
+			create_topic_ngram(text, "dummy_topic", ng); //just for initialization
+			
+			get_topic_scores(ng, *current_ngt, *current_ngt2, topic_map);
 		}
-		else{
-			ng.pushc(0);
-			if (ngt->get(ng,size,size-1) && (ng.freq > m_threshold_on_h)){
-				ret=true;
-			}else{
-				ret=false;
+	}	
+	
+	
+	//returns the scores for all topics in the topic models (without apriori topic prob)
+	void ContextSimilarity::get_topic_scores(ngram& ng, ngramtable& ngt, ngramtable& ngt2, topic_map_t& topic_map)
+	{		
+		if (m_active){ //similarity score is enabled
+			for (int i=0; i<m_k_ngt->getDict()->size();++i)
+			{
+				std::string _topic = m_k_ngt->getDict()->decode(i);
+				modify_topic(_topic, ng);
+				topic_map[_topic] = pow(10.0,topic_score(ng, ngt, ngt2));
 			}
-			ng.shift();
 		}
-		VERBOSE(3, "ng:|" << ng << "| thr:" << m_threshold_on_h << " reliable:" << ret << std::endl);
-		return ret;
-	}
+	}	
 	
 }//namespace irstlm
diff --git a/src/context-similarity.h b/src/context-similarity.h
index d26ec84..8029b0f 100644
--- a/src/context-similarity.h
+++ b/src/context-similarity.h
@@ -28,6 +28,7 @@
 #include <stdlib.h>
 #include <string>
 #include <math.h>
+#include "cmd.h"
 #include "util.h"
 #include "dictionary.h"
 #include "n_gram.h"
@@ -43,15 +44,16 @@ namespace irstlm {
 	
 	typedef std::map< std::string, float > topic_map_t;
 	
-	
 	class ContextSimilarity
 	{
 	private:
 		ngramtable* m_hwk_ngt; // counts(h, w, topic)
 		ngramtable* m_hk_ngt; // counts(h, topic)
+		ngramtable* m_wk_ngt; // counts(w, topic)
 		ngramtable* m_k_ngt; // counts(topic)
 		int m_k_order; //order of m_k_ngt
 		int m_hk_order; //order of m_hk_ngt
+		int m_wk_order; //order of m_wk_ngt
 		int m_hwk_order; //order of m_hwk_ngt
 	
 		int m_topic_size; //number of topics in the model
@@ -64,26 +66,54 @@ namespace irstlm {
 		// if disabled, context_similarity is 0.0 and topic_scores distribution is empty
 		bool m_active;
 		
-		void create_ngram(const string_vec_t& text, ngram& num_ng, ngram& den_ng);
-		void add_topic(const std::string& topic, ngram& num_ng, ngram& den_ng);
-		void create_topic_ngram(const string_vec_t& text, const std::string& topic, ngram& num_ng, ngram& den_ng);
+		void create_ngram(const string_vec_t& text, ngram& ng);
+		void add_topic(const std::string& topic, ngram& ng);
+		void modify_topic(const std::string& topic, ngram& ng);
+		void create_topic_ngram(const string_vec_t& text, const std::string& topic, ngram& ng);
+	
+		void get_counts(ngram& ng, ngramtable& ngt, double& c_xk, double& c_x);
+		
+		double topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2);
+		double topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2);
+		double topic_score_option0(ngram& ng, ngramtable& ngt, ngramtable& ngt2);
+		double topic_score_option1(ngram& ng, ngramtable& ngt, ngramtable& ngt2);
+		double topic_score_option2(ngram& ng, ngramtable& ngt, ngramtable& ngt2);
+		double topic_score_option3(ngram& ng, ngramtable& ngt, ngramtable& ngt2);
+		
+		double total_topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2, dictionary& dict);
+		double total_topic_score(string_vec_t text, const std::string& topic, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight);
+		double total_topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict);
+		double total_topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight);
+		
 		
-		double get_topic_similarity(string_vec_t text, const std::string& topic);
-		double get_topic_similarity(ngram& num_ng, ngram& den_ng);
 		
-		bool reliable(ngram& ng, int size, ngramtable* ngt);
-		bool base_reliable(ngram& ng, int size, ngramtable* ngt);
+		void modify_context_map(string_vec_t& text, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, topic_map_t& topic_weights, topic_map_t& mod_topic_weights);
+		void modify_context_map(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, topic_map_t& topic_weights, topic_map_t& mod_topic_weights);
+		void modify_context_map(string_vec_t& text, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight, topic_map_t& topic_weights, topic_map_t& mod_topic_weights);
+		void modify_context_map(ngram& ng, ngramtable& ngt, ngramtable& ngt2, dictionary& dict, lmContainer& lm, double weight, topic_map_t& topic_weights, topic_map_t& mod_topic_weights);
+		
+		double context_similarity_solution1(string_vec_t& text, topic_map_t& topic_weights);
+		double context_similarity_solution2(string_vec_t& text, topic_map_t& topic_weights);
+		
+		bool reliable(ngram& ng, ngramtable* ngt);
 		
 	public:
 		ContextSimilarity(const std::string &dictfile, const std::string &num_modelfile, const std::string &den_modelfile);
 		~ContextSimilarity();
 		
 		void setContextMap(topic_map_t& topic_map, const std::string& context);
-		void get_topic_scores(topic_map_t& map, string_vec_t& text);
+		
+		void get_topic_scores(string_vec_t& text, topic_map_t& topic_map);
+		void get_topic_scores(ngram& ng, ngramtable& ngt, ngramtable& ngt2, topic_map_t& topic_map);
+		
 		void add_topic_scores(topic_map_t& map, topic_map_t& tmp_map);
 		void print_topic_scores(topic_map_t& map);
+		void print_topic_scores(topic_map_t& map, topic_map_t& refmap, double len);
+		double DeltaCrossEntropy(topic_map_t& topic_map, topic_map_t& tmp_map, double len);
+
+		void normalize_topic_scores(topic_map_t& map);
 		
-		double get_context_similarity(string_vec_t& text, topic_map_t& topic_weights);
+		double context_similarity(string_vec_t& text, topic_map_t& topic_weights);
 		
 		int get_Threshold_on_H(){
 			return  m_threshold_on_h;
diff --git a/src/lmContextDependent.cpp b/src/lmContextDependent.cpp
index 83d23a0..54b7bcf 100644
--- a/src/lmContextDependent.cpp
+++ b/src/lmContextDependent.cpp
@@ -130,6 +130,8 @@ namespace irstlm {
 
 	void lmContextDependent::GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line)
 	{
+		VERBOSE(2,"lmContextDependent::GetSentenceAndContext" << std::endl);
+		VERBOSE(2,"line:|" << line << "|" << std::endl);
 		size_t pos = line.find(context_delimiter);	
 		if (pos != std::string::npos){ // context_delimiter is found
 			sentence = line.substr(0, pos);
@@ -141,9 +143,8 @@ namespace irstlm {
 			sentence = line;
 			context = "";
 		}	
-		VERBOSE(1,"line:|" << line << "|" << std::endl);
-		VERBOSE(1,"sentence:|" << sentence << "|" << std::endl);	
-		VERBOSE(1,"context:|" << context << "|" << std::endl);	
+		VERBOSE(2,"sentence:|" << sentence << "|" << std::endl);	
+		VERBOSE(2,"context:|" << context << "|" << std::endl);	
 	}
 
 	double lmContextDependent::lprob(ngram ng, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
@@ -154,32 +155,67 @@ namespace irstlm {
 			text.push_back(ng.dict->decode(*ng.wordp(2)));
 		}
 		text.push_back(ng.dict->decode(*ng.wordp(1)));
-		double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
-		double similarity_score = m_similaritymodel->get_context_similarity(text, topic_weights);
-		double ret_logprob = lm_logprob + m_similaritymodel_weight * similarity_score;
-		VERBOSE(3, "lm_log10_pr:" << lm_logprob << " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_log10_pr:" << ret_logprob << std::endl);
 		
-		return ret_logprob;
+		return lprob(ng, text, topic_weights, bow, bol, maxsuffptr, statesize, extendible);
 	}
 	
 	double lmContextDependent::lprob(string_vec_t& text, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
 	{
 		VERBOSE(2,"lmContextDependent::lprob(string_vec_t& text, topic_map_t& topic_weights, ...)" << std::endl);
-
+		
 		//create the actual ngram
 		ngram ng(dict);
 		ng.pushw(text);
 		VERBOSE(3,"ng:|" << ng << "|" << std::endl);		
 		
 		MY_ASSERT (ng.size == (int) text.size());
+		return lprob(ng, text, topic_weights, bow, bol, maxsuffptr, statesize, extendible);
+	}
+	
+	double lmContextDependent::lprob(ngram& ng, string_vec_t& text, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
+	{
+		VERBOSE(2,"lmContextDependent::lprob(ngram& ng, topic_map_t& topic_weights, ...)" << std::endl);
 		double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
-		double similarity_score = m_similaritymodel->get_context_similarity(text, topic_weights);
+//		double similarity_score = 1.0;
+		double similarity_score = m_similaritymodel->context_similarity(text, topic_weights);
 		double ret_logprob = lm_logprob + m_similaritymodel_weight * similarity_score;
 		VERBOSE(3, "lm_log10_pr:" << lm_logprob << " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_log10_pr:" << ret_logprob << std::endl);
 		
 		return ret_logprob;
 	}
 	
+	
+	double lmContextDependent::total_clprob(string_vec_t& text, topic_map_t& topic_weights)
+	{		
+		VERBOSE(2,"lmContextDependent::total_lprob(string_vec_t& text, topic_map_t& topic_weights)" << std::endl);
+		double tot_pr = 0.0;
+		double v_pr;
+		for (int v=0; v<dict->size(); ++v){
+			//replace the last word of the text
+			text.at(text.size()-1) = dict->decode(v);
+			v_pr = clprob(text, topic_weights);
+			tot_pr += pow(10.0,v_pr); //v_pr is a log10 prob
+		}
+		return log10(tot_pr);
+	}
+	
+	double lmContextDependent::total_clprob(ngram& ng, topic_map_t& topic_weights)
+	{		
+		VERBOSE(2,"lmContextDependent::total_lprob(ngram& ng, topic_map_t& topic_weights)" << std::endl);
+		double tot_pr = 0.0;
+		double v_pr;
+		double oovpenalty = getlogOOVpenalty();
+		setlogOOVpenalty((double) 0);
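+		//the OOV penalty is temporarily zeroed so that each dictionary word contributes its
+		//plain probability to the normalization sum; it is restored after the loop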
+		for (int v=0; v<dict->size(); ++v){
+			//replace the last word, which is in position 1
+			*ng.wordp(1) = ng.dict->encode(dict->decode(v));
+			v_pr = clprob(ng, topic_weights);
+			tot_pr += pow(10.0,v_pr); //v_pr is a log10 prob
+		}
+		setlogOOVpenalty(oovpenalty);
+		return log10(tot_pr);
+	}
+	
 	double lmContextDependent::lprob(int* codes, int sz, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
 	{
 		VERBOSE(3,"lmContextDependent::lprob(int* codes, int sz, topic_map_t& topic_weights, " << std::endl);
diff --git a/src/lmContextDependent.h b/src/lmContextDependent.h
index f6b6e85..58c5e1e 100644
--- a/src/lmContextDependent.h
+++ b/src/lmContextDependent.h
@@ -77,6 +77,11 @@ namespace irstlm {
 		lmContainer* m_lm;
 		bool m_isinverted;
 		
+		
+		//flag for enabling/disabling normalization of the language model
+		//if disabled, scores returned by the language model do not sum to 1.0
+		bool m_normalization;
+		
 		ContextSimilarity* m_similaritymodel;   //to remove when TopicModel is ready
 		double m_lm_weight;
 		
@@ -149,6 +154,10 @@ namespace irstlm {
 		virtual double lprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
 		virtual double lprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
 		
+		double lprob(ngram& ng, string_vec_t& text, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible);
+		double total_clprob(string_vec_t& text, topic_map_t& topic_weights);
+		double total_clprob(ngram& ng, topic_map_t& topic_weights);		
+		
 		int maxlevel() const {
 			return maxlev;
 		};
@@ -158,7 +167,6 @@ namespace irstlm {
 			dict=d;
 		};
 		
-		
 		virtual inline lmContainer* getWordLM() const {
 			return m_lm;
 		};
@@ -204,6 +212,14 @@ namespace irstlm {
 		inline void set_Active(bool value){
 			m_similaritymodel->set_Active(value);
 		}
+		
+		bool is_Normalized(){
+			return  m_normalization;
+		}
+		void set_Normalized(bool val){
+			m_normalization = val;
+		}
+		
 	};
 }//namespace irstlm
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git


