[irstlm] 32/126: added computation of sentence-based topic score distribution; added supporting functions; code cleanup;

Giulio Paci giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:42 UTC 2016


This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.

commit 836988edc745c26de4c80c4c57d352fafcfa3546
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date:   Mon Jul 27 15:56:56 2015 +0200

    added computation of sentence-based topic score distribution; added supporting functions; code cleanup;
---
 src/context-dependent-evaluation.cpp | 174 +++++++++++++++++++++++++----------
 src/context-similarity.cpp           | 137 ++++++++++++++++++---------
 src/context-similarity.h             |  18 ++--
 src/lmContextDependent.cpp           |  42 +++++++--
 src/lmContextDependent.h             |  20 +++-
 5 files changed, 279 insertions(+), 112 deletions(-)

diff --git a/src/context-dependent-evaluation.cpp b/src/context-dependent-evaluation.cpp
index 48a138f..5ac7cd0 100644
--- a/src/context-dependent-evaluation.cpp
+++ b/src/context-dependent-evaluation.cpp
@@ -30,6 +30,7 @@
 #include "util.h"
 #include "math.h"
 #include "lmContainer.h"
+#include "lmContextDependent.h"
 
 using namespace std;
 using namespace irstlm;
@@ -64,6 +65,7 @@ int main(int argc, char **argv)
 	
 	bool sent_PP_flag = false;
 	bool contextbasedscore = false;
+	bool topicscore = false;
 	
 	int debug = 0;
   int requiredMaxlev = 1000;
@@ -80,7 +82,7 @@ int main(int argc, char **argv)
                 "randcalls", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file",
 								"r", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file",
                 "contextbasedscore", CMDBOOLTYPE|CMDMSG, &contextbasedscore, "computes context-dependent probabilities and pseudo-perplexity of the text from standard input",
-                "cbs", CMDBOOLTYPE|CMDMSG, &contextbasedscore, "computes context-dependent probabilities and pseudo-perplexity of the text from standard input",
+                "topicscore", CMDBOOLTYPE|CMDMSG, &topicscore, "computes the topic scores of the text from standard input",
 								"debug", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
 								"d", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
                 "level", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken",
@@ -121,6 +123,7 @@ int main(int argc, char **argv)
   if (lmfile!=NULL) std::cerr << "lmfile: " << lmfile << std::endl;
   if (testfile!=NULL) std::cerr << "testfile: " << testfile << std::endl;
   if (contextbasedscore==true) std::cerr << "contextbasedscore: " << contextbasedscore << std::endl;
+  if (topicscore==true) std::cerr << "topicscore: " << topicscore << std::endl;
   std::cerr << "loading up to the LM level " << requiredMaxlev << " (if any)" << std::endl;
   std::cerr << "dub: " << dub<< std::endl;
 	
@@ -139,6 +142,105 @@ int main(int argc, char **argv)
   //use caches to save time (only if PS_CACHE_ENABLE is defined through compilation flags)
   lmt->init_caches(lmt->maxlevel());
 	
+	if (topicscore == true) {
+		
+		if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) {
+			debug = (debug>4)?4:debug;
+			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+		}
+		if (lmt->getLanguageModelType() == _IRSTLM_LMMACRO) {
+			debug = (debug>4)?4:debug;
+			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+		}
+		if (lmt->getLanguageModelType() == _IRSTLM_LMCLASS) {
+			debug = (debug>4)?4:debug;
+			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+		}
+		
+		std::cerr << "Start Topic Score generation " << std::endl;
+		std::cerr << "OOV code: " << lmt->getDict()->oovcode() << std::endl;
+		std::cout.setf(ios::fixed);
+		std::cout.precision(2);
+		
+		std::fstream inptxt(testfile,std::ios::in);
+		
+		// loop over input lines
+		char line[MAX_LINE];
+		while (inptxt.getline(line,MAX_LINE)) {
+			
+			std::string line_str = line;
+			
+			VERBOSE(0,"line_str:|" << line_str << "|" << std::endl);	
+			
+			//getting sentence string;
+			std::string sentence;
+			std::string context;
+			
+			
+			((lmContextDependent*) lmt)->GetSentenceAndContext(sentence,context,line_str);
+			VERBOSE(0,"sentence:|" << sentence << "|" << std::endl);	
+			VERBOSE(0,"context:|" << context << "|" << std::endl);	
+			VERBOSE(0,"line_str:|" << line_str << "|" << std::endl);
+				
+			//getting apriori topic weights
+			topic_map_t apriori_topic_map;
+			((lmContextDependent*) lmt)->getContextSimilarity()->setContextMap(apriori_topic_map,context);
+			
+			if(1){
+				// computation using std::string
+				// loop over ngrams of the sentence
+				string_vec_t word_vec;
+				split(sentence, ' ', word_vec);
+				
+				//first points to the last recent term to take into account
+				//last points to the position after the most recent term to take into account
+				//last could point outside the vector of string; do NOT use word_vec.at(last)
+				size_t last, first;
+				size_t size=0;
+				size_t order = lmt->maxlevel();
+				
+				
+				
+				topic_map_t sentence_topic_map;
+				VERBOSE(0,"word_vec.size():|" << word_vec.size() << "|" << std::endl);	
+				for (size_t i=0; i<word_vec.size(); ++i){
+					++size;
+					size=(size<order)?size:order;
+					last=i+1;
+					// reset ngram at begin of sentence
+					if (word_vec.at(i) == lmt->getDict()->BoS()) {
+						size=0;
+						continue;
+					}
+					first = last - size;
+					
+					VERBOSE(0,"topic scores for first:|" << first << "| last:|" << last << "| size:" << size << std::endl);
+					string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
+					
+					if (size>=1) {
+						VERBOSE(0,"computing topic_scores for first:|" << first << "| and last:|" << last << "|" << std::endl);	
+						
+						topic_map_t tmp_topic_map;
+						((lmContextDependent*) lmt)->getContextSimilarity()->get_topic_scores(tmp_topic_map, tmp_word_vec);
+						
+						std::cout << "first:" << first << " last:" << last << ((lmContextDependent*) lmt)->getContextDelimiter();
+						((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map);
+						
+						((lmContextDependent*) lmt)->getContextSimilarity()->add_topic_scores(sentence_topic_map, tmp_topic_map);
+						tmp_topic_map.clear();
+					}
+				}
+				std::cout << sentence << ((lmContextDependent*) lmt)->getContextDelimiter();
+				((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map);
+			}
+			
+			apriori_topic_map.clear();
+		}
+		
+		
+		delete lmt;
+		return 0;
+	}
   if (contextbasedscore == true) {
 		
 		if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) {
@@ -153,7 +255,7 @@ int main(int argc, char **argv)
 			debug = (debug>4)?4:debug;
 			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
 		}
-		std::cerr << "Start Eval" << std::endl;
+		std::cerr << "Start ContextBased Evaluation" << std::endl;
 		std::cerr << "OOV code: " << lmt->getDict()->oovcode() << std::endl;
 		std::cout.setf(ios::fixed);
 		std::cout.precision(2);
@@ -168,16 +270,7 @@ int main(int argc, char **argv)
 		
 		// variables for storing sentence-based Perplexity
 		int sent_Nbo=0, sent_Nw=0,sent_Noov=0;
-		double sent_logPr=0,sent_PP=0,sent_PPwp=0;
-		
-		
-//		ngram ng(lmt->getDict());
-		
-		const std::string context_delimiter="___CONTEXT___";
-		const char topic_map_delimiter='=';
-		
-		string_vec_t topic_weight_vec;
-		string_vec_t topic_weight;
+		double sent_logPr=0,sent_PP=0,sent_PPwp=0;		
 		
 		std::fstream inptxt(testfile,std::ios::in);
 		
@@ -193,65 +286,48 @@ int main(int argc, char **argv)
 			std::string sentence;
 			std::string context;
 			
-			size_t pos = line_str.find(context_delimiter);	
-			if (pos != std::string::npos){ // context_delimiter is found
-				sentence = line_str.substr(0, pos);
-				std::cout << sentence << std::endl;
-				line_str.erase(0, pos + context_delimiter.length());
-				VERBOSE(0,"pos:|" << pos << "|" << std::endl);	
-				VERBOSE(0,"sentence:|" << sentence << "|" << std::endl);	
-				VERBOSE(0,"line_str:|" << line_str << "|" << std::endl);	
-				
-				//getting context string;
-				context = line_str;
-			}else{
-				sentence = line_str;
-				context = "";
-			}	
-			
+			((lmContextDependent*) lmt)->GetSentenceAndContext(sentence,context,line_str);
+			VERBOSE(0,"sentence:|" << sentence << "|" << std::endl);	
 			VERBOSE(0,"context:|" << context << "|" << std::endl);	
-			VERBOSE(0,"line_str:|" << line_str << "|" << std::endl);	
-			//getting topic weights
-			topic_map_t topic_weight_map;
+			VERBOSE(0,"line_str:|" << line_str << "|" << std::endl);
 			
-			split(context, ' ', topic_weight_vec);
-			for (string_vec_t::iterator it=topic_weight_vec.begin(); it!=topic_weight_vec.end(); ++it){
-				split(*it, topic_map_delimiter, topic_weight);
-				topic_weight_map[topic_weight.at(0)] = strtod (topic_weight.at(1).c_str(), NULL);
-				topic_weight.clear();
-			}
-			topic_weight_vec.clear();
+			//getting apriori topic weights
+			topic_map_t apriori_topic_map;
+			((lmContextDependent*) lmt)->getContextSimilarity()->setContextMap(apriori_topic_map,context); 
 			
 			
 			if(1){
 				// computation using std::string
 				// loop over ngrams of the sentence
-				string_vec_t w_vec;
-				split(sentence, ' ', w_vec);
+				string_vec_t word_vec;
+				split(sentence, ' ', word_vec);
 				
+				//first points to the last recent term to take into account
+				//last points to the position after the most recent term to take into account
+				//last could point outside the vector of string; do NOT use word_vec.at(last)
 				size_t last, first;
 				size_t size=0;
 				size_t order = lmt->maxlevel();
 				
-				VERBOSE(0,"w_vec.size():|" << w_vec.size() << "|" << std::endl);	
-				for (size_t i=0; i<w_vec.size(); ++i){
+				VERBOSE(0,"word_vec.size():|" << word_vec.size() << "|" << std::endl);	
+				for (size_t i=0; i<word_vec.size(); ++i){
 					++size;
 					size=(size<order)?size:order;
 					last=i+1;
 					// reset ngram at begin of sentence
-					if (w_vec.at(i) == lmt->getDict()->BoS()) {
+					if (word_vec.at(i) == lmt->getDict()->BoS()) {
 						size=0;
 						continue;
 					}
 					first = last - size;
 					
-					VERBOSE(0,"prob for first:|" << first << "| and last:|" << last << "| size:" << size << std::endl);
-					string_vec_t tmp_w_vec(w_vec.begin() + first, w_vec.begin() +last);
+					VERBOSE(0,"prob for first:|" << first << "| last:|" << last << "| size:" << size << std::endl);
+					string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
 					
 					if (size>=1) {
-						VERBOSE(0,"computing prob for first:|" << first << "| and last:|" << last << "| is Pr=" << Pr << std::endl);	
-						Pr=lmt->clprob(tmp_w_vec, topic_weight_map, &bow, &bol, &msp, &statesize);
-						VERBOSE(0,"prob for first:|" << first << "| and last:|" << last << "| is Pr=" << Pr << std::endl);	
+						VERBOSE(0,"computing prob for first:|" << first << "| and last:|" << last << "|" << std::endl);	
+						Pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
+						VERBOSE(0," --> prob for first:|" << first << "| and last:|" << last << "| is Pr=" << Pr << std::endl);	
 						logPr+=Pr;
 						sent_logPr+=Pr;
 						VERBOSE(0,"sent_logPr:|" << sent_logPr << " logPr:|" << logPr << std::endl);	
@@ -268,7 +344,7 @@ int main(int argc, char **argv)
 				lmt->check_caches_levels();
 			}
 			
-			topic_weight_map.clear();
+			apriori_topic_map.clear();
 		}
 		
 		
diff --git a/src/context-similarity.cpp b/src/context-similarity.cpp
index ddfaf21..cc86337 100644
--- a/src/context-similarity.cpp
+++ b/src/context-similarity.cpp
@@ -40,13 +40,16 @@ inline void error(const char* message)
 }
 
 namespace irstlm {
-	ContextSimilarity::ContextSimilarity(const std::string &dictfile, const std::string &modelfile)
+	ContextSimilarity::ContextSimilarity(const std::string &dictfile, const std::string &num_modelfile, const std::string &den_modelfile)
 	{
-		m_lm=lmContainer::CreateLanguageModel(modelfile);
+		m_num_lm=lmContainer::CreateLanguageModel(num_modelfile);
+		m_den_lm=lmContainer::CreateLanguageModel(den_modelfile);
 		
-		m_lm->load(modelfile);
+		m_num_lm->load(num_modelfile);
+		m_den_lm->load(den_modelfile);
 		
-		m_lm->getDict()->genoovcode();
+		m_num_lm->getDict()->genoovcode();
+		m_den_lm->getDict()->genoovcode();
 		
 		//loading form file		
 		std::string str;
@@ -74,46 +77,46 @@ namespace irstlm {
 	double ContextSimilarity::score(string_vec_t& text, topic_map_t& topic_weights)
 	{
 		VERBOSE(4, "double ContextSimilarity::score(string_vec_t& text, topic_map_t& topic_weights)" << std::endl);
+		double ret_logprob = SIMILARITY_LOWER_BOUND;
+		
 		if (topic_weights.size() == 0){
 			//a-priori topic distribution is "empty", i.e. there is no score for any topic
 			//return a "constant" lower-bound score,  SIMILARITY_LOWER_BOUND = log(0.0)
-			return SIMILARITY_LOWER_BOUND;
-		}
-		
-		ngram base_num_ng(m_lm->getDict());
-		ngram base_den_ng(m_lm->getDict());
-		create_ngram(text, base_num_ng, base_den_ng);
-		
-		double ret_logprob = 0.0;
-		double add_logprob;
-		topic_map_t::iterator it = topic_weights.begin();
-		do
-		{
-			ngram num_ng = base_num_ng;
-			ngram den_ng = base_den_ng;
-			add_topic(it->first, num_ng, den_ng);
+			ret_logprob = SIMILARITY_LOWER_BOUND;
+		}else{
+			
+			ngram base_num_ng(m_num_lm->getDict());
+			ngram base_den_ng(m_den_lm->getDict());
+			create_ngram(text, base_num_ng, base_den_ng);
 			
-			VERBOSE(0, "topic:|" << it->first << " log(p(topic):" << log(it->second) << std::endl);
-			double topic_score = get_topic_similarity(num_ng, den_ng);
-			add_logprob = log(it->second) + topic_score;
-			VERBOSE(0, "topic_score:" << topic_score << std::endl);
-			VERBOSE(0, "add_logprob:" << add_logprob << std::endl);
-			ret_logprob = logsum(ret_logprob, add_logprob);
-			++it;
-		}while (it!= topic_weights.end());
+			for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
+			{
+				ngram num_ng = base_num_ng;
+				ngram den_ng = base_den_ng;
+				add_topic(it->first, num_ng, den_ng);
+				double apriori_topic_score = log(it->second);
+				double topic_score = get_topic_similarity(num_ng, den_ng);
+				
+				VERBOSE(3, "topic:|" << it->first  << "apriori_topic_score:" << apriori_topic_score << " topic_score:" << topic_score << std::endl);
+				if (it == topic_weights.begin()){
+					ret_logprob = apriori_topic_score + topic_score;
+				}else{
+					ret_logprob = logsum(ret_logprob, apriori_topic_score + topic_score);
+				}
+				VERBOSE(4, "CURRENT ret_logprob:" << ret_logprob << std::endl);
+			}
+		}
 		
 		
-		VERBOSE(0, "ret_logprob:" << ret_logprob << std::endl);
+		VERBOSE(3, "ret_logprob:" << ret_logprob << std::endl);
 		return ret_logprob;
 	}
 	
-	
-	topic_map_t ContextSimilarity::get_topic_scores(string_vec_t& text)
-	{
-		topic_map_t topic_map;
-		
-		ngram base_num_ng(m_lm->getDict());
-		ngram base_den_ng(m_lm->getDict());
+	//returns the scores for all topics in the topic models (without apriori topic prob)
+	void ContextSimilarity::get_topic_scores(topic_map_t& topic_map, string_vec_t& text)
+	{		
+		ngram base_num_ng(m_num_lm->getDict());
+		ngram base_den_ng(m_den_lm->getDict());
 		create_ngram(text, base_num_ng, base_den_ng);
 		
 		for (topic_dict_t::iterator it=m_lm_topic_dict.begin(); it != m_lm_topic_dict.end(); ++it)
@@ -123,12 +126,51 @@ namespace irstlm {
 			add_topic(*it, num_ng, den_ng);
 			topic_map[*it] = get_topic_similarity(num_ng, den_ng);
 		}
-		return topic_map;
+	}
+	
+	
+	void ContextSimilarity::add_topic_scores(topic_map_t& topic_map, topic_map_t& tmp_map)
+	{
+		for (topic_map_t::iterator it=tmp_map.begin(); it != tmp_map.end(); ++it){
+			topic_map[it->first] += tmp_map[it->first];
+		}
+	}
+	
+	//returns the scores for all topics in the topic models (without apriori topic prob)
+	void ContextSimilarity::print_topic_scores(topic_map_t& map)
+	{
+		for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it)
+		{
+			if (it!=map.begin()) { std::cout << topic_map_delimiter1; }
+			std::cout << it->first << topic_map_delimiter2 << it->second;
+		}
+		std::cout << std::endl;
+	}
+	
+	void ContextSimilarity::setContextMap(topic_map_t& topic_map, const std::string& context){
+		
+		VERBOSE(0,"context:|" << context << "|" << std::endl);
+		
+		string_vec_t topic_weight_vec;
+		string_vec_t topic_weight;
+		
+		// context is supposed in this format
+		// topic-name1,topic-value1:topic-name2,topic-value2:...:topic-nameN,topic-valueN
+		
+		//first-level split the context in a vector of 	topic-name1,topic-value1, using the first separator ':'
+		split(context, topic_map_delimiter1, topic_weight_vec);
+		for (string_vec_t::iterator it=topic_weight_vec.begin(); it!=topic_weight_vec.end(); ++it){
+			//second-level split each entry into topic-name1 and topic-value1, using the second separator ','
+			split(*it, topic_map_delimiter2, topic_weight);
+			topic_map[topic_weight.at(0)] = strtod (topic_weight.at(1).c_str(), NULL);
+			topic_weight.clear();
+		}
+		VERBOSE(0,"found " << topic_map.size() << " entries in the context" << std::endl);
 	}
 	
 	void ContextSimilarity::create_ngram(const string_vec_t& text, ngram& num_ng, ngram& den_ng)
 	{
-		//text is  a vector of string with w in the last position and the history in the previous positions
+		//text is a vector of strings with w in the last position and the history in the previous positions
 		//text must have at least two words
 		VERBOSE(3,"void ContextSimilarity::create_ngram" << std::endl);
 
@@ -136,6 +178,10 @@ namespace irstlm {
 		//		if (text.size()==0)
 		
 		//TO_CHECK: what happens when text has just one element
+		
+		
+		
+		// lm model for the numerator is assumed to be a 3-gram lm, hence num_gr have only size 3 (two words and one topic); here we insert two words
 		if (text.size()==1){
 			num_ng.pushw(num_ng.dict->OOV());
 		}else {
@@ -143,7 +189,7 @@ namespace irstlm {
 		}
 		num_ng.pushw(text.at(text.size()-1));
 		
-		den_ng.pushw(den_ng.dict->OOV());		//or den_ng.pushc(m_lm->getDict()->getoovcode());
+		// lm model for the denominator is assumed to be a 2-gram lm, hence den_gr have only size 2 (one word and one topic); here we insert one word
 		den_ng.pushw(text.at(text.size()-1));
 	}
 	
@@ -163,8 +209,8 @@ namespace irstlm {
 	
 	double ContextSimilarity::get_topic_similarity(string_vec_t text, const std::string& topic)
 	{
-		ngram num_ng(m_lm->getDict());
-		ngram den_ng(m_lm->getDict());
+		ngram num_ng(m_num_lm->getDict());
+		ngram den_ng(m_den_lm->getDict());
 		
 		create_topic_ngram(text, topic, num_ng, den_ng);
 		
@@ -173,11 +219,12 @@ namespace irstlm {
 	
 	double ContextSimilarity::get_topic_similarity(ngram& num_ng, ngram& den_ng)
 	{	
-		double num_pr=m_lm->clprob(num_ng);
-		double den_pr=m_lm->clprob(den_ng);
-	 VERBOSE(0, "num_ng:|" << num_ng << "| pr:" << num_pr << std::endl);
-	 VERBOSE(0, "den_ng:|" << den_ng << "| pr:" << den_pr << std::endl);
-		return m_lm->clprob(num_ng) - m_lm->clprob(den_ng);
+		double num_pr=m_num_lm->clprob(num_ng);
+		double den_pr=m_den_lm->clprob(den_ng);
+	 VERBOSE(4, "num_ng:|" << num_ng << "| pr:" << num_pr << std::endl);
+	 VERBOSE(4, "den_ng:|" << den_ng << "| pr:" << den_pr << std::endl);
+		return num_pr - den_pr;
+		//		return m_lm->clprob(num_ng) - m_lm->clprob(den_ng);
 	}
 	
 }//namespace irstlm
diff --git a/src/context-similarity.h b/src/context-similarity.h
index 4a2533b..d646fb6 100644
--- a/src/context-similarity.h
+++ b/src/context-similarity.h
@@ -37,16 +37,19 @@
 class ngram;
 
 namespace irstlm {
-	
+	#define topic_map_delimiter1 ':'
+	#define topic_map_delimiter2 ','
+	#define SIMILARITY_LOWER_BOUND -10000
 	
 	typedef std::map< std::string, float > topic_map_t;
 	typedef std::set< std::string > topic_dict_t;
 	
-	#define SIMILARITY_LOWER_BOUND -10000
+
 	class ContextSimilarity
 	{
 	private:
-		lmContainer* m_lm; // P(topic | h' w)
+		lmContainer* m_num_lm; // P(topic | h' w)
+		lmContainer* m_den_lm; // P(topic | h')
 		topic_dict_t m_lm_topic_dict; //the dictionary of the topics seen in the language model
 		topic_map_t topic_map; 
 		
@@ -58,10 +61,13 @@ namespace irstlm {
 		double get_topic_similarity(ngram& num_ng, ngram& den_ng);
 		
 	public:
-		ContextSimilarity(const std::string &dictfile, const std::string &modelfile);
+		ContextSimilarity(const std::string &dictfile, const std::string &num_modelfile, const std::string &den_modelfile);
 		~ContextSimilarity();
-
-		topic_map_t get_topic_scores(string_vec_t& text);
+		
+		void setContextMap(topic_map_t& topic_map, const std::string& context);
+		void get_topic_scores(topic_map_t& map, string_vec_t& text);
+		void add_topic_scores(topic_map_t& map, topic_map_t& tmp_map);
+		void print_topic_scores(topic_map_t& map);
 		
 		double score(string_vec_t& text, topic_map_t& topic_weights);
 	};
diff --git a/src/lmContextDependent.cpp b/src/lmContextDependent.cpp
index 00c7a34..da7e134 100644
--- a/src/lmContextDependent.cpp
+++ b/src/lmContextDependent.cpp
@@ -38,6 +38,7 @@ inline void error(const char* message)
 }
 
 namespace irstlm {
+	
 	lmContextDependent::lmContextDependent(float nlf, float dlf)
 	{
 		ngramcache_load_factor = nlf;
@@ -76,13 +77,13 @@ namespace irstlm {
 		tokenN = parseWords(line,words,LMCONFIGURE_MAX_TOKEN);
 		
 		if (tokenN != 1 || ((strcmp(words[0],"LMCONTEXTDEPENDENT") != 0) && (strcmp(words[0],"lmcontextdependent")!=0)))
-			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_model");
+			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_num_model topic_den_model");
 		
 		//reading ngram-based LM
 		inp.getline(line,BUFSIZ,'\n');
 		tokenN = parseWords(line,words,1);
 		if(tokenN < 1 || tokenN > 1) {
-			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_model");
+			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_num_model topic_den_model");
 		}
 		
 		VERBOSE(0, "modelfile:|" << words[0] << "|" << std::endl);
@@ -100,22 +101,45 @@ namespace irstlm {
 		
 		//reading topic model
 		inp.getline(line,BUFSIZ,'\n');
-		tokenN = parseWords(line,words,3);
+		tokenN = parseWords(line,words,4);
 		
-		if(tokenN < 3 || tokenN > 3) {
-			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_model");
+		if(tokenN < 4 || tokenN > 4) {
+			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_num_model topic_den_model");
 		}
 		
 		//loading topic model and initialization
 		m_similaritymodel_weight = (float) atof(words[0]);
-		m_similaritymodel = new ContextSimilarity(words[1], words[2]);
+		std::string _dict = words[1];
+		std::string _num_lm = words[2];
+		std::string _den_lm = words[3];
+		m_similaritymodel = new ContextSimilarity(_dict, _num_lm, _den_lm);
 		
 		inp.close();
 		
-		VERBOSE(0, "topicdict:|" << words[1] << "|" << std::endl);
-		VERBOSE(0, "topicmodel:|" << words[2] << "|" << std::endl);
+		VERBOSE(0, "topic_dict:|" << _dict << "|" << std::endl);
+		VERBOSE(0, "topic_num_model:|" << _num_lm << "|" << std::endl);
+		VERBOSE(0, "topic_den_model:|" << _den_lm << "|" << std::endl);
 	}
-	
+
+	void lmContextDependent::GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line)
+	{
+		size_t pos = line.find(context_delimiter);	
+		if (pos != std::string::npos){ // context_delimiter is found
+			sentence = line.substr(0, pos);
+			std::cout << sentence << std::endl;
+			line.erase(0, pos + context_delimiter.length());
+			VERBOSE(0,"pos:|" << pos << "|" << std::endl);	
+			VERBOSE(0,"sentence:|" << sentence << "|" << std::endl);	
+			VERBOSE(0,"line:|" << line << "|" << std::endl);	
+			
+			//getting context string;
+			context = line;
+		}else{
+			sentence = line;
+			context = "";
+		}	
+	}
+
 	double lmContextDependent::lprob(ngram ng, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
 	{
 		string_vec_t text;   // replace with the text passed as parameter
diff --git a/src/lmContextDependent.h b/src/lmContextDependent.h
index 7dc8364..7b0c7a5 100644
--- a/src/lmContextDependent.h
+++ b/src/lmContextDependent.h
@@ -62,6 +62,8 @@ namespace irstlm {
 	
 #define LMCONFIGURE_MAX_TOKEN 3
 	
+	static const std::string context_delimiter="___CONTEXT___";
+	
 	class lmContextDependent: public lmContainer
 	{
 	private:
@@ -73,15 +75,12 @@ namespace irstlm {
 		int memmap;  //level from which n-grams are accessed via mmap
 		
 		lmContainer* m_lm;
-//		std::string m_lm_file;
 		bool m_isinverted;
 		
-		//  TopicModel* m_topicmodel;
 		ContextSimilarity* m_similaritymodel;   //to remove when TopicModel is ready
 		double m_lm_weight;
 		
 		double m_similaritymodel_weight;
-//		std::string m_similaritymodel_file;
 		
 		float ngramcache_load_factor;
 		float dictionary_load_factor;
@@ -96,6 +95,12 @@ namespace irstlm {
 		void load(const std::string &filename,int mmap=0);
 		
 		
+		inline std::string getContextDelimiter() const{
+			return context_delimiter;
+		}
+		
+		void GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line);
+		
 		virtual double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL){
 			VERBOSE(0, "virtual double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL)" << std::endl << "This LM type (lmContextDependent) does not support this function" << std::endl);
 			UNUSED(ng);
@@ -155,6 +160,15 @@ namespace irstlm {
 			dict=d;
 		};
 		
+		
+		virtual inline lmContainer* getWordLM() const {
+			return m_lm;
+		};
+		
+		virtual inline ContextSimilarity* getContextSimilarity() const {
+			return m_similaritymodel;
+		};
+		
 		virtual inline dictionary* getDict() const {
 			return dict;
 		};

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git



More information about the debian-science-commits mailing list