[irstlm] 31/126: code cleanup; debugging outputs

Tue May 17 07:46:42 UTC 2016

This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.

commit e604c8b2630e126118628886a90a06639dc196f5
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date:   Mon Jul 27 07:56:29 2015 +0200

    code cleanup; debugging outputs
---
 src/context-dependent-evaluation.cpp | 89 +++---------------------------------
 src/context-similarity.cpp           | 61 ++++++++++++++++++++----
 src/context-similarity.h             |  6 +--
 src/lmContainer.cpp                  |  4 +-
 src/lmContainer.h                    |  2 +
 src/lmContextDependent.cpp           | 42 +++++++++--------
 src/lmContextDependent.h             |  2 +
 7 files changed, 90 insertions(+), 116 deletions(-)

diff --git a/src/context-dependent-evaluation.cpp b/src/context-dependent-evaluation.cpp
index 8923b3d..48a138f 100644
--- a/src/context-dependent-evaluation.cpp
+++ b/src/context-dependent-evaluation.cpp
@@ -171,12 +171,7 @@ int main(int argc, char **argv)
 		double sent_logPr=0,sent_PP=0,sent_PPwp=0;
 		
 		
-		ngram ng(lmt->getDict());
-		ng.dict->incflag(1);
-		int bos=ng.dict->encode(ng.dict->BoS());
-		int eos=ng.dict->encode(ng.dict->EoS());
-		ng.dict->incflag(0);
-		
+//		ngram ng(lmt->getDict());
 		
 		const std::string context_delimiter="___CONTEXT___";
 		const char topic_map_delimiter='=';
@@ -212,7 +207,8 @@ int main(int argc, char **argv)
 			}else{
 				sentence = line_str;
 				context = "";
-			}
+			}	
+			
 			VERBOSE(0,"context:|" << context << "|" << std::endl);	
 			VERBOSE(0,"line_str:|" << line_str << "|" << std::endl);	
 			//getting topic weights
@@ -226,8 +222,6 @@ int main(int argc, char **argv)
 			}
 			topic_weight_vec.clear();
 			
-			lmt->dictionary_incflag(1);
-			
 			
 			if(1){
 				// computation using std::string
@@ -238,6 +232,8 @@ int main(int argc, char **argv)
 				size_t last, first;
 				size_t size=0;
 				size_t order = lmt->maxlevel();
+				
+				VERBOSE(0,"w_vec.size():|" << w_vec.size() << "|" << std::endl);	
 				for (size_t i=0; i<w_vec.size(); ++i){
 					++size;
 					size=(size<order)?size:order;
@@ -252,11 +248,8 @@ int main(int argc, char **argv)
 					VERBOSE(0,"prob for first:|" << first << "| and last:|" << last << "| size:" << size << std::endl);
 					string_vec_t tmp_w_vec(w_vec.begin() + first, w_vec.begin() +last);
 					
-					for (string_vec_t::iterator it=tmp_w_vec.begin(); it!=tmp_w_vec.end(); ++it){
-						
-						VERBOSE(0,"*it:|" << *it << "|" << std::endl);	
-					}
-					if (ng.size>=1) {
+					if (size>=1) {
+						VERBOSE(0,"computing prob for first:|" << first << "| and last:|" << last << "| is Pr=" << Pr << std::endl);	
 						Pr=lmt->clprob(tmp_w_vec, topic_weight_map, &bow, &bol, &msp, &statesize);
 						VERBOSE(0,"prob for first:|" << first << "| and last:|" << last << "| is Pr=" << Pr << std::endl);	
 						logPr+=Pr;
@@ -270,74 +263,6 @@ int main(int argc, char **argv)
 				}
 			}
 			
-			if(0){
-			// computation using ngram object
-			// loop over ngrams of the sentence
-			std::istringstream ss(sentence); // Insert the string into a stream
-			while (ss >> ng){
-				//computing context-based prob for each ngram of the sentence
-				VERBOSE(0,"working on ng:|" << ng << "| ng.size:" << ng.size << std::endl);	
-				
-				if (ng.size>lmt->maxlevel()) ng.size=lmt->maxlevel();	
-				
-				// reset ngram at begin of sentence
-				if (*ng.wordp(1)==bos) {
-					ng.size=1;
-					continue;
-				}
-				
-				if (ng.size>=1) {
-					Pr=lmt->clprob(ng,topic_weight_map, &bow, &bol, &msp, &statesize);
-					VERBOSE(0,"prob for ng:|" << ng << "| is Pr=" << Pr << std::endl);	
-					logPr+=Pr;
-					sent_logPr+=Pr;
-					VERBOSE(0,"sent_logPr:|" << sent_logPr << " logPr:|" << logPr << std::endl);	
-					
-					if (debug==1) {
-						std::cout << ng.dict->decode(*ng.wordp(1)) << " [" << ng.size-bol << "]" << " ";
-						if (*ng.wordp(1)==eos) std::cout << std::endl;
-					} else if (debug==2) {
-						std::cout << ng << " [" << ng.size-bol << "-gram]" << " " << Pr;
-						std::cout << std::endl;
-						std::cout.flush();
-					} else if (debug==3) {
-						std::cout << ng << " [" << ng.size-bol << "-gram]" << " " << Pr << " bow:" << bow;
-						std::cout << std::endl;
-						std::cout.flush();
-					} else if (debug==4) {
-						std::cout << ng << " [" << ng.size-bol << "-gram: recombine:" << statesize << " state:" << (void*) msp << "] [" << ng.size+1-((bol==0)?(1):bol) << "-gram: bol:" << bol << "] " << Pr << " bow:" << bow;
-						std::cout << std::endl;
-						std::cout.flush();
-					}
-				}
-				
-				if (lmt->is_OOV(*ng.wordp(1))) {
-					Noov++;
-					sent_Noov++;
-				}
-				if (bol) {
-					Nbo++;
-					sent_Nbo++;
-				}
-				Nw++;
-				sent_Nw++;
-				if (sent_PP_flag && (*ng.wordp(1)==eos)) {
-					sent_PP=exp((-sent_logPr * log(10.0)) /sent_Nw);
-					sent_PPwp= sent_PP * (1 - 1/exp((sent_Noov *  lmt->getlogOOVpenalty()) * log(10.0) / sent_Nw));
-					
-					std::cout << "%% sent_Nw=" << sent_Nw
-					<< " sent_PP=" << sent_PP
-					<< " sent_PPwp=" << sent_PPwp
-					<< " sent_Nbo=" << sent_Nbo
-					<< " sent_Noov=" << sent_Noov
-					<< " sent_OOV=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
-					std::cout.flush();
-					//reset statistics for sentence based Perplexity
-					sent_Nw=sent_Noov=sent_Nbo=0;
-					sent_logPr=0.0;
-				}
-			}
-			}
 			if ((Nw % 100000)==0) {
 				std::cerr << ".";
 				lmt->check_caches_levels();
diff --git a/src/context-similarity.cpp b/src/context-similarity.cpp
index 9a7a7ca..ddfaf21 100644
--- a/src/context-similarity.cpp
+++ b/src/context-similarity.cpp
@@ -23,11 +23,13 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <iostream>
+#include <sstream>
 #include <stdexcept>
 #include <string>
 #include "lmContainer.h"
 #include "context-similarity.h"
 #include "util.h"
+#include "mfstream.h"
 
 using namespace std;
 
@@ -38,22 +40,40 @@ inline void error(const char* message)
 }
 
 namespace irstlm {
-	ContextSimilarity::ContextSimilarity(const std::string &filename)
+	ContextSimilarity::ContextSimilarity(const std::string &dictfile, const std::string &modelfile)
 	{
-		m_lm=lmContainer::CreateLanguageModel(filename);
+		m_lm=lmContainer::CreateLanguageModel(modelfile);
 		
-		m_lm->load(filename);
+		m_lm->load(modelfile);
 		
 		m_lm->getDict()->genoovcode();
+		
+		//loading form file		
+		std::string str;
+		
+		mfstream inp(dictfile.c_str(),ios::in);
+		
+		if (!inp) {
+			std::stringstream ss_msg;
+			ss_msg << "cannot open " << dictfile << "\n";
+			exit_error(IRSTLM_ERROR_IO, ss_msg.str());
+		}
+		VERBOSE(0, "Loading the list of topic" << std::endl);
+		
+		while (inp >> str)
+		{
+			m_lm_topic_dict.insert(str);
+		}
+		VERBOSE(0, "There are " << m_lm_topic_dict.size() << " topic" << std::endl);
 	}
+
 	
 	ContextSimilarity::~ContextSimilarity()
-	{
-		// delete m_lm
-	}
+	{}
 	
 	double ContextSimilarity::score(string_vec_t& text, topic_map_t& topic_weights)
 	{
+		VERBOSE(4, "double ContextSimilarity::score(string_vec_t& text, topic_map_t& topic_weights)" << std::endl);
 		if (topic_weights.size() == 0){
 			//a-priori topic distribution is "empty", i.e. there is nore score for any topic
 			//return a "constant" lower-bound score,  SIMILARITY_LOWER_BOUND = log(0.0)
@@ -72,11 +92,18 @@ namespace irstlm {
 			ngram num_ng = base_num_ng;
 			ngram den_ng = base_den_ng;
 			add_topic(it->first, num_ng, den_ng);
-			add_logprob = log(it->second) + get_topic_similarity(num_ng, den_ng);
+			
+			VERBOSE(0, "topic:|" << it->first << " log(p(topic):" << log(it->second) << std::endl);
+			double topic_score = get_topic_similarity(num_ng, den_ng);
+			add_logprob = log(it->second) + topic_score;
+			VERBOSE(0, "topic_score:" << topic_score << std::endl);
+			VERBOSE(0, "add_logprob:" << add_logprob << std::endl);
 			ret_logprob = logsum(ret_logprob, add_logprob);
 			++it;
 		}while (it!= topic_weights.end());
 		
+		
+		VERBOSE(0, "ret_logprob:" << ret_logprob << std::endl);
 		return ret_logprob;
 	}
 	
@@ -89,7 +116,7 @@ namespace irstlm {
 		ngram base_den_ng(m_lm->getDict());
 		create_ngram(text, base_num_ng, base_den_ng);
 		
-		for (topic_dict_t::iterator it=m_lm_topic_dict->begin(); it != m_lm_topic_dict->end(); ++it)
+		for (topic_dict_t::iterator it=m_lm_topic_dict.begin(); it != m_lm_topic_dict.end(); ++it)
 		{
 			ngram num_ng = base_num_ng;
 			ngram den_ng = base_den_ng;
@@ -103,7 +130,17 @@ namespace irstlm {
 	{
 		//text is  a vector of string with w in the last position and the history in the previous positions
 		//text must have at least two words
-		num_ng.pushw(text.at(text.size()-2));
+		VERBOSE(3,"void ContextSimilarity::create_ngram" << std::endl);
+
+		//TO_CHECK: what happens when text has zero element
+		//		if (text.size()==0)
+		
+		//TO_CHECK: what happens when text has just one element
+		if (text.size()==1){
+			num_ng.pushw(num_ng.dict->OOV());
+		}else {
+			num_ng.pushw(text.at(text.size()-2));
+		}
 		num_ng.pushw(text.at(text.size()-1));
 		
 		den_ng.pushw(den_ng.dict->OOV());		//or den_ng.pushc(m_lm->getDict()->getoovcode());
@@ -135,7 +172,11 @@ namespace irstlm {
 	}
 	
 	double ContextSimilarity::get_topic_similarity(ngram& num_ng, ngram& den_ng)
-	{
+	{	
+		double num_pr=m_lm->clprob(num_ng);
+		double den_pr=m_lm->clprob(den_ng);
+	 VERBOSE(0, "num_ng:|" << num_ng << "| pr:" << num_pr << std::endl);
+	 VERBOSE(0, "den_ng:|" << den_ng << "| pr:" << den_pr << std::endl);
 		return m_lm->clprob(num_ng) - m_lm->clprob(den_ng);
 	}
 	
diff --git a/src/context-similarity.h b/src/context-similarity.h
index 16fbdf3..4a2533b 100644
--- a/src/context-similarity.h
+++ b/src/context-similarity.h
@@ -47,8 +47,8 @@ namespace irstlm {
 	{
 	private:
 		lmContainer* m_lm; // P(topic | h' w)
-		topic_dict_t* m_lm_topic_dict; //the dictionary of the topics seen in the language model
-		topic_map_t* topic_map; 
+		topic_dict_t m_lm_topic_dict; //the dictionary of the topics seen in the language model
+		topic_map_t topic_map; 
 		
 		void create_ngram(const string_vec_t& text, ngram& num_ng, ngram& den_ng);
 		void add_topic(const std::string& topic, ngram& num_ng, ngram& den_ng);
@@ -58,7 +58,7 @@ namespace irstlm {
 		double get_topic_similarity(ngram& num_ng, ngram& den_ng);
 		
 	public:
-		ContextSimilarity(const std::string &filename);
+		ContextSimilarity(const std::string &dictfile, const std::string &modelfile);
 		~ContextSimilarity();
 
 		topic_map_t get_topic_scores(string_vec_t& text);
diff --git a/src/lmContainer.cpp b/src/lmContainer.cpp
index dc042f8..56e7187 100644
--- a/src/lmContainer.cpp
+++ b/src/lmContainer.cpp
@@ -92,7 +92,6 @@ namespace irstlm {
 		VERBOSE(1,"LM header:|" << header << "|" << std::endl);
 		
 		int type=_IRSTLM_LMUNKNOWN;
-		VERBOSE(1,"type: " << type << std::endl);
 		if (header == "lminterpolation" || header == "LMINTERPOLATION") {
 			type = _IRSTLM_LMINTERPOLATION;
 		} else if (header == "lmcontextdependent" || header == "LMCONTEXTDEPENDENT") {
@@ -104,7 +103,7 @@ namespace irstlm {
 		} else {
 			type = _IRSTLM_LMTABLE;
 		}
-		VERBOSE(1,"type: " << type << std::endl);
+		VERBOSE(1,"LM type: " << type << std::endl);
 		
 		return type;
 	};
@@ -151,6 +150,7 @@ namespace irstlm {
 		}
 		
 		lm->setLanguageModelType(type);
+		
 		return lm;
 	}
 	
diff --git a/src/lmContainer.h b/src/lmContainer.h
index 5f760ff..131f207 100644
--- a/src/lmContainer.h
+++ b/src/lmContainer.h
@@ -122,6 +122,7 @@ public:
     return 0.0;
   };
   virtual double clprob(string_vec_t& text, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL) {
+		VERBOSE(0,"lmContainer::clprob(string_vec_t& text, double* bow,...." << std::endl);
     UNUSED(text);
     UNUSED(bow);
     UNUSED(bol);
@@ -152,6 +153,7 @@ public:
   }
 	
 	virtual double clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) {
+		VERBOSE(3,"lmContainer::clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow,...." << std::endl);
     UNUSED(topic_weights);
     return clprob(text, bow, bol, maxsuffptr, statesize, extendible);
   }
diff --git a/src/lmContextDependent.cpp b/src/lmContextDependent.cpp
index 5fab621..00c7a34 100644
--- a/src/lmContextDependent.cpp
+++ b/src/lmContextDependent.cpp
@@ -67,6 +67,7 @@ namespace irstlm {
 		
 		//get info from the configuration file
 		fstream inp(filename.c_str(),ios::in|ios::binary);
+		VERBOSE(0, "filename:|" << filename << "|" << std::endl);
 		
 		char line[MAX_LINE];
 		const char* words[LMCONFIGURE_MAX_TOKEN];
@@ -75,43 +76,44 @@ namespace irstlm {
 		tokenN = parseWords(line,words,LMCONFIGURE_MAX_TOKEN);
 		
 		if (tokenN != 1 || ((strcmp(words[0],"LMCONTEXTDEPENDENT") != 0) && (strcmp(words[0],"lmcontextdependent")!=0)))
-			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nweight_of_ngram_LM filename_of_LM\nweight_of_topic_model filename_of_topic_model");
+			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_model");
 		
 		//reading ngram-based LM
 		inp.getline(line,BUFSIZ,'\n');
-		tokenN = parseWords(line,words,2);
-		if(tokenN < 2 || tokenN > 2) {
-			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nweight_of_ngram_LM filename_of_LM\nweight_of_topic_model filename_of_topic_model");
+		tokenN = parseWords(line,words,1);
+		if(tokenN < 1 || tokenN > 1) {
+			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_model");
 		}
 		
-		//loading ngram-based LM and initialization
-		m_lm_weight = (float) atof(words[0]);
-		
+		VERBOSE(0, "modelfile:|" << words[0] << "|" << std::endl);
 		//checking the language model type
-		m_lm=lmContainer::CreateLanguageModel(words[1],ngramcache_load_factor, dictionary_load_factor);
+		m_lm=lmContainer::CreateLanguageModel(words[0],ngramcache_load_factor, dictionary_load_factor);
 		
 		m_lm->setMaxLoadedLevel(requiredMaxlev);
 		
-		m_lm->load(words[1], memmap);
+		m_lm->load(words[0], memmap);
 		maxlev=m_lm->maxlevel();
 		dict=m_lm->getDict();
 		getDict()->genoovcode();
 		
-		m_lm->init_caches(m_lm->maxlevel());		
+		m_lm->init_caches(m_lm->maxlevel());
 		
 		//reading topic model
 		inp.getline(line,BUFSIZ,'\n');
-		tokenN = parseWords(line,words,2);
+		tokenN = parseWords(line,words,3);
 		
-		if(tokenN < 2 || tokenN > 2) {
-			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nweight_of_ngram_LM filename_of_LM\nweight_of_topic_model filename_of_topic_model");
+		if(tokenN < 3 || tokenN > 3) {
+			error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_model");
 		}
 		
 		//loading topic model and initialization
 		m_similaritymodel_weight = (float) atof(words[0]);
-		m_similaritymodel = new ContextSimilarity(words[1]);
+		m_similaritymodel = new ContextSimilarity(words[1], words[2]);
 		
 		inp.close();
+		
+		VERBOSE(0, "topicdict:|" << words[1] << "|" << std::endl);
+		VERBOSE(0, "topicmodel:|" << words[2] << "|" << std::endl);
 	}
 	
 	double lmContextDependent::lprob(ngram ng, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
@@ -119,8 +121,8 @@ namespace irstlm {
 		string_vec_t text;   // replace with the text passed as parameter
 		double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
 		double similarity_score = m_similaritymodel->score(text, topic_weights);
-		double ret_logprob = m_lm_weight * lm_logprob + m_similaritymodel_weight * similarity_score;
-		VERBOSE(0, "lm_logprob:" << lm_logprob << " m_lm_weight:" << m_lm_weight<< " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_logprob:" << ret_logprob << std::endl);
+		double ret_logprob = lm_logprob + m_similaritymodel_weight * similarity_score;
+		VERBOSE(0, "lm_logprob:" << lm_logprob << " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_logprob:" << ret_logprob << std::endl);
 		
 		return ret_logprob;
 	}
@@ -131,18 +133,20 @@ namespace irstlm {
 		//create the actual ngram
 		ngram ng(dict);
 		ng.pushw(text);
+		VERBOSE(0,"ng:|" << ng << "|" << std::endl);		
+		
 		MY_ASSERT (ng.size == (int) text.size());
 		double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
 		double similarity_score = m_similaritymodel->score(text, topic_weights);
-		double ret_logprob = m_lm_weight * lm_logprob + m_similaritymodel_weight * similarity_score;
-		VERBOSE(0, "lm_logprob:" << lm_logprob << " m_lm_weight:" << m_lm_weight<< " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_logprob:" << ret_logprob << std::endl);
+		double ret_logprob = lm_logprob + m_similaritymodel_weight * similarity_score;
+		VERBOSE(0, "lm_logprob:" << lm_logprob << " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_logprob:" << ret_logprob << std::endl);
 		
 		return ret_logprob;
 	}
 	
 	double lmContextDependent::lprob(int* codes, int sz, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
 	{
-		VERBOSE(0,"lmContextDependent::lprob(int* codes, int sz, topic_map_t& topic_weights, " << std::endl);
+		VERBOSE(3,"lmContextDependent::lprob(int* codes, int sz, topic_map_t& topic_weights, " << std::endl);
 		//create the actual ngram
 		ngram ong(dict);
 		ong.pushc(codes,sz);
diff --git a/src/lmContextDependent.h b/src/lmContextDependent.h
index efecec6..7dc8364 100644
--- a/src/lmContextDependent.h
+++ b/src/lmContextDependent.h
@@ -137,6 +137,8 @@ namespace irstlm {
 			return lprob(ng, topic_weights, bow, bol, maxsuffptr, statesize, extendible);
 		};
 		virtual double clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL){
+			
+			VERBOSE(0,"lmContainer::clprob(string_vec_t& text,...." << std::endl);
 			return lprob(text, topic_weights, bow, bol, maxsuffptr, statesize, extendible);
 		};
 		

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git