[irstlm] 34/126: code cleanup
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:42 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.
commit 8685d5f6fd58564aa44028e5d6cf719f0f416702
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Fri Jul 31 08:52:07 2015 +0200
code cleanup
---
src/context-dependent-evaluation.cpp | 187 +++++++++++++++++++----------------
1 file changed, 103 insertions(+), 84 deletions(-)
diff --git a/src/context-dependent-evaluation.cpp b/src/context-dependent-evaluation.cpp
index 2400d10..9a3cf1e 100644
--- a/src/context-dependent-evaluation.cpp
+++ b/src/context-dependent-evaluation.cpp
@@ -181,60 +181,57 @@ int main(int argc, char **argv)
VERBOSE(0,"sentence:|" << sentence << "|" << std::endl);
VERBOSE(0,"context:|" << context << "|" << std::endl);
VERBOSE(0,"line_str:|" << line_str << "|" << std::endl);
-
+
//getting apriori topic weights
topic_map_t apriori_topic_map;
((lmContextDependent*) lmt)->getContextSimilarity()->setContextMap(apriori_topic_map,context);
- if(1){
- // computation using std::string
- // loop over ngrams of the sentence
- string_vec_t word_vec;
- split(sentence, ' ', word_vec);
-
- //first points to the last recent term to take into account
- //last points to the position after the most recent term to take into account
- //last could point outside the vector of string; do NOT use word_vec.at(last)
- size_t last, first;
- size_t size=0;
- size_t order = lmt->maxlevel();
-
+ // computation using std::string
+ // loop over ngrams of the sentence
+ string_vec_t word_vec;
+ split(sentence, ' ', word_vec);
+
+ //first points to the least recent term to take into account
+ //last points to the position after the most recent term to take into account
+ //last could point outside the vector of strings; do NOT use word_vec.at(last)
+ size_t last, first;
+ size_t size=0;
+ size_t order = lmt->maxlevel();
+
+
+
+ topic_map_t sentence_topic_map;
+ VERBOSE(0,"word_vec.size():|" << word_vec.size() << "|" << std::endl);
+ for (size_t i=0; i<word_vec.size(); ++i){
+ ++size;
+ size=(size<order)?size:order;
+ last=i+1;
+ // reset ngram at begin of sentence
+ if (word_vec.at(i) == lmt->getDict()->BoS()) {
+ size=0;
+ continue;
+ }
+ first = last - size;
+ VERBOSE(0,"topic scores for first:|" << first << "| last:|" << last << "| size:" << size << std::endl);
+ string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
- topic_map_t sentence_topic_map;
- VERBOSE(0,"word_vec.size():|" << word_vec.size() << "|" << std::endl);
- for (size_t i=0; i<word_vec.size(); ++i){
- ++size;
- size=(size<order)?size:order;
- last=i+1;
- // reset ngram at begin of sentence
- if (word_vec.at(i) == lmt->getDict()->BoS()) {
- size=0;
- continue;
- }
- first = last - size;
+ if (size>=1) {
+ VERBOSE(2,"computing topic_scores for first:|" << first << "| and last:|" << last << "|" << std::endl);
- VERBOSE(0,"topic scores for first:|" << first << "| last:|" << last << "| size:" << size << std::endl);
- string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
+ topic_map_t tmp_topic_map;
+ ((lmContextDependent*) lmt)->getContextSimilarity()->get_topic_scores(tmp_topic_map, tmp_word_vec);
- if (size>=1) {
- VERBOSE(2,"computing topic_scores for first:|" << first << "| and last:|" << last << "|" << std::endl);
-
- topic_map_t tmp_topic_map;
- ((lmContextDependent*) lmt)->getContextSimilarity()->get_topic_scores(tmp_topic_map, tmp_word_vec);
-
- VERBOSE(2,std::cout << "first:" << first << " last:" << last << ((lmContextDependent*) lmt)->getContextDelimiter());
- if (debug > 0){
- ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map);
- }
- ((lmContextDependent*) lmt)->getContextSimilarity()->add_topic_scores(sentence_topic_map, tmp_topic_map);
- tmp_topic_map.clear();
+ VERBOSE(2,std::cout << "first:" << first << " last:" << last << ((lmContextDependent*) lmt)->getContextDelimiter());
+ if (debug > 0){
+ ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map);
}
+ ((lmContextDependent*) lmt)->getContextSimilarity()->add_topic_scores(sentence_topic_map, tmp_topic_map);
+ tmp_topic_map.clear();
}
- std::cout << sentence << ((lmContextDependent*) lmt)->getContextDelimiter();
- ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map);
}
-
+ std::cout << sentence << ((lmContextDependent*) lmt)->getContextDelimiter();
+ ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map);
apriori_topic_map.clear();
}
@@ -261,7 +258,7 @@ int main(int argc, char **argv)
std::cout.setf(ios::fixed);
std::cout.precision(2);
- int Nbo=0, Nw=0,Noov=0;
+ int Nw=0,Noov=0;
double logPr=0,PP=0,PPwp=0,Pr;
double bow;
@@ -270,7 +267,7 @@ int main(int argc, char **argv)
unsigned int statesize;
// variables for storing sentence-based Perplexity
- int sent_Nbo=0, sent_Nw=0,sent_Noov=0;
+ int sent_Nw=0,sent_Noov=0;
double sent_logPr=0,sent_PP=0,sent_PPwp=0;
std::fstream inptxt(testfile,std::ios::in);
@@ -282,7 +279,7 @@ int main(int argc, char **argv)
std::string line_str = line;
VERBOSE(0,"line_str:|" << line_str << "|" << std::endl);
-
+
//getting sentence string;
std::string sentence;
std::string context;
@@ -296,55 +293,78 @@ int main(int argc, char **argv)
topic_map_t apriori_topic_map;
((lmContextDependent*) lmt)->getContextSimilarity()->setContextMap(apriori_topic_map,context);
+ // computation using std::string
+ // loop over ngrams of the sentence
+ string_vec_t word_vec;
+ split(sentence, ' ', word_vec);
+
+ //first points to the least recent term to take into account
+ //last points to the position after the most recent term to take into account
+ //last could point outside the vector of strings; do NOT use word_vec.at(last)
+ size_t last, first;
+ size_t size=0;
+ size_t order = lmt->maxlevel();
- if(1){
- // computation using std::string
- // loop over ngrams of the sentence
- string_vec_t word_vec;
- split(sentence, ' ', word_vec);
+ VERBOSE(0,"word_vec.size():|" << word_vec.size() << "|" << std::endl);
+ for (size_t i=0; i<word_vec.size(); ++i){
+ ++size;
+ size=(size<order)?size:order;
+ last=i+1;
+ // reset ngram at begin of sentence
+ if (word_vec.at(i) == lmt->getDict()->BoS()) {
+ size=0;
+ continue;
+ }
+ first = last - size;
- //first points to the last recent term to take into account
- //last points to the position after the most recent term to take into account
- //last could point outside the vector of string; do NOT use word_vec.at(last)
- size_t last, first;
- size_t size=0;
- size_t order = lmt->maxlevel();
+ VERBOSE(0,"prob for first:|" << first << "| last:|" << last << "| size:" << size << std::endl);
+ string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
- VERBOSE(0,"word_vec.size():|" << word_vec.size() << "|" << std::endl);
- for (size_t i=0; i<word_vec.size(); ++i){
- ++size;
- size=(size<order)?size:order;
- last=i+1;
- // reset ngram at begin of sentence
- if (word_vec.at(i) == lmt->getDict()->BoS()) {
- size=0;
- continue;
+ if (size>=1) {
+ VERBOSE(0,"computing prob for first:|" << first << "| and last:|" << last << "|" << std::endl);
+ Pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
+ VERBOSE(0," --> prob for first:|" << first << "| and last:|" << last << "| is Pr=" << Pr << std::endl);
+ logPr+=Pr;
+ sent_logPr+=Pr;
+ VERBOSE(0,"sent_logPr:|" << sent_logPr << " logPr:|" << logPr << std::endl);
+
+ if (debug==1) {
+ std::cout << "first:|" << first << "| and last:| [" << size-bol << "]" << " " << std::endl;
}
- first = last - size;
- VERBOSE(0,"prob for first:|" << first << "| last:|" << last << "| size:" << size << std::endl);
- string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
+ VERBOSE(0,"word_vec.at(i):|" << word_vec.at(i) << " lmt->getDict()->OOV():|" << lmt->getDict()->OOV() << std::endl);
+ if (lmt->getDict()->encode(word_vec.at(i).c_str()) == lmt->getDict()->oovcode()) {
+ Noov++;
+ sent_Noov++;
+ }
+ Nw++;
+ sent_Nw++;
- if (size>=1) {
- VERBOSE(0,"computing prob for first:|" << first << "| and last:|" << last << "|" << std::endl);
- Pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
- VERBOSE(0," --> prob for first:|" << first << "| and last:|" << last << "| is Pr=" << Pr << std::endl);
- logPr+=Pr;
- sent_logPr+=Pr;
- VERBOSE(0,"sent_logPr:|" << sent_logPr << " logPr:|" << logPr << std::endl);
-
- if (debug==1) {
- std::cout << "first:|" << first << "| and last:| [" << size-bol << "]" << " " << std::endl;
- }
+ if ((Nw % 100000)==0) {
+ std::cerr << ".";
+ lmt->check_caches_levels();
}
+
}
}
- if ((Nw % 100000)==0) {
- std::cerr << ".";
- lmt->check_caches_levels();
+ if (sent_PP_flag) {
+ sent_PP=exp((-sent_logPr * log(10.0)) /sent_Nw);
+ sent_PPwp= sent_PP * (1 - 1/exp((sent_Noov * lmt->getlogOOVpenalty()) * log(10.0) / sent_Nw));
+
+ std::cout << "%% sent_Nw=" << sent_Nw
+ << " sent_PP=" << sent_PP
+ << " sent_PPwp=" << sent_PPwp
+ << " sent_Noov=" << sent_Noov
+ << " sent_OOV=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
+ std::cout.flush();
+ //reset statistics for sentence based Perplexity
+ sent_Nw=sent_Noov;
+ sent_logPr=0.0;
}
+
+
apriori_topic_map.clear();
}
@@ -356,7 +376,6 @@ int main(int argc, char **argv)
std::cout << "%% Nw=" << Nw
<< " PP=" << PP
<< " PPwp=" << PPwp
- << " Nbo=" << Nbo
<< " Noov=" << Noov
<< " OOV=" << (float)Noov/Nw * 100.0 << "%";
if (debug) std::cout << " logPr=" << logPr;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list