[irstlm] 59/126: added evaluation of ranking

Giulio Paci giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:45 UTC 2016


This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.

commit 7b840239959bce2eaf5bd88865ed7822310d697e
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date:   Wed Sep 9 17:07:41 2015 +0200

    added evaluation of ranking
---
 src/context-dependent-evaluation.cpp | 477 ++++++++++++++++++++++++++++++++---
 1 file changed, 441 insertions(+), 36 deletions(-)

diff --git a/src/context-dependent-evaluation.cpp b/src/context-dependent-evaluation.cpp
index a3c6f88..45ade14 100644
--- a/src/context-dependent-evaluation.cpp
+++ b/src/context-dependent-evaluation.cpp
@@ -35,6 +35,29 @@
 using namespace std;
 using namespace irstlm;
 
+typedef std::pair<double,int> double_and_int_pair;
+
+struct cmp_double_and_int_pair {
+	//order first by the first field (double), and in case of equality by the second field (int)
+	bool operator()(const double_and_int_pair& a, const double_and_int_pair& b) const {
+		if (a.first < b.first){
+			return true;
+		}else if (a.first > b.first){
+			return false;
+		}else{
+			if (a.second<b.second){
+				return true;
+			}else{
+				return false;
+			}
+		}
+	}
+};
+
+typedef std::map<int, double_and_int_pair> int_to_double_and_int_map;
+//typedef std::map<double_and_int_pair,int,cmp_double_and_int_pair> double_and_int_to_int_map;
+typedef std::map<double_and_int_pair,double_and_int_pair,cmp_double_and_int_pair> double_and_int_to_double_and_int_map;
+
 /********************************/
 void print_help(int TypeFlag=0){
   std::cerr << std::endl << "context-dependent-evaluation - compute ngram probabilities and text perplexity given a LM" << std::endl;
@@ -63,10 +86,12 @@ int main(int argc, char **argv)
   char *testfile=NULL;
   char *lmfile=NULL;
 	
-	bool sent_PP_flag = false;
+	bool sent_flag = false;
 	bool contextbasedscore = false;
 	bool topicscore = false;
+	bool rankscore = false;
 	bool context_model_active = true;
+	bool context_model_normalization = false;
 	
 	int debug = 0;
   int requiredMaxlev = 1000;
@@ -84,15 +109,17 @@ int main(int argc, char **argv)
 								"r", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file",
                 "contextbasedscore", CMDBOOLTYPE|CMDMSG, &contextbasedscore, "computes context-dependent probabilities and pseudo-perplexity of the text from standard input",
                 "topicscore", CMDBOOLTYPE|CMDMSG, &topicscore, "computes the topic scores of the text from standard input",
+                "rankscore", CMDBOOLTYPE|CMDMSG, &rankscore, "computes the average rank position of the text from standard input",
 								"debug", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
 								"d", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
                 "level", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken",
 								"l", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken",
                 "dub", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7",
-                "sentence", CMDBOOLTYPE|CMDMSG, &sent_PP_flag, "computes perplexity at sentence level (identified through the end symbol)",
+                "sentence", CMDBOOLTYPE|CMDMSG, &sent_flag, "computes perplexity at sentence level (identified through the end symbol)",
                 "dict_load_factor", CMDFLOATTYPE|CMDMSG, &dictionary_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is 0",
                 "ngram_load_factor", CMDFLOATTYPE|CMDMSG, &ngramcache_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is false",
-                "context_model_active", CMDBOOLTYPE|CMDMSG, &context_model_active, "enable/disable context-dependent models (default is true)",
+                "context_model_active", CMDBOOLTYPE|CMDMSG, &context_model_active, "enable/disable context-dependent model (default is true)",
+                "context_model_normalization", CMDBOOLTYPE|CMDMSG, &context_model_normalization, "enable/disable normalization of context-dependent model (default is false)",
 								
 								"Help", CMDBOOLTYPE|CMDMSG, &help, "print this help",
 								"h", CMDBOOLTYPE|CMDMSG, &help, "print this help",
@@ -128,7 +155,6 @@ int main(int argc, char **argv)
   if (topicscore==true) std::cerr << "topicscore: " << topicscore << std::endl;
   std::cerr << "loading up to the LM level " << requiredMaxlev << " (if any)" << std::endl;
   std::cerr << "dub: " << dub<< std::endl;
-  std::cerr << "dub: " << dub<< std::endl;
 	
 	
   //checking the language model type
@@ -140,6 +166,7 @@ int main(int argc, char **argv)
 	
   lmt->load(infile);
 	((lmContextDependent*) lmt)->set_Active(context_model_active);
+	((lmContextDependent*) lmt)->set_Normalized(context_model_normalization);
 	
   if (dub) lmt->setlogOOVpenalty((int)dub);
 	
@@ -174,16 +201,13 @@ int main(int argc, char **argv)
 			
 			std::string line_str = line;
 			
-			VERBOSE(2,"input_line:|" << line_str << "|" << std::endl);	
+			VERBOSE(2,"input_line:|" << line_str << "|" << std::endl);
 			
 			//getting sentence string;
 			std::string sentence;
 			std::string context;
 			
-			
 			((lmContextDependent*) lmt)->GetSentenceAndContext(sentence,context,line_str);
-			VERBOSE(2,"sentence:|" << sentence << "|" << std::endl);	
-			VERBOSE(2,"context:|" << context << "|" << std::endl);
 			
 			//getting apriori topic weights
 			topic_map_t apriori_topic_map;
@@ -206,6 +230,7 @@ int main(int argc, char **argv)
 				++size;
 				size=(size<order)?size:order;
 				last=i+1;
+				
 				// reset ngram at begin of sentence
 				if (word_vec.at(i) == lmt->getDict()->BoS()) {
 					size=0;
@@ -219,18 +244,43 @@ int main(int argc, char **argv)
 					VERBOSE(2,"computing topic_scores for first:|" << first << "| and last:|" << last << "|" << std::endl);	
 					
 					topic_map_t tmp_topic_map;
-					((lmContextDependent*) lmt)->getContextSimilarity()->get_topic_scores(tmp_topic_map, tmp_word_vec);
-					
-					VERBOSE(2,"first:" << first << " last:" << last <<  " tmp_topic_map.size:" << tmp_topic_map.size() << std::endl);
-					if (debug > 0){
+					((lmContextDependent*) lmt)->getContextSimilarity()->get_topic_scores(tmp_word_vec, tmp_topic_map);
+					IFVERBOSE(2){
+						VERBOSE(2,"before normalization word-based topic-distribution:");
+						((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map);
+					}
+					((lmContextDependent*) lmt)->getContextSimilarity()->normalize_topic_scores(tmp_topic_map);
+					IFVERBOSE(2){
+						VERBOSE(2,"after normalization word-based topic-distribution:");
 						((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map);
 					}
+					VERBOSE(2,"first:" << first << " last:" << last <<  " tmp_topic_map.size:" << tmp_topic_map.size() << std::endl);
+					
 					((lmContextDependent*) lmt)->getContextSimilarity()->add_topic_scores(sentence_topic_map, tmp_topic_map);
+					IFVERBOSE(2){
+						//						VERBOSE(2,"word-based topic-distribution:");
+						//						((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map);
+						VERBOSE(2,"word-based topic-distribution:");
+						((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map,apriori_topic_map,1);
+					}
 					tmp_topic_map.clear();
+					//					IFVERBOSE(2){
+					//						VERBOSE(2,"sentence-based topic-distribution:");
+					//						((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map);
+					//						VERBOSE(2,"sentence-based topic-distribution:");
+					//						((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map,apriori_topic_map);
+					//					}
 				}
 			}
+			IFVERBOSE(2){
+				//						VERBOSE(2,"sentence-based topic-distribution:");
+				//						((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map);
+				VERBOSE(2,"sentence-based topic-distribution:");
+				((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map,apriori_topic_map,last);
+			}
 			std::cout << sentence << ((lmContextDependent*) lmt)->getContextDelimiter();
-			((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map);			
+			((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map);
+			//			((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map,apriori_topic_map);			
 			apriori_topic_map.clear();
 		}
 		
@@ -258,6 +308,9 @@ int main(int argc, char **argv)
 		
 		int Nw=0,Noov=0;
 		double logPr=0,PP=0,PPwp=0,Pr;
+		double norm_logPr=0,norm_PP=0,norm_PPwp=0,norm_Pr;
+		double model_logPr=0,model_PP=0,model_PPwp=0,model_Pr;
+		double model_norm_logPr=0,model_norm_PP=0,model_norm_PPwp=0,model_norm_Pr;
 		
 		double bow;
 		int bol=0;
@@ -266,7 +319,15 @@ int main(int argc, char **argv)
 		
 		// variables for storing sentence-based Perplexity
 		int sent_Nw=0,sent_Noov=0;
-		double sent_logPr=0,sent_PP=0,sent_PPwp=0;		
+		double sent_logPr=0,sent_PP=0,sent_PPwp=0;	
+		double sent_norm_logPr=0,sent_norm_PP=0,sent_norm_PPwp=0;		
+		double sent_model_logPr=0,sent_model_PP=0,sent_model_PPwp=0;		
+		double sent_model_norm_logPr=0,sent_model_norm_PP=0,sent_model_norm_PPwp=0;		
+		
+		double oovpenalty = lmt->getlogOOVpenalty();
+		double norm_oovpenalty = oovpenalty;
+		
+		VERBOSE(1,"oovpenalty:" << oovpenalty  << std::endl);	
 		
 		std::fstream inptxt(testfile,std::ios::in);
 		
@@ -283,8 +344,6 @@ int main(int argc, char **argv)
 			std::string context;
 			
 			((lmContextDependent*) lmt)->GetSentenceAndContext(sentence,context,line_str);
-			VERBOSE(1,"sentence:|" << sentence << "|" << std::endl);	
-			VERBOSE(1,"context:|" << context << "|" << std::endl);
 			
 			//getting apriori topic weights
 			topic_map_t apriori_topic_map;
@@ -306,7 +365,7 @@ int main(int argc, char **argv)
 				++size;
 				size=(size<order)?size:order;
 				last=i+1;
-
+				
 				// reset ngram at begin of sentence
 				if (word_vec.at(i) == lmt->getDict()->BoS()) {
 					size=0;
@@ -317,17 +376,9 @@ int main(int argc, char **argv)
 				string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
 				
 				if (size>=1) {
-					VERBOSE(2,"computing prob for first:|" << first << "| and last:|" << last << "|" << std::endl);	
-					Pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
-					logPr+=Pr;
-					sent_logPr+=Pr;
-					VERBOSE(2,"sent_logPr:|" << sent_logPr << " logPr:|" << logPr << std::endl);	
+					VERBOSE(2,"computing prob for first:|" << first << "| and last:|" << last << "|" << std::endl);
 					
-					if (debug==1) {
-						std::cout << "first:|" << first << "| and last:| [" << size-bol << "]" << " " << std::endl;
-					}
-					
-					VERBOSE(2,"word_vec.at(i):|" << word_vec.at(i) << " lmt->getDict()->OOV():|" << lmt->getDict()->OOV() << std::endl);
+					VERBOSE(2,"word_vec.at(i):|" << word_vec.at(i) << "| lmt->getDict()->OOV():|" << lmt->getDict()->OOV() << "|" << std::endl);
 					if (lmt->getDict()->encode(word_vec.at(i).c_str()) == lmt->getDict()->oovcode()) {
 						Noov++;
 						sent_Noov++;
@@ -340,38 +391,392 @@ int main(int argc, char **argv)
 						lmt->check_caches_levels();
 					}
 					
+					
+					VERBOSE(2,"tmp_word_vec.size:|" << tmp_word_vec.size() << "|" << std::endl);	
+					VERBOSE(2,"dict.size:|" << lmt->getDict()->size() << "|" << std::endl);	
+					
+					Pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
+					double tot_pr = 0.0;
+					if (context_model_normalization){
+						tot_pr = ((lmContextDependent*) lmt)->total_clprob(tmp_word_vec, apriori_topic_map);
+					}
+					
+//					string_vec_t::iterator it=tmp_word_vec.end()-1;
+					int current_pos = tmp_word_vec.size()-1;
+					std::string current_word = tmp_word_vec.at(current_pos);
+					
+					int_to_double_and_int_map code_to_prob_and_code_map;
+					double_and_int_to_double_and_int_map prob_and_code_to_prob_and_rank_map;
+					
+					//computation of the oracle probability. i.e. the maximum prob
+					double best_pr = -1000000.0;
+					int best_code = lmt->getlogOOVpenalty();
+					for (int i=0; i<lmt->getDict()->size(); ++i)
+					{
+						//loop over all words in the LM
+					  tmp_word_vec.at(current_pos) = lmt->getDict()->decode(i);
+						IFVERBOSE(3){
+							std::cout << "tmp_word_vec i:" << i;
+							for (string_vec_t::const_iterator it2=tmp_word_vec.begin(); it2!=tmp_word_vec.end(); ++it2) {	std::cout << " |" << (*it2) << "|"; }
+							std::cout << std::endl;
+						}				
+						
+						double pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
+						if (best_pr < pr){
+							best_pr = pr;
+							best_code = i;
+							VERBOSE(3,"current_best:" << best_code << " current_word:|" << lmt->getDict()->decode(best_code) << "| best_prob:" << pow(10.0,best_pr) << " norm_best_prob:" << pow(10.0,best_pr - tot_pr) << std::endl);
+						}
+					}
+					model_Pr = best_pr;
+					VERBOSE(2,"model_best_code:" << best_code << " model_best_word:|" << lmt->getDict()->decode(best_code) << "| model_best_prob:" << pow(10.0,best_pr) << std::endl);
+					IFVERBOSE(3){
+						for (int_to_double_and_int_map::const_iterator it3=code_to_prob_and_code_map.begin(); it3!=code_to_prob_and_code_map.end(); ++it3)
+						{
+							VERBOSE(3,"it3: word:" << it3->first << " pr:" << (it3->second).first << " word:" << (it3->second).second << std::endl);
+						}
+						
+						for (double_and_int_to_double_and_int_map::const_iterator it3=prob_and_code_to_prob_and_rank_map.begin(); it3!=prob_and_code_to_prob_and_rank_map.end(); ++it3)
+						{
+							VERBOSE(3,"it3: pr:" << (it3->first).first << " second:" << (it3->first).second << " norm_pr:" << (it3->second).first << " rank:" << (it3->second).second << std::endl);
+						}
+					}
+
+					norm_oovpenalty = oovpenalty;
+					VERBOSE(2,"tot_pr:" << tot_pr << " oovpenalty:" << oovpenalty << " norm_oovpenalty:" << norm_oovpenalty << std::endl);	
+
+
+					norm_Pr = Pr - tot_pr;
+					model_norm_Pr = model_Pr - tot_pr;
+					VERBOSE(1,"Pr:" << Pr << " norm_Pr:" << norm_Pr << " model_Pr:" << model_Pr << " model_norm_Pr:" << model_norm_Pr << " model_best_code:" << best_code << " model_best_word:|" << lmt->getDict()->decode(best_code) << "|" << std::endl);
+
+					model_norm_logPr+=model_norm_Pr;
+					sent_model_norm_logPr+=model_norm_Pr;
+					norm_logPr+=norm_Pr;
+					sent_norm_logPr+=norm_Pr;
+					VERBOSE(2,"sent_model_norm_logPr:" << sent_model_norm_logPr << " model_norm_logPr:" << model_norm_logPr << std::endl);	
+					VERBOSE(2,"sent_norm_logPr:" << sent_norm_logPr << " norm_logPr:" << norm_logPr << std::endl);	
+					
+					model_logPr+=model_Pr;
+					sent_model_logPr+=model_Pr;
+					logPr+=Pr;
+					sent_logPr+=Pr;
+					VERBOSE(2,"sent_model_logPr:" << sent_model_logPr << " model_logPr:" << model_logPr << std::endl);	
+					VERBOSE(2,"sent_logPr:" << sent_logPr << " logPr:" << logPr << std::endl);
+					 
+					
 				}
 			}
 			
-			if (sent_PP_flag) {
-				sent_PP=exp((-sent_logPr * M_LN10) /sent_Nw);
-				sent_PPwp= sent_PP * (1 - 1/exp((sent_Noov *  lmt->getlogOOVpenalty()) * M_LN10 / sent_Nw));
+			if (sent_flag) {
+				sent_model_norm_PP = exp((-sent_model_norm_logPr * M_LN10) / sent_Nw);
+				sent_model_norm_PPwp = sent_model_norm_PP * (1 - 1/exp(sent_Noov *  norm_oovpenalty * M_LN10 / sent_Nw));
+				sent_norm_PP = exp((-sent_norm_logPr * M_LN10) / sent_Nw);
+				sent_norm_PPwp = sent_norm_PP * (1 - 1/exp(sent_Noov * norm_oovpenalty * M_LN10 / sent_Nw));
+				
 				
 				std::cout << "%% sent_Nw=" << sent_Nw
+				<< " sent_norm_logPr=" << sent_norm_logPr
+				<< " sent_norm_PP=" << sent_norm_PP
+				<< " sent_norm_PPwp=" << sent_norm_PPwp
+				<< " sent_norm_PP_noOOV=" << (sent_norm_PP-sent_norm_PPwp)
+				<< " sent_Noov=" << sent_Noov
+				<< " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
+				std::cout << "%% sent_Nw=" << sent_Nw
+				<< " sent_model_norm_logPr=" << sent_model_norm_logPr
+				<< " sent_model_norm_PP=" << sent_model_norm_PP
+				<< " sent_model_norm_PPwp=" << sent_model_norm_PPwp
+				<< " sent_model_norm_PP_noOOV=" << (sent_model_norm_PP-sent_model_norm_PPwp)
+				<< " sent_Noov=" << sent_Noov
+				<< " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
+				
+				sent_model_PP = exp((-sent_model_logPr * M_LN10) / sent_Nw);
+				sent_model_PPwp = sent_model_PP * (1 - 1/exp(sent_Noov * oovpenalty * M_LN10 / sent_Nw));
+				sent_PP = exp((-sent_logPr * M_LN10) / sent_Nw);
+				sent_PPwp = sent_PP * (1 - 1/exp(sent_Noov * oovpenalty * M_LN10 / sent_Nw));
+				std::cout << "%% sent_Nw=" << sent_Nw
+				<< " sent_logPr=" << sent_logPr
 				<< " sent_PP=" << sent_PP
 				<< " sent_PPwp=" << sent_PPwp
+				<< " sent_PP_noOOV=" << (sent_PP-sent_PPwp)
+				<< " sent_Noov=" << sent_Noov
+				<< " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
+			
+				std::cout << "%% sent_Nw=" << sent_Nw
+				<< " sent_model_logPr=" << sent_model_logPr
+				<< " sent_model_PP=" << sent_model_PP
+				<< " sent_model_PPwp=" << sent_model_PPwp
+				<< " sent_model_PP_noOOV=" << (sent_model_PP-sent_model_PPwp)
 				<< " sent_Noov=" << sent_Noov
-				<< " sent_OOV=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
+				<< " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
 				std::cout.flush();
 				//reset statistics for sentence based Perplexity
 				sent_Noov = 0;
 				sent_Nw = 0;
-				sent_logPr=0.0;
+				sent_model_norm_logPr = 0.0;
+				sent_model_logPr = 0.0;
+				sent_norm_logPr = 0.0;
+				sent_logPr = 0.0;
 			}
 			
 			apriori_topic_map.clear();
 		}
 		
 		
-		PP=exp((-logPr * M_LN10) / Nw);
-		PPwp= PP * (1 - 1/exp((Noov *  lmt->getlogOOVpenalty()) * M_LN10 / Nw));
+		model_norm_PP = exp((-model_norm_logPr * M_LN10) / Nw);
+		model_norm_PPwp = model_norm_PP * (1 - 1/exp(Noov *  norm_oovpenalty * M_LN10 / Nw));
+		model_PP = exp((-model_logPr * M_LN10) / Nw);
+		model_PPwp = model_PP * (1 - 1/exp(Noov * oovpenalty * M_LN10 / Nw));
+		norm_PP = exp((-norm_logPr * M_LN10) / Nw);
+		norm_PPwp = norm_PP * (1 - 1/exp(Noov *  norm_oovpenalty * M_LN10 / Nw));
+		PP = exp((-logPr * M_LN10) / Nw);
+		PPwp = PP * (1 - 1/exp(Noov * oovpenalty * M_LN10 / Nw));
 		
 		std::cout << "%% Nw=" << Nw
+		<< " model_logPr=" << model_logPr
+		<< " model_PP=" << model_PP
+		<< " model_PPwp=" << model_PPwp
+		<< " model_PP_noOOV=" << (model_PP-model_PPwp)
+		<< " Noov=" << Noov
+		<< " OOVrate=" << (float)Noov/Nw * 100.0 << "%";
+		std::cout << std::endl;
+		std::cout.flush();
+		std::cout << "%% Nw=" << Nw
+		<< " model_norm_logPr=" << model_norm_logPr
+		<< " model_norm_PP=" << model_norm_PP
+		<< " model_norm_PPwp=" << model_norm_PPwp
+		<< " model_norm_PP_noOOV=" << (model_norm_PP-model_norm_PPwp)
+		<< " Noov=" << Noov
+		<< " OOVrate=" << (float)Noov/Nw * 100.0 << "%";
+		std::cout << std::endl;
+		std::cout.flush();
+		std::cout << "%% Nw=" << Nw
+		<< " logPr=" << logPr
 		<< " PP=" << PP
 		<< " PPwp=" << PPwp
+		<< " PP_noOOV=" << (PP-PPwp)
+		<< " Noov=" << Noov
+		<< " OOVrate=" << (float)Noov/Nw * 100.0 << "%";
+		std::cout << std::endl;
+		std::cout.flush();
+		std::cout << "%% Nw=" << Nw
+		<< " norm_logPr=" << norm_logPr
+		<< " norm_PP=" << norm_PP
+		<< " norm_PPwp=" << norm_PPwp
+		<< " norm_PP_noOOV=" << (norm_PP-norm_PPwp)
+		<< " Noov=" << Noov
+		<< " OOVrate=" << (float)Noov/Nw * 100.0 << "%";
+		std::cout << std::endl;
+		std::cout.flush();
+		
+		if (debug>1) lmt->used_caches();
+		
+		if (debug>1) lmt->stat();
+		
+		delete lmt;
+		return 0;
+	}
+  if (rankscore == true) {
+		
+		if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) {
+			debug = (debug>4)?4:debug;
+			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+		}
+		if (lmt->getLanguageModelType() == _IRSTLM_LMMACRO) {
+			debug = (debug>4)?4:debug;
+			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+		}
+		if (lmt->getLanguageModelType() == _IRSTLM_LMCLASS) {
+			debug = (debug>4)?4:debug;
+			std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+		}
+		std::cerr << "Start RankBased Evaluation" << std::endl;
+		std::cerr << "OOV code: " << lmt->getDict()->oovcode() << std::endl;
+		std::cout.setf(ios::fixed);
+		std::cout.precision(2);
+		
+		int Nw=0,Noov=0;
+		double avgRank;
+		int tot_rank = 0;
+		
+		double bow;
+		int bol=0;
+		char *msp;
+		unsigned int statesize;
+		
+		// variables for storing sentence-based Rank Statistics
+		int sent_Nw=0,sent_Noov=0;
+		double sent_avgRank;
+		int sent_tot_rank = 0;	
+		
+		std::fstream inptxt(testfile,std::ios::in);
+		
+		// loop over input lines
+		char line[MAX_LINE];
+		while (inptxt.getline(line,MAX_LINE)) {
+			
+			std::string line_str = line;
+			
+			VERBOSE(1,"input_line:|" << line_str << "|" << std::endl);	
+			
+			//getting sentence string;
+			std::string sentence;
+			std::string context;
+			
+			((lmContextDependent*) lmt)->GetSentenceAndContext(sentence,context,line_str);
+			
+			//getting apriori topic weights
+			topic_map_t apriori_topic_map;
+			((lmContextDependent*) lmt)->getContextSimilarity()->setContextMap(apriori_topic_map,context); 
+			
+			// computation using std::string
+			// loop over ngrams of the sentence
+			string_vec_t word_vec;
+			split(sentence, ' ', word_vec);
+			
+			//first points to the last recent term to take into account
+			//last points to the position after the most recent term to take into account
+			//last could point outside the vector of string; do NOT use word_vec.at(last)
+			size_t last, first;
+			size_t size=0;
+			size_t order = lmt->maxlevel();
+			
+			for (size_t i=0; i<word_vec.size(); ++i){
+				++size;
+				size=(size<order)?size:order;
+				last=i+1;
+				
+				// reset ngram at begin of sentence
+				if (word_vec.at(i) == lmt->getDict()->BoS()) {
+					size=0;
+					continue;
+				}
+				first = last - size;
+				
+				string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
+				
+				if (size>=1) {
+					
+					VERBOSE(2,"computing rank for first:|" << first << "| and last:|" << last << "|" << std::endl);	
+					
+					VERBOSE(2,"word_vec.at(i):|" << word_vec.at(i) << "| lmt->getDict()->OOV():|" << lmt->getDict()->OOV() << "|" << std::endl);
+					
+					if (lmt->getDict()->encode(word_vec.at(i).c_str()) == lmt->getDict()->oovcode()) {
+						Noov++;
+						sent_Noov++;
+					}
+					Nw++;
+					sent_Nw++;
+					
+					if ((Nw % 100000)==0) {
+						std::cerr << ".";
+						lmt->check_caches_levels();
+					}
+					
+					VERBOSE(2,"tmp_word_vec.size:|" << tmp_word_vec.size() << "|" << std::endl);	
+					VERBOSE(2,"dict.size:|" << lmt->getDict()->size() << "|" << std::endl);	
+					string_vec_t::iterator it=tmp_word_vec.end()-1;
+					
+					int current_pos = tmp_word_vec.size()-1;
+					std::string current_word = tmp_word_vec.at(current_pos);
+					int current_code = lmt->getDict()->encode(current_word.c_str());
+					
+					int_to_double_and_int_map code_to_prob_and_code_map;
+					double_and_int_to_double_and_int_map prob_and_code_to_prob_and_rank_map;
+					
+					//computation of the ranking
+					int default_rank=-1;
+					for (int i=0; i<lmt->getDict()->size(); i++)
+					{
+						//loop over all words in the LM
+					  tmp_word_vec.at(current_pos) = lmt->getDict()->decode(i);
+						//					  *it = lmt->getDict()->decode(i);
+						IFVERBOSE(3){
+							std::cout << "tmp_word_vec i:" << i;
+							for (string_vec_t::const_iterator it2=tmp_word_vec.begin(); it2!=tmp_word_vec.end(); ++it2) {
+								std::cout << " |" << (*it2) << "|";
+							}
+							std::cout << std::endl;
+						}
+						double pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
+						if (context_model_normalization){
+							double tot_pr = ((lmContextDependent*) lmt)->total_clprob(tmp_word_vec, apriori_topic_map);
+							pr = pr - tot_pr;
+						}
+						code_to_prob_and_code_map.insert(make_pair(i,make_pair(pr,i)));
+						prob_and_code_to_prob_and_rank_map.insert(make_pair(make_pair(pr,i),make_pair(pr,default_rank)));
+						VERBOSE(3," i:" << i << " word_prob:" << pr << std::endl);
+					}
+					IFVERBOSE(3){
+						
+						for (int_to_double_and_int_map::const_iterator it3=code_to_prob_and_code_map.begin(); it3!=code_to_prob_and_code_map.end();++it3)
+						{
+							VERBOSE(3,"it3: word:" << it3->first << " pr:" << (it3->second).first << " word:" << (it3->second).second << std::endl);
+						}
+						
+						for (double_and_int_to_double_and_int_map::const_iterator it3=prob_and_code_to_prob_and_rank_map.begin(); it3!=prob_and_code_to_prob_and_rank_map.end();++it3)
+						{
+							VERBOSE(3,"it3: pr:" << (it3->first).first << " second:" << (it3->first).second << " norm_pr:" << (it3->second).first << " rank:" << (it3->second).second << std::endl);
+						}
+					}
+					
+					
+					//set rank of word according to their prob
+					//note that prob are already sorted in the map in ascending order wrt to prob (and secondarily code)
+					int rank=0;
+					for (double_and_int_to_double_and_int_map::reverse_iterator it3=prob_and_code_to_prob_and_rank_map.rbegin(); it3!=prob_and_code_to_prob_and_rank_map.rend();++it3)
+					{
+						(it3->second).second = rank;
+						++rank;
+						IFVERBOSE(3){
+							if (rank < 10){
+								VERBOSE(3,"it3: pr:" << (it3->first).first << " code:" << (it3->first).second << " rank:" << (it3->second).second << std::endl);
+							}
+						}
+					}
+
+					IFVERBOSE(3){
+						int i_tmp=0;
+						for (double_and_int_to_double_and_int_map::const_reverse_iterator it3=prob_and_code_to_prob_and_rank_map.rbegin(); it3!=prob_and_code_to_prob_and_rank_map.rend();++it3)
+						{
+							VERBOSE(3,"it3: pr:" << (it3->first).first << " code:" << (it3->first).second <<" rank:" << (it3->second).second << std::endl);
+							if (++i_tmp==10) break;
+						}
+					}
+					double_and_int_pair current_prob_and_code = (code_to_prob_and_code_map.find(current_code))->second;
+					int current_rank = ((prob_and_code_to_prob_and_rank_map.find(current_prob_and_code))->second).second;
+					VERBOSE(1," current_word:" << current_word << " current_code:" << current_code << " current_rank:" << current_rank << std::endl);
+					
+					sent_tot_rank += current_rank;
+					tot_rank += current_rank;
+				}
+			}
+			
+			if (sent_flag) {
+				VERBOSE(1," sent_tot_rank:" << sent_tot_rank << " sent_Nw:" << sent_Nw << std::endl);
+				
+				sent_avgRank = sent_tot_rank / sent_Nw;
+				
+				std::cout << "%% sent_Nw=" << sent_Nw
+				<< " sent_avgRank=" << sent_avgRank
+				<< " sent_Noov=" << sent_Noov
+				<< " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%";
+				std::cout << std::endl;
+				std::cout.flush();
+				
+				//reset statistics for sentence based avg Ranking
+				sent_Nw = 0;
+				sent_Noov = 0;
+				sent_tot_rank = 0;
+			}
+		}
+		
+		avgRank = tot_rank / Nw;
+		
+		std::cout << "%% Nw=" << Nw
+		<< " avgRank=" << avgRank
 		<< " Noov=" << Noov
-		<< " OOV=" << (float)Noov/Nw * 100.0 << "%";
-		if (debug > 0) std::cout << " log10_Pr=" <<  logPr;
+		<< " OOVrate=" << (float)Noov/Nw * 100.0 << "%";
 		std::cout << std::endl;
 		std::cout.flush();
 		

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git



More information about the debian-science-commits mailing list