[irstlm] 59/126: added evaluation of ranking
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:45 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.
commit 7b840239959bce2eaf5bd88865ed7822310d697e
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Wed Sep 9 17:07:41 2015 +0200
added evaluation of ranking
---
src/context-dependent-evaluation.cpp | 477 ++++++++++++++++++++++++++++++++---
1 file changed, 441 insertions(+), 36 deletions(-)
diff --git a/src/context-dependent-evaluation.cpp b/src/context-dependent-evaluation.cpp
index a3c6f88..45ade14 100644
--- a/src/context-dependent-evaluation.cpp
+++ b/src/context-dependent-evaluation.cpp
@@ -35,6 +35,29 @@
using namespace std;
using namespace irstlm;
+typedef std::pair<double,int> double_and_int_pair;
+
+struct cmp_double_and_int_pair {
+ //order first by the first field (double), and in case of equality by the second field (int)
+ bool operator()(const double_and_int_pair& a, const double_and_int_pair& b) const {
+ if (a.first < b.first){
+ return true;
+ }else if (a.first > b.first){
+ return false;
+ }else{
+ if (a.second<b.second){
+ return true;
+ }else{
+ return false;
+ }
+ }
+ }
+};
+
+typedef std::map<int, double_and_int_pair> int_to_double_and_int_map;
+//typedef std::map<double_and_int_pair,int,cmp_double_and_int_pair> double_and_int_to_int_map;
+typedef std::map<double_and_int_pair,double_and_int_pair,cmp_double_and_int_pair> double_and_int_to_double_and_int_map;
+
/********************************/
void print_help(int TypeFlag=0){
std::cerr << std::endl << "context-dependent-evaluation - compute ngram probabilities and text perplexity given a LM" << std::endl;
@@ -63,10 +86,12 @@ int main(int argc, char **argv)
char *testfile=NULL;
char *lmfile=NULL;
- bool sent_PP_flag = false;
+ bool sent_flag = false;
bool contextbasedscore = false;
bool topicscore = false;
+ bool rankscore = false;
bool context_model_active = true;
+ bool context_model_normalization = false;
int debug = 0;
int requiredMaxlev = 1000;
@@ -84,15 +109,17 @@ int main(int argc, char **argv)
"r", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file",
"contextbasedscore", CMDBOOLTYPE|CMDMSG, &contextbasedscore, "computes context-dependent probabilities and pseudo-perplexity of the text from standard input",
"topicscore", CMDBOOLTYPE|CMDMSG, &topicscore, "computes the topic scores of the text from standard input",
+ "rankscore", CMDBOOLTYPE|CMDMSG, &rankscore, "computes the average rank position of the text from standard input",
"debug", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
"d", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
"level", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken",
"l", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken",
"dub", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7",
- "sentence", CMDBOOLTYPE|CMDMSG, &sent_PP_flag, "computes perplexity at sentence level (identified through the end symbol)",
+ "sentence", CMDBOOLTYPE|CMDMSG, &sent_flag, "computes perplexity at sentence level (identified through the end symbol)",
"dict_load_factor", CMDFLOATTYPE|CMDMSG, &dictionary_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is 0",
"ngram_load_factor", CMDFLOATTYPE|CMDMSG, &ngramcache_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is false",
- "context_model_active", CMDBOOLTYPE|CMDMSG, &context_model_active, "enable/disable context-dependent models (default is true)",
+ "context_model_active", CMDBOOLTYPE|CMDMSG, &context_model_active, "enable/disable context-dependent model (default is true)",
+ "context_model_normalization", CMDBOOLTYPE|CMDMSG, &context_model_normalization, "enable/disable normalization of context-dependent model (default is false)",
"Help", CMDBOOLTYPE|CMDMSG, &help, "print this help",
"h", CMDBOOLTYPE|CMDMSG, &help, "print this help",
@@ -128,7 +155,6 @@ int main(int argc, char **argv)
if (topicscore==true) std::cerr << "topicscore: " << topicscore << std::endl;
std::cerr << "loading up to the LM level " << requiredMaxlev << " (if any)" << std::endl;
std::cerr << "dub: " << dub<< std::endl;
- std::cerr << "dub: " << dub<< std::endl;
//checking the language model type
@@ -140,6 +166,7 @@ int main(int argc, char **argv)
lmt->load(infile);
((lmContextDependent*) lmt)->set_Active(context_model_active);
+ ((lmContextDependent*) lmt)->set_Normalized(context_model_normalization);
if (dub) lmt->setlogOOVpenalty((int)dub);
@@ -174,16 +201,13 @@ int main(int argc, char **argv)
std::string line_str = line;
- VERBOSE(2,"input_line:|" << line_str << "|" << std::endl);
+ VERBOSE(2,"input_line:|" << line_str << "|" << std::endl);
//getting sentence string;
std::string sentence;
std::string context;
-
((lmContextDependent*) lmt)->GetSentenceAndContext(sentence,context,line_str);
- VERBOSE(2,"sentence:|" << sentence << "|" << std::endl);
- VERBOSE(2,"context:|" << context << "|" << std::endl);
//getting apriori topic weights
topic_map_t apriori_topic_map;
@@ -206,6 +230,7 @@ int main(int argc, char **argv)
++size;
size=(size<order)?size:order;
last=i+1;
+
// reset ngram at begin of sentence
if (word_vec.at(i) == lmt->getDict()->BoS()) {
size=0;
@@ -219,18 +244,43 @@ int main(int argc, char **argv)
VERBOSE(2,"computing topic_scores for first:|" << first << "| and last:|" << last << "|" << std::endl);
topic_map_t tmp_topic_map;
- ((lmContextDependent*) lmt)->getContextSimilarity()->get_topic_scores(tmp_topic_map, tmp_word_vec);
-
- VERBOSE(2,"first:" << first << " last:" << last << " tmp_topic_map.size:" << tmp_topic_map.size() << std::endl);
- if (debug > 0){
+ ((lmContextDependent*) lmt)->getContextSimilarity()->get_topic_scores(tmp_word_vec, tmp_topic_map);
+ IFVERBOSE(2){
+ VERBOSE(2,"before normalization word-based topic-distribution:");
+ ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map);
+ }
+ ((lmContextDependent*) lmt)->getContextSimilarity()->normalize_topic_scores(tmp_topic_map);
+ IFVERBOSE(2){
+ VERBOSE(2,"after normalization word-based topic-distribution:");
((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map);
}
+ VERBOSE(2,"first:" << first << " last:" << last << " tmp_topic_map.size:" << tmp_topic_map.size() << std::endl);
+
((lmContextDependent*) lmt)->getContextSimilarity()->add_topic_scores(sentence_topic_map, tmp_topic_map);
+ IFVERBOSE(2){
+ // VERBOSE(2,"word-based topic-distribution:");
+ // ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map);
+ VERBOSE(2,"word-based topic-distribution:");
+ ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map,apriori_topic_map,1);
+ }
tmp_topic_map.clear();
+ // IFVERBOSE(2){
+ // VERBOSE(2,"sentence-based topic-distribution:");
+ // ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map);
+ // VERBOSE(2,"sentence-based topic-distribution:");
+ // ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map,apriori_topic_map);
+ // }
}
}
+ IFVERBOSE(2){
+ // VERBOSE(2,"sentence-based topic-distribution:");
+ // ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map);
+ VERBOSE(2,"sentence-based topic-distribution:");
+ ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map,apriori_topic_map,last);
+ }
std::cout << sentence << ((lmContextDependent*) lmt)->getContextDelimiter();
- ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map);
+ ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map);
+ // ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map,apriori_topic_map);
apriori_topic_map.clear();
}
@@ -258,6 +308,9 @@ int main(int argc, char **argv)
int Nw=0,Noov=0;
double logPr=0,PP=0,PPwp=0,Pr;
+ double norm_logPr=0,norm_PP=0,norm_PPwp=0,norm_Pr;
+ double model_logPr=0,model_PP=0,model_PPwp=0,model_Pr;
+ double model_norm_logPr=0,model_norm_PP=0,model_norm_PPwp=0,model_norm_Pr;
double bow;
int bol=0;
@@ -266,7 +319,15 @@ int main(int argc, char **argv)
// variables for storing sentence-based Perplexity
int sent_Nw=0,sent_Noov=0;
- double sent_logPr=0,sent_PP=0,sent_PPwp=0;
+ double sent_logPr=0,sent_PP=0,sent_PPwp=0;
+ double sent_norm_logPr=0,sent_norm_PP=0,sent_norm_PPwp=0;
+ double sent_model_logPr=0,sent_model_PP=0,sent_model_PPwp=0;
+ double sent_model_norm_logPr=0,sent_model_norm_PP=0,sent_model_norm_PPwp=0;
+
+ double oovpenalty = lmt->getlogOOVpenalty();
+ double norm_oovpenalty = oovpenalty;
+
+ VERBOSE(1,"oovpenalty:" << oovpenalty << std::endl);
std::fstream inptxt(testfile,std::ios::in);
@@ -283,8 +344,6 @@ int main(int argc, char **argv)
std::string context;
((lmContextDependent*) lmt)->GetSentenceAndContext(sentence,context,line_str);
- VERBOSE(1,"sentence:|" << sentence << "|" << std::endl);
- VERBOSE(1,"context:|" << context << "|" << std::endl);
//getting apriori topic weights
topic_map_t apriori_topic_map;
@@ -306,7 +365,7 @@ int main(int argc, char **argv)
++size;
size=(size<order)?size:order;
last=i+1;
-
+
// reset ngram at begin of sentence
if (word_vec.at(i) == lmt->getDict()->BoS()) {
size=0;
@@ -317,17 +376,9 @@ int main(int argc, char **argv)
string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
if (size>=1) {
- VERBOSE(2,"computing prob for first:|" << first << "| and last:|" << last << "|" << std::endl);
- Pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
- logPr+=Pr;
- sent_logPr+=Pr;
- VERBOSE(2,"sent_logPr:|" << sent_logPr << " logPr:|" << logPr << std::endl);
+ VERBOSE(2,"computing prob for first:|" << first << "| and last:|" << last << "|" << std::endl);
- if (debug==1) {
- std::cout << "first:|" << first << "| and last:| [" << size-bol << "]" << " " << std::endl;
- }
-
- VERBOSE(2,"word_vec.at(i):|" << word_vec.at(i) << " lmt->getDict()->OOV():|" << lmt->getDict()->OOV() << std::endl);
+ VERBOSE(2,"word_vec.at(i):|" << word_vec.at(i) << "| lmt->getDict()->OOV():|" << lmt->getDict()->OOV() << "|" << std::endl);
if (lmt->getDict()->encode(word_vec.at(i).c_str()) == lmt->getDict()->oovcode()) {
Noov++;
sent_Noov++;
@@ -340,38 +391,392 @@ int main(int argc, char **argv)
lmt->check_caches_levels();
}
+
+ VERBOSE(2,"tmp_word_vec.size:|" << tmp_word_vec.size() << "|" << std::endl);
+ VERBOSE(2,"dict.size:|" << lmt->getDict()->size() << "|" << std::endl);
+
+ Pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
+ double tot_pr = 0.0;
+ if (context_model_normalization){
+ tot_pr = ((lmContextDependent*) lmt)->total_clprob(tmp_word_vec, apriori_topic_map);
+ }
+
+// string_vec_t::iterator it=tmp_word_vec.end()-1;
+ int current_pos = tmp_word_vec.size()-1;
+ std::string current_word = tmp_word_vec.at(current_pos);
+
+ int_to_double_and_int_map code_to_prob_and_code_map;
+ double_and_int_to_double_and_int_map prob_and_code_to_prob_and_rank_map;
+
+ //computation of the oracle probability. i.e. the maximum prob
+ double best_pr = -1000000.0;
+ int best_code = lmt->getlogOOVpenalty();
+ for (int i=0; i<lmt->getDict()->size(); ++i)
+ {
+ //loop over all words in the LM
+ tmp_word_vec.at(current_pos) = lmt->getDict()->decode(i);
+ IFVERBOSE(3){
+ std::cout << "tmp_word_vec i:" << i;
+ for (string_vec_t::const_iterator it2=tmp_word_vec.begin(); it2!=tmp_word_vec.end(); ++it2) { std::cout << " |" << (*it2) << "|"; }
+ std::cout << std::endl;
+ }
+
+ double pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
+ if (best_pr < pr){
+ best_pr = pr;
+ best_code = i;
+ VERBOSE(3,"current_best:" << best_code << " current_word:|" << lmt->getDict()->decode(best_code) << "| best_prob:" << pow(10.0,best_pr) << " norm_best_prob:" << pow(10.0,best_pr - tot_pr) << std::endl);
+ }
+ }
+ model_Pr = best_pr;
+ VERBOSE(2,"model_best_code:" << best_code << " model_best_word:|" << lmt->getDict()->decode(best_code) << "| model_best_prob:" << pow(10.0,best_pr) << std::endl);
+ IFVERBOSE(3){
+ for (int_to_double_and_int_map::const_iterator it3=code_to_prob_and_code_map.begin(); it3!=code_to_prob_and_code_map.end(); ++it3)
+ {
+ VERBOSE(3,"it3: word:" << it3->first << " pr:" << (it3->second).first << " word:" << (it3->second).second << std::endl);
+ }
+
+ for (double_and_int_to_double_and_int_map::const_iterator it3=prob_and_code_to_prob_and_rank_map.begin(); it3!=prob_and_code_to_prob_and_rank_map.end(); ++it3)
+ {
+ VERBOSE(3,"it3: pr:" << (it3->first).first << " second:" << (it3->first).second << " norm_pr:" << (it3->second).first << " rank:" << (it3->second).second << std::endl);
+ }
+ }
+
+ norm_oovpenalty = oovpenalty;
+ VERBOSE(2,"tot_pr:" << tot_pr << " oovpenalty:" << oovpenalty << " norm_oovpenalty:" << norm_oovpenalty << std::endl);
+
+
+ norm_Pr = Pr - tot_pr;
+ model_norm_Pr = model_Pr - tot_pr;
+ VERBOSE(1,"Pr:" << Pr << " norm_Pr:" << norm_Pr << " model_Pr:" << model_Pr << " model_norm_Pr:" << model_norm_Pr << " model_best_code:" << best_code << " model_best_word:|" << lmt->getDict()->decode(best_code) << "|" << std::endl);
+
+ model_norm_logPr+=model_norm_Pr;
+ sent_model_norm_logPr+=model_norm_Pr;
+ norm_logPr+=norm_Pr;
+ sent_norm_logPr+=norm_Pr;
+ VERBOSE(2,"sent_model_norm_logPr:" << sent_model_norm_logPr << " model_norm_logPr:" << model_norm_logPr << std::endl);
+ VERBOSE(2,"sent_norm_logPr:" << sent_norm_logPr << " norm_logPr:" << norm_logPr << std::endl);
+
+ model_logPr+=model_Pr;
+ sent_model_logPr+=model_Pr;
+ logPr+=Pr;
+ sent_logPr+=Pr;
+ VERBOSE(2,"sent_model_logPr:" << sent_model_logPr << " model_logPr:" << model_logPr << std::endl);
+ VERBOSE(2,"sent_logPr:" << sent_logPr << " logPr:" << logPr << std::endl);
+
+
}
}
- if (sent_PP_flag) {
- sent_PP=exp((-sent_logPr * M_LN10) /sent_Nw);
- sent_PPwp= sent_PP * (1 - 1/exp((sent_Noov * lmt->getlogOOVpenalty()) * M_LN10 / sent_Nw));
+ if (sent_flag) {
+ sent_model_norm_PP = exp((-sent_model_norm_logPr * M_LN10) / sent_Nw);
+ sent_model_norm_PPwp = sent_model_norm_PP * (1 - 1/exp(sent_Noov * norm_oovpenalty * M_LN10 / sent_Nw));
+ sent_norm_PP = exp((-sent_norm_logPr * M_LN10) / sent_Nw);
+ sent_norm_PPwp = sent_norm_PP * (1 - 1/exp(sent_Noov * norm_oovpenalty * M_LN10 / sent_Nw));
+
std::cout << "%% sent_Nw=" << sent_Nw
+ << " sent_norm_logPr=" << sent_norm_logPr
+ << " sent_norm_PP=" << sent_norm_PP
+ << " sent_norm_PPwp=" << sent_norm_PPwp
+ << " sent_norm_PP_noOOV=" << (sent_norm_PP-sent_norm_PPwp)
+ << " sent_Noov=" << sent_Noov
+ << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
+ std::cout << "%% sent_Nw=" << sent_Nw
+ << " sent_model_norm_logPr=" << sent_model_norm_logPr
+ << " sent_model_norm_PP=" << sent_model_norm_PP
+ << " sent_model_norm_PPwp=" << sent_model_norm_PPwp
+ << " sent_model_norm_PP_noOOV=" << (sent_model_norm_PP-sent_model_norm_PPwp)
+ << " sent_Noov=" << sent_Noov
+ << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
+
+ sent_model_PP = exp((-sent_model_logPr * M_LN10) / sent_Nw);
+ sent_model_PPwp = sent_model_PP * (1 - 1/exp(sent_Noov * oovpenalty * M_LN10 / sent_Nw));
+ sent_PP = exp((-sent_logPr * M_LN10) / sent_Nw);
+ sent_PPwp = sent_PP * (1 - 1/exp(sent_Noov * oovpenalty * M_LN10 / sent_Nw));
+ std::cout << "%% sent_Nw=" << sent_Nw
+ << " sent_logPr=" << sent_logPr
<< " sent_PP=" << sent_PP
<< " sent_PPwp=" << sent_PPwp
+ << " sent_PP_noOOV=" << (sent_PP-sent_PPwp)
+ << " sent_Noov=" << sent_Noov
+ << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
+
+ std::cout << "%% sent_Nw=" << sent_Nw
+ << " sent_model_logPr=" << sent_model_logPr
+ << " sent_model_PP=" << sent_model_PP
+ << " sent_model_PPwp=" << sent_model_PPwp
+ << " sent_model_PP_noOOV=" << (sent_model_PP-sent_model_PPwp)
<< " sent_Noov=" << sent_Noov
- << " sent_OOV=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
+ << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
std::cout.flush();
//reset statistics for sentence based Perplexity
sent_Noov = 0;
sent_Nw = 0;
- sent_logPr=0.0;
+ sent_model_norm_logPr = 0.0;
+ sent_model_logPr = 0.0;
+ sent_norm_logPr = 0.0;
+ sent_logPr = 0.0;
}
apriori_topic_map.clear();
}
- PP=exp((-logPr * M_LN10) / Nw);
- PPwp= PP * (1 - 1/exp((Noov * lmt->getlogOOVpenalty()) * M_LN10 / Nw));
+ model_norm_PP = exp((-model_norm_logPr * M_LN10) / Nw);
+ model_norm_PPwp = model_norm_PP * (1 - 1/exp(Noov * norm_oovpenalty * M_LN10 / Nw));
+ model_PP = exp((-model_logPr * M_LN10) / Nw);
+ model_PPwp = model_PP * (1 - 1/exp(Noov * oovpenalty * M_LN10 / Nw));
+ norm_PP = exp((-norm_logPr * M_LN10) / Nw);
+ norm_PPwp = norm_PP * (1 - 1/exp(Noov * norm_oovpenalty * M_LN10 / Nw));
+ PP = exp((-logPr * M_LN10) / Nw);
+ PPwp = PP * (1 - 1/exp(Noov * oovpenalty * M_LN10 / Nw));
std::cout << "%% Nw=" << Nw
+ << " model_logPr=" << model_logPr
+ << " model_PP=" << model_PP
+ << " model_PPwp=" << model_PPwp
+ << " model_PP_noOOV=" << (model_PP-model_PPwp)
+ << " Noov=" << Noov
+ << " OOVrate=" << (float)Noov/Nw * 100.0 << "%";
+ std::cout << std::endl;
+ std::cout.flush();
+ std::cout << "%% Nw=" << Nw
+ << " model_norm_logPr=" << model_norm_logPr
+ << " model_norm_PP=" << model_norm_PP
+ << " model_norm_PPwp=" << model_norm_PPwp
+ << " model_norm_PP_noOOV=" << (model_norm_PP-model_norm_PPwp)
+ << " Noov=" << Noov
+ << " OOVrate=" << (float)Noov/Nw * 100.0 << "%";
+ std::cout << std::endl;
+ std::cout.flush();
+ std::cout << "%% Nw=" << Nw
+ << " logPr=" << logPr
<< " PP=" << PP
<< " PPwp=" << PPwp
+ << " PP_noOOV=" << (PP-PPwp)
+ << " Noov=" << Noov
+ << " OOVrate=" << (float)Noov/Nw * 100.0 << "%";
+ std::cout << std::endl;
+ std::cout.flush();
+ std::cout << "%% Nw=" << Nw
+ << " norm_logPr=" << norm_logPr
+ << " norm_PP=" << norm_PP
+ << " norm_PPwp=" << norm_PPwp
+ << " norm_PP_noOOV=" << (norm_PP-norm_PPwp)
+ << " Noov=" << Noov
+ << " OOVrate=" << (float)Noov/Nw * 100.0 << "%";
+ std::cout << std::endl;
+ std::cout.flush();
+
+ if (debug>1) lmt->used_caches();
+
+ if (debug>1) lmt->stat();
+
+ delete lmt;
+ return 0;
+ }
+ if (rankscore == true) {
+
+ if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) {
+ debug = (debug>4)?4:debug;
+ std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+ }
+ if (lmt->getLanguageModelType() == _IRSTLM_LMMACRO) {
+ debug = (debug>4)?4:debug;
+ std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+ }
+ if (lmt->getLanguageModelType() == _IRSTLM_LMCLASS) {
+ debug = (debug>4)?4:debug;
+ std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+ }
+ std::cerr << "Start RankBased Evaluation" << std::endl;
+ std::cerr << "OOV code: " << lmt->getDict()->oovcode() << std::endl;
+ std::cout.setf(ios::fixed);
+ std::cout.precision(2);
+
+ int Nw=0,Noov=0;
+ double avgRank;
+ int tot_rank = 0;
+
+ double bow;
+ int bol=0;
+ char *msp;
+ unsigned int statesize;
+
+ // variables for storing sentence-based Rank Statistics
+ int sent_Nw=0,sent_Noov=0;
+ double sent_avgRank;
+ int sent_tot_rank = 0;
+
+ std::fstream inptxt(testfile,std::ios::in);
+
+ // loop over input lines
+ char line[MAX_LINE];
+ while (inptxt.getline(line,MAX_LINE)) {
+
+ std::string line_str = line;
+
+ VERBOSE(1,"input_line:|" << line_str << "|" << std::endl);
+
+ //getting sentence string;
+ std::string sentence;
+ std::string context;
+
+ ((lmContextDependent*) lmt)->GetSentenceAndContext(sentence,context,line_str);
+
+ //getting apriori topic weights
+ topic_map_t apriori_topic_map;
+ ((lmContextDependent*) lmt)->getContextSimilarity()->setContextMap(apriori_topic_map,context);
+
+ // computation using std::string
+ // loop over ngrams of the sentence
+ string_vec_t word_vec;
+ split(sentence, ' ', word_vec);
+
+ //first points to the least recent term to take into account
+ //last points to the position after the most recent term to take into account
+ //last could point outside the vector of string; do NOT use word_vec.at(last)
+ size_t last, first;
+ size_t size=0;
+ size_t order = lmt->maxlevel();
+
+ for (size_t i=0; i<word_vec.size(); ++i){
+ ++size;
+ size=(size<order)?size:order;
+ last=i+1;
+
+ // reset ngram at begin of sentence
+ if (word_vec.at(i) == lmt->getDict()->BoS()) {
+ size=0;
+ continue;
+ }
+ first = last - size;
+
+ string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
+
+ if (size>=1) {
+
+ VERBOSE(2,"computing rank for first:|" << first << "| and last:|" << last << "|" << std::endl);
+
+ VERBOSE(2,"word_vec.at(i):|" << word_vec.at(i) << "| lmt->getDict()->OOV():|" << lmt->getDict()->OOV() << "|" << std::endl);
+
+ if (lmt->getDict()->encode(word_vec.at(i).c_str()) == lmt->getDict()->oovcode()) {
+ Noov++;
+ sent_Noov++;
+ }
+ Nw++;
+ sent_Nw++;
+
+ if ((Nw % 100000)==0) {
+ std::cerr << ".";
+ lmt->check_caches_levels();
+ }
+
+ VERBOSE(2,"tmp_word_vec.size:|" << tmp_word_vec.size() << "|" << std::endl);
+ VERBOSE(2,"dict.size:|" << lmt->getDict()->size() << "|" << std::endl);
+ string_vec_t::iterator it=tmp_word_vec.end()-1;
+
+ int current_pos = tmp_word_vec.size()-1;
+ std::string current_word = tmp_word_vec.at(current_pos);
+ int current_code = lmt->getDict()->encode(current_word.c_str());
+
+ int_to_double_and_int_map code_to_prob_and_code_map;
+ double_and_int_to_double_and_int_map prob_and_code_to_prob_and_rank_map;
+
+ //computation of the ranking
+ int default_rank=-1;
+ for (int i=0; i<lmt->getDict()->size(); i++)
+ {
+ //loop over all words in the LM
+ tmp_word_vec.at(current_pos) = lmt->getDict()->decode(i);
+ // *it = lmt->getDict()->decode(i);
+ IFVERBOSE(3){
+ std::cout << "tmp_word_vec i:" << i;
+ for (string_vec_t::const_iterator it2=tmp_word_vec.begin(); it2!=tmp_word_vec.end(); ++it2) {
+ std::cout << " |" << (*it2) << "|";
+ }
+ std::cout << std::endl;
+ }
+ double pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
+ if (context_model_normalization){
+ double tot_pr = ((lmContextDependent*) lmt)->total_clprob(tmp_word_vec, apriori_topic_map);
+ pr = pr - tot_pr;
+ }
+ code_to_prob_and_code_map.insert(make_pair(i,make_pair(pr,i)));
+ prob_and_code_to_prob_and_rank_map.insert(make_pair(make_pair(pr,i),make_pair(pr,default_rank)));
+ VERBOSE(3," i:" << i << " word_prob:" << pr << std::endl);
+ }
+ IFVERBOSE(3){
+
+ for (int_to_double_and_int_map::const_iterator it3=code_to_prob_and_code_map.begin(); it3!=code_to_prob_and_code_map.end();++it3)
+ {
+ VERBOSE(3,"it3: word:" << it3->first << " pr:" << (it3->second).first << " word:" << (it3->second).second << std::endl);
+ }
+
+ for (double_and_int_to_double_and_int_map::const_iterator it3=prob_and_code_to_prob_and_rank_map.begin(); it3!=prob_and_code_to_prob_and_rank_map.end();++it3)
+ {
+ VERBOSE(3,"it3: pr:" << (it3->first).first << " second:" << (it3->first).second << " norm_pr:" << (it3->second).first << " rank:" << (it3->second).second << std::endl);
+ }
+ }
+
+
+ //set rank of word according to their prob
+ //note that probs are already sorted in the map in ascending order with respect to prob (and secondarily to code)
+ int rank=0;
+ for (double_and_int_to_double_and_int_map::reverse_iterator it3=prob_and_code_to_prob_and_rank_map.rbegin(); it3!=prob_and_code_to_prob_and_rank_map.rend();++it3)
+ {
+ (it3->second).second = rank;
+ ++rank;
+ IFVERBOSE(3){
+ if (rank < 10){
+ VERBOSE(3,"it3: pr:" << (it3->first).first << " code:" << (it3->first).second << " rank:" << (it3->second).second << std::endl);
+ }
+ }
+ }
+
+ IFVERBOSE(3){
+ int i_tmp=0;
+ for (double_and_int_to_double_and_int_map::const_reverse_iterator it3=prob_and_code_to_prob_and_rank_map.rbegin(); it3!=prob_and_code_to_prob_and_rank_map.rend();++it3)
+ {
+ VERBOSE(3,"it3: pr:" << (it3->first).first << " code:" << (it3->first).second <<" rank:" << (it3->second).second << std::endl);
+ if (++i_tmp==10) break;
+ }
+ }
+ double_and_int_pair current_prob_and_code = (code_to_prob_and_code_map.find(current_code))->second;
+ int current_rank = ((prob_and_code_to_prob_and_rank_map.find(current_prob_and_code))->second).second;
+ VERBOSE(1," current_word:" << current_word << " current_code:" << current_code << " current_rank:" << current_rank << std::endl);
+
+ sent_tot_rank += current_rank;
+ tot_rank += current_rank;
+ }
+ }
+
+ if (sent_flag) {
+ VERBOSE(1," sent_tot_rank:" << sent_tot_rank << " sent_Nw:" << sent_Nw << std::endl);
+
+ sent_avgRank = sent_tot_rank / sent_Nw;
+
+ std::cout << "%% sent_Nw=" << sent_Nw
+ << " sent_avgRank=" << sent_avgRank
+ << " sent_Noov=" << sent_Noov
+ << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%";
+ std::cout << std::endl;
+ std::cout.flush();
+
+ //reset statistics for sentence based avg Ranking
+ sent_Nw = 0;
+ sent_Noov = 0;
+ sent_tot_rank = 0;
+ }
+ }
+
+ avgRank = tot_rank / Nw;
+
+ std::cout << "%% Nw=" << Nw
+ << " avgRank=" << avgRank
<< " Noov=" << Noov
- << " OOV=" << (float)Noov/Nw * 100.0 << "%";
- if (debug > 0) std::cout << " log10_Pr=" << logPr;
+ << " OOVrate=" << (float)Noov/Nw * 100.0 << "%";
std::cout << std::endl;
std::cout.flush();
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list