[irstlm] 101/126: added more functionalities to context-dependent-evaluation; disabled topicscore evaluation
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:49 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.
commit c2cbc61f8e610581ff2b41ac9c863f682c8c83c5
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Tue Sep 22 12:27:54 2015 +0200
added more functionalities to context-dependent-evaluation; disabled topicscore evaluation
---
src/context-dependent-evaluation.cpp | 528 ++++++++++++++++++-----------------
1 file changed, 268 insertions(+), 260 deletions(-)
diff --git a/src/context-dependent-evaluation.cpp b/src/context-dependent-evaluation.cpp
index 17398ba..93ad15d 100644
--- a/src/context-dependent-evaluation.cpp
+++ b/src/context-dependent-evaluation.cpp
@@ -94,6 +94,11 @@ int main(int argc, char **argv)
bool context_model_normalization = false;
char *lexiconfile=NULL;
+ bool add_lexicon_words = false;
+ bool add_lm_words = false;
+ bool add_sentence_words = false;
+ int successor_limit=100;
+
int debug = 0;
int requiredMaxlev = 1000;
int dub = 10000000;
@@ -111,7 +116,7 @@ int main(int argc, char **argv)
"r", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file",
"contextbasedscore", CMDBOOLTYPE|CMDMSG, &contextbasedscore, "computes context-dependent probabilities and pseudo-perplexity of the text from standard input",
"topicscore", CMDBOOLTYPE|CMDMSG, &topicscore, "computes the topic scores of the text from standard input",
- "rankscore", CMDBOOLTYPE|CMDMSG, &rankscore, "computes the avergae rank position of the text from standard input",
+ "rankscore", CMDBOOLTYPE|CMDMSG, &rankscore, "computes the average rank position of the text from standard input",
"debug", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
"d", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
"level", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken",
@@ -122,6 +127,9 @@ int main(int argc, char **argv)
"ngram_load_factor", CMDFLOATTYPE|CMDMSG, &ngramcache_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is false",
"context_model_active", CMDBOOLTYPE|CMDMSG, &context_model_active, "enable/disable context-dependent model (default is true)",
"context_model_normalization", CMDBOOLTYPE|CMDMSG, &context_model_normalization, "enable/disable normalization of context-dependent model (default is false)",
+ "add_lm_words", CMDBOOLTYPE|CMDMSG, &add_lm_words, "enable/disable addition of the unigram/bigrmam successors into the alternatives (default is false)",
+ "add_sentence_words", CMDBOOLTYPE|CMDMSG, &add_sentence_words, "enable/disable addition of the words of the current sentence into the alternatives (default is false)",
+ "successor_limit", CMDINTTYPE|CMDMSG, &successor_limit, "threshold to decide whether adding the unigram/bigram successors into the alternatives (default is 100)",
"Help", CMDBOOLTYPE|CMDMSG, &help, "print this help",
"h", CMDBOOLTYPE|CMDMSG, &help, "print this help",
@@ -151,19 +159,18 @@ int main(int argc, char **argv)
exit_error(IRSTLM_NO_ERROR);
}
- if (lmfile!=NULL) std::cerr << "lmfile: " << lmfile << std::endl;
- if (testfile!=NULL) std::cerr << "testfile: " << testfile << std::endl;
- if (contextbasedscore==true) std::cerr << "contextbasedscore: " << contextbasedscore << std::endl;
- if (topicscore==true) std::cerr << "topicscore: " << topicscore << std::endl;
- if (rankscore==true){
- std::cerr << "rankscore: " << rankscore << std::endl;
-
- if (lexiconfile == NULL) {
- usage();
- exit_error(IRSTLM_ERROR_DATA,"Warning: Please specify a lexicon file to read from");
- }
- std::cerr << "lexicon: " << lexiconfile << std::endl;
- }
+ if (lmfile!=NULL) VERBOSE(1, "lmfile: " << lmfile << std::endl);
+ if (testfile!=NULL) VERBOSE(1, "testfile: " << testfile << std::endl);
+ if (lexiconfile != NULL) VERBOSE(1, "lexicon: " << lexiconfile << std::endl);
+
+ VERBOSE(1, "contextbasedscore: " << contextbasedscore << std::endl);
+ VERBOSE(1, "topicscore: " << topicscore << std::endl);
+ VERBOSE(1, "rankscore: " << rankscore << std::endl);
+
+ VERBOSE(1,"add_lexicon_words: " << add_lexicon_words << std::endl);
+ VERBOSE(1,"add_lm_words: " << add_lm_words << " successor_limit:" << successor_limit<< std::endl);
+ VERBOSE(1,"add_sentence_words: " << add_sentence_words << std::endl);
+
std::cerr << "loading up to the LM level " << requiredMaxlev << " (if any)" << std::endl;
std::cerr << "dub: " << dub<< std::endl;
@@ -186,15 +193,18 @@ int main(int argc, char **argv)
//read lexicon form file
std::multimap< std::string, std::string > lexicon;
- if (lexiconfile == NULL) {
+ if (lexiconfile != NULL) {
fstream inp(lexiconfile,ios::in|ios::binary);
std::string w1, w2;
while (inp >> w1 >> w2){
lexicon.insert(make_pair(w1,w2));
}
+ add_lexicon_words=true;
}
if (topicscore == true) {
+ VERBOSE(0, "NOT SUPPORTED" << std::endl);
+ return 0;
if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) {
debug = (debug>4)?4:debug;
@@ -328,10 +338,11 @@ int main(int argc, char **argv)
std::cout.precision(2);
int Nw=0,Noov=0;
- double logPr=0,PP=0,PPwp=0,Pr;
+ double logPr=0,PP=0,PPwp=0,current_Pr;
double norm_logPr=0,norm_PP=0,norm_PPwp=0,norm_Pr;
double model_logPr=0,model_PP=0,model_PPwp=0,model_Pr;
double model_norm_logPr=0,model_norm_PP=0,model_norm_PPwp=0,model_norm_Pr;
+ int current_dict_alternatives = 0;
double bow;
int bol=0;
@@ -344,6 +355,7 @@ int main(int argc, char **argv)
double sent_norm_logPr=0,sent_norm_PP=0,sent_norm_PPwp=0;
double sent_model_logPr=0,sent_model_PP=0,sent_model_PPwp=0;
double sent_model_norm_logPr=0,sent_model_norm_PP=0,sent_model_norm_PPwp=0;
+ int sent_current_dict_alternatives = 0;
double oovpenalty = lmt->getlogOOVpenalty();
double norm_oovpenalty = oovpenalty;
@@ -375,14 +387,18 @@ int main(int argc, char **argv)
string_vec_t word_vec;
split(sentence, ' ', word_vec);
+ //add the BoS symbol at the beginning
+ string_vec_t::iterator it = word_vec.insert ( word_vec.begin() , lmt->getDict()->BoS() );
+
//first points to the last recent term to take into account
//last points to the position after the most recent term to take into account
//last could point outside the vector of string; do NOT use word_vec.at(last)
size_t last, first;
- size_t size=0;
size_t order = lmt->maxlevel();
- for (size_t i=0; i< word_vec.size(); ++i){
+ //start the computation from the second word because the first is the BoS symbol,but including BoS in the ngrams
+ size_t size=1;
+ for (size_t i=1; i< word_vec.size(); ++i){
++size;
size=(size<order)?size:order;
last=i+1;
@@ -416,7 +432,7 @@ int main(int argc, char **argv)
VERBOSE(2,"tmp_word_vec.size:|" << tmp_word_vec.size() << "|" << std::endl);
VERBOSE(2,"dict.size:|" << lmt->getDict()->size() << "|" << std::endl);
- double current_pr = lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
+ current_Pr = lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
/*
double tot_pr = 0.0;
if (context_model_normalization){
@@ -427,16 +443,7 @@ int main(int argc, char **argv)
// string_vec_t::iterator it=tmp_word_vec.end()-1;
int current_pos = tmp_word_vec.size()-1;
std::string current_word = tmp_word_vec.at(current_pos);
-
- int_to_double_and_int_map code_to_prob_and_code_map;
- double_and_int_to_double_and_int_map prob_and_code_to_prob_and_rank_map;
-
- //computation of the oracle probability. i.e. the maximum prob
- double best_pr = -1000000.0;
- int best_code = lmt->getlogOOVpenalty();
-
-
-
+
/*
//loop over all words in the LM
dictionary* current_dict = lmt->getDict();
@@ -447,19 +454,110 @@ int main(int argc, char **argv)
dictionary* current_dict = new dictionary((char *)NULL,1000000);
current_dict->incflag(1);
- std::pair <std::multimap< std::string, std::string>::iterator, std::multimap< std::string, std::string>::iterator> ret = lexicon.equal_range(current_word);
- for (std::multimap<std::string, std::string>::const_iterator it=ret.first; it!=ret.second; ++it)
- {
- if (current_word != (it->second).c_str()){
- //exclude the current word from the selected alternative words
+ current_dict->encode(current_word.c_str());
+
+ VERBOSE(2,"after current word current_dict->size:" << current_dict->size() << std::endl);
+
+ //add words from the lexicon
+ if (add_lexicon_words){
+ std::pair <std::multimap< std::string, std::string>::iterator, std::multimap< std::string, std::string>::iterator> ret = lexicon.equal_range(current_word);
+ for (std::multimap<std::string, std::string>::const_iterator it=ret.first; it!=ret.second; ++it)
+ {
current_dict->encode((it->second).c_str());
+ /*
+ //exclude the current word from the selected alternative words
+ if (current_word != (it->second).c_str()){
+ current_dict->encode((it->second).c_str());
+ }
+ */
+ }
+ }
+ VERBOSE(2,"after add_lexicon_words current_dict->size:" << current_dict->size() << std::endl);
+
+ if (add_lm_words){
+ bool succ_flag=false;
+ ngram hg(lmt->getDict());
+
+ if (size==1) {
+ hg.pushw(lmt->getDict()->BoS());
+ hg.pushc(0);
+
+ lmt->get(hg,hg.size,hg.size-1);
+ VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+ if (hg.succ < successor_limit){
+ succ_flag=true;
+ }else{
+ VERBOSE(3,"successors are not added into the alternatives because they are too many" << std::endl);
+ }
+ }else if (size>=2) {
+ hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-2));
+ hg.pushc(0);
+
+ lmt->get(hg,hg.size,hg.size-1);
+ VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+ if (hg.succ < successor_limit){
+ succ_flag=true;
+ }else{
+ VERBOSE(3,"successors are not added into the alternatives because they are too many" << std::endl);
+ }
+
+ if (!succ_flag && size>=3){
+ hg.size=0;
+ hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-3));
+ hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-2));
+ hg.pushc(0);
+
+ lmt->get(hg,hg.size,hg.size-1);
+ VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+
+ if (hg.succ < successor_limit){
+ succ_flag=true;
+ }else{
+ VERBOSE(3,"successors are not added into the alternatives because they are too many" << std::endl);
+ }
+ }
+ }
+
+
+ if (succ_flag){
+ ngram ng=hg;
+ lmt->succscan(hg,ng,LMT_INIT,ng.size);
+ while(lmt->succscan(hg,ng,LMT_CONT,ng.size)) {
+ current_dict->encode(ng.dict->decode(*ng.wordp(1)));
+ }
+ }
+
+ }
+
+ VERBOSE(2,"after add_lm_words current_dict->size:" << current_dict->size() << std::endl);
+
+ if (add_sentence_words){
+ for (string_vec_t::const_iterator it=word_vec.begin(); it!=word_vec.end(); ++it)
+ {
+ current_dict->encode(it->c_str());
}
}
current_dict->incflag(0);
+ VERBOSE(2,"after add_sentence_words current_dict->size:" << current_dict->size() << std::endl);
- double tot_pr = 0.0;
- for (int j=0; j<current_dict->size(); ++j)
- {
+ sent_current_dict_alternatives += current_dict->size();
+ current_dict_alternatives += current_dict->size();
+
+ VERBOSE(2,"current_dict->size:" << current_dict->size() << std::endl);
+ for (int h=0;h<current_dict->size();++h){
+ VERBOSE(2,"h:" << h << " w:|" << current_dict->decode(h) << "|" << std::endl);
+ }
+
+ //the first word in current_dict is always the current_word; hence we can skip it during the scan
+ //variables for the computation of the oracle probability, i.e. the maximum prob
+ //double best_pr = -1000000.0;
+ //int best_code = lmt->getlogOOVpenalty();
+ double best_pr = current_Pr;
+ int best_code = 0;
+ //variables for the computation of the mass probability related to the current word, i.e. the sum of the probs for all words associated with the current word
+ double current_tot_pr = pow(10.0,current_Pr);
+// for (int j=0; j<current_dict->size(); ++j){
+ for (int j=1; j<current_dict->size(); ++j){
//loop over all words in the LM
tmp_word_vec.at(current_pos) = current_dict->decode(j);
IFVERBOSE(3){
@@ -471,34 +569,25 @@ int main(int argc, char **argv)
}
double pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
+ current_tot_pr += pow(10.0,pr);
if (best_pr < pr){
best_pr = pr;
best_code = j;
- VERBOSE(3,"current_best:" << best_code << " current_word:|" << lmt->getDict()->decode(best_code) << "| best_prob:" << pow(10.0,best_pr) << " norm_best_prob:" << pow(10.0,best_pr - tot_pr) << std::endl);
+ VERBOSE(3,"current_best:" << best_code << " current_word:|" << current_dict->decode(best_code) << "| best_prob:" << pow(10.0,best_pr) << " norm_best_prob:" << pow(10.0,best_pr - current_tot_pr) << std::endl);
}
- tot_pr += pow(10.0,best_pr);
+ VERBOSE(3,"current_Pr:" << current_Pr << " current_word:" << current_word << "| ===> code:" << j << " word:|" << tmp_word_vec.at(current_pos) << "| pr:" << pr << " versus best_code:" << best_code << " best_word:|" << current_dict->decode(best_code) << "| best_pr:" << best_pr << std::endl);
}
+ current_tot_pr=log10(current_tot_pr);
+
model_Pr = best_pr;
- VERBOSE(2,"model_best_code:" << best_code << " model_best_word:|" << lmt->getDict()->decode(best_code) << "| model_best_prob:" << pow(10.0,best_pr) << " tot_pr:" << tot_pr << std::endl);
- IFVERBOSE(3){
- for (int_to_double_and_int_map::const_iterator it3=code_to_prob_and_code_map.begin(); it3!=code_to_prob_and_code_map.end(); ++it3)
- {
- VERBOSE(3,"it3: word:" << it3->first << " pr:" << (it3->second).first << " word:" << (it3->second).second << std::endl);
- }
-
- for (double_and_int_to_double_and_int_map::const_iterator it3=prob_and_code_to_prob_and_rank_map.begin(); it3!=prob_and_code_to_prob_and_rank_map.end(); ++it3)
- {
- VERBOSE(3,"it3: pr:" << (it3->first).first << " second:" << (it3->first).second << " norm_pr:" << (it3->second).first << " rank:" << (it3->second).second << std::endl);
- }
- }
+ VERBOSE(2,"model_best_code:" << best_code << " model_best_word:|" << current_dict->decode(best_code) << "| model_best_prob:" << pow(10.0,best_pr) << " current_tot_pr:" << current_tot_pr << std::endl);
norm_oovpenalty = oovpenalty;
- VERBOSE(2,"tot_pr:" << tot_pr << " oovpenalty:" << oovpenalty << " norm_oovpenalty:" << norm_oovpenalty << std::endl);
-
+ VERBOSE(2,"current_tot_pr:" << current_tot_pr << " oovpenalty:" << oovpenalty << " norm_oovpenalty:" << norm_oovpenalty << std::endl);
- norm_Pr = current_pr - tot_pr;
- model_norm_Pr = model_Pr - tot_pr;
- VERBOSE(1,"Pr:" << Pr << " norm_Pr:" << norm_Pr << " model_Pr:" << model_Pr << " model_norm_Pr:" << model_norm_Pr << " current_code:" << lmt->getDict()->encode(word_vec.at(i).c_str()) << " current_word:|" << word_vec.at(i) << "| model_best_code:" << best_code << " model_best_word:|" << lmt->getDict()->decode(best_code) << "|" << std::endl);
+ norm_Pr = current_Pr - current_tot_pr;
+ model_norm_Pr = model_Pr - current_tot_pr;
+ VERBOSE(1,"current_Pr:" << current_Pr << " norm_Pr:" << norm_Pr << " model_Pr:" << model_Pr << " model_norm_Pr:" << model_norm_Pr << " current_code:" << lmt->getDict()->encode(word_vec.at(i).c_str()) << " current_word:|" << word_vec.at(i) << "| model_best_code:" << best_code << " model_best_word:|" << current_dict->decode(best_code) << "|" << std::endl);
model_norm_logPr+=model_norm_Pr;
sent_model_norm_logPr+=model_norm_Pr;
@@ -509,12 +598,10 @@ int main(int argc, char **argv)
model_logPr+=model_Pr;
sent_model_logPr+=model_Pr;
- logPr+=current_pr;
- sent_logPr+=Pr;
+ logPr+=current_Pr;
+ sent_logPr+=current_Pr;
VERBOSE(2,"sent_model_logPr:" << sent_model_logPr << " model_logPr:" << model_logPr << std::endl);
- VERBOSE(2,"sent_logPr:" << sent_logPr << " logPr:" << logPr << std::endl);
-
-
+ VERBOSE(2,"sent_logPr:" << sent_logPr << " current_Pr:" << current_Pr << std::endl);
}
}
@@ -531,14 +618,19 @@ int main(int argc, char **argv)
<< " sent_norm_PPwp=" << sent_norm_PPwp
<< " sent_norm_PP_noOOV=" << (sent_norm_PP-sent_norm_PPwp)
<< " sent_Noov=" << sent_Noov
- << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
+ << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%"
+ << " sent_avg_alternatives=" << (float) sent_current_dict_alternatives/sent_Nw
+ << std::endl;
+
std::cout << "%% sent_Nw=" << sent_Nw
<< " sent_model_norm_logPr=" << sent_model_norm_logPr
<< " sent_model_norm_PP=" << sent_model_norm_PP
<< " sent_model_norm_PPwp=" << sent_model_norm_PPwp
<< " sent_model_norm_PP_noOOV=" << (sent_model_norm_PP-sent_model_norm_PPwp)
<< " sent_Noov=" << sent_Noov
- << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
+ << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%"
+ << " sent_avg_alternatives=" << (float) sent_current_dict_alternatives/sent_Nw
+ << std::endl;
sent_model_PP = exp((-sent_model_logPr * M_LN10) / sent_Nw);
sent_model_PPwp = sent_model_PP * (1 - 1/exp(sent_Noov * oovpenalty * M_LN10 / sent_Nw));
@@ -550,7 +642,9 @@ int main(int argc, char **argv)
<< " sent_PPwp=" << sent_PPwp
<< " sent_PP_noOOV=" << (sent_PP-sent_PPwp)
<< " sent_Noov=" << sent_Noov
- << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
+ << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%"
+ << " sent_avg_alternatives=" << (float) sent_current_dict_alternatives/sent_Nw
+ << std::endl;
std::cout << "%% sent_Nw=" << sent_Nw
<< " sent_model_logPr=" << sent_model_logPr
@@ -558,7 +652,9 @@ int main(int argc, char **argv)
<< " sent_model_PPwp=" << sent_model_PPwp
<< " sent_model_PP_noOOV=" << (sent_model_PP-sent_model_PPwp)
<< " sent_Noov=" << sent_Noov
- << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
+ << " sent_OOVrate=" << (float)sent_Noov/sent_Nw * 100.0 << "%"
+ << " sent_avg_alternatives=" << (float) sent_current_dict_alternatives/sent_Nw
+ << std::endl;
std::cout.flush();
//reset statistics for sentence based Perplexity
sent_Noov = 0;
@@ -567,6 +663,7 @@ int main(int argc, char **argv)
sent_model_logPr = 0.0;
sent_norm_logPr = 0.0;
sent_logPr = 0.0;
+ sent_current_dict_alternatives = 0;
}
apriori_topic_map.clear();
@@ -588,35 +685,42 @@ int main(int argc, char **argv)
<< " model_PPwp=" << model_PPwp
<< " model_PP_noOOV=" << (model_PP-model_PPwp)
<< " Noov=" << Noov
- << " OOVrate=" << (float)Noov/Nw * 100.0 << "%";
- std::cout << std::endl;
+ << " OOVrate=" << (float)Noov/Nw * 100.0 << "%"
+ << " avg_alternatives=" << (float) current_dict_alternatives/Nw
+ << std::endl;
std::cout.flush();
+
std::cout << "%% Nw=" << Nw
<< " model_norm_logPr=" << model_norm_logPr
<< " model_norm_PP=" << model_norm_PP
<< " model_norm_PPwp=" << model_norm_PPwp
<< " model_norm_PP_noOOV=" << (model_norm_PP-model_norm_PPwp)
<< " Noov=" << Noov
- << " OOVrate=" << (float)Noov/Nw * 100.0 << "%";
- std::cout << std::endl;
+ << " OOVrate=" << (float)Noov/Nw * 100.0 << "%"
+ << " avg_alternatives=" << (float) current_dict_alternatives/Nw
+ << std::endl;
std::cout.flush();
+
std::cout << "%% Nw=" << Nw
<< " logPr=" << logPr
<< " PP=" << PP
<< " PPwp=" << PPwp
<< " PP_noOOV=" << (PP-PPwp)
<< " Noov=" << Noov
- << " OOVrate=" << (float)Noov/Nw * 100.0 << "%";
- std::cout << std::endl;
+ << " OOVrate=" << (float)Noov/Nw * 100.0 << "%"
+ << " avg_alternatives=" << (float) current_dict_alternatives/Nw
+ << std::endl;
std::cout.flush();
+
std::cout << "%% Nw=" << Nw
<< " norm_logPr=" << norm_logPr
<< " norm_PP=" << norm_PP
<< " norm_PPwp=" << norm_PPwp
<< " norm_PP_noOOV=" << (norm_PP-norm_PPwp)
<< " Noov=" << Noov
- << " OOVrate=" << (float)Noov/Nw * 100.0 << "%";
- std::cout << std::endl;
+ << " OOVrate=" << (float)Noov/Nw * 100.0 << "%"
+ << " avg_alternatives=" << (float) current_dict_alternatives/Nw
+ << std::endl;
std::cout.flush();
if (debug>1) lmt->used_caches();
@@ -650,53 +754,6 @@ int main(int argc, char **argv)
int tot_rank = 0;
int max_rank = 0;
- /*
- //collect total occurrences of current word in the following intervals
- // [firs position], [<=1%], [<=2%], [<=5%], [<=10%]
- int Rank_histogram[5];
- int Rank_limit[5];
- */
-
- /*
- int max_rank = lmt->getDict()->size();
- double ratio = 0.001;
-
- double Rank_perc[5];
- Rank_perc[0] = 0; Rank_limit[0] = 1;
- Rank_perc[1] = 1 * ratio; Rank_limit[1] = Rank_perc[1] * max_rank;
- Rank_perc[2] = 2 * ratio; Rank_limit[2] = Rank_perc[2] * max_rank;
- Rank_perc[3] = 5 * ratio; Rank_limit[3] = Rank_perc[3] * max_rank;
- Rank_perc[4] = 10 * ratio; Rank_limit[4] = Rank_perc[4] * max_rank;
-
- VERBOSE(1, "Rank thresholds: Rank_[bst]=1" <<
- " Rank_[1]=" << Rank_perc[1]*100 <<"%<=" << Rank_limit[1] <<
- " Rank_[2]=" << Rank_perc[2]*100 <<"%<=" << Rank_limit[2] <<
- " Rank_[3]=" << Rank_perc[3]*100 <<"%<=" << Rank_limit[3] <<
- " Rank_[4]=" << Rank_perc[4]*100 <<"%<=" << Rank_limit[4] <<
- std::endl);
- */
-
- /*
- Rank_limit[0] = 1;
- Rank_limit[1] = 10;
- Rank_limit[2] = 20;
- Rank_limit[3] = 50;
- Rank_limit[4] = 100;
-
- VERBOSE(1, "Rank thresholds: Rank_[bst]=1" <<
- " Rank_[1]=" << Rank_limit[1] <<
- " Rank_[2]=" << Rank_limit[2] <<
- " Rank_[3]=" << Rank_limit[3] <<
- " Rank_[4]=" << Rank_limit[4] <<
- std::endl);
-
- Rank_histogram[0] = 0;
- Rank_histogram[1] = 0;
- Rank_histogram[2] = 0;
- Rank_histogram[3] = 0;
- Rank_histogram[4] = 0;
- */
-
double bow;
int bol=0;
char *msp;
@@ -781,96 +838,115 @@ int main(int argc, char **argv)
int current_pos = tmp_word_vec.size()-1;
std::string current_word = tmp_word_vec.at(current_pos);
- double current_pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
- double tot_pr = 0.0;
-
- /*
- if (context_model_normalization){
- tot_pr = ((lmContextDependent*) lmt)->total_clprob(tmp_word_vec, apriori_topic_map);
- current_pr = current_pr - tot_pr;
- }
-*/
+ double current_pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
/*
//loop over all words in the LM
dictionary* current_dict = lmt->getDict();
- */
+ */
//loop over a set of selected alternative words
//populate the dictionary with all words associated with the current word
dictionary* current_dict = new dictionary((char *)NULL,1000000);
current_dict->incflag(1);
- std::pair <std::multimap< std::string, std::string>::iterator, std::multimap< std::string, std::string>::iterator> ret = lexicon.equal_range(current_word);
- for (std::multimap<std::string, std::string>::const_iterator it=ret.first; it!=ret.second; ++it)
- {
- if (current_word != (it->second).c_str()){
- //exclude the current word from the selected alternative words
+ current_dict->encode(current_word.c_str());
+
+ VERBOSE(2,"after current word current_dict->size:" << current_dict->size() << std::endl);
+
+ //add words from the lexicon
+ if (add_lexicon_words){
+ std::pair <std::multimap< std::string, std::string>::iterator, std::multimap< std::string, std::string>::iterator> ret = lexicon.equal_range(current_word);
+ for (std::multimap<std::string, std::string>::const_iterator it=ret.first; it!=ret.second; ++it)
+ {
current_dict->encode((it->second).c_str());
+ /*
+ //exclude the current word from the selected alternative words
+ if (current_word != (it->second).c_str()){
+ current_dict->encode((it->second).c_str());
+ }
+ */
}
}
- current_dict->incflag(0);
+ VERBOSE(2,"after add_lexicon_words current_dict->size:" << current_dict->size() << std::endl);
- max_rank = current_dict->size()+1; //we add 1, to count for the current word as well, which is not included in the selected alternative words
- int current_rank = 1;
- for (int i=0; i<current_dict->size(); i++)
- {
- tmp_word_vec.at(current_pos) = current_dict->decode(i);
- IFVERBOSE(3){
- std::cout << "tmp_word_vec i:" << i;
- for (string_vec_t::const_iterator it2=tmp_word_vec.begin(); it2!=tmp_word_vec.end(); ++it2) {
- std::cout << " |" << (*it2) << "|";
+ if (add_lm_words){
+ bool succ_flag=false;
+ ngram hg(lmt->getDict());
+
+ if (size==1) {
+ hg.pushw(lmt->getDict()->BoS());
+ hg.pushc(0);
+
+ lmt->get(hg,hg.size,hg.size-1);
+ VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+ if (hg.succ < 100){
+ succ_flag=true;
+ }
+ }else if (size==2) {
+ hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-2));
+ hg.pushc(0);
+
+ lmt->get(hg,hg.size,hg.size-1);
+ VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+ if (hg.succ < 100){
+ succ_flag=true;
+ }
+ }else if(size>=3){
+ hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-2));
+ hg.pushc(0);
+
+ lmt->get(hg,hg.size,hg.size-1);
+ VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+
+
+ if (hg.succ < 100){
+ succ_flag=true;
+ }
+
+ if (!succ_flag){
+ hg.size=0;
+ hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-3));
+ hg.pushw(tmp_word_vec.at(tmp_word_vec.size()-2));
+ hg.pushc(0);
+
+ lmt->get(hg,hg.size,hg.size-1);
+ VERBOSE(1,"add_lm_words hg:|" << hg << "| hg.size:" << hg.size << " hg.succ:" << hg.succ << std::endl);
+
+ if (hg.succ < 100){
+ succ_flag=true;
+ }
}
- std::cout << std::endl;
- }
- double pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
- if (context_model_normalization){
- pr = pr - tot_pr;
- }
- if (pr > current_pr){
- ++current_rank;
}
- VERBOSE(3," current_pos:" << current_pos << " word:|" << tmp_word_vec.at(current_pos) << "| current_pr:" << current_pr << " pr:" << pr << " current_rank:" << current_rank <<std::endl);
- }
- delete current_dict;
- /* loop over the whole dictionary
- int current_rank = 1;
- //computation of the ranking of the current word (among all LM words)
- for (int i=0; i<lmt->getDict()->size(); i++)
- {
- //loop over all words in the LM
- tmp_word_vec.at(current_pos) = lmt->getDict()->decode(i);
- // *it = lmt->getDict()->decode(i);
- IFVERBOSE(3){
- std::cout << "tmp_word_vec i:" << i;
- for (string_vec_t::const_iterator it2=tmp_word_vec.begin(); it2!=tmp_word_vec.end(); ++it2) {
- std::cout << " |" << (*it2) << "|";
+
+ if (succ_flag){
+ ngram ng=hg;
+ lmt->succscan(hg,ng,LMT_INIT,ng.size);
+ while(lmt->succscan(hg,ng,LMT_CONT,ng.size)) {
+ current_dict->encode(ng.dict->decode(*ng.wordp(1)));
}
- std::cout << std::endl;
- }
- double pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
- if (context_model_normalization){
- pr = pr - tot_pr;
- }
- if (pr > current_pr){
- ++current_rank;
}
+
}
- */
+ VERBOSE(2,"after add_lm_words current_dict->size:" << current_dict->size() << std::endl);
- /*
- int_to_double_and_int_map code_to_prob_and_code_map;
- double_and_int_to_double_and_int_map prob_and_code_to_prob_and_rank_map;
+ if (add_sentence_words){
+ for (string_vec_t::const_iterator it=word_vec.begin(); it!=word_vec.end(); ++it)
+ {
+ current_dict->encode(it->c_str());
+ }
+ }
+ current_dict->incflag(0);
- //computation of the ranking
- int default_rank=-1;
- for (int i=0; i<lmt->getDict()->size(); i++)
+ VERBOSE(2,"after add_sentence_words current_dict->size:" << current_dict->size() << std::endl);
+
+ max_rank = current_dict->size(); //the current word is included in the selected alternative words
+ int current_rank = 1;
+ for (int i=0; i<current_dict->size(); i++)
{
- //loop over all words in the LM
- tmp_word_vec.at(current_pos) = lmt->getDict()->decode(i);
- // *it = lmt->getDict()->decode(i);
+ tmp_word_vec.at(current_pos) = current_dict->decode(i);
IFVERBOSE(3){
std::cout << "tmp_word_vec i:" << i;
for (string_vec_t::const_iterator it2=tmp_word_vec.begin(); it2!=tmp_word_vec.end(); ++it2) {
@@ -879,75 +955,18 @@ int main(int argc, char **argv)
std::cout << std::endl;
}
double pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
- if (context_model_normalization){
- pr = pr - tot_pr;
- }
- code_to_prob_and_code_map.insert(make_pair(i,make_pair(pr,i)));
- prob_and_code_to_prob_and_rank_map.insert(make_pair(make_pair(pr,i),make_pair(pr,default_rank)));
- VERBOSE(3," i:" << i << " word_prob:" << pr << std::endl);
- }
- IFVERBOSE(3){
- for (int_to_double_and_int_map::const_iterator it3=code_to_prob_and_code_map.begin(); it3!=code_to_prob_and_code_map.end();++it3)
- {
- VERBOSE(3,"it3: word:" << it3->first << " pr:" << (it3->second).first << " word:" << (it3->second).second << std::endl);
+ if (pr > current_pr){
+ ++current_rank;
}
- for (double_and_int_to_double_and_int_map::const_iterator it3=prob_and_code_to_prob_and_rank_map.begin(); it3!=prob_and_code_to_prob_and_rank_map.end();++it3)
- {
- VERBOSE(3,"it3: pr:" << (it3->first).first << " second:" << (it3->first).second << " norm_pr:" << (it3->second).first << " rank:" << (it3->second).second << std::endl);
- }
- }
- //set rank of word according to their prob
- //note that prob are already sorted in the map in ascending order wrt to prob (and secondarily code)
- int rank=0;
- for (double_and_int_to_double_and_int_map::reverse_iterator it3=prob_and_code_to_prob_and_rank_map.rbegin(); it3!=prob_and_code_to_prob_and_rank_map.rend();++it3)
- {
- (it3->second).second = rank;
- ++rank;
- IFVERBOSE(3){
- if (rank < 10){
- VERBOSE(3,"it3: pr:" << (it3->first).first << " code:" << (it3->first).second << " rank:" << (it3->second).second << std::endl);
- }
- }
- }
-
-
- IFVERBOSE(3){
- int i_tmp=0;
- for (double_and_int_to_double_and_int_map::const_reverse_iterator it3=prob_and_code_to_prob_and_rank_map.rbegin(); it3!=prob_and_code_to_prob_and_rank_map.rend();++it3)
- {
- VERBOSE(3,"it3: pr:" << (it3->first).first << " code:" << (it3->first).second <<" rank:" << (it3->second).second << std::endl);
- if (++i_tmp==10) break;
- }
+ VERBOSE(3," current_pos:" << current_pos << " word:|" << tmp_word_vec.at(current_pos) << "| current_pr:" << current_pr << " pr:" << pr << " current_rank:" << current_rank <<std::endl);
}
- double_and_int_pair current_prob_and_code = (code_to_prob_and_code_map.find(current_code))->second;
- int current_rank = ((prob_and_code_to_prob_and_rank_map.find(current_prob_and_code))->second).second;
- VERBOSE(1," current_word:" << current_word << " current_code:" << current_code << " current_rank:" << current_rank << std::endl);
-
- */
+ delete current_dict;
sent_tot_rank += current_rank;
tot_rank += current_rank;
- /*
- if (current_rank <= Rank_limit[0]){
- ++Rank_histogram[0];
- VERBOSE(1,"HERE 0 current_rank:" << current_rank << " Rank_limit[0]:" << Rank_limit[0] << std::endl);
- }
- if (current_rank <= Rank_limit[1]){
- ++Rank_histogram[1]; ++Rank_histogram[2]; ++Rank_histogram[3]; ++Rank_histogram[4];
- VERBOSE(1,"HERE 1 current_rank:" << current_rank << " Rank_limit[1]:" << Rank_limit[1] << std::endl);
- }else if (current_rank <= Rank_limit[2]){
- ++Rank_histogram[2]; ++Rank_histogram[3]; ++Rank_histogram[4];
- VERBOSE(1,"HERE 2 current_rank:" << current_rank << " Rank_limit[2]:" << Rank_limit[2] << std::endl);
- }else if (current_rank <= Rank_limit[3]){
- ++Rank_histogram[3]; ++Rank_histogram[4];
- VERBOSE(1,"HERE 3 current_rank:" << current_rank << " Rank_limit[3]:" << Rank_limit[3] << std::endl);
- }else if (current_rank <= Rank_limit[4]){
- ++Rank_histogram[4];
- VERBOSE(1,"HERE 4 current_rank:" << current_rank << " Rank_limit[4]:" << Rank_limit[4] << std::endl);
- }
- */
+
if (debug>1){
//output format:
//word_pos:current_rank:max_rank
@@ -987,18 +1006,7 @@ int main(int argc, char **argv)
<< " avgRank=" << avgRank
<< " Noov=" << Noov
<< " OOVrate=" << (float)Noov/Nw * 100.0 << "%";
- /*
- std::cout << " Rank_[bst]=" << Rank_histogram[0];
- std::cout << " Rank_[1]=" << Rank_histogram[1];
- std::cout << " Rank_[2]=" << Rank_histogram[2];
- std::cout << " Rank_[3]=" << Rank_histogram[3];
- std::cout << " Rank_[4]=" << Rank_histogram[4];
- std::cout << " Rank_[bst]=" << (float)Rank_histogram[0]/Nw * 100.0 << "%";
- std::cout << " Rank_[1]=" << (float)Rank_histogram[1]/Nw * 100.0 << "%";
- std::cout << " Rank_[2]=" << (float)Rank_histogram[2]/Nw * 100.0 << "%";
- std::cout << " Rank_[3]=" << (float)Rank_histogram[3]/Nw * 100.0 << "%";
- std::cout << " Rank_[4]=" << (float)Rank_histogram[4]/Nw * 100.0 << "%";
- */
+
std::cout << std::endl;
std::cout.flush();
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list