[irstlm] 31/126: code cleanup; debugging outputs
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:42 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.
commit e604c8b2630e126118628886a90a06639dc196f5
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Mon Jul 27 07:56:29 2015 +0200
code cleanup; debugging outputs
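This commit removes the unused ngram-based computation path from context-dependent-evaluation, turns the topic dictionary and topic map of ContextSimilarity into plain members, loads the topic list from a separate dictionary file, adds VERBOSE debugging output throughout, and drops the separate n-gram LM weight from lmContextDependent (the combined score is now lm_logprob + m_similaritymodel_weight * similarity_score).
The expected layout of the LMCONTEXTDEPENDENT configuration file changes accordingly. A minimal sketch of the new three-line layout, using placeholder filenames (ngram-lm.arpa, topics.dict and topics.lm are illustrative only), is:
    LMCONTEXTDEPENDENT
    ngram-lm.arpa
    0.5 topics.dict topics.lm
i.e. the header, then the filename of the n-gram LM on its own line, then a line with the topic-model weight, the topic dictionary file, and the topic model file.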
---
src/context-dependent-evaluation.cpp | 89 +++---------------------------------
src/context-similarity.cpp | 61 ++++++++++++++++++++----
src/context-similarity.h | 6 +--
src/lmContainer.cpp | 4 +-
src/lmContainer.h | 2 +
src/lmContextDependent.cpp | 42 +++++++++--------
src/lmContextDependent.h | 2 +
7 files changed, 90 insertions(+), 116 deletions(-)
diff --git a/src/context-dependent-evaluation.cpp b/src/context-dependent-evaluation.cpp
index 8923b3d..48a138f 100644
--- a/src/context-dependent-evaluation.cpp
+++ b/src/context-dependent-evaluation.cpp
@@ -171,12 +171,7 @@ int main(int argc, char **argv)
double sent_logPr=0,sent_PP=0,sent_PPwp=0;
- ngram ng(lmt->getDict());
- ng.dict->incflag(1);
- int bos=ng.dict->encode(ng.dict->BoS());
- int eos=ng.dict->encode(ng.dict->EoS());
- ng.dict->incflag(0);
-
+// ngram ng(lmt->getDict());
const std::string context_delimiter="___CONTEXT___";
const char topic_map_delimiter='=';
@@ -212,7 +207,8 @@ int main(int argc, char **argv)
}else{
sentence = line_str;
context = "";
- }
+ }
+
VERBOSE(0,"context:|" << context << "|" << std::endl);
VERBOSE(0,"line_str:|" << line_str << "|" << std::endl);
//getting topic weights
@@ -226,8 +222,6 @@ int main(int argc, char **argv)
}
topic_weight_vec.clear();
- lmt->dictionary_incflag(1);
-
if(1){
// computation using std::string
@@ -238,6 +232,8 @@ int main(int argc, char **argv)
size_t last, first;
size_t size=0;
size_t order = lmt->maxlevel();
+
+ VERBOSE(0,"w_vec.size():|" << w_vec.size() << "|" << std::endl);
for (size_t i=0; i<w_vec.size(); ++i){
++size;
size=(size<order)?size:order;
@@ -252,11 +248,8 @@ int main(int argc, char **argv)
VERBOSE(0,"prob for first:|" << first << "| and last:|" << last << "| size:" << size << std::endl);
string_vec_t tmp_w_vec(w_vec.begin() + first, w_vec.begin() +last);
- for (string_vec_t::iterator it=tmp_w_vec.begin(); it!=tmp_w_vec.end(); ++it){
-
- VERBOSE(0,"*it:|" << *it << "|" << std::endl);
- }
- if (ng.size>=1) {
+ if (size>=1) {
+ VERBOSE(0,"computing prob for first:|" << first << "| and last:|" << last << "| is Pr=" << Pr << std::endl);
Pr=lmt->clprob(tmp_w_vec, topic_weight_map, &bow, &bol, &msp, &statesize);
VERBOSE(0,"prob for first:|" << first << "| and last:|" << last << "| is Pr=" << Pr << std::endl);
logPr+=Pr;
@@ -270,74 +263,6 @@ int main(int argc, char **argv)
}
}
- if(0){
- // computation using ngram object
- // loop over ngrams of the sentence
- std::istringstream ss(sentence); // Insert the string into a stream
- while (ss >> ng){
- //computing context-based prob for each ngram of the sentence
- VERBOSE(0,"working on ng:|" << ng << "| ng.size:" << ng.size << std::endl);
-
- if (ng.size>lmt->maxlevel()) ng.size=lmt->maxlevel();
-
- // reset ngram at begin of sentence
- if (*ng.wordp(1)==bos) {
- ng.size=1;
- continue;
- }
-
- if (ng.size>=1) {
- Pr=lmt->clprob(ng,topic_weight_map, &bow, &bol, &msp, &statesize);
- VERBOSE(0,"prob for ng:|" << ng << "| is Pr=" << Pr << std::endl);
- logPr+=Pr;
- sent_logPr+=Pr;
- VERBOSE(0,"sent_logPr:|" << sent_logPr << " logPr:|" << logPr << std::endl);
-
- if (debug==1) {
- std::cout << ng.dict->decode(*ng.wordp(1)) << " [" << ng.size-bol << "]" << " ";
- if (*ng.wordp(1)==eos) std::cout << std::endl;
- } else if (debug==2) {
- std::cout << ng << " [" << ng.size-bol << "-gram]" << " " << Pr;
- std::cout << std::endl;
- std::cout.flush();
- } else if (debug==3) {
- std::cout << ng << " [" << ng.size-bol << "-gram]" << " " << Pr << " bow:" << bow;
- std::cout << std::endl;
- std::cout.flush();
- } else if (debug==4) {
- std::cout << ng << " [" << ng.size-bol << "-gram: recombine:" << statesize << " state:" << (void*) msp << "] [" << ng.size+1-((bol==0)?(1):bol) << "-gram: bol:" << bol << "] " << Pr << " bow:" << bow;
- std::cout << std::endl;
- std::cout.flush();
- }
- }
-
- if (lmt->is_OOV(*ng.wordp(1))) {
- Noov++;
- sent_Noov++;
- }
- if (bol) {
- Nbo++;
- sent_Nbo++;
- }
- Nw++;
- sent_Nw++;
- if (sent_PP_flag && (*ng.wordp(1)==eos)) {
- sent_PP=exp((-sent_logPr * log(10.0)) /sent_Nw);
- sent_PPwp= sent_PP * (1 - 1/exp((sent_Noov * lmt->getlogOOVpenalty()) * log(10.0) / sent_Nw));
-
- std::cout << "%% sent_Nw=" << sent_Nw
- << " sent_PP=" << sent_PP
- << " sent_PPwp=" << sent_PPwp
- << " sent_Nbo=" << sent_Nbo
- << " sent_Noov=" << sent_Noov
- << " sent_OOV=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl;
- std::cout.flush();
- //reset statistics for sentence based Perplexity
- sent_Nw=sent_Noov=sent_Nbo=0;
- sent_logPr=0.0;
- }
- }
- }
if ((Nw % 100000)==0) {
std::cerr << ".";
lmt->check_caches_levels();
diff --git a/src/context-similarity.cpp b/src/context-similarity.cpp
index 9a7a7ca..ddfaf21 100644
--- a/src/context-similarity.cpp
+++ b/src/context-similarity.cpp
@@ -23,11 +23,13 @@
#include <cstdlib>
#include <stdlib.h>
#include <iostream>
+#include <sstream>
#include <stdexcept>
#include <string>
#include "lmContainer.h"
#include "context-similarity.h"
#include "util.h"
+#include "mfstream.h"
using namespace std;
@@ -38,22 +40,40 @@ inline void error(const char* message)
}
namespace irstlm {
- ContextSimilarity::ContextSimilarity(const std::string &filename)
+ ContextSimilarity::ContextSimilarity(const std::string &dictfile, const std::string &modelfile)
{
- m_lm=lmContainer::CreateLanguageModel(filename);
+ m_lm=lmContainer::CreateLanguageModel(modelfile);
- m_lm->load(filename);
+ m_lm->load(modelfile);
m_lm->getDict()->genoovcode();
+
+ //loading the topic list from file
+ std::string str;
+
+ mfstream inp(dictfile.c_str(),ios::in);
+
+ if (!inp) {
+ std::stringstream ss_msg;
+ ss_msg << "cannot open " << dictfile << "\n";
+ exit_error(IRSTLM_ERROR_IO, ss_msg.str());
+ }
+ VERBOSE(0, "Loading the list of topic" << std::endl);
+
+ while (inp >> str)
+ {
+ m_lm_topic_dict.insert(str);
+ }
+ VERBOSE(0, "There are " << m_lm_topic_dict.size() << " topic" << std::endl);
}
+
ContextSimilarity::~ContextSimilarity()
- {
- // delete m_lm
- }
+ {}
double ContextSimilarity::score(string_vec_t& text, topic_map_t& topic_weights)
{
+ VERBOSE(4, "double ContextSimilarity::score(string_vec_t& text, topic_map_t& topic_weights)" << std::endl);
if (topic_weights.size() == 0){
//a-priori topic distribution is "empty", i.e. there is no score for any topic
//return a "constant" lower-bound score, SIMILARITY_LOWER_BOUND = log(0.0)
@@ -72,11 +92,18 @@ namespace irstlm {
ngram num_ng = base_num_ng;
ngram den_ng = base_den_ng;
add_topic(it->first, num_ng, den_ng);
- add_logprob = log(it->second) + get_topic_similarity(num_ng, den_ng);
+
+ VERBOSE(0, "topic:|" << it->first << " log(p(topic):" << log(it->second) << std::endl);
+ double topic_score = get_topic_similarity(num_ng, den_ng);
+ add_logprob = log(it->second) + topic_score;
+ VERBOSE(0, "topic_score:" << topic_score << std::endl);
+ VERBOSE(0, "add_logprob:" << add_logprob << std::endl);
ret_logprob = logsum(ret_logprob, add_logprob);
++it;
}while (it!= topic_weights.end());
+
+ VERBOSE(0, "ret_logprob:" << ret_logprob << std::endl);
return ret_logprob;
}
@@ -89,7 +116,7 @@ namespace irstlm {
ngram base_den_ng(m_lm->getDict());
create_ngram(text, base_num_ng, base_den_ng);
- for (topic_dict_t::iterator it=m_lm_topic_dict->begin(); it != m_lm_topic_dict->end(); ++it)
+ for (topic_dict_t::iterator it=m_lm_topic_dict.begin(); it != m_lm_topic_dict.end(); ++it)
{
ngram num_ng = base_num_ng;
ngram den_ng = base_den_ng;
@@ -103,7 +130,17 @@ namespace irstlm {
{
//text is a vector of string with w in the last position and the history in the previous positions
//text must have at least two words
- num_ng.pushw(text.at(text.size()-2));
+ VERBOSE(3,"void ContextSimilarity::create_ngram" << std::endl);
+
+ //TO_CHECK: what happens when text has zero elements
+ // if (text.size()==0)
+
+ //TO_CHECK: what happens when text has just one element
+ if (text.size()==1){
+ num_ng.pushw(num_ng.dict->OOV());
+ }else {
+ num_ng.pushw(text.at(text.size()-2));
+ }
num_ng.pushw(text.at(text.size()-1));
den_ng.pushw(den_ng.dict->OOV()); //or den_ng.pushc(m_lm->getDict()->getoovcode());
@@ -135,7 +172,11 @@ namespace irstlm {
}
double ContextSimilarity::get_topic_similarity(ngram& num_ng, ngram& den_ng)
- {
+ {
+ double num_pr=m_lm->clprob(num_ng);
+ double den_pr=m_lm->clprob(den_ng);
+ VERBOSE(0, "num_ng:|" << num_ng << "| pr:" << num_pr << std::endl);
+ VERBOSE(0, "den_ng:|" << den_ng << "| pr:" << den_pr << std::endl);
return m_lm->clprob(num_ng) - m_lm->clprob(den_ng);
}
diff --git a/src/context-similarity.h b/src/context-similarity.h
index 16fbdf3..4a2533b 100644
--- a/src/context-similarity.h
+++ b/src/context-similarity.h
@@ -47,8 +47,8 @@ namespace irstlm {
{
private:
lmContainer* m_lm; // P(topic | h' w)
- topic_dict_t* m_lm_topic_dict; //the dictionary of the topics seen in the language model
- topic_map_t* topic_map;
+ topic_dict_t m_lm_topic_dict; //the dictionary of the topics seen in the language model
+ topic_map_t topic_map;
void create_ngram(const string_vec_t& text, ngram& num_ng, ngram& den_ng);
void add_topic(const std::string& topic, ngram& num_ng, ngram& den_ng);
@@ -58,7 +58,7 @@ namespace irstlm {
double get_topic_similarity(ngram& num_ng, ngram& den_ng);
public:
- ContextSimilarity(const std::string &filename);
+ ContextSimilarity(const std::string &dictfile, const std::string &modelfile);
~ContextSimilarity();
topic_map_t get_topic_scores(string_vec_t& text);
diff --git a/src/lmContainer.cpp b/src/lmContainer.cpp
index dc042f8..56e7187 100644
--- a/src/lmContainer.cpp
+++ b/src/lmContainer.cpp
@@ -92,7 +92,6 @@ namespace irstlm {
VERBOSE(1,"LM header:|" << header << "|" << std::endl);
int type=_IRSTLM_LMUNKNOWN;
- VERBOSE(1,"type: " << type << std::endl);
if (header == "lminterpolation" || header == "LMINTERPOLATION") {
type = _IRSTLM_LMINTERPOLATION;
} else if (header == "lmcontextdependent" || header == "LMCONTEXTDEPENDENT") {
@@ -104,7 +103,7 @@ namespace irstlm {
} else {
type = _IRSTLM_LMTABLE;
}
- VERBOSE(1,"type: " << type << std::endl);
+ VERBOSE(1,"LM type: " << type << std::endl);
return type;
};
@@ -151,6 +150,7 @@ namespace irstlm {
}
lm->setLanguageModelType(type);
+
return lm;
}
diff --git a/src/lmContainer.h b/src/lmContainer.h
index 5f760ff..131f207 100644
--- a/src/lmContainer.h
+++ b/src/lmContainer.h
@@ -122,6 +122,7 @@ public:
return 0.0;
};
virtual double clprob(string_vec_t& text, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL) {
+ VERBOSE(0,"lmContainer::clprob(string_vec_t& text, double* bow,...." << std::endl);
UNUSED(text);
UNUSED(bow);
UNUSED(bol);
@@ -152,6 +153,7 @@ public:
}
virtual double clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) {
+ VERBOSE(3,"lmContainer::clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow,...." << std::endl);
UNUSED(topic_weights);
return clprob(text, bow, bol, maxsuffptr, statesize, extendible);
}
diff --git a/src/lmContextDependent.cpp b/src/lmContextDependent.cpp
index 5fab621..00c7a34 100644
--- a/src/lmContextDependent.cpp
+++ b/src/lmContextDependent.cpp
@@ -67,6 +67,7 @@ namespace irstlm {
//get info from the configuration file
fstream inp(filename.c_str(),ios::in|ios::binary);
+ VERBOSE(0, "filename:|" << filename << "|" << std::endl);
char line[MAX_LINE];
const char* words[LMCONFIGURE_MAX_TOKEN];
@@ -75,43 +76,44 @@ namespace irstlm {
tokenN = parseWords(line,words,LMCONFIGURE_MAX_TOKEN);
if (tokenN != 1 || ((strcmp(words[0],"LMCONTEXTDEPENDENT") != 0) && (strcmp(words[0],"lmcontextdependent")!=0)))
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nweight_of_ngram_LM filename_of_LM\nweight_of_topic_model filename_of_topic_model");
+ error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_model");
//reading ngram-based LM
inp.getline(line,BUFSIZ,'\n');
- tokenN = parseWords(line,words,2);
- if(tokenN < 2 || tokenN > 2) {
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nweight_of_ngram_LM filename_of_LM\nweight_of_topic_model filename_of_topic_model");
+ tokenN = parseWords(line,words,1);
+ if(tokenN < 1 || tokenN > 1) {
+ error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_model");
}
- //loading ngram-based LM and initialization
- m_lm_weight = (float) atof(words[0]);
-
+ VERBOSE(0, "modelfile:|" << words[0] << "|" << std::endl);
//checking the language model type
- m_lm=lmContainer::CreateLanguageModel(words[1],ngramcache_load_factor, dictionary_load_factor);
+ m_lm=lmContainer::CreateLanguageModel(words[0],ngramcache_load_factor, dictionary_load_factor);
m_lm->setMaxLoadedLevel(requiredMaxlev);
- m_lm->load(words[1], memmap);
+ m_lm->load(words[0], memmap);
maxlev=m_lm->maxlevel();
dict=m_lm->getDict();
getDict()->genoovcode();
- m_lm->init_caches(m_lm->maxlevel());
+ m_lm->init_caches(m_lm->maxlevel());
//reading topic model
inp.getline(line,BUFSIZ,'\n');
- tokenN = parseWords(line,words,2);
+ tokenN = parseWords(line,words,3);
- if(tokenN < 2 || tokenN > 2) {
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nweight_of_ngram_LM filename_of_LM\nweight_of_topic_model filename_of_topic_model");
+ if(tokenN < 3 || tokenN > 3) {
+ error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_model");
}
//loading topic model and initialization
m_similaritymodel_weight = (float) atof(words[0]);
- m_similaritymodel = new ContextSimilarity(words[1]);
+ m_similaritymodel = new ContextSimilarity(words[1], words[2]);
inp.close();
+
+ VERBOSE(0, "topicdict:|" << words[1] << "|" << std::endl);
+ VERBOSE(0, "topicmodel:|" << words[2] << "|" << std::endl);
}
double lmContextDependent::lprob(ngram ng, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
@@ -119,8 +121,8 @@ namespace irstlm {
string_vec_t text; // replace with the text passed as parameter
double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
double similarity_score = m_similaritymodel->score(text, topic_weights);
- double ret_logprob = m_lm_weight * lm_logprob + m_similaritymodel_weight * similarity_score;
- VERBOSE(0, "lm_logprob:" << lm_logprob << " m_lm_weight:" << m_lm_weight<< " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_logprob:" << ret_logprob << std::endl);
+ double ret_logprob = lm_logprob + m_similaritymodel_weight * similarity_score;
+ VERBOSE(0, "lm_logprob:" << lm_logprob << " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_logprob:" << ret_logprob << std::endl);
return ret_logprob;
}
@@ -131,18 +133,20 @@ namespace irstlm {
//create the actual ngram
ngram ng(dict);
ng.pushw(text);
+ VERBOSE(0,"ng:|" << ng << "|" << std::endl);
+
MY_ASSERT (ng.size == (int) text.size());
double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
double similarity_score = m_similaritymodel->score(text, topic_weights);
- double ret_logprob = m_lm_weight * lm_logprob + m_similaritymodel_weight * similarity_score;
- VERBOSE(0, "lm_logprob:" << lm_logprob << " m_lm_weight:" << m_lm_weight<< " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_logprob:" << ret_logprob << std::endl);
+ double ret_logprob = lm_logprob + m_similaritymodel_weight * similarity_score;
+ VERBOSE(0, "lm_logprob:" << lm_logprob << " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_logprob:" << ret_logprob << std::endl);
return ret_logprob;
}
double lmContextDependent::lprob(int* codes, int sz, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
{
- VERBOSE(0,"lmContextDependent::lprob(int* codes, int sz, topic_map_t& topic_weights, " << std::endl);
+ VERBOSE(3,"lmContextDependent::lprob(int* codes, int sz, topic_map_t& topic_weights, " << std::endl);
//create the actual ngram
ngram ong(dict);
ong.pushc(codes,sz);
diff --git a/src/lmContextDependent.h b/src/lmContextDependent.h
index efecec6..7dc8364 100644
--- a/src/lmContextDependent.h
+++ b/src/lmContextDependent.h
@@ -137,6 +137,8 @@ namespace irstlm {
return lprob(ng, topic_weights, bow, bol, maxsuffptr, statesize, extendible);
};
virtual double clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL){
+
+ VERBOSE(0,"lmContainer::clprob(string_vec_t& text,...." << std::endl);
return lprob(text, topic_weights, bow, bol, maxsuffptr, statesize, extendible);
};