[irstlm] 32/126: added computation of sentence-based topic score distribution; added supporting functions; code cleanup;
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:42 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.
commit 836988edc745c26de4c80c4c57d352fafcfa3546
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Mon Jul 27 15:56:56 2015 +0200
added computation of sentence-based topic score distribution; added supporting functions; code cleanup;
---
src/context-dependent-evaluation.cpp | 174 +++++++++++++++++++++++++----------
src/context-similarity.cpp | 137 ++++++++++++++++++---------
src/context-similarity.h | 18 ++--
src/lmContextDependent.cpp | 42 +++++++--
src/lmContextDependent.h | 20 +++-
5 files changed, 279 insertions(+), 112 deletions(-)
diff --git a/src/context-dependent-evaluation.cpp b/src/context-dependent-evaluation.cpp
index 48a138f..5ac7cd0 100644
--- a/src/context-dependent-evaluation.cpp
+++ b/src/context-dependent-evaluation.cpp
@@ -30,6 +30,7 @@
#include "util.h"
#include "math.h"
#include "lmContainer.h"
+#include "lmContextDependent.h"
using namespace std;
using namespace irstlm;
@@ -64,6 +65,7 @@ int main(int argc, char **argv)
bool sent_PP_flag = false;
bool contextbasedscore = false;
+ bool topicscore = false;
int debug = 0;
int requiredMaxlev = 1000;
@@ -80,7 +82,7 @@ int main(int argc, char **argv)
"randcalls", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file",
"r", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file",
"contextbasedscore", CMDBOOLTYPE|CMDMSG, &contextbasedscore, "computes context-dependent probabilities and pseudo-perplexity of the text from standard input",
- "cbs", CMDBOOLTYPE|CMDMSG, &contextbasedscore, "computes context-dependent probabilities and pseudo-perplexity of the text from standard input",
+ "topicscore", CMDBOOLTYPE|CMDMSG, &topicscore, "computes the topic scores of the text from standard input",
"debug", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
"d", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0",
"level", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken",
@@ -121,6 +123,7 @@ int main(int argc, char **argv)
if (lmfile!=NULL) std::cerr << "lmfile: " << lmfile << std::endl;
if (testfile!=NULL) std::cerr << "testfile: " << testfile << std::endl;
if (contextbasedscore==true) std::cerr << "contextbasedscore: " << contextbasedscore << std::endl;
+ if (topicscore==true) std::cerr << "topicscore: " << topicscore << std::endl;
std::cerr << "loading up to the LM level " << requiredMaxlev << " (if any)" << std::endl;
std::cerr << "dub: " << dub<< std::endl;
@@ -139,6 +142,105 @@ int main(int argc, char **argv)
//use caches to save time (only if PS_CACHE_ENABLE is defined through compilation flags)
lmt->init_caches(lmt->maxlevel());
+ if (topicscore == true) {
+
+ if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) {
+ debug = (debug>4)?4:debug;
+ std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+ }
+ if (lmt->getLanguageModelType() == _IRSTLM_LMMACRO) {
+ debug = (debug>4)?4:debug;
+ std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+ }
+ if (lmt->getLanguageModelType() == _IRSTLM_LMCLASS) {
+ debug = (debug>4)?4:debug;
+ std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
+ }
+
+ std::cerr << "Start Topic Score generation " << std::endl;
+ std::cerr << "OOV code: " << lmt->getDict()->oovcode() << std::endl;
+ std::cout.setf(ios::fixed);
+ std::cout.precision(2);
+
+ std::fstream inptxt(testfile,std::ios::in);
+
+ // loop over input lines
+ char line[MAX_LINE];
+ while (inptxt.getline(line,MAX_LINE)) {
+
+ std::string line_str = line;
+
+ VERBOSE(0,"line_str:|" << line_str << "|" << std::endl);
+
+ //getting sentence string;
+ std::string sentence;
+ std::string context;
+
+
+ ((lmContextDependent*) lmt)->GetSentenceAndContext(sentence,context,line_str);
+ VERBOSE(0,"sentence:|" << sentence << "|" << std::endl);
+ VERBOSE(0,"context:|" << context << "|" << std::endl);
+ VERBOSE(0,"line_str:|" << line_str << "|" << std::endl);
+
+ //getting apriori topic weights
+ topic_map_t apriori_topic_map;
+ ((lmContextDependent*) lmt)->getContextSimilarity()->setContextMap(apriori_topic_map,context);
+
+ if(1){
+ // computation using std::string
+ // loop over ngrams of the sentence
+ string_vec_t word_vec;
+ split(sentence, ' ', word_vec);
+
+ //first points to the last recent term to take into account
+ //last points to the position after the most recent term to take into account
+ //last could point outside the vector of string; do NOT use word_vec.at(last)
+ size_t last, first;
+ size_t size=0;
+ size_t order = lmt->maxlevel();
+
+
+
+ topic_map_t sentence_topic_map;
+ VERBOSE(0,"word_vec.size():|" << word_vec.size() << "|" << std::endl);
+ for (size_t i=0; i<word_vec.size(); ++i){
+ ++size;
+ size=(size<order)?size:order;
+ last=i+1;
+ // reset ngram at begin of sentence
+ if (word_vec.at(i) == lmt->getDict()->BoS()) {
+ size=0;
+ continue;
+ }
+ first = last - size;
+
+ VERBOSE(0,"topic scores for first:|" << first << "| last:|" << last << "| size:" << size << std::endl);
+ string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
+
+ if (size>=1) {
+ VERBOSE(0,"computing topic_scores for first:|" << first << "| and last:|" << last << "|" << std::endl);
+
+ topic_map_t tmp_topic_map;
+ ((lmContextDependent*) lmt)->getContextSimilarity()->get_topic_scores(tmp_topic_map, tmp_word_vec);
+
+ std::cout << "first:" << first << " last:" << last << ((lmContextDependent*) lmt)->getContextDelimiter();
+ ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(tmp_topic_map);
+
+ ((lmContextDependent*) lmt)->getContextSimilarity()->add_topic_scores(sentence_topic_map, tmp_topic_map);
+ tmp_topic_map.clear();
+ }
+ }
+ std::cout << sentence << ((lmContextDependent*) lmt)->getContextDelimiter();
+ ((lmContextDependent*) lmt)->getContextSimilarity()->print_topic_scores(sentence_topic_map);
+ }
+
+ apriori_topic_map.clear();
+ }
+
+
+ delete lmt;
+ return 0;
+ }
if (contextbasedscore == true) {
if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) {
@@ -153,7 +255,7 @@ int main(int argc, char **argv)
debug = (debug>4)?4:debug;
std::cerr << "Maximum debug value for this LM type: " << debug << std::endl;
}
- std::cerr << "Start Eval" << std::endl;
+ std::cerr << "Start ContextBased Evaluation" << std::endl;
std::cerr << "OOV code: " << lmt->getDict()->oovcode() << std::endl;
std::cout.setf(ios::fixed);
std::cout.precision(2);
@@ -168,16 +270,7 @@ int main(int argc, char **argv)
// variables for storing sentence-based Perplexity
int sent_Nbo=0, sent_Nw=0,sent_Noov=0;
- double sent_logPr=0,sent_PP=0,sent_PPwp=0;
-
-
-// ngram ng(lmt->getDict());
-
- const std::string context_delimiter="___CONTEXT___";
- const char topic_map_delimiter='=';
-
- string_vec_t topic_weight_vec;
- string_vec_t topic_weight;
+ double sent_logPr=0,sent_PP=0,sent_PPwp=0;
std::fstream inptxt(testfile,std::ios::in);
@@ -193,65 +286,48 @@ int main(int argc, char **argv)
std::string sentence;
std::string context;
- size_t pos = line_str.find(context_delimiter);
- if (pos != std::string::npos){ // context_delimiter is found
- sentence = line_str.substr(0, pos);
- std::cout << sentence << std::endl;
- line_str.erase(0, pos + context_delimiter.length());
- VERBOSE(0,"pos:|" << pos << "|" << std::endl);
- VERBOSE(0,"sentence:|" << sentence << "|" << std::endl);
- VERBOSE(0,"line_str:|" << line_str << "|" << std::endl);
-
- //getting context string;
- context = line_str;
- }else{
- sentence = line_str;
- context = "";
- }
-
+ ((lmContextDependent*) lmt)->GetSentenceAndContext(sentence,context,line_str);
+ VERBOSE(0,"sentence:|" << sentence << "|" << std::endl);
VERBOSE(0,"context:|" << context << "|" << std::endl);
- VERBOSE(0,"line_str:|" << line_str << "|" << std::endl);
- //getting topic weights
- topic_map_t topic_weight_map;
+ VERBOSE(0,"line_str:|" << line_str << "|" << std::endl);
- split(context, ' ', topic_weight_vec);
- for (string_vec_t::iterator it=topic_weight_vec.begin(); it!=topic_weight_vec.end(); ++it){
- split(*it, topic_map_delimiter, topic_weight);
- topic_weight_map[topic_weight.at(0)] = strtod (topic_weight.at(1).c_str(), NULL);
- topic_weight.clear();
- }
- topic_weight_vec.clear();
+ //getting apriori topic weights
+ topic_map_t apriori_topic_map;
+ ((lmContextDependent*) lmt)->getContextSimilarity()->setContextMap(apriori_topic_map,context);
if(1){
// computation using std::string
// loop over ngrams of the sentence
- string_vec_t w_vec;
- split(sentence, ' ', w_vec);
+ string_vec_t word_vec;
+ split(sentence, ' ', word_vec);
+ //first points to the last recent term to take into account
+ //last points to the position after the most recent term to take into account
+ //last could point outside the vector of string; do NOT use word_vec.at(last)
size_t last, first;
size_t size=0;
size_t order = lmt->maxlevel();
- VERBOSE(0,"w_vec.size():|" << w_vec.size() << "|" << std::endl);
- for (size_t i=0; i<w_vec.size(); ++i){
+ VERBOSE(0,"word_vec.size():|" << word_vec.size() << "|" << std::endl);
+ for (size_t i=0; i<word_vec.size(); ++i){
++size;
size=(size<order)?size:order;
last=i+1;
// reset ngram at begin of sentence
- if (w_vec.at(i) == lmt->getDict()->BoS()) {
+ if (word_vec.at(i) == lmt->getDict()->BoS()) {
size=0;
continue;
}
first = last - size;
- VERBOSE(0,"prob for first:|" << first << "| and last:|" << last << "| size:" << size << std::endl);
- string_vec_t tmp_w_vec(w_vec.begin() + first, w_vec.begin() +last);
+ VERBOSE(0,"prob for first:|" << first << "| last:|" << last << "| size:" << size << std::endl);
+ string_vec_t tmp_word_vec(word_vec.begin() + first, word_vec.begin() +last);
if (size>=1) {
- VERBOSE(0,"computing prob for first:|" << first << "| and last:|" << last << "| is Pr=" << Pr << std::endl);
- Pr=lmt->clprob(tmp_w_vec, topic_weight_map, &bow, &bol, &msp, &statesize);
- VERBOSE(0,"prob for first:|" << first << "| and last:|" << last << "| is Pr=" << Pr << std::endl);
+ VERBOSE(0,"computing prob for first:|" << first << "| and last:|" << last << "|" << std::endl);
+ Pr=lmt->clprob(tmp_word_vec, apriori_topic_map, &bow, &bol, &msp, &statesize);
+ VERBOSE(0," --> prob for first:|" << first << "| and last:|" << last << "| is Pr=" << Pr << std::endl);
logPr+=Pr;
sent_logPr+=Pr;
VERBOSE(0,"sent_logPr:|" << sent_logPr << " logPr:|" << logPr << std::endl);
@@ -268,7 +344,7 @@ int main(int argc, char **argv)
lmt->check_caches_levels();
}
- topic_weight_map.clear();
+ apriori_topic_map.clear();
}
diff --git a/src/context-similarity.cpp b/src/context-similarity.cpp
index ddfaf21..cc86337 100644
--- a/src/context-similarity.cpp
+++ b/src/context-similarity.cpp
@@ -40,13 +40,16 @@ inline void error(const char* message)
}
namespace irstlm {
- ContextSimilarity::ContextSimilarity(const std::string &dictfile, const std::string &modelfile)
+ ContextSimilarity::ContextSimilarity(const std::string &dictfile, const std::string &num_modelfile, const std::string &den_modelfile)
{
- m_lm=lmContainer::CreateLanguageModel(modelfile);
+ m_num_lm=lmContainer::CreateLanguageModel(num_modelfile);
+ m_den_lm=lmContainer::CreateLanguageModel(num_modelfile);
- m_lm->load(modelfile);
+ m_num_lm->load(num_modelfile);
+ m_den_lm->load(den_modelfile);
- m_lm->getDict()->genoovcode();
+ m_num_lm->getDict()->genoovcode();
+ m_den_lm->getDict()->genoovcode();
//loading form file
std::string str;
@@ -74,46 +77,46 @@ namespace irstlm {
double ContextSimilarity::score(string_vec_t& text, topic_map_t& topic_weights)
{
VERBOSE(4, "double ContextSimilarity::score(string_vec_t& text, topic_map_t& topic_weights)" << std::endl);
+ double ret_logprob = SIMILARITY_LOWER_BOUND;
+
if (topic_weights.size() == 0){
//a-priori topic distribution is "empty", i.e. there is no score for any topic
//return a "constant" lower-bound score, SIMILARITY_LOWER_BOUND = log(0.0)
- return SIMILARITY_LOWER_BOUND;
- }
-
- ngram base_num_ng(m_lm->getDict());
- ngram base_den_ng(m_lm->getDict());
- create_ngram(text, base_num_ng, base_den_ng);
-
- double ret_logprob = 0.0;
- double add_logprob;
- topic_map_t::iterator it = topic_weights.begin();
- do
- {
- ngram num_ng = base_num_ng;
- ngram den_ng = base_den_ng;
- add_topic(it->first, num_ng, den_ng);
+ ret_logprob = SIMILARITY_LOWER_BOUND;
+ }else{
+
+ ngram base_num_ng(m_num_lm->getDict());
+ ngram base_den_ng(m_den_lm->getDict());
+ create_ngram(text, base_num_ng, base_den_ng);
- VERBOSE(0, "topic:|" << it->first << " log(p(topic):" << log(it->second) << std::endl);
- double topic_score = get_topic_similarity(num_ng, den_ng);
- add_logprob = log(it->second) + topic_score;
- VERBOSE(0, "topic_score:" << topic_score << std::endl);
- VERBOSE(0, "add_logprob:" << add_logprob << std::endl);
- ret_logprob = logsum(ret_logprob, add_logprob);
- ++it;
- }while (it!= topic_weights.end());
+ for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
+ {
+ ngram num_ng = base_num_ng;
+ ngram den_ng = base_den_ng;
+ add_topic(it->first, num_ng, den_ng);
+ double apriori_topic_score = log(it->second);
+ double topic_score = get_topic_similarity(num_ng, den_ng);
+
+ VERBOSE(3, "topic:|" << it->first << "apriori_topic_score:" << apriori_topic_score << " topic_score:" << topic_score << std::endl);
+ if (it == topic_weights.begin()){
+ ret_logprob = apriori_topic_score + topic_score;
+ }else{
+ ret_logprob = logsum(ret_logprob, apriori_topic_score + topic_score);
+ }
+ VERBOSE(4, "CURRENT ret_logprob:" << ret_logprob << std::endl);
+ }
+ }
- VERBOSE(0, "ret_logprob:" << ret_logprob << std::endl);
+ VERBOSE(3, "ret_logprob:" << ret_logprob << std::endl);
return ret_logprob;
}
-
- topic_map_t ContextSimilarity::get_topic_scores(string_vec_t& text)
- {
- topic_map_t topic_map;
-
- ngram base_num_ng(m_lm->getDict());
- ngram base_den_ng(m_lm->getDict());
+ //returns the scores for all topics in the topic models (without apriori topic prob)
+ void ContextSimilarity::get_topic_scores(topic_map_t& topic_map, string_vec_t& text)
+ {
+ ngram base_num_ng(m_num_lm->getDict());
+ ngram base_den_ng(m_den_lm->getDict());
create_ngram(text, base_num_ng, base_den_ng);
for (topic_dict_t::iterator it=m_lm_topic_dict.begin(); it != m_lm_topic_dict.end(); ++it)
@@ -123,12 +126,51 @@ namespace irstlm {
add_topic(*it, num_ng, den_ng);
topic_map[*it] = get_topic_similarity(num_ng, den_ng);
}
- return topic_map;
+ }
+
+
+ void ContextSimilarity::add_topic_scores(topic_map_t& topic_map, topic_map_t& tmp_map)
+ {
+ for (topic_map_t::iterator it=tmp_map.begin(); it != tmp_map.end(); ++it){
+ topic_map[it->first] += tmp_map[it->first];
+ }
+ }
+
+ //prints the given topic scores (without apriori topic prob) to standard output
+ void ContextSimilarity::print_topic_scores(topic_map_t& map)
+ {
+ for (topic_map_t::iterator it=map.begin(); it != map.end(); ++it)
+ {
+ if (it!=map.begin()) { std::cout << topic_map_delimiter1; }
+ std::cout << it->first << topic_map_delimiter2 << it->second;
+ }
+ std::cout << std::endl;
+ }
+
+ void ContextSimilarity::setContextMap(topic_map_t& topic_map, const std::string& context){
+
+ VERBOSE(0,"context:|" << context << "|" << std::endl);
+
+ string_vec_t topic_weight_vec;
+ string_vec_t topic_weight;
+
+ // context is supposed in this format
+ // topic-name1,topic-value1:topic-name2,topic-value2:...:topic-nameN,topic-valueN
+
+ //first-level split the context in a vector of topic-name1,topic-value1, using the first separator ':'
+ split(context, topic_map_delimiter1, topic_weight_vec);
+ for (string_vec_t::iterator it=topic_weight_vec.begin(); it!=topic_weight_vec.end(); ++it){
+ //second-level split each topic-name,topic-value pair into topic-name and topic-value, using the second separator ','
+ split(*it, topic_map_delimiter2, topic_weight);
+ topic_map[topic_weight.at(0)] = strtod (topic_weight.at(1).c_str(), NULL);
+ topic_weight.clear();
+ }
+ VERBOSE(0,"found " << topic_map.size() << " entries in the context" << std::endl);
}
void ContextSimilarity::create_ngram(const string_vec_t& text, ngram& num_ng, ngram& den_ng)
{
- //text is a vector of string with w in the last position and the history in the previous positions
+ //text is a vector of strings with w in the last position and the history in the previous positions
//text must have at least two words
VERBOSE(3,"void ContextSimilarity::create_ngram" << std::endl);
@@ -136,6 +178,10 @@ namespace irstlm {
// if (text.size()==0)
//TO_CHECK: what happens when text has just one element
+
+
+
+ // lm model for the numerator is assumed to be a 3-gram lm, hence num_ng has only size 3 (two words and one topic); here we insert two words
if (text.size()==1){
num_ng.pushw(num_ng.dict->OOV());
}else {
@@ -143,7 +189,7 @@ namespace irstlm {
}
num_ng.pushw(text.at(text.size()-1));
- den_ng.pushw(den_ng.dict->OOV()); //or den_ng.pushc(m_lm->getDict()->getoovcode());
+ // lm model for the denominator is assumed to be a 2-gram lm, hence den_ng has only size 2 (one word and one topic); here we insert one word
den_ng.pushw(text.at(text.size()-1));
}
@@ -163,8 +209,8 @@ namespace irstlm {
double ContextSimilarity::get_topic_similarity(string_vec_t text, const std::string& topic)
{
- ngram num_ng(m_lm->getDict());
- ngram den_ng(m_lm->getDict());
+ ngram num_ng(m_num_lm->getDict());
+ ngram den_ng(m_den_lm->getDict());
create_topic_ngram(text, topic, num_ng, den_ng);
@@ -173,11 +219,12 @@ namespace irstlm {
double ContextSimilarity::get_topic_similarity(ngram& num_ng, ngram& den_ng)
{
- double num_pr=m_lm->clprob(num_ng);
- double den_pr=m_lm->clprob(den_ng);
- VERBOSE(0, "num_ng:|" << num_ng << "| pr:" << num_pr << std::endl);
- VERBOSE(0, "den_ng:|" << den_ng << "| pr:" << den_pr << std::endl);
- return m_lm->clprob(num_ng) - m_lm->clprob(den_ng);
+ double num_pr=m_num_lm->clprob(num_ng);
+ double den_pr=m_den_lm->clprob(den_ng);
+ VERBOSE(4, "num_ng:|" << num_ng << "| pr:" << num_pr << std::endl);
+ VERBOSE(4, "den_ng:|" << den_ng << "| pr:" << den_pr << std::endl);
+ return num_pr - den_pr;
+ // return m_lm->clprob(num_ng) - m_lm->clprob(den_ng);
}
}//namespace irstlm
diff --git a/src/context-similarity.h b/src/context-similarity.h
index 4a2533b..d646fb6 100644
--- a/src/context-similarity.h
+++ b/src/context-similarity.h
@@ -37,16 +37,19 @@
class ngram;
namespace irstlm {
-
+ #define topic_map_delimiter1 ':'
+ #define topic_map_delimiter2 ','
+ #define SIMILARITY_LOWER_BOUND -10000
typedef std::map< std::string, float > topic_map_t;
typedef std::set< std::string > topic_dict_t;
- #define SIMILARITY_LOWER_BOUND -10000
+
class ContextSimilarity
{
private:
- lmContainer* m_lm; // P(topic | h' w)
+ lmContainer* m_num_lm; // P(topic | h' w)
+ lmContainer* m_den_lm; // P(topic | h')
topic_dict_t m_lm_topic_dict; //the dictionary of the topics seen in the language model
topic_map_t topic_map;
@@ -58,10 +61,13 @@ namespace irstlm {
double get_topic_similarity(ngram& num_ng, ngram& den_ng);
public:
- ContextSimilarity(const std::string &dictfile, const std::string &modelfile);
+ ContextSimilarity(const std::string &dictfile, const std::string &num_modelfile, const std::string &den_modelfile);
~ContextSimilarity();
-
- topic_map_t get_topic_scores(string_vec_t& text);
+
+ void setContextMap(topic_map_t& topic_map, const std::string& context);
+ void get_topic_scores(topic_map_t& map, string_vec_t& text);
+ void add_topic_scores(topic_map_t& map, topic_map_t& tmp_map);
+ void print_topic_scores(topic_map_t& map);
double score(string_vec_t& text, topic_map_t& topic_weights);
};
diff --git a/src/lmContextDependent.cpp b/src/lmContextDependent.cpp
index 00c7a34..da7e134 100644
--- a/src/lmContextDependent.cpp
+++ b/src/lmContextDependent.cpp
@@ -38,6 +38,7 @@ inline void error(const char* message)
}
namespace irstlm {
+
lmContextDependent::lmContextDependent(float nlf, float dlf)
{
ngramcache_load_factor = nlf;
@@ -76,13 +77,13 @@ namespace irstlm {
tokenN = parseWords(line,words,LMCONFIGURE_MAX_TOKEN);
if (tokenN != 1 || ((strcmp(words[0],"LMCONTEXTDEPENDENT") != 0) && (strcmp(words[0],"lmcontextdependent")!=0)))
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_model");
+ error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_num_model topic_nden_model");
//reading ngram-based LM
inp.getline(line,BUFSIZ,'\n');
tokenN = parseWords(line,words,1);
if(tokenN < 1 || tokenN > 1) {
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_model");
+ error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_num_model topic_nden_model");
}
VERBOSE(0, "modelfile:|" << words[0] << "|" << std::endl);
@@ -100,22 +101,45 @@ namespace irstlm {
//reading topic model
inp.getline(line,BUFSIZ,'\n');
- tokenN = parseWords(line,words,3);
+ tokenN = parseWords(line,words,4);
- if(tokenN < 3 || tokenN > 3) {
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_model");
+ if(tokenN < 4 || tokenN > 4) {
+ error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_num_model topic_nden_model");
}
//loading topic model and initialization
m_similaritymodel_weight = (float) atof(words[0]);
- m_similaritymodel = new ContextSimilarity(words[1], words[2]);
+ std::string _dict = words[1];
+ std::string _num_lm = words[2];
+ std::string _den_lm = words[3];
+ m_similaritymodel = new ContextSimilarity(_dict, _num_lm, _den_lm);
inp.close();
- VERBOSE(0, "topicdict:|" << words[1] << "|" << std::endl);
- VERBOSE(0, "topicmodel:|" << words[2] << "|" << std::endl);
+ VERBOSE(0, "topic_dict:|" << _dict << "|" << std::endl);
+ VERBOSE(0, "topic_num_model:|" << _num_lm << "|" << std::endl);
+ VERBOSE(0, "topic_den_model:|" << _den_lm << "|" << std::endl);
}
-
+
+ void lmContextDependent::GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line)
+ {
+ size_t pos = line.find(context_delimiter);
+ if (pos != std::string::npos){ // context_delimiter is found
+ sentence = line.substr(0, pos);
+ std::cout << sentence << std::endl;
+ line.erase(0, pos + context_delimiter.length());
+ VERBOSE(0,"pos:|" << pos << "|" << std::endl);
+ VERBOSE(0,"sentence:|" << sentence << "|" << std::endl);
+ VERBOSE(0,"line:|" << line << "|" << std::endl);
+
+ //getting context string;
+ context = line;
+ }else{
+ sentence = line;
+ context = "";
+ }
+ }
+
double lmContextDependent::lprob(ngram ng, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
{
string_vec_t text; // replace with the text passed as parameter
diff --git a/src/lmContextDependent.h b/src/lmContextDependent.h
index 7dc8364..7b0c7a5 100644
--- a/src/lmContextDependent.h
+++ b/src/lmContextDependent.h
@@ -62,6 +62,8 @@ namespace irstlm {
#define LMCONFIGURE_MAX_TOKEN 3
+ static const std::string context_delimiter="___CONTEXT___";
+
class lmContextDependent: public lmContainer
{
private:
@@ -73,15 +75,12 @@ namespace irstlm {
int memmap; //level from which n-grams are accessed via mmap
lmContainer* m_lm;
-// std::string m_lm_file;
bool m_isinverted;
- // TopicModel* m_topicmodel;
ContextSimilarity* m_similaritymodel; //to remove when TopicModel is ready
double m_lm_weight;
double m_similaritymodel_weight;
-// std::string m_similaritymodel_file;
float ngramcache_load_factor;
float dictionary_load_factor;
@@ -96,6 +95,12 @@ namespace irstlm {
void load(const std::string &filename,int mmap=0);
+ inline std::string getContextDelimiter() const{
+ return context_delimiter;
+ }
+
+ void GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line);
+
virtual double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL){
VERBOSE(0, "virtual double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL)" << std::endl << "This LM type (lmContextDependent) does not support this function" << std::endl);
UNUSED(ng);
@@ -155,6 +160,15 @@ namespace irstlm {
dict=d;
};
+
+ virtual inline lmContainer* getWordLM() const {
+ return m_lm;
+ };
+
+ virtual inline ContextSimilarity* getContextSimilarity() const {
+ return m_similaritymodel;
+ };
+
virtual inline dictionary* getDict() const {
return dict;
};
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list