[irstlm] 48/126: restructuring of code using ngramtable instead of lmtable for computing context-dependent scores
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:44 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.
commit 3d9b78b2b585176a020a23d5f32f83ae5b2bfd67
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Fri Aug 7 13:43:22 2015 +0200
restructuring of code using ngramtable instead of lmtable for computing context-dependent scores
---
src/context-similarity.cpp | 242 +++++++++++++++++++++++++++++----------------
src/context-similarity.h | 54 ++++++++--
src/lmContextDependent.cpp | 75 +++++++-------
src/lmContextDependent.h | 8 +-
4 files changed, 249 insertions(+), 130 deletions(-)
diff --git a/src/context-similarity.cpp b/src/context-similarity.cpp
index 0c84501..a474d72 100644
--- a/src/context-similarity.cpp
+++ b/src/context-similarity.cpp
@@ -26,6 +26,7 @@
#include <sstream>
#include <stdexcept>
#include <string>
+#include "ngramtable.h"
#include "lmContainer.h"
#include "context-similarity.h"
#include "util.h"
@@ -40,92 +41,99 @@ inline void error(const char* message)
}
namespace irstlm {
- ContextSimilarity::ContextSimilarity(const std::string &dictfile, const std::string &num_modelfile, const std::string &den_modelfile)
+ ContextSimilarity::ContextSimilarity(const std::string &k_modelfile, const std::string &hk_modelfile, const std::string &hwk_modelfile)
{
- m_num_lm=lmContainer::CreateLanguageModel(num_modelfile);
- m_den_lm=lmContainer::CreateLanguageModel(num_modelfile);
+ m_hwk_order=3;
+ m_hk_order=2;
+ m_k_order=1;
+ m_hwk_ngt=new ngramtable((char*) hwk_modelfile.c_str(), m_hwk_order, NULL,NULL,NULL);
+ m_hk_ngt=new ngramtable((char*) hk_modelfile.c_str(), m_hk_order, NULL,NULL,NULL);
+ m_k_ngt=new ngramtable((char*) k_modelfile.c_str(), m_k_order, NULL,NULL,NULL);
- m_num_lm->load(num_modelfile);
- m_den_lm->load(den_modelfile);
+ m_smoothing = 0.001;
+ m_threshold_on_h = 0;
+ m_active=true;
- m_num_lm->getDict()->genoovcode();
- m_den_lm->getDict()->genoovcode();
-
- //loading form file
- std::string str;
-
- mfstream inp(dictfile.c_str(),ios::in);
-
- if (!inp) {
- std::stringstream ss_msg;
- ss_msg << "cannot open " << dictfile << "\n";
- exit_error(IRSTLM_ERROR_IO, ss_msg.str());
- }
- VERBOSE(0, "Loading the list of topic" << std::endl);
-
- while (inp >> str)
- {
- m_lm_topic_dict.insert(str);
- }
- VERBOSE(0, "There are " << m_lm_topic_dict.size() << " topic" << std::endl);
+ m_topic_size = m_k_ngt->getDict()->size();
+ VERBOSE(1, "There are " << m_topic_size << " topics in the model" << std::endl);
}
ContextSimilarity::~ContextSimilarity()
- {}
+ {
+ delete m_hwk_ngt;
+ delete m_hk_ngt;
+ }
//return the log10 of the similarity score
- double ContextSimilarity::score(string_vec_t& text, topic_map_t& topic_weights)
+ double ContextSimilarity::get_context_similarity(string_vec_t& text, topic_map_t& topic_weights)
{
- VERBOSE(4, "double ContextSimilarity::score(string_vec_t& text, topic_map_t& topic_weights)" << std::endl);
+ VERBOSE(2, "double ContextSimilarity::get_context_similarity(string_vec_t& text, topic_map_t& topic_weights)" << std::endl);
double ret_log10_pr;
- if (topic_weights.size() > 0){
+ if (!m_active){ //similarity score is disabled
+ ret_log10_pr = 0.0;
+ }else if (m_topic_size == 0){
+ //a-priori topic distribution is "empty", i.e. there is no score for any topic
+ //return an uninforming score (0.0)
+ ret_log10_pr = 0.0;
+ } else{
+ VERBOSE(3, "topic_weights.size():" << topic_weights.size() << std::endl);
+ ngram base_num_ng(m_hwk_ngt->getDict());
+ ngram base_den_ng(m_hk_ngt->getDict());
- ngram base_num_ng(m_num_lm->getDict());
- ngram base_den_ng(m_den_lm->getDict());
- create_ngram(text, base_num_ng, base_den_ng);
- for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
- {
- ngram num_ng = base_num_ng;
- ngram den_ng = base_den_ng;
- add_topic(it->first, num_ng, den_ng);
- double apriori_topic_score = log10(it->second);
- double topic_score = get_topic_similarity(num_ng, den_ng); //log10-prob
+ create_ngram(text, base_num_ng, base_den_ng);
+ if (den_reliable(base_den_ng)){ //we do not know about the reliability of the denominator
- VERBOSE(3, "topic:|" << it->first << "apriori_topic_score:" << apriori_topic_score << " topic_score:" << topic_score << std::endl);
- if (it == topic_weights.begin()){
- ret_log10_pr = apriori_topic_score + topic_score;
- }else{
- ret_log10_pr = logsum(ret_log10_pr, apriori_topic_score + topic_score)/M_LN10;
+ for (topic_map_t::iterator it = topic_weights.begin(); it!= topic_weights.end(); ++it)
+ {
+ ngram num_ng = base_num_ng;
+ ngram den_ng = base_den_ng;
+ add_topic(it->first, num_ng, den_ng);
+
+ double apriori_topic_score = log10(it->second); //log10-prob
+ double topic_score = get_topic_similarity(num_ng, den_ng); //log10-prob
+
+ VERBOSE(3, "topic:|" << it->first << "| apriori_topic_score:" << apriori_topic_score << " topic_score:" << topic_score << std::endl);
+ if (it == topic_weights.begin()){
+ ret_log10_pr = apriori_topic_score + topic_score;
+ }else{
+ ret_log10_pr = logsum(ret_log10_pr, apriori_topic_score + topic_score)/M_LN10;
+ }
+ VERBOSE(3, "CURRENT ret_log10_pr:" << ret_log10_pr << std::endl);
}
- VERBOSE(4, "CURRENT ret_log10_pr:" << ret_log10_pr << std::endl);
+ }else{
+ //the similarity score is not reliable enough, because occurrences of base_den_ng are too few
+ //we assume that counts for base_num_ng are also unreliable
+ //return an uninforming score (0.0)
+ ret_log10_pr = 0.0;
}
- }else{
- //a-priori topic distribution is "empty", i.e. there is nore score for any topic
- //return a "constant" lower-bound score, SIMILARITY_LOWER_BOUND = log(0.0)
- ret_log10_pr = SIMILARITY_LOWER_BOUND;
}
- VERBOSE(3, "ret_log10_pr:" << ret_log10_pr << std::endl);
+ VERBOSE(2, "ret_log10_pr:" << ret_log10_pr << std::endl);
return ret_log10_pr;
}
-
+
//returns the scores for all topics in the topic models (without apriori topic prob)
void ContextSimilarity::get_topic_scores(topic_map_t& topic_map, string_vec_t& text)
- {
- ngram base_num_ng(m_num_lm->getDict());
- ngram base_den_ng(m_den_lm->getDict());
+ {
+ ngram base_num_ng(m_hwk_ngt->getDict());
+ ngram base_den_ng(m_hk_ngt->getDict());
create_ngram(text, base_num_ng, base_den_ng);
- for (topic_dict_t::iterator it=m_lm_topic_dict.begin(); it != m_lm_topic_dict.end(); ++it)
- {
- ngram num_ng = base_num_ng;
- ngram den_ng = base_den_ng;
- add_topic(*it, num_ng, den_ng);
- topic_map[*it] = get_topic_similarity(num_ng, den_ng);
+
+ if (m_active){ //compute scores only when the similarity model is enabled
+ for (int i=0; i<m_k_ngt->getDict()->size();++i)
+ {
+ ngram num_ng = base_num_ng;
+ ngram den_ng = base_den_ng;
+ std::string _topic = m_k_ngt->getDict()->decode(i);
+ add_topic(_topic, num_ng, den_ng);
+ topic_map[_topic] = get_topic_similarity(num_ng, den_ng);
+ }
}
+
}
@@ -168,26 +176,21 @@ namespace irstlm {
void ContextSimilarity::create_ngram(const string_vec_t& text, ngram& num_ng, ngram& den_ng)
{
//text is a vector of strings with w in the last position and the history in the previous positions
- //text must have at least two words
- VERBOSE(3,"void ContextSimilarity::create_ngram" << std::endl);
+ //text must have at least one word
+ //if text has two words, further computation will rely on normal counts, i.e. counts(h,w,k), counts(h,w), counts(h,k), counts(k)
+ //if text has only one word, further computation will rely on lower-order counts, i.e. (w,k), counts(w), counts(k), counts()
+ VERBOSE(2,"void ContextSimilarity::create_ngram" << std::endl);
- //TO_CHECK: what happens when text has zero element
- // if (text.size()==0)
-
- //TO_CHECK: what happens when text has just one element
+ MY_ASSERT(text.size()==0);
-
-
- // lm model for the numerator is assumed to be a 3-gram lm, hence num_gr have only size 3 (two words and one topic); here we insert two words
if (text.size()==1){
- num_ng.pushw(num_ng.dict->OOV());
+ //all further computation will rely on lower-order counts
+ num_ng.pushw(text.at(text.size()-1));
}else {
num_ng.pushw(text.at(text.size()-2));
+ num_ng.pushw(text.at(text.size()-1));
+ den_ng.pushw(text.at(text.size()-2));
}
- num_ng.pushw(text.at(text.size()-1));
-
- // lm model for the denominator is assumed to be a 2-gram lm, hence den_gr have only size 2 (one word and one topic); here we insert one word
- den_ng.pushw(text.at(text.size()-1));
}
void ContextSimilarity::create_topic_ngram(const string_vec_t& text, const std::string& topic, ngram& num_ng, ngram& den_ng)
@@ -206,8 +209,8 @@ namespace irstlm {
double ContextSimilarity::get_topic_similarity(string_vec_t text, const std::string& topic)
{
- ngram num_ng(m_num_lm->getDict());
- ngram den_ng(m_den_lm->getDict());
+ ngram num_ng(m_hwk_ngt->getDict());
+ ngram den_ng(m_hk_ngt->getDict());
create_topic_ngram(text, topic, num_ng, den_ng);
@@ -215,13 +218,86 @@ namespace irstlm {
}
double ContextSimilarity::get_topic_similarity(ngram& num_ng, ngram& den_ng)
- {
- double num_pr=m_num_lm->clprob(num_ng);
- double den_pr=m_den_lm->clprob(den_ng);
- VERBOSE(4, "num_ng:|" << num_ng << "| pr:" << num_pr << std::endl);
- VERBOSE(4, "den_ng:|" << den_ng << "| pr:" << den_pr << std::endl);
- return num_pr - den_pr;
- // return m_lm->clprob(num_ng) - m_lm->clprob(den_ng);
+ {
+ VERBOSE(2, "double ContextSimilarity::get_topic_similarity(ngram& num_ng, ngram& den_ng) with num_ng:|" << num_ng << "| den_ng:|" << den_ng << "|" << std::endl);
+
+ double num_log_pr, den_log_pr;
+
+ double c_hk=m_smoothing, c_h=m_smoothing * m_topic_size;
+ double c_hwk=m_smoothing, c_hw=m_smoothing * m_topic_size;
+
+ if (den_ng.size == m_hk_order){//we rely on counts(h,k) and counts(h)
+ if (m_hk_ngt->get(den_ng)) { c_hk += den_ng.freq; }
+ if (m_hk_ngt->get(den_ng,2,1)) { c_h += den_ng.freq; }
+ }else{//we actually rely on counts(k) and counts()
+ /*
+ if (m_k_ngt->get(den_ng)) { c_hk += den_ng.freq; }
+ c_h += m_hk_ngt->getDict()->totfreq();
+ */
+ c_hk += m_hk_ngt->getDict()->freq(*(den_ng.wordp(1)));
+ c_h += m_k_ngt->getDict()->totfreq();
+ }
+ den_log_pr = log10(c_hk) - log10(c_h);
+ VERBOSE(3, "c_hk:" << c_hk << " c_h:" << c_h << std::endl);
+
+
+ if (num_reliable(num_ng)){
+ if (num_ng.size == m_hwk_order){ //we rely on counts(h,w,k) and counts(h,w)
+ if (m_hwk_ngt->get(num_ng)) { c_hwk += num_ng.freq; }
+ if (m_hwk_ngt->get(num_ng,3,2)) { c_hw += num_ng.freq; }
+ }else{ //we actually rely on counts(h,k) and counts(h)
+ if (m_hk_ngt->get(num_ng)) { c_hwk += num_ng.freq; }
+ if (m_hk_ngt->get(num_ng,3,2)) { c_hw += num_ng.freq; }
+ }
+ num_log_pr = log10(c_hwk) - log10(c_hw);
+ VERBOSE(3, "c_hwk:" << c_hwk << " c_hw:" << c_hw << std::endl);
+ }else{
+ num_log_pr = -log10(m_topic_size);
+ }
+
+ VERBOSE(3, "num_log_pr:" << num_log_pr << " den_log_pr:" << den_log_pr << std::endl);
+ return num_log_pr - den_log_pr;
+ }
+
+ bool ContextSimilarity::num_reliable(ngram& num_ng)
+ {
+ VERBOSE(2, "ContextSimilarity::num_reliable(ngram& num_ng) num_ng:|" << num_ng << "| thr:" << m_threshold_on_h << "|" << std::endl);
+ if (num_ng.size < 2){
+ //num_ng has size lower than expected (2)
+ //in this case we will rely on counts(h, topic) instead of counts(h, w, topic)
+ VERBOSE(3, "num_ng:|" << num_ng << "| has size lower than expected (2) TRUE" << std::endl);
+ return true;
+ }
+ if (m_hwk_ngt->get(num_ng,3,2) && (num_ng.freq > m_threshold_on_h)){
+ VERBOSE(3, "num_ng:|" << num_ng << "| thr:" << m_threshold_on_h << " TRUE" << std::endl);
+ return true;
+ }else{
+ VERBOSE(3, "num_ng:|" << num_ng << "| thr:" << m_threshold_on_h << " FALSE" << std::endl);
+ return false;
+ }
+ }
+
+
+ bool ContextSimilarity::den_reliable(ngram& den_ng)
+ {
+ VERBOSE(2, "ContextSimilarity::den_reliable(ngram& den_ng) den_ng:|" << den_ng << "| thr:" << m_threshold_on_h << "|" << std::endl);
+
+ if (den_ng.size < 1){
+ //den_ng has size lower than expected (1)
+ //in this case we will rely on counts(topic) instead of counts(h, topic)
+ VERBOSE(3, "den_ng:|" << den_ng << "| has size lower than expected (1) TRUE" << std::endl);
+ return true;
+ }
+ den_ng.pushc(0);
+ if (m_hk_ngt->get(den_ng,2,1) && (den_ng.freq > m_threshold_on_h)){
+ den_ng.shift();
+ VERBOSE(3, "den_ng:|" << den_ng << "| thr:" << m_threshold_on_h << " TRUE" << std::endl);
+ return true;
+ }else{
+ den_ng.shift();
+ VERBOSE(3, "den_ng:|" << den_ng << "| thr:" << m_threshold_on_h << " FALSE" << std::endl);
+ return false;
+ }
}
}//namespace irstlm
diff --git a/src/context-similarity.h b/src/context-similarity.h
index d646fb6..32324c6 100644
--- a/src/context-similarity.h
+++ b/src/context-similarity.h
@@ -28,30 +28,41 @@
#include <stdlib.h>
#include <string>
#include <math.h>
-#include <set>
#include "util.h"
#include "dictionary.h"
#include "n_gram.h"
+#include "ngramtable.h"
#include "lmContainer.h"
class ngram;
namespace irstlm {
- #define topic_map_delimiter1 ':'
- #define topic_map_delimiter2 ','
- #define SIMILARITY_LOWER_BOUND -10000
+#define topic_map_delimiter1 ':'
+#define topic_map_delimiter2 ','
+#define SIMILARITY_LOWER_BOUND -10000
typedef std::map< std::string, float > topic_map_t;
- typedef std::set< std::string > topic_dict_t;
-
+
class ContextSimilarity
{
private:
- lmContainer* m_num_lm; // P(topic | h' w)
- lmContainer* m_den_lm; // P(topic | h')
- topic_dict_t m_lm_topic_dict; //the dictionary of the topics seen in the language model
+ ngramtable* m_hwk_ngt; // counts(h, w, topic)
+ ngramtable* m_hk_ngt; // counts(h, topic)
+ ngramtable* m_k_ngt; // counts(topic)
+ int m_k_order; //order of m_k_ngt
+ int m_hk_order; //order of m_hk_ngt
+ int m_hwk_order; //order of m_hwk_ngt
+
+ int m_topic_size; //number of topics in the model
+
topic_map_t topic_map;
+ int m_threshold_on_h; //frequency threshold on h to allow computation of similarity scores
+ double m_smoothing; //smoothing value to sum to the counts to avoid zero-prob; implements a sort of shift-beta smoothing
+
+ //flag for enabling/disabling context_similarity scores
+ // if disabled, context_similarity is 0.0 and topic_scores distribution is empty
+ bool m_active;
void create_ngram(const string_vec_t& text, ngram& num_ng, ngram& den_ng);
void add_topic(const std::string& topic, ngram& num_ng, ngram& den_ng);
@@ -60,6 +71,9 @@ namespace irstlm {
double get_topic_similarity(string_vec_t text, const std::string& topic);
double get_topic_similarity(ngram& num_ng, ngram& den_ng);
+ bool num_reliable(ngram& num_ng);
+ bool den_reliable(ngram& den_ng);
+
public:
ContextSimilarity(const std::string &dictfile, const std::string &num_modelfile, const std::string &den_modelfile);
~ContextSimilarity();
@@ -69,7 +83,27 @@ namespace irstlm {
void add_topic_scores(topic_map_t& map, topic_map_t& tmp_map);
void print_topic_scores(topic_map_t& map);
- double score(string_vec_t& text, topic_map_t& topic_weights);
+ double get_context_similarity(string_vec_t& text, topic_map_t& topic_weights);
+
+ int get_Threshold_on_H(){
+ return m_threshold_on_h;
+ }
+ void set_Threshold_on_H(int val){
+ m_threshold_on_h = val;
+ }
+ double get_SmoothingValue(){
+ return m_smoothing;
+ }
+ void set_SmoothingValue(double val){
+ m_smoothing = val;
+ }
+ bool is_Active(){
+ return m_active;
+ }
+ void set_Active(bool val){
+ m_active = val;
+ }
+
};
}
diff --git a/src/lmContextDependent.cpp b/src/lmContextDependent.cpp
index ebd6ed7..83d23a0 100644
--- a/src/lmContextDependent.cpp
+++ b/src/lmContextDependent.cpp
@@ -61,7 +61,7 @@ namespace irstlm {
void lmContextDependent::load(const std::string &filename,int mmap)
{
VERBOSE(2,"lmContextDependent::load(const std::string &filename,int memmap)" << std::endl);
- VERBOSE(2," filename:|" << filename << "|" << std::endl);
+ VERBOSE(2,"configuration file:|" << filename << "|" << std::endl);
dictionary_upperbound=1000000;
int memmap=mmap;
@@ -71,22 +71,22 @@ namespace irstlm {
VERBOSE(0, "filename:|" << filename << "|" << std::endl);
char line[MAX_LINE];
- const char* words[LMCONFIGURE_MAX_TOKEN];
+ const char* words[LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN];
int tokenN;
inp.getline(line,MAX_LINE,'\n');
- tokenN = parseWords(line,words,LMCONFIGURE_MAX_TOKEN);
+ tokenN = parseWords(line,words,LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN);
if (tokenN != 1 || ((strcmp(words[0],"LMCONTEXTDEPENDENT") != 0) && (strcmp(words[0],"lmcontextdependent")!=0)))
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_num_model topic_nden_model");
+ error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold");
//reading ngram-based LM
inp.getline(line,BUFSIZ,'\n');
tokenN = parseWords(line,words,1);
if(tokenN < 1 || tokenN > 1) {
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_num_model topic_nden_model");
+ error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold");
}
- VERBOSE(0, "modelfile:|" << words[0] << "|" << std::endl);
+ VERBOSE(0, "model_w:|" << words[0] << "|" << std::endl);
//checking the language model type
m_lm=lmContainer::CreateLanguageModel(words[0],ngramcache_load_factor, dictionary_load_factor);
@@ -101,24 +101,31 @@ namespace irstlm {
//reading topic model
inp.getline(line,BUFSIZ,'\n');
- tokenN = parseWords(line,words,4);
+ tokenN = parseWords(line,words,LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN);
- if(tokenN < 4 || tokenN > 4) {
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight topic_dict topic_num_model topic_nden_model");
+ if(tokenN < 5 || tokenN > LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN) {
+ error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold");
}
//loading topic model and initialization
m_similaritymodel_weight = (float) atof(words[0]);
- std::string _dict = words[1];
- std::string _num_lm = words[2];
- std::string _den_lm = words[3];
- m_similaritymodel = new ContextSimilarity(_dict, _num_lm, _den_lm);
+ std::string _k_ngt = words[1];
+ std::string _hk_ngt = words[2];
+ std::string _hwk_ngt = words[3];
+ int _thr = atoi(words[4]);
+ double _smoothing = 0.1;
+ if (tokenN == 6){ _smoothing = atof(words[5]); }
+ m_similaritymodel = new ContextSimilarity(_k_ngt, _hk_ngt, _hwk_ngt);
+ m_similaritymodel->set_Threshold_on_H(_thr);
+ m_similaritymodel->set_SmoothingValue(_smoothing);
inp.close();
- VERBOSE(0, "topic_dict:|" << _dict << "|" << std::endl);
- VERBOSE(0, "topic_num_model:|" << _num_lm << "|" << std::endl);
- VERBOSE(0, "topic_den_model:|" << _den_lm << "|" << std::endl);
+ VERBOSE(0, "model_k:|" << _k_ngt << "|" << std::endl);
+ VERBOSE(0, "model_hk:|" << _hk_ngt << "|" << std::endl);
+ VERBOSE(0, "model_hwk:|" << _hwk_ngt << "|" << std::endl);
+ VERBOSE(0, "topic_threshold_on_h:|" << m_similaritymodel->get_Threshold_on_H() << "|" << std::endl);
+ VERBOSE(0, "shift-beta smoothing on counts:|" << m_similaritymodel->get_SmoothingValue() << "|" << std::endl);
}
void lmContextDependent::GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line)
@@ -127,9 +134,6 @@ namespace irstlm {
if (pos != std::string::npos){ // context_delimiter is found
sentence = line.substr(0, pos);
line.erase(0, pos + context_delimiter.length());
- VERBOSE(0,"pos:|" << pos << "|" << std::endl);
- VERBOSE(0,"sentence:|" << sentence << "|" << std::endl);
- VERBOSE(0,"line:|" << line << "|" << std::endl);
//getting context string;
context = line;
@@ -137,38 +141,41 @@ namespace irstlm {
sentence = line;
context = "";
}
+ VERBOSE(1,"line:|" << line << "|" << std::endl);
+ VERBOSE(1,"sentence:|" << sentence << "|" << std::endl);
+ VERBOSE(1,"context:|" << context << "|" << std::endl);
}
double lmContextDependent::lprob(ngram ng, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
{
- string_vec_t text; // replace with the text passed as parameter
- double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
- double similarity_score = m_similaritymodel->score(text, topic_weights);
- double ret_logprob = lm_logprob;
- if (similarity_score != SIMILARITY_LOWER_BOUND){
- ret_logprob += m_similaritymodel_weight * similarity_score;
+ VERBOSE(2,"lmContextDependent::lprob(ngram ng, topic_map_t& topic_weights, ...)" << std::endl);
+ string_vec_t text;
+ if (ng.size>1){
+ text.push_back(ng.dict->decode(*ng.wordp(2)));
}
- VERBOSE(0, "lm_log10_pr:" << lm_logprob << " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_log10_pr:" << ret_logprob << std::endl);
+ text.push_back(ng.dict->decode(*ng.wordp(1)));
+ double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
+ double similarity_score = m_similaritymodel->get_context_similarity(text, topic_weights);
+ double ret_logprob = lm_logprob + m_similaritymodel_weight * similarity_score;
+ VERBOSE(3, "lm_log10_pr:" << lm_logprob << " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_log10_pr:" << ret_logprob << std::endl);
return ret_logprob;
}
double lmContextDependent::lprob(string_vec_t& text, topic_map_t& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
{
- VERBOSE(0,"lmContextDependent::lprob(string_vec_t& text, topic_map_t& topic_weights, " << std::endl);
+ VERBOSE(2,"lmContextDependent::lprob(string_vec_t& text, topic_map_t& topic_weights, ...)" << std::endl);
+
//create the actual ngram
ngram ng(dict);
ng.pushw(text);
- VERBOSE(0,"ng:|" << ng << "|" << std::endl);
+ VERBOSE(3,"ng:|" << ng << "|" << std::endl);
MY_ASSERT (ng.size == (int) text.size());
double lm_logprob = m_lm->clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
- double similarity_score = m_similaritymodel->score(text, topic_weights);
- double ret_logprob = lm_logprob;
- if (similarity_score != SIMILARITY_LOWER_BOUND){
- ret_logprob += m_similaritymodel_weight * similarity_score;
- }
- VERBOSE(0, "lm_log10_pr:" << lm_logprob << " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_log10_pr:" << ret_logprob << std::endl);
+ double similarity_score = m_similaritymodel->get_context_similarity(text, topic_weights);
+ double ret_logprob = lm_logprob + m_similaritymodel_weight * similarity_score;
+ VERBOSE(3, "lm_log10_pr:" << lm_logprob << " similarity_score:" << similarity_score << " m_similaritymodel_weight:" << m_similaritymodel_weight << " ret_log10_pr:" << ret_logprob << std::endl);
return ret_logprob;
}
diff --git a/src/lmContextDependent.h b/src/lmContextDependent.h
index 7b0c7a5..f6b6e85 100644
--- a/src/lmContextDependent.h
+++ b/src/lmContextDependent.h
@@ -60,7 +60,7 @@ namespace irstlm {
and a bigram-based topic model
*/
-#define LMCONFIGURE_MAX_TOKEN 3
+#define LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN 6
static const std::string context_delimiter="___CONTEXT___";
@@ -142,8 +142,6 @@ namespace irstlm {
return lprob(ng, topic_weights, bow, bol, maxsuffptr, statesize, extendible);
};
virtual double clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL){
-
- VERBOSE(0,"lmContainer::clprob(string_vec_t& text,...." << std::endl);
return lprob(text, topic_weights, bow, bol, maxsuffptr, statesize, extendible);
};
@@ -202,6 +200,10 @@ namespace irstlm {
inline virtual bool is_OOV(int code) { //returns true if the word is OOV for each subLM
return m_lm->is_OOV(code);
}
+
+ inline void set_Active(bool value){
+ m_similaritymodel->set_Active(value);
+ }
};
}//namespace irstlm
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list