[irstlm] 44/78: topic scoring type is now a parameter to read from configuration file; code cleanup and optimization
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:47:04 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to tag adaptiveLM.v0.10
in repository irstlm.
commit 58742a2a6ef5f23adef8db537a359ea0a054d9c2
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Thu Nov 19 10:39:25 2015 +0100
topic scoring type is now a parameter to read from configuration file; code cleanup and optimization
---
src/context-similarity.cpp | 39 ++++++++++++++++++---------------
src/context-similarity.h | 23 ++++++++++++++++++++
src/lmContextDependent.cpp | 54 ++++++++++++++++++++++++++++++++++++----------
src/lmContextDependent.h | 2 +-
src/lmInterpolation.cpp | 19 ++++++++++++----
5 files changed, 104 insertions(+), 33 deletions(-)
diff --git a/src/context-similarity.cpp b/src/context-similarity.cpp
index 0f2af82..a4ac74c 100644
--- a/src/context-similarity.cpp
+++ b/src/context-similarity.cpp
@@ -56,7 +56,7 @@ namespace irstlm {
m_smoothing = 0.001;
m_threshold_on_h = 0;
m_active=true;
-
+ m_score_type = TOPIC_SCORE_TYPE_2;
m_topic_size = m_k_ngt->getDict()->size();
VERBOSE(1, "There are " << m_topic_size << " topics in the model" << std::endl);
@@ -217,15 +217,20 @@ namespace irstlm {
}
double ContextSimilarity::topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2){
-#ifdef OPTION_1
- return topic_score_option1(ng, ngt, ngt2);
-#elif OPTION_2
- return topic_score_option2(ng, ngt, ngt2);
-#elif OPTION_3
- return topic_score_option3(ng, ngt, ngt2);
-#else
- return topic_score_option0(ng, ngt, ngt2);
-#endif
+ switch (m_score_type){
+ case TOPIC_SCORE_TYPE_0:
+ return topic_score_option0(ng, ngt, ngt2);
+ case TOPIC_SCORE_TYPE_1:
+ return topic_score_option1(ng, ngt, ngt2);
+ case TOPIC_SCORE_TYPE_2:
+ return topic_score_option2(ng, ngt, ngt2);
+ case TOPIC_SCORE_TYPE_3:
+ return topic_score_option3(ng, ngt, ngt2);
+ default:
+ std::stringstream ss_msg;
+ ss_msg << "Topic score type " << m_score_type << " is unknown.";
+ exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
+ }
}
double ContextSimilarity::topic_score_option0(ngram& ng, ngramtable& ngt, ngramtable& ngt2)
@@ -234,11 +239,11 @@ namespace irstlm {
UNUSED(ngt2);
VERBOSE(2, "double ContextSimilarity::topic_score_option0(ngram& ng, ngramtable& ngt) with ng:|" << ng << "|" << std::endl);
- //option 0: uniform (not considering log function)
+ //approximation 0: uniform (not considering log function)
//P(k|hw) = 1/number_of_topics
double log_pr = -log(m_topic_size)/M_LN10;
- VERBOSE(3, "option0: return: " << log_pr<< std::endl);
+ VERBOSE(3, "score_type:0 return:" << log_pr<< std::endl);
return log_pr;
}
@@ -264,14 +269,14 @@ namespace irstlm {
double c_xk2, c_x2;
get_counts(ng2, ngt2, c_xk2, c_x2);
- //option 1: (not considering log function)
+ //approximation 1: (not considering log function)
//P(k|hw)/sum_v P(k|hv) ~approx~ P(k|hw)/P(k|h) ~approx~ num_pr/den_pr
//num_pr = c'(hwk)/c'(hw)
//den_pr = c'(hk)/c'(h)
double den_log_pr = log10(c_xk2) - log10(c_x2);
double num_log_pr = log10(c_xk) - log10(c_x);
double log_pr = num_log_pr - den_log_pr;
- VERBOSE(3, "option1: num_log_pr:" << num_log_pr << " den_log_pr:" << den_log_pr << " return: " << log_pr << std::endl);
+ VERBOSE(3, "score_type:1 num_log_pr:" << num_log_pr << " den_log_pr:" << den_log_pr << " return:" << log_pr << std::endl);
return log_pr;
}
@@ -284,10 +289,10 @@ namespace irstlm {
double c_xk, c_x;
get_counts(ng, ngt, c_xk, c_x);
- //option 1: (not considering log function)
+ //approximation 2: (not considering log function)
//P(k|hw)/sum_v P(k|hv) ~approx~ P(k|hw)/P(k|h) ~approx~ c'(hwk)/c'(hw)
double log_pr = log10(c_xk) - log10(c_x);
- VERBOSE(3, "option2: log_pr:" << log_pr << " return: " << log_pr << std::endl);
+ VERBOSE(3, "score_type:2 log_pr:" << log_pr << " return:" << log_pr << std::endl);
return log_pr;
}
@@ -325,7 +330,7 @@ namespace irstlm {
double log_pr = logistic_function((c_xk*c_x2)/(c_x*c_xk2),1.0,1.0);
- VERBOSE(3, "option3: return: " << log_pr << std::endl);
+ VERBOSE(3, "score_type:3 return:" << log_pr << std::endl);
return log_pr;
}
diff --git a/src/context-similarity.h b/src/context-similarity.h
index 5e67553..b8684dd 100644
--- a/src/context-similarity.h
+++ b/src/context-similarity.h
@@ -27,6 +27,7 @@
#include <cstdlib>
#include <stdlib.h>
#include <string>
+#include <sstream>
#include <math.h>
#include "cmd.h"
#include "util.h"
@@ -41,6 +42,10 @@ namespace irstlm {
#define topic_map_delimiter1 ':'
#define topic_map_delimiter2 ','
#define SIMILARITY_LOWER_BOUND -10000
+#define TOPIC_SCORE_TYPE_0 0
+#define TOPIC_SCORE_TYPE_1 1
+#define TOPIC_SCORE_TYPE_2 2
+#define TOPIC_SCORE_TYPE_3 3
class ContextSimilarity
{
@@ -59,6 +64,7 @@ namespace irstlm {
topic_map_t topic_map;
int m_threshold_on_h; //frequency threshold on h to allow computation of similarity scores
double m_smoothing; //smoothing value to sum to the counts to avoid zero-prob; implements a sort of shift-beta smoothing
+ int m_score_type; //scoring type for computing the topic distribution, values are TOPIC_SCORE_TYPE_[0123]
//flag for enabling/disabling context_similarity scores
// if disabled, context_similarity is 0.0 and topic_scores distribution is empty
@@ -130,6 +136,23 @@ namespace irstlm {
m_active = val;
}
+ void set_Topic_Score_Type(int t){
+ switch (t){
+ case TOPIC_SCORE_TYPE_0:
+ case TOPIC_SCORE_TYPE_1:
+ case TOPIC_SCORE_TYPE_2:
+ case TOPIC_SCORE_TYPE_3:
+ m_score_type = t;
+ default:
+ std::stringstream ss_msg;
+ ss_msg << "Topic score type " << m_score_type << " is unknown.";
+ exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
+ }
+ }
+ int get_Topic_Score_Type(){
+ return m_score_type;
+ }
+
};
}
diff --git a/src/lmContextDependent.cpp b/src/lmContextDependent.cpp
index 39b0c95..cc52dd0 100644
--- a/src/lmContextDependent.cpp
+++ b/src/lmContextDependent.cpp
@@ -23,6 +23,7 @@
#include <cstdlib>
#include <stdlib.h>
#include <iostream>
+#include <sstream>
#include <stdexcept>
#include <string>
#include "lmContainer.h"
@@ -31,12 +32,6 @@
using namespace std;
-inline void error(const char* message)
-{
- std::cerr << message << "\n";
- throw std::runtime_error(message);
-}
-
namespace irstlm {
lmContextDependent::lmContextDependent(float nlf, float dlf)
@@ -66,6 +61,11 @@ namespace irstlm {
VERBOSE(2,"lmContextDependent::load(const std::string &filename,int memmap)" << std::endl);
VERBOSE(2,"configuration file:|" << filename << "|" << std::endl);
+ std::stringstream ss_format;
+
+ ss_format << "LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold [smoothing]" << std::endl;
+ ss_format << "or\nLMCONTEXTDEPENDENT TYPE score_type\nfilename_of_LM \nweight k_model hk_model hwk_model pruning_threshold [smoothing]" << std::endl;
+
dictionary_upperbound=1000000;
int memmap=mmap;
@@ -78,16 +78,40 @@ namespace irstlm {
int tokenN;
inp.getline(line,MAX_LINE,'\n');
tokenN = parseWords(line,words,LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN);
-
- if (tokenN != 1 || ((strcmp(words[0],"LMCONTEXTDEPENDENT") != 0) && (strcmp(words[0],"lmcontextdependent")!=0)))
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold");
+
+ bool error=false;
+ if ((tokenN!=1) || (tokenN!=3)){
+ error=true;
+ }else if ((strcmp(words[0],"LMCONTEXTDEPENDENT") != 0) && (strcmp(words[0],"lmcontextdependent")!=0)) {
+ error=true;
+ }else if ((tokenN==3) && ((strcmp(words[1],"TYPE") != 0) && (strcmp(words[1],"type") != 0))){
+ error=true;
+ }
+ if (error){
+ std::stringstream ss_msg;
+ ss_msg << "ERROR: wrong header format of configuration file\ncorrect format:" << ss_format;
+ exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
+ }
+
+ int _score_type;
+ if (tokenN==1){
+ _score_type = TOPIC_SCORE_TYPE_2;
+ }else{
+ _score_type = atoi(words[2]);
+ }
//reading ngram-based LM
inp.getline(line,BUFSIZ,'\n');
tokenN = parseWords(line,words,1);
if(tokenN < 1 || tokenN > 1) {
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold");
+ error=true;
}
+ if (error){
+ std::stringstream ss_msg;
+ ss_msg << "ERROR: wrong header format of configuration file\ncorrect format:" << ss_format;
+ exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
+ }
+
VERBOSE(0, "model_w:|" << words[0] << "|" << std::endl);
//checking the language model type
@@ -107,8 +131,13 @@ namespace irstlm {
tokenN = parseWords(line,words,LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN);
if(tokenN < 5 || tokenN > LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN) {
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold");
+ error= true;
}
+ if (error){
+ std::stringstream ss_msg;
+ ss_msg << "ERROR: wrong header format of configuration file\ncorrect format:" << ss_format;
+ exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
+ }
//loading topic model and initialization
m_similaritymodel_weight = (float) atof(words[0]);
@@ -116,11 +145,14 @@ namespace irstlm {
std::string _hk_ngt = words[2];
std::string _hwk_ngt = words[3];
int _thr = atoi(words[4]);
+
double _smoothing = 0.1;
if (tokenN == 6){ _smoothing = atof(words[5]); }
+
m_similaritymodel = new ContextSimilarity(_k_ngt, _hk_ngt, _hwk_ngt);
m_similaritymodel->set_Threshold_on_H(_thr);
m_similaritymodel->set_SmoothingValue(_smoothing);
+ m_similaritymodel->set_Topic_Score_Type(_score_type);
inp.close();
diff --git a/src/lmContextDependent.h b/src/lmContextDependent.h
index 4d9c515..07cd736 100644
--- a/src/lmContextDependent.h
+++ b/src/lmContextDependent.h
@@ -191,7 +191,7 @@ namespace irstlm {
void set_Normalized(bool val){
m_normalization = val;
}
-
+
};
}//namespace irstlm
diff --git a/src/lmInterpolation.cpp b/src/lmInterpolation.cpp
index c1dcc94..c64f843 100644
--- a/src/lmInterpolation.cpp
+++ b/src/lmInterpolation.cpp
@@ -48,7 +48,12 @@ namespace irstlm {
{
VERBOSE(2,"lmInterpolation::load(const std::string &filename,int memmap)" << std::endl);
VERBOSE(2," filename:|" << filename << "|" << std::endl);
-
+
+ std::stringstream ss_format;
+
+ ss_format << "LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1 [inverted]\nweight_of_LM_2 filename_of_LM_2 [inverted]\n...\n";
+ ss_format << "or\nLMINTERPOLATION number_of_models MAP\nweight_of_LM_1 name_LM_1 filename_of_LM_1\nweight_of_LM_2 name_LM_2 filename_of_LM_2 [inverted]\n...\n";
+
dictionary_upperbound=1000000;
int memmap=mmap;
@@ -74,8 +79,9 @@ namespace irstlm {
}
if (error){
- exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format:\nLMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1 [inverted]\nweight_of_LM_2 filename_of_LM_2\nor\nLMINTERPOLATION number_of_models MAP\nweight_of_LM_1 name_LM_1 filename_of_LM_1\nweight_of_LM_2 name_LM_2 filename_of_LM_2");
-
+ std::stringstream ss_msg;
+ ss_msg << "ERROR: wrong header format of configuration file\ncorrect format:" << ss_format;
+ exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
}
size_t idx_weight, idx_file, idx_name, idx_inverted, idx_size;
@@ -111,7 +117,12 @@ namespace irstlm {
tokenN = parseWords(line,words,3);
if(tokenN < idx_file || tokenN > idx_size) {
- exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format:\nLMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1 [inverted]\nweight_of_LM_2 filename_of_LM_2\nor\nLMINTERPOLATION number_of_models MAP\nweight_of_LM_1 name_LM_1 filename_of_LM_1\nweight_of_LM_2 name_LM_2 filename_of_LM_2");
+ error = true;
+ }
+ if (error){
+ std::stringstream ss_msg;
+ ss_msg << "ERROR: wrong header format of configuration file\ncorrect format:" << ss_format;
+ exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
}
//check whether the (textual) LM has to be loaded as inverted
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list