[irstlm] 44/78: topic scoring type is now a parameter to read from configuration file; code cleanup and optimization
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:47:04 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to tag adaptiveLM.v0.10
in repository irstlm.
commit 58742a2a6ef5f23adef8db537a359ea0a054d9c2
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Thu Nov 19 10:39:25 2015 +0100
topic scoring type is now a parameter to read from configuration file; code cleanup and optimization
---
src/context-similarity.cpp | 39 ++++++++++++++++++---------------
src/context-similarity.h | 23 ++++++++++++++++++++
src/lmContextDependent.cpp | 54 ++++++++++++++++++++++++++++++++++++----------
src/lmContextDependent.h | 2 +-
src/lmInterpolation.cpp | 19 ++++++++++++----
5 files changed, 104 insertions(+), 33 deletions(-)
diff --git a/src/context-similarity.cpp b/src/context-similarity.cpp
index 0f2af82..a4ac74c 100644
--- a/src/context-similarity.cpp
+++ b/src/context-similarity.cpp
@@ -56,7 +56,7 @@ namespace irstlm {
m_smoothing = 0.001;
m_threshold_on_h = 0;
m_active=true;
-
+ m_score_type = TOPIC_SCORE_TYPE_2;
m_topic_size = m_k_ngt->getDict()->size();
VERBOSE(1, "There are " << m_topic_size << " topics in the model" << std::endl);
@@ -217,15 +217,20 @@ namespace irstlm {
}
double ContextSimilarity::topic_score(ngram& ng, ngramtable& ngt, ngramtable& ngt2){
-#ifdef OPTION_1
- return topic_score_option1(ng, ngt, ngt2);
-#elif OPTION_2
- return topic_score_option2(ng, ngt, ngt2);
-#elif OPTION_3
- return topic_score_option3(ng, ngt, ngt2);
-#else
- return topic_score_option0(ng, ngt, ngt2);
-#endif
+ switch (m_score_type){
+ case TOPIC_SCORE_TYPE_0:
+ return topic_score_option0(ng, ngt, ngt2);
+ case TOPIC_SCORE_TYPE_1:
+ return topic_score_option1(ng, ngt, ngt2);
+ case TOPIC_SCORE_TYPE_2:
+ return topic_score_option2(ng, ngt, ngt2);
+ case TOPIC_SCORE_TYPE_3:
+ return topic_score_option3(ng, ngt, ngt2);
+ default:
+ std::stringstream ss_msg;
+ ss_msg << "Topic score type " << m_score_type << " is unknown.";
+ exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
+ }
}
double ContextSimilarity::topic_score_option0(ngram& ng, ngramtable& ngt, ngramtable& ngt2)
@@ -234,11 +239,11 @@ namespace irstlm {
UNUSED(ngt2);
VERBOSE(2, "double ContextSimilarity::topic_score_option0(ngram& ng, ngramtable& ngt) with ng:|" << ng << "|" << std::endl);
- //option 0: uniform (not considering log function)
+ //approximation 0: uniform (not considering log function)
//P(k|hw) = 1/number_of_topics
double log_pr = -log(m_topic_size)/M_LN10;
- VERBOSE(3, "option0: return: " << log_pr<< std::endl);
+ VERBOSE(3, "score_type:0 return:" << log_pr<< std::endl);
return log_pr;
}
@@ -264,14 +269,14 @@ namespace irstlm {
double c_xk2, c_x2;
get_counts(ng2, ngt2, c_xk2, c_x2);
- //option 1: (not considering log function)
+ //approximation 1: (not considering log function)
//P(k|hw)/sum_v P(k|hv) ~approx~ P(k|hw)/P(k|h) ~approx~ num_pr/den_pr
//num_pr = c'(hwk)/c'(hw)
//den_pr = c'(hk)/c'(h)
double den_log_pr = log10(c_xk2) - log10(c_x2);
double num_log_pr = log10(c_xk) - log10(c_x);
double log_pr = num_log_pr - den_log_pr;
- VERBOSE(3, "option1: num_log_pr:" << num_log_pr << " den_log_pr:" << den_log_pr << " return: " << log_pr << std::endl);
+ VERBOSE(3, "score_type:1 num_log_pr:" << num_log_pr << " den_log_pr:" << den_log_pr << " return:" << log_pr << std::endl);
return log_pr;
}
@@ -284,10 +289,10 @@ namespace irstlm {
double c_xk, c_x;
get_counts(ng, ngt, c_xk, c_x);
- //option 1: (not considering log function)
+ //approximation 2: (not considering log function)
//P(k|hw)/sum_v P(k|hv) ~approx~ P(k|hw)/P(k|h) ~approx~ c'(hwk)/c'(hw)
double log_pr = log10(c_xk) - log10(c_x);
- VERBOSE(3, "option2: log_pr:" << log_pr << " return: " << log_pr << std::endl);
+ VERBOSE(3, "score_type:2 log_pr:" << log_pr << " return:" << log_pr << std::endl);
return log_pr;
}
@@ -325,7 +330,7 @@ namespace irstlm {
double log_pr = logistic_function((c_xk*c_x2)/(c_x*c_xk2),1.0,1.0);
- VERBOSE(3, "option3: return: " << log_pr << std::endl);
+ VERBOSE(3, "score_type:3 return:" << log_pr << std::endl);
return log_pr;
}
diff --git a/src/context-similarity.h b/src/context-similarity.h
index 5e67553..b8684dd 100644
--- a/src/context-similarity.h
+++ b/src/context-similarity.h
@@ -27,6 +27,7 @@
#include <cstdlib>
#include <stdlib.h>
#include <string>
+#include <sstream>
#include <math.h>
#include "cmd.h"
#include "util.h"
@@ -41,6 +42,10 @@ namespace irstlm {
#define topic_map_delimiter1 ':'
#define topic_map_delimiter2 ','
#define SIMILARITY_LOWER_BOUND -10000
+#define TOPIC_SCORE_TYPE_0 0
+#define TOPIC_SCORE_TYPE_1 1
+#define TOPIC_SCORE_TYPE_2 2
+#define TOPIC_SCORE_TYPE_3 3
class ContextSimilarity
{
@@ -59,6 +64,7 @@ namespace irstlm {
topic_map_t topic_map;
int m_threshold_on_h; //frequency threshold on h to allow computation of similarity scores
double m_smoothing; //smoothing value to sum to the counts to avoid zero-prob; implements a sort of shift-beta smoothing
+ int m_score_type; //scoring type for computing the topic distribution, values are TOPIC_SCORE_TYPE_[0123]
//flag for enabling/disabling context_similarity scores
// if disabled, context_similarity is 0.0 and topic_scores distribution is empty
@@ -130,6 +136,23 @@ namespace irstlm {
m_active = val;
}
+ void set_Topic_Score_Type(int t){
+ switch (t){
+ case TOPIC_SCORE_TYPE_0:
+ case TOPIC_SCORE_TYPE_1:
+ case TOPIC_SCORE_TYPE_2:
+ case TOPIC_SCORE_TYPE_3:
+ m_score_type = t;
+ default:
+ std::stringstream ss_msg;
+ ss_msg << "Topic score type " << m_score_type << " is unknown.";
+ exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
+ }
+ }
+ int get_Topic_Score_Type(){
+ return m_score_type;
+ }
+
};
}
diff --git a/src/lmContextDependent.cpp b/src/lmContextDependent.cpp
index 39b0c95..cc52dd0 100644
--- a/src/lmContextDependent.cpp
+++ b/src/lmContextDependent.cpp
@@ -23,6 +23,7 @@
#include <cstdlib>
#include <stdlib.h>
#include <iostream>
+#include <sstream>
#include <stdexcept>
#include <string>
#include "lmContainer.h"
@@ -31,12 +32,6 @@
using namespace std;
-inline void error(const char* message)
-{
- std::cerr << message << "\n";
- throw std::runtime_error(message);
-}
-
namespace irstlm {
lmContextDependent::lmContextDependent(float nlf, float dlf)
@@ -66,6 +61,11 @@ namespace irstlm {
VERBOSE(2,"lmContextDependent::load(const std::string &filename,int memmap)" << std::endl);
VERBOSE(2,"configuration file:|" << filename << "|" << std::endl);
+ std::stringstream ss_format;
+
+ ss_format << "LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold [smoothing]" << std::endl;
+ ss_format << "or\nLMCONTEXTDEPENDENT TYPE score_type\nfilename_of_LM \nweight k_model hk_model hwk_model pruning_threshold [smoothing]" << std::endl;
+
dictionary_upperbound=1000000;
int memmap=mmap;
@@ -78,16 +78,40 @@ namespace irstlm {
int tokenN;
inp.getline(line,MAX_LINE,'\n');
tokenN = parseWords(line,words,LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN);
-
- if (tokenN != 1 || ((strcmp(words[0],"LMCONTEXTDEPENDENT") != 0) && (strcmp(words[0],"lmcontextdependent")!=0)))
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold");
+
+ bool error=false;
+ if ((tokenN!=1) || (tokenN!=3)){
+ error=true;
+ }else if ((strcmp(words[0],"LMCONTEXTDEPENDENT") != 0) && (strcmp(words[0],"lmcontextdependent")!=0)) {
+ error=true;
+ }else if ((tokenN==3) && ((strcmp(words[1],"TYPE") != 0) && (strcmp(words[1],"type") != 0))){
+ error=true;
+ }
+ if (error){
+ std::stringstream ss_msg;
+ ss_msg << "ERROR: wrong header format of configuration file\ncorrect format:" << ss_format;
+ exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
+ }
+
+ int _score_type;
+ if (tokenN==1){
+ _score_type = TOPIC_SCORE_TYPE_2;
+ }else{
+ _score_type = atoi(words[2]);
+ }
//reading ngram-based LM
inp.getline(line,BUFSIZ,'\n');
tokenN = parseWords(line,words,1);
if(tokenN < 1 || tokenN > 1) {
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold");
+ error=true;
}
+ if (error){
+ std::stringstream ss_msg;
+ ss_msg << "ERROR: wrong header format of configuration file\ncorrect format:" << ss_format;
+ exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
+ }
+
VERBOSE(0, "model_w:|" << words[0] << "|" << std::endl);
//checking the language model type
@@ -107,8 +131,13 @@ namespace irstlm {
tokenN = parseWords(line,words,LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN);
if(tokenN < 5 || tokenN > LMCONTEXTDEPENDENT_CONFIGURE_MAX_TOKEN) {
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nfilename_of_LM\nweight k_model hk_model hwk_model pruning_threshold");
+ error= true;
}
+ if (error){
+ std::stringstream ss_msg;
+ ss_msg << "ERROR: wrong header format of configuration file\ncorrect format:" << ss_format;
+ exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
+ }
//loading topic model and initialization
m_similaritymodel_weight = (float) atof(words[0]);
@@ -116,11 +145,14 @@ namespace irstlm {
std::string _hk_ngt = words[2];
std::string _hwk_ngt = words[3];
int _thr = atoi(words[4]);
+
double _smoothing = 0.1;
if (tokenN == 6){ _smoothing = atof(words[5]); }
+
m_similaritymodel = new ContextSimilarity(_k_ngt, _hk_ngt, _hwk_ngt);
m_similaritymodel->set_Threshold_on_H(_thr);
m_similaritymodel->set_SmoothingValue(_smoothing);
+ m_similaritymodel->set_Topic_Score_Type(_score_type);
inp.close();
diff --git a/src/lmContextDependent.h b/src/lmContextDependent.h
index 4d9c515..07cd736 100644
--- a/src/lmContextDependent.h
+++ b/src/lmContextDependent.h
@@ -191,7 +191,7 @@ namespace irstlm {
void set_Normalized(bool val){
m_normalization = val;
}
-
+
};
}//namespace irstlm
diff --git a/src/lmInterpolation.cpp b/src/lmInterpolation.cpp
index c1dcc94..c64f843 100644
--- a/src/lmInterpolation.cpp
+++ b/src/lmInterpolation.cpp
@@ -48,7 +48,12 @@ namespace irstlm {
{
VERBOSE(2,"lmInterpolation::load(const std::string &filename,int memmap)" << std::endl);
VERBOSE(2," filename:|" << filename << "|" << std::endl);
-
+
+ std::stringstream ss_format;
+
+ ss_format << "LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1 [inverted]\nweight_of_LM_2 filename_of_LM_2 [inverted]\n...\n";
+ ss_format << "or\nLMINTERPOLATION number_of_models MAP\nweight_of_LM_1 name_LM_1 filename_of_LM_1\nweight_of_LM_2 name_LM_2 filename_of_LM_2 [inverted]\n...\n";
+
dictionary_upperbound=1000000;
int memmap=mmap;
@@ -74,8 +79,9 @@ namespace irstlm {
}
if (error){
- exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format:\nLMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1 [inverted]\nweight_of_LM_2 filename_of_LM_2\nor\nLMINTERPOLATION number_of_models MAP\nweight_of_LM_1 name_LM_1 filename_of_LM_1\nweight_of_LM_2 name_LM_2 filename_of_LM_2");
-
+ std::stringstream ss_msg;
+ ss_msg << "ERROR: wrong header format of configuration file\ncorrect format:" << ss_format;
+ exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
}
size_t idx_weight, idx_file, idx_name, idx_inverted, idx_size;
@@ -111,7 +117,12 @@ namespace irstlm {
tokenN = parseWords(line,words,3);
if(tokenN < idx_file || tokenN > idx_size) {
- exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format:\nLMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1 [inverted]\nweight_of_LM_2 filename_of_LM_2\nor\nLMINTERPOLATION number_of_models MAP\nweight_of_LM_1 name_LM_1 filename_of_LM_1\nweight_of_LM_2 name_LM_2 filename_of_LM_2");
+ error = true;
+ }
+ if (error){
+ std::stringstream ss_msg;
+ ss_msg << "ERROR: wrong header format of configuration file\ncorrect format:" << ss_format;
+ exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
}
//check whether the (textual) LM has to be loaded as inverted
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list