[irstlm] 117/126: added functions to compute prob of ngram according to weights (passed as parameter), which are used by the interpolatedLM instead of those specified in the configuration file

Giulio Paci giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:51 UTC 2016


This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.

commit b1d4eb8450024f2f17a79d7a7a9e9bda414a03e8
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date:   Sun Oct 18 00:14:56 2015 +0200

    added functions to compute prob of ngram according to weights (passed as parameter), which are used by the interpolatedLM instead of those specified in the configuration file
---
 src/lmContainer.h       |  27 ++++++++--
 src/lmInterpolation.cpp | 141 ++++++++++++++++++++++++++++++++++++++++++++----
 src/lmInterpolation.h   |  11 +++-
 3 files changed, 164 insertions(+), 15 deletions(-)

diff --git a/src/lmContainer.h b/src/lmContainer.h
index 7a37ccc..b8ac737 100644
--- a/src/lmContainer.h
+++ b/src/lmContainer.h
@@ -48,7 +48,10 @@ typedef enum {LMT_FIND,    //!< search: find an entry
 } LMT_ACTION;
 
 namespace irstlm {
+	
+	
 	typedef std::map< std::string, float > topic_map_t;
+	typedef std::map< std::string, double > lm_map_t;
 	
 class lmContainer
 {
@@ -152,25 +155,37 @@ public:
     UNUSED(topic_weights);
     return clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
   };
-
   virtual double clprob(int* ng, int ngsize, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL) {
     UNUSED(topic_weights);
     return clprob(ng, ngsize, bow, bol, maxsuffptr, statesize, extendible);
   }
-	
 	virtual double clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) {
 		VERBOSE(3,"lmContainer::clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow,...." << std::endl);
     UNUSED(topic_weights);
     return clprob(text, bow, bol, maxsuffptr, statesize, extendible);
   }
 	
+  virtual double clprob(ngram ng, lm_map_t& lm_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL) {
+    UNUSED(lm_weights); // default implementation: lm_weights is not forwarded
+    return clprob(ng, bow, bol, maxsuffptr, statesize, extendible); // delegates to the ngram overload that takes no weights
+  };
+  virtual double clprob(int* ng, int ngsize, lm_map_t& lm_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL) {
+    UNUSED(lm_weights); // default implementation: lm_weights is not forwarded
+    return clprob(ng, ngsize, bow, bol, maxsuffptr, statesize, extendible); // delegates to the (int*, int) overload that takes no weights
+  }
+	virtual double clprob(string_vec_t& text, lm_map_t& lm_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) {
+		VERBOSE(3,"lmContainer::clprob(string_vec_t& text, lm_map_t& lm_weights, double* bow,...." << std::endl);
+    UNUSED(lm_weights); // default implementation: lm_weights is not forwarded
+    return clprob(text, bow, bol, maxsuffptr, statesize, extendible); // delegates to the string_vec_t overload that takes no weights
+  }
+	
+	
   virtual const char *cmaxsuffptr(ngram ng, unsigned int* statesize=NULL)
   {
     UNUSED(ng);
     UNUSED(statesize);
     return NULL;
   }
-
   virtual const char *cmaxsuffptr(int* ng, int ngsize, unsigned int* statesize=NULL)
   {
     UNUSED(ng);
@@ -178,6 +193,12 @@ public:
     UNUSED(statesize);
     return NULL;
   }
+  virtual const char *cmaxsuffptr(string_vec_t& text, unsigned int* statesize=NULL)
+  { // default implementation for a textual n-gram: both arguments are ignored
+    UNUSED(text);
+    UNUSED(statesize); // *statesize is left untouched
+    return NULL; // always NULL in this default implementation
+  }
 
 	
 	
diff --git a/src/lmInterpolation.cpp b/src/lmInterpolation.cpp
index 1fa0081..c6deb75 100644
--- a/src/lmInterpolation.cpp
+++ b/src/lmInterpolation.cpp
@@ -41,6 +41,7 @@ namespace irstlm {
 		order=0;
 		memmap=0;
 		isInverted=false;
+		m_name_flag=false;
 	}
 	
 	void lmInterpolation::load(const std::string &filename,int mmap)
@@ -60,13 +61,40 @@ namespace irstlm {
 		
 		char line[MAX_LINE];
 		const char* words[LMINTERPOLATION_MAX_TOKEN];
-		int tokenN;
+		size_t tokenN;
 		inp.getline(line,MAX_LINE,'\n');
 		tokenN = parseWords(line,words,LMINTERPOLATION_MAX_TOKEN);
+		bool error=false;
 		
-		if (tokenN != 2 || ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0))){
-			exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
+		if ((tokenN<2) || (tokenN>3)){
+			error=true;	
+		}else if ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0)) {
+			error=true;
+		}else if ((tokenN==3) && ((strcmp(words[2],"MAP") != 0) && (strcmp(words[2],"map") != 0))){
+			error=true;
 		}
+		
+		if (error){
+			exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format:\nLMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1 [inverted]\nweight_of_LM_2 filename_of_LM_2\nor\nLMINTERPOLATION number_of_models MAP\nweight_of_LM_1 name_LM_1 filename_of_LM_1\nweight_of_LM_2 name_LM_2 filename_of_LM_2");
+			
+		}
+		
+		size_t idx_weight, idx_file, idx_name, idx_inverted, idx_size;
+		if (tokenN==2){
+			m_name_flag=false;
+			idx_weight=0;
+			idx_file=1;
+			idx_inverted=2;
+			idx_size=3;
+		}else{
+			m_name_flag=true;
+			idx_weight=0;
+			idx_name=1;
+			idx_file=2;
+			idx_inverted=3;
+			idx_size=4;
+		}
+		
 		m_number_lm = atoi(words[1]);
 		
 		m_weight.resize(m_number_lm);
@@ -79,22 +107,23 @@ namespace irstlm {
 		dict->incflag(1);
 		for (int i=0; i<m_number_lm; i++) {
 			inp.getline(line,BUFSIZ,'\n');
-			tokenN = parseWords(line,words,3);
+			tokenN = parseWords(line,words,idx_size);
 			
-			if(tokenN < 2 || tokenN >3) {
-				exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
+			if(tokenN < idx_file || tokenN > idx_inverted) {
+				exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format:\nLMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1 [inverted]\nweight_of_LM_2 filename_of_LM_2\nor\nLMINTERPOLATION number_of_models MAP\nweight_of_LM_1 name_LM_1 filename_of_LM_1\nweight_of_LM_2 name_LM_2 filename_of_LM_2");
 			}
 			
 			//check whether the (textual) LM has to be loaded as inverted
 			m_isinverted[i] = false;
-			if(tokenN == 3) {
-				if (strcmp(words[2],"inverted") == 0)
+			if(tokenN == idx_size) {
+				if (strcmp(words[idx_inverted],"inverted") == 0)
 					m_isinverted[i] = true;
 			}
 			VERBOSE(2,"i:" << i << " m_isinverted[i]:" << m_isinverted[i] << endl);
 			
-			m_weight[i] = (float) atof(words[0]);
-			m_file[i] = words[1];
+			m_weight[i] = atof(words[idx_weight]);
+			m_name[words[idx_name]] = i;
+			m_file[i] = words[idx_file];
 			VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_file:"<< words[1] << std::endl;);
 			
 			m_lm[i] = load_lm(i,memmap,ngramcache_load_factor,dictionary_load_factor);
@@ -140,6 +169,98 @@ namespace irstlm {
 		return lmt;
 	}
 	
+	
+	void lmInterpolation::set_weight(const lm_map_t& map, std::vector<double>& weight){
+		for (lm_map_t::const_iterator it=map.begin(); it!=map.end();++it){ //for each (LM name, weight) pair: store the weight at the index m_name holds for that name
+			if (m_name.count(it->first)) weight[m_name[it->first]] = it->second; //names absent from m_name are skipped: a bare m_name[...] would insert them with value 0 and overwrite weight[0]
+		}
+	}
+	
+	//return log10 prob of an ngram, mixing the sub LMs with the per-call weights in lm_weights (keyed by LM name) instead of m_weight
+	double lmInterpolation::clprob(ngram ng, lm_map_t& lm_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
+	{
+		//bow, bol, maxsuffptr, statesize and extendible are optional outputs: each is written only when its pointer is non-NULL
+		double pr=0.0;
+		double _logpr;
+		
+		char* _maxsuffptr=NULL,*actualmaxsuffptr=NULL;
+		unsigned int _statesize=0,actualstatesize=0;
+		int _bol=0,actualbol=MAX_NGRAM;
+		double _bow=0.0,actualbow=0.0; 
+		bool _extendible=false;
+		bool actualextendible=false;
+		//per-call weights, one slot per sub LM; slots that set_weight does not fill stay 0.0 and those sub LMs are skipped below
+		std::vector<double> weight(m_number_lm);
+		set_weight(lm_weights,weight);
+		
+		for (size_t i=0; i<m_lm.size(); i++) {
+			
+			if (weight[i]>0.0){
+				ngram _ng(m_lm[i]->getDict());
+				_ng.trans(ng);
+				_logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,&_extendible);
+				
+				IFVERBOSE(3){
+					//cerr.precision(10);
+					VERBOSE(3," LM " << i << " weight:" << weight[i] << std::endl);
+					VERBOSE(3," LM " << i << " log10 logpr:" << _logpr<< std::endl);
+					VERBOSE(3," LM " << i << " pr:" << pow(10.0,_logpr) << std::endl);
+					VERBOSE(3," _statesize:" << _statesize << std::endl);
+					VERBOSE(3," _bow:" << _bow << std::endl);
+					VERBOSE(3," _bol:" << _bol << std::endl);
+				}
+				
+				/*
+				 //TO CHECK the following claims
+				 //What is the statesize of a LM interpolation? The largest _statesize among the submodels
+				 //What is the maxsuffptr of a LM interpolation? The _maxsuffptr of the submodel with the largest _statesize
+				 //What is the bol of a LM interpolation? The smallest _bol among the submodels
+				 //What is the bow of a LM interpolation? The weighted sum of the bow of the submodels
+				 //What is the prob of a LM interpolation? The weighted sum of the prob of the submodels
+				 //What is the extendible flag of a LM interpolation? true if the extendible flag is one for any LM
+				 */
+				
+				pr+=weight[i]*pow(10.0,_logpr);
+				actualbow+=weight[i]*pow(10.0,_bow); //same per-call weight as pr (was m_weight[i], the configuration-file weight)
+				
+				if(_statesize > actualstatesize || i == 0) {
+					actualmaxsuffptr = _maxsuffptr;
+					actualstatesize = _statesize;
+				}
+				if (_bol < actualbol) {
+					actualbol=_bol; //backoff limit of LM[i]
+				}
+				if (_extendible) {
+					actualextendible=true; //set extendible flag to true if the ngram is extendible for any LM
+				}
+			}
+		}
+		if (bol) *bol=actualbol;
+		if (bow) *bow=log(actualbow); //NOTE(review): natural log here, while _bow is exponentiated base 10 and pr is returned as log10 - confirm this is intended
+		if (maxsuffptr) *maxsuffptr=actualmaxsuffptr;
+		if (statesize) *statesize=actualstatesize;
+		if (extendible) {
+			*extendible=actualextendible;
+			//    delete _extendible;
+		}
+		
+		if (statesize) VERBOSE(3, " statesize:" << *statesize << std::endl);
+		if (bow) VERBOSE(3, " bow:" << *bow << std::endl);
+		if (bol) VERBOSE(3, " bol:" << *bol << std::endl);
+		
+		return log10(pr);
+	}
+	double lmInterpolation::clprob(int* codes, int sz, lm_map_t& lm_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
+	{
+		//variant for an n-gram given as an array of sz codes: builds an ngram over this object's dict and forwards to the ngram overload
+		//create the actual ngram
+		ngram ong(dict);
+		ong.pushc(codes,sz);
+		MY_ASSERT (ong.size == sz);
+		
+		return clprob(ong, lm_weights, bow, bol, maxsuffptr, statesize, extendible);
+	}
+	
 	//return log10 prob of an ngram
 	double lmInterpolation::clprob(ngram ng, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
 	{
diff --git a/src/lmInterpolation.h b/src/lmInterpolation.h
index eb9edb5..686ef9e 100644
--- a/src/lmInterpolation.h
+++ b/src/lmInterpolation.h
@@ -40,7 +40,7 @@ namespace irstlm {
 interpolation of several sub LMs
 */
 
-#define LMINTERPOLATION_MAX_TOKEN 3
+#define LMINTERPOLATION_MAX_TOKEN 5
 
 class lmInterpolation: public lmContainer
 {
@@ -50,12 +50,14 @@ class lmInterpolation: public lmContainer
   int dictionary_upperbound; //set by user
   double  logOOVpenalty; //penalty for OOV words (default 0)
   bool      isInverted;
+	bool m_name_flag; //flag for the presence of a map between name and lm
   int memmap;  //level from which n-grams are accessed via mmap
 
   std::vector<double> m_weight;
   std::vector<std::string> m_file;
   std::vector<bool> m_isinverted;
   std::vector<lmContainer*> m_lm;
+	lm_map_t m_name;
 
   int               maxlev; //maximun order of sub LMs;
 
@@ -69,12 +71,17 @@ public:
   lmInterpolation(float nlf=0.0, float dlfi=0.0);
   virtual ~lmInterpolation() {};
 
+	void set_weight(const lm_map_t& map, std::vector<double>& weight);
+	
   void load(const std::string &filename,int mmap=0);
   lmContainer* load_lm(int i, int memmap, float nlf, float dlf);
 
   virtual double clprob(ngram ng,            double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
   virtual double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
-
+	
+  virtual double clprob(ngram ng, lm_map_t& lm_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL);
+  virtual double clprob(int* ng, int ngsize, lm_map_t& lm_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL);
+	
   int maxlevel() const {
     return maxlev;
   };

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git



More information about the debian-science-commits mailing list