[irstlm] 04/10: added supporting function; code cleanup;

Tue May 17 07:46:54 UTC 2016

This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to annotated tag staticLM
in repository irstlm.

commit 60431b7bf5ea949d9dab9f3f27edf6d934196b35
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date:   Sat Oct 3 12:43:04 2015 +0200

    added supporting function; code cleanup;
---
 src/lmInterpolation.cpp | 398 +++++++++++++++++++++++-------------------------
 src/lmInterpolation.h   |  15 +-
 2 files changed, 203 insertions(+), 210 deletions(-)

diff --git a/src/lmInterpolation.cpp b/src/lmInterpolation.cpp
index af181f0..356289f 100644
--- a/src/lmInterpolation.cpp
+++ b/src/lmInterpolation.cpp
@@ -23,7 +23,6 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <iostream>
-#include <sstream>
 #include <stdexcept>
 #include <string>
 #include "lmContainer.h"
@@ -31,232 +30,213 @@
 #include "util.h"
 
 using namespace std;
-	
 
 namespace irstlm {
-lmInterpolation::lmInterpolation(float nlf, float dlf)
-{
-  ngramcache_load_factor = nlf;
-  dictionary_load_factor = dlf;
-	
-  order=0;
-  memmap=0;
-  isInverted=false;
-}
-
-void lmInterpolation::load(const std::string &filename,int mmap)
-{
-  VERBOSE(2,"lmInterpolation::load(const std::string &filename,int memmap)" << std::endl);
-  VERBOSE(2," filename:|" << filename << "|" << std::endl);
-	
-	
-  dictionary_upperbound=1000000;
-  int memmap=mmap;
-	
-	
-  dict=new dictionary((char *)NULL,1000000,dictionary_load_factor);
-	
-  //get info from the configuration file
-  fstream inp(filename.c_str(),ios::in|ios::binary);
-	
-  char line[MAX_LINE];
-  const char* words[LMINTERPOLATION_MAX_TOKEN];
-  int tokenN;
-  inp.getline(line,MAX_LINE,'\n');
-  tokenN = parseWords(line,words,LMINTERPOLATION_MAX_TOKEN);
-	
-  if (tokenN != 2 || ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0))){
-		std::stringstream ss_msg;
-		ss_msg << "ERROR: wrong header format of configuration file" << std::endl;
-		ss_msg << "correct format: LMINTERPOLATION number_of_models" << std::endl;
-		ss_msg << "weight_of_LM_1 filename_of_LM_1" << std::endl;
-		ss_msg << "weight_of_LM_2 filename_of_LM_2" << std::endl;
-		exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
+	lmInterpolation::lmInterpolation(float nlf, float dlf)
+	{
+		ngramcache_load_factor = nlf;
+		dictionary_load_factor = dlf;
+		
+		order=0;
+		memmap=0;
+		isInverted=false;
 	}
-  m_number_lm = atoi(words[1]);
-	
-	/*
-	 Although the weights can assume any real value from a computational point of view,
-	 the model assumes that the weights must be larger than or equal to 0.0.
-	 Moreover, if the weight is 0.0, the probability of the corresponding LM is not computed for improve efficiency
-	*/
-  m_weight.resize(m_number_lm);
-  m_file.resize(m_number_lm);
-  m_isinverted.resize(m_number_lm);
-  m_lm.resize(m_number_lm);
 	
-  VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_number_lm:"<< m_number_lm << std::endl;);
-	
-  dict->incflag(1);
-  for (int i=0; i<m_number_lm; i++) {
-    inp.getline(line,BUFSIZ,'\n');
-    tokenN = parseWords(line,words,3);
+	void lmInterpolation::load(const std::string &filename,int mmap)
+	{
+		VERBOSE(2,"lmInterpolation::load(const std::string &filename,int memmap)" << std::endl);
+		VERBOSE(2," filename:|" << filename << "|" << std::endl);
 		
-    if(tokenN < 2 || tokenN >3) {
-			std::stringstream ss_msg;
-			ss_msg << "ERROR: wrong header format of configuration file" << std::endl;
-			ss_msg << "correct format: LMINTERPOLATION number_of_models" << std::endl;
-			ss_msg << "weight_of_LM_1 filename_of_LM_1" << std::endl;
-			ss_msg << "weight_of_LM_2 filename_of_LM_2" << std::endl;
-			exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
-    }
 		
-		//check whether the (textual) LM has to be loaded as inverted
-    m_isinverted[i] = false;
-    if(tokenN == 3) {
-      if (strcmp(words[2],"inverted") == 0)
-        m_isinverted[i] = true;
-    }
-    VERBOSE(2,"i:" << i << " m_isinverted[i]:" << m_isinverted[i] << endl);
+		dictionary_upperbound=1000000;
+		int memmap=mmap;
 		
-		//The model requires that the weights must be larger than or equal to 0.0.
-    m_weight[i] = (float) atof(words[0]);
-    if(m_weight[i] < 0.0) {
-			std::stringstream ss_msg;
-			ss_msg << "ERROR: weight for the LM " << i << " is negative" << std::endl;
-			exit_error(IRSTLM_ERROR_MODEL,ss_msg.str());
-    }
 		
-    m_file[i] = words[1];
-    VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_file:"<< words[1] << std::endl;);
+		dict=new dictionary((char *)NULL,1000000,dictionary_load_factor);
 		
-    m_lm[i] = load_lm(i,memmap,ngramcache_load_factor,dictionary_load_factor);
-		//set the actual value for inverted flag, which is known only after loading the lM
-    m_isinverted[i] = m_lm[i]->is_inverted();
+		//get info from the configuration file
+		fstream inp(filename.c_str(),ios::in|ios::binary);
 		
-    dictionary *_dict=m_lm[i]->getDict();
-    for (int j=0; j<_dict->size(); j++) {
-      dict->encode(_dict->decode(j));
-    }
-  }
-  getDict()->genoovcode();
-	
-  getDict()->incflag(1);
-  inp.close();
-	
-  int maxorder = 0;
-  for (int i=0; i<m_number_lm; i++) {
-    maxorder = (maxorder > m_lm[i]->maxlevel())?maxorder:m_lm[i]->maxlevel();
-  }
-	
-  if (order == 0) {
-    order = maxorder;
-    std::cerr << "order is not set; reset to the maximum order of LMs: " << order << std::endl;
-  } else if (order > maxorder) {
-    order = maxorder;
-    std::cerr << "order is too high; reset to the maximum order of LMs: " << order << std::endl;
-  }
-  maxlev=order;
-}
-
-lmContainer* lmInterpolation::load_lm(int i,int memmap, float nlf, float dlf)
-{
-  //checking the language model type
-  lmContainer* lmt=lmContainer::CreateLanguageModel(m_file[i],nlf,dlf);
-	
-  //let know that table has inverted n-grams
-  lmt->is_inverted(m_isinverted[i]);  //set inverted flag for each LM
-	
-  lmt->setMaxLoadedLevel(requiredMaxlev);
-	
-  lmt->load(m_file[i], memmap);
-	
-  lmt->init_caches(lmt->maxlevel());
-  return lmt;
-}
-
-
-double lmInterpolation::clprob(ngram ng, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
-{
-	
-  double pr=0.0;
-  double _logpr;
-	
-  char* _maxsuffptr=NULL,*actualmaxsuffptr=NULL;
-  unsigned int _statesize=0,actualstatesize=0;
-  int _bol=0,actualbol=MAX_NGRAM;
-  double _bow=0.0,actualbow=0.0; 
-	bool _extendible=false;
-  bool actualextendible=false;
-	
-  for (size_t i=0; i<m_lm.size(); i++) {
-		if (m_weight[i] > 0.0){//the probability of the corresponding LM is computed only if the weight is larger than 0.0, otherwise it is skipped for efficiency
-			ngram _ng(m_lm[i]->getDict());
-			_ng.trans(ng);
-			_logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,&_extendible);
+		char line[MAX_LINE];
+		const char* words[LMINTERPOLATION_MAX_TOKEN];
+		int tokenN;
+		inp.getline(line,MAX_LINE,'\n');
+		tokenN = parseWords(line,words,LMINTERPOLATION_MAX_TOKEN);
+		
+		if (tokenN != 2 || ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0))){
+			exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
+		}
+		m_number_lm = atoi(words[1]);
+		
+		m_weight.resize(m_number_lm);
+		m_file.resize(m_number_lm);
+		m_isinverted.resize(m_number_lm);
+		m_lm.resize(m_number_lm);
+		
+		VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_number_lm:"<< m_number_lm << std::endl;);
+		
+		dict->incflag(1);
+		for (int i=0; i<m_number_lm; i++) {
+			inp.getline(line,BUFSIZ,'\n');
+			tokenN = parseWords(line,words,3);
+			
+			if(tokenN < 2 || tokenN >3) {
+				exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
+			}
 			
-			/*
-			 cerr.precision(10);
-			 std::cerr << " LM " << i << " weight:" << m_weight[i] << std::endl;
-			 std::cerr << " LM " << i << " log10 logpr:" << _logpr<< std::endl;
-			 std::cerr << " LM " << i << " pr:" << pow(10.0,_logpr) << std::endl;
-			 std::cerr << " _statesize:" << _statesize << std::endl;
-			 std::cerr << " _bow:" << _bow << std::endl;
-			 std::cerr << " _bol:" << _bol << std::endl;
-			 */
+			//check whether the (textual) LM has to be loaded as inverted
+			m_isinverted[i] = false;
+			if(tokenN == 3) {
+				if (strcmp(words[2],"inverted") == 0)
+					m_isinverted[i] = true;
+			}
+			VERBOSE(2,"i:" << i << " m_isinverted[i]:" << m_isinverted[i] << endl);
 			
-			//TO CHECK the following claims
-			//What is the statesize of a LM interpolation? The largest _statesize among the submodels
-			//What is the maxsuffptr of a LM interpolation? The _maxsuffptr of the submodel with the largest _statesize
-			//What is the bol of a LM interpolation? The smallest _bol among the submodels
-			//What is the bow of a LM interpolation? The weighted sum of the bow of the submodels
-			//What is the prob of a LM interpolation? The weighted sum of the prob of the submodels
-			//What is the extendible flag of a LM interpolation? true if the extendible flag is one for any LM
+			m_weight[i] = (float) atof(words[0]);
+			m_file[i] = words[1];
+			VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_file:"<< words[1] << std::endl;);
 			
-			pr+=m_weight[i]*pow(10.0,_logpr);
-			actualbow+=m_weight[i]*pow(10.0,_bow);
+			m_lm[i] = load_lm(i,memmap,ngramcache_load_factor,dictionary_load_factor);
+			//set the actual value for inverted flag, which is known only after loading the lM
+			m_isinverted[i] = m_lm[i]->is_inverted();
 			
-			if(_statesize > actualstatesize || i == 0) {
-				actualmaxsuffptr = _maxsuffptr;
-				actualstatesize = _statesize;
-			}
-			if (_bol < actualbol) {
-				actualbol=_bol; //backoff limit of LM[i]
-			}
-			if (_extendible) {
-				actualextendible=true; //set extendible flag to true if the ngram is extendible for any LM
+			dictionary *_dict=m_lm[i]->getDict();
+			for (int j=0; j<_dict->size(); j++) {
+				dict->encode(_dict->decode(j));
 			}
 		}
+		dict->genoovcode();
+		inp.close();
+		
+		int maxorder = 0;
+		for (int i=0; i<m_number_lm; i++) {
+			maxorder = (maxorder > m_lm[i]->maxlevel())?maxorder:m_lm[i]->maxlevel();
+		}
+		
+		if (order == 0) {
+			order = maxorder;
+			VERBOSE(3, "order is not set; reset to the maximum order of LMs: " << order << std::endl);
+		} else if (order > maxorder) {
+			order = maxorder;
+			VERBOSE(3, "order is too high; reset to the maximum order of LMs: " << order << std::endl);
+		}
+		maxlev=order;
 	}
-  if (bol) *bol=actualbol;
-  if (bow) *bow=log(actualbow);
-  if (maxsuffptr) *maxsuffptr=actualmaxsuffptr;
-  if (statesize) *statesize=actualstatesize;
-  if (extendible) {
-    *extendible=actualextendible;
-		//    delete _extendible;
-  }
 	
-  /*
-	 if (statesize) std::cerr << " statesize:" << *statesize << std::endl;
-	 if (bow) std::cerr << " bow:" << *bow << std::endl;
-	 if (bol) std::cerr << " bol:" << *bol << std::endl;
-	 */
-  return log(pr)/M_LN10;
-}
-
-double lmInterpolation::clprob(int* codes, int sz, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
-{
+	lmContainer* lmInterpolation::load_lm(int i,int memmap, float nlf, float dlf)
+	{
+		//checking the language model type
+		lmContainer* lmt=lmContainer::CreateLanguageModel(m_file[i],nlf,dlf);
+		
+		//let know that table has inverted n-grams
+		lmt->is_inverted(m_isinverted[i]);  //set inverted flag for each LM
+		
+		lmt->setMaxLoadedLevel(requiredMaxlev);
+		
+		lmt->load(m_file[i], memmap);
+		
+		lmt->init_caches(lmt->maxlevel());
+		return lmt;
+	}
+	
+	//return log10 prob of an ngram
+	double lmInterpolation::clprob(ngram ng, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
+	{
+		
+		double pr=0.0;
+		double _logpr;
+		
+		char* _maxsuffptr=NULL,*actualmaxsuffptr=NULL;
+		unsigned int _statesize=0,actualstatesize=0;
+		int _bol=0,actualbol=MAX_NGRAM;
+		double _bow=0.0,actualbow=0.0; 
+		bool _extendible=false;
+		bool actualextendible=false;
+		
+		for (size_t i=0; i<m_lm.size(); i++) {
+			
+			if (m_weight[i]>0.0){
+				ngram _ng(m_lm[i]->getDict());
+				_ng.trans(ng);
+				_logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,&_extendible);
+				
+				IFVERBOSE(3){
+					//cerr.precision(10);
+					VERBOSE(3," LM " << i << " weight:" << m_weight[i] << std::endl);
+					VERBOSE(3," LM " << i << " log10 logpr:" << _logpr<< std::endl);
+					VERBOSE(3," LM " << i << " pr:" << pow(10.0,_logpr) << std::endl);
+					VERBOSE(3," _statesize:" << _statesize << std::endl);
+					VERBOSE(3," _bow:" << _bow << std::endl);
+					VERBOSE(3," _bol:" << _bol << std::endl);
+				}
+				
+				/*
+				 //TO CHECK the following claims
+				 //What is the statesize of a LM interpolation? The largest _statesize among the submodels
+				 //What is the maxsuffptr of a LM interpolation? The _maxsuffptr of the submodel with the largest _statesize
+				 //What is the bol of a LM interpolation? The smallest _bol among the submodels
+				 //What is the bow of a LM interpolation? The weighted sum of the bow of the submodels
+				 //What is the prob of a LM interpolation? The weighted sum of the prob of the submodels
+				 //What is the extendible flag of a LM interpolation? true if the extendible flag is one for any LM
+				 */
+				
+				pr+=m_weight[i]*pow(10.0,_logpr);
+				actualbow+=m_weight[i]*pow(10.0,_bow);
+				
+				if(_statesize > actualstatesize || i == 0) {
+					actualmaxsuffptr = _maxsuffptr;
+					actualstatesize = _statesize;
+				}
+				if (_bol < actualbol) {
+					actualbol=_bol; //backoff limit of LM[i]
+				}
+				if (_extendible) {
+					actualextendible=true; //set extendible flag to true if the ngram is extendible for any LM
+				}
+			}
+		}
+		if (bol) *bol=actualbol;
+		if (bow) *bow=log(actualbow);
+		if (maxsuffptr) *maxsuffptr=actualmaxsuffptr;
+		if (statesize) *statesize=actualstatesize;
+		if (extendible) {
+			*extendible=actualextendible;
+			//    delete _extendible;
+		}
+		
+		if (statesize) VERBOSE(3, " statesize:" << *statesize << std::endl);
+		if (bow) VERBOSE(3, " bow:" << *bow << std::endl);
+		if (bol) VERBOSE(3, " bol:" << *bol << std::endl);
+		
+		return log10(pr);
+	}
 	
-  //create the actual ngram
-  ngram ong(dict);
-  ong.pushc(codes,sz);
-  MY_ASSERT (ong.size == sz);
+	double lmInterpolation::clprob(int* codes, int sz, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
+	{
+		
+		//create the actual ngram
+		ngram ong(dict);
+		ong.pushc(codes,sz);
+		MY_ASSERT (ong.size == sz);
+		
+		return clprob(ong, bow, bol, maxsuffptr, statesize, extendible);
+	}
 	
-  return clprob(ong, bow, bol, maxsuffptr, statesize, extendible);
-}
-
-double lmInterpolation::setlogOOVpenalty(int dub)
-{
-  MY_ASSERT(dub > dict->size());
-  double _logpr;
-  double OOVpenalty=0.0;
-  for (int i=0; i<m_number_lm; i++) {
-    m_lm[i]->setlogOOVpenalty(dub);  //set OOV Penalty for each LM
-    _logpr=m_lm[i]->getlogOOVpenalty();
-    OOVpenalty+=m_weight[i]*exp(_logpr);
-  }
-  logOOVpenalty=log(OOVpenalty);
-  return logOOVpenalty;
-}
+	double lmInterpolation::setlogOOVpenalty(int dub)
+	{
+		MY_ASSERT(dub > dict->size());
+		double _logpr;
+		double OOVpenalty=0.0;
+		for (int i=0; i<m_number_lm; i++) {
+			if (m_weight[i]>0.0){
+				m_lm[i]->setlogOOVpenalty(dub);  //set OOV Penalty for each LM
+				_logpr=m_lm[i]->getlogOOVpenalty(); // logOOV penalty is in log10
+				//    OOVpenalty+=m_weight[i]*exp(_logpr);
+				OOVpenalty+=m_weight[i]*exp(_logpr*M_LN10);  // logOOV penalty is in log10
+			}
+		}
+		//  logOOVpenalty=log(OOVpenalty);
+		logOOVpenalty=log10(OOVpenalty);
+		return logOOVpenalty;
+	}
 }//namespace irstlm
diff --git a/src/lmInterpolation.h b/src/lmInterpolation.h
index f5d2627..eb9edb5 100644
--- a/src/lmInterpolation.h
+++ b/src/lmInterpolation.h
@@ -114,7 +114,7 @@ public:
   }
 
   inline virtual void dictionary_incflag(const bool flag) {
-    dict->incflag(flag);
+		dict->incflag(flag);
   };
 
   inline virtual bool is_OOV(int code) { //returns true if the word is OOV for each subLM
@@ -124,6 +124,19 @@ public:
     }
     return true;
   }
+	
+	virtual int addWord(const char *w){
+		for (int i=0; i<m_number_lm; i++) {
+			m_lm[i]->getDict()->incflag(1);
+			m_lm[i]->getDict()->encode(w);
+			m_lm[i]->getDict()->incflag(0);
+		}
+		getDict()->incflag(1);
+		int c=getDict()->encode(w);
+		getDict()->incflag(0);
+		return c;
+	}
+	
 };
 }//namespace irstlm
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git