[irstlm] 04/10: added supporting function; code cleanup;
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:54 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to annotated tag staticLM
in repository irstlm.
commit 60431b7bf5ea949d9dab9f3f27edf6d934196b35
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Sat Oct 3 12:43:04 2015 +0200
added supporting function; code cleanup;
---
src/lmInterpolation.cpp | 398 +++++++++++++++++++++++-------------------------
src/lmInterpolation.h | 15 +-
2 files changed, 203 insertions(+), 210 deletions(-)
diff --git a/src/lmInterpolation.cpp b/src/lmInterpolation.cpp
index af181f0..356289f 100644
--- a/src/lmInterpolation.cpp
+++ b/src/lmInterpolation.cpp
@@ -23,7 +23,6 @@
#include <cstdlib>
#include <stdlib.h>
#include <iostream>
-#include <sstream>
#include <stdexcept>
#include <string>
#include "lmContainer.h"
@@ -31,232 +30,213 @@
#include "util.h"
using namespace std;
-
namespace irstlm {
-lmInterpolation::lmInterpolation(float nlf, float dlf)
-{
- ngramcache_load_factor = nlf;
- dictionary_load_factor = dlf;
-
- order=0;
- memmap=0;
- isInverted=false;
-}
-
-void lmInterpolation::load(const std::string &filename,int mmap)
-{
- VERBOSE(2,"lmInterpolation::load(const std::string &filename,int memmap)" << std::endl);
- VERBOSE(2," filename:|" << filename << "|" << std::endl);
-
-
- dictionary_upperbound=1000000;
- int memmap=mmap;
-
-
- dict=new dictionary((char *)NULL,1000000,dictionary_load_factor);
-
- //get info from the configuration file
- fstream inp(filename.c_str(),ios::in|ios::binary);
-
- char line[MAX_LINE];
- const char* words[LMINTERPOLATION_MAX_TOKEN];
- int tokenN;
- inp.getline(line,MAX_LINE,'\n');
- tokenN = parseWords(line,words,LMINTERPOLATION_MAX_TOKEN);
-
- if (tokenN != 2 || ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0))){
- std::stringstream ss_msg;
- ss_msg << "ERROR: wrong header format of configuration file" << std::endl;
- ss_msg << "correct format: LMINTERPOLATION number_of_models" << std::endl;
- ss_msg << "weight_of_LM_1 filename_of_LM_1" << std::endl;
- ss_msg << "weight_of_LM_2 filename_of_LM_2" << std::endl;
- exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
+ lmInterpolation::lmInterpolation(float nlf, float dlf)
+ {
+ ngramcache_load_factor = nlf;
+ dictionary_load_factor = dlf;
+
+ order=0;
+ memmap=0;
+ isInverted=false;
}
- m_number_lm = atoi(words[1]);
-
- /*
- Although the weights can assume any real value from a computational point of view,
- the model assumes that the weights must be larger than or equal to 0.0.
- Moreover, if the weight is 0.0, the probability of the corresponding LM is not computed for improve efficiency
- */
- m_weight.resize(m_number_lm);
- m_file.resize(m_number_lm);
- m_isinverted.resize(m_number_lm);
- m_lm.resize(m_number_lm);
- VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_number_lm:"<< m_number_lm << std::endl;);
-
- dict->incflag(1);
- for (int i=0; i<m_number_lm; i++) {
- inp.getline(line,BUFSIZ,'\n');
- tokenN = parseWords(line,words,3);
+ void lmInterpolation::load(const std::string &filename,int mmap)
+ {
+ VERBOSE(2,"lmInterpolation::load(const std::string &filename,int memmap)" << std::endl);
+ VERBOSE(2," filename:|" << filename << "|" << std::endl);
- if(tokenN < 2 || tokenN >3) {
- std::stringstream ss_msg;
- ss_msg << "ERROR: wrong header format of configuration file" << std::endl;
- ss_msg << "correct format: LMINTERPOLATION number_of_models" << std::endl;
- ss_msg << "weight_of_LM_1 filename_of_LM_1" << std::endl;
- ss_msg << "weight_of_LM_2 filename_of_LM_2" << std::endl;
- exit_error(IRSTLM_ERROR_DATA,ss_msg.str());
- }
- //check whether the (textual) LM has to be loaded as inverted
- m_isinverted[i] = false;
- if(tokenN == 3) {
- if (strcmp(words[2],"inverted") == 0)
- m_isinverted[i] = true;
- }
- VERBOSE(2,"i:" << i << " m_isinverted[i]:" << m_isinverted[i] << endl);
+ dictionary_upperbound=1000000;
+ int memmap=mmap;
- //The model requires that the weights must be larger than or equal to 0.0.
- m_weight[i] = (float) atof(words[0]);
- if(m_weight[i] < 0.0) {
- std::stringstream ss_msg;
- ss_msg << "ERROR: weight for the LM " << i << " is negative" << std::endl;
- exit_error(IRSTLM_ERROR_MODEL,ss_msg.str());
- }
- m_file[i] = words[1];
- VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_file:"<< words[1] << std::endl;);
+ dict=new dictionary((char *)NULL,1000000,dictionary_load_factor);
- m_lm[i] = load_lm(i,memmap,ngramcache_load_factor,dictionary_load_factor);
- //set the actual value for inverted flag, which is known only after loading the lM
- m_isinverted[i] = m_lm[i]->is_inverted();
+ //get info from the configuration file
+ fstream inp(filename.c_str(),ios::in|ios::binary);
- dictionary *_dict=m_lm[i]->getDict();
- for (int j=0; j<_dict->size(); j++) {
- dict->encode(_dict->decode(j));
- }
- }
- getDict()->genoovcode();
-
- getDict()->incflag(1);
- inp.close();
-
- int maxorder = 0;
- for (int i=0; i<m_number_lm; i++) {
- maxorder = (maxorder > m_lm[i]->maxlevel())?maxorder:m_lm[i]->maxlevel();
- }
-
- if (order == 0) {
- order = maxorder;
- std::cerr << "order is not set; reset to the maximum order of LMs: " << order << std::endl;
- } else if (order > maxorder) {
- order = maxorder;
- std::cerr << "order is too high; reset to the maximum order of LMs: " << order << std::endl;
- }
- maxlev=order;
-}
-
-lmContainer* lmInterpolation::load_lm(int i,int memmap, float nlf, float dlf)
-{
- //checking the language model type
- lmContainer* lmt=lmContainer::CreateLanguageModel(m_file[i],nlf,dlf);
-
- //let know that table has inverted n-grams
- lmt->is_inverted(m_isinverted[i]); //set inverted flag for each LM
-
- lmt->setMaxLoadedLevel(requiredMaxlev);
-
- lmt->load(m_file[i], memmap);
-
- lmt->init_caches(lmt->maxlevel());
- return lmt;
-}
-
-
-double lmInterpolation::clprob(ngram ng, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
-{
-
- double pr=0.0;
- double _logpr;
-
- char* _maxsuffptr=NULL,*actualmaxsuffptr=NULL;
- unsigned int _statesize=0,actualstatesize=0;
- int _bol=0,actualbol=MAX_NGRAM;
- double _bow=0.0,actualbow=0.0;
- bool _extendible=false;
- bool actualextendible=false;
-
- for (size_t i=0; i<m_lm.size(); i++) {
- if (m_weight[i] > 0.0){//the probability of the corresponding LM is computed only if the weight is larger than 0.0, otherwise it is skipped for efficiency
- ngram _ng(m_lm[i]->getDict());
- _ng.trans(ng);
- _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,&_extendible);
+ char line[MAX_LINE];
+ const char* words[LMINTERPOLATION_MAX_TOKEN];
+ int tokenN;
+ inp.getline(line,MAX_LINE,'\n');
+ tokenN = parseWords(line,words,LMINTERPOLATION_MAX_TOKEN);
+
+ if (tokenN != 2 || ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0))){
+ exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
+ }
+ m_number_lm = atoi(words[1]);
+
+ m_weight.resize(m_number_lm);
+ m_file.resize(m_number_lm);
+ m_isinverted.resize(m_number_lm);
+ m_lm.resize(m_number_lm);
+
+ VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_number_lm:"<< m_number_lm << std::endl;);
+
+ dict->incflag(1);
+ for (int i=0; i<m_number_lm; i++) {
+ inp.getline(line,BUFSIZ,'\n');
+ tokenN = parseWords(line,words,3);
+
+ if(tokenN < 2 || tokenN >3) {
+ exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
+ }
- /*
- cerr.precision(10);
- std::cerr << " LM " << i << " weight:" << m_weight[i] << std::endl;
- std::cerr << " LM " << i << " log10 logpr:" << _logpr<< std::endl;
- std::cerr << " LM " << i << " pr:" << pow(10.0,_logpr) << std::endl;
- std::cerr << " _statesize:" << _statesize << std::endl;
- std::cerr << " _bow:" << _bow << std::endl;
- std::cerr << " _bol:" << _bol << std::endl;
- */
+ //check whether the (textual) LM has to be loaded as inverted
+ m_isinverted[i] = false;
+ if(tokenN == 3) {
+ if (strcmp(words[2],"inverted") == 0)
+ m_isinverted[i] = true;
+ }
+ VERBOSE(2,"i:" << i << " m_isinverted[i]:" << m_isinverted[i] << endl);
- //TO CHECK the following claims
- //What is the statesize of a LM interpolation? The largest _statesize among the submodels
- //What is the maxsuffptr of a LM interpolation? The _maxsuffptr of the submodel with the largest _statesize
- //What is the bol of a LM interpolation? The smallest _bol among the submodels
- //What is the bow of a LM interpolation? The weighted sum of the bow of the submodels
- //What is the prob of a LM interpolation? The weighted sum of the prob of the submodels
- //What is the extendible flag of a LM interpolation? true if the extendible flag is one for any LM
+ m_weight[i] = (float) atof(words[0]);
+ m_file[i] = words[1];
+ VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_file:"<< words[1] << std::endl;);
- pr+=m_weight[i]*pow(10.0,_logpr);
- actualbow+=m_weight[i]*pow(10.0,_bow);
+ m_lm[i] = load_lm(i,memmap,ngramcache_load_factor,dictionary_load_factor);
+ //set the actual value for inverted flag, which is known only after loading the lM
+ m_isinverted[i] = m_lm[i]->is_inverted();
- if(_statesize > actualstatesize || i == 0) {
- actualmaxsuffptr = _maxsuffptr;
- actualstatesize = _statesize;
- }
- if (_bol < actualbol) {
- actualbol=_bol; //backoff limit of LM[i]
- }
- if (_extendible) {
- actualextendible=true; //set extendible flag to true if the ngram is extendible for any LM
+ dictionary *_dict=m_lm[i]->getDict();
+ for (int j=0; j<_dict->size(); j++) {
+ dict->encode(_dict->decode(j));
}
}
+ dict->genoovcode();
+ inp.close();
+
+ int maxorder = 0;
+ for (int i=0; i<m_number_lm; i++) {
+ maxorder = (maxorder > m_lm[i]->maxlevel())?maxorder:m_lm[i]->maxlevel();
+ }
+
+ if (order == 0) {
+ order = maxorder;
+ VERBOSE(3, "order is not set; reset to the maximum order of LMs: " << order << std::endl);
+ } else if (order > maxorder) {
+ order = maxorder;
+ VERBOSE(3, "order is too high; reset to the maximum order of LMs: " << order << std::endl);
+ }
+ maxlev=order;
}
- if (bol) *bol=actualbol;
- if (bow) *bow=log(actualbow);
- if (maxsuffptr) *maxsuffptr=actualmaxsuffptr;
- if (statesize) *statesize=actualstatesize;
- if (extendible) {
- *extendible=actualextendible;
- // delete _extendible;
- }
- /*
- if (statesize) std::cerr << " statesize:" << *statesize << std::endl;
- if (bow) std::cerr << " bow:" << *bow << std::endl;
- if (bol) std::cerr << " bol:" << *bol << std::endl;
- */
- return log(pr)/M_LN10;
-}
-
-double lmInterpolation::clprob(int* codes, int sz, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
-{
+ lmContainer* lmInterpolation::load_lm(int i,int memmap, float nlf, float dlf)
+ {
+ //checking the language model type
+ lmContainer* lmt=lmContainer::CreateLanguageModel(m_file[i],nlf,dlf);
+
+ //let know that table has inverted n-grams
+ lmt->is_inverted(m_isinverted[i]); //set inverted flag for each LM
+
+ lmt->setMaxLoadedLevel(requiredMaxlev);
+
+ lmt->load(m_file[i], memmap);
+
+ lmt->init_caches(lmt->maxlevel());
+ return lmt;
+ }
+
+ //return log10 prob of an ngram
+ double lmInterpolation::clprob(ngram ng, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
+ {
+
+ double pr=0.0;
+ double _logpr;
+
+ char* _maxsuffptr=NULL,*actualmaxsuffptr=NULL;
+ unsigned int _statesize=0,actualstatesize=0;
+ int _bol=0,actualbol=MAX_NGRAM;
+ double _bow=0.0,actualbow=0.0;
+ bool _extendible=false;
+ bool actualextendible=false;
+
+ for (size_t i=0; i<m_lm.size(); i++) {
+
+ if (m_weight[i]>0.0){
+ ngram _ng(m_lm[i]->getDict());
+ _ng.trans(ng);
+ _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,&_extendible);
+
+ IFVERBOSE(3){
+ //cerr.precision(10);
+ VERBOSE(3," LM " << i << " weight:" << m_weight[i] << std::endl);
+ VERBOSE(3," LM " << i << " log10 logpr:" << _logpr<< std::endl);
+ VERBOSE(3," LM " << i << " pr:" << pow(10.0,_logpr) << std::endl);
+ VERBOSE(3," _statesize:" << _statesize << std::endl);
+ VERBOSE(3," _bow:" << _bow << std::endl);
+ VERBOSE(3," _bol:" << _bol << std::endl);
+ }
+
+ /*
+ //TO CHECK the following claims
+ //What is the statesize of a LM interpolation? The largest _statesize among the submodels
+ //What is the maxsuffptr of a LM interpolation? The _maxsuffptr of the submodel with the largest _statesize
+ //What is the bol of a LM interpolation? The smallest _bol among the submodels
+ //What is the bow of a LM interpolation? The weighted sum of the bow of the submodels
+ //What is the prob of a LM interpolation? The weighted sum of the prob of the submodels
+ //What is the extendible flag of a LM interpolation? true if the extendible flag is one for any LM
+ */
+
+ pr+=m_weight[i]*pow(10.0,_logpr);
+ actualbow+=m_weight[i]*pow(10.0,_bow);
+
+ if(_statesize > actualstatesize || i == 0) {
+ actualmaxsuffptr = _maxsuffptr;
+ actualstatesize = _statesize;
+ }
+ if (_bol < actualbol) {
+ actualbol=_bol; //backoff limit of LM[i]
+ }
+ if (_extendible) {
+ actualextendible=true; //set extendible flag to true if the ngram is extendible for any LM
+ }
+ }
+ }
+ if (bol) *bol=actualbol;
+ if (bow) *bow=log(actualbow);
+ if (maxsuffptr) *maxsuffptr=actualmaxsuffptr;
+ if (statesize) *statesize=actualstatesize;
+ if (extendible) {
+ *extendible=actualextendible;
+ // delete _extendible;
+ }
+
+ if (statesize) VERBOSE(3, " statesize:" << *statesize << std::endl);
+ if (bow) VERBOSE(3, " bow:" << *bow << std::endl);
+ if (bol) VERBOSE(3, " bol:" << *bol << std::endl);
+
+ return log10(pr);
+ }
- //create the actual ngram
- ngram ong(dict);
- ong.pushc(codes,sz);
- MY_ASSERT (ong.size == sz);
+ double lmInterpolation::clprob(int* codes, int sz, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
+ {
+
+ //create the actual ngram
+ ngram ong(dict);
+ ong.pushc(codes,sz);
+ MY_ASSERT (ong.size == sz);
+
+ return clprob(ong, bow, bol, maxsuffptr, statesize, extendible);
+ }
- return clprob(ong, bow, bol, maxsuffptr, statesize, extendible);
-}
-
-double lmInterpolation::setlogOOVpenalty(int dub)
-{
- MY_ASSERT(dub > dict->size());
- double _logpr;
- double OOVpenalty=0.0;
- for (int i=0; i<m_number_lm; i++) {
- m_lm[i]->setlogOOVpenalty(dub); //set OOV Penalty for each LM
- _logpr=m_lm[i]->getlogOOVpenalty();
- OOVpenalty+=m_weight[i]*exp(_logpr);
- }
- logOOVpenalty=log(OOVpenalty);
- return logOOVpenalty;
-}
+ double lmInterpolation::setlogOOVpenalty(int dub)
+ {
+ MY_ASSERT(dub > dict->size());
+ double _logpr;
+ double OOVpenalty=0.0;
+ for (int i=0; i<m_number_lm; i++) {
+ if (m_weight[i]>0.0){
+ m_lm[i]->setlogOOVpenalty(dub); //set OOV Penalty for each LM
+ _logpr=m_lm[i]->getlogOOVpenalty(); // logOOV penalty is in log10
+ // OOVpenalty+=m_weight[i]*exp(_logpr);
+ OOVpenalty+=m_weight[i]*exp(_logpr*M_LN10); // logOOV penalty is in log10
+ }
+ }
+ // logOOVpenalty=log(OOVpenalty);
+ logOOVpenalty=log10(OOVpenalty);
+ return logOOVpenalty;
+ }
}//namespace irstlm
diff --git a/src/lmInterpolation.h b/src/lmInterpolation.h
index f5d2627..eb9edb5 100644
--- a/src/lmInterpolation.h
+++ b/src/lmInterpolation.h
@@ -114,7 +114,7 @@ public:
}
inline virtual void dictionary_incflag(const bool flag) {
- dict->incflag(flag);
+ dict->incflag(flag);
};
inline virtual bool is_OOV(int code) { //returns true if the word is OOV for each subLM
@@ -124,6 +124,19 @@ public:
}
return true;
}
+
+ virtual int addWord(const char *w){
+ for (int i=0; i<m_number_lm; i++) {
+ m_lm[i]->getDict()->incflag(1);
+ m_lm[i]->getDict()->encode(w);
+ m_lm[i]->getDict()->incflag(0);
+ }
+ getDict()->incflag(1);
+ int c=getDict()->encode(w);
+ getDict()->incflag(0);
+ return c;
+ }
+
};
}//namespace irstlm
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list