[irstlm] 111/126: added supporting functions
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:50 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.
commit cfcdeff1f8f25efba1b2695ed87b693fa817c094
Author: nicolabertoldi <bertoldi at fbk.eu>
Date: Tue Oct 6 09:53:28 2015 +0200
added supporting functions
---
src/lmInterpolation.cpp | 221 +++++++++++++++++++++++++++++++++++++++++++++---
src/lmInterpolation.h | 15 +++-
2 files changed, 225 insertions(+), 11 deletions(-)
diff --git a/src/lmInterpolation.cpp b/src/lmInterpolation.cpp
index a214f78..987932c 100644
--- a/src/lmInterpolation.cpp
+++ b/src/lmInterpolation.cpp
@@ -30,14 +30,9 @@
#include "util.h"
using namespace std;
-
-inline void error(const char* message)
-{
- std::cerr << message << "\n";
- throw std::runtime_error(message);
-}
namespace irstlm {
+<<<<<<< HEAD
lmInterpolation::lmInterpolation(float nlf, float dlf)
{
ngramcache_load_factor = nlf;
@@ -218,12 +213,188 @@ double lmInterpolation::clprob(ngram ng, double* bow,int* bol,char** maxsuffptr,
double lmInterpolation::clprob(int* codes, int sz, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
{
+=======
+ lmInterpolation::lmInterpolation(float nlf, float dlf)
+ {
+ ngramcache_load_factor = nlf;
+ dictionary_load_factor = dlf;
+
+ order=0;
+ memmap=0;
+ isInverted=false;
+ }
+
+ void lmInterpolation::load(const std::string &filename,int mmap)
+ {
+ VERBOSE(2,"lmInterpolation::load(const std::string &filename,int memmap)" << std::endl);
+ VERBOSE(2," filename:|" << filename << "|" << std::endl);
+
+
+ dictionary_upperbound=1000000;
+ int memmap=mmap;
+
+
+ dict=new dictionary((char *)NULL,1000000,dictionary_load_factor);
+
+ //get info from the configuration file
+ fstream inp(filename.c_str(),ios::in|ios::binary);
+
+ char line[MAX_LINE];
+ const char* words[LMINTERPOLATION_MAX_TOKEN];
+ int tokenN;
+ inp.getline(line,MAX_LINE,'\n');
+ tokenN = parseWords(line,words,LMINTERPOLATION_MAX_TOKEN);
+
+ if (tokenN != 2 || ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0))){
+ exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
+ }
+ m_number_lm = atoi(words[1]);
+
+ m_weight.resize(m_number_lm);
+ m_file.resize(m_number_lm);
+ m_isinverted.resize(m_number_lm);
+ m_lm.resize(m_number_lm);
+
+ VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_number_lm:"<< m_number_lm << std::endl;);
+
+ dict->incflag(1);
+ for (int i=0; i<m_number_lm; i++) {
+ inp.getline(line,BUFSIZ,'\n');
+ tokenN = parseWords(line,words,3);
+
+ if(tokenN < 2 || tokenN >3) {
+ exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
+ }
+
+ //check whether the (textual) LM has to be loaded as inverted
+ m_isinverted[i] = false;
+ if(tokenN == 3) {
+ if (strcmp(words[2],"inverted") == 0)
+ m_isinverted[i] = true;
+ }
+ VERBOSE(2,"i:" << i << " m_isinverted[i]:" << m_isinverted[i] << endl);
+
+ m_weight[i] = (float) atof(words[0]);
+ m_file[i] = words[1];
+ VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_file:"<< words[1] << std::endl;);
+
+ m_lm[i] = load_lm(i,memmap,ngramcache_load_factor,dictionary_load_factor);
+ //set the actual value for inverted flag, which is known only after loading the LM
+ m_isinverted[i] = m_lm[i]->is_inverted();
+
+ dictionary *_dict=m_lm[i]->getDict();
+ for (int j=0; j<_dict->size(); j++) {
+ dict->encode(_dict->decode(j));
+ }
+ }
+ dict->genoovcode();
+ inp.close();
+
+ int maxorder = 0;
+ for (int i=0; i<m_number_lm; i++) {
+ maxorder = (maxorder > m_lm[i]->maxlevel())?maxorder:m_lm[i]->maxlevel();
+ }
+
+ if (order == 0) {
+ order = maxorder;
+ VERBOSE(3, "order is not set; reset to the maximum order of LMs: " << order << std::endl);
+ } else if (order > maxorder) {
+ order = maxorder;
+ VERBOSE(3, "order is too high; reset to the maximum order of LMs: " << order << std::endl);
+ }
+ maxlev=order;
+ }
- //create the actual ngram
- ngram ong(dict);
- ong.pushc(codes,sz);
- MY_ASSERT (ong.size == sz);
+ lmContainer* lmInterpolation::load_lm(int i,int memmap, float nlf, float dlf)
+ {
+ //checking the language model type
+ lmContainer* lmt=lmContainer::CreateLanguageModel(m_file[i],nlf,dlf);
+
+ //let the LM know that the table has inverted n-grams
+ lmt->is_inverted(m_isinverted[i]); //set inverted flag for each LM
+
+ lmt->setMaxLoadedLevel(requiredMaxlev);
+
+ lmt->load(m_file[i], memmap);
+
+ lmt->init_caches(lmt->maxlevel());
+ return lmt;
+ }
+>>>>>>> 007a63db35eac95af9099907cf287ace231176fb
+ //return log10 prob of an ngram
+ double lmInterpolation::clprob(ngram ng, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
+ {
+
+ double pr=0.0;
+ double _logpr;
+
+ char* _maxsuffptr=NULL,*actualmaxsuffptr=NULL;
+ unsigned int _statesize=0,actualstatesize=0;
+ int _bol=0,actualbol=MAX_NGRAM;
+ double _bow=0.0,actualbow=0.0;
+ bool _extendible=false;
+ bool actualextendible=false;
+
+ for (size_t i=0; i<m_lm.size(); i++) {
+
+ if (m_weight[i]>0.0){
+ ngram _ng(m_lm[i]->getDict());
+ _ng.trans(ng);
+ _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,&_extendible);
+
+ IFVERBOSE(3){
+ //cerr.precision(10);
+ VERBOSE(3," LM " << i << " weight:" << m_weight[i] << std::endl);
+ VERBOSE(3," LM " << i << " log10 logpr:" << _logpr<< std::endl);
+ VERBOSE(3," LM " << i << " pr:" << pow(10.0,_logpr) << std::endl);
+ VERBOSE(3," _statesize:" << _statesize << std::endl);
+ VERBOSE(3," _bow:" << _bow << std::endl);
+ VERBOSE(3," _bol:" << _bol << std::endl);
+ }
+
+ /*
+ //TO CHECK the following claims
+ //What is the statesize of a LM interpolation? The largest _statesize among the submodels
+ //What is the maxsuffptr of a LM interpolation? The _maxsuffptr of the submodel with the largest _statesize
+ //What is the bol of a LM interpolation? The smallest _bol among the submodels
+ //What is the bow of a LM interpolation? The weighted sum of the bow of the submodels
+ //What is the prob of a LM interpolation? The weighted sum of the prob of the submodels
+ //What is the extendible flag of a LM interpolation? true if the extendible flag is one for any LM
+ */
+
+ pr+=m_weight[i]*pow(10.0,_logpr);
+ actualbow+=m_weight[i]*pow(10.0,_bow);
+
+ if(_statesize > actualstatesize || i == 0) {
+ actualmaxsuffptr = _maxsuffptr;
+ actualstatesize = _statesize;
+ }
+ if (_bol < actualbol) {
+ actualbol=_bol; //backoff limit of LM[i]
+ }
+ if (_extendible) {
+ actualextendible=true; //set extendible flag to true if the ngram is extendible for any LM
+ }
+ }
+ }
+ if (bol) *bol=actualbol;
+ if (bow) *bow=log(actualbow);
+ if (maxsuffptr) *maxsuffptr=actualmaxsuffptr;
+ if (statesize) *statesize=actualstatesize;
+ if (extendible) {
+ *extendible=actualextendible;
+ // delete _extendible;
+ }
+
+ if (statesize) VERBOSE(3, " statesize:" << *statesize << std::endl);
+ if (bow) VERBOSE(3, " bow:" << *bow << std::endl);
+ if (bol) VERBOSE(3, " bol:" << *bol << std::endl);
+
+ return log10(pr);
+ }
+
+<<<<<<< HEAD
return clprob(ong, bow, bol, maxsuffptr, statesize, extendible);
}
@@ -242,4 +413,34 @@ double lmInterpolation::setlogOOVpenalty(int dub)
logOOVpenalty=log10(OOVpenalty);
return logOOVpenalty;
}
+=======
+ double lmInterpolation::clprob(int* codes, int sz, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
+ {
+
+ //create the actual ngram
+ ngram ong(dict);
+ ong.pushc(codes,sz);
+ MY_ASSERT (ong.size == sz);
+
+ return clprob(ong, bow, bol, maxsuffptr, statesize, extendible);
+ }
+
+ double lmInterpolation::setlogOOVpenalty(int dub)
+ {
+ MY_ASSERT(dub > dict->size());
+ double _logpr;
+ double OOVpenalty=0.0;
+ for (int i=0; i<m_number_lm; i++) {
+ if (m_weight[i]>0.0){
+ m_lm[i]->setlogOOVpenalty(dub); //set OOV Penalty for each LM
+ _logpr=m_lm[i]->getlogOOVpenalty(); // logOOV penalty is in log10
+ // OOVpenalty+=m_weight[i]*exp(_logpr);
+ OOVpenalty+=m_weight[i]*exp(_logpr*M_LN10); // logOOV penalty is in log10
+ }
+ }
+ // logOOVpenalty=log(OOVpenalty);
+ logOOVpenalty=log10(OOVpenalty);
+ return logOOVpenalty;
+ }
+>>>>>>> 007a63db35eac95af9099907cf287ace231176fb
}//namespace irstlm
diff --git a/src/lmInterpolation.h b/src/lmInterpolation.h
index f5d2627..eb9edb5 100644
--- a/src/lmInterpolation.h
+++ b/src/lmInterpolation.h
@@ -114,7 +114,7 @@ public:
}
inline virtual void dictionary_incflag(const bool flag) {
- dict->incflag(flag);
+ dict->incflag(flag);
};
inline virtual bool is_OOV(int code) { //returns true if the word is OOV for each subLM
@@ -124,6 +124,19 @@ public:
}
return true;
}
+
+ virtual int addWord(const char *w){
+ for (int i=0; i<m_number_lm; i++) {
+ m_lm[i]->getDict()->incflag(1);
+ m_lm[i]->getDict()->encode(w);
+ m_lm[i]->getDict()->incflag(0);
+ }
+ getDict()->incflag(1);
+ int c=getDict()->encode(w);
+ getDict()->incflag(0);
+ return c;
+ }
+
};
}//namespace irstlm
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list