[irstlm] 117/126: added functions to compute prob of ngram according to weights (passed as parameter), which are used by the interpolatedLM instead of those specified in the configuration file
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:51 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.
commit b1d4eb8450024f2f17a79d7a7a9e9bda414a03e8
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Sun Oct 18 00:14:56 2015 +0200
added functions to compute prob of ngram according to weights (passed as parameter), which are used by the interpolatedLM instead of those specified in the configuration file
---
src/lmContainer.h | 27 ++++++++--
src/lmInterpolation.cpp | 141 ++++++++++++++++++++++++++++++++++++++++++++----
src/lmInterpolation.h | 11 +++-
3 files changed, 164 insertions(+), 15 deletions(-)
diff --git a/src/lmContainer.h b/src/lmContainer.h
index 7a37ccc..b8ac737 100644
--- a/src/lmContainer.h
+++ b/src/lmContainer.h
@@ -48,7 +48,10 @@ typedef enum {LMT_FIND, //!< search: find an entry
} LMT_ACTION;
namespace irstlm {
+
+
typedef std::map< std::string, float > topic_map_t;
+ typedef std::map< std::string, double > lm_map_t;
class lmContainer
{
@@ -152,25 +155,37 @@ public:
UNUSED(topic_weights);
return clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
};
-
virtual double clprob(int* ng, int ngsize, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL) {
UNUSED(topic_weights);
return clprob(ng, ngsize, bow, bol, maxsuffptr, statesize, extendible);
}
-
virtual double clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) {
VERBOSE(3,"lmContainer::clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow,...." << std::endl);
UNUSED(topic_weights);
return clprob(text, bow, bol, maxsuffptr, statesize, extendible);
}
+ virtual double clprob(ngram ng, lm_map_t& lm_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL) {
+ UNUSED(lm_weights);
+ return clprob(ng, bow, bol, maxsuffptr, statesize, extendible);
+ };
+ virtual double clprob(int* ng, int ngsize, lm_map_t& lm_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL) {
+ UNUSED(lm_weights);
+ return clprob(ng, ngsize, bow, bol, maxsuffptr, statesize, extendible);
+ }
+ virtual double clprob(string_vec_t& text, lm_map_t& lm_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) {
+ VERBOSE(3,"lmContainer::clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow,...." << std::endl);
+ UNUSED(lm_weights);
+ return clprob(text, bow, bol, maxsuffptr, statesize, extendible);
+ }
+
+
virtual const char *cmaxsuffptr(ngram ng, unsigned int* statesize=NULL)
{
UNUSED(ng);
UNUSED(statesize);
return NULL;
}
-
virtual const char *cmaxsuffptr(int* ng, int ngsize, unsigned int* statesize=NULL)
{
UNUSED(ng);
@@ -178,6 +193,12 @@ public:
UNUSED(statesize);
return NULL;
}
+ virtual const char *cmaxsuffptr(string_vec_t& text, unsigned int* statesize=NULL)
+ {
+ UNUSED(text);
+ UNUSED(statesize);
+ return NULL;
+ }
diff --git a/src/lmInterpolation.cpp b/src/lmInterpolation.cpp
index 1fa0081..c6deb75 100644
--- a/src/lmInterpolation.cpp
+++ b/src/lmInterpolation.cpp
@@ -41,6 +41,7 @@ namespace irstlm {
order=0;
memmap=0;
isInverted=false;
+ m_name_flag=false;
}
void lmInterpolation::load(const std::string &filename,int mmap)
@@ -60,13 +61,40 @@ namespace irstlm {
char line[MAX_LINE];
const char* words[LMINTERPOLATION_MAX_TOKEN];
- int tokenN;
+ size_t tokenN;
inp.getline(line,MAX_LINE,'\n');
tokenN = parseWords(line,words,LMINTERPOLATION_MAX_TOKEN);
+ bool error=false;
- if (tokenN != 2 || ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0))){
- exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
+ if ((tokenN<2) || (tokenN>3)){
+ error=true;
+ }else if ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0)) {
+ error=true;
+ }else if ((tokenN==3) && ((strcmp(words[2],"MAP") != 0) && (strcmp(words[2],"map") != 0))){
+ error=true;
}
+
+ if (error){
+ exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format:\nLMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1 [inverted]\nweight_of_LM_2 filename_of_LM_2\nor\nLMINTERPOLATION number_of_models MAP\nweight_of_LM_1 name_LM_1 filename_of_LM_1\nweight_of_LM_2 name_LM_2 filename_of_LM_2");
+
+ }
+
+ size_t idx_weight, idx_file, idx_name, idx_inverted, idx_size;
+ if (tokenN==2){
+ m_name_flag=false;
+ idx_weight=0;
+ idx_file=1;
+ idx_inverted=2;
+ idx_size=3;
+ }else{
+ m_name_flag=true;
+ idx_weight=0;
+ idx_name=1;
+ idx_file=2;
+ idx_inverted=3;
+ idx_size=4;
+ }
+
m_number_lm = atoi(words[1]);
m_weight.resize(m_number_lm);
@@ -79,22 +107,23 @@ namespace irstlm {
dict->incflag(1);
for (int i=0; i<m_number_lm; i++) {
inp.getline(line,BUFSIZ,'\n');
- tokenN = parseWords(line,words,3);
+ tokenN = parseWords(line,words,idx_size);
- if(tokenN < 2 || tokenN >3) {
- exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
+ if(tokenN < idx_file || tokenN > idx_inverted) {
+ exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format:\nLMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1 [inverted]\nweight_of_LM_2 filename_of_LM_2\nor\nLMINTERPOLATION number_of_models MAP\nweight_of_LM_1 name_LM_1 filename_of_LM_1\nweight_of_LM_2 name_LM_2 filename_of_LM_2");
}
//check whether the (textual) LM has to be loaded as inverted
m_isinverted[i] = false;
- if(tokenN == 3) {
- if (strcmp(words[2],"inverted") == 0)
+ if(tokenN == idx_size) {
+ if (strcmp(words[idx_inverted],"inverted") == 0)
m_isinverted[i] = true;
}
VERBOSE(2,"i:" << i << " m_isinverted[i]:" << m_isinverted[i] << endl);
- m_weight[i] = (float) atof(words[0]);
- m_file[i] = words[1];
+ m_weight[i] = atof(words[idx_weight]);
+ m_name[words[idx_name]] = i;
+ m_file[i] = words[idx_file];
VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_file:"<< words[1] << std::endl;);
m_lm[i] = load_lm(i,memmap,ngramcache_load_factor,dictionary_load_factor);
@@ -140,6 +169,98 @@ namespace irstlm {
return lmt;
}
+
+ void lmInterpolation::set_weight(const lm_map_t& map, std::vector<double>& weight){
+ for (lm_map_t::const_iterator it=map.begin(); it!=map.end();++it){
+ weight[m_name[it->first]] = it->second;
+ }
+ }
+
+ //return log10 prob of an ngram
+ double lmInterpolation::clprob(ngram ng, lm_map_t& lm_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
+ {
+
+ double pr=0.0;
+ double _logpr;
+
+ char* _maxsuffptr=NULL,*actualmaxsuffptr=NULL;
+ unsigned int _statesize=0,actualstatesize=0;
+ int _bol=0,actualbol=MAX_NGRAM;
+ double _bow=0.0,actualbow=0.0;
+ bool _extendible=false;
+ bool actualextendible=false;
+
+ std::vector<double> weight(m_number_lm);
+ set_weight(lm_weights,weight);
+
+ for (size_t i=0; i<m_lm.size(); i++) {
+
+ if (weight[i]>0.0){
+ ngram _ng(m_lm[i]->getDict());
+ _ng.trans(ng);
+ _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,&_extendible);
+
+ IFVERBOSE(3){
+ //cerr.precision(10);
+ VERBOSE(3," LM " << i << " weight:" << weight[i] << std::endl);
+ VERBOSE(3," LM " << i << " log10 logpr:" << _logpr<< std::endl);
+ VERBOSE(3," LM " << i << " pr:" << pow(10.0,_logpr) << std::endl);
+ VERBOSE(3," _statesize:" << _statesize << std::endl);
+ VERBOSE(3," _bow:" << _bow << std::endl);
+ VERBOSE(3," _bol:" << _bol << std::endl);
+ }
+
+ /*
+ //TO CHECK the following claims
+ //What is the statesize of a LM interpolation? The largest _statesize among the submodels
+ //What is the maxsuffptr of a LM interpolation? The _maxsuffptr of the submodel with the largest _statesize
+ //What is the bol of a LM interpolation? The smallest _bol among the submodels
+ //What is the bow of a LM interpolation? The weighted sum of the bow of the submodels
+ //What is the prob of a LM interpolation? The weighted sum of the prob of the submodels
+ //What is the extendible flag of a LM interpolation? true if the extendible flag is one for any LM
+ */
+
+ pr+=weight[i]*pow(10.0,_logpr);
+ actualbow+=m_weight[i]*pow(10.0,_bow);
+
+ if(_statesize > actualstatesize || i == 0) {
+ actualmaxsuffptr = _maxsuffptr;
+ actualstatesize = _statesize;
+ }
+ if (_bol < actualbol) {
+ actualbol=_bol; //backoff limit of LM[i]
+ }
+ if (_extendible) {
+ actualextendible=true; //set extendible flag to true if the ngram is extendible for any LM
+ }
+ }
+ }
+ if (bol) *bol=actualbol;
+ if (bow) *bow=log(actualbow);
+ if (maxsuffptr) *maxsuffptr=actualmaxsuffptr;
+ if (statesize) *statesize=actualstatesize;
+ if (extendible) {
+ *extendible=actualextendible;
+ // delete _extendible;
+ }
+
+ if (statesize) VERBOSE(3, " statesize:" << *statesize << std::endl);
+ if (bow) VERBOSE(3, " bow:" << *bow << std::endl);
+ if (bol) VERBOSE(3, " bol:" << *bol << std::endl);
+
+ return log10(pr);
+ }
+ double lmInterpolation::clprob(int* codes, int sz, lm_map_t& lm_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
+ {
+
+ //create the actual ngram
+ ngram ong(dict);
+ ong.pushc(codes,sz);
+ MY_ASSERT (ong.size == sz);
+
+ return clprob(ong, lm_weights, bow, bol, maxsuffptr, statesize, extendible);
+ }
+
//return log10 prob of an ngram
double lmInterpolation::clprob(ngram ng, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
{
diff --git a/src/lmInterpolation.h b/src/lmInterpolation.h
index eb9edb5..686ef9e 100644
--- a/src/lmInterpolation.h
+++ b/src/lmInterpolation.h
@@ -40,7 +40,7 @@ namespace irstlm {
interpolation of several sub LMs
*/
-#define LMINTERPOLATION_MAX_TOKEN 3
+#define LMINTERPOLATION_MAX_TOKEN 5
class lmInterpolation: public lmContainer
{
@@ -50,12 +50,14 @@ class lmInterpolation: public lmContainer
int dictionary_upperbound; //set by user
double logOOVpenalty; //penalty for OOV words (default 0)
bool isInverted;
+ bool m_name_flag; //flag for the presence of a map between name and lm
int memmap; //level from which n-grams are accessed via mmap
std::vector<double> m_weight;
std::vector<std::string> m_file;
std::vector<bool> m_isinverted;
std::vector<lmContainer*> m_lm;
+ lm_map_t m_name;
int maxlev; //maximun order of sub LMs;
@@ -69,12 +71,17 @@ public:
lmInterpolation(float nlf=0.0, float dlfi=0.0);
virtual ~lmInterpolation() {};
+ void set_weight(const lm_map_t& map, std::vector<double>& weight);
+
void load(const std::string &filename,int mmap=0);
lmContainer* load_lm(int i, int memmap, float nlf, float dlf);
virtual double clprob(ngram ng, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
virtual double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
-
+
+ virtual double clprob(ngram ng, lm_map_t& lm_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL);
+ virtual double clprob(int* ng, int ngsize, lm_map_t& lm_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL);
+
int maxlevel() const {
return maxlev;
};
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits mailing list