[irstlm] 16/78: code optimization; code cleanup;
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:47:01 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to tag adaptiveLM.v0.10
in repository irstlm.
commit 30114d49340d76382d3932d1e9a1b12fdf8c9dcb
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Sun Nov 8 22:06:59 2015 +0100
code optimization; code cleanup;
---
src/lmContainer.h | 338 +++++++++++++++++++++++++-----------------------
src/lmInterpolation.cpp | 195 +++++++++++++++++++++++-----
src/lmInterpolation.h | 13 +-
3 files changed, 350 insertions(+), 196 deletions(-)
diff --git a/src/lmContainer.h b/src/lmContainer.h
index b40be41..da7bf8a 100644
--- a/src/lmContainer.h
+++ b/src/lmContainer.h
@@ -48,10 +48,8 @@ typedef enum {LMT_FIND, //!< search: find an entry
namespace irstlm {
static const std::string context_delimiter="___CONTEXT___";
static const std::string lexicon_delimiter="___LEXICON___";
- static const char topic_map_delimiter1=':';
+ static const char topic_map_delimiter1=':';
static const char topic_map_delimiter2=',';
-// #define topic_map_delimiter1 ':'
-// #define topic_map_delimiter2 ','
typedef std::map< std::string, float > topic_map_t;
@@ -142,171 +140,183 @@ namespace irstlm {
virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, NULL, NULL); }
virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, NULL); };
-// virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow){return 0.0;};
- virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow);
-
- virtual double clprob(int* ng, int ngsize=NULL, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+ virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow)
{
- //create the actual ngram
- ngram ong(getDict());
- ong.pushc(ng,ngsize);
- MY_ASSERT (ong.size == ngsize);
+ UNUSED(ng);
+ UNUSED(bow);
+ UNUSED(bol);
+ UNUSED(maxsuffidx);
+ UNUSED(maxsuffptr);
+ UNUSED(statesize);
+ UNUSED(extendible);
+ UNUSED(lastbow);
+
+ return 0.0;
+ }
- return clprob(ong, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
- };
-
- virtual double clprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL) { return clprob(ng, topic_weights, bow, bol, NULL, maxsuffptr, statesize, extendible, lastbow); };
- virtual double clprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
- {
- UNUSED(topic_weights);
- return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
- }
- virtual double clprob(int* ng, int ngsize, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
- {
- //create the actual ngram
- ngram ong(getDict());
- ong.pushc(ng,ngsize);
- MY_ASSERT (ong.size == ngsize);
- return clprob(ong, topic_weights, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
- }
- virtual double clprob(string_vec_t& text, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
- {
- UNUSED(text);
- UNUSED(bow);
- UNUSED(bol);
- UNUSED(maxsuffidx);
- UNUSED(maxsuffptr);
- UNUSED(statesize);
- UNUSED(extendible);
- UNUSED(lastbow);
- return 0.0;
- };
- virtual double clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL)
- {
- UNUSED(topic_weights);
- return clprob(text, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
- }
-
- virtual const char *cmaxsuffptr(ngram ng, unsigned int* statesize=NULL)
- {
- UNUSED(ng);
- UNUSED(statesize);
- return NULL;
- }
-
- virtual const char *cmaxsuffptr(int* ng, int ngsize, unsigned int* statesize=NULL)
- {
- //create the actual ngram
- ngram ong(getDict());
- ong.pushc(ng,ngsize);
- MY_ASSERT (ong.size == ngsize);
- return cmaxsuffptr(ng, ngsize, statesize);
- }
-
- virtual ngram_state_t cmaxsuffidx(ngram ng, unsigned int* statesize=NULL)
- {
- UNUSED(ng);
- UNUSED(statesize);
- return 0;
- }
-
- virtual ngram_state_t cmaxsuffidx(int* ng, int ngsize, unsigned int* statesize=NULL)
- {
- //create the actual ngram
- ngram ong(getDict());
- ong.pushc(ng,ngsize);
- MY_ASSERT (ong.size == ngsize);
- return cmaxsuffidx(ong,statesize);
- }
-
- virtual inline int get(ngram& ng) {
- UNUSED(ng);
- return 0;
- }
-
- virtual int get(ngram& ng,int n,int lev){
- UNUSED(ng);
- UNUSED(n);
- UNUSED(lev);
- return 0;
- }
-
- virtual int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev){
- UNUSED(ng);
- UNUSED(h);
- UNUSED(action);
- UNUSED(lev);
- return 0;
- }
-
-
- virtual void used_caches() {};
- virtual void init_caches(int uptolev) {
- UNUSED(uptolev);
- };
- virtual void check_caches_levels() {};
- virtual void reset_caches() {};
-
- virtual void reset_mmap() {};
-
- void inline setLanguageModelType(int type) {
- lmtype=type;
- };
- int getLanguageModelType() const {
- return lmtype;
- };
- static int getLanguageModelType(std::string filename);
-
- inline virtual void dictionary_incflag(const bool flag) {
- UNUSED(flag);
- };
-
- virtual bool filter(const string sfilter, lmContainer*& sublmt, const string skeepunigrams);
-
- static lmContainer* CreateLanguageModel(const std::string infile, float nlf=0.0, float dlf=0.0);
- static lmContainer* CreateLanguageModel(int type, float nlf=0.0, float dlf=0.0);
-
- inline virtual bool is_OOV(int code) {
- UNUSED(code);
- return false;
- };
-
-
- inline static bool is_lmt_cache_enabled(){
- VERBOSE(3,"inline static bool is_lmt_cache_enabled() " << lmt_cache_enabled << std::endl);
- return lmt_cache_enabled;
- }
-
- inline static bool is_ps_cache_enabled(){
- VERBOSE(3,"inline static bool is_ps_cache_enabled() " << ps_cache_enabled << std::endl);
- return ps_cache_enabled;
- }
-
- inline static bool is_cache_enabled(){
- return is_lmt_cache_enabled() && is_ps_cache_enabled();
- }
-
- virtual int addWord(const char *w){
- getDict()->incflag(1);
- int c=getDict()->encode(w);
- getDict()->incflag(0);
- return c;
- }
-
- virtual void print_table_stat(){
- VERBOSE(3,"virtual void lmContainer::print_table_stat() "<< std::endl);
+ virtual double clprob(int* ng, int ngsize=NULL, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+ {
+ //create the actual ngram
+ ngram ong(getDict());
+ ong.pushc(ng,ngsize);
+ MY_ASSERT (ong.size == ngsize);
+
+ return clprob(ong, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+ };
+
+ virtual double clprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL) { return clprob(ng, topic_weights, bow, bol, NULL, maxsuffptr, statesize, extendible, lastbow); };
+ virtual double clprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+ {
+ UNUSED(topic_weights);
+ return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+ }
+ virtual double clprob(int* ng, int ngsize, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+ {
+ //create the actual ngram
+ ngram ong(getDict());
+ ong.pushc(ng,ngsize);
+ MY_ASSERT (ong.size == ngsize);
+
+ return clprob(ong, topic_weights, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+ }
+ virtual double clprob(string_vec_t& text, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+ {
+ UNUSED(text);
+ UNUSED(bow);
+ UNUSED(bol);
+ UNUSED(maxsuffidx);
+ UNUSED(maxsuffptr);
+ UNUSED(statesize);
+ UNUSED(extendible);
+ UNUSED(lastbow);
+ return 0.0;
+ };
+ virtual double clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL)
+ {
+ UNUSED(topic_weights);
+ return clprob(text, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+ }
+
+ virtual const char *cmaxsuffptr(ngram ng, unsigned int* statesize=NULL)
+ {
+ UNUSED(ng);
+ UNUSED(statesize);
+ return NULL;
+ }
+
+ virtual const char *cmaxsuffptr(int* ng, int ngsize, unsigned int* statesize=NULL)
+ {
+ //create the actual ngram
+ ngram ong(getDict());
+ ong.pushc(ng,ngsize);
+ MY_ASSERT (ong.size == ngsize);
+ return cmaxsuffptr(ng, ngsize, statesize);
+ }
+
+ virtual ngram_state_t cmaxsuffidx(ngram ng, unsigned int* statesize=NULL)
+ {
+ UNUSED(ng);
+ UNUSED(statesize);
+ return 0;
+ }
+
+ virtual ngram_state_t cmaxsuffidx(int* ng, int ngsize, unsigned int* statesize=NULL)
+ {
+ //create the actual ngram
+ ngram ong(getDict());
+ ong.pushc(ng,ngsize);
+ MY_ASSERT (ong.size == ngsize);
+ return cmaxsuffidx(ong,statesize);
+ }
+
+ virtual inline int get(ngram& ng) {
+ UNUSED(ng);
+ return 0;
+ }
+
+ virtual int get(ngram& ng,int n,int lev){
+ UNUSED(ng);
+ UNUSED(n);
+ UNUSED(lev);
+ return 0;
+ }
+
+ virtual int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev){
+ UNUSED(ng);
+ UNUSED(h);
+ UNUSED(action);
+ UNUSED(lev);
+ return 0;
+ }
+
+
+ virtual void used_caches() {};
+ virtual void init_caches(int uptolev) {
+ UNUSED(uptolev);
+ };
+ virtual void check_caches_levels() {};
+ virtual void reset_caches() {};
+
+ virtual void reset_mmap() {};
+
+ void inline setLanguageModelType(int type) {
+ lmtype=type;
+ };
+ int getLanguageModelType() const {
+ return lmtype;
+ };
+ static int getLanguageModelType(std::string filename);
+
+ inline virtual void dictionary_incflag(const bool flag) {
+ UNUSED(flag);
+ };
+
+ virtual bool filter(const string sfilter, lmContainer*& sublmt, const string skeepunigrams);
+
+ static lmContainer* CreateLanguageModel(const std::string infile, float nlf=0.0, float dlf=0.0);
+ static lmContainer* CreateLanguageModel(int type, float nlf=0.0, float dlf=0.0);
+
+ inline virtual bool is_OOV(int code) {
+ UNUSED(code);
+ return false;
+ };
+
+
+ inline static bool is_lmt_cache_enabled(){
+ VERBOSE(3,"inline static bool is_lmt_cache_enabled() " << lmt_cache_enabled << std::endl);
+ return lmt_cache_enabled;
+ }
+
+ inline static bool is_ps_cache_enabled(){
+ VERBOSE(3,"inline static bool is_ps_cache_enabled() " << ps_cache_enabled << std::endl);
+ return ps_cache_enabled;
+ }
+
+ inline static bool is_cache_enabled(){
+ return is_lmt_cache_enabled() && is_ps_cache_enabled();
+ }
+
+ virtual int addWord(const char *w){
+ getDict()->incflag(1);
+ int c=getDict()->encode(w);
+ getDict()->incflag(0);
+ return c;
+ }
+
+ virtual void print_table_stat(){
+ VERBOSE(3,"virtual void lmContainer::print_table_stat() "<< std::endl);
+ };
+
+ inline std::string getContextDelimiter() const{ return context_delimiter; }
+
+ bool GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line);
+
+ void setContextMap(topic_map_t& topic_map, const std::string& context);
+
};
- inline std::string getContextDelimiter() const{ return context_delimiter; }
-
- bool GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line);
-
- void setContextMap(topic_map_t& topic_map, const std::string& context);
-
- };
+ }//namespace irstlm
-}//namespace irstlm
-
#endif
-
+
diff --git a/src/lmInterpolation.cpp b/src/lmInterpolation.cpp
index a8badab..e9f7584 100644
--- a/src/lmInterpolation.cpp
+++ b/src/lmInterpolation.cpp
@@ -23,6 +23,7 @@
#include <cstdlib>
#include <stdlib.h>
#include <iostream>
+#include <sstream>
#include <stdexcept>
#include <string>
#include "lmContainer.h"
@@ -40,6 +41,7 @@ namespace irstlm {
order=0;
memmap=0;
isInverted=false;
+ m_map_flag=false;
}
void lmInterpolation::load(const std::string &filename,int mmap)
@@ -47,7 +49,6 @@ namespace irstlm {
VERBOSE(2,"lmInterpolation::load(const std::string &filename,int memmap)" << std::endl);
VERBOSE(2," filename:|" << filename << "|" << std::endl);
-
dictionary_upperbound=1000000;
int memmap=mmap;
@@ -59,13 +60,42 @@ namespace irstlm {
char line[MAX_LINE];
const char* words[LMINTERPOLATION_MAX_TOKEN];
- int tokenN;
+ size_t tokenN;
inp.getline(line,MAX_LINE,'\n');
tokenN = parseWords(line,words,LMINTERPOLATION_MAX_TOKEN);
+ bool error=false;
+
+ if ((tokenN<2) || (tokenN>3)){
+ error=true;
+ }else if ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0)) {
+ error=true;
+ }else if ((tokenN==3) && ((strcmp(words[2],"MAP") != 0) && (strcmp(words[2],"map") != 0))){
+ error=true;
+ }
+
+ if (error){
+ exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format:\nLMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1 [inverted]\nweight_of_LM_2 filename_of_LM_2\nor\nLMINTERPOLATION number_of_models MAP\nweight_of_LM_1 name_LM_1 filename_of_LM_1\nweight_of_LM_2 name_LM_2 filename_of_LM_2");
+
+ }
- if (tokenN != 2 || ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0))){
- exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
+ size_t idx_weight, idx_file, idx_name, idx_inverted, idx_size;
+ if (tokenN==2){
+ m_map_flag=false;
+ idx_weight=0;
+ idx_file=1;
+ idx_inverted=2;
+ idx_size=3;
+ m_isadaptive=false;
+ }else{
+ m_map_flag=true;
+ idx_weight=0;
+ idx_name=1;
+ idx_file=2;
+ idx_inverted=3;
+ idx_size=4;
+ m_isadaptive=true;
}
+
m_number_lm = atoi(words[1]);
m_weight.resize(m_number_lm);
@@ -73,33 +103,45 @@ namespace irstlm {
m_isinverted.resize(m_number_lm);
m_lm.resize(m_number_lm);
- VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_number_lm:"<< m_number_lm << std::endl;);
+ VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_number_lm:"<< m_number_lm << std::endl);
dict->incflag(1);
for (int i=0; i<m_number_lm; i++) {
inp.getline(line,BUFSIZ,'\n');
tokenN = parseWords(line,words,3);
- if(tokenN < 2 || tokenN >3) {
- exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
+ if(tokenN < idx_file || tokenN > idx_size) {
+ exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format:\nLMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1 [inverted]\nweight_of_LM_2 filename_of_LM_2\nor\nLMINTERPOLATION number_of_models MAP\nweight_of_LM_1 name_LM_1 filename_of_LM_1\nweight_of_LM_2 name_LM_2 filename_of_LM_2");
}
//check whether the (textual) LM has to be loaded as inverted
m_isinverted[i] = false;
- if(tokenN == 3) {
- if (strcmp(words[2],"inverted") == 0)
+ if(tokenN == idx_size) {
+ if (strcmp(words[idx_inverted],"inverted") == 0)
m_isinverted[i] = true;
}
VERBOSE(2,"i:" << i << " m_isinverted[i]:" << m_isinverted[i] << endl);
- m_weight[i] = (float) atof(words[0]);
- m_file[i] = words[1];
- VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_file:"<< words[1] << std::endl;);
+ m_weight[i] = atof(words[idx_weight]);
+ if (m_map_flag){
+ m_idx[words[idx_name]] = i;
+ m_name[i] = words[idx_name];
+ VERBOSE(2,"i:" << i << " m_idx[words[idx_name]]:|" << m_idx[words[idx_name]] << "| m_name[i]:|" << m_name[i] << "|" << endl);
+ std::stringstream name;
+ name << i;
+ m_idx[name.str()] = i;
+ m_name[i] = name.str();
+ VERBOSE(2,"i:" << i << " name.str():|" << name.str() << "| m_name[i]:|" << m_name[i] << "|" << endl);
+ }
+ m_file[i] = words[idx_file];
+
+ VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) i:" << i << " m_name:|"<< m_name[i] << "|" " m_file:|"<< m_file[i] << "| isadaptve:|" << m_isadaptive << "|" << std::endl);
m_lm[i] = load_lm(i,memmap,ngramcache_load_factor,dictionary_load_factor);
//set the actual value for inverted flag, which is known only after loading the lM
m_isinverted[i] = m_lm[i]->is_inverted();
+
dictionary *_dict=m_lm[i]->getDict();
for (int j=0; j<_dict->size(); j++) {
dict->encode(_dict->decode(j));
@@ -140,8 +182,9 @@ namespace irstlm {
}
//return log10 prob of an ngram
- double lmInterpolation::clprob(ngram ng, double* bow,int* bol,ngram_state_t* maxsuffidx, char** maxsuffptr,unsigned int* statesize,bool* extendible, double* lastbow)
+ double lmInterpolation::clprob(ngram ng, topic_map_t& lm_weights, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow)
{
+ VERBOSE(1,"double lmInterpolation::clprob(ngram ng, topic_map_t& lm_weights,...)" << std::endl);
double pr=0.0;
double _logpr;
@@ -152,22 +195,20 @@ namespace irstlm {
int _bol=0,actualbol=MAX_NGRAM;
double _bow=0.0,actualbow=0.0;
double _lastbow=0.0,actuallastbow=0.0;
- bool _extendible=false;
- bool actualextendible=false;
+ bool _extendible=false,actualextendible=false;
- // ngram_state_t* maxsuffidx = new ngram_state_t;
+ double_vec_t weight(m_number_lm);
+ set_weight(lm_weights,weight);
- for (size_t i=0; i<m_lm.size(); i++) {
-
- if (m_weight[i]>0.0){
+ for (size_t i=0; i<m_number_lm; i++) {
+ if (weight[i]>0.0){
ngram _ng(m_lm[i]->getDict());
_ng.trans(ng);
- // _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,&_extendible);
- _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffidx,&_maxsuffptr,&_statesize,&_extendible, lastbow);
+ _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,&_extendible,&_lastbow);
IFVERBOSE(3){
//cerr.precision(10);
- VERBOSE(3," LM " << i << " weight:" << m_weight[i] << std::endl);
+ VERBOSE(3," LM " << i << " weight:" << weight[i] << std::endl);
VERBOSE(3," LM " << i << " log10 logpr:" << _logpr<< std::endl);
VERBOSE(3," LM " << i << " pr:" << pow(10.0,_logpr) << std::endl);
VERBOSE(3," _statesize:" << _statesize << std::endl);
@@ -187,6 +228,91 @@ namespace irstlm {
//What is the lastbow of a LM interpolation? The weighted sum of the lastbow of the submodels
*/
+ pr+=weight[i]*pow(10.0,_logpr);
+ actualbow+=weight[i]*pow(10.0,_bow);
+
+ if(_statesize > actualstatesize || i == 0) {
+ actualmaxsuffptr = _maxsuffptr;
+ actualmaxsuffidx = _maxsuffidx;
+ actualstatesize = _statesize;
+ }
+ if (_bol < actualbol) {
+ actualbol=_bol; //backoff limit of LM[i]
+ }
+ if (_extendible) {
+ actualextendible=true; //set extendible flag to true if the ngram is extendible for any LM
+ }
+ if (_lastbow < actuallastbow) {
+ actuallastbow=_lastbow; //backoff limit of LM[i]
+ }
+ }
+ else{
+ VERBOSE(3," LM " << i << " weight is zero" << std::endl);
+ }
+ }
+ if (bol) *bol=actualbol;
+ if (bow) *bow=log(actualbow);
+ if (maxsuffptr) *maxsuffptr=actualmaxsuffptr;
+ if (maxsuffidx) *maxsuffidx=actualmaxsuffidx;
+ if (statesize) *statesize=actualstatesize;
+ if (extendible) *extendible=actualextendible;
+ if (lastbow) *bol=actuallastbow;
+
+ if (statesize) VERBOSE(3, " statesize:" << *statesize << std::endl);
+ if (bow) VERBOSE(3, " bow:" << *bow << std::endl);
+ if (bol) VERBOSE(3, " bol:" << *bol << std::endl);
+ if (lastbow) VERBOSE(3, " lastbow:" << *lastbow << std::endl);
+
+ return log10(pr);
+ }
+
+ //return log10 prob of an ngram
+ double lmInterpolation::clprob(ngram ng, double* bow,int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow)
+ {
+ VERBOSE(1,"double lmInterpolation::clprob(ngram ng, ...)" << std::endl);
+
+ double pr=0.0;
+ double _logpr;
+
+ char* _maxsuffptr=NULL,*actualmaxsuffptr=NULL;
+ ngram_state_t _maxsuffidx=NULL,actualmaxsuffidx=NULL;
+ unsigned int _statesize=0,actualstatesize=0;
+ int _bol=0,actualbol=MAX_NGRAM;
+ double _bow=0.0,actualbow=0.0;
+ double _lastbow=0.0,actuallastbow=0.0;
+ bool _extendible=false,actualextendible=false;
+
+ for (size_t i=0; i<m_number_lm; i++) {
+
+ if (m_weight[i]>0.0){
+ ngram _ng(m_lm[i]->getDict());
+ _ng.trans(ng);
+ _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffidx,&_maxsuffptr,&_statesize,&_extendible,&_lastbow);
+
+ IFVERBOSE(3){
+ //cerr.precision(10);
+ VERBOSE(3," LM " << i << " weight:" << m_weight[i] << std::endl);
+ VERBOSE(3," LM " << i << " log10 logpr:" << _logpr<< std::endl);
+ VERBOSE(3," LM " << i << " pr:" << pow(10.0,_logpr) << std::endl);
+ VERBOSE(3," LM " << i << " msp:" << (void*) _maxsuffptr << std::endl);
+ VERBOSE(3," LM " << i << " msidx:" << _maxsuffidx << std::endl);
+ VERBOSE(3," LM " << i << " statesize:" << _statesize << std::endl);
+ VERBOSE(3," LM " << i << " bow:" << _bow << std::endl);
+ VERBOSE(3," LM " << i << " bol:" << _bol << std::endl);
+ VERBOSE(3," LM " << i << " lastbow:" << _lastbow << std::endl);
+ }
+
+ /*
+ //TO CHECK the following claims
+ //What is the statesize of a LM interpolation? The largest _statesize among the submodels
+ //What is the maxsuffptr of a LM interpolation? The _maxsuffptr of the submodel with the largest _statesize
+ //What is the bol of a LM interpolation? The smallest _bol among the submodels
+ //What is the bow of a LM interpolation? The weighted sum of the bow of the submodels
+ //What is the prob of a LM interpolation? The weighted sum of the prob of the submodels
+ //What is the extendible flag of a LM interpolation? true if the extendible flag is one for any LM
+ //What is the lastbow of a LM interpolation? The weighted sum of the lastbow of the submodels
+ */
+
pr+=m_weight[i]*pow(10.0,_logpr);
actualbow+=m_weight[i]*pow(10.0,_bow);
@@ -214,6 +340,8 @@ namespace irstlm {
if (extendible) *extendible=actualextendible;
if (lastbow) *bol=actuallastbow;
+ if (maxsuffptr) VERBOSE(3, " msp:" << (void*) *maxsuffptr << std::endl);
+ if (maxsuffidx) VERBOSE(3, " msidx:" << *maxsuffidx << std::endl);
if (statesize) VERBOSE(3, " statesize:" << *statesize << std::endl);
if (bow) VERBOSE(3, " bow:" << *bow << std::endl);
if (bol) VERBOSE(3, " bol:" << *bol << std::endl);
@@ -228,9 +356,7 @@ namespace irstlm {
char *maxsuffptr=NULL;
unsigned int _statesize=0,actualstatesize=0;
- // ngram_state_t* maxsuffidx = new ngram_state_t;
-
- for (size_t i=0; i<m_lm.size(); i++) {
+ for (size_t i=0; i<m_number_lm; i++) {
if (m_weight[i]>0.0){
ngram _ng(m_lm[i]->getDict());
@@ -262,15 +388,13 @@ namespace irstlm {
return maxsuffptr;
}
-
+
ngram_state_t lmInterpolation::cmaxsuffidx(ngram ng, unsigned int* statesize)
{
ngram_state_t maxsuffidx=0;
unsigned int _statesize=0,actualstatesize=0;
- // ngram_state_t* maxsuffidx = new ngram_state_t;
-
- for (size_t i=0; i<m_lm.size(); i++) {
+ for (size_t i=0; i<m_number_lm; i++) {
if (m_weight[i]>0.0){
ngram _ng(m_lm[i]->getDict());
@@ -321,5 +445,18 @@ namespace irstlm {
logOOVpenalty=log10(OOVpenalty);
return logOOVpenalty;
}
+
+
+ void lmInterpolation::set_weight(const topic_map_t& map, double_vec_t& weight){
+ VERBOSE(4,"void lmInterpolation::set_weight" << std::endl);
+ VERBOSE(4,"map.size:" << map.size() << std::endl);
+ for (topic_map_t::const_iterator it=map.begin(); it!=map.end();++it){
+ if (m_idx.find(it->first) == m_idx.end()){
+ exit_error(IRSTLM_ERROR_DATA, "void lmInterpolation::set_weight(const topic_map_t& map, double_vec_t& weight) ERROR: you are setting the weight of a LM which is not included in the interpolated LM");
+ }
+ weight[m_idx[it->first]] = it->second;
+ VERBOSE(4,"it->first:|" << it->first << "| it->second:|" << it->second << "| m_idx[it->first]:|" << m_idx[it->first] << "| weight[m_idx[it->first]]:|" <<weight[m_idx[it->first]] << "|" << std::endl);
+ }
+ }
}//namespace irstlm
diff --git a/src/lmInterpolation.h b/src/lmInterpolation.h
index 2f12265..6d5519e 100644
--- a/src/lmInterpolation.h
+++ b/src/lmInterpolation.h
@@ -45,11 +45,12 @@ namespace irstlm {
class lmInterpolation: public lmContainer
{
static const bool debug=true;
- int m_number_lm;
+ size_t m_number_lm;
int order;
int dictionary_upperbound; //set by user
double logOOVpenalty; //penalty for OOV words (default 0)
- bool isInverted;
+ bool isInverted;
+ bool m_map_flag; //flag for the presence of a map between name and lm
int memmap; //level from which n-grams are accessed via mmap
std::vector<double> m_weight;
@@ -57,13 +58,18 @@ namespace irstlm {
std::vector<bool> m_isinverted;
std::vector<lmContainer*> m_lm;
- int maxlev; //maximun order of sub LMs;
+ int maxlev; //maximun order of sub LMs;
+
+ std::map< std::string, size_t > m_idx;
+ std::map< size_t, std::string > m_name;
float ngramcache_load_factor;
float dictionary_load_factor;
dictionary *dict; // dictionary for all interpolated LMs
+ void set_weight(const topic_map_t& map, double_vec_t& weight);
+
public:
lmInterpolation(float nlf=0.0, float dlfi=0.0);
@@ -73,6 +79,7 @@ namespace irstlm {
lmContainer* load_lm(int i, int memmap, float nlf, float dlf);
virtual double clprob(ngram ng, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
+ virtual double clprob(ngram ng, topic_map_t& lm_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL);
virtual const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);
virtual ngram_state_t cmaxsuffidx(ngram ong, unsigned int* size=NULL);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits mailing list