[irstlm] 16/78: code optimization; code cleanup;

Giulio Paci giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:47:01 UTC 2016


This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to tag adaptiveLM.v0.10
in repository irstlm.

commit 30114d49340d76382d3932d1e9a1b12fdf8c9dcb
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date:   Sun Nov 8 22:06:59 2015 +0100

    code optimization; code cleanup;
---
 src/lmContainer.h       | 338 +++++++++++++++++++++++++-----------------------
 src/lmInterpolation.cpp | 195 +++++++++++++++++++++++-----
 src/lmInterpolation.h   |  13 +-
 3 files changed, 350 insertions(+), 196 deletions(-)

diff --git a/src/lmContainer.h b/src/lmContainer.h
index b40be41..da7bf8a 100644
--- a/src/lmContainer.h
+++ b/src/lmContainer.h
@@ -48,10 +48,8 @@ typedef enum {LMT_FIND,    //!< search: find an entry
 namespace irstlm {
 	static const std::string context_delimiter="___CONTEXT___";
 	static const std::string lexicon_delimiter="___LEXICON___";
-  static const char topic_map_delimiter1=':';
+	static const char topic_map_delimiter1=':';
 	static const char topic_map_delimiter2=',';
-//  #define topic_map_delimiter1 ':'
-//	#define topic_map_delimiter2 ','
 	
 	
 	typedef std::map< std::string, float > topic_map_t;
@@ -142,171 +140,183 @@ namespace irstlm {
 		virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, NULL, NULL); }
 		virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, NULL); };
 		
-//		virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow){return 0.0;};
-		virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow);
-		
-		virtual double clprob(int* ng, int ngsize=NULL, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+		virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow)
 		{
-			//create the actual ngram
-			ngram ong(getDict());
-			ong.pushc(ng,ngsize);
-			MY_ASSERT (ong.size == ngsize);
+				UNUSED(ng);
+				UNUSED(bow);
+				UNUSED(bol);
+				UNUSED(maxsuffidx);
+				UNUSED(maxsuffptr);
+				UNUSED(statesize);
+				UNUSED(extendible);
+				UNUSED(lastbow);
+				
+				return 0.0;
+			}
 			
-			return clprob(ong, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
-		};
-		
-		virtual double clprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL) { return clprob(ng, topic_weights, bow, bol, NULL, maxsuffptr, statesize, extendible, lastbow); };		
-		virtual double clprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
-		{
-			UNUSED(topic_weights);
-			return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
-		}
-		virtual double clprob(int* ng, int ngsize, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
-		{
-			//create the actual ngram
-			ngram ong(getDict());
-			ong.pushc(ng,ngsize);
-			MY_ASSERT (ong.size == ngsize);
 			
-			return clprob(ong, topic_weights, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
-		}
-		virtual double clprob(string_vec_t& text, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
-		{
-			UNUSED(text);
-			UNUSED(bow);
-			UNUSED(bol);
-			UNUSED(maxsuffidx);
-			UNUSED(maxsuffptr);
-			UNUSED(statesize);
-			UNUSED(extendible);
-			UNUSED(lastbow);
-			return 0.0;
-		};
-		virtual double clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL)
-		{
-			UNUSED(topic_weights);
-			return clprob(text, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
-		}
-		
-		virtual const char *cmaxsuffptr(ngram ng, unsigned int* statesize=NULL)
-		{
-			UNUSED(ng);
-			UNUSED(statesize);
-			return NULL;
-		}
-		
-		virtual const char *cmaxsuffptr(int* ng, int ngsize, unsigned int* statesize=NULL)
-		{
-			//create the actual ngram 
-			ngram ong(getDict());
-			ong.pushc(ng,ngsize);
-			MY_ASSERT (ong.size == ngsize);
-			return cmaxsuffptr(ng, ngsize, statesize);
-		}
-		
-		virtual ngram_state_t cmaxsuffidx(ngram ng, unsigned int* statesize=NULL)
-		{
-			UNUSED(ng);
-			UNUSED(statesize);
-			return 0;
-		}
-		
-		virtual ngram_state_t cmaxsuffidx(int* ng, int ngsize, unsigned int* statesize=NULL)
-		{
-			//create the actual ngram 
-			ngram ong(getDict());                
-			ong.pushc(ng,ngsize);
-			MY_ASSERT (ong.size == ngsize); 
-			return cmaxsuffidx(ong,statesize);
-		}
-		
-		virtual inline int get(ngram& ng) {
-			UNUSED(ng);
-			return 0;
-		}
-		
-		virtual int get(ngram& ng,int n,int lev){
-			UNUSED(ng);
-			UNUSED(n);
-			UNUSED(lev);
-			return 0;
-		}
-		
-		virtual int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev){
-			UNUSED(ng);
-			UNUSED(h);
-			UNUSED(action);
-			UNUSED(lev);
-			return 0;     
-		}
-		
-		
-		virtual void used_caches() {};
-		virtual void init_caches(int uptolev) {
-			UNUSED(uptolev);
-		};
-		virtual void check_caches_levels() {};
-		virtual void reset_caches() {};
-		
-		virtual void  reset_mmap() {};
-		
-		void inline setLanguageModelType(int type) {
-			lmtype=type;
-		};
-		int getLanguageModelType() const {
-			return lmtype;
-		};
-		static int getLanguageModelType(std::string filename);
-		
-		inline virtual void dictionary_incflag(const bool flag) {
-			UNUSED(flag);
-		};
-		
-		virtual bool filter(const string sfilter, lmContainer*& sublmt, const string skeepunigrams);
-		
-		static lmContainer* CreateLanguageModel(const std::string infile, float nlf=0.0, float dlf=0.0);
-		static lmContainer* CreateLanguageModel(int type, float nlf=0.0, float dlf=0.0);
-		
-		inline virtual bool is_OOV(int code) {
-			UNUSED(code);
-			return false;
-		};
-		
-		
-		inline static bool is_lmt_cache_enabled(){
-			VERBOSE(3,"inline static bool is_lmt_cache_enabled() " << lmt_cache_enabled << std::endl);
-			return lmt_cache_enabled;
-		}
-		
-		inline static bool is_ps_cache_enabled(){
-			VERBOSE(3,"inline static bool is_ps_cache_enabled() " << ps_cache_enabled << std::endl);
-			return ps_cache_enabled;
-		}
-		
-		inline static bool is_cache_enabled(){
-			return is_lmt_cache_enabled() && is_ps_cache_enabled();
-		}
-		
-		virtual int addWord(const char *w){
-			getDict()->incflag(1);
-			int c=getDict()->encode(w);
-			getDict()->incflag(0);
-			return c;
-		}
-		
-		virtual void print_table_stat(){
-			VERBOSE(3,"virtual void lmContainer::print_table_stat() "<< std::endl);
+			virtual double clprob(int* ng, int ngsize=NULL, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+			{
+				//create the actual ngram
+				ngram ong(getDict());
+				ong.pushc(ng,ngsize);
+				MY_ASSERT (ong.size == ngsize);
+				
+				return clprob(ong, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+			};
+			
+			virtual double clprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL) { return clprob(ng, topic_weights, bow, bol, NULL, maxsuffptr, statesize, extendible, lastbow); };		
+			virtual double clprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+			{
+				UNUSED(topic_weights);
+				return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+			}
+			virtual double clprob(int* ng, int ngsize, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+			{
+				//create the actual ngram
+				ngram ong(getDict());
+				ong.pushc(ng,ngsize);
+				MY_ASSERT (ong.size == ngsize);
+				
+				return clprob(ong, topic_weights, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+			}
+			virtual double clprob(string_vec_t& text, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)
+			{
+				UNUSED(text);
+				UNUSED(bow);
+				UNUSED(bol);
+				UNUSED(maxsuffidx);
+				UNUSED(maxsuffptr);
+				UNUSED(statesize);
+				UNUSED(extendible);
+				UNUSED(lastbow);
+				return 0.0;
+			};
+			virtual double clprob(string_vec_t& text, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL)
+			{
+				UNUSED(topic_weights);
+				return clprob(text, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+			}
+			
+			virtual const char *cmaxsuffptr(ngram ng, unsigned int* statesize=NULL)
+			{
+				UNUSED(ng);
+				UNUSED(statesize);
+				return NULL;
+			}
+			
+			virtual const char *cmaxsuffptr(int* ng, int ngsize, unsigned int* statesize=NULL)
+			{
+				//create the actual ngram 
+				ngram ong(getDict());
+				ong.pushc(ng,ngsize);
+				MY_ASSERT (ong.size == ngsize);
+				return cmaxsuffptr(ng, ngsize, statesize);
+			}
+			
+			virtual ngram_state_t cmaxsuffidx(ngram ng, unsigned int* statesize=NULL)
+			{
+				UNUSED(ng);
+				UNUSED(statesize);
+				return 0;
+			}
+			
+			virtual ngram_state_t cmaxsuffidx(int* ng, int ngsize, unsigned int* statesize=NULL)
+			{
+				//create the actual ngram 
+				ngram ong(getDict());                
+				ong.pushc(ng,ngsize);
+				MY_ASSERT (ong.size == ngsize); 
+				return cmaxsuffidx(ong,statesize);
+			}
+			
+			virtual inline int get(ngram& ng) {
+				UNUSED(ng);
+				return 0;
+			}
+			
+			virtual int get(ngram& ng,int n,int lev){
+				UNUSED(ng);
+				UNUSED(n);
+				UNUSED(lev);
+				return 0;
+			}
+			
+			virtual int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev){
+				UNUSED(ng);
+				UNUSED(h);
+				UNUSED(action);
+				UNUSED(lev);
+				return 0;     
+			}
+			
+			
+			virtual void used_caches() {};
+			virtual void init_caches(int uptolev) {
+				UNUSED(uptolev);
+			};
+			virtual void check_caches_levels() {};
+			virtual void reset_caches() {};
+			
+			virtual void  reset_mmap() {};
+			
+			void inline setLanguageModelType(int type) {
+				lmtype=type;
+			};
+			int getLanguageModelType() const {
+				return lmtype;
+			};
+			static int getLanguageModelType(std::string filename);
+			
+			inline virtual void dictionary_incflag(const bool flag) {
+				UNUSED(flag);
+			};
+			
+			virtual bool filter(const string sfilter, lmContainer*& sublmt, const string skeepunigrams);
+			
+			static lmContainer* CreateLanguageModel(const std::string infile, float nlf=0.0, float dlf=0.0);
+			static lmContainer* CreateLanguageModel(int type, float nlf=0.0, float dlf=0.0);
+			
+			inline virtual bool is_OOV(int code) {
+				UNUSED(code);
+				return false;
+			};
+			
+			
+			inline static bool is_lmt_cache_enabled(){
+				VERBOSE(3,"inline static bool is_lmt_cache_enabled() " << lmt_cache_enabled << std::endl);
+				return lmt_cache_enabled;
+			}
+			
+			inline static bool is_ps_cache_enabled(){
+				VERBOSE(3,"inline static bool is_ps_cache_enabled() " << ps_cache_enabled << std::endl);
+				return ps_cache_enabled;
+			}
+			
+			inline static bool is_cache_enabled(){
+				return is_lmt_cache_enabled() && is_ps_cache_enabled();
+			}
+			
+			virtual int addWord(const char *w){
+				getDict()->incflag(1);
+				int c=getDict()->encode(w);
+				getDict()->incflag(0);
+				return c;
+			}
+			
+			virtual void print_table_stat(){
+				VERBOSE(3,"virtual void lmContainer::print_table_stat() "<< std::endl);
+			};
+			
+			inline std::string getContextDelimiter() const{ return context_delimiter; }
+			
+			bool GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line);
+			
+			void setContextMap(topic_map_t& topic_map, const std::string& context);
+			
 		};
 		
-		inline std::string getContextDelimiter() const{ return context_delimiter; }
-		
-		bool GetSentenceAndContext(std::string& sentence, std::string& context, std::string& line);
-		
-		void setContextMap(topic_map_t& topic_map, const std::string& context);
-		
-	};
+	}//namespace irstlm
 	
-}//namespace irstlm
-
 #endif
-
+	
diff --git a/src/lmInterpolation.cpp b/src/lmInterpolation.cpp
index a8badab..e9f7584 100644
--- a/src/lmInterpolation.cpp
+++ b/src/lmInterpolation.cpp
@@ -23,6 +23,7 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <iostream>
+#include <sstream>
 #include <stdexcept>
 #include <string>
 #include "lmContainer.h"
@@ -40,6 +41,7 @@ namespace irstlm {
 		order=0;
 		memmap=0;
 		isInverted=false;
+		m_map_flag=false;
 	}
 	
 	void lmInterpolation::load(const std::string &filename,int mmap)
@@ -47,7 +49,6 @@ namespace irstlm {
 		VERBOSE(2,"lmInterpolation::load(const std::string &filename,int memmap)" << std::endl);
 		VERBOSE(2," filename:|" << filename << "|" << std::endl);
 		
-		
 		dictionary_upperbound=1000000;
 		int memmap=mmap;
 		
@@ -59,13 +60,42 @@ namespace irstlm {
 		
 		char line[MAX_LINE];
 		const char* words[LMINTERPOLATION_MAX_TOKEN];
-		int tokenN;
+		size_t tokenN;
 		inp.getline(line,MAX_LINE,'\n');
 		tokenN = parseWords(line,words,LMINTERPOLATION_MAX_TOKEN);
+		bool error=false;
+		
+		if ((tokenN<2) || (tokenN>3)){
+			error=true;     
+		}else if ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0)) {
+			error=true;
+		}else if ((tokenN==3) && ((strcmp(words[2],"MAP") != 0) && (strcmp(words[2],"map") != 0))){
+			error=true;
+		}
+		
+		if (error){
+			exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format:\nLMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1 [inverted]\nweight_of_LM_2 filename_of_LM_2\nor\nLMINTERPOLATION number_of_models MAP\nweight_of_LM_1 name_LM_1 filename_of_LM_1\nweight_of_LM_2 name_LM_2 filename_of_LM_2");
+			
+		}
 		
-		if (tokenN != 2 || ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0))){
-			exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
+		size_t idx_weight, idx_file, idx_name, idx_inverted, idx_size;
+		if (tokenN==2){
+			m_map_flag=false;
+			idx_weight=0;
+			idx_file=1;
+			idx_inverted=2;
+			idx_size=3;
+			m_isadaptive=false;
+		}else{
+			m_map_flag=true;
+			idx_weight=0;
+			idx_name=1;
+			idx_file=2;
+			idx_inverted=3;
+			idx_size=4;
+			m_isadaptive=true;
 		}
+		
 		m_number_lm = atoi(words[1]);
 		
 		m_weight.resize(m_number_lm);
@@ -73,33 +103,45 @@ namespace irstlm {
 		m_isinverted.resize(m_number_lm);
 		m_lm.resize(m_number_lm);
 		
-		VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_number_lm:"<< m_number_lm << std::endl;);
+		VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_number_lm:"<< m_number_lm << std::endl);
 		
 		dict->incflag(1);
 		for (int i=0; i<m_number_lm; i++) {
 			inp.getline(line,BUFSIZ,'\n');
 			tokenN = parseWords(line,words,3);
 			
-			if(tokenN < 2 || tokenN >3) {
-				exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2");
+			if(tokenN < idx_file || tokenN > idx_size) {
+				exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format:\nLMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1 [inverted]\nweight_of_LM_2 filename_of_LM_2\nor\nLMINTERPOLATION number_of_models MAP\nweight_of_LM_1 name_LM_1 filename_of_LM_1\nweight_of_LM_2 name_LM_2 filename_of_LM_2");
 			}
 			
 			//check whether the (textual) LM has to be loaded as inverted
 			m_isinverted[i] = false;
-			if(tokenN == 3) {
-				if (strcmp(words[2],"inverted") == 0)
+			if(tokenN == idx_size) {
+				if (strcmp(words[idx_inverted],"inverted") == 0)
 					m_isinverted[i] = true;
 			}
 			VERBOSE(2,"i:" << i << " m_isinverted[i]:" << m_isinverted[i] << endl);
 			
-			m_weight[i] = (float) atof(words[0]);
-			m_file[i] = words[1];
-			VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_file:"<< words[1] << std::endl;);
+			m_weight[i] = atof(words[idx_weight]);
+			if (m_map_flag){
+				m_idx[words[idx_name]] = i;
+				m_name[i] = words[idx_name];
+				VERBOSE(2,"i:" << i << " m_idx[words[idx_name]]:|" << m_idx[words[idx_name]] << "| m_name[i]:|" << m_name[i] << "|" << endl);
+				std::stringstream name;
+				name << i;
+				m_idx[name.str()] = i;
+				m_name[i] = name.str();
+				VERBOSE(2,"i:" << i << " name.str():|" << name.str() << "| m_name[i]:|" << m_name[i] << "|" << endl);
+			}
+			m_file[i] = words[idx_file];
+			
+			VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) i:" << i << " m_name:|"<< m_name[i] << "|" " m_file:|"<< m_file[i] << "| isadaptve:|" << m_isadaptive << "|" << std::endl);
 			
 			m_lm[i] = load_lm(i,memmap,ngramcache_load_factor,dictionary_load_factor);
 			//set the actual value for inverted flag, which is known only after loading the lM
 			m_isinverted[i] = m_lm[i]->is_inverted();
 			
+			
 			dictionary *_dict=m_lm[i]->getDict();
 			for (int j=0; j<_dict->size(); j++) {
 				dict->encode(_dict->decode(j));
@@ -140,8 +182,9 @@ namespace irstlm {
 	}
 	
 	//return log10 prob of an ngram
-	double lmInterpolation::clprob(ngram ng, double* bow,int* bol,ngram_state_t* maxsuffidx, char** maxsuffptr,unsigned int* statesize,bool* extendible, double* lastbow)
+	double lmInterpolation::clprob(ngram ng, topic_map_t& lm_weights, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow)
 	{
+		VERBOSE(1,"double lmInterpolation::clprob(ngram ng, topic_map_t& lm_weights,...)"  << std::endl);
 		
 		double pr=0.0;
 		double _logpr;
@@ -152,22 +195,20 @@ namespace irstlm {
 		int _bol=0,actualbol=MAX_NGRAM;
 		double _bow=0.0,actualbow=0.0; 
 		double _lastbow=0.0,actuallastbow=0.0; 
-		bool _extendible=false;
-		bool actualextendible=false;
+		bool _extendible=false,actualextendible=false;
 		
-		//		ngram_state_t* maxsuffidx = new ngram_state_t;
+		double_vec_t weight(m_number_lm);
+		set_weight(lm_weights,weight);
 		
-		for (size_t i=0; i<m_lm.size(); i++) {
-			
-			if (m_weight[i]>0.0){
+		for (size_t i=0; i<m_number_lm; i++) {
+			if (weight[i]>0.0){
 				ngram _ng(m_lm[i]->getDict());
 				_ng.trans(ng);
-				//				_logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,&_extendible);				
-				_logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffidx,&_maxsuffptr,&_statesize,&_extendible, lastbow);
+				_logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,&_extendible,&_lastbow);
 				
 				IFVERBOSE(3){
 					//cerr.precision(10);
-					VERBOSE(3," LM " << i << " weight:" << m_weight[i] << std::endl);
+					VERBOSE(3," LM " << i << " weight:" << weight[i] << std::endl);
 					VERBOSE(3," LM " << i << " log10 logpr:" << _logpr<< std::endl);
 					VERBOSE(3," LM " << i << " pr:" << pow(10.0,_logpr) << std::endl);
 					VERBOSE(3," _statesize:" << _statesize << std::endl);
@@ -187,6 +228,91 @@ namespace irstlm {
 				 //What is the lastbow of a LM interpolation? The weighted sum of the lastbow of the submodels
 				 */
 				
+				pr+=weight[i]*pow(10.0,_logpr);
+				actualbow+=weight[i]*pow(10.0,_bow);
+				
+				if(_statesize > actualstatesize || i == 0) {
+					actualmaxsuffptr = _maxsuffptr;
+					actualmaxsuffidx = _maxsuffidx;
+					actualstatesize = _statesize;
+				}
+				if (_bol < actualbol) {
+					actualbol=_bol; //backoff limit of LM[i]
+				}
+				if (_extendible) {
+					actualextendible=true; //set extendible flag to true if the ngram is extendible for any LM
+				}
+				if (_lastbow < actuallastbow) {
+					actuallastbow=_lastbow; //keep the smallest lastbow seen so far
+				}
+			}
+			else{
+				VERBOSE(3," LM " << i << " weight is zero" << std::endl);
+			}
+		}
+		if (bol) *bol=actualbol;
+		if (bow) *bow=log(actualbow);
+		if (maxsuffptr) *maxsuffptr=actualmaxsuffptr;
+		if (maxsuffidx) *maxsuffidx=actualmaxsuffidx;
+		if (statesize) *statesize=actualstatesize;
+		if (extendible) *extendible=actualextendible;
+		if (lastbow) *bol=actuallastbow;
+		
+		if (statesize) VERBOSE(3, " statesize:" << *statesize << std::endl);
+		if (bow) VERBOSE(3, " bow:" << *bow << std::endl);
+		if (bol) VERBOSE(3, " bol:" << *bol << std::endl);
+		if (lastbow) VERBOSE(3, " lastbow:" << *lastbow << std::endl);
+		
+		return log10(pr);
+	}
+	
+	//return log10 prob of an ngram
+	double lmInterpolation::clprob(ngram ng, double* bow,int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow)
+	{
+		VERBOSE(1,"double lmInterpolation::clprob(ngram ng, ...)"  << std::endl);
+		
+		double pr=0.0;
+		double _logpr;
+		
+		char* _maxsuffptr=NULL,*actualmaxsuffptr=NULL;
+		ngram_state_t _maxsuffidx=NULL,actualmaxsuffidx=NULL;
+		unsigned int _statesize=0,actualstatesize=0;
+		int _bol=0,actualbol=MAX_NGRAM;
+		double _bow=0.0,actualbow=0.0; 
+		double _lastbow=0.0,actuallastbow=0.0; 
+		bool _extendible=false,actualextendible=false;
+		
+		for (size_t i=0; i<m_number_lm; i++) {
+			
+			if (m_weight[i]>0.0){
+				ngram _ng(m_lm[i]->getDict());
+				_ng.trans(ng);			
+				_logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffidx,&_maxsuffptr,&_statesize,&_extendible,&_lastbow);
+				
+				IFVERBOSE(3){
+					//cerr.precision(10);
+					VERBOSE(3," LM " << i << " weight:" << m_weight[i] << std::endl);
+					VERBOSE(3," LM " << i << " log10 logpr:" << _logpr<< std::endl);
+					VERBOSE(3," LM " << i << " pr:" << pow(10.0,_logpr) << std::endl);
+					VERBOSE(3," LM " << i << " msp:" << (void*) _maxsuffptr << std::endl);
+					VERBOSE(3," LM " << i << " msidx:" << _maxsuffidx << std::endl);
+					VERBOSE(3," LM " << i << " statesize:" << _statesize << std::endl);
+					VERBOSE(3," LM " << i << " bow:" << _bow << std::endl);
+					VERBOSE(3," LM " << i << " bol:" << _bol << std::endl);
+					VERBOSE(3," LM " << i << " lastbow:" << _lastbow << std::endl);
+				}
+				
+				/*
+				 //TO CHECK the following claims
+				 //What is the statesize of a LM interpolation? The largest _statesize among the submodels
+				 //What is the maxsuffptr of a LM interpolation? The _maxsuffptr of the submodel with the largest _statesize
+				 //What is the bol of a LM interpolation? The smallest _bol among the submodels
+				 //What is the bow of a LM interpolation? The weighted sum of the bow of the submodels
+				 //What is the prob of a LM interpolation? The weighted sum of the prob of the submodels
+				 //What is the extendible flag of a LM interpolation? true if the extendible flag is one for any LM
+				 //What is the lastbow of a LM interpolation? The weighted sum of the lastbow of the submodels
+				 */
+				
 				pr+=m_weight[i]*pow(10.0,_logpr);
 				actualbow+=m_weight[i]*pow(10.0,_bow);
 				
@@ -214,6 +340,8 @@ namespace irstlm {
 		if (extendible) *extendible=actualextendible;
 		if (lastbow) *bol=actuallastbow;
 		
+		if (maxsuffptr) VERBOSE(3, " msp:" << (void*) *maxsuffptr << std::endl);
+		if (maxsuffidx) VERBOSE(3, " msidx:" << *maxsuffidx << std::endl);
 		if (statesize) VERBOSE(3, " statesize:" << *statesize << std::endl);
 		if (bow) VERBOSE(3, " bow:" << *bow << std::endl);
 		if (bol) VERBOSE(3, " bol:" << *bol << std::endl);
@@ -228,9 +356,7 @@ namespace irstlm {
 		char *maxsuffptr=NULL;
 		unsigned int _statesize=0,actualstatesize=0;
 		
-		//		ngram_state_t* maxsuffidx = new ngram_state_t;
-		
-		for (size_t i=0; i<m_lm.size(); i++) {
+		for (size_t i=0; i<m_number_lm; i++) {
 			
 			if (m_weight[i]>0.0){
 				ngram _ng(m_lm[i]->getDict());
@@ -262,15 +388,13 @@ namespace irstlm {
 		
 		return maxsuffptr;
 	}
-
+	
 	ngram_state_t lmInterpolation::cmaxsuffidx(ngram ng, unsigned int* statesize)
 	{
 		ngram_state_t maxsuffidx=0;
 		unsigned int _statesize=0,actualstatesize=0;
 		
-		//		ngram_state_t* maxsuffidx = new ngram_state_t;
-		
-		for (size_t i=0; i<m_lm.size(); i++) {
+		for (size_t i=0; i<m_number_lm; i++) {
 			
 			if (m_weight[i]>0.0){
 				ngram _ng(m_lm[i]->getDict());
@@ -321,5 +445,18 @@ namespace irstlm {
 		logOOVpenalty=log10(OOVpenalty);
 		return logOOVpenalty;
 	}
+	
+	
+	void lmInterpolation::set_weight(const topic_map_t& map, double_vec_t& weight){
+		VERBOSE(4,"void lmInterpolation::set_weight" << std::endl);
+		VERBOSE(4,"map.size:" << map.size() << std::endl);
+		for (topic_map_t::const_iterator it=map.begin(); it!=map.end();++it){
+			if (m_idx.find(it->first) == m_idx.end()){
+				exit_error(IRSTLM_ERROR_DATA, "void lmInterpolation::set_weight(const topic_map_t& map, double_vec_t& weight) ERROR: you are setting the weight of a LM which is not included in the interpolated LM");
+			}
+			weight[m_idx[it->first]] = it->second;                  
+			VERBOSE(4,"it->first:|" << it->first << "| it->second:|" << it->second << "| m_idx[it->first]:|" << m_idx[it->first] << "| weight[m_idx[it->first]]:|" <<weight[m_idx[it->first]] << "|" << std::endl);
+		}
+	}
 }//namespace irstlm
 
diff --git a/src/lmInterpolation.h b/src/lmInterpolation.h
index 2f12265..6d5519e 100644
--- a/src/lmInterpolation.h
+++ b/src/lmInterpolation.h
@@ -45,11 +45,12 @@ namespace irstlm {
 	class lmInterpolation: public lmContainer
 	{
 		static const bool debug=true;
-		int m_number_lm;
+		size_t m_number_lm;
 		int order;
 		int dictionary_upperbound; //set by user
 		double  logOOVpenalty; //penalty for OOV words (default 0)
-		bool      isInverted;
+		bool isInverted;
+		bool m_map_flag; //flag for the presence of a map between name and lm
 		int memmap;  //level from which n-grams are accessed via mmap
 		
 		std::vector<double> m_weight;
@@ -57,13 +58,18 @@ namespace irstlm {
 		std::vector<bool> m_isinverted;
 		std::vector<lmContainer*> m_lm;
 		
-		int               maxlev; //maximun order of sub LMs;
+		int maxlev; //maximum order of sub LMs;
+		
+		std::map< std::string, size_t > m_idx;
+		std::map< size_t, std::string > m_name;
 		
 		float ngramcache_load_factor;
 		float dictionary_load_factor;
 		
 		dictionary *dict; // dictionary for all interpolated LMs
 		
+		void set_weight(const topic_map_t& map, double_vec_t& weight);
+
 	public:
 		
 		lmInterpolation(float nlf=0.0, float dlfi=0.0);
@@ -73,6 +79,7 @@ namespace irstlm {
 		lmContainer* load_lm(int i, int memmap, float nlf, float dlf);
 		
 		virtual double clprob(ngram ng, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
+		virtual double clprob(ngram ng, topic_map_t& lm_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL);
 		
 		virtual const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);
 		virtual ngram_state_t cmaxsuffidx(ngram ong, unsigned int* size=NULL);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git



More information about the debian-science-commits mailing list