[irstlm] 05/78: added functions to only compute the lm state based on the index of ngram

Giulio Paci giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:47:00 UTC 2016


This is an automated email from the git hooks/post-receive script.

giuliopaci-guest pushed a commit to tag adaptiveLM.v0.10
in repository irstlm.

commit bb2622733084dd468ce652c639489cf0052cf36d
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date:   Fri Nov 6 14:18:06 2015 +0100

    added functions to only compute the lm state based on the index of ngram
---
 src/lmContainer.h |  19 ++++-
 src/lmmacro.cpp   |  89 +++++++++++++++++------
 src/lmmacro.h     |   4 +-
 src/lmtable.cpp   | 209 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 src/lmtable.h     |   3 +
 5 files changed, 291 insertions(+), 33 deletions(-)

diff --git a/src/lmContainer.h b/src/lmContainer.h
index ebf7fa1..d450dd4 100644
--- a/src/lmContainer.h
+++ b/src/lmContainer.h
@@ -144,14 +144,14 @@ public:
     return 0.0;
   };
 
-
+	
   virtual const char *cmaxsuffptr(ngram ng, unsigned int* statesize=NULL)
   {
     UNUSED(ng);
     UNUSED(statesize);
     return NULL;
   }
-
+	
   virtual const char *cmaxsuffptr(int* ng, int ngsize, unsigned int* statesize=NULL)
   {
     UNUSED(ng);
@@ -159,6 +159,21 @@ public:
     UNUSED(statesize);
     return NULL;
   }
+	
+  ngram_state_t cmaxsuffidx(ngram ng, unsigned int* statesize=NULL) //index-based counterpart of cmaxsuffptr(ngram, ...)
+  {
+    UNUSED(ng);         //default stub: the n-gram is not inspected
+    UNUSED(statesize);  //*statesize is left untouched
+    return 0;           //0 = zero-gram state (same value lmtable::maxsuffidx returns); NULL is a pointer constant, not an ngram_state_t value
+  }
+
+  ngram_state_t cmaxsuffidx(int* ng, int ngsize, unsigned int* statesize=NULL) //index-based counterpart of cmaxsuffptr(int*, int, ...)
+  {
+    UNUSED(ng);         //default stub: the code array is not inspected
+    UNUSED(ngsize);
+    UNUSED(statesize);  //*statesize is left untouched
+    return 0;           //0 = zero-gram state (same value lmtable::maxsuffidx returns); NULL is a pointer constant, not an ngram_state_t value
+  }
 
   virtual inline int get(ngram& ng) {
     UNUSED(ng);
diff --git a/src/lmmacro.cpp b/src/lmmacro.cpp
index 8edcf09..66c7063 100644
--- a/src/lmmacro.cpp
+++ b/src/lmmacro.cpp
@@ -543,31 +543,78 @@ const char *lmmacro::maxsuffptr(ngram micro_ng, unsigned int* size)
 
   return lmtable::maxsuffptr(macro_ng,size);
 }
-
+	
 const char *lmmacro::cmaxsuffptr(ngram micro_ng, unsigned int* size)
 {
-  //cerr << "lmmacro::CMAXsuffptr\n";
-  //cerr << "micro_ng: " << micro_ng
-  //	<< " -> micro_ng.size: " << micro_ng.size << "\n";
-
-  //the LM working on the selected field = 0
-  //contributes to the LM state
-  //  if (selectedField>0)    return NULL;
-
-  ngram macro_ng(lmtable::getDict());
-
-  if (micro_ng.dict ==  macro_ng.dict)
-    macro_ng.trans(micro_ng);  // micro to macro mapping already done
-  else
-    map(&micro_ng, &macro_ng); // mapping required
-
-  VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
-          <<  "lmmacro::lprob: macro_ng = " << macro_ng << "\n")
-
-  return lmtable::cmaxsuffptr(macro_ng,size);
-
+		//cerr << "lmmacro::CMAXsuffptr\n";
+		//cerr << "micro_ng: " << micro_ng
+		//	<< " -> micro_ng.size: " << micro_ng.size << "\n";
+		
+		//the LM working on the selected field = 0
+		//contributes to the LM state
+		//  if (selectedField>0)    return NULL;
+		
+		ngram macro_ng(lmtable::getDict());
+		
+		if (micro_ng.dict ==  macro_ng.dict)
+			macro_ng.trans(micro_ng);  // micro to macro mapping already done
+		else
+			map(&micro_ng, &macro_ng); // mapping required
+		
+		VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
+						<<  "lmmacro::lprob: macro_ng = " << macro_ng << "\n")
+		
+		return lmtable::cmaxsuffptr(macro_ng,size);
+		
 }
+	
+ngram_state_t lmmacro::maxsuffidx(ngram micro_ng, unsigned int* size)
+{
+		//Index-based counterpart of lmmacro::maxsuffptr():
+		//maps the incoming n-gram onto the dictionary of the underlying
+		//lmtable and returns the state index computed by lmtable::maxsuffidx().
+		//
+		//micro_ng: n-gram to look up (taken by value)
+		//size:     optional output, forwarded unchanged to lmtable::maxsuffidx()
+		//returns:  the ngram_state_t produced by lmtable::maxsuffidx()
+		
+		ngram macro_ng(lmtable::getDict());
+		
+		if (micro_ng.dict ==  macro_ng.dict)
+			macro_ng.trans(micro_ng);  // micro to macro mapping already done
+		else
+			map(&micro_ng, &macro_ng); // mapping required
+		
+		VERBOSE(2,"lmmacro::maxsuffidx: micro_ng = " << micro_ng << "\n"
+						<<  "lmmacro::maxsuffidx: macro_ng = " << macro_ng << "\n")
+		
+		//uncached lookup, mirroring maxsuffptr(); the cached path is cmaxsuffidx()
+		return lmtable::maxsuffidx(macro_ng,size);
+}
 
+ngram_state_t lmmacro::cmaxsuffidx(ngram micro_ng, unsigned int* size)
+{
+		//Index-based counterpart of lmmacro::cmaxsuffptr():
+		//maps the incoming n-gram onto the dictionary of the underlying
+		//lmtable and returns the state index computed by lmtable::cmaxsuffidx().
+		//
+		//micro_ng: n-gram to look up (taken by value)
+		//size:     optional output, forwarded unchanged to lmtable::cmaxsuffidx()
+		//returns:  the ngram_state_t produced by lmtable::cmaxsuffidx()
+		
+		ngram macro_ng(lmtable::getDict());
+		
+		if (micro_ng.dict ==  macro_ng.dict)
+			macro_ng.trans(micro_ng);  // micro to macro mapping already done
+		else
+			map(&micro_ng, &macro_ng); // mapping required
+		
+		VERBOSE(2,"lmmacro::cmaxsuffidx: micro_ng = " << micro_ng << "\n"
+						<<  "lmmacro::cmaxsuffidx: macro_ng = " << macro_ng << "\n")
+		
+		//delegate to the lmtable variant that consults prob_and_state_cache when PS_CACHE_ENABLE is defined
+		return lmtable::cmaxsuffidx(macro_ng,size);
+}
 
 void lmmacro::map(ngram *in, ngram *out)
 {
diff --git a/src/lmmacro.h b/src/lmmacro.h
index fc05b5f..d8deed9 100644
--- a/src/lmmacro.h
+++ b/src/lmmacro.h
@@ -84,7 +84,9 @@ public:
 
   const char *maxsuffptr(ngram ong, unsigned int* size=NULL);
   const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);
-
+  ngram_state_t maxsuffidx(ngram ong, unsigned int* size=NULL);  //index-based counterpart of maxsuffptr(): returns an ngram_state_t instead of a pointer
+  ngram_state_t cmaxsuffidx(ngram ong, unsigned int* size=NULL); //index-based counterpart of cmaxsuffptr(): returns an ngram_state_t instead of a pointer
+	
   void map(ngram *in, ngram *out);
   void One2OneMapping(ngram *in, ngram *out);
   void Micro2MacroMapping(ngram *in, ngram *out);
diff --git a/src/lmtable.cpp b/src/lmtable.cpp
index 7ae6e03..5f9a3dc 100644
--- a/src/lmtable.cpp
+++ b/src/lmtable.cpp
@@ -2301,7 +2301,6 @@ namespace irstlm {
 		}
 	}
 	
-	
 	const char *lmtable::cmaxsuffptr(ngram ong, unsigned int* size)
 	{
 		VERBOSE(3,"const char *lmtable::maxsuffptr(ngram ong, unsigned int* size) ong:|" << ong  << "|\n");
@@ -2349,7 +2348,6 @@ namespace irstlm {
 #endif
 	}
 	
-	
 	//this function simulates the cmaxsuffptr(ngram, ...) but it takes as input an array of codes instead of the ngram
 	const char *lmtable::cmaxsuffptr(int* codes, int sz, unsigned int* size)
 	{
@@ -2409,8 +2407,197 @@ namespace irstlm {
 		return maxsuffptr(ong,size);
 #endif
 	}
+
 	
+	//Index-based, non-recursive counterpart of maxsuffptr(): returns the state of ong as an ngram_state_t instead of a pointer.
+	//Returns 0 for the zero-gram, otherwise (byte offset of the node from table[level]) / nodesize + tb_offset[level] + 1; *size (optional) receives the reported state size.
+	ngram_state_t lmtable::maxsuffidx(ngram ong, unsigned int* size)
+	{
+		//trace the call at verbosity level 3
+		VERBOSE(3,"ngram_state_t lmtable::maxsuffidx(ngram ong, unsigned int* size)\n");
+		
+		if (ong.size==0) {
+			if (size!=NULL) *size=0;
+			return 0;
+		}
+		
+		
+		if (isInverted) {
+			if (ong.size>maxlev) ong.size=maxlev; //clamp the n-gram to the table depth (maxlev)
+			ngram ing=ong; //copy that will hold the inverted n-gram
+			
+			ing.invert(ong);
+			
+			get(ing,ing.size,ing.size); // look the inverted n-gram up in the trie
+			if (ing.lev > 0) { //ing.lev > 0: something was found
+				size_t isize = MIN(ing.lev,(ing.size-1)); //state level: the level reached, capped at ing.size-1
+				if (size!=NULL)  *size=isize;
+				//the pointer version returned ing.path[isize]; here that node is converted into an index
+				
+				int ndsz=nodesize(tbltype[isize]);
+				ngram_state_t msidx = (ngram_state_t) ( ((table_pos_t) (ing.path[isize]) - (table_pos_t) table[isize] ) / ndsz ) + tb_offset[isize] + 1; //+1 keeps every real state distinct from 0, the zero-gram value
+				VERBOSE(3,"ngram_state_t lmtable::maxsuffidx(ngram ong, unsigned int* size) ing:|" << ing << "| isize:|" << isize << "| ing.path[isize]:|" << ing.path[isize] << "| tb_offset[isize]:|" << tb_offset[isize] << "| msidx:|" << msidx << "|" << std::endl);
+				return msidx;
+			} else { // nothing found: a real unknown word
+				if (size!=NULL)  *size=0;     //state size of the zero-gram
+				//the pointer version returned NULL here
+				return 0; //state value of the zero-gram
+			}
+		} else {
+			if (ong.size>0) ong.size--; //the state never covers the full n-gram: shrink it by one word
+			
+			if (ong.size>=maxlev) ong.size=maxlev-1; //clamp to at most maxlev-1 words
+			
+			if (size!=NULL) *size=ong.size; //provisional value; overwritten on a match and when nothing is found
+			
+			
+			for (ngram ng=ong; ng.size>0; ng.size--) {
+				if (get(ng,ng.size,ng.size)) {
+					//the first (longest) n-gram for which get() succeeds determines the state
+					//the reported size is one less when ng.succ==0 (presumably: the entry has no successors - TODO confirm)
+					size_t isize=ng.size;
+					if (size!=NULL)
+					{
+						if (ng.succ==0) *size=isize-1;
+						else *size=isize;
+					}
+					//the pointer version returned ng.link; a null ng.link leaves msidx at 0
+
+					int ndsz=nodesize(tbltype[isize]);
+					ngram_state_t msidx = 0;
+					if (ng.link){
+						msidx = (ngram_state_t) ( ((table_pos_t) (ng.link) - (table_pos_t) table[isize]) / ndsz ) + tb_offset[isize] + 1; //+1 keeps every real state distinct from 0, the zero-gram value
+					}
+					
+					VERBOSE(3,"ngram_state_t lmtable::maxsuffidx(ngram ong, unsigned int* size) ng:|" << ng << "| isize:|" << isize << "| tb_offset[isize]:|" << tb_offset[isize] << "| msidx:|" << msidx << "|" << std::endl);
+					return msidx;
+				}
+			}
+			if (size!=NULL) *size=0;
+			return 0;
+		}
+	}
+
+	//Cached, index-based counterpart of cmaxsuffptr(ngram, ...): returns the state index of ong; *size (optional, may be NULL) receives the state size.
+	ngram_state_t lmtable::cmaxsuffidx(ngram ong, unsigned int* size)
+	{
+		//trace the call and its argument at verbosity level 3
+		VERBOSE(3,"ngram_state_t lmtable::cmaxsuffidx(ngram ong, unsigned int* size) ong:|" << ong  << "|\n");
+		
+		if (ong.size==0) {
+			if (size!=NULL) *size=0;
+			//0 is the state index of the zero-gram
+			return 0;
+		}
+		
+		if (size!=NULL) *size=ong.size; //will return the largest found ong.size
+		
+#ifdef PS_CACHE_ENABLE
+		prob_and_state_t pst;
+		
+		size_t orisize=ong.size;
+		if (ong.size>=maxlev) ong.size=maxlev;
+		
+		//cache hit
+		//the cache is selected by n-gram length; lengths above maxlev use the maxlev cache
+		if (prob_and_state_cache[ong.size] && prob_and_state_cache[ong.size]->get(ong.wordp(ong.size),pst)) {
+			if (size!=NULL) *size=pst.statesize; //size is optional (defaults to NULL): never write through a NULL pointer
+			//the cached entry already holds the state index
+			return pst.ngramstate;
+		}
+		ong.size = orisize; //undo the clamping before the real lookup
+		
+		//cache miss
+		unsigned int isize; //internal state size variable
+		//isize always receives the state size, even when the caller passed no size pointer
+		ngram_state_t msidx = maxsuffidx(ong,&isize);
+		
+		//cache insert
+		//IMPORTANT: this function updates only two fields (ngramstate, statesize) of the entry of the cache; the remaining fields (logpr, bow, bol, extendible) are undefined; hence, it should not be used before the corresponding clprob()
+		
+		if (ong.size>=maxlev) ong.size=maxlev;
+		//store the result in the cache of this (clamped) n-gram length, if that cache exists
+		if (prob_and_state_cache[ong.size]) {
+			//only the index-based state and its size are filled in
+			pst.ngramstate=msidx;
+			pst.statesize=isize;
+			//key: the last ong.size words of the n-gram
+			prob_and_state_cache[ong.size]->add(ong.wordp(ong.size),pst);
+		}
+		if (size!=NULL) *size=isize;
+		//hand back the freshly computed state index
+		return msidx;
+#else
+		//no cache compiled in: plain lookup
+		return maxsuffidx(ong,size);
+#endif
+	}
 	
+	//Same as cmaxsuffidx(ngram, ...) but takes an array of sz word codes instead of an ngram; *size (optional, may be NULL) receives the state size.
+	ngram_state_t lmtable::cmaxsuffidx(int* codes, int sz, unsigned int* size)
+	{
+		VERBOSE(3,"ngram_state_t lmtable::cmaxsuffidx(int* codes, int sz, unsigned int* size)\n");
+		
+		if (sz<=0) {
+			if (size!=NULL) *size=0;
+			//empty (or invalid, negative) length: zero-gram state; a negative sz must never be used to index prob_and_state_cache
+			return 0;
+		}
+		
+		if (sz>maxlev) sz=maxlev; //adjust n-gram level to table size
+		
+#ifdef PS_CACHE_ENABLE
+		//entry used both for the cache lookup and for the cache insert
+		prob_and_state_t pst;
+		
+		//cache hit
+		//the cache is selected by the (clamped) number of codes
+		if (prob_and_state_cache[sz] && prob_and_state_cache[sz]->get(codes,pst)) {
+			if (size) *size = pst.statesize;
+			//the cached entry already holds the state index
+			return pst.ngramstate;
+		}
+		
+		//create the actual ngram
+		ngram ong(dict);
+		ong.pushc(codes,sz);
+		MY_ASSERT (ong.size == sz);
+		
+		//cache miss
+		unsigned int isize; //internal state size variable
+		//isize always receives the state size, even when the caller passed no size pointer
+		ngram_state_t msidx = maxsuffidx(ong,&isize);
+		
+		//cache insert
+		//IMPORTANT: this function updates only two fields (ngramstate, statesize) of the entry of the cache; the remaining fields (logpr, bow, bol, extendible) are undefined; hence, it should not be used before the corresponding clprob()
+		if (ong.size>=maxlev) ong.size=maxlev;
+		//store the result in the cache of this n-gram length, if that cache exists
+		if (prob_and_state_cache[sz]) {
+			//only the index-based state and its size are filled in
+			pst.ngramstate=msidx;
+			pst.statesize=isize;
+			//key: the last ong.size words of the n-gram
+			prob_and_state_cache[sz]->add(ong.wordp(ong.size),pst);
+		}
+		if (size!=NULL) *size=isize;
+		//hand back the freshly computed state index
+		return msidx;
+#else
+		//create the actual ngram
+		ngram ong(dict);
+		ong.pushc(codes,sz);
+		MY_ASSERT (ong.size == sz);
+		/*
+		 No cache is compiled in on this path:
+		 the n-gram built above is handed straight to
+		 maxsuffidx(), which also fills *size when the
+		 caller supplied a size pointer; nothing is
+		 stored in prob_and_state_cache.
+		 */
+		//plain, uncached lookup
+		return maxsuffidx(ong,size);
+#endif
+	}
 	
 	//returns log10prob of n-gram
 	//bow: backoff weight
@@ -2429,7 +2616,11 @@ namespace irstlm {
 		VERBOSE(3," lmtable::lprob(ngram) ong |" << ong  << "|\n" << std::endl);
 		VERBOSE(3," lmtable::lprob(ngram) ong.size |" << ong.size  << "|\n" << std::endl);
 		
-		if (ong.size==0) return 0.0; //sanity check
+		if (ong.size==0){ //sanity check
+			if (maxsuffptr) *maxsuffptr=NULL;
+			if (maxsuffidx) *maxsuffidx=0;
+			return 0.0;
+		}
 		if (ong.size>maxlev) ong.size=maxlev; //adjust n-gram level to table size
 		
 		if (bow) *bow=0; //initialize back-off weight
@@ -2452,16 +2643,16 @@ namespace irstlm {
 				iprob=ing.prob;
 				lpr = (double)(isQtable?Pcenters[ing.lev][(qfloat_t)iprob]:iprob);
 				if (*ong.wordp(1)==dict->oovcode()) lpr-=logOOVpenalty; //add OOV penalty
-				size_t _level=MIN(ing.lev,(ing.size-1));
+				size_t isize=MIN(ing.lev,(ing.size-1));
 //				if (statesize)  *statesize=MIN(ing.lev,(ing.size-1)); //find largest n-1 gram suffix
 //				if (maxsuffptr) *maxsuffptr=ing.path[MIN(ing.lev,(ing.size-1))];
-				if (statesize)  *statesize=_level; //find largest n-1 gram suffix
-				if (maxsuffptr) *maxsuffptr=ing.path[_level];
+				if (statesize)  *statesize=isize; //find largest n-1 gram suffix
+				if (maxsuffptr) *maxsuffptr=ing.path[isize];
 				
 				if (maxsuffidx){
-					int ndsz=nodesize(tbltype[_level]);
-					*maxsuffidx = (ngram_state_t) ( ((table_pos_t) (ing.path[_level]) - (table_pos_t) table[_level] ) / ndsz ) + tb_offset[_level] + 1; //added 1 to distinguish from zero-ngram
-					VERBOSE(3,"lmtable::lprob(ngram) ing:|" << ing << "| _level:|" << _level << "| ing.path[_level]:|" << ing.path[_level] << "| tb_offset[_level]:|" << tb_offset[_level] << "| *maxsuffidx:|" << *maxsuffidx << "|" << std::endl);
+					int ndsz=nodesize(tbltype[isize]);
+					*maxsuffidx = (ngram_state_t) ( ((table_pos_t) (ing.path[isize]) - (table_pos_t) table[isize] ) / ndsz ) + tb_offset[isize] + 1; //added 1 to distinguish from zero-ngram
+					VERBOSE(3,"lmtable::lprob(ngram) ing:|" << ing << "| isize:|" << isize << "| ing.path[isize]:|" << ing.path[isize] << "| tb_offset[_level]:|" << tb_offset[isize] << "| *maxsuffidx:|" << *maxsuffidx << "|" << std::endl);
 				}
 				if (extendible) *extendible=succrange(ing.path[ing.lev],ing.lev)>0;
 				if (lastbow) *lastbow=(double) (isQtable?Bcenters[ing.lev][(qfloat_t)ing.bow]:ing.bow);
diff --git a/src/lmtable.h b/src/lmtable.h
index ee7b9b2..b27495f 100644
--- a/src/lmtable.h
+++ b/src/lmtable.h
@@ -350,6 +350,9 @@ public:
 	virtual const char *maxsuffptr(ngram ong, unsigned int* size=NULL);
 	virtual const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);
   virtual const char *cmaxsuffptr(int* codes, int sz, unsigned int* size=NULL);
+	ngram_state_t maxsuffidx(ngram ong, unsigned int* size=NULL); //index-based counterpart of maxsuffptr(): returns an ngram_state_t (0 for the zero-gram) instead of a pointer
+  ngram_state_t cmaxsuffidx(ngram ong, unsigned int* size=NULL); //as maxsuffidx(), going through prob_and_state_cache when PS_CACHE_ENABLE is defined
+  ngram_state_t cmaxsuffidx(int* codes, int sz, unsigned int* size=NULL); //same as cmaxsuffidx(ngram, ...) but takes an array of sz word codes
 	
 	inline void putmem(char* ptr,int value,int offs,int size) {
 		MY_ASSERT(ptr!=NULL);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git



More information about the debian-science-commits mailing list