[irstlm] 05/78: added functions to only compute the lm state based on the index of ngram
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:47:00 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to tag adaptiveLM.v0.10
in repository irstlm.
commit bb2622733084dd468ce652c639489cf0052cf36d
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Fri Nov 6 14:18:06 2015 +0100
added functions to only compute the lm state based on the index of ngram
---
src/lmContainer.h | 19 ++++-
src/lmmacro.cpp | 89 +++++++++++++++++------
src/lmmacro.h | 4 +-
src/lmtable.cpp | 209 +++++++++++++++++++++++++++++++++++++++++++++++++++---
src/lmtable.h | 3 +
5 files changed, 291 insertions(+), 33 deletions(-)
diff --git a/src/lmContainer.h b/src/lmContainer.h
index ebf7fa1..d450dd4 100644
--- a/src/lmContainer.h
+++ b/src/lmContainer.h
@@ -144,14 +144,14 @@ public:
return 0.0;
};
-
+
virtual const char *cmaxsuffptr(ngram ng, unsigned int* statesize=NULL)
{
UNUSED(ng);
UNUSED(statesize);
return NULL;
}
-
+
virtual const char *cmaxsuffptr(int* ng, int ngsize, unsigned int* statesize=NULL)
{
UNUSED(ng);
@@ -159,6 +159,21 @@ public:
UNUSED(statesize);
return NULL;
}
+
+ ngram_state_t cmaxsuffidx(ngram ng, unsigned int* statesize=NULL)
+ {
+ UNUSED(ng);
+ UNUSED(statesize);
+ return NULL;
+ }
+
+ ngram_state_t cmaxsuffidx(int* ng, int ngsize, unsigned int* statesize=NULL)
+ {
+ UNUSED(ng);
+ UNUSED(ngsize);
+ UNUSED(statesize);
+ return NULL;
+ }
virtual inline int get(ngram& ng) {
UNUSED(ng);
diff --git a/src/lmmacro.cpp b/src/lmmacro.cpp
index 8edcf09..66c7063 100644
--- a/src/lmmacro.cpp
+++ b/src/lmmacro.cpp
@@ -543,31 +543,78 @@ const char *lmmacro::maxsuffptr(ngram micro_ng, unsigned int* size)
return lmtable::maxsuffptr(macro_ng,size);
}
-
+
const char *lmmacro::cmaxsuffptr(ngram micro_ng, unsigned int* size)
{
- //cerr << "lmmacro::CMAXsuffptr\n";
- //cerr << "micro_ng: " << micro_ng
- // << " -> micro_ng.size: " << micro_ng.size << "\n";
-
- //the LM working on the selected field = 0
- //contributes to the LM state
- // if (selectedField>0) return NULL;
-
- ngram macro_ng(lmtable::getDict());
-
- if (micro_ng.dict == macro_ng.dict)
- macro_ng.trans(micro_ng); // micro to macro mapping already done
- else
- map(&micro_ng, &macro_ng); // mapping required
-
- VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
- << "lmmacro::lprob: macro_ng = " << macro_ng << "\n")
-
- return lmtable::cmaxsuffptr(macro_ng,size);
-
+ //cerr << "lmmacro::CMAXsuffptr\n";
+ //cerr << "micro_ng: " << micro_ng
+ // << " -> micro_ng.size: " << micro_ng.size << "\n";
+
+ //the LM working on the selected field = 0
+ //contributes to the LM state
+ // if (selectedField>0) return NULL;
+
+ ngram macro_ng(lmtable::getDict());
+
+ if (micro_ng.dict == macro_ng.dict)
+ macro_ng.trans(micro_ng); // micro to macro mapping already done
+ else
+ map(&micro_ng, &macro_ng); // mapping required
+
+ VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
+ << "lmmacro::lprob: macro_ng = " << macro_ng << "\n")
+
+ return lmtable::cmaxsuffptr(macro_ng,size);
+
}
+
+ngram_state_t lmmacro::maxsuffidx(ngram micro_ng, unsigned int* size)
+{
+ //cerr << "lmmacro::CMAXsuffptr\n";
+ //cerr << "micro_ng: " << micro_ng
+ // << " -> micro_ng.size: " << micro_ng.size << "\n";
+
+ //the LM working on the selected field = 0
+ //contributes to the LM state
+ // if (selectedField>0) return NULL;
+
+ ngram macro_ng(lmtable::getDict());
+
+ if (micro_ng.dict == macro_ng.dict)
+ macro_ng.trans(micro_ng); // micro to macro mapping already done
+ else
+ map(&micro_ng, &macro_ng); // mapping required
+
+ VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
+ << "lmmacro::lprob: macro_ng = " << macro_ng << "\n")
+
+ return lmtable::cmaxsuffidx(macro_ng,size);
+
+ }
+ngram_state_t lmmacro::cmaxsuffidx(ngram micro_ng, unsigned int* size)
+{
+ //cerr << "lmmacro::CMAXsuffptr\n";
+ //cerr << "micro_ng: " << micro_ng
+ // << " -> micro_ng.size: " << micro_ng.size << "\n";
+
+ //the LM working on the selected field = 0
+ //contributes to the LM state
+ // if (selectedField>0) return NULL;
+
+ ngram macro_ng(lmtable::getDict());
+
+ if (micro_ng.dict == macro_ng.dict)
+ macro_ng.trans(micro_ng); // micro to macro mapping already done
+ else
+ map(&micro_ng, &macro_ng); // mapping required
+
+ VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
+ << "lmmacro::lprob: macro_ng = " << macro_ng << "\n")
+
+ return lmtable::cmaxsuffidx(macro_ng,size);
+
+ }
void lmmacro::map(ngram *in, ngram *out)
{
diff --git a/src/lmmacro.h b/src/lmmacro.h
index fc05b5f..d8deed9 100644
--- a/src/lmmacro.h
+++ b/src/lmmacro.h
@@ -84,7 +84,9 @@ public:
const char *maxsuffptr(ngram ong, unsigned int* size=NULL);
const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);
-
+ ngram_state_t maxsuffidx(ngram ong, unsigned int* size=NULL);
+ ngram_state_t cmaxsuffidx(ngram ong, unsigned int* size=NULL);
+
void map(ngram *in, ngram *out);
void One2OneMapping(ngram *in, ngram *out);
void Micro2MacroMapping(ngram *in, ngram *out);
diff --git a/src/lmtable.cpp b/src/lmtable.cpp
index 7ae6e03..5f9a3dc 100644
--- a/src/lmtable.cpp
+++ b/src/lmtable.cpp
@@ -2301,7 +2301,6 @@ namespace irstlm {
}
}
-
const char *lmtable::cmaxsuffptr(ngram ong, unsigned int* size)
{
VERBOSE(3,"const char *lmtable::maxsuffptr(ngram ong, unsigned int* size) ong:|" << ong << "|\n");
@@ -2349,7 +2348,6 @@ namespace irstlm {
#endif
}
-
//this function simulates the cmaxsuffptr(ngram, ...) but it takes as input an array of codes instead of the ngram
const char *lmtable::cmaxsuffptr(int* codes, int sz, unsigned int* size)
{
@@ -2409,8 +2407,197 @@ namespace irstlm {
return maxsuffptr(ong,size);
#endif
}
+
+ //non recursive version
+ //const char *lmtable::maxsuffptr(ngram ong, unsigned int* size)
+ ngram_state_t lmtable::maxsuffidx(ngram ong, unsigned int* size)
+ {
+// VERBOSE(3,"const char *lmtable::maxsuffptr(ngram ong, unsigned int* size)\n");
+ VERBOSE(3,"ngram_state_t lmtable::maxsuffidx(ngram ong, unsigned int* size)\n");
+
+ if (ong.size==0) {
+ if (size!=NULL) *size=0;
+ return 0;
+ }
+
+
+ if (isInverted) {
+ if (ong.size>maxlev) ong.size=maxlev; //if larger than maxlen reduce size
+ ngram ing=ong; //inverted ngram
+
+ ing.invert(ong);
+
+ get(ing,ing.size,ing.size); // dig in the trie
+ if (ing.lev > 0) { //found something?
+ size_t isize = MIN(ing.lev,(ing.size-1)); //find largest n-1 gram suffix
+ if (size!=NULL) *size=isize;
+// return ing.path[isize];
+
+ int ndsz=nodesize(tbltype[isize]);
+ ngram_state_t msidx = (ngram_state_t) ( ((table_pos_t) (ing.path[isize]) - (table_pos_t) table[isize] ) / ndsz ) + tb_offset[isize] + 1; //added 1 to distinguish from zero-ngram
+ VERBOSE(3,"ngram_state_t lmtable::maxsuffidx(ngram ong, unsigned int* size) ing:|" << ing << "| isize:|" << isize << "| ing.path[isize]:|" << ing.path[isize] << "| tb_offset[isize]:|" << tb_offset[isize] << "| msidx:|" << msidx << "|" << std::endl);
+ return msidx;
+ } else { // means a real unknown word!
+ if (size!=NULL) *size=0; //default statesize for zero-gram!
+// return NULL; //default stateptr for zero-gram!
+ return 0; //default state-value for zero-gram!
+ }
+ } else {
+ if (ong.size>0) ong.size--; //always reduced by 1 word
+
+ if (ong.size>=maxlev) ong.size=maxlev-1; //if still larger or equals to maxlen reduce again
+
+ if (size!=NULL) *size=ong.size; //will return the largest found ong.size
+
+
+ for (ngram ng=ong; ng.size>0; ng.size--) {
+ if (get(ng,ng.size,ng.size)) {
+ // if (ng.succ==0) (*size)--;
+ // if (size!=NULL) *size=ng.size;
+ size_t isize=ng.size;
+ if (size!=NULL)
+ {
+ if (ng.succ==0) *size=isize-1;
+ else *size=isize;
+ }
+// return ng.link;
+
+ int ndsz=nodesize(tbltype[isize]);
+ ngram_state_t msidx = 0;
+ if (ng.link){
+ msidx = (ngram_state_t) ( ((table_pos_t) (ng.link) - (table_pos_t) table[isize]) / ndsz ) + tb_offset[isize] + 1; //added 1 to distinguish from zero-ngram
+ }
+
+ VERBOSE(3,"ngram_state_t lmtable::maxsuffidx(ngram ong, unsigned int* size) ng:|" << ng << "| isize:|" << isize << "| tb_offset[isize]:|" << tb_offset[isize] << "| msidx:|" << msidx << "|" << std::endl);
+ return msidx;
+ }
+ }
+ if (size!=NULL) *size=0;
+ return 0;
+ }
+ }
+
+ //const char *lmtable::cmaxsuffptr(ngram ong, unsigned int* size)
+ ngram_state_t lmtable::cmaxsuffidx(ngram ong, unsigned int* size)
+ {
+// VERBOSE(3,"const char *lmtable::cmaxsuffptr(ngram ong, unsigned int* size) ong:|" << ong << "|\n");
+ VERBOSE(3,"ngram_state_t lmtable::cmaxsuffidx(ngram ong, unsigned int* size) ong:|" << ong << "|\n");
+
+ if (ong.size==0) {
+ if (size!=NULL) *size=0;
+// return (char*) NULL;
+ return 0;
+ }
+
+ if (size!=NULL) *size=ong.size; //will return the largest found ong.size
+
+#ifdef PS_CACHE_ENABLE
+ prob_and_state_t pst;
+
+ size_t orisize=ong.size;
+ if (ong.size>=maxlev) ong.size=maxlev;
+
+ //cache hit
+ // if (prob_and_state_cache && ong.size==maxlev && prob_and_state_cache->get(ong.wordp(maxlev),pst)) {
+ if (prob_and_state_cache[ong.size] && prob_and_state_cache[ong.size]->get(ong.wordp(ong.size),pst)) {
+ *size=pst.statesize;
+// return pst.state;
+ return pst.ngramstate;
+ }
+ ong.size = orisize;
+
+ //cache miss
+ unsigned int isize; //internal state size variable
+// char* found=(char *)maxsuffptr(ong,&isize);
+ ngram_state_t msidx = maxsuffidx(ong,&isize);
+
+ //cache insert
+ //IMPORTANT: this function updates only two fields (ngramstate, statesize) of the entry of the cache; the reminaing fields (logpr, bow, bol, extendible) are undefined; hence, it should not be used before the corresponding clprob()
+
+ if (ong.size>=maxlev) ong.size=maxlev;
+ // if (prob_and_state_cache && ong.size==maxlev) {
+ if (prob_and_state_cache[ong.size]) {
+// pst.state=found;
+ pst.ngramstate=msidx;
+ pst.statesize=isize;
+ // prob_and_state_cache->add(ong.wordp(maxlev),pst);
+ prob_and_state_cache[ong.size]->add(ong.wordp(ong.size),pst);
+ }
+ if (size!=NULL) *size=isize;
+// return found;
+ return msidx;
+#else
+// return (char *)maxsuffptr(ong,size);
+ return maxsuffidx(ong,size);
+#endif
+ }
+ //this function simulates the cmaxsuffptr(ngram, ...) but it takes as input an array of codes instead of the ngram
+ ngram_state_t lmtable::cmaxsuffidx(int* codes, int sz, unsigned int* size)
+ {
+ VERBOSE(3,"const char *lmtable::cmaxsuffptr(int* codes, int sz, unsigned int* size)\n");
+
+ if (sz==0) {
+ if (size!=NULL) *size=0;
+// return (char*) NULL;
+ return 0;
+ }
+
+ if (sz>maxlev) sz=maxlev; //adjust n-gram level to table size
+
+#ifdef PS_CACHE_ENABLE
+ //cache hit
+ prob_and_state_t pst;
+
+ //cache hit
+ // if (prob_and_state_cache && sz==maxlev && prob_and_state_cache->get(codes,pst)) {
+ if (prob_and_state_cache[sz] && prob_and_state_cache[sz]->get(codes,pst)) {
+ if (size) *size = pst.statesize;
+// return pst.state;
+ return pst.ngramstate;
+ }
+
+ //create the actual ngram
+ ngram ong(dict);
+ ong.pushc(codes,sz);
+ MY_ASSERT (ong.size == sz);
+
+ //cache miss
+ unsigned int isize; //internal state size variable
+ //char* found=(char *)maxsuffptr(ong,&isize);
+ ngram_state_t msidx = maxsuffidx(ong,&isize);
+
+ //cache insert
+ //IMPORTANT: this function updates only two fields (ngramstate, statesize) of the entry of the cache; the reminaing fields (logpr, bow, bol, extendible) are undefined; hence, it should not be used before the corresponding clprob()
+ if (ong.size>=maxlev) ong.size=maxlev;
+ // if (prob_and_state_cache && ong.size==maxlev) {
+ if (prob_and_state_cache[sz]) {
+// pst.state=found;
+ pst.ngramstate=msidx;
+ pst.statesize=isize;
+ // prob_and_state_cache->add(ong.wordp(maxlev),pst);
+ prob_and_state_cache[sz]->add(ong.wordp(ong.size),pst);
+ }
+ if (size!=NULL) *size=isize;
+// return found;
+ return msidx;
+#else
+ //create the actual ngram
+ ngram ong(dict);
+ ong.pushc(codes,sz);
+ MY_ASSERT (ong.size == sz);
+ /*
+ unsigned int isize; //internal state size variable
+ char* found=(char *) maxsuffptr(ong,&isize);
+ char* found2=(char *) maxsuffptr(ong,size);
+ if (size!=NULL) *size=isize;
+ return found;
+ */
+// return maxsuffptr(ong,size);
+ return maxsuffidx(ong,size);
+#endif
+ }
//returns log10prob of n-gram
//bow: backoff weight
@@ -2429,7 +2616,11 @@ namespace irstlm {
VERBOSE(3," lmtable::lprob(ngram) ong |" << ong << "|\n" << std::endl);
VERBOSE(3," lmtable::lprob(ngram) ong.size |" << ong.size << "|\n" << std::endl);
- if (ong.size==0) return 0.0; //sanity check
+ if (ong.size==0){ //sanity check
+ if (maxsuffptr) *maxsuffptr=NULL;
+ if (maxsuffidx) *maxsuffidx=0;
+ return 0.0;
+ }
if (ong.size>maxlev) ong.size=maxlev; //adjust n-gram level to table size
if (bow) *bow=0; //initialize back-off weight
@@ -2452,16 +2643,16 @@ namespace irstlm {
iprob=ing.prob;
lpr = (double)(isQtable?Pcenters[ing.lev][(qfloat_t)iprob]:iprob);
if (*ong.wordp(1)==dict->oovcode()) lpr-=logOOVpenalty; //add OOV penalty
- size_t _level=MIN(ing.lev,(ing.size-1));
+ size_t isize=MIN(ing.lev,(ing.size-1));
// if (statesize) *statesize=MIN(ing.lev,(ing.size-1)); //find largest n-1 gram suffix
// if (maxsuffptr) *maxsuffptr=ing.path[MIN(ing.lev,(ing.size-1))];
- if (statesize) *statesize=_level; //find largest n-1 gram suffix
- if (maxsuffptr) *maxsuffptr=ing.path[_level];
+ if (statesize) *statesize=isize; //find largest n-1 gram suffix
+ if (maxsuffptr) *maxsuffptr=ing.path[isize];
if (maxsuffidx){
- int ndsz=nodesize(tbltype[_level]);
- *maxsuffidx = (ngram_state_t) ( ((table_pos_t) (ing.path[_level]) - (table_pos_t) table[_level] ) / ndsz ) + tb_offset[_level] + 1; //added 1 to distinguish from zero-ngram
- VERBOSE(3,"lmtable::lprob(ngram) ing:|" << ing << "| _level:|" << _level << "| ing.path[_level]:|" << ing.path[_level] << "| tb_offset[_level]:|" << tb_offset[_level] << "| *maxsuffidx:|" << *maxsuffidx << "|" << std::endl);
+ int ndsz=nodesize(tbltype[isize]);
+ *maxsuffidx = (ngram_state_t) ( ((table_pos_t) (ing.path[isize]) - (table_pos_t) table[isize] ) / ndsz ) + tb_offset[isize] + 1; //added 1 to distinguish from zero-ngram
+ VERBOSE(3,"lmtable::lprob(ngram) ing:|" << ing << "| isize:|" << isize << "| ing.path[isize]:|" << ing.path[isize] << "| tb_offset[_level]:|" << tb_offset[isize] << "| *maxsuffidx:|" << *maxsuffidx << "|" << std::endl);
}
if (extendible) *extendible=succrange(ing.path[ing.lev],ing.lev)>0;
if (lastbow) *lastbow=(double) (isQtable?Bcenters[ing.lev][(qfloat_t)ing.bow]:ing.bow);
diff --git a/src/lmtable.h b/src/lmtable.h
index ee7b9b2..b27495f 100644
--- a/src/lmtable.h
+++ b/src/lmtable.h
@@ -350,6 +350,9 @@ public:
virtual const char *maxsuffptr(ngram ong, unsigned int* size=NULL);
virtual const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);
virtual const char *cmaxsuffptr(int* codes, int sz, unsigned int* size=NULL);
+ ngram_state_t maxsuffidx(ngram ong, unsigned int* size=NULL);
+ ngram_state_t cmaxsuffidx(ngram ong, unsigned int* size=NULL);
+ ngram_state_t cmaxsuffidx(int* codes, int sz, unsigned int* size=NULL);
inline void putmem(char* ptr,int value,int offs,int size) {
MY_ASSERT(ptr!=NULL);
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list