[irstlm] 50/78: fixing to allow back-compatibility
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:47:05 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to tag adaptiveLM.v0.10
in repository irstlm.
commit 9f9898475ec6eb4d484e3deb3e1d84916aeab5af
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Fri Nov 27 15:55:10 2015 +0100
fixing to allow back-compatibility
---
src/lmtable.cpp | 98 ++++++++++++++++-----------------------------------------
src/lmtable.h | 2 ++
2 files changed, 29 insertions(+), 71 deletions(-)
diff --git a/src/lmtable.cpp b/src/lmtable.cpp
index 948bfd1..e6bdfee 100644
--- a/src/lmtable.cpp
+++ b/src/lmtable.cpp
@@ -2247,6 +2247,16 @@ namespace irstlm {
return 0;
}
+ ngram_state_t lmtable::convert(const char* suffptr, size_t lev){
+ int ndsz=nodesize(tbltype[lev]);
+ ngram_state_t suffidx=0;
+ if (suffptr){
+ suffidx = (ngram_state_t) ( ((table_pos_t) suffptr - (table_pos_t) table[lev]) / ndsz ) + tb_offset[lev] + 1; //added 1 to distinguish from zero-ngram
+ }
+ return suffidx;
+ }
+
+
//maxsuffptr returns the largest suffix of an n-gram that is contained
//in the LM table. This can be used as a compact representation of the
//(n-1)-gram state of a n-gram LM. If the input k-gram has k>=n then it
@@ -2329,6 +2339,7 @@ namespace irstlm {
//cache miss
unsigned int isize; //internal state size variable
char* found=(char *)maxsuffptr(ong,&isize);
+ ngram_state_t msidx = convert(found,isize);
//cache insert
//IMPORTANT: this function updates only two fields (state, statesize) of the entry of the cache; the remaining fields (logpr, bow, bol, extendible) are undefined; hence, it should not be used before the corresponding clprob()
@@ -2337,6 +2348,7 @@ namespace irstlm {
// if (prob_and_state_cache && ong.size==maxlev) {
if (prob_and_state_cache[ong.size]) {
pst.state=found;
+ pst.ngramstate=msidx;
pst.statesize=isize;
// prob_and_state_cache->add(ong.wordp(maxlev),pst);
prob_and_state_cache[ong.size]->add(ong.wordp(ong.size),pst);
@@ -2354,65 +2366,13 @@ namespace irstlm {
//(n-1)-gram state of a n-gram LM. If the input k-gram has k>=n then it
//is trimmed to its n-1 suffix.
//non recursive version
+ //It relies on the computation of maxsuffptr
ngram_state_t lmtable::maxsuffidx(ngram ong, unsigned int* size)
{
VERBOSE(3,"ngram_state_t lmtable::maxsuffidx(ngram ong, unsigned int* size)\n");
- if (ong.size==0) {
- if (size!=NULL) *size=0;
- return 0;
- }
-
- if (isInverted) {
- if (ong.size>maxlev) ong.size=maxlev; //if larger than maxlen reduce size
- ngram ing=ong; //inverted ngram
-
- ing.invert(ong);
-
- get(ing,ing.size,ing.size); // dig in the trie
- if (ing.lev > 0) { //found something?
- size_t isize = MIN(ing.lev,(ing.size-1)); //find largest n-1 gram suffix
- if (size!=NULL) *size=isize;
-
- int ndsz=nodesize(tbltype[isize]);
- ngram_state_t msidx = (ngram_state_t) ( ((table_pos_t) (ing.path[isize]) - (table_pos_t) table[isize] ) / ndsz ) + tb_offset[isize] + 1; //added 1 to distinguish from zero-ngram
- VERBOSE(3,"ngram_state_t lmtable::maxsuffidx(ngram ong, unsigned int* size) ing:|" << ing << "| isize:|" << isize << "| ing.path[isize]:|" << ing.path[isize] << "| tb_offset[isize]:|" << tb_offset[isize] << "| msidx:|" << msidx << "|" << std::endl);
- return msidx;
- } else { // means a real unknown word!
- if (size!=NULL) *size=0; //default statesize for zero-gram!
- return 0; //default state-value for zero-gram!
- }
- } else {
- if (ong.size>0) ong.size--; //always reduced by 1 word
-
- if (ong.size>=maxlev) ong.size=maxlev-1; //if still larger or equals to maxlen reduce again
-
- if (size!=NULL) *size=ong.size; //will return the largest found ong.size
-
- for (ngram ng=ong; ng.size>0; ng.size--) {
- if (get(ng,ng.size,ng.size)) {
- // if (ng.succ==0) (*size)--;
- // if (size!=NULL) *size=ng.size;
- size_t isize=ng.size;
- if (size!=NULL)
- {
- if (ng.succ==0) *size=isize-1;
- else *size=isize;
- }
-
- int ndsz=nodesize(tbltype[isize]);
- ngram_state_t msidx = 0;
- if (ng.link){
- msidx = (ngram_state_t) ( ((table_pos_t) (ng.link) - (table_pos_t) table[isize]) / ndsz ) + tb_offset[isize] + 1; //added 1 to distinguish from zero-ngram
- }
-
- VERBOSE(3,"ngram_state_t lmtable::maxsuffidx(ngram ong, unsigned int* size) ng:|" << ng << "| isize:|" << isize << "| tb_offset[isize]:|" << tb_offset[isize] << "| msidx:|" << msidx << "|" << std::endl);
- return msidx;
- }
- }
- if (size!=NULL) *size=0;
- return 0;
- }
+ const char* suffptr = cmaxsuffptr(ong,size);
+ return convert(suffptr,*size);
}
ngram_state_t lmtable::cmaxsuffidx(ngram ong, unsigned int* size)
@@ -2436,14 +2396,14 @@ namespace irstlm {
// if (prob_and_state_cache && ong.size==maxlev && prob_and_state_cache->get(ong.wordp(maxlev),pst)) {
if (prob_and_state_cache[ong.size] && prob_and_state_cache[ong.size]->get(ong.wordp(ong.size),pst)) {
*size=pst.statesize;
- // return pst.state;
return pst.ngramstate;
}
ong.size = orisize;
//cache miss
unsigned int isize; //internal state size variable
- ngram_state_t msidx = maxsuffidx(ong,&isize);
+ char* msptr = cmaxsuffptr(ong,&isize);
+ ngram_state_t msidx = convert(suffptr,isize);
//cache insert
//IMPORTANT: this function updates only two fields (ngramstate, statesize) of the entry of the cache; the remaining fields (logpr, bow, bol, extendible) are undefined; hence, it should not be used before the corresponding clprob()
@@ -2451,6 +2411,7 @@ namespace irstlm {
if (ong.size>=maxlev) ong.size=maxlev;
// if (prob_and_state_cache && ong.size==maxlev) {
if (prob_and_state_cache[ong.size]) {
+ pst.state=found;
pst.ngramstate=msidx;
pst.statesize=isize;
// prob_and_state_cache->add(ong.wordp(maxlev),pst);
@@ -2503,12 +2464,12 @@ namespace irstlm {
if (*ong.wordp(1)==dict->oovcode()) lpr-=logOOVpenalty; //add OOV penalty
size_t isize=MIN(ing.lev,(ing.size-1));
if (statesize) *statesize=isize; //find largest n-1 gram suffix
- if (maxsuffptr) *maxsuffptr=ing.path[isize];
+
+ char* suffptr=ing.path[isize];
- if (maxsuffidx){
- int ndsz=nodesize(tbltype[isize]);
- *maxsuffidx = (ngram_state_t) ( ((table_pos_t) (ing.path[isize]) - (table_pos_t) table[isize] ) / ndsz ) + tb_offset[isize] + 1; //added 1 to distinguish from zero-ngram
- }
+ if (maxsuffptr) *maxsuffptr=suffptr;
+ if (maxsuffidx) *maxsuffidx = convert(suffptr,isize);
+
if (extendible) *extendible=succrange(ing.path[ing.lev],ing.lev)>0;
if (lastbow) *lastbow=(double) (isQtable?Bcenters[ing.lev][(qfloat_t)ing.bow]:ing.bow);
} else { // means a real unknown word!
@@ -2556,17 +2517,12 @@ namespace irstlm {
get(ng,ng.size,ng.size);
}
if (statesize) *statesize=ng.size;
- if (maxsuffptr) *maxsuffptr=ng.link; //we should check ng.link != NULL
-
+
+ char* suffptr=ng.link; //we should check ng.link != NULL
size_t isize=ng.size;
- if (maxsuffidx){
- int ndsz=nodesize(tbltype[isize]);
- *maxsuffidx=0;
- if (ng.link){
- *maxsuffidx = (ngram_state_t) ( ((table_pos_t) (ng.link) - (table_pos_t) table[isize]) / ndsz ) + tb_offset[isize] + 1; //added 1 to distinguish from zero-ngram
- }
- }
+ if (maxsuffptr) *maxsuffptr=suffptr;
+ if (maxsuffidx) *maxsuffidx = convert(suffptr,isize);
}
return rbow+lpr;
} else {
diff --git a/src/lmtable.h b/src/lmtable.h
index e7cba2d..3e9338b 100644
--- a/src/lmtable.h
+++ b/src/lmtable.h
@@ -91,6 +91,8 @@ namespace irstlm {
void loadbin_codebook(std::istream& inp,int l);
void loadbin_level(std::istream& inp,int l);
+ ngram_state_t convert(const char* suffptr, size_t lev);
+
protected:
char* table[LMTMAXLEV+1]; //storage of all levels
LMT_TYPE tbltype[LMTMAXLEV+1]; //table type for each levels
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list