[irstlm] 11/78: code optimization
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:47:00 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to tag adaptiveLM.v0.10
in repository irstlm.
commit ede270d107cca714d21d90e3a407d0dec9fc60e7
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Sun Nov 8 15:50:16 2015 +0100
code optimization
---
src/lmContainer.cpp | 270 ++++----
src/lmContainer.h | 465 +++++++------
src/lmInterpolation.cpp | 53 +-
src/lmInterpolation.h | 248 +++----
src/lmclass.cpp | 404 +++++------
src/lmclass.h | 173 +++--
src/lmmacro.cpp | 1722 ++++++++++++++++++++++++-----------------------
src/lmmacro.h | 208 +++---
src/lmtable.cpp | 262 +++----
src/lmtable.h | 1129 +++++++++++++++----------------
10 files changed, 2466 insertions(+), 2468 deletions(-)
diff --git a/src/lmContainer.cpp b/src/lmContainer.cpp
index 7b995d4..bde6996 100644
--- a/src/lmContainer.cpp
+++ b/src/lmContainer.cpp
@@ -1,24 +1,24 @@
// $Id: lmContainer.cpp 3686 2010-10-15 11:55:32Z bertoldi $
/******************************************************************************
-IrstLM: IRST Language Model Toolkit
-Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-
-******************************************************************************/
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
#include <stdio.h>
#include <cstdlib>
#include <stdlib.h>
@@ -34,7 +34,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "lmInterpolation.h"
using namespace std;
-
+
namespace irstlm {
#ifdef PS_CACHE_ENABLE
@@ -42,134 +42,134 @@ namespace irstlm {
#undef PS_CACHE_ENABLE
#endif
#endif
-
+
#ifdef LMT_CACHE_ENABLE
#if LMT_CACHE_ENABLE==0
#undef LMT_CACHE_ENABLE
#endif
#endif
-
+
#if PS_CACHE_ENABLE
-bool lmContainer::ps_cache_enabled=true;
+ bool lmContainer::ps_cache_enabled=true;
#else
-bool lmContainer::ps_cache_enabled=false;
+ bool lmContainer::ps_cache_enabled=false;
#endif
-
+
#if LMT_CACHE_ENABLE
-bool lmContainer::lmt_cache_enabled=true;
+ bool lmContainer::lmt_cache_enabled=true;
#else
-bool lmContainer::lmt_cache_enabled=false;
+ bool lmContainer::lmt_cache_enabled=false;
#endif
-
-inline void error(const char* message)
-{
- std::cerr << message << "\n";
- throw std::runtime_error(message);
-}
-
-lmContainer::lmContainer()
-{
- requiredMaxlev=IRSTLM_REQUIREDMAXLEV_DEFAULT;
- lmtype=_IRSTLM_LMUNKNOWN;
- maxlev=0;
-}
-
-int lmContainer::getLanguageModelType(std::string filename)
-{
- fstream inp(filename.c_str(),ios::in|ios::binary);
-
- if (!inp.good()) {
- std::stringstream ss_msg;
- ss_msg << "Failed to open " << filename;
- exit_error(IRSTLM_ERROR_IO, ss_msg.str());
- }
- //give a look at the header to get informed about the language model type
- std::string header;
- inp >> header;
- inp.close();
-
- VERBOSE(1,"LM header:|" << header << "|" << std::endl);
-
- int type=_IRSTLM_LMUNKNOWN;
- VERBOSE(1,"type: " << type << std::endl);
- if (header == "lmminterpolation" || header == "LMINTERPOLATION") {
- type = _IRSTLM_LMINTERPOLATION;
- } else if (header == "lmmacro" || header == "LMMACRO") {
- type = _IRSTLM_LMMACRO;
- } else if (header == "lmclass" || header == "LMCLASS") {
- type = _IRSTLM_LMCLASS;
- } else {
- type = _IRSTLM_LMTABLE;
- }
- VERBOSE(1,"type: " << type << std::endl);
-
- return type;
-};
-
-lmContainer* lmContainer::CreateLanguageModel(const std::string infile, float nlf, float dlf)
-{
- int type = lmContainer::getLanguageModelType(infile);
-
- VERBOSE(1,"lmContainer* lmContainer::CreateLanguageModel(...) Language Model Type of " << infile << " is " << type << std::endl);
-
- return lmContainer::CreateLanguageModel(type, nlf, dlf);
-}
-
-lmContainer* lmContainer::CreateLanguageModel(int type, float nlf, float dlf)
-{
- VERBOSE(1,"Language Model Type is " << type << std::endl);
-
- lmContainer* lm=NULL;
-
- switch (type) {
-
- case _IRSTLM_LMTABLE:
- VERBOSE(1,"_IRSTLM_LMTABLE" << std::endl);
- lm = new lmtable(nlf, dlf);
- break;
-
- case _IRSTLM_LMMACRO:
- VERBOSE(1,"_IRSTLM_LMMACRO" << std::endl);
- lm = new lmmacro(nlf, dlf);
- break;
+
+ inline void error(const char* message)
+ {
+ std::cerr << message << "\n";
+ throw std::runtime_error(message);
+ }
+
+ lmContainer::lmContainer()
+ {
+ requiredMaxlev=IRSTLM_REQUIREDMAXLEV_DEFAULT;
+ lmtype=_IRSTLM_LMUNKNOWN;
+ maxlev=0;
+ }
+
+ int lmContainer::getLanguageModelType(std::string filename)
+ {
+ fstream inp(filename.c_str(),ios::in|ios::binary);
+
+ if (!inp.good()) {
+ std::stringstream ss_msg;
+ ss_msg << "Failed to open " << filename;
+ exit_error(IRSTLM_ERROR_IO, ss_msg.str());
+ }
+ //give a look at the header to get informed about the language model type
+ std::string header;
+ inp >> header;
+ inp.close();
+
+ VERBOSE(1,"LM header:|" << header << "|" << std::endl);
+
+ int type=_IRSTLM_LMUNKNOWN;
+ VERBOSE(1,"type: " << type << std::endl);
+ if (header == "lmminterpolation" || header == "LMINTERPOLATION") {
+ type = _IRSTLM_LMINTERPOLATION;
+ } else if (header == "lmmacro" || header == "LMMACRO") {
+ type = _IRSTLM_LMMACRO;
+ } else if (header == "lmclass" || header == "LMCLASS") {
+ type = _IRSTLM_LMCLASS;
+ } else {
+ type = _IRSTLM_LMTABLE;
+ }
+ VERBOSE(1,"type: " << type << std::endl);
+
+ return type;
+ };
+
+ lmContainer* lmContainer::CreateLanguageModel(const std::string infile, float nlf, float dlf)
+ {
+ int type = lmContainer::getLanguageModelType(infile);
+
+ VERBOSE(1,"lmContainer* lmContainer::CreateLanguageModel(...) Language Model Type of " << infile << " is " << type << std::endl);
+
+ return lmContainer::CreateLanguageModel(type, nlf, dlf);
+ }
+
+ lmContainer* lmContainer::CreateLanguageModel(int type, float nlf, float dlf)
+ {
+ VERBOSE(1,"Language Model Type is " << type << std::endl);
+
+ lmContainer* lm=NULL;
+
+ switch (type) {
+
+ case _IRSTLM_LMTABLE:
+ VERBOSE(1,"_IRSTLM_LMTABLE" << std::endl);
+ lm = new lmtable(nlf, dlf);
+ break;
+
+ case _IRSTLM_LMMACRO:
+ VERBOSE(1,"_IRSTLM_LMMACRO" << std::endl);
+ lm = new lmmacro(nlf, dlf);
+ break;
+
+ case _IRSTLM_LMCLASS:
+ VERBOSE(1,"_IRSTLM_LMCLASS" << std::endl);
+ lm = new lmclass(nlf, dlf);
+ break;
+
+ case _IRSTLM_LMINTERPOLATION:
+ VERBOSE(1,"_IRSTLM_LMINTERPOLATION" << std::endl);
+ lm = new lmInterpolation(nlf, dlf);
+ break;
+
+ default:
+ VERBOSE(1,"UNKNOWN" << std::endl);
+ exit_error(IRSTLM_ERROR_DATA, "This language model type is unknown!");
+ }
+ VERBOSE(1,"lmContainer* lmContainer::CreateLanguageModel(int type, float nlf, float dlf) lm:|" << (void*) lm << "|" << std::endl);
+
+ lm->setLanguageModelType(type);
+
+ VERBOSE(1,"lmContainer* lmContainer::CreateLanguageModel(int type, float nlf, float dlf) lm->getLanguageModelType:|" << lm->getLanguageModelType() << "|" << std::endl)
+ return lm;
+ }
+
+ bool lmContainer::filter(const string sfilter, lmContainer*& sublmC, const string skeepunigrams)
+ {
+ if (lmtype == _IRSTLM_LMTABLE) {
+ sublmC = lmContainer::CreateLanguageModel(lmtype,((lmtable*) this)->GetNgramcacheLoadFactor(),((lmtable*) this)->GetDictionaryLoadFactor());
- case _IRSTLM_LMCLASS:
- VERBOSE(1,"_IRSTLM_LMCLASS" << std::endl);
- lm = new lmclass(nlf, dlf);
- break;
+ //let know that table has inverted n-grams
+ sublmC->is_inverted(is_inverted());
+ sublmC->setMaxLoadedLevel(getMaxLoadedLevel());
+ sublmC->maxlevel(maxlevel());
- case _IRSTLM_LMINTERPOLATION:
- VERBOSE(1,"_IRSTLM_LMINTERPOLATION" << std::endl);
- lm = new lmInterpolation(nlf, dlf);
- break;
+ bool res=((lmtable*) this)->filter(sfilter, (lmtable*) sublmC, skeepunigrams);
- default:
- VERBOSE(1,"UNKNOWN" << std::endl);
- exit_error(IRSTLM_ERROR_DATA, "This language model type is unknown!");
- }
- VERBOSE(1,"lmContainer* lmContainer::CreateLanguageModel(int type, float nlf, float dlf) lm:|" << (void*) lm << "|" << std::endl);
-
- lm->setLanguageModelType(type);
-
- VERBOSE(1,"lmContainer* lmContainer::CreateLanguageModel(int type, float nlf, float dlf) lm->getLanguageModelType:|" << lm->getLanguageModelType() << "|" << std::endl)
- return lm;
-}
-
-bool lmContainer::filter(const string sfilter, lmContainer*& sublmC, const string skeepunigrams)
-{
- if (lmtype == _IRSTLM_LMTABLE) {
- sublmC = lmContainer::CreateLanguageModel(lmtype,((lmtable*) this)->GetNgramcacheLoadFactor(),((lmtable*) this)->GetDictionaryLoadFactor());
-
- //let know that table has inverted n-grams
- sublmC->is_inverted(is_inverted());
- sublmC->setMaxLoadedLevel(getMaxLoadedLevel());
- sublmC->maxlevel(maxlevel());
-
- bool res=((lmtable*) this)->filter(sfilter, (lmtable*) sublmC, skeepunigrams);
-
- return res;
- }
- return false;
-};
-
+ return res;
+ }
+ return false;
+ };
+
}//namespace irstlm
diff --git a/src/lmContainer.h b/src/lmContainer.h
index 2b4e3f0..4f18c3c 100644
--- a/src/lmContainer.h
+++ b/src/lmContainer.h
@@ -1,24 +1,24 @@
// $Id: lmContainer.h 3686 2010-10-15 11:55:32Z bertoldi $
/******************************************************************************
-IrstLM: IRST Language Model Toolkit
-Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-
-******************************************************************************/
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
#ifndef MF_LMCONTAINER_H
#define MF_LMCONTAINER_H
@@ -40,233 +40,218 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
typedef enum {BINARY,TEXT,YRANIB,NONE} OUTFILE_TYPE;
typedef enum {LMT_FIND, //!< search: find an entry
- LMT_ENTER, //!< search: enter an entry
- LMT_INIT, //!< scan: start scan
- LMT_CONT //!< scan: continue scan
+ LMT_ENTER, //!< search: enter an entry
+ LMT_INIT, //!< scan: start scan
+ LMT_CONT //!< scan: continue scan
} LMT_ACTION;
namespace irstlm {
-class lmContainer
-{
- static const bool debug=true;
- static bool ps_cache_enabled;
- static bool lmt_cache_enabled;
-
-protected:
- int lmtype; //auto reference to its own type
- int maxlev; //maximun order of sub LMs;
- int requiredMaxlev; //max loaded level, i.e. load up to requiredMaxlev levels
-
-public:
-
- lmContainer();
- virtual ~lmContainer() {};
-
-
- virtual void load(const std::string &filename, int mmap=0) {
- UNUSED(filename);
- UNUSED(mmap);
- };
-
- virtual void savetxt(const char *filename) {
- UNUSED(filename);
- };
- virtual void savebin(const char *filename) {
- UNUSED(filename);
- };
-
- virtual double getlogOOVpenalty() const {
- return 0.0;
- };
- virtual double setlogOOVpenalty(int dub) {
- UNUSED(dub);
- return 0.0;
- };
- virtual double setlogOOVpenalty(double oovp) {
- UNUSED(oovp);
- return 0.0;
- };
-
- inline virtual dictionary* getDict() const {
- return NULL;
- };
- inline virtual void maxlevel(int lev) {
- maxlev = lev;
- };
- inline virtual int maxlevel() const {
- return maxlev;
- };
- inline virtual void stat(int lev=0) {
- UNUSED(lev);
- };
-
- inline virtual void setMaxLoadedLevel(int lev) {
- requiredMaxlev=lev;
- };
- inline virtual int getMaxLoadedLevel() {
- return requiredMaxlev;
- };
-
- virtual bool is_inverted(const bool flag) {
- UNUSED(flag);
- return false;
- };
- virtual bool is_inverted() {
- return false;
- };
-
- double clprob(ngram ng) {
- return clprob(ng, NULL, NULL, NULL, NULL, NULL, NULL);
- }
- double clprob(ngram ng, double* bow, int* bol) {
- return clprob(ng, bow, bol, NULL, NULL, NULL, NULL);
- }
- double clprob(ngram ng, double* bow, int* bol, char** maxsuffptr, unsigned int* statesize) {
- return clprob(ng, bow, bol, NULL, maxsuffptr, statesize, NULL);
- }
- double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize) {
- return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, NULL);
- }
- virtual double clprob(ngram ng, double* bow, int* bol, char** maxsuffptr, unsigned int* statesize,bool* extendible) {
- VERBOSE(3,"virtual double lmContainer::clprob(ngram ng, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL) ng:|" << ng << "|\n");
- return clprob(ng, bow, bol, NULL, maxsuffptr, statesize, extendible);
- };
- virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize,bool* extendible) {
- VERBOSE(3,"virtual double lmContainer::clprob(ngram ng, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL) ng:|" << ng << "|\n");
- UNUSED(ng);
- UNUSED(bow);
- UNUSED(bol);
- UNUSED(maxsuffidx);
- UNUSED(maxsuffptr);
- UNUSED(statesize);
- UNUSED(extendible);
- return 0.0;
- };
-
- virtual double clprob(int* ng, int ngsize, double* bow, int* bol, char** maxsuffptr, unsigned int* statesize,bool* extendible) {
- VERBOSE(3,"virtual double lmContainer::clprob(int* ng, int ngsize, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL)\n");
- return clprob(ng, ngsize, bow, bol, NULL, maxsuffptr, statesize, extendible);
- };
-
- virtual double clprob(int* ng, int ngsize, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize,bool* extendible) {
- VERBOSE(3,"virtual double lmContainer::clprob(int* ng, int ngsize, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL)\n");
- UNUSED(ng);
- UNUSED(ngsize);
- UNUSED(bow);
- UNUSED(bol);
- UNUSED(maxsuffidx);
- UNUSED(maxsuffptr);
- UNUSED(statesize);
- UNUSED(extendible);
- return 0.0;
- };
-
-
- virtual const char *cmaxsuffptr(ngram ng, unsigned int* statesize=NULL)
- {
- return cmaxsuffptr(ng, statesize);
- }
-
- virtual const char *cmaxsuffptr(int* ng, int ngsize, unsigned int* statesize=NULL)
- {
- return cmaxsuffptr(ng, ngsize, statesize);
- }
-
- virtual ngram_state_t cmaxsuffidx(ngram ng, unsigned int* statesize=NULL)
- {
- UNUSED(ng);
- UNUSED(statesize);
- return NULL;
- }
-
- virtual ngram_state_t cmaxsuffidx(int* ng, int ngsize, unsigned int* statesize=NULL)
- {
- UNUSED(ng);
- UNUSED(ngsize);
- UNUSED(statesize);
- return NULL;
- }
-
- virtual inline int get(ngram& ng) {
- UNUSED(ng);
- return 0;
- }
-
- virtual int get(ngram& ng,int n,int lev){
- UNUSED(ng);
- UNUSED(n);
- UNUSED(lev);
- return 0;
- }
-
- virtual int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev){
- UNUSED(ng);
- UNUSED(h);
- UNUSED(action);
- UNUSED(lev);
- return 0;
- }
-
-
- virtual void used_caches() {};
- virtual void init_caches(int uptolev) {
- UNUSED(uptolev);
- };
- virtual void check_caches_levels() {};
- virtual void reset_caches() {};
-
- virtual void reset_mmap() {};
-
- void inline setLanguageModelType(int type) {
- lmtype=type;
- };
- int getLanguageModelType() const {
- return lmtype;
- };
- static int getLanguageModelType(std::string filename);
-
- inline virtual void dictionary_incflag(const bool flag) {
- UNUSED(flag);
- };
-
- virtual bool filter(const string sfilter, lmContainer*& sublmt, const string skeepunigrams);
-
- static lmContainer* CreateLanguageModel(const std::string infile, float nlf=0.0, float dlf=0.0);
- static lmContainer* CreateLanguageModel(int type, float nlf=0.0, float dlf=0.0);
-
- inline virtual bool is_OOV(int code) {
- UNUSED(code);
- return false;
- };
-
-
- inline static bool is_lmt_cache_enabled(){
- VERBOSE(3,"inline static bool is_lmt_cache_enabled() " << lmt_cache_enabled << std::endl);
- return lmt_cache_enabled;
- }
-
- inline static bool is_ps_cache_enabled(){
- VERBOSE(3,"inline static bool is_ps_cache_enabled() " << ps_cache_enabled << std::endl);
- return ps_cache_enabled;
- }
-
- inline static bool is_cache_enabled(){
- return is_lmt_cache_enabled() && is_ps_cache_enabled();
- }
-
- virtual int addWord(const char *w){
- getDict()->incflag(1);
- int c=getDict()->encode(w);
- getDict()->incflag(0);
- return c;
- }
-
- virtual void print_table_stat(){
- VERBOSE(3,"virtual void lmContainer::print_table_stat() "<< std::endl);
+ class lmContainer
+ {
+ static const bool debug=true;
+ static bool ps_cache_enabled;
+ static bool lmt_cache_enabled;
+
+ protected:
+ int lmtype; //auto reference to its own type
+ int maxlev; //maximun order of sub LMs;
+ int requiredMaxlev; //max loaded level, i.e. load up to requiredMaxlev levels
+
+ public:
+
+ lmContainer();
+ virtual ~lmContainer() {};
+
+
+ virtual void load(const std::string &filename, int mmap=0) {
+ UNUSED(filename);
+ UNUSED(mmap);
+ };
+
+ virtual void savetxt(const char *filename) {
+ UNUSED(filename);
+ };
+ virtual void savebin(const char *filename) {
+ UNUSED(filename);
+ };
+
+ virtual double getlogOOVpenalty() const {
+ return 0.0;
+ };
+ virtual double setlogOOVpenalty(int dub) {
+ UNUSED(dub);
+ return 0.0;
+ };
+ virtual double setlogOOVpenalty(double oovp) {
+ UNUSED(oovp);
+ return 0.0;
+ };
+
+ inline virtual dictionary* getDict() const {
+ return NULL;
+ };
+ inline virtual void maxlevel(int lev) {
+ maxlev = lev;
+ };
+ inline virtual int maxlevel() const {
+ return maxlev;
+ };
+ inline virtual void stat(int lev=0) {
+ UNUSED(lev);
+ };
+
+ inline virtual void setMaxLoadedLevel(int lev) {
+ requiredMaxlev=lev;
+ };
+ inline virtual int getMaxLoadedLevel() {
+ return requiredMaxlev;
+ };
+
+ virtual bool is_inverted(const bool flag) {
+ UNUSED(flag);
+ return false;
+ };
+ virtual bool is_inverted() {
+ return false;
+ };
+
+ virtual double clprob(ngram ng) { return clprob(ng, NULL, NULL, NULL, NULL, NULL, NULL, NULL); }
+ virtual double clprob(ngram ng, double* bow) { return clprob(ng, bow, NULL, NULL, NULL, NULL, NULL, NULL); }
+ virtual double clprob(ngram ng, double* bow, int* bol) { return clprob(ng, bow, bol, NULL, NULL, NULL, NULL, NULL); }
+ virtual double clprob(ngram ng, double* bow, int* bol, char** maxsuffptr) { return clprob(ng, bow, bol, NULL, maxsuffptr, NULL, NULL, NULL); }
+ virtual double clprob(ngram ng, double* bow, int* bol, char** maxsuffptr, unsigned int* statesize) { return clprob(ng, bow, bol, NULL, maxsuffptr, statesize, NULL, NULL); }
+ virtual double clprob(ngram ng, double* bow, int* bol, char** maxsuffptr, unsigned int* statesize, bool* extendible) { return clprob(ng, bow, bol, NULL, maxsuffptr, statesize, extendible, NULL); };
+ virtual double clprob(ngram ng, double* bow, int* bol, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow) { return clprob(ng, bow, bol, NULL, maxsuffptr, statesize, extendible, lastbow); }
+
+ virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx) { return clprob(ng, bow, bol, maxsuffidx, NULL, NULL, NULL, NULL); }
+ virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, NULL, NULL, NULL); }
+ virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, NULL, NULL); }
+ virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, NULL); };
+ //Base-class fallback for the full-signature ngram clprob.
+ //All shorter ngram overloads of clprob in this class forward to this one.
+ //It performs no lookup, leaves every output pointer untouched and returns 0.0;
+ //it must not forward to its own signature, otherwise a subclass that does not override it recurses without bound.
+ virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow) {
+ UNUSED(ng);
+ UNUSED(bow);
+ UNUSED(bol);
+ UNUSED(maxsuffidx);
+ UNUSED(maxsuffptr);
+ UNUSED(statesize);
+ UNUSED(extendible);
+ UNUSED(lastbow);
+ return 0.0;
+ }
+
+ //Code-array variant of clprob: fills an ngram bound to getDict() via pushc(ng,ngsize),
+ //checks that the resulting ngram has size ngsize, and forwards to the full-signature ngram overload.
+ //ngsize is an int, so its default is written 0 (same value as before) instead of the pointer constant NULL.
+ virtual double clprob(int* ng, int ngsize=0, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL) {
+ VERBOSE(3,"virtual double lmContainer::clprob(int* ng, int ngsize, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL)\n");
+
+ //create the actual ngram
+ ngram ong(getDict());
+ ong.pushc(ng,ngsize);
+ MY_ASSERT (ong.size == ngsize);
+
+ return clprob(ong, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+ };
+
+ virtual const char *cmaxsuffptr(ngram ng, unsigned int* statesize=NULL)
+ {
+ UNUSED(ng);
+ UNUSED(statesize);
+ return NULL;
+ }
+
+ //Code-array variant of cmaxsuffptr: fills an ngram bound to getDict() via pushc(ng,ngsize)
+ //and delegates to the ngram overload, mirroring cmaxsuffidx(int*, int, unsigned int*).
+ virtual const char *cmaxsuffptr(int* ng, int ngsize, unsigned int* statesize=NULL)
+ {
+ //create the actual ngram
+ ngram ong(getDict());
+ ong.pushc(ng,ngsize);
+ MY_ASSERT (ong.size == ngsize);
+ //forward the ngram just built; passing (ng, ngsize, statesize) again would call this same overload forever
+ return cmaxsuffptr(ong, statesize);
+ }
+
+ virtual ngram_state_t cmaxsuffidx(ngram ng, unsigned int* statesize=NULL)
+ {
+ UNUSED(ng);
+ UNUSED(statesize);
+ return 0;
+ }
+
+ virtual ngram_state_t cmaxsuffidx(int* ng, int ngsize, unsigned int* statesize=NULL)
+ {
+ //create the actual ngram
+ ngram ong(getDict());
+ ong.pushc(ng,ngsize);
+ MY_ASSERT (ong.size == ngsize);
+ return cmaxsuffidx(ong,statesize);
+ }
+
+ virtual inline int get(ngram& ng) {
+ UNUSED(ng);
+ return 0;
+ }
+
+ virtual int get(ngram& ng,int n,int lev){
+ UNUSED(ng);
+ UNUSED(n);
+ UNUSED(lev);
+ return 0;
+ }
+
+ virtual int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev){
+ UNUSED(ng);
+ UNUSED(h);
+ UNUSED(action);
+ UNUSED(lev);
+ return 0;
+ }
+
+
+ virtual void used_caches() {};
+ virtual void init_caches(int uptolev) {
+ UNUSED(uptolev);
+ };
+ virtual void check_caches_levels() {};
+ virtual void reset_caches() {};
+
+ virtual void reset_mmap() {};
+
+ void inline setLanguageModelType(int type) {
+ lmtype=type;
+ };
+ int getLanguageModelType() const {
+ return lmtype;
+ };
+ static int getLanguageModelType(std::string filename);
+
+ inline virtual void dictionary_incflag(const bool flag) {
+ UNUSED(flag);
+ };
+
+ virtual bool filter(const string sfilter, lmContainer*& sublmt, const string skeepunigrams);
+
+ static lmContainer* CreateLanguageModel(const std::string infile, float nlf=0.0, float dlf=0.0);
+ static lmContainer* CreateLanguageModel(int type, float nlf=0.0, float dlf=0.0);
+
+ inline virtual bool is_OOV(int code) {
+ UNUSED(code);
+ return false;
+ };
+
+
+ inline static bool is_lmt_cache_enabled(){
+ VERBOSE(3,"inline static bool is_lmt_cache_enabled() " << lmt_cache_enabled << std::endl);
+ return lmt_cache_enabled;
+ }
+
+ inline static bool is_ps_cache_enabled(){
+ VERBOSE(3,"inline static bool is_ps_cache_enabled() " << ps_cache_enabled << std::endl);
+ return ps_cache_enabled;
+ }
+
+ inline static bool is_cache_enabled(){
+ return is_lmt_cache_enabled() && is_ps_cache_enabled();
+ }
+
+ virtual int addWord(const char *w){
+ getDict()->incflag(1);
+ int c=getDict()->encode(w);
+ getDict()->incflag(0);
+ return c;
+ }
+
+ virtual void print_table_stat(){
+ VERBOSE(3,"virtual void lmContainer::print_table_stat() "<< std::endl);
+ };
+
};
-};
-
}//namespace irstlm
#endif
diff --git a/src/lmInterpolation.cpp b/src/lmInterpolation.cpp
index 113c18b..dbc9c5f 100644
--- a/src/lmInterpolation.cpp
+++ b/src/lmInterpolation.cpp
@@ -141,7 +141,7 @@ namespace irstlm {
//return log10 prob of an ngram
// double lmInterpolation::clprob(ngram ng, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
- double lmInterpolation::clprob(ngram ng, double* bow,int* bol,ngram_state_t* maxsuffidx, char** maxsuffptr,unsigned int* statesize,bool* extendible)
+ double lmInterpolation::clprob(ngram ng, double* bow,int* bol,ngram_state_t* maxsuffidx, char** maxsuffptr,unsigned int* statesize,bool* extendible, double* lastbow)
{
double pr=0.0;
@@ -152,6 +152,7 @@ namespace irstlm {
unsigned int _statesize=0,actualstatesize=0;
int _bol=0,actualbol=MAX_NGRAM;
double _bow=0.0,actualbow=0.0;
+ double _lastbow=0.0,actuallastbow=0.0;
bool _extendible=false;
bool actualextendible=false;
@@ -163,7 +164,7 @@ namespace irstlm {
ngram _ng(m_lm[i]->getDict());
_ng.trans(ng);
// _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,&_extendible);
- _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffidx,&_maxsuffptr,&_statesize,&_extendible);
+ //collect the sub-model's lastbow in the local _lastbow (like _bow/_bol) so it can be logged and combined below;
+ //handing over the caller's pointer would leave _lastbow at its initial value
+ _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffidx,&_maxsuffptr,&_statesize,&_extendible,&_lastbow);
IFVERBOSE(3){
//cerr.precision(10);
@@ -173,6 +174,7 @@ namespace irstlm {
VERBOSE(3," _statesize:" << _statesize << std::endl);
VERBOSE(3," _bow:" << _bow << std::endl);
VERBOSE(3," _bol:" << _bol << std::endl);
+ VERBOSE(3," _lastbow:" << _lastbow << std::endl);
}
/*
@@ -183,6 +185,7 @@ namespace irstlm {
//What is the bow of a LM interpolation? The weighted sum of the bow of the submodels
//What is the prob of a LM interpolation? The weighted sum of the prob of the submodels
//What is the extendible flag of a LM interpolation? true if the extendible flag is one for any LM
+ //What is the lastbow of a LM interpolation? The weighted sum of the lastbow of the submodels
*/
pr+=m_weight[i]*pow(10.0,_logpr);
@@ -199,6 +202,9 @@ namespace irstlm {
if (_extendible) {
actualextendible=true; //set extendible flag to true if the ngram is extendible for any LM
}
+ //NOTE(review): this keeps the smallest _lastbow seen (starting from 0.0), while the comment block above speaks of a weighted sum -- confirm which is intended
+ if (_lastbow < actuallastbow) {
+ actuallastbow=_lastbow; //backoff limit of LM[i]
+ }
}
}
if (bol) *bol=actualbol;
@@ -206,30 +212,31 @@ namespace irstlm {
if (maxsuffptr) *maxsuffptr=actualmaxsuffptr;
if (maxsuffidx) *maxsuffidx=actualmaxsuffidx;
if (statesize) *statesize=actualstatesize;
- if (extendible) {
- *extendible=actualextendible;
- // delete _extendible;
- }
+ if (extendible) *extendible=actualextendible;
+ //write the combined value to lastbow itself; storing it through bol would overwrite the back-off level set above
+ if (lastbow) *lastbow=actuallastbow;
if (statesize) VERBOSE(3, " statesize:" << *statesize << std::endl);
if (bow) VERBOSE(3, " bow:" << *bow << std::endl);
if (bol) VERBOSE(3, " bol:" << *bol << std::endl);
+ if (lastbow) VERBOSE(3, " lastbow:" << *lastbow << std::endl);
return log10(pr);
}
- // double lmInterpolation::clprob(int* codes, int sz, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
- double lmInterpolation::clprob(int* codes, int sz, double* bow,int* bol,ngram_state_t* maxsuffidx,char** maxsuffptr,unsigned int* statesize,bool* extendible)
- {
-
- //create the actual ngram
- ngram ong(dict);
- ong.pushc(codes,sz);
- MY_ASSERT (ong.size == sz);
-
- // return clprob(ong, bow, bol, maxsuffptr, statesize, extendible);
- return clprob(ong, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible);
- }
+ /*
+ // double lmInterpolation::clprob(int* codes, int sz, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
+ double lmInterpolation::clprob(int* codes, int sz, double* bow,int* bol,ngram_state_t* maxsuffidx,char** maxsuffptr,unsigned int* statesize,bool* extendible)
+ {
+
+ //create the actual ngram
+ ngram ong(dict);
+ ong.pushc(codes,sz);
+ MY_ASSERT (ong.size == sz);
+
+ // return clprob(ong, bow, bol, maxsuffptr, statesize, extendible);
+ return clprob(ong, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible);
+ }
+ */
const char *lmInterpolation::cmaxsuffptr(ngram ng, unsigned int* statesize){
@@ -270,7 +277,8 @@ namespace irstlm {
return maxsuffptr;
}
-
+
+ /*
const char *lmInterpolation::cmaxsuffptr(int* codes, int sz, unsigned int* statesize)
{
//create the actual ngram
@@ -279,7 +287,7 @@ namespace irstlm {
MY_ASSERT (ong.size == sz);
return cmaxsuffptr(ong, statesize);
}
-
+ */
ngram_state_t lmInterpolation::cmaxsuffidx(ngram ng, unsigned int* statesize)
{
ngram_state_t maxsuffidx=0;
@@ -320,7 +328,8 @@ namespace irstlm {
return maxsuffidx;
}
-
+
+ /*
ngram_state_t lmInterpolation::cmaxsuffidx(int* codes, int sz, unsigned int* statesize)
{
//create the actual ngram
@@ -329,7 +338,7 @@ namespace irstlm {
MY_ASSERT (ong.size == sz);
return cmaxsuffidx(ong, statesize);
}
-
+ */
double lmInterpolation::setlogOOVpenalty(int dub)
{
diff --git a/src/lmInterpolation.h b/src/lmInterpolation.h
index b1fe743..12a7add 100644
--- a/src/lmInterpolation.h
+++ b/src/lmInterpolation.h
@@ -1,24 +1,24 @@
// $Id: lmInterpolation.h 3686 2010-10-15 11:55:32Z bertoldi $
/******************************************************************************
-IrstLM: IRST Language Model Toolkit
-Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-
-******************************************************************************/
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
#ifndef MF_LMINTERPOLATION_H
#define MF_LMINTERPOLATION_H
@@ -34,116 +34,116 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "n_gram.h"
#include "lmContainer.h"
-
-namespace irstlm {
-/*
-interpolation of several sub LMs
-*/
-
-#define LMINTERPOLATION_MAX_TOKEN 3
-
-class lmInterpolation: public lmContainer
-{
- static const bool debug=true;
- int m_number_lm;
- int order;
- int dictionary_upperbound; //set by user
- double logOOVpenalty; //penalty for OOV words (default 0)
- bool isInverted;
- int memmap; //level from which n-grams are accessed via mmap
- std::vector<double> m_weight;
- std::vector<std::string> m_file;
- std::vector<bool> m_isinverted;
- std::vector<lmContainer*> m_lm;
-
- int maxlev; //maximun order of sub LMs;
-
- float ngramcache_load_factor;
- float dictionary_load_factor;
-
- dictionary *dict; // dictionary for all interpolated LMs
-
-public:
-
- lmInterpolation(float nlf=0.0, float dlfi=0.0);
- virtual ~lmInterpolation() {};
-
- void load(const std::string &filename,int mmap=0);
- lmContainer* load_lm(int i, int memmap, float nlf, float dlf);
-
- double clprob(ngram ng, double* bow=NULL,int* bol=NULL,ngram_state_t* maxsuffidx=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
- double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,ngram_state_t* maxsuffidx=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
-
- const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);
- const char *cmaxsuffptr(int* codes, int sz, unsigned int* size=NULL);
- ngram_state_t cmaxsuffidx(ngram ong, unsigned int* size=NULL);
- ngram_state_t cmaxsuffidx(int* codes, int sz, unsigned int* size=NULL);
-
-
- int maxlevel() const {
- return maxlev;
- };
-
- virtual inline void setDict(dictionary* d) {
- if (dict) delete dict;
- dict=d;
- };
+namespace irstlm {
+ /*
+ interpolation of several sub LMs
+ */
- virtual inline dictionary* getDict() const {
- return dict;
- };
-
- //set penalty for OOV words
- virtual inline double getlogOOVpenalty() const {
- return logOOVpenalty;
- }
-
- virtual double setlogOOVpenalty(int dub);
-
- double inline setlogOOVpenalty(double oovp) {
- return logOOVpenalty=oovp;
- }
-
-//set the inverted flag (used to set the inverted flag of each subLM, when loading)
- inline bool is_inverted(const bool flag) {
- return isInverted = flag;
- }
-
-//for an interpolation LM this variable does not make sense
-//for compatibility, we return true if all subLM return true
- inline bool is_inverted() {
- for (int i=0; i<m_number_lm; i++) {
- if (m_isinverted[i] == false) return false;
- }
- return true;
- }
-
- inline virtual void dictionary_incflag(const bool flag) {
- dict->incflag(flag);
- };
-
- inline virtual bool is_OOV(int code) { //returns true if the word is OOV for each subLM
- for (int i=0; i<m_number_lm; i++) {
- int _code=m_lm[i]->getDict()->encode(getDict()->decode(code));
- if (m_lm[i]->is_OOV(_code) == false) return false;
- }
- return true;
- }
+#define LMINTERPOLATION_MAX_TOKEN 3
- virtual int addWord(const char *w){
- for (int i=0; i<m_number_lm; i++) {
- m_lm[i]->getDict()->incflag(1);
- m_lm[i]->getDict()->encode(w);
- m_lm[i]->getDict()->incflag(0);
+ class lmInterpolation: public lmContainer
+ {
+ static const bool debug=true;
+ int m_number_lm;
+ int order;
+ int dictionary_upperbound; //set by user
+ double logOOVpenalty; //penalty for OOV words (default 0)
+ bool isInverted;
+ int memmap; //level from which n-grams are accessed via mmap
+
+ std::vector<double> m_weight;
+ std::vector<std::string> m_file;
+ std::vector<bool> m_isinverted;
+ std::vector<lmContainer*> m_lm;
+
+ int maxlev; //maximun order of sub LMs;
+
+ float ngramcache_load_factor;
+ float dictionary_load_factor;
+
+ dictionary *dict; // dictionary for all interpolated LMs
+
+ public:
+
+ lmInterpolation(float nlf=0.0, float dlfi=0.0);
+ virtual ~lmInterpolation() {};
+
+ virtual void load(const std::string &filename,int mmap=0);
+ lmContainer* load_lm(int i, int memmap, float nlf, float dlf);
+
+ virtual double clprob(ngram ng, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
+ // double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,ngram_state_t* maxsuffidx=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
+
+ const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);
+ // const char *cmaxsuffptr(int* codes, int sz, unsigned int* size=NULL);
+ ngram_state_t cmaxsuffidx(ngram ong, unsigned int* size=NULL);
+ // ngram_state_t cmaxsuffidx(int* codes, int sz, unsigned int* size=NULL);
+
+
+ int maxlevel() const {
+ return maxlev;
+ };
+
+ virtual inline void setDict(dictionary* d) {
+ if (dict) delete dict;
+ dict=d;
+ };
+
+ virtual inline dictionary* getDict() const {
+ return dict;
+ };
+
+ //set penalty for OOV words
+ virtual inline double getlogOOVpenalty() const {
+ return logOOVpenalty;
}
- getDict()->incflag(1);
- int c=getDict()->encode(w);
- getDict()->incflag(0);
- return c;
- }
-
-};
+
+ virtual double setlogOOVpenalty(int dub);
+
+ double inline setlogOOVpenalty(double oovp) {
+ return logOOVpenalty=oovp;
+ }
+
+ //set the inverted flag (used to set the inverted flag of each subLM, when loading)
+ inline bool is_inverted(const bool flag) {
+ return isInverted = flag;
+ }
+
+ //for an interpolation LM this variable does not make sense
+ //for compatibility, we return true if all subLM return true
+ inline bool is_inverted() {
+ for (int i=0; i<m_number_lm; i++) {
+ if (m_isinverted[i] == false) return false;
+ }
+ return true;
+ }
+
+ inline virtual void dictionary_incflag(const bool flag) {
+ dict->incflag(flag);
+ };
+
+ inline virtual bool is_OOV(int code) { //returns true if the word is OOV for each subLM
+ for (int i=0; i<m_number_lm; i++) {
+ int _code=m_lm[i]->getDict()->encode(getDict()->decode(code));
+ if (m_lm[i]->is_OOV(_code) == false) return false;
+ }
+ return true;
+ }
+
+ virtual int addWord(const char *w){
+ for (int i=0; i<m_number_lm; i++) {
+ m_lm[i]->getDict()->incflag(1);
+ m_lm[i]->getDict()->encode(w);
+ m_lm[i]->getDict()->incflag(0);
+ }
+ getDict()->incflag(1);
+ int c=getDict()->encode(w);
+ getDict()->incflag(0);
+ return c;
+ }
+
+ };
}//namespace irstlm
#endif
diff --git a/src/lmclass.cpp b/src/lmclass.cpp
index 8788a00..b578eb3 100644
--- a/src/lmclass.cpp
+++ b/src/lmclass.cpp
@@ -1,24 +1,24 @@
// $Id: lmclass.cpp 3631 2010-10-07 12:04:12Z bertoldi $
/******************************************************************************
-IrstLM: IRST Language Model Toolkit
-Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-
-******************************************************************************/
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
@@ -37,7 +37,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
// local utilities: start
-
+
int parseWords(char *sentence, const char **words, int max);
inline void error(const char* message)
@@ -49,190 +49,190 @@ inline void error(const char* message)
// local utilities: end
namespace irstlm {
-
-lmclass::lmclass(float nlf, float dlfi):lmtable(nlf,dlfi)
-{
- MaxMapSize=1000000;
- MapScore= (double *)malloc(MaxMapSize*sizeof(double));// //array of probabilities
- memset(MapScore,0,MaxMapSize*sizeof(double));
- MapScoreN=0;
- dict = new dictionary((char *)NULL,MaxMapSize); //word to cluster dictionary
-};
-
-lmclass::~lmclass()
-{
- free (MapScore);
- delete dict;
-}
-
-void lmclass::load(const std::string &filename,int memmap)
-{
- VERBOSE(2,"lmclass::load(const std::string &filename,int memmap)" << std::endl);
-
- //get info from the configuration file
- fstream inp(filename.c_str(),ios::in|ios::binary);
-
- char line[MAX_LINE];
- const char* words[LMCLASS_MAX_TOKEN];
- int tokenN;
- inp.getline(line,MAX_LINE,'\n');
- tokenN = parseWords(line,words,LMCLASS_MAX_TOKEN);
-
- if (tokenN != 2 || ((strcmp(words[0],"LMCLASS") != 0) && (strcmp(words[0],"lmclass")!=0)))
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCLASS LM_order\nfilename_of_LM\nfilename_of_map");
-
- maxlev = atoi(words[1]);
- std::string lmfilename;
- if (inp.getline(line,MAX_LINE,'\n')) {
- tokenN = parseWords(line,words,LMCLASS_MAX_TOKEN);
- lmfilename = words[0];
- } else {
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCLASS LM_order\nfilename_of_LM\nfilename_of_map");
- }
-
- std::string W2Cdict = "";
- if (inp.getline(line,MAX_LINE,'\n')) {
- tokenN = parseWords(line,words,LMCLASS_MAX_TOKEN);
- W2Cdict = words[0];
- } else {
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCLASS LM_order\nfilename_of_LM\nfilename_of_map");
- }
- inp.close();
-
- std::cerr << "lmfilename:" << lmfilename << std::endl;
- if (W2Cdict != "") {
- std::cerr << "mapfilename:" << W2Cdict << std::endl;
- } else {
- error((char*)"ERROR: you must specify a map!");
- }
-
-
- // Load the (possibly binary) LM
- inputfilestream inpLM(lmfilename.c_str());
- if (!inpLM.good()) {
- std::cerr << "Failed to open " << lmfilename << "!" << std::endl;
- exit(1);
- }
- lmtable::load(inpLM,lmfilename.c_str(),NULL,memmap);
-
- inputfilestream inW2C(W2Cdict);
- if (!inW2C.good()) {
- std::cerr << "Failed to open " << W2Cdict << "!" << std::endl;
- exit(1);
- }
- loadMap(inW2C);
- getDict()->genoovcode();
-
- VERBOSE(2,"OOV code of lmclass is " << getDict()->oovcode() << " mapped into " << getMap(getDict()->oovcode())<< "\n");
- getDict()->incflag(1);
-}
-
-void lmclass::loadMap(istream& inW2C)
-{
-
- double lprob=0.0;
- int howmany=0;
-
- const char* words[1 + LMTMAXLEV + 1 + 1];
-
- //open input stream and prepare an input string
- char line[MAX_LINE];
-
- dict->incflag(1); //can add to the map dictionary
-
- cerr<<"loadW2Cdict()...\n";
- //save freq of EOS and BOS
-
- loadMapElement(dict->BoS(),lmtable::dict->BoS(),0.0);
- loadMapElement(dict->EoS(),lmtable::dict->EoS(),0.0);
-
- //should i add <unk> to the dict or just let the trans_freq handle <unk>
- loadMapElement(dict->OOV(),lmtable::dict->OOV(),0.0);
-
- while (inW2C.getline(line,MAX_LINE)) {
- if (strlen(line)==MAX_LINE-1) {
- cerr << "lmtable::loadW2Cdict: input line exceed MAXLINE ("
- << MAX_LINE << ") chars " << line << "\n";
- exit(1);
- }
-
- howmany = parseWords(line, words, 4); //3
-
- if(howmany == 3) {
- MY_ASSERT(sscanf(words[2], "%lf", &lprob));
- lprob=(double)log10(lprob);
- } else if(howmany==2) {
-
- VERBOSE(3,"No score for the pair (" << words[0] << "," << words[1] << "); set to default 1.0\n");
-
- lprob=0.0;
- } else {
- cerr << "parseline: not enough entries" << line << "\n";
- exit(1);
- }
- loadMapElement(words[0],words[1],lprob);
-
- //check if the are available position in MapScore
- checkMap();
- }
-
- VERBOSE(2,"There are " << MapScoreN << " entries in the map\n");
-
- dict->incflag(0); //can NOT add to the dictionary of lmclass
-}
-
-void lmclass::checkMap()
-{
- if (MapScoreN > MaxMapSize) {
- MaxMapSize=2*MapScoreN;
- MapScore = (double*) reallocf(MapScore, sizeof(double)*(MaxMapSize));
- VERBOSE(2,"In lmclass::checkMap(...) MaxMapSize=" << MaxMapSize << " MapScoreN=" << MapScoreN << "\n");
- }
-}
-
-void lmclass::loadMapElement(const char* in, const char* out, double sc)
-{
- //freq of word (in) encodes the ID of the class (out)
- //save the probability associated with the pair (in,out)
- int wcode=dict->encode(in);
- dict->freq(wcode,lmtable::dict->encode(out));
- MapScore[wcode]=sc;
- VERBOSE(3,"In lmclass::loadMapElement(...) in=" << in << " wcode=" << wcode << " out=" << out << " ccode=" << lmtable::dict->encode(out) << " MapScoreN=" << MapScoreN << "\n");
-
- if (wcode >= MapScoreN) MapScoreN++; //increment size of the array MapScore if the element is new
-}
-//double lmclass::lprob(ngram ong,double* bow, int* bol, char** maxsuffptr,unsigned int* statesize,bool* extendible)
-double lmclass::lprob(ngram ong,double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr,unsigned int* statesize,bool* extendible)
-{
- double lpr=getMapScore(*ong.wordp(1));
-
- VERBOSE(3,"In lmclass::lprob(...) Mapscore = " << lpr << "\n");
-
- //convert ong to it's clustered encoding
- ngram mapped_ng(lmtable::getDict());
- // mapped_ng.trans_freq(ong);
- mapping(ong,mapped_ng);
-
-// lpr+=lmtable::clprob(mapped_ng,bow,bol,maxsuffptr,statesize, extendible);
- lpr+=lmtable::clprob(mapped_ng,bow,bol,maxsuffidx,maxsuffptr,statesize, extendible);
+ lmclass::lmclass(float nlf, float dlfi):lmtable(nlf,dlfi)
+ {
+ MaxMapSize=1000000;
+ MapScore= (double *)malloc(MaxMapSize*sizeof(double));// //array of probabilities
+ memset(MapScore,0,MaxMapSize*sizeof(double));
+ MapScoreN=0;
+ dict = new dictionary((char *)NULL,MaxMapSize); //word to cluster dictionary
+ };
- VERBOSE(3,"In lmclass::lprob(...) global prob = " << lpr << "\n");
- return lpr;
-}
-
-void lmclass::mapping(ngram &in, ngram &out)
-{
- int insize = in.size;
- VERBOSE(3,"In lmclass::mapping(ngram &in, ngram &out) in = " << in << "\n");
-
- // map the input sequence (in) into the corresponding output sequence (out), by applying the provided map
- for (int i=insize; i>0; i--) {
- out.pushc(getMap(*in.wordp(i)));
- }
-
- VERBOSE(3,"In lmclass::mapping(ngram &in, ngram &out) out = " << out << "\n");
- return;
-}
+ lmclass::~lmclass()
+ {
+ free (MapScore);
+ delete dict;
+ }
+
+ void lmclass::load(const std::string &filename,int memmap)
+ {
+ VERBOSE(2,"lmclass::load(const std::string &filename,int memmap)" << std::endl);
+
+ //get info from the configuration file
+ fstream inp(filename.c_str(),ios::in|ios::binary);
+
+ char line[MAX_LINE];
+ const char* words[LMCLASS_MAX_TOKEN];
+ int tokenN;
+ inp.getline(line,MAX_LINE,'\n');
+ tokenN = parseWords(line,words,LMCLASS_MAX_TOKEN);
+
+ if (tokenN != 2 || ((strcmp(words[0],"LMCLASS") != 0) && (strcmp(words[0],"lmclass")!=0)))
+ error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCLASS LM_order\nfilename_of_LM\nfilename_of_map");
+
+ maxlev = atoi(words[1]);
+ std::string lmfilename;
+ if (inp.getline(line,MAX_LINE,'\n')) {
+ tokenN = parseWords(line,words,LMCLASS_MAX_TOKEN);
+ lmfilename = words[0];
+ } else {
+ error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCLASS LM_order\nfilename_of_LM\nfilename_of_map");
+ }
+
+ std::string W2Cdict = "";
+ if (inp.getline(line,MAX_LINE,'\n')) {
+ tokenN = parseWords(line,words,LMCLASS_MAX_TOKEN);
+ W2Cdict = words[0];
+ } else {
+ error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCLASS LM_order\nfilename_of_LM\nfilename_of_map");
+ }
+ inp.close();
+
+ std::cerr << "lmfilename:" << lmfilename << std::endl;
+ if (W2Cdict != "") {
+ std::cerr << "mapfilename:" << W2Cdict << std::endl;
+ } else {
+ error((char*)"ERROR: you must specify a map!");
+ }
+
+
+ // Load the (possibly binary) LM
+ inputfilestream inpLM(lmfilename.c_str());
+ if (!inpLM.good()) {
+ std::cerr << "Failed to open " << lmfilename << "!" << std::endl;
+ exit(1);
+ }
+ lmtable::load(inpLM,lmfilename.c_str(),NULL,memmap);
+
+ inputfilestream inW2C(W2Cdict);
+ if (!inW2C.good()) {
+ std::cerr << "Failed to open " << W2Cdict << "!" << std::endl;
+ exit(1);
+ }
+ loadMap(inW2C);
+ getDict()->genoovcode();
+
+ VERBOSE(2,"OOV code of lmclass is " << getDict()->oovcode() << " mapped into " << getMap(getDict()->oovcode())<< "\n");
+ getDict()->incflag(1);
+ }
+
+ void lmclass::loadMap(istream& inW2C)
+ {
+
+ double lprob=0.0;
+ int howmany=0;
+
+ const char* words[1 + LMTMAXLEV + 1 + 1];
+
+ //open input stream and prepare an input string
+ char line[MAX_LINE];
+
+ dict->incflag(1); //can add to the map dictionary
+
+ cerr<<"loadW2Cdict()...\n";
+ //save freq of EOS and BOS
+
+ loadMapElement(dict->BoS(),lmtable::dict->BoS(),0.0);
+ loadMapElement(dict->EoS(),lmtable::dict->EoS(),0.0);
+
+ //should i add <unk> to the dict or just let the trans_freq handle <unk>
+ loadMapElement(dict->OOV(),lmtable::dict->OOV(),0.0);
+
+ while (inW2C.getline(line,MAX_LINE)) {
+ if (strlen(line)==MAX_LINE-1) {
+ cerr << "lmtable::loadW2Cdict: input line exceed MAXLINE ("
+ << MAX_LINE << ") chars " << line << "\n";
+ exit(1);
+ }
+
+ howmany = parseWords(line, words, 4); //3
+
+ if(howmany == 3) {
+ MY_ASSERT(sscanf(words[2], "%lf", &lprob));
+ lprob=(double)log10(lprob);
+ } else if(howmany==2) {
+
+ VERBOSE(3,"No score for the pair (" << words[0] << "," << words[1] << "); set to default 1.0\n");
+
+ lprob=0.0;
+ } else {
+ cerr << "parseline: not enough entries" << line << "\n";
+ exit(1);
+ }
+ loadMapElement(words[0],words[1],lprob);
+
+ //check if the are available position in MapScore
+ checkMap();
+ }
+
+ VERBOSE(2,"There are " << MapScoreN << " entries in the map\n");
+
+ dict->incflag(0); //can NOT add to the dictionary of lmclass
+ }
+
+ void lmclass::checkMap()
+ {
+ if (MapScoreN > MaxMapSize) {
+ MaxMapSize=2*MapScoreN;
+ MapScore = (double*) reallocf(MapScore, sizeof(double)*(MaxMapSize));
+ VERBOSE(2,"In lmclass::checkMap(...) MaxMapSize=" << MaxMapSize << " MapScoreN=" << MapScoreN << "\n");
+ }
+ }
+
+ void lmclass::loadMapElement(const char* in, const char* out, double sc)
+ {
+ //freq of word (in) encodes the ID of the class (out)
+ //save the probability associated with the pair (in,out)
+ int wcode=dict->encode(in);
+ dict->freq(wcode,lmtable::dict->encode(out));
+ MapScore[wcode]=sc;
+ VERBOSE(3,"In lmclass::loadMapElement(...) in=" << in << " wcode=" << wcode << " out=" << out << " ccode=" << lmtable::dict->encode(out) << " MapScoreN=" << MapScoreN << "\n");
+
+ if (wcode >= MapScoreN) MapScoreN++; //increment size of the array MapScore if the element is new
+ }
+
+ //double lmclass::lprob(ngram ong,double* bow, int* bol, char** maxsuffptr,unsigned int* statesize,bool* extendible)
+ double lmclass::lprob(ngram ong,double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow)
+ {
+ double lpr=getMapScore(*ong.wordp(1));
+
+ VERBOSE(3,"In lmclass::lprob(...) Mapscore = " << lpr << "\n");
+
+ //convert ong to it's clustered encoding
+ ngram mapped_ng(lmtable::getDict());
+ // mapped_ng.trans_freq(ong);
+ mapping(ong,mapped_ng);
+
+ // lpr+=lmtable::clprob(mapped_ng,bow,bol,maxsuffptr,statesize, extendible);
+ lpr+=lmtable::clprob(mapped_ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow);
+
+ VERBOSE(3,"In lmclass::lprob(...) global prob = " << lpr << "\n");
+ return lpr;
+ }
+
+ void lmclass::mapping(ngram &in, ngram &out)
+ {
+ int insize = in.size;
+ VERBOSE(3,"In lmclass::mapping(ngram &in, ngram &out) in = " << in << "\n");
+
+ // map the input sequence (in) into the corresponding output sequence (out), by applying the provided map
+ for (int i=insize; i>0; i--) {
+ out.pushc(getMap(*in.wordp(i)));
+ }
+
+ VERBOSE(3,"In lmclass::mapping(ngram &in, ngram &out) out = " << out << "\n");
+ return;
+ }
}//namespace irstlm
diff --git a/src/lmclass.h b/src/lmclass.h
index 3ab299f..a22164d 100644
--- a/src/lmclass.h
+++ b/src/lmclass.h
@@ -1,24 +1,24 @@
// $Id: lmclass.h 3461 2010-08-27 10:17:34Z bertoldi $
/******************************************************************************
-IrstLM: IRST Language Model Toolkit
-Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-
-******************************************************************************/
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
#ifndef MF_LMCLASS_H
@@ -34,79 +34,76 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "dictionary.h"
#include "n_gram.h"
#include "lmtable.h"
-
+
#define LMCLASS_MAX_TOKEN 2
namespace irstlm {
-class lmclass: public lmtable
-{
- dictionary *dict; // dictionary (words - macro tags)
- double *MapScore;
- int MapScoreN;
- int MaxMapSize;
-
-protected:
- void loadMap(std::istream& inp);
- void loadMapElement(const char* in, const char* out, double sc);
- void mapping(ngram &in, ngram &out);
-
- inline double getMapScore(int wcode) {
-//the input word is un-known by the map, so I "transform" this word into the oov (of the words)
- if (wcode >= MapScoreN) {
- wcode = getDict()->oovcode();
- }
- return MapScore[wcode];
- };
-
- inline size_t getMap(int wcode) {
-//the input word is un-known by the map, so I "transform" this word into the oov (of the words)
- if (wcode >= MapScoreN) {
- wcode = getDict()->oovcode();
- }
- return dict->freq(wcode);
- };
-
- void checkMap();
-
-public:
- lmclass(float nlf=0.0, float dlfi=0.0);
-
- ~lmclass();
-
- void load(const std::string &filename,int mmap=0);
-
- double lprob(ngram ng, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) {
- return lprob(ng,bow,bol,NULL,maxsuffptr,statesize,extendible);
- };
- double clprob(ngram ng,double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) {
- return clprob(ng,bow,bol,NULL,maxsuffptr,statesize,extendible);
- };
- double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) {
- return clprob(ng,ngsize,bow,bol,NULL,maxsuffptr,statesize,extendible);
- };
-
- double lprob(ngram ng, double* bow=NULL,int* bol=NULL,ngram_state_t* maxsuffidx=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
- double clprob(ngram ng,double* bow=NULL,int* bol=NULL,ngram_state_t* maxsuffidx=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) {
- return lprob(ng,bow,bol,maxsuffidx,maxsuffptr,statesize,extendible);
- };
- double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,ngram_state_t* maxsuffidx=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) {
- ngram ong(getDict());
- ong.pushc(ng,ngsize);
- return lprob(ong,bow,bol,maxsuffidx,maxsuffptr,statesize,extendible);
- };
-
- inline bool is_OOV(int code) {
- //a word is consisdered OOV if its mapped value is OOV
- return lmtable::is_OOV(getMap(code));
- };
-
- inline dictionary* getDict() const {
- return dict;
- }
- inline virtual void dictionary_incflag(const bool flag) {
- dict->incflag(flag);
- };
-};
+ class lmclass: public lmtable
+ {
+ dictionary *dict; // dictionary (words - macro tags)
+ double *MapScore;
+ int MapScoreN;
+ int MaxMapSize;
+
+ protected:
+ void loadMap(std::istream& inp);
+ void loadMapElement(const char* in, const char* out, double sc);
+ void mapping(ngram &in, ngram &out);
+
+ inline double getMapScore(int wcode) {
+ //the input word is un-known by the map, so I "transform" this word into the oov (of the words)
+ if (wcode >= MapScoreN) {
+ wcode = getDict()->oovcode();
+ }
+ return MapScore[wcode];
+ };
+
+ inline size_t getMap(int wcode) {
+ //the input word is un-known by the map, so I "transform" this word into the oov (of the words)
+ if (wcode >= MapScoreN) {
+ wcode = getDict()->oovcode();
+ }
+ return dict->freq(wcode);
+ };
+
+ void checkMap();
+
+ public:
+ lmclass(float nlf=0.0, float dlfi=0.0);
+
+ ~lmclass();
+
+ virtual void load(const std::string &filename,int mmap=0);
+
+
+ // virtual double lprob(ngram ng, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL) { return lprob(ng,bow,bol,NULL,maxsuffptr,statesize,extendible,lastbow); };
+ // virtual double clprob(ngram ng,double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) { return clprob(ng,bow,bol,NULL,maxsuffptr,statesize,extendible); };
+
+ // double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) { return clprob(ng,ngsize,bow,bol,NULL,maxsuffptr,statesize,extendible); };
+
+ double lprob(ngram ng, double* bow=NULL,int* bol=NULL,ngram_state_t* maxsuffidx=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL);
+ double clprob(ngram ng,double* bow=NULL,int* bol=NULL,ngram_state_t* maxsuffidx=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL) {
+ return lprob(ng,bow,bol,maxsuffidx,maxsuffptr,statesize,extendible,lastbow);
+ };
+ /*
+ double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,ngram_state_t* maxsuffidx=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) {
+ ngram ong(getDict());
+ ong.pushc(ng,ngsize);
+ return lprob(ong,bow,bol,maxsuffidx,maxsuffptr,statesize,extendible);
+ };
+ */
+ inline bool is_OOV(int code) {
+ //a word is consisdered OOV if its mapped value is OOV
+ return lmtable::is_OOV(getMap(code));
+ };
+
+ inline dictionary* getDict() const {
+ return dict;
+ }
+ inline virtual void dictionary_incflag(const bool flag) {
+ dict->incflag(flag);
+ };
+ };
}//namespace irstlm
diff --git a/src/lmmacro.cpp b/src/lmmacro.cpp
index 66c7063..0f64477 100644
--- a/src/lmmacro.cpp
+++ b/src/lmmacro.cpp
@@ -1,24 +1,24 @@
// $Id: lmmacro.cpp 3631 2010-10-07 12:04:12Z bertoldi $
/******************************************************************************
-IrstLM: IRST Language Model Toolkit
-Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-
-******************************************************************************/
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
@@ -36,7 +36,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "util.h"
using namespace std;
-
+
// local utilities: start
inline void error(const char* message)
@@ -50,509 +50,265 @@ inline void error(const char* message)
namespace irstlm {
-lmmacro::lmmacro(float nlf, float dlfi):lmtable(nlf,dlfi)
-{
- dict = new dictionary((char *)NULL,1000000); // dict of micro tags
- getDict()->incflag(1);
-};
-
-lmmacro::~lmmacro()
-{
- if (mapFlag) unloadmap();
-}
-
-
-void lmmacro::load(const std::string &filename,int memmap)
-{
- VERBOSE(2,"lmmacro::load(const std::string &filename,int memmap)" << std::endl);
-
- //get info from the configuration file
- fstream inp(filename.c_str(),ios::in|ios::binary);
-
- char line[MAX_LINE];
- const char* words[MAX_TOKEN_N_MAP];
- int tokenN;
- inp.getline(line,MAX_LINE,'\n');
- tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
-
- if (tokenN != 4 || ((strcmp(words[0],"LMMACRO") != 0) && (strcmp(words[0],"lmmacro")!=0)))
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMMACRO lmsize field [true|false]\nfilename_of_LM\nfilename_of_map (optional)");
- maxlev = atoi(words[1]);
- selectedField = atoi(words[2]);
-
- if ((strcmp(words[3],"TRUE") == 0) || (strcmp(words[3],"true") == 0))
- collapseFlag = true;
- else if ((strcmp(words[3],"FALSE") == 0) || (strcmp(words[3],"false") == 0))
- collapseFlag = false;
- else
- error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMMACRO lmsize field [true|false]\nfilename_of_LM\nfilename_of_map (optional)");
-
+ lmmacro::lmmacro(float nlf, float dlfi):lmtable(nlf,dlfi)
+ {
+ dict = new dictionary((char *)NULL,1000000); // dict of micro tags
+ getDict()->incflag(1);
+ };
+
+ lmmacro::~lmmacro()
+ {
+ if (mapFlag) unloadmap();
+ }
+
+
+ void lmmacro::load(const std::string &filename,int memmap)
+ {
+ VERBOSE(2,"lmmacro::load(const std::string &filename,int memmap)" << std::endl);
+
+ //get info from the configuration file
+ fstream inp(filename.c_str(),ios::in|ios::binary);
+
+ char line[MAX_LINE];
+ const char* words[MAX_TOKEN_N_MAP];
+ int tokenN;
+ inp.getline(line,MAX_LINE,'\n');
+ tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
+
+ if (tokenN != 4 || ((strcmp(words[0],"LMMACRO") != 0) && (strcmp(words[0],"lmmacro")!=0)))
+ error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMMACRO lmsize field [true|false]\nfilename_of_LM\nfilename_of_map (optional)");
+ maxlev = atoi(words[1]);
+ selectedField = atoi(words[2]);
+
+ if ((strcmp(words[3],"TRUE") == 0) || (strcmp(words[3],"true") == 0))
+ collapseFlag = true;
+ else if ((strcmp(words[3],"FALSE") == 0) || (strcmp(words[3],"false") == 0))
+ collapseFlag = false;
+ else
+ error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMMACRO lmsize field [true|false]\nfilename_of_LM\nfilename_of_map (optional)");
+
#ifdef DLEXICALLM
- selectedFieldForLexicon = atoi(words[3]);
- collapseFlag = atoi(words[4]);
+ selectedFieldForLexicon = atoi(words[3]);
+ collapseFlag = atoi(words[4]);
#endif
-
- if (selectedField == -1)
- cerr << "no selected field: the whole string is used" << std::endl;
- else
- cerr << "selected field n. " << selectedField << std::endl;
- if (collapseFlag)
- cerr << "collapse is enabled" << std::endl;
- else
- cerr << "collapse is disabled" << std::endl;
-
-
- std::string lmfilename;
- if (inp.getline(line,MAX_LINE,'\n')) {
- tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
- lmfilename = words[0];
- } else
- error((char*)"ERROR: wrong format of configuration file\ncorrect format: LMMACRO lmsize field [true|false]\nfilename_of_LM\nfilename_of_map (optional)");
-
- std::string mapfilename = "";
- if (inp.getline(line,MAX_LINE,'\n')) {
- tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
- mapfilename = words[0];
- mapFlag = true;
- } else {
- mapFlag = false;
- }
-
- inp.close();
-
-
- std::cerr << "lmfilename:" << lmfilename << std::endl;
- if (mapfilename != "") {
- std::cerr << "mapfilename:" << mapfilename << std::endl;
- } else {
- std::cerr << "no mapfilename" << std::endl;
- mapFlag = false;
- }
-
- //allow the dictionary to add new words
- getDict()->incflag(1);
-
-
- if ((!mapFlag) && (collapseFlag)) {
- error((char*)"ERROR: you must specify a map if you want to collapse a specific field!");
- }
+
+ if (selectedField == -1)
+ cerr << "no selected field: the whole string is used" << std::endl;
+ else
+ cerr << "selected field n. " << selectedField << std::endl;
+ if (collapseFlag)
+ cerr << "collapse is enabled" << std::endl;
+ else
+ cerr << "collapse is disabled" << std::endl;
+
+
+ std::string lmfilename;
+ if (inp.getline(line,MAX_LINE,'\n')) {
+ tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
+ lmfilename = words[0];
+ } else
+ error((char*)"ERROR: wrong format of configuration file\ncorrect format: LMMACRO lmsize field [true|false]\nfilename_of_LM\nfilename_of_map (optional)");
+
+ std::string mapfilename = "";
+ if (inp.getline(line,MAX_LINE,'\n')) {
+ tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
+ mapfilename = words[0];
+ mapFlag = true;
+ } else {
+ mapFlag = false;
+ }
+
+ inp.close();
+
+
+ std::cerr << "lmfilename:" << lmfilename << std::endl;
+ if (mapfilename != "") {
+ std::cerr << "mapfilename:" << mapfilename << std::endl;
+ } else {
+ std::cerr << "no mapfilename" << std::endl;
+ mapFlag = false;
+ }
+
+ //allow the dictionary to add new words
+ getDict()->incflag(1);
+
+
+ if ((!mapFlag) && (collapseFlag)) {
+ error((char*)"ERROR: you must specify a map if you want to collapse a specific field!");
+ }
#ifdef DLEXICALLM
-
- std::string lexicalclassesfilename = words[2];
- if (lexicalclassesfilename != "NULL" && lexicalclassesfilename != "null") lexicalclassesfilename = "";
-
- if (lexicalclassesfilename != "") std::cerr << "lexicalclassesfilename:" << lexicalclassesfilename << std::endl;
- else std::cerr << "no lexicalclassesfilename" << std::endl;
-
- // Load the classes of lexicalization tokens:
- if (lexicalclassesfilename != "") loadLexicalClasses(lexicalclassesfilename.c_str());
+
+ std::string lexicalclassesfilename = words[2];
+ if (lexicalclassesfilename != "NULL" && lexicalclassesfilename != "null") lexicalclassesfilename = "";
+
+ if (lexicalclassesfilename != "") std::cerr << "lexicalclassesfilename:" << lexicalclassesfilename << std::endl;
+ else std::cerr << "no lexicalclassesfilename" << std::endl;
+
+ // Load the classes of lexicalization tokens:
+ if (lexicalclassesfilename != "") loadLexicalClasses(lexicalclassesfilename.c_str());
#endif
-
- // Load the (possibly binary) LM
- lmtable::load(lmfilename,memmap);
-
- getDict()->incflag(1);
+
+ // Load the (possibly binary) LM
+ lmtable::load(lmfilename,memmap);
+
+ getDict()->incflag(1);
+
+ if (mapFlag)
+ loadmap(mapfilename);
+ getDict()->genoovcode();
+
+ };
- if (mapFlag)
- loadmap(mapfilename);
- getDict()->genoovcode();
-
-};
-
-void lmmacro::unloadmap()
-{
- delete dict;
- free(microMacroMap);
- if (collapseFlag) {
- free(collapsableMap);
- free(collapsatorMap);
- }
+ void lmmacro::unloadmap()
+ {
+ delete dict;
+ free(microMacroMap);
+ if (collapseFlag) {
+ free(collapsableMap);
+ free(collapsatorMap);
+ }
#ifdef DLEXICALLM
- free(lexicaltoken2classMap);
+ free(lexicaltoken2classMap);
#endif
-}
-
-void lmmacro::loadmap(const std::string mapfilename)
-{
- microMacroMapN = 0;
- microMacroMap = NULL;
- collapsableMap = NULL;
- collapsatorMap = NULL;
-
+ }
+
+ void lmmacro::loadmap(const std::string mapfilename)
+ {
+ microMacroMapN = 0;
+ microMacroMap = NULL;
+ collapsableMap = NULL;
+ collapsatorMap = NULL;
+
#ifdef DLEXICALLM
- lexicaltoken2classMap = NULL;
- lexicaltoken2classMapN = 0;
+ lexicaltoken2classMap = NULL;
+ lexicaltoken2classMapN = 0;
#endif
-
- microMacroMap = (int *)calloc(BUFSIZ, sizeof(int));
- if (collapseFlag) {
- collapsableMap = (bool *)calloc(BUFSIZ, sizeof(bool));
- collapsatorMap = (bool *)calloc(BUFSIZ, sizeof(bool));
- }
-
-
- getDict()->genoovcode();
- microMacroMap[microMacroMapN] = lmtable::getDict()->oovcode();
- MY_ASSERT(microMacroMapN == getDict()->oovcode());
- microMacroMapN++;
-
-
- if (lmtable::getDict()->getcode(BOS_)==-1) {
- lmtable::getDict()->incflag(1);
- lmtable::getDict()->encode(BOS_);
- lmtable::getDict()->incflag(0);
- }
-
- if (lmtable::getDict()->getcode(EOS_)==-1) {
- lmtable::getDict()->incflag(1);
- lmtable::getDict()->encode(EOS_);
- lmtable::getDict()->incflag(0);
- }
-
- char line[MAX_LINE];
- const char* words[MAX_TOKEN_N_MAP];
- const char *macroW;
- const char *microW;
- int tokenN;
- bool bos=false,eos=false;
-
- // Load the dictionary of micro tags (to be put in "dict" of lmmacro class):
- inputfilestream inpMap(mapfilename.c_str());
- std::cerr << "Reading map " << mapfilename << "..." << std::endl;
- while (inpMap.getline(line,MAX_LINE,'\n')) {
- tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
- if (tokenN != 2)
- error((char*)"ERROR: wrong format of map file\n");
- microW = words[0];
- macroW = words[1];
- int microW_c=getDict()->encode(microW);
- VERBOSE(4, "microW gets the code:" << microW_c << std::endl);
-
- if (microMacroMapN>0 && !(microMacroMapN % BUFSIZ)) {
- microMacroMap = (int *)reallocf(microMacroMap, sizeof(int)*(BUFSIZ*(1+microMacroMapN/BUFSIZ)));
- if (collapseFlag) {
- //create supporting info for collapse
-
- collapsableMap = (bool *)reallocf(collapsableMap, sizeof(bool)*(BUFSIZ*(1+microMacroMapN/BUFSIZ)));
- collapsatorMap = (bool *)reallocf(collapsatorMap, sizeof(bool)*(BUFSIZ*(1+microMacroMapN/BUFSIZ)));
- }
- }
- microMacroMap[microMacroMapN] = lmtable::getDict()->getcode(macroW);
- if (collapseFlag) {
-
- int len = strlen(microW)-1;
- if (microW[len] == '(') {
- collapsableMap[microMacroMapN] = false;
- collapsatorMap[microMacroMapN] = true;
- } else if (microW[len] == ')') {
- collapsableMap[microMacroMapN] = true;
- collapsatorMap[microMacroMapN] = false;
- } else if (microW[len] == '+') {
- collapsableMap[microMacroMapN] = true;
- collapsatorMap[microMacroMapN] = true;
- } else {
- collapsableMap[microMacroMapN] = false;
- collapsatorMap[microMacroMapN] = false;
- }
- }
-
- if (!bos && !strcmp(microW,BOS_)) bos=true;
- if (!eos && !strcmp(microW,EOS_)) eos=true;
-
- VERBOSE(2,"\nmicroW = " << microW << "\n"
- << "macroW = " << macroW << "\n"
- << "microMacroMapN = " << microMacroMapN << "\n"
- << "code of micro = " << getDict()->getcode(microW) << "\n"
- << "code of macro = " << lmtable::getDict()->getcode(macroW) << "\n");
-
- microMacroMapN++;
- }
-
- if ((microMacroMapN == 0) && (selectedField == -1))
- error((char*)"ERROR: with no field selection, a map for the whole string is mandatory\n");
-
- if (microMacroMapN>0) {
- // Add <s>-><s> to map if missing
- if (!bos) {
- getDict()->encode(BOS_);
- if (microMacroMapN && !(microMacroMapN%BUFSIZ))
- microMacroMap = (int *)reallocf(microMacroMap, sizeof(int)*(microMacroMapN+BUFSIZ));
- microMacroMap[microMacroMapN++] = lmtable::getDict()->getcode(BOS_);
- }
-
- // Add </s>-></s> to map if missing
- if (!eos) {
- getDict()->encode(EOS_);
- if (microMacroMapN && !(microMacroMapN%BUFSIZ))
- microMacroMap = (int *)reallocf(microMacroMap, sizeof(int)*(microMacroMapN+BUFSIZ));
- microMacroMap[microMacroMapN++] = lmtable::getDict()->getcode(EOS_);
- }
- }
- // getDict()->incflag(0);
-
- VERBOSE(2,"oovcode(micro)=" << getDict()->oovcode() << "\n"
- << "oovcode(macro)=" << lmtable::getDict()->oovcode() << "\n"
- << "microMacroMapN = " << microMacroMapN << "\n"
- << "macrodictsize = " << getDict()->size() << "\n"
- << "microdictsize = " << lmtable::getDict()->size() << "\n");
-
- IFVERBOSE(2) {
- for (int i=0; i<microMacroMapN; i++) {
- VERBOSE(2,"micro[" << getDict()->decode(i) << "] {"<< i << "} -> " << lmtable::getDict()->decode(microMacroMap[i]) << " {" << microMacroMap[i]<< "}" << "\n");
- }
- }
- std::cerr << "...done\n";
-}
-
-
-double lmmacro::lprob(ngram micro_ng)
-{
- VERBOSE(2,"lmmacro::lprob, parameter = <" << micro_ng << ">\n");
-
- ngram macro_ng(lmtable::getDict());
-
- if (micro_ng.dict == macro_ng.dict)
- macro_ng.trans(micro_ng); // micro to macro mapping already done
- else
- map(&micro_ng, &macro_ng); // mapping required
-
- VERBOSE(3,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
- << "lmmacro::lprob: macro_ng = " << macro_ng << "\n");
-
- // ask LM with macro
- double prob;
- prob = lmtable::lprob(macro_ng);
- VERBOSE(3,"prob = " << prob << "\n");
-
- return prob;
-};
+ microMacroMap = (int *)calloc(BUFSIZ, sizeof(int));
+ if (collapseFlag) {
+ collapsableMap = (bool *)calloc(BUFSIZ, sizeof(bool));
+ collapsatorMap = (bool *)calloc(BUFSIZ, sizeof(bool));
+ }
+
+
+ getDict()->genoovcode();
+ microMacroMap[microMacroMapN] = lmtable::getDict()->oovcode();
+ MY_ASSERT(microMacroMapN == getDict()->oovcode());
+ microMacroMapN++;
+
+
+ if (lmtable::getDict()->getcode(BOS_)==-1) {
+ lmtable::getDict()->incflag(1);
+ lmtable::getDict()->encode(BOS_);
+ lmtable::getDict()->incflag(0);
+ }
+
+ if (lmtable::getDict()->getcode(EOS_)==-1) {
+ lmtable::getDict()->incflag(1);
+ lmtable::getDict()->encode(EOS_);
+ lmtable::getDict()->incflag(0);
+ }
+
+ char line[MAX_LINE];
+ const char* words[MAX_TOKEN_N_MAP];
+ const char *macroW;
+ const char *microW;
+ int tokenN;
+ bool bos=false,eos=false;
+
+ // Load the dictionary of micro tags (to be put in "dict" of lmmacro class):
+ inputfilestream inpMap(mapfilename.c_str());
+ std::cerr << "Reading map " << mapfilename << "..." << std::endl;
+ while (inpMap.getline(line,MAX_LINE,'\n')) {
+ tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
+ if (tokenN != 2)
+ error((char*)"ERROR: wrong format of map file\n");
+ microW = words[0];
+ macroW = words[1];
+ int microW_c=getDict()->encode(microW);
+ VERBOSE(4, "microW gets the code:" << microW_c << std::endl);
+
+ if (microMacroMapN>0 && !(microMacroMapN % BUFSIZ)) {
+ microMacroMap = (int *)reallocf(microMacroMap, sizeof(int)*(BUFSIZ*(1+microMacroMapN/BUFSIZ)));
+ if (collapseFlag) {
+ //create supporting info for collapse
+
+ collapsableMap = (bool *)reallocf(collapsableMap, sizeof(bool)*(BUFSIZ*(1+microMacroMapN/BUFSIZ)));
+ collapsatorMap = (bool *)reallocf(collapsatorMap, sizeof(bool)*(BUFSIZ*(1+microMacroMapN/BUFSIZ)));
+ }
+ }
+ microMacroMap[microMacroMapN] = lmtable::getDict()->getcode(macroW);
+
+ if (collapseFlag) {
+
+ int len = strlen(microW)-1;
+ if (microW[len] == '(') {
+ collapsableMap[microMacroMapN] = false;
+ collapsatorMap[microMacroMapN] = true;
+ } else if (microW[len] == ')') {
+ collapsableMap[microMacroMapN] = true;
+ collapsatorMap[microMacroMapN] = false;
+ } else if (microW[len] == '+') {
+ collapsableMap[microMacroMapN] = true;
+ collapsatorMap[microMacroMapN] = true;
+ } else {
+ collapsableMap[microMacroMapN] = false;
+ collapsatorMap[microMacroMapN] = false;
+ }
+ }
+
+ if (!bos && !strcmp(microW,BOS_)) bos=true;
+ if (!eos && !strcmp(microW,EOS_)) eos=true;
+
+ VERBOSE(2,"\nmicroW = " << microW << "\n"
+ << "macroW = " << macroW << "\n"
+ << "microMacroMapN = " << microMacroMapN << "\n"
+ << "code of micro = " << getDict()->getcode(microW) << "\n"
+ << "code of macro = " << lmtable::getDict()->getcode(macroW) << "\n");
+
+ microMacroMapN++;
+ }
+
+ if ((microMacroMapN == 0) && (selectedField == -1))
+ error((char*)"ERROR: with no field selection, a map for the whole string is mandatory\n");
+
+ if (microMacroMapN>0) {
+ // Add <s>-><s> to map if missing
+ if (!bos) {
+ getDict()->encode(BOS_);
+ if (microMacroMapN && !(microMacroMapN%BUFSIZ))
+ microMacroMap = (int *)reallocf(microMacroMap, sizeof(int)*(microMacroMapN+BUFSIZ));
+ microMacroMap[microMacroMapN++] = lmtable::getDict()->getcode(BOS_);
+ }
+
+ // Add </s>-></s> to map if missing
+ if (!eos) {
+ getDict()->encode(EOS_);
+ if (microMacroMapN && !(microMacroMapN%BUFSIZ))
+ microMacroMap = (int *)reallocf(microMacroMap, sizeof(int)*(microMacroMapN+BUFSIZ));
+ microMacroMap[microMacroMapN++] = lmtable::getDict()->getcode(EOS_);
+ }
+ }
+ // getDict()->incflag(0);
+
+ VERBOSE(2,"oovcode(micro)=" << getDict()->oovcode() << "\n"
+ << "oovcode(macro)=" << lmtable::getDict()->oovcode() << "\n"
+ << "microMacroMapN = " << microMacroMapN << "\n"
+ << "macrodictsize = " << getDict()->size() << "\n"
+ << "microdictsize = " << lmtable::getDict()->size() << "\n");
+
+ IFVERBOSE(2) {
+ for (int i=0; i<microMacroMapN; i++) {
+ VERBOSE(2,"micro[" << getDict()->decode(i) << "] {"<< i << "} -> " << lmtable::getDict()->decode(microMacroMap[i]) << " {" << microMacroMap[i]<< "}" << "\n");
+ }
+ }
+ std::cerr << "...done\n";
+ }
-//double lmmacro::clprob(int* codes, int sz, double* bow, int* bol, char** state,unsigned int* statesize,bool* extendible)
-double lmmacro::clprob(int* codes, int sz, double* bow, int* bol, ngram_state_t* ngramstate, char** state,unsigned int* statesize,bool* extendible)
-{
- ngram micro_ng(getDict());
- micro_ng.pushc(codes,sz);
-// return clprob(micro_ng,bow,bol,state,statesize,extendible);
- return clprob(micro_ng,bow,bol,ngramstate,state,statesize,extendible);
-}
-
-// double lmmacro::clprob(ngram micro_ng, double* bow, int* bol, char** state,unsigned int* statesize,bool* extendible)
-double lmmacro::clprob(ngram micro_ng, double* bow, int* bol, ngram_state_t* ngramstate, char** state,unsigned int* statesize,bool* extendible)
-{
-
- VERBOSE(3," lmmacro::clprob(ngram), parameter = <" << micro_ng << ">\n");
-
- ngram transformed_ng(lmtable::getDict());
- bool collapsed = transform(micro_ng, transformed_ng);
- VERBOSE(3,"lmmacro::clprob(ngram), transformed_ng = <" << transformed_ng << ">\n");
-
- double logpr;
-
- if (collapsed) {
- // the last token of the ngram continues an already open "chunk"
- // the probability at chunk-level is not computed because it has been already computed when the actual"chunk" opens
- VERBOSE(3," SKIPPED call to lmtable::clprob because of collapse; logpr: 0.0\n");
- logpr = 0.0;
- } else {
- VERBOSE(3," QUERY MACRO LM on (after transformation and size reduction) " << transformed_ng << "\n");
-// logpr = lmtable::clprob(transformed_ng, bow, bol, state, statesize, extendible);
- logpr = lmtable::clprob(transformed_ng, bow, bol, ngramstate, state, statesize, extendible);
- }
- VERBOSE(3," GET logpr: " << logpr << "\n");
-
- return logpr;
-}
-
-bool lmmacro::transform(ngram &in, ngram &out)
-{
- VERBOSE(3,"lmmacro::transform(ngram &in, ngram &out), in = <" << in << ">\n");
-
- //step 1: selection of the correct field
- ngram field_ng(getDict());
- if (selectedField >= 0)
- field_selection(in, field_ng);
- else
- field_ng = in;
-
- //step 2: collapsing
- ngram collapsed_ng(getDict());
- bool collapsed = false;
- if (collapseFlag)
- collapsed = collapse(field_ng, collapsed_ng);
- else
- collapsed_ng = field_ng;
-
- //step 3: mapping using the loaded map
- if (mapFlag)
- mapping(collapsed_ng, out);
- else
- out.trans(collapsed_ng);
-
- if (out.size>lmtable::maxlevel()) out.size=lmtable::maxlevel();
-
- VERBOSE(3,"lmmacro::transform(ngram &in, ngram &out), out = <" << out << ">\n");
- return collapsed;
-}
-
-
-
-void lmmacro::field_selection(ngram &in, ngram &out)
-{
- VERBOSE(3,"In lmmacro::field_selection(ngram &in, ngram &out) in = " << in << "\n");
-
- int microsize = in.size;
-
- for (int i=microsize; i>0; i--) {
-
- char curr_token[BUFSIZ];
- strcpy(curr_token, getDict()->decode(*in.wordp(i)));
- char *field;
- if (strcmp(curr_token,"<s>") &&
- strcmp(curr_token,"</s>") &&
- strcmp(curr_token,"_unk_")) {
- field = strtok(curr_token, "#");
- int j=0;
- while (j<selectedField && field != NULL) {
- field = strtok(0, "#");
- j++;
- }
- } else {
- field = curr_token;
- }
-
-
- if (field) {
- out.pushw(field);
- } else {
-
- out.pushw((char*)"_unk_");
-
- // cerr << *in << "\n";
- // error((char*)"ERROR: Malformed input: selected field does not exist in token\n");
-
- /**
- We can be here in 2 cases:
-
- a. effectively when the token is malformed, that is the selected
- field does not exist
-
- b. in case of verbatim translation, that is the source word is
- not known to the phrase table and moses transfers it as it is
- to the target side: in this case, no assumption can be made on its
- format, which means that the selected field can not exist
-
- The possibility of case (b) makes incorrect the error exit from
- the code at this point: correct, on the contrary, push the _unk_ string
- **/
- }
- }
- VERBOSE(3,"In lmmacro::field_selection(ngram &in, ngram &out) out = " << out << "\n");
- return;
-}
-
-bool lmmacro::collapse(ngram &in, ngram &out)
-{
- VERBOSE(3,"In lmmacro::collapse(ngram &in, ngram &out) in = " << in << "\n")
-
- // fill the ngram out with the collapsed tokens
- //return true if collapse happens for the most recent token
- //return false if collapse does not happen for the most recent token
- int microsize = in.size;
- out.size = 0;
-
- if (microsize == 1) {
- out.pushc(*in.wordp(1));
- return false;
- }
-
- int curr_code = *in.wordp(1);
- int prev_code = *in.wordp(2);
-
- if (microMacroMap[curr_code] == microMacroMap[prev_code]) {
- if (collapsableMap[curr_code] && collapsatorMap[prev_code]) {
- return true;
- }
- }
-
- //collapse does not happen for the most recent token
- // collapse all previous tokens, but the last
-
- prev_code = *in.wordp(microsize);
- out.pushc(prev_code);
-
- for (int i=microsize-1; i>1; i--) {
-
- curr_code = *in.wordp(i);
-
- if (microMacroMap[curr_code] != microMacroMap[prev_code]) {
- out.pushc(curr_code);
- } else {
- if (!(collapsableMap[curr_code] && collapsatorMap[prev_code])) {
- out.pushc(prev_code);
- }
- }
- prev_code = curr_code;
- }
- // and insert the most recent token
- out.pushc(*in.wordp(1));
- VERBOSE(3,"In lmmacro::collapse(ngram &in, ngram &out) out = " << out << "\n");
- return false;
-}
-
-void lmmacro::mapping(ngram &in, ngram &out)
-{
- VERBOSE(3,"In lmmacro::mapping(ngram &in, ngram &out) in = " << in << "\n");
-
- int microsize = in.size;
-
- // map microtag sequence (in) into the corresponding sequence of macrotags (possibly shorter) (out)
-
- for (int i=microsize; i>0; i--) {
-
- int in_code = *in.wordp(i);
- int out_code;
- if (in_code < microMacroMapN)
- out_code = microMacroMap[in_code];
- else
- out_code = lmtable::getDict()->oovcode();
-
- out.pushc(out_code);
- }
- VERBOSE(3,"In lmmacro::mapping(ngram &in, ngram &out) out = " << out << "\n");
- return;
-}
-
-
-//maxsuffptr returns the largest suffix of an n-gram that is contained
-//in the LM table. This can be used as a compact representation of the
-//(n-1)-gram state of a n-gram LM. if the input k-gram has k>=n then it
-//is trimmed to its n-1 suffix.
-
-const char *lmmacro::maxsuffptr(ngram micro_ng, unsigned int* size)
-{
- ngram macro_ng(lmtable::getDict());
-
- if (micro_ng.dict == macro_ng.dict)
- macro_ng.trans(micro_ng); // micro to macro mapping already done
- else
- map(&micro_ng, &macro_ng); // mapping required
-
- VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
- << "lmmacro::lprob: macro_ng = " << macro_ng << "\n");
-
- return lmtable::maxsuffptr(macro_ng,size);
-}
-const char *lmmacro::cmaxsuffptr(ngram micro_ng, unsigned int* size)
-{
- //cerr << "lmmacro::CMAXsuffptr\n";
- //cerr << "micro_ng: " << micro_ng
- // << " -> micro_ng.size: " << micro_ng.size << "\n";
-
- //the LM working on the selected field = 0
- //contributes to the LM state
- // if (selectedField>0) return NULL;
+ double lmmacro::lprob(ngram micro_ng)
+ {
+ VERBOSE(2,"lmmacro::lprob, parameter = <" << micro_ng << ">\n");
ngram macro_ng(lmtable::getDict());
@@ -561,39 +317,261 @@ const char *lmmacro::cmaxsuffptr(ngram micro_ng, unsigned int* size)
else
map(&micro_ng, &macro_ng); // mapping required
- VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
- << "lmmacro::lprob: macro_ng = " << macro_ng << "\n")
+ VERBOSE(3,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
+ << "lmmacro::lprob: macro_ng = " << macro_ng << "\n");
- return lmtable::cmaxsuffptr(macro_ng,size);
+ // ask LM with macro
+ double prob;
+ prob = lmtable::lprob(macro_ng);
+ VERBOSE(3,"prob = " << prob << "\n");
-}
+ return prob;
+ };
+ /*
+ //double lmmacro::clprob(int* codes, int sz, double* bow, int* bol, char** state,unsigned int* statesize,bool* extendible)
+ double lmmacro::clprob(int* codes, int sz, double* bow, int* bol, ngram_state_t* ngramstate, char** state,unsigned int* statesize,bool* extendible, double* lastbow)
+ {
+ ngram micro_ng(getDict());
+ micro_ng.pushc(codes,sz);
+ // return clprob(micro_ng,bow,bol,state,statesize,extendible);
+ return clprob(micro_ng,bow,bol,ngramstate,state,statesize,extendible, lastbow);
+ }
+ */
-ngram_state_t lmmacro::maxsuffidx(ngram micro_ng, unsigned int* size)
-{
- //cerr << "lmmacro::CMAXsuffptr\n";
- //cerr << "micro_ng: " << micro_ng
- // << " -> micro_ng.size: " << micro_ng.size << "\n";
+ // double lmmacro::clprob(ngram micro_ng, double* bow, int* bol, char** state,unsigned int* statesize,bool* extendible)
+ double lmmacro::clprob(ngram micro_ng, double* bow, int* bol, ngram_state_t* ngramstate, char** state,unsigned int* statesize,bool* extendible, double* lastbow)
+ {
- //the LM working on the selected field = 0
- //contributes to the LM state
- // if (selectedField>0) return NULL;
+ VERBOSE(3," lmmacro::clprob(ngram), parameter = <" << micro_ng << ">\n");
+
+ ngram transformed_ng(lmtable::getDict());
+ bool collapsed = transform(micro_ng, transformed_ng);
+ VERBOSE(3,"lmmacro::clprob(ngram), transformed_ng = <" << transformed_ng << ">\n");
+ double logpr;
+
+ if (collapsed) {
+ // the last token of the ngram continues an already open "chunk"
+ // the probability at chunk-level is not computed because it has been already computed when the actual"chunk" opens
+ VERBOSE(3," SKIPPED call to lmtable::clprob because of collapse; logpr: 0.0\n");
+ logpr = 0.0;
+ } else {
+ VERBOSE(3," QUERY MACRO LM on (after transformation and size reduction) " << transformed_ng << "\n");
+ // logpr = lmtable::clprob(transformed_ng, bow, bol, state, statesize, extendible);
+ logpr = lmtable::clprob(transformed_ng, bow, bol, ngramstate, state, statesize, extendible, lastbow);
+ }
+ VERBOSE(3," GET logpr: " << logpr << "\n");
+
+ return logpr;
+ }
+
+ bool lmmacro::transform(ngram &in, ngram &out)
+ {
+ VERBOSE(3,"lmmacro::transform(ngram &in, ngram &out), in = <" << in << ">\n");
+
+ //step 1: selection of the correct field
+ ngram field_ng(getDict());
+ if (selectedField >= 0)
+ field_selection(in, field_ng);
+ else
+ field_ng = in;
+
+ //step 2: collapsing
+ ngram collapsed_ng(getDict());
+ bool collapsed = false;
+ if (collapseFlag)
+ collapsed = collapse(field_ng, collapsed_ng);
+ else
+ collapsed_ng = field_ng;
+
+ //step 3: mapping using the loaded map
+ if (mapFlag)
+ mapping(collapsed_ng, out);
+ else
+ out.trans(collapsed_ng);
+
+ if (out.size>lmtable::maxlevel()) out.size=lmtable::maxlevel();
+
+ VERBOSE(3,"lmmacro::transform(ngram &in, ngram &out), out = <" << out << ">\n");
+ return collapsed;
+ }
+
+
+
+ void lmmacro::field_selection(ngram &in, ngram &out)
+ {
+ VERBOSE(3,"In lmmacro::field_selection(ngram &in, ngram &out) in = " << in << "\n");
+
+ int microsize = in.size;
+
+ for (int i=microsize; i>0; i--) {
+
+ char curr_token[BUFSIZ];
+ strcpy(curr_token, getDict()->decode(*in.wordp(i)));
+ char *field;
+ if (strcmp(curr_token,"<s>") &&
+ strcmp(curr_token,"</s>") &&
+ strcmp(curr_token,"_unk_")) {
+ field = strtok(curr_token, "#");
+ int j=0;
+ while (j<selectedField && field != NULL) {
+ field = strtok(0, "#");
+ j++;
+ }
+ } else {
+ field = curr_token;
+ }
+
+
+ if (field) {
+ out.pushw(field);
+ } else {
+
+ out.pushw((char*)"_unk_");
+
+ // cerr << *in << "\n";
+ // error((char*)"ERROR: Malformed input: selected field does not exist in token\n");
+
+ /**
+ We can be here in 2 cases:
+
+ a. effectively when the token is malformed, that is the selected
+ field does not exist
+
+ b. in case of verbatim translation, that is the source word is
+ not known to the phrase table and moses transfers it as it is
+ to the target side: in this case, no assumption can be made on its
+ format, which means that the selected field can not exist
+
+ The possibility of case (b) makes incorrect the error exit from
+ the code at this point: correct, on the contrary, push the _unk_ string
+ **/
+ }
+ }
+ VERBOSE(3,"In lmmacro::field_selection(ngram &in, ngram &out) out = " << out << "\n");
+ return;
+ }
+
+ bool lmmacro::collapse(ngram &in, ngram &out)
+ {
+ VERBOSE(3,"In lmmacro::collapse(ngram &in, ngram &out) in = " << in << "\n")
+
+ // fill the ngram out with the collapsed tokens
+ //return true if collapse happens for the most recent token
+ //return false if collapse does not happen for the most recent token
+ int microsize = in.size;
+ out.size = 0;
+
+ if (microsize == 1) {
+ out.pushc(*in.wordp(1));
+ return false;
+ }
+
+ int curr_code = *in.wordp(1);
+ int prev_code = *in.wordp(2);
+
+ if (microMacroMap[curr_code] == microMacroMap[prev_code]) {
+ if (collapsableMap[curr_code] && collapsatorMap[prev_code]) {
+ return true;
+ }
+ }
+
+ //collapse does not happen for the most recent token
+ // collapse all previous tokens, but the last
+
+ prev_code = *in.wordp(microsize);
+ out.pushc(prev_code);
+
+ for (int i=microsize-1; i>1; i--) {
+
+ curr_code = *in.wordp(i);
+
+ if (microMacroMap[curr_code] != microMacroMap[prev_code]) {
+ out.pushc(curr_code);
+ } else {
+ if (!(collapsableMap[curr_code] && collapsatorMap[prev_code])) {
+ out.pushc(prev_code);
+ }
+ }
+ prev_code = curr_code;
+ }
+ // and insert the most recent token
+ out.pushc(*in.wordp(1));
+ VERBOSE(3,"In lmmacro::collapse(ngram &in, ngram &out) out = " << out << "\n");
+ return false;
+ }
+
+ void lmmacro::mapping(ngram &in, ngram &out)
+ {
+ VERBOSE(3,"In lmmacro::mapping(ngram &in, ngram &out) in = " << in << "\n");
+
+ int microsize = in.size;
+
+ // map microtag sequence (in) into the corresponding sequence of macrotags (possibly shorter) (out)
+
+ for (int i=microsize; i>0; i--) {
+
+ int in_code = *in.wordp(i);
+ int out_code;
+ if (in_code < microMacroMapN)
+ out_code = microMacroMap[in_code];
+ else
+ out_code = lmtable::getDict()->oovcode();
+
+ out.pushc(out_code);
+ }
+ VERBOSE(3,"In lmmacro::mapping(ngram &in, ngram &out) out = " << out << "\n");
+ return;
+ }
+
+
+ //maxsuffptr returns the largest suffix of an n-gram that is contained
+ //in the LM table. This can be used as a compact representation of the
+ //(n-1)-gram state of a n-gram LM. if the input k-gram has k>=n then it
+ //is trimmed to its n-1 suffix.
+
+ const char *lmmacro::maxsuffptr(ngram micro_ng, unsigned int* size)
+ {
ngram macro_ng(lmtable::getDict());
if (micro_ng.dict == macro_ng.dict)
macro_ng.trans(micro_ng); // micro to macro mapping already done
- else
- map(&micro_ng, &macro_ng); // mapping required
-
- VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
- << "lmmacro::lprob: macro_ng = " << macro_ng << "\n")
-
- return lmtable::cmaxsuffidx(macro_ng,size);
+ else
+ map(&micro_ng, &macro_ng); // mapping required
+
+ VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
+ << "lmmacro::lprob: macro_ng = " << macro_ng << "\n");
+ return lmtable::maxsuffptr(macro_ng,size);
}
-
-ngram_state_t lmmacro::cmaxsuffidx(ngram micro_ng, unsigned int* size)
-{
+
+ /*
+ const char *lmmacro::cmaxsuffptr(ngram micro_ng, unsigned int* size)
+ {
+ //cerr << "lmmacro::CMAXsuffptr\n";
+ //cerr << "micro_ng: " << micro_ng
+ // << " -> micro_ng.size: " << micro_ng.size << "\n";
+
+ //the LM working on the selected field = 0
+ //contributes to the LM state
+ // if (selectedField>0) return NULL;
+
+ ngram macro_ng(lmtable::getDict());
+
+ if (micro_ng.dict == macro_ng.dict)
+ macro_ng.trans(micro_ng); // micro to macro mapping already done
+ else
+ map(&micro_ng, &macro_ng); // mapping required
+
+ VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
+ << "lmmacro::lprob: macro_ng = " << macro_ng << "\n")
+
+ return lmtable::cmaxsuffptr(macro_ng,size);
+
+ }
+ */
+ ngram_state_t lmmacro::maxsuffidx(ngram micro_ng, unsigned int* size)
+ {
//cerr << "lmmacro::CMAXsuffptr\n";
//cerr << "micro_ng: " << micro_ng
// << " -> micro_ng.size: " << micro_ng.size << "\n";
@@ -615,340 +593,366 @@ ngram_state_t lmmacro::cmaxsuffidx(ngram micro_ng, unsigned int* size)
return lmtable::cmaxsuffidx(macro_ng,size);
}
-
-void lmmacro::map(ngram *in, ngram *out)
-{
-
- VERBOSE(2,"In lmmacro::map, in = " << *in << endl
- << " (selectedField = " << selectedField << " )\n");
-
- if (selectedField==-2) // the whole token is compatible with the LM words
- One2OneMapping(in, out);
-
- else if (selectedField==-1) // the whole token has to be mapped before querying the LM
- Micro2MacroMapping(in, out);
-
- else if (selectedField<10) { // select the field "selectedField" from tokens (separator is assumed to be "#")
- ngram field_ng(((lmmacro *)this)->getDict());
- int microsize = in->size;
-
- for (int i=microsize; i>0; i--) {
-
- char curr_token[BUFSIZ];
- strcpy(curr_token, ((lmmacro *)this)->getDict()->decode(*(in->wordp(i))));
- char *field;
- if (strcmp(curr_token,"<s>") &&
- strcmp(curr_token,"</s>") &&
- strcmp(curr_token,"_unk_")) {
- field = strtok(curr_token, "#");
- int j=0;
- while (j<selectedField && field != NULL) {
- field = strtok(0, "#");
- j++;
- }
- } else {
- field = curr_token;
- }
-
- if (field)
- field_ng.pushw(field);
- else {
-
- field_ng.pushw((char*)"_unk_");
-
- // cerr << *in << "\n";
- // error((char*)"ERROR: Malformed input: selected field does not exist in token\n");
-
- /**
- We can be here in 2 cases:
-
- a. effectively when the token is malformed, that is the selected
- field does not exist
-
- b. in case of verbatim translation, that is the source word is
- not known to the phrase table and moses transfers it as it is
- to the target side: in this case, no assumption can be made on its
- format, which means that the selected field can not exist
-
- The possibility of case (b) makes incorrect the error exit from
- the code at this point: correct, on the contrary, push the _unk_ string
- **/
- }
- }
- if (microMacroMapN>0)
- Micro2MacroMapping(&field_ng, out);
- else
- out->trans(field_ng);
- } else {
-
+
+ /*
+ ngram_state_t lmmacro::cmaxsuffidx(ngram micro_ng, unsigned int* size)
+ {
+ //cerr << "lmmacro::CMAXsuffptr\n";
+ //cerr << "micro_ng: " << micro_ng
+ // << " -> micro_ng.size: " << micro_ng.size << "\n";
+
+ //the LM working on the selected field = 0
+ //contributes to the LM state
+ // if (selectedField>0) return NULL;
+
+ ngram macro_ng(lmtable::getDict());
+
+ if (micro_ng.dict == macro_ng.dict)
+ macro_ng.trans(micro_ng); // micro to macro mapping already done
+ else
+ map(&micro_ng, &macro_ng); // mapping required
+
+ VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n"
+ << "lmmacro::lprob: macro_ng = " << macro_ng << "\n")
+
+ return lmtable::cmaxsuffidx(macro_ng,size);
+
+ }
+ */
+
+ void lmmacro::map(ngram *in, ngram *out)
+ {
+
+ VERBOSE(2,"In lmmacro::map, in = " << *in << endl
+ << " (selectedField = " << selectedField << " )\n");
+
+ if (selectedField==-2) // the whole token is compatible with the LM words
+ One2OneMapping(in, out);
+
+ else if (selectedField==-1) // the whole token has to be mapped before querying the LM
+ Micro2MacroMapping(in, out);
+
+ else if (selectedField<10) { // select the field "selectedField" from tokens (separator is assumed to be "#")
+ ngram field_ng(((lmmacro *)this)->getDict());
+ int microsize = in->size;
+
+ for (int i=microsize; i>0; i--) {
+
+ char curr_token[BUFSIZ];
+ strcpy(curr_token, ((lmmacro *)this)->getDict()->decode(*(in->wordp(i))));
+ char *field;
+ if (strcmp(curr_token,"<s>") &&
+ strcmp(curr_token,"</s>") &&
+ strcmp(curr_token,"_unk_")) {
+ field = strtok(curr_token, "#");
+ int j=0;
+ while (j<selectedField && field != NULL) {
+ field = strtok(0, "#");
+ j++;
+ }
+ } else {
+ field = curr_token;
+ }
+
+ if (field)
+ field_ng.pushw(field);
+ else {
+
+ field_ng.pushw((char*)"_unk_");
+
+ // cerr << *in << "\n";
+ // error((char*)"ERROR: Malformed input: selected field does not exist in token\n");
+
+ /**
+ We can be here in 2 cases:
+
+ a. effectively when the token is malformed, that is the selected
+ field does not exist
+
+ b. in case of verbatim translation, that is the source word is
+ not known to the phrase table and moses transfers it as it is
+ to the target side: in this case, no assumption can be made on its
+ format, which means that the selected field can not exist
+
+ The possibility of case (b) makes incorrect the error exit from
+ the code at this point: correct, on the contrary, push the _unk_ string
+ **/
+ }
+ }
+ if (microMacroMapN>0)
+ Micro2MacroMapping(&field_ng, out);
+ else
+ out->trans(field_ng);
+ } else {
+
#ifdef DLEXICALLM
- // selectedField>=10: tens=idx of micro tag (possibly to be mapped to
- // macro tag), unidx=idx of lemma to be concatenated by "_" to the
- // (mapped) tag
-
- int tagIdx = selectedField/10;
- int lemmaIdx = selectedField%10;
-
- // micro (or mapped to macro) sequence construction:
- ngram tag_ng(getDict());
- char *lemmas[BUFSIZ];
-
- int microsize = in->size;
- for (int i=microsize; i>0; i--) {
- char curr_token[BUFSIZ];
- strcpy(curr_token, getDict()->decode(*(in->wordp(i))));
- char *tag = NULL, *lemma = NULL;
-
- if (strcmp(curr_token,"<s>") &&
- strcmp(curr_token,"</s>") &&
- strcmp(curr_token,"_unk_")) {
-
- if (tagIdx<lemmaIdx) {
- tag = strtok(curr_token, "#");
- for (int j=0; j<tagIdx; j++)
- tag = strtok(0, "#");
- for (int j=tagIdx; j<lemmaIdx; j++)
- lemma = strtok(0, "#");
- } else {
- lemma = strtok(curr_token, "#");
- for (int j=0; j<lemmaIdx; j++)
- lemma = strtok(0, "#");
- for (int j=lemmaIdx; j<tagIdx; j++)
- tag = strtok(0, "#");
- }
-
- VERBOSE(3,"(tag,lemma) = " << tag << " " << lemma << "\n");
- } else {
- tag = curr_token;
- lemma = curr_token;
- VERBOSE(3,"(tag=lemma) = " << tag << " " << lemma << "\n");
- }
- if (tag) {
- tag_ng.pushw(tag);
- lemmas[i] = strdup(lemma);
- } else {
- tag_ng.pushw((char*)"_unk_");
- lemmas[i] = strdup("_unk_");
- }
- }
-
- if (microMacroMapN>0)
- Micro2MacroMapping(&tag_ng, out, lemmas);
- else
- out->trans(tag_ng); // qui si dovrebbero sostituire i tag con tag_lemma, senza mappatura!
-
+ // selectedField>=10: tens=idx of micro tag (possibly to be mapped to
+ // macro tag), unidx=idx of lemma to be concatenated by "_" to the
+ // (mapped) tag
+
+ int tagIdx = selectedField/10;
+ int lemmaIdx = selectedField%10;
+
+ // micro (or mapped to macro) sequence construction:
+ ngram tag_ng(getDict());
+ char *lemmas[BUFSIZ];
+
+ int microsize = in->size;
+ for (int i=microsize; i>0; i--) {
+ char curr_token[BUFSIZ];
+ strcpy(curr_token, getDict()->decode(*(in->wordp(i))));
+ char *tag = NULL, *lemma = NULL;
+
+ if (strcmp(curr_token,"<s>") &&
+ strcmp(curr_token,"</s>") &&
+ strcmp(curr_token,"_unk_")) {
+
+ if (tagIdx<lemmaIdx) {
+ tag = strtok(curr_token, "#");
+ for (int j=0; j<tagIdx; j++)
+ tag = strtok(0, "#");
+ for (int j=tagIdx; j<lemmaIdx; j++)
+ lemma = strtok(0, "#");
+ } else {
+ lemma = strtok(curr_token, "#");
+ for (int j=0; j<lemmaIdx; j++)
+ lemma = strtok(0, "#");
+ for (int j=lemmaIdx; j<tagIdx; j++)
+ tag = strtok(0, "#");
+ }
+
+ VERBOSE(3,"(tag,lemma) = " << tag << " " << lemma << "\n");
+ } else {
+ tag = curr_token;
+ lemma = curr_token;
+ VERBOSE(3,"(tag=lemma) = " << tag << " " << lemma << "\n");
+ }
+ if (tag) {
+ tag_ng.pushw(tag);
+ lemmas[i] = strdup(lemma);
+ } else {
+ tag_ng.pushw((char*)"_unk_");
+ lemmas[i] = strdup("_unk_");
+ }
+ }
+
+ if (microMacroMapN>0)
+ Micro2MacroMapping(&tag_ng, out, lemmas);
+ else
+ out->trans(tag_ng); // qui si dovrebbero sostituire i tag con tag_lemma, senza mappatura!
+
#endif
-
- }
-
- VERBOSE(2,"In lmmacro::map, FINAL out = " << *out << endl);
-}
-
-void lmmacro::One2OneMapping(ngram *in, ngram *out)
-{
- int insize = in->size;
-
- // map each token of the sequence "in" into the same-length sequence "out" through the map
-
- for (int i=insize; i>0; i--) {
-
- int curr_code = *(in->wordp(i));
- const char *outtoken =
+
+ }
+
+ VERBOSE(2,"In lmmacro::map, FINAL out = " << *out << endl);
+ }
+
+ void lmmacro::One2OneMapping(ngram *in, ngram *out)
+ {
+ int insize = in->size;
+
+ // map each token of the sequence "in" into the same-length sequence "out" through the map
+
+ for (int i=insize; i>0; i--) {
+
+ int curr_code = *(in->wordp(i));
+ const char *outtoken =
lmtable::getDict()->decode((curr_code<microMacroMapN)?microMacroMap[curr_code]:lmtable::getDict()->oovcode());
- out->pushw(outtoken);
- }
- return;
-}
-
-
-void lmmacro::Micro2MacroMapping(ngram *in, ngram *out)
-{
-
- int microsize = in->size;
-
- VERBOSE(2,"In Micro2MacroMapping, in = " << *in << "\n");
-
- // map microtag sequence (in) into the corresponding sequence of macrotags (possibly shorter) (out)
-
- for (int i=microsize; i>0; i--) {
-
- int curr_code = *(in->wordp(i));
- const char *curr_macrotag = lmtable::getDict()->decode((curr_code<microMacroMapN)?microMacroMap[curr_code]:lmtable::getDict()->oovcode());
-
- if (i==microsize) {
- out->pushw(curr_macrotag);
-
- } else {
- int prev_code = *(in->wordp(i+1));
-
- const char *prev_microtag = getDict()->decode(prev_code);
- const char *curr_microtag = getDict()->decode(curr_code);
- const char *prev_macrotag = lmtable::getDict()->decode((prev_code<microMacroMapN)?microMacroMap[prev_code]:lmtable::getDict()->oovcode());
-
-
- int prev_len = strlen(prev_microtag)-1;
- int curr_len = strlen(curr_microtag)-1;
-
- if (strcmp(curr_macrotag,prev_macrotag) != 0 ||
- !(
- (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')' )) && ( curr_microtag[curr_len]==')' && curr_microtag[0]!='(')) ||
- (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')' )) && curr_microtag[curr_len]=='+' ) ||
- (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]=='+' ) ||
- (prev_microtag[prev_len]== '+' && ( curr_microtag[curr_len]==')' && curr_microtag[0]!='(' ))))
- out->pushw(curr_macrotag);
- }
- }
- return;
-}
-
+ out->pushw(outtoken);
+ }
+ return;
+ }
-// DISMITTED ON FEB 2011 BECAUSE TOO MUCH PROBLEMATIC FROM A THEORETICAL POINT OF VIEW
-
+ void lmmacro::Micro2MacroMapping(ngram *in, ngram *out)
+ {
+
+ int microsize = in->size;
+
+ VERBOSE(2,"In Micro2MacroMapping, in = " << *in << "\n");
+
+ // map microtag sequence (in) into the corresponding sequence of macrotags (possibly shorter) (out)
+
+ for (int i=microsize; i>0; i--) {
+
+ int curr_code = *(in->wordp(i));
+ const char *curr_macrotag = lmtable::getDict()->decode((curr_code<microMacroMapN)?microMacroMap[curr_code]:lmtable::getDict()->oovcode());
+
+ if (i==microsize) {
+ out->pushw(curr_macrotag);
+
+ } else {
+ int prev_code = *(in->wordp(i+1));
+
+ const char *prev_microtag = getDict()->decode(prev_code);
+ const char *curr_microtag = getDict()->decode(curr_code);
+ const char *prev_macrotag = lmtable::getDict()->decode((prev_code<microMacroMapN)?microMacroMap[prev_code]:lmtable::getDict()->oovcode());
+
+
+ int prev_len = strlen(prev_microtag)-1;
+ int curr_len = strlen(curr_microtag)-1;
+
+ if (strcmp(curr_macrotag,prev_macrotag) != 0 ||
+ !(
+ (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')' )) && ( curr_microtag[curr_len]==')' && curr_microtag[0]!='(')) ||
+ (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')' )) && curr_microtag[curr_len]=='+' ) ||
+ (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]=='+' ) ||
+ (prev_microtag[prev_len]== '+' && ( curr_microtag[curr_len]==')' && curr_microtag[0]!='(' ))))
+ out->pushw(curr_macrotag);
+ }
+ }
+ return;
+ }
+
+
+
+ // DISMITTED ON FEB 2011 BECAUSE TOO MUCH PROBLEMATIC FROM A THEORETICAL POINT OF VIEW
+
#ifdef DLEXICALLM
-void lmmacro::Micro2MacroMapping(ngram *in, ngram *out, char **lemmas)
-{
- VERBOSE(2,"In Micro2MacroMapping, in = " << *in << "\n")
-
- int microsize = in->size;
-
- IFVERBOSE(3) {
- VERBOSE(3,"In Micro2MacroMapping, lemmas:\n");
- if (lexicaltoken2classMap)
- for (int i=microsize; i>0; i--)
- VERBOSE(3,"lemmas[" << i << "]=" << lemmas[i] << " -> class -> " << lexicaltoken2classMap[lmtable::getDict()->encode(lemmas[i])] << endl);
- else
- for (int i=microsize; i>0; i--)
- VERBOSE(3,"lemmas[" << i << "]=" << lemmas[i] << endl);
- }
-
- // map microtag sequence (in) into the corresponding sequence of macrotags (possibly shorter) (out)
-
- char tag_lemma[BUFSIZ];
-
- for (int i=microsize; i>0; i--) {
-
- int curr_code = *(in->wordp(i));
-
- const char *curr_microtag = getDict()->decode(curr_code);
- const char *curr_lemma = lemmas[i];
- const char *curr_macrotag = lmtable::getDict()->decode((curr_code<microMacroMapN)?microMacroMap[curr_code]:lmtable::getDict()->oovcode());
- int curr_len = strlen(curr_microtag)-1;
-
- if (i==microsize) {
- if (( curr_microtag[curr_len]=='(' ) || ( curr_microtag[0]=='(' && curr_microtag[curr_len]!=')' ) || ( curr_microtag[curr_len]=='+' ))
- sprintf(tag_lemma, "%s", curr_macrotag); // non lessicalizzo il macrotag se sono ancora all''interno del chunk
- else if (lexicaltoken2classMap)
- sprintf(tag_lemma, "%s_class%d", curr_macrotag, lexicaltoken2classMap[lmtable::getDict()->encode(curr_lemma)]);
- else
- sprintf(tag_lemma, "%s_%s", curr_macrotag, lemmas[microsize]);
-
- VERBOSE(2,"In Micro2MacroMapping, starting tag_lemma = >" << tag_lemma << "<\n");
-
- out->pushw(tag_lemma);
- free(lemmas[microsize]);
-
-
- } else {
-
- int prev_code = *(in->wordp(i+1));
- const char *prev_microtag = getDict()->decode(prev_code);
- const char *prev_macrotag = lmtable::getDict()->decode((prev_code<microMacroMapN)?microMacroMap[prev_code]:lmtable::getDict()->oovcode());
-
-
- int prev_len = strlen(prev_microtag)-1;
-
- if (( curr_microtag[curr_len]=='(' ) || ( curr_microtag[0]=='(' && curr_microtag[curr_len]!=')' ) || ( curr_microtag[curr_len]=='+' ))
- sprintf(tag_lemma, "%s", curr_macrotag); // non lessicalizzo il macrotag se sono ancora all''interno del chunk
- else if (lexicaltoken2classMap)
- sprintf(tag_lemma, "%s_class%d", curr_macrotag, lexicaltoken2classMap[lmtable::getDict()->encode(curr_lemma)]);
- else
- sprintf(tag_lemma, "%s_%s", curr_macrotag, curr_lemma);
-
- VERBOSE(2,"In Micro2MacroMapping, tag_lemma = >" << tag_lemma << "<\n");
-
- if (strcmp(curr_macrotag,prev_macrotag) != 0 ||
- !(
- (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!=')' )) && curr_microtag[curr_len]==')' && curr_microtag[0]!='(') ||
- (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')')) && curr_microtag[curr_len]=='+' ) ||
- (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]=='+' ) ||
- (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]==')' && curr_microtag[0]!='(' ))) {
-
- VERBOSE(2,"In Micro2MacroMapping, before pushw, out = " << *out << endl);
- out->pushw(tag_lemma);
- VERBOSE(2,"In Micro2MacroMapping, after pushw, out = " << *out << endl);
- } else {
- VERBOSE(2,"In Micro2MacroMapping, before shift, out = " << *out << endl);
- out->shift();
- VERBOSE(2,"In Micro2MacroMapping, after shift, out = " << *out << endl);
- out->pushw(tag_lemma);
- VERBOSE(2,"In Micro2MacroMapping, after push, out = " << *out << endl);
- }
- free(lemmas[i]);
- }
- }
- return;
-}
-
-void lmmacro::loadLexicalClasses(const char *fn)
-{
- char line[MAX_LINE];
- const char* words[MAX_TOKEN_N_MAP];
- int tokenN;
-
- lexicaltoken2classMap = (int *)calloc(BUFSIZ, sizeof(int));
- lexicaltoken2classMapN = BUFSIZ;
-
- lmtable::getDict()->incflag(1);
-
- inputfilestream inp(fn);
- while (inp.getline(line,MAX_LINE,'\n')) {
- tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
- if (tokenN != 2)
- error((char*)"ERROR: wrong format of lexical classes file\n");
- else {
- int classIdx = atoi(words[1]);
- int wordCode = lmtable::getDict()->encode(words[0]);
-
- if (wordCode>=lexicaltoken2classMapN) {
- int r = (wordCode-lexicaltoken2classMapN)/BUFSIZ;
- lexicaltoken2classMapN += (r+1)*BUFSIZ;
- lexicaltoken2classMap = (int *)reallocf(lexicaltoken2classMap, sizeof(int)*lexicaltoken2classMapN);
- }
- lexicaltoken2classMap[wordCode] = classIdx;
- }
- }
-
- lmtable::getDict()->incflag(0);
-
- IFVERBOSE(3) {
- for (int x=0; x<lmtable::getDict()->size(); x++)
- VERBOSE(3,"class of <" << lmtable::getDict()->decode(x) << "> (code=" << x << ") = " << lexicaltoken2classMap[x] << endl);
- }
-
- return;
-}
-
-
-void lmmacro::cutLex(ngram *in, ngram *out)
-{
- *out=*in;
-
- const char *curr_macro = out->dict->decode(*(out->wordp(1)));
- out->shift();
- const char *p = strrchr(curr_macro, '_');
- int lexLen;
- if (p)
- lexLen=strlen(p);
- else
- lexLen=0;
- char curr_NoLexMacro[BUFSIZ];
- memset(&curr_NoLexMacro,0,BUFSIZ);
- strncpy(curr_NoLexMacro,curr_macro,strlen(curr_macro)-lexLen);
- out->pushw(curr_NoLexMacro);
- return;
-}
+ void lmmacro::Micro2MacroMapping(ngram *in, ngram *out, char **lemmas)
+ {
+ VERBOSE(2,"In Micro2MacroMapping, in = " << *in << "\n")
+
+ int microsize = in->size;
+
+ IFVERBOSE(3) {
+ VERBOSE(3,"In Micro2MacroMapping, lemmas:\n");
+ if (lexicaltoken2classMap)
+ for (int i=microsize; i>0; i--)
+ VERBOSE(3,"lemmas[" << i << "]=" << lemmas[i] << " -> class -> " << lexicaltoken2classMap[lmtable::getDict()->encode(lemmas[i])] << endl);
+ else
+ for (int i=microsize; i>0; i--)
+ VERBOSE(3,"lemmas[" << i << "]=" << lemmas[i] << endl);
+ }
+
+ // map microtag sequence (in) into the corresponding sequence of macrotags (possibly shorter) (out)
+
+ char tag_lemma[BUFSIZ];
+
+ for (int i=microsize; i>0; i--) {
+
+ int curr_code = *(in->wordp(i));
+
+ const char *curr_microtag = getDict()->decode(curr_code);
+ const char *curr_lemma = lemmas[i];
+ const char *curr_macrotag = lmtable::getDict()->decode((curr_code<microMacroMapN)?microMacroMap[curr_code]:lmtable::getDict()->oovcode());
+ int curr_len = strlen(curr_microtag)-1;
+
+ if (i==microsize) {
+ if (( curr_microtag[curr_len]=='(' ) || ( curr_microtag[0]=='(' && curr_microtag[curr_len]!=')' ) || ( curr_microtag[curr_len]=='+' ))
+ sprintf(tag_lemma, "%s", curr_macrotag); // non lessicalizzo il macrotag se sono ancora all''interno del chunk
+ else if (lexicaltoken2classMap)
+ sprintf(tag_lemma, "%s_class%d", curr_macrotag, lexicaltoken2classMap[lmtable::getDict()->encode(curr_lemma)]);
+ else
+ sprintf(tag_lemma, "%s_%s", curr_macrotag, lemmas[microsize]);
+
+ VERBOSE(2,"In Micro2MacroMapping, starting tag_lemma = >" << tag_lemma << "<\n");
+
+ out->pushw(tag_lemma);
+ free(lemmas[microsize]);
+
+
+ } else {
+
+ int prev_code = *(in->wordp(i+1));
+ const char *prev_microtag = getDict()->decode(prev_code);
+ const char *prev_macrotag = lmtable::getDict()->decode((prev_code<microMacroMapN)?microMacroMap[prev_code]:lmtable::getDict()->oovcode());
+
+
+ int prev_len = strlen(prev_microtag)-1;
+
+ if (( curr_microtag[curr_len]=='(' ) || ( curr_microtag[0]=='(' && curr_microtag[curr_len]!=')' ) || ( curr_microtag[curr_len]=='+' ))
+ sprintf(tag_lemma, "%s", curr_macrotag); // non lessicalizzo il macrotag se sono ancora all''interno del chunk
+ else if (lexicaltoken2classMap)
+ sprintf(tag_lemma, "%s_class%d", curr_macrotag, lexicaltoken2classMap[lmtable::getDict()->encode(curr_lemma)]);
+ else
+ sprintf(tag_lemma, "%s_%s", curr_macrotag, curr_lemma);
+
+ VERBOSE(2,"In Micro2MacroMapping, tag_lemma = >" << tag_lemma << "<\n");
+
+ if (strcmp(curr_macrotag,prev_macrotag) != 0 ||
+ !(
+ (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!=')' )) && curr_microtag[curr_len]==')' && curr_microtag[0]!='(') ||
+ (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')')) && curr_microtag[curr_len]=='+' ) ||
+ (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]=='+' ) ||
+ (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]==')' && curr_microtag[0]!='(' ))) {
+
+ VERBOSE(2,"In Micro2MacroMapping, before pushw, out = " << *out << endl);
+ out->pushw(tag_lemma);
+ VERBOSE(2,"In Micro2MacroMapping, after pushw, out = " << *out << endl);
+ } else {
+ VERBOSE(2,"In Micro2MacroMapping, before shift, out = " << *out << endl);
+ out->shift();
+ VERBOSE(2,"In Micro2MacroMapping, after shift, out = " << *out << endl);
+ out->pushw(tag_lemma);
+ VERBOSE(2,"In Micro2MacroMapping, after push, out = " << *out << endl);
+ }
+ free(lemmas[i]);
+ }
+ }
+ return;
+ }
+
+ void lmmacro::loadLexicalClasses(const char *fn)
+ {
+ char line[MAX_LINE];
+ const char* words[MAX_TOKEN_N_MAP];
+ int tokenN;
+
+ lexicaltoken2classMap = (int *)calloc(BUFSIZ, sizeof(int));
+ lexicaltoken2classMapN = BUFSIZ;
+
+ lmtable::getDict()->incflag(1);
+
+ inputfilestream inp(fn);
+ while (inp.getline(line,MAX_LINE,'\n')) {
+ tokenN = parseWords(line,words,MAX_TOKEN_N_MAP);
+ if (tokenN != 2)
+ error((char*)"ERROR: wrong format of lexical classes file\n");
+ else {
+ int classIdx = atoi(words[1]);
+ int wordCode = lmtable::getDict()->encode(words[0]);
+
+ if (wordCode>=lexicaltoken2classMapN) {
+ int r = (wordCode-lexicaltoken2classMapN)/BUFSIZ;
+ lexicaltoken2classMapN += (r+1)*BUFSIZ;
+ lexicaltoken2classMap = (int *)reallocf(lexicaltoken2classMap, sizeof(int)*lexicaltoken2classMapN);
+ }
+ lexicaltoken2classMap[wordCode] = classIdx;
+ }
+ }
+
+ lmtable::getDict()->incflag(0);
+
+ IFVERBOSE(3) {
+ for (int x=0; x<lmtable::getDict()->size(); x++)
+ VERBOSE(3,"class of <" << lmtable::getDict()->decode(x) << "> (code=" << x << ") = " << lexicaltoken2classMap[x] << endl);
+ }
+
+ return;
+ }
+
+
+ void lmmacro::cutLex(ngram *in, ngram *out)
+ {
+ *out=*in;
+
+ const char *curr_macro = out->dict->decode(*(out->wordp(1)));
+ out->shift();
+ const char *p = strrchr(curr_macro, '_');
+ int lexLen;
+ if (p)
+ lexLen=strlen(p);
+ else
+ lexLen=0;
+ char curr_NoLexMacro[BUFSIZ];
+ memset(&curr_NoLexMacro,0,BUFSIZ);
+ strncpy(curr_NoLexMacro,curr_macro,strlen(curr_macro)-lexLen);
+ out->pushw(curr_NoLexMacro);
+ return;
+ }
#endif
}//namespace irstlm
diff --git a/src/lmmacro.h b/src/lmmacro.h
index c67c6bf..9cdec56 100644
--- a/src/lmmacro.h
+++ b/src/lmmacro.h
@@ -1,24 +1,24 @@
// $Id: lmmacro.h 3461 2010-08-27 10:17:34Z bertoldi $
/******************************************************************************
-IrstLM: IRST Language Model Toolkit
-Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-
-******************************************************************************/
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
#ifndef MF_LMMACRO_H
@@ -34,101 +34,101 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "dictionary.h"
#include "n_gram.h"
#include "lmtable.h"
-
+
#define MAX_TOKEN_N_MAP 5
namespace irstlm {
-class lmmacro: public lmtable
-{
-
- dictionary *dict;
- int maxlev; //max level of table
- int selectedField;
-
- bool collapseFlag; //flag for the presence of collapse
- bool mapFlag; //flag for the presence of map
-
- int microMacroMapN;
- int *microMacroMap;
- bool *collapsableMap;
- bool *collapsatorMap;
-
+ class lmmacro: public lmtable
+ {
+
+ dictionary *dict;
+ int maxlev; //max level of table
+ int selectedField;
+
+ bool collapseFlag; //flag for the presence of collapse
+ bool mapFlag; //flag for the presence of map
+
+ int microMacroMapN;
+ int *microMacroMap;
+ bool *collapsableMap;
+ bool *collapsatorMap;
+
#ifdef DLEXICALLM
- int selectedFieldForLexicon;
- int *lexicaltoken2classMap;
- int lexicaltoken2classMapN;
+ int selectedFieldForLexicon;
+ int *lexicaltoken2classMap;
+ int lexicaltoken2classMapN;
#endif
-
-
- void loadmap(const std::string mapfilename);
- void unloadmap();
-
- bool transform(ngram &in, ngram &out);
- void field_selection(ngram &in, ngram &out);
- bool collapse(ngram &in, ngram &out);
- void mapping(ngram &in, ngram &out);
-
-public:
-
- lmmacro(float nlf=0.0, float dlfi=0.0);
- ~lmmacro();
-
- void load(const std::string &filename,int mmap=0);
-
- double lprob(ngram ng);
- double clprob(ngram ng,double* bow=NULL,int* bol=NULL,ngram_state_t* maxsuffidx=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
- double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,ngram_state_t* maxsuffidx=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
-
- const char *maxsuffptr(ngram ong, unsigned int* size=NULL);
- const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);
- ngram_state_t maxsuffidx(ngram ong, unsigned int* size=NULL);
- ngram_state_t cmaxsuffidx(ngram ong, unsigned int* size=NULL);
-
- void map(ngram *in, ngram *out);
- void One2OneMapping(ngram *in, ngram *out);
- void Micro2MacroMapping(ngram *in, ngram *out);
+
+
+ void loadmap(const std::string mapfilename);
+ void unloadmap();
+
+ bool transform(ngram &in, ngram &out);
+ void field_selection(ngram &in, ngram &out);
+ bool collapse(ngram &in, ngram &out);
+ void mapping(ngram &in, ngram &out);
+
+ public:
+
+ lmmacro(float nlf=0.0, float dlfi=0.0);
+ ~lmmacro();
+
+ virtual void load(const std::string &filename,int mmap=0);
+
+ virtual double lprob(ngram ng);
+ virtual double clprob(ngram ng,double* bow=NULL,int* bol=NULL,ngram_state_t* maxsuffidx=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL);
+ // double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,ngram_state_t* maxsuffidx=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
+
+ virtual const char *maxsuffptr(ngram ong, unsigned int* size=NULL);
+ // const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);
+ virtual ngram_state_t maxsuffidx(ngram ong, unsigned int* size=NULL);
+ // ngram_state_t cmaxsuffidx(ngram ong, unsigned int* size=NULL);
+
+ void map(ngram *in, ngram *out);
+ void One2OneMapping(ngram *in, ngram *out);
+ void Micro2MacroMapping(ngram *in, ngram *out);
#ifdef DLEXICALLM
- void Micro2MacroMapping(ngram *in, ngram *out, char **lemma);
- void loadLexicalClasses(const char *fn);
- void cutLex(ngram *in, ngram *out);
+ void Micro2MacroMapping(ngram *in, ngram *out, char **lemma);
+ void loadLexicalClasses(const char *fn);
+ void cutLex(ngram *in, ngram *out);
#endif
-
- inline bool is_OOV(int code) {
- ngram word_ng(getDict());
- ngram field_ng(getDict());
- word_ng.pushc(code);
- if (selectedField >= 0)
- field_selection(word_ng, field_ng);
- else
- field_ng = word_ng;
- int field_code=*field_ng.wordp(1);
- VERBOSE(2,"inline virtual bool lmmacro::is_OOV(int code) word_ng:" << word_ng << " field_ng:" << field_ng << std::endl);
- //the selected field(s) of a token is considered OOV
- //either if unknown by the microMacroMap
- //or if its mapped macroW is OOV
- if (field_code >= microMacroMapN) return true;
- VERBOSE(2,"inline virtual bool lmmacro::is_OOV(int code)*field_code:" << field_code << " microMacroMap[field_code]:" << microMacroMap[field_code] << " lmtable::dict->oovcode():" << lmtable::dict->oovcode() << std::endl);
- return (microMacroMap[field_code] == lmtable::dict->oovcode());
+
+ inline bool is_OOV(int code) {
+ ngram word_ng(getDict());
+ ngram field_ng(getDict());
+ word_ng.pushc(code);
+ if (selectedField >= 0)
+ field_selection(word_ng, field_ng);
+ else
+ field_ng = word_ng;
+ int field_code=*field_ng.wordp(1);
+ VERBOSE(2,"inline virtual bool lmmacro::is_OOV(int code) word_ng:" << word_ng << " field_ng:" << field_ng << std::endl);
+ //the selected field(s) of a token is considered OOV
+ //either if unknown by the microMacroMap
+ //or if its mapped macroW is OOV
+ if (field_code >= microMacroMapN) return true;
+ VERBOSE(2,"inline virtual bool lmmacro::is_OOV(int code)*field_code:" << field_code << " microMacroMap[field_code]:" << microMacroMap[field_code] << " lmtable::dict->oovcode():" << lmtable::dict->oovcode() << std::endl);
+ return (microMacroMap[field_code] == lmtable::dict->oovcode());
+ };
+ inline dictionary* getDict() const {
+ return dict;
+ }
+ inline int maxlevel() const {
+ return maxlev;
+ };
+
+ inline virtual void dictionary_incflag(const bool flag) {
+ dict->incflag(flag);
+ };
+
+ inline virtual bool filter(const string sfilter, lmContainer* sublmt, const string skeepunigrams) {
+ UNUSED(sfilter);
+ UNUSED(sublmt);
+ UNUSED(skeepunigrams);
+ return false;
+ }
};
- inline dictionary* getDict() const {
- return dict;
- }
- inline int maxlevel() const {
- return maxlev;
- };
-
- inline virtual void dictionary_incflag(const bool flag) {
- dict->incflag(flag);
- };
-
- inline virtual bool filter(const string sfilter, lmContainer* sublmt, const string skeepunigrams) {
- UNUSED(sfilter);
- UNUSED(sublmt);
- UNUSED(skeepunigrams);
- return false;
- }
-};
}//namespace irstlm
#endif
diff --git a/src/lmtable.cpp b/src/lmtable.cpp
index 9376ce5..3865230 100644
--- a/src/lmtable.cpp
+++ b/src/lmtable.cpp
@@ -212,44 +212,44 @@ namespace irstlm {
delete_lmtcaches();
#endif
}
-
- void lmtable::stat_prob_and_state_cache()
- {
+
+ void lmtable::stat_prob_and_state_cache()
+ {
#ifdef PS_CACHE_ENABLE
- for (int i=1; i<=max_cache_lev; i++)
- {
+ for (int i=1; i<=max_cache_lev; i++)
+ {
std::cout << "void lmtable::stat_prob_and_state_cache() level:" << i << std::endl;
- if (prob_and_state_cache[i])
- {
- prob_and_state_cache[i]->stat();
- }
- }
+ if (prob_and_state_cache[i])
+ {
+ prob_and_state_cache[i]->stat();
+ }
+ }
#endif
- }
- void lmtable::stat_lmtcaches()
- {
+ }
+ void lmtable::stat_lmtcaches()
+ {
#ifdef PS_CACHE_ENABLE
- for (int i=2; i<=max_cache_lev; i++)
- {
+ for (int i=2; i<=max_cache_lev; i++)
+ {
std::cout << "void lmtable::stat_lmtcaches() level:" << i << std::endl;
- if (lmtcache[i])
- {
- lmtcache[i]->stat();
- }
- }
+ if (lmtcache[i])
+ {
+ lmtcache[i]->stat();
+ }
+ }
#endif
- }
-
- void lmtable::stat_caches()
- {
+ }
+
+ void lmtable::stat_caches()
+ {
#ifdef PS_CACHE_ENABLE
- stat_prob_and_state_cache();
+ stat_prob_and_state_cache();
#endif
#ifdef LMT_CACHE_ENABLE
- stat_lmtcaches();
+ stat_lmtcaches();
#endif
- }
-
+ }
+
void lmtable::used_prob_and_state_cache() const
{
@@ -1767,7 +1767,7 @@ namespace irstlm {
concatenate_single_level(i, fromfilename, tofilename);
}
}
-
+
//concatenate corresponding single level files of two different tables
void lmtable::concatenate_single_level(int level, const char* fromfilename, const char* tofilename){
//single level files should have a name derived from "fromfilename" and "tofilename"
@@ -2085,8 +2085,8 @@ namespace irstlm {
//insert both found and not found items!!!
-// if (lmtcache[l] && hit==true) {
-
+ // if (lmtcache[l] && hit==true) {
+
//insert only not found items!!!
if (lmtcache[l] && hit==false) {
const char* found2=found;
@@ -2348,6 +2348,7 @@ namespace irstlm {
#endif
}
+ /*
//this function simulates the cmaxsuffptr(ngram, ...) but it takes as input an array of codes instead of the ngram
const char *lmtable::cmaxsuffptr(int* codes, int sz, unsigned int* size)
{
@@ -2397,17 +2398,10 @@ namespace irstlm {
ngram ong(dict);
ong.pushc(codes,sz);
MY_ASSERT (ong.size == sz);
- /*
- unsigned int isize; //internal state size variable
- char* found=(char *) maxsuffptr(ong,&isize);
- char* found2=(char *) maxsuffptr(ong,size);
- if (size!=NULL) *size=isize;
- return found;
- */
return maxsuffptr(ong,size);
#endif
}
-
+ */
//non recursive version
ngram_state_t lmtable::maxsuffidx(ngram ong, unsigned int* size)
@@ -2456,7 +2450,7 @@ namespace irstlm {
if (ng.succ==0) *size=isize-1;
else *size=isize;
}
-
+
int ndsz=nodesize(tbltype[isize]);
ngram_state_t msidx = 0;
if (ng.link){
@@ -2471,7 +2465,7 @@ namespace irstlm {
return 0;
}
}
-
+
ngram_state_t lmtable::cmaxsuffidx(ngram ong, unsigned int* size)
{
VERBOSE(3,"ngram_state_t lmtable::cmaxsuffidx(ngram ong, unsigned int* size) ong:|" << ong << "|\n");
@@ -2493,7 +2487,7 @@ namespace irstlm {
// if (prob_and_state_cache && ong.size==maxlev && prob_and_state_cache->get(ong.wordp(maxlev),pst)) {
if (prob_and_state_cache[ong.size] && prob_and_state_cache[ong.size]->get(ong.wordp(ong.size),pst)) {
*size=pst.statesize;
-// return pst.state;
+ // return pst.state;
return pst.ngramstate;
}
ong.size = orisize;
@@ -2520,6 +2514,7 @@ namespace irstlm {
#endif
}
+ /*
//this function simulates the cmaxsuffptr(ngram, ...) but it takes as input an array of codes instead of the ngram
ngram_state_t lmtable::cmaxsuffidx(int* codes, int sz, unsigned int* size)
{
@@ -2571,6 +2566,7 @@ namespace irstlm {
return maxsuffidx(ong,size);
#endif
}
+ */
//returns log10prob of n-gram
//bow: backoff weight
@@ -2597,8 +2593,8 @@ namespace irstlm {
if (bow) *bow=0; //initialize back-off weight
if (bol) *bol=0; //initialize bock-off level
-
-
+ if (lastbow) *lastbow=0; //initialize back-off weight of the deepest found ngram
+
double rbow=0,lpr=0; //output back-off weight and logprob
float ibow,iprob; //internal back-off weight and logprob
@@ -2720,9 +2716,9 @@ namespace irstlm {
//return log10 probsL use cache memory
- double lmtable::clprob(ngram ong,double* bow, int* bol, ngram_state_t* ngramstate, char** state, unsigned int* statesize, bool* extendible)
+ double lmtable::clprob(ngram ong,double* bow, int* bol, ngram_state_t* ngramstate, char** state, unsigned int* statesize, bool* extendible, double* lastbow)
{
- VERBOSE(3,"double lmtable::clprob(ngram ong,double* bow, int* bol, ngram_state_t* ngramstate, char** state, unsigned int* statesize, bool* extendible) ong:|" << ong << "|\n");
+ VERBOSE(3,"double lmtable::clprob(ngram ong,double* bow, int* bol, ngram_state_t* ngramstate, char** state, unsigned int* statesize, bool* extendible, double* lastbow) ong:|" << ong << "|\n");
#ifdef TRACE_CACHELM
// if (probcache && ong.size==maxlev && sentence_id>0) {
@@ -2736,6 +2732,7 @@ namespace irstlm {
if (state!=NULL) *state=NULL;
if (ngramstate!=NULL) *ngramstate=NULL;
if (extendible!=NULL) *extendible=false;
+ if (lastbow!=NULL) *lastbow=0.0; //empty ngram: no back-off weight (lastbow is a double*, not a flag)
return 0.0;
}
@@ -2755,6 +2752,7 @@ namespace irstlm {
if (ngramstate) *ngramstate = pst_get.ngramstate;
if (statesize) *statesize = pst_get.statesize;
if (extendible) *extendible = pst_get.extendible;
+ if (lastbow) *lastbow = pst_get.lastbow;
return logpr;
}
@@ -2762,7 +2760,7 @@ namespace irstlm {
//cache miss
prob_and_state_t pst_add;
- logpr = pst_add.logpr = lmtable::lprob(ong, &(pst_add.bow), &(pst_add.bol), &(pst_add.ngramstate), &(pst_add.state), &(pst_add.statesize), &(pst_add.extendible));
+ logpr = pst_add.logpr = lmtable::lprob(ong, &(pst_add.bow), &(pst_add.bol), &(pst_add.ngramstate), &(pst_add.state), &(pst_add.statesize), &(pst_add.extendible), &(pst_add.lastbow));
if (bow) *bow = pst_add.bow;
@@ -2771,6 +2769,7 @@ namespace irstlm {
if (ngramstate) *ngramstate = pst_add.ngramstate;
if (statesize) *statesize = pst_add.statesize;
if (extendible) *extendible = pst_add.extendible;
+ if (lastbow) *lastbow = pst_add.lastbow; //guard on lastbow itself: it may be NULL independently of extendible
// if (prob_and_state_cache && ong.size==maxlev) {
@@ -2781,91 +2780,94 @@ namespace irstlm {
}
return logpr;
#else
- return lmtable::lprob(ong, bow, bol, ngramstate, state, statesize, extendible);
-#endif
- };
-
-
- //return log10 probsL use cache memory
- //this function simulates the clprob(ngram, ...) but it takes as input an array of codes instead of the ngram
- double lmtable::clprob(int* codes, int sz, double* bow, int* bol, ngram_state_t* ngramstate, char** state,unsigned int* statesize,bool* extendible)
- {
- VERBOSE(3," double lmtable::clprob(int* codes, int sz, double* bow, int* bol, ngram_state_t* ngramstate, char** state, unsigned int* statesize, bool* extendible)\n");
-#ifdef TRACE_CACHELM
- // if (probcache && sz==maxlev && sentence_id>0) {
- if (probcache && sentence_id>0) {
- *cacheout << sentence_id << "\n";
- //print the codes of the vector ng
- }
-#endif
-
- if (sz==0) {
- if (statesize!=NULL) *statesize=0;
- if (state!=NULL) *state=NULL;
- if (ngramstate!=NULL) *ngramstate=NULL;
- if (extendible!=NULL) *extendible=false;
- return 0.0;
- }
-
- if (sz>maxlev) sz=maxlev; //adjust n-gram level to table size
-
-#ifdef PS_CACHE_ENABLE
- double logpr;
-
- //cache hit
- prob_and_state_t pst_get;
-
- // if (prob_and_state_cache && sz==maxlev && prob_and_state_cache->get(codes,pst_get)) {
- if (prob_and_state_cache[sz] && prob_and_state_cache[sz]->get(codes,pst_get)) {
-
- logpr=pst_get.logpr;
- if (bow) *bow = pst_get.bow;
- if (bol) *bol = pst_get.bol;
- if (state) *state = pst_get.state;
- if (ngramstate) *ngramstate = pst_get.ngramstate;
- if (statesize) *statesize = pst_get.statesize;
- if (extendible) *extendible = pst_get.extendible;
-
- return logpr;
- }
-
-
- //create the actual ngram
- ngram ong(dict);
- ong.pushc(codes,sz);
- MY_ASSERT (ong.size == sz);
-
- //cache miss
- prob_and_state_t pst_add;
-// logpr = pst_add.logpr = lmtable::lprob(ong, &(pst_add.bow), &(pst_add.bol), &(pst_add.state), &(pst_add.statesize), &(pst_add.extendible));
- logpr = pst_add.logpr = lmtable::lprob(ong, &(pst_add.bow), &(pst_add.bol), &(pst_add.ngramstate), &(pst_add.state), &(pst_add.statesize), &(pst_add.extendible));
-
-
- if (bow) *bow = pst_add.bow;
- if (bol) *bol = pst_add.bol;
- if (state) *state = pst_add.state;
- if (ngramstate) *ngramstate = pst_add.ngramstate;
- if (statesize) *statesize = pst_add.statesize;
- if (extendible) *extendible = pst_add.extendible;
-
-
- // if (prob_and_state_cache && ong.size==maxlev) {
- // prob_and_state_cache->add(ong.wordp(maxlev),pst_add);
- // }
- if (prob_and_state_cache[sz]) {
- prob_and_state_cache[sz]->add(ong.wordp(ong.size),pst_add);
- }
- return logpr;
-#else
-
- //create the actual ngram
- ngram ong(dict);
- ong.pushc(codes,sz);
- MY_ASSERT (ong.size == sz);
- return lmtable::lprob(ong, bow, bol, ngramstate, state, statesize, extendible);
+ return lmtable::lprob(ong, bow, bol, ngramstate, state, statesize, extendible, lastbow);
#endif
};
+ /*
+ //return log10 probsL use cache memory
+ //this function simulates the clprob(ngram, ...) but it takes as input an array of codes instead of the ngram
+ double lmtable::clprob(int* codes, int sz, double* bow, int* bol, ngram_state_t* ngramstate, char** state,unsigned int* statesize,bool* extendible, double* lastbow)
+ {
+ VERBOSE(3," double lmtable::clprob(int* codes, int sz, double* bow, int* bol, ngram_state_t* ngramstate, char** state, unsigned int* statesize, bool* extendible, double* lastbow)\n");
+ #ifdef TRACE_CACHELM
+ // if (probcache && sz==maxlev && sentence_id>0) {
+ if (probcache && sentence_id>0) {
+ *cacheout << sentence_id << "\n";
+ //print the codes of the vector ng
+ }
+ #endif
+
+ if (sz==0) {
+ if (statesize!=NULL) *statesize=0;
+ if (state!=NULL) *state=NULL;
+ if (ngramstate!=NULL) *ngramstate=NULL;
+ if (extendible!=NULL) *extendible=false;
+ if (lastbow!=NULL) *lastbow=false;
+ return 0.0;
+ }
+
+ if (sz>maxlev) sz=maxlev; //adjust n-gram level to table size
+
+ #ifdef PS_CACHE_ENABLE
+ double logpr;
+
+ //cache hit
+ prob_and_state_t pst_get;
+
+ // if (prob_and_state_cache && sz==maxlev && prob_and_state_cache->get(codes,pst_get)) {
+ if (prob_and_state_cache[sz] && prob_and_state_cache[sz]->get(codes,pst_get)) {
+
+ logpr=pst_get.logpr;
+ if (bow) *bow = pst_get.bow;
+ if (bol) *bol = pst_get.bol;
+ if (state) *state = pst_get.state;
+ if (ngramstate) *ngramstate = pst_get.ngramstate;
+ if (statesize) *statesize = pst_get.statesize;
+ if (extendible) *extendible = pst_get.extendible;
+ if (lastbow) *lastbow = pst_get.lastbow;
+
+ return logpr;
+ }
+
+
+ //create the actual ngram
+ ngram ong(dict);
+ ong.pushc(codes,sz);
+ MY_ASSERT (ong.size == sz);
+
+ //cache miss
+ prob_and_state_t pst_add;
+ // logpr = pst_add.logpr = lmtable::lprob(ong, &(pst_add.bow), &(pst_add.bol), &(pst_add.state), &(pst_add.statesize), &(pst_add.extendible), &(pst_add.lastbow));
+ logpr = pst_add.logpr = lmtable::lprob(ong, &(pst_add.bow), &(pst_add.bol), &(pst_add.ngramstate), &(pst_add.state), &(pst_add.statesize), &(pst_add.extendible), &(pst_add.lastbow));
+
+
+ if (bow) *bow = pst_add.bow;
+ if (bol) *bol = pst_add.bol;
+ if (state) *state = pst_add.state;
+ if (ngramstate) *ngramstate = pst_add.ngramstate;
+ if (statesize) *statesize = pst_add.statesize;
+ if (extendible) *extendible = pst_add.extendible;
+ if (lastbow) *lastbow = pst_add.lastbow;
+
+
+ // if (prob_and_state_cache && ong.size==maxlev) {
+ // prob_and_state_cache->add(ong.wordp(maxlev),pst_add);
+ // }
+ if (prob_and_state_cache[sz]) {
+ prob_and_state_cache[sz]->add(ong.wordp(ong.size),pst_add);
+ }
+ return logpr;
+ #else
+
+ //create the actual ngram
+ ngram ong(dict);
+ ong.pushc(codes,sz);
+ MY_ASSERT (ong.size == sz);
+ return lmtable::lprob(ong, bow, bol, ngramstate, state, statesize, extendible,lastbow);
+ #endif
+ };
+ */
int lmtable::succrange(node ndp,int level,table_entry_pos_t* isucc,table_entry_pos_t* esucc)
{
@@ -2912,7 +2914,7 @@ namespace irstlm {
}
if (level >1 ) lmtable::getDict()->stat();
-
+
stat_caches();
}
diff --git a/src/lmtable.h b/src/lmtable.h
index 77ed54d..6bb707f 100644
--- a/src/lmtable.h
+++ b/src/lmtable.h
@@ -76,594 +76,595 @@ typedef unsigned char qfloat_t; //type for quantized probabilities
#define BOUND_EMPTY2 (numeric_limits<table_entry_pos_t>::max() - 1)
namespace irstlm {
-class lmtable: public lmContainer
-{
- static const bool debug=true;
-
- void loadtxt(std::istream& inp,const char* header,const char* filename,int mmap);
- void loadtxt_ram(std::istream& inp,const char* header);
- void loadtxt_mmap(std::istream& inp,const char* header,const char* outfilename);
- void loadtxt_level(std::istream& inp,int l);
-
- void loadbin(std::istream& inp,const char* header,const char* filename,int mmap);
- void loadbin_header(std::istream& inp, const char* header);
- void loadbin_dict(std::istream& inp);
- void loadbin_codebook(std::istream& inp,int l);
- void loadbin_level(std::istream& inp,int l);
-
-protected:
- char* table[LMTMAXLEV+1]; //storage of all levels
- LMT_TYPE tbltype[LMTMAXLEV+1]; //table type for each levels
- table_entry_pos_t cursize[LMTMAXLEV+1]; //current size of levels
-
- //current offset for in-memory tables (different for each level
- //needed to manage partial tables
- // mempos = diskpos - offset[level]
- table_entry_pos_t tb_offset[LMTMAXLEV+1];
-
- table_entry_pos_t maxsize[LMTMAXLEV+1]; //max size of levels
- table_entry_pos_t* startpos[LMTMAXLEV+1]; //support vector to store start positions
- char info[100]; //information put in the header
-
- //statistics
- int totget[LMTMAXLEV+1];
- int totbsearch[LMTMAXLEV+1];
-
- //probability quantization
- bool isQtable;
-
- //Incomplete LM table from distributed training
- bool isItable;
-
- //Table with reverted n-grams for fast access
- bool isInverted;
-
- //Table might contain pruned n-grams
- bool isPruned;
-
- int NumCenters[LMTMAXLEV+1];
- float* Pcenters[LMTMAXLEV+1];
- float* Bcenters[LMTMAXLEV+1];
-
- double logOOVpenalty; //penalty for OOV words (default 0)
- int dictionary_upperbound; //set by user
- int backoff_state;
-
- //improve access speed
- int max_cache_lev;
-
-// NGRAMCACHE_t* prob_and_state_cache;
- NGRAMCACHE_t* prob_and_state_cache[LMTMAXLEV+1];
- NGRAMCACHE_t* lmtcache[LMTMAXLEV+1];
- float ngramcache_load_factor;
- float dictionary_load_factor;
-
- //memory map on disk
- int memmap; //level from which n-grams are accessed via mmap
- int diskid;
- off_t tableOffs[LMTMAXLEV+1];
- off_t tableGaps[LMTMAXLEV+1];
-
- // is this LM queried for knowing the matching order or (standard
- // case) for score?
- bool orderQuery;
-
- //flag to enable/disable deletion of dict in the destructor
- bool delete_dict;
-
-public:
-
+ class lmtable: public lmContainer
+ {
+ static const bool debug=true;
+
+ void loadtxt(std::istream& inp,const char* header,const char* filename,int mmap);
+ void loadtxt_ram(std::istream& inp,const char* header);
+ void loadtxt_mmap(std::istream& inp,const char* header,const char* outfilename);
+ void loadtxt_level(std::istream& inp,int l);
+
+ void loadbin(std::istream& inp,const char* header,const char* filename,int mmap);
+ void loadbin_header(std::istream& inp, const char* header);
+ void loadbin_dict(std::istream& inp);
+ void loadbin_codebook(std::istream& inp,int l);
+ void loadbin_level(std::istream& inp,int l);
+
+ protected:
+ char* table[LMTMAXLEV+1]; //storage of all levels
+ LMT_TYPE tbltype[LMTMAXLEV+1]; //table type for each levels
+ table_entry_pos_t cursize[LMTMAXLEV+1]; //current size of levels
+
+ //current offset for in-memory tables (different for each level
+ //needed to manage partial tables
+ // mempos = diskpos - offset[level]
+ table_entry_pos_t tb_offset[LMTMAXLEV+1];
+
+ table_entry_pos_t maxsize[LMTMAXLEV+1]; //max size of levels
+ table_entry_pos_t* startpos[LMTMAXLEV+1]; //support vector to store start positions
+ char info[100]; //information put in the header
+
+ //statistics
+ int totget[LMTMAXLEV+1];
+ int totbsearch[LMTMAXLEV+1];
+
+ //probability quantization
+ bool isQtable;
+
+ //Incomplete LM table from distributed training
+ bool isItable;
+
+ //Table with reverted n-grams for fast access
+ bool isInverted;
+
+ //Table might contain pruned n-grams
+ bool isPruned;
+
+ int NumCenters[LMTMAXLEV+1];
+ float* Pcenters[LMTMAXLEV+1];
+ float* Bcenters[LMTMAXLEV+1];
+
+ double logOOVpenalty; //penalty for OOV words (default 0)
+ int dictionary_upperbound; //set by user
+ int backoff_state;
+
+ //improve access speed
+ int max_cache_lev;
+
+ // NGRAMCACHE_t* prob_and_state_cache;
+ NGRAMCACHE_t* prob_and_state_cache[LMTMAXLEV+1];
+ NGRAMCACHE_t* lmtcache[LMTMAXLEV+1];
+ float ngramcache_load_factor;
+ float dictionary_load_factor;
+
+ //memory map on disk
+ int memmap; //level from which n-grams are accessed via mmap
+ int diskid;
+ off_t tableOffs[LMTMAXLEV+1];
+ off_t tableGaps[LMTMAXLEV+1];
+
+ // is this LM queried for knowing the matching order or (standard
+ // case) for score?
+ bool orderQuery;
+
+ //flag to enable/disable deletion of dict in the destructor
+ bool delete_dict;
+
+ public:
+
#ifdef TRACE_CACHELM
- std::fstream* cacheout;
- int sentence_id;
+ std::fstream* cacheout;
+ int sentence_id;
#endif
-
- dictionary *dict; // dictionary (words - macro tags)
-
- lmtable(float nlf=0.0, float dlfi=0.0);
-
- virtual ~lmtable();
-
- table_entry_pos_t wdprune(float *thr, int aflag=0);
- table_entry_pos_t wdprune(float *thr, int aflag, ngram ng, int ilev, int elev, table_entry_pos_t ipos, table_entry_pos_t epos, double lk=0, double bo=0, double *ts=0, double *tbs=0);
- double lprobx(ngram ong, double *lkp=0, double *bop=0, int *bol=0);
-
- table_entry_pos_t ngcnt(table_entry_pos_t *cnt);
- table_entry_pos_t ngcnt(table_entry_pos_t *cnt, ngram ng, int l, table_entry_pos_t ipos, table_entry_pos_t epos);
- int pscale(int lev, table_entry_pos_t ipos, table_entry_pos_t epos, double s);
-
- void init_prob_and_state_cache();
- void init_probcache() {
- init_prob_and_state_cache();
- }; //kept for back compatibility
- void init_statecache() {}; //kept for back compatibility
- void init_lmtcaches();
-// void init_lmtcaches(int uptolev);
- void init_caches(int uptolev);
-
- void used_prob_and_state_cache() const;
- void used_lmtcaches() const;
- void used_caches() const;
-
-
- void delete_prob_and_state_cache();
- void delete_probcache() {
- delete_prob_and_state_cache();
- }; //kept for back compatibility
- void delete_statecache() {}; //kept for back compatibility
- void delete_lmtcaches();
- void delete_caches();
-
- void stat_prob_and_state_cache();
- void stat_lmtcaches();
- void stat_caches();
-
- void check_prob_and_state_cache_levels() const;
- void check_probcache_levels() const {
- check_prob_and_state_cache_levels();
- }; //kept for back compatibility
- void check_statecache_levels() const{}; //kept for back compatibility
- void check_lmtcaches_levels() const;
- void check_caches_levels() const;
-
- void reset_prob_and_state_cache();
- void reset_probcache() {
- reset_prob_and_state_cache();
- }; //kept for back compatibility
- void reset_statecache() {}; //kept for back compatibility
- void reset_lmtcaches();
- void reset_caches();
-
-
- bool are_prob_and_state_cache_active() const;
- bool is_probcache_active() const {
- return are_prob_and_state_cache_active();
- }; //kept for back compatibility
- bool is_statecache_active() const {
- return are_prob_and_state_cache_active();
- }; //kept for back compatibility
- bool are_lmtcaches_active() const;
- bool are_caches_active() const;
-
- void reset_mmap();
-
- //set the inverted flag to load ngrams in an inverted order
- //this choice is disregarded if a binary LM is loaded,
- //because the info is stored into the header
- inline bool is_inverted(const bool flag) {
- return isInverted=flag;
- }
- inline bool is_inverted() const {
- return isInverted;
- }
-
- void configure(int n,bool quantized);
-
- //set penalty for OOV words
- inline double getlogOOVpenalty() const {
- return logOOVpenalty;
- }
-
- inline double setlogOOVpenalty(int dub) {
- MY_ASSERT(dub > dict->size());
- dictionary_upperbound = dub;
- return logOOVpenalty=log((double)(dictionary_upperbound - dict->size()))/M_LN10;
- }
-
- inline double setlogOOVpenalty(double oovp) {
- return logOOVpenalty=oovp;
- }
-
- virtual int maxlevel() const {
- return maxlev;
- };
- inline bool isQuantized() const {
- return isQtable;
- }
-
-
- void savetxt(const char *filename);
- void savebin(const char *filename);
-
- void appendbin_level(int level, fstream &out, int mmap);
- void appendbin_level_nommap(int level, fstream &out);
- void appendbin_level_mmap(int level, fstream &out);
-
- void savebin_level(int level, const char* filename, int mmap);
- void savebin_level_nommap(int level, const char* filename);
- void savebin_level_mmap(int level, const char* filename);
- void savebin_dict(std::fstream& out);
-
- void compact_all_levels(const char* filename);
- void compact_single_level(int level, const char* filename);
-
- void concatenate_all_levels(const char* fromfilename, const char* tofilename);
- void concatenate_single_level(int level, const char* fromfilename, const char* tofilename);
-
- void remove_all_levels(const char* filename);
- void remove_single_level(int level, const char* filename);
-
- void print_table_stat();
- void print_table_stat(int level);
-
- void dumplm(std::fstream& out,ngram ng, int ilev, int elev, table_entry_pos_t ipos,table_entry_pos_t epos);
-
-
- void delete_level(int level, const char* outfilename, int mmap);
- void delete_level_nommap(int level);
- void delete_level_mmap(int level, const char* filename);
-
- void resize_level(int level, const char* outfilename, int mmap);
- void resize_level_nommap(int level);
- void resize_level_mmap(int level, const char* filename);
-
- inline void update_offset(int level, table_entry_pos_t value) { tb_offset[level]=value; };
-
-
- virtual void load(const std::string &filename, int mmap=0);
- virtual void load(std::istream& inp,const char* filename=NULL,const char* outfilename=NULL,int mmap=0);
-
- void load_centers(std::istream& inp,int l);
-
- void expand_level(int level, table_entry_pos_t size, const char* outfilename, int mmap);
- void expand_level_nommap(int level, table_entry_pos_t size);
- void expand_level_mmap(int level, table_entry_pos_t size, const char* outfilename);
-
- void cpsublm(lmtable* sublmt, dictionary* subdict,bool keepunigr=true);
-
- int reload(std::set<string> words);
-
- void filter(const char* /* unused parameter: lmfile */) {};
-
- virtual double lprob(ngram ng){
- return lprob(ng, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
- }
- virtual double lprob(ngram ng, double* bow, int* bol, char** maxsuffptr, unsigned int* statesize, bool* extendible){
- return lprob(ng, bow, bol, NULL, maxsuffptr, statesize, extendible, NULL);
- }
- virtual double lprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible){
- return lprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, NULL);
- }
-
- virtual double lprob(ngram ng, double* bow, int* bol, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow){
- return lprob(ng, bow, bol, NULL, maxsuffptr, statesize, extendible, lastbow);
- }
- virtual double lprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow);
-
- virtual double clprob(ngram ng, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL);
- virtual double clprob(int* ng, int ngsize, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL);
-
-
- void *search(int lev,table_entry_pos_t offs,table_entry_pos_t n,int sz,int *w, LMT_ACTION action,char **found=(char **)NULL);
-
- int mybsearch(char *ar, table_entry_pos_t n, int size, char *key, table_entry_pos_t *idx);
-
-
- int add(ngram& ng, float prob,float bow);
- //template<typename TA, typename TB> int add(ngram& ng, TA prob,TB bow);
-
- int addwithoffset(ngram& ng, float prob,float bow);
- // template<typename TA, typename TB> int addwithoffset(ngram& ng, TA prob,TB bow);
-
- void checkbounds(int level);
-
- virtual inline int get(ngram& ng) {
- return get(ng,ng.size,ng.size);
- }
- virtual int get(ngram& ng,int n,int lev);
-
- int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev);
-
- virtual const char *maxsuffptr(ngram ong, unsigned int* size=NULL);
- virtual const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);
- virtual const char *cmaxsuffptr(int* codes, int sz, unsigned int* size=NULL);
- virtual ngram_state_t maxsuffidx(ngram ong, unsigned int* size=NULL);
- virtual ngram_state_t cmaxsuffidx(ngram ong, unsigned int* size=NULL);
- virtual ngram_state_t cmaxsuffidx(int* codes, int sz, unsigned int* size=NULL);
-
- inline void putmem(char* ptr,int value,int offs,int size) {
- MY_ASSERT(ptr!=NULL);
- for (int i=0; i<size; i++)
- ptr[offs+i]=(value >> (8 * i)) & 0xff;
- };
-
- inline void getmem(char* ptr,int* value,int offs,int size) {
- MY_ASSERT(ptr!=NULL);
- *value=ptr[offs] & 0xff;
- for (int i=1; i<size; i++){
- *value= *value | ( ( ptr[offs+i] & 0xff ) << (8 *i));
+
+ dictionary *dict; // dictionary (words - macro tags)
+
+ lmtable(float nlf=0.0, float dlfi=0.0);
+
+ virtual ~lmtable();
+
+ table_entry_pos_t wdprune(float *thr, int aflag=0);
+ table_entry_pos_t wdprune(float *thr, int aflag, ngram ng, int ilev, int elev, table_entry_pos_t ipos, table_entry_pos_t epos, double lk=0, double bo=0, double *ts=0, double *tbs=0);
+ double lprobx(ngram ong, double *lkp=0, double *bop=0, int *bol=0);
+
+ table_entry_pos_t ngcnt(table_entry_pos_t *cnt);
+ table_entry_pos_t ngcnt(table_entry_pos_t *cnt, ngram ng, int l, table_entry_pos_t ipos, table_entry_pos_t epos);
+ int pscale(int lev, table_entry_pos_t ipos, table_entry_pos_t epos, double s);
+
+ void init_prob_and_state_cache();
+ void init_probcache() {
+ init_prob_and_state_cache();
+ }; //kept for back compatibility
+ void init_statecache() {}; //kept for back compatibility
+ void init_lmtcaches();
+ // void init_lmtcaches(int uptolev);
+ void init_caches(int uptolev);
+
+ void used_prob_and_state_cache() const;
+ void used_lmtcaches() const;
+ void used_caches() const;
+
+
+ void delete_prob_and_state_cache();
+ void delete_probcache() {
+ delete_prob_and_state_cache();
+ }; //kept for back compatibility
+ void delete_statecache() {}; //kept for back compatibility
+ void delete_lmtcaches();
+ void delete_caches();
+
+ void stat_prob_and_state_cache();
+ void stat_lmtcaches();
+ void stat_caches();
+
+ void check_prob_and_state_cache_levels() const;
+ void check_probcache_levels() const {
+ check_prob_and_state_cache_levels();
+ }; //kept for back compatibility
+ void check_statecache_levels() const{}; //kept for back compatibility
+ void check_lmtcaches_levels() const;
+ void check_caches_levels() const;
+
+ void reset_prob_and_state_cache();
+ void reset_probcache() {
+ reset_prob_and_state_cache();
+ }; //kept for back compatibility
+ void reset_statecache() {}; //kept for back compatibility
+ void reset_lmtcaches();
+ void reset_caches();
+
+
+ bool are_prob_and_state_cache_active() const;
+ bool is_probcache_active() const {
+ return are_prob_and_state_cache_active();
+ }; //kept for back compatibility
+ bool is_statecache_active() const {
+ return are_prob_and_state_cache_active();
+ }; //kept for back compatibility
+ bool are_lmtcaches_active() const;
+ bool are_caches_active() const;
+
+ void reset_mmap();
+
+ //set the inverted flag to load ngrams in an inverted order
+ //this choice is disregarded if a binary LM is loaded,
+ //because the info is stored into the header
+ inline bool is_inverted(const bool flag) {
+ return isInverted=flag;
}
- };
-
- template<typename T>
- inline void putmem(char* ptr,T value,int offs) {
- MY_ASSERT(ptr!=NULL);
- memcpy(ptr+offs, &value, sizeof(T));
- };
-
- template<typename T>
- inline void getmem(char* ptr,T* value,int offs) {
- MY_ASSERT(ptr!=NULL);
- memcpy((void*)value, ptr+offs, sizeof(T));
- };
-
-
- int nodesize(LMT_TYPE ndt) {
- switch (ndt) {
- case INTERNAL:
- return LMTCODESIZE + PROBSIZE + PROBSIZE + BOUNDSIZE;
- case QINTERNAL:
- return LMTCODESIZE + QPROBSIZE + QPROBSIZE + BOUNDSIZE;
- case LEAF:
- return LMTCODESIZE + PROBSIZE;
- case QLEAF:
- return LMTCODESIZE + QPROBSIZE;
- default:
- MY_ASSERT(0);
- return 0;
+ inline bool is_inverted() const {
+ return isInverted;
}
- }
-
- inline int word(node nd,int value=-1) {
- int offset=0;
- if (value==-1)
- getmem(nd,&value,offset,LMTCODESIZE);
- else
- putmem(nd,value,offset,LMTCODESIZE);
+ void configure(int n,bool quantized);
- return value;
- };
-
-
- int codecmp(node a,node b) {
- register int i,result;
- for (i=(LMTCODESIZE-1); i>=0; i--) {
- result=(unsigned char)a[i]-(unsigned char)b[i];
- if(result) return result;
+ //set penalty for OOV words
+ inline double getlogOOVpenalty() const {
+ return logOOVpenalty;
}
- return 0;
- };
-
- int codediff(node a,node b) {
- return word(a)-word(b);
- };
-
-
- inline float prob(node nd,LMT_TYPE ndt) {
- int offs=LMTCODESIZE;
-
- float fv;
- unsigned char cv;
- switch (ndt) {
- case INTERNAL:
- getmem(nd,&fv,offs);
- return fv;
- case QINTERNAL:
- getmem(nd,&cv,offs);
- return (float) cv;
- case LEAF:
- getmem(nd,&fv,offs);
- return fv;
- case QLEAF:
- getmem(nd,&cv,offs);
- return (float) cv;
- default:
- MY_ASSERT(0);
- return 0;
- }
- };
-
- template<typename T>
- inline T prob(node nd, LMT_TYPE ndt, T value) {
- int offs=LMTCODESIZE;
-
- switch (ndt) {
- case INTERNAL:
- putmem(nd, value,offs);
- break;
- case QINTERNAL:
- putmem(nd,(unsigned char) value,offs);
- break;
- case LEAF:
- putmem(nd, value,offs);
- break;
- case QLEAF:
- putmem(nd,(unsigned char) value,offs);
- break;
- default:
- MY_ASSERT(0);
- return (T) 0;
+
+ inline double setlogOOVpenalty(int dub) {
+ MY_ASSERT(dub > dict->size());
+ dictionary_upperbound = dub;
+ return logOOVpenalty=log((double)(dictionary_upperbound - dict->size()))/M_LN10;
}
- return value;
- };
-
- inline float bow(node nd,LMT_TYPE ndt) {
- int offs=LMTCODESIZE+(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
-
- float fv;
- unsigned char cv;
- switch (ndt) {
- case INTERNAL:
- getmem(nd,&fv,offs);
- return fv;
- case QINTERNAL:
- getmem(nd,&cv,offs);
- return (float) cv;
- case LEAF:
- getmem(nd,&fv,offs);
- return fv;
- case QLEAF:
- getmem(nd,&cv,offs);
- return (float) cv;
- default:
- MY_ASSERT(0);
- return 0;
+ inline double setlogOOVpenalty(double oovp) {
+ return logOOVpenalty=oovp;
}
- };
-
- template<typename T>
- inline T bow(node nd,LMT_TYPE ndt, T value) {
- int offs=LMTCODESIZE+(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
-
- switch (ndt) {
- case INTERNAL:
- putmem(nd, value,offs);
- break;
- case QINTERNAL:
- putmem(nd,(unsigned char) value,offs);
- break;
- case LEAF:
- putmem(nd, value,offs);
- break;
- case QLEAF:
- putmem(nd,(unsigned char) value,offs);
- break;
- default:
- MY_ASSERT(0);
- return 0;
+
+ virtual int maxlevel() const {
+ return maxlev;
+ };
+ inline bool isQuantized() const {
+ return isQtable;
}
- return value;
- };
-
-
- inline table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, int level){ return bound(nd,ndt) - tb_offset[level+1]; }
-
- inline table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, table_entry_pos_t value, int level){ return bound(nd, ndt, value + tb_offset[level+1]); }
-
- // table_entry_pos_t bound(node nd,LMT_TYPE ndt, int level=0) {
- table_entry_pos_t bound(node nd,LMT_TYPE ndt) {
- int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
+ void savetxt(const char *filename);
+ void savebin(const char *filename);
- table_entry_pos_t value;
+ void appendbin_level(int level, fstream &out, int mmap);
+ void appendbin_level_nommap(int level, fstream &out);
+ void appendbin_level_mmap(int level, fstream &out);
- getmem(nd,&value,offs);
+ void savebin_level(int level, const char* filename, int mmap);
+ void savebin_level_nommap(int level, const char* filename);
+ void savebin_level_mmap(int level, const char* filename);
+ void savebin_dict(std::fstream& out);
- // value -= tb_offset[level+1];
+ void compact_all_levels(const char* filename);
+ void compact_single_level(int level, const char* filename);
- return value;
- };
-
-
- // table_entry_pos_t bound(node nd,LMT_TYPE ndt, table_entry_pos_t value, int level=0) {
- table_entry_pos_t bound(node nd,LMT_TYPE ndt, table_entry_pos_t value) {
+ void concatenate_all_levels(const char* fromfilename, const char* tofilename);
+ void concatenate_single_level(int level, const char* fromfilename, const char* tofilename);
- int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
+ void remove_all_levels(const char* filename);
+ void remove_single_level(int level, const char* filename);
- // value += tb_offset[level+1];
+ void print_table_stat();
+ void print_table_stat(int level);
- putmem(nd,value,offs);
+ void dumplm(std::fstream& out,ngram ng, int ilev, int elev, table_entry_pos_t ipos,table_entry_pos_t epos);
- return value;
- };
-
- //template<typename T> T boundwithoffset(node nd,LMT_TYPE ndt, T value, int level);
-
- /*
- table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, int level) {
-
- int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
-
- table_entry_pos_t value;
-
- getmem(nd,&value,offs);
- return value;
- // return value-tb_offset[level+1];
- };
- */
-
- /*
- table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, table_entry_pos_t value, int level) {
-
- int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
-
- putmem(nd,value,offs);
-
- return value;
- // return value+tb_offset[level+1];
- };
- */
-
- /*
- inline table_entry_pos_t bound(node nd,LMT_TYPE ndt) {
-
- int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
-
- table_entry_pos_t value;
-
- getmem(nd,&value,offs);
- return value;
- };
-
- template<typename T>
- inline T bound(node nd,LMT_TYPE ndt, T value) {
-
- int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
-
- putmem(nd,value,offs);
-
- return value;
- };
- */
- //returns the indexes of the successors of a node
- int succrange(node ndp,int level,table_entry_pos_t* isucc=NULL,table_entry_pos_t* esucc=NULL);
-
- void stat(int lev=0);
- void printTable(int level);
-
- virtual inline void setDict(dictionary* d) {
- if (delete_dict==true && dict) delete dict;
- dict=d;
- delete_dict=false;
- };
-
- inline dictionary* getDict() const {
- return dict;
- };
-
- inline table_entry_pos_t getCurrentSize(int l) const {
- return cursize[l];
- };
-
- inline void setOrderQuery(bool v) {
- orderQuery = v;
- }
- inline bool isOrderQuery() const {
- return orderQuery;
- }
-
- inline float GetNgramcacheLoadFactor() {
- return ngramcache_load_factor;
- }
- inline float GetDictionaryLoadFactor() {
- return ngramcache_load_factor;
- }
-
- //never allow the increment of the dictionary through this function
- inline virtual void dictionary_incflag(const bool flag) {
- UNUSED(flag);
- };
-
- inline virtual bool filter(const string sfilter, lmtable* sublmt, const string skeepunigrams) {
- std::cerr << "filtering... \n";
- dictionary *dict=new dictionary((char *)sfilter.c_str());
-
- cpsublm(sublmt, dict,(skeepunigrams=="yes"));
- delete dict;
- std::cerr << "...done\n";
- return true;
- }
-
+
+ // Level deletion: a general entry point plus _nommap/_mmap variants; declarations only.
+ // NOTE(review): the `mmap` argument presumably selects between the two variants -- confirm.
+ void delete_level(int level, const char* outfilename, int mmap);
+ void delete_level_nommap(int level);
+ void delete_level_mmap(int level, const char* filename);
+
+ // Level resizing: same three-function layout as delete_level above; declarations only.
+ void resize_level(int level, const char* outfilename, int mmap);
+ void resize_level_nommap(int level);
+ void resize_level_mmap(int level, const char* filename);
+
+ // Records `value` as the offset kept in tb_offset for `level`.
+ inline void update_offset(int level, table_entry_pos_t value)
+ {
+   tb_offset[level] = value;
+ }
+
+
+ // Loading entry points (by file name, or from an already opened stream); declarations only.
+ // `mmap` defaults to 0 in both overloads; the stream overload also accepts optional file names.
+ virtual void load(const std::string &filename, int mmap=0);
+ virtual void load(std::istream& inp,const char* filename=NULL,const char* outfilename=NULL,int mmap=0);
+
+ // NOTE(review): presumably reads quantization centers for level `l` from `inp` -- confirm.
+ void load_centers(std::istream& inp,int l);
+
+ // Level expansion: a general entry point plus _nommap/_mmap variants; declarations only.
+ void expand_level(int level, table_entry_pos_t size, const char* outfilename, int mmap);
+ void expand_level_nommap(int level, table_entry_pos_t size);
+ void expand_level_mmap(int level, table_entry_pos_t size, const char* outfilename);
+
+ // Used by filter(sfilter, sublmt, skeepunigrams) in this class to fill `sublmt`; body not visible here.
+ void cpsublm(lmtable* sublmt, dictionary* subdict,bool keepunigr=true);
+
+ // NOTE(review): takes the word set by value (copied on each call) -- confirm this is intended.
+ int reload(std::set<string> words);
+
+ // Intentional no-op overload: the argument is accepted and ignored.
+ void filter(const char* /* unused parameter: lmfile */)
+ {
+ }
+
+ // Convenience overloads that do not expose the suffix index (ngram_state_t*).
+ // Each forwards to the eight-argument lprob(), passing NULL for the suffix index
+ // and for every trailing output the caller did not supply.
+ virtual double lprob(ngram ng)
+ {
+   return lprob(ng, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+ }
+ virtual double lprob(ngram ng, double* bow)
+ {
+   return lprob(ng, bow, NULL, NULL, NULL, NULL, NULL, NULL);
+ }
+ virtual double lprob(ngram ng, double* bow, int* bol)
+ {
+   return lprob(ng, bow, bol, NULL, NULL, NULL, NULL, NULL);
+ }
+ virtual double lprob(ngram ng, double* bow, int* bol, char** maxsuffptr)
+ {
+   return lprob(ng, bow, bol, NULL, maxsuffptr, NULL, NULL, NULL);
+ }
+ virtual double lprob(ngram ng, double* bow, int* bol, char** maxsuffptr, unsigned int* statesize)
+ {
+   return lprob(ng, bow, bol, NULL, maxsuffptr, statesize, NULL, NULL);
+ }
+ virtual double lprob(ngram ng, double* bow, int* bol, char** maxsuffptr, unsigned int* statesize, bool* extendible)
+ {
+   return lprob(ng, bow, bol, NULL, maxsuffptr, statesize, extendible, NULL);
+ }
- inline virtual bool is_OOV(int code) {
- return (code == dict->oovcode());
+ // Convenience overloads that expose the suffix index (ngram_state_t*).
+ // Each forwards to the eight-argument lprob(), passing NULL for every trailing
+ // output the caller did not supply.
+ virtual double lprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx)
+ {
+   return lprob(ng, bow, bol, maxsuffidx, NULL, NULL, NULL, NULL);
+ }
+ virtual double lprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr)
+ {
+   return lprob(ng, bow, bol, maxsuffidx, maxsuffptr, NULL, NULL, NULL);
+ }
+ virtual double lprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize)
+ {
+   return lprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, NULL, NULL);
+ }
+ virtual double lprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible)
+ {
+   return lprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, NULL);
+ }
+
+
+// virtual double lprob(ngram ng, double* bow, int* bol, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow){ return lprob(ng, bow, bol, NULL, maxsuffptr, statesize, extendible, lastbow); }
+ // Full eight-argument lprob(): the target of all the shorter inline overloads in this class.
+ // Declared only; the body is not in this header.
+ virtual double lprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow);
+
+ // Same parameter list as the full lprob(), with every output pointer defaulting to NULL.
+ // NOTE(review): the leading "c" presumably stands for a cached lookup -- confirm in lmtable.cpp.
+ virtual double clprob(ngram ng, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
+// virtual double clprob(int* ng, int ngsize, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL);
+
+
+ // Table search primitive; declaration only. `found` is an optional output (defaults to NULL).
+ void *search(int lev,table_entry_pos_t offs,table_entry_pos_t n,int sz,int *w, LMT_ACTION action,char **found=(char **)NULL);
+
+ // NOTE(review): presumably a binary search over `n` records of `size` bytes starting at `ar`,
+ // reporting the position through `idx` -- confirm in lmtable.cpp.
+ int mybsearch(char *ar, table_entry_pos_t n, int size, char *key, table_entry_pos_t *idx);
+
+
+ // Insertion of an n-gram with its probability and back-off weight; declaration only.
+ int add(ngram& ng, float prob,float bow);
+ //template<typename TA, typename TB> int add(ngram& ng, TA prob,TB bow);
+
+ // NOTE(review): presumably the same as add() but using the tb_offset-adjusted bounds -- confirm.
+ int addwithoffset(ngram& ng, float prob,float bow);
+ // template<typename TA, typename TB> int addwithoffset(ngram& ng, TA prob,TB bow);
+
+ // Bounds check for one level; declaration only.
+ void checkbounds(int level);
+
+ // Single-argument lookup: forwards to get(ng, n, lev) with both n and lev
+ // set to the n-gram's own size.
+ virtual inline int get(ngram& ng)
+ {
+   const int order = ng.size;
+   return get(ng, order, order);
+ }
+ // Three-argument lookup; declared here, defined outside this header.
+ virtual int get(ngram& ng,int n,int lev);
+
+ // Successor scan; declaration only (body not visible here).
+ int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev);
+
+ // Suffix lookups returning either a pointer (…ptr) or an ngram_state_t (…idx); `size` is an
+ // optional output. Declarations only.
+ // NOTE(review): the c-prefixed variants are presumably cached versions -- confirm in lmtable.cpp.
+ virtual const char *maxsuffptr(ngram ong, unsigned int* size=NULL);
+ virtual const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);
+// virtual const char *cmaxsuffptr(int* codes, int sz, unsigned int* size=NULL);
+ virtual ngram_state_t maxsuffidx(ngram ong, unsigned int* size=NULL);
+ virtual ngram_state_t cmaxsuffidx(ngram ong, unsigned int* size=NULL);
+// virtual ngram_state_t cmaxsuffidx(int* codes, int sz, unsigned int* size=NULL);
+
+ // Stores the `size` low-order bytes of `value` at ptr[offs], ptr[offs+1], ...,
+ // least-significant byte first.
+ inline void putmem(char* ptr,int value,int offs,int size) {
+   MY_ASSERT(ptr!=NULL);
+   int shift = 0;
+   for (int k = 0; k < size; ++k) {
+     ptr[offs + k] = (value >> shift) & 0xff;
+     shift += 8;
+   }
+ }
+
+ // Rebuilds an int from `size` bytes starting at ptr[offs], least-significant byte first,
+ // and stores it in *value. The byte at ptr[offs] is always read, even when size < 1.
+ inline void getmem(char* ptr,int* value,int offs,int size) {
+   MY_ASSERT(ptr!=NULL);
+   int& out = *value;
+   out = ptr[offs] & 0xff;
+   int shift = 8;
+   for (int k = 1; k < size; ++k) {
+     out = out | ((ptr[offs + k] & 0xff) << shift);
+     shift += 8;
+   }
+ }
+
+ // Typed store: copies the sizeof(T) bytes of `value` to ptr+offs with memcpy.
+ template<typename T>
+ inline void putmem(char* ptr,T value,int offs) {
+   MY_ASSERT(ptr!=NULL);
+   char* const destination = ptr + offs;
+   memcpy(destination, &value, sizeof(T));
+ }
+
+ // Typed load: copies sizeof(T) bytes from ptr+offs into *value with memcpy.
+ template<typename T>
+ inline void getmem(char* ptr,T* value,int offs) {
+   MY_ASSERT(ptr!=NULL);
+   const char* const origin = ptr + offs;
+   memcpy((void*)value, origin, sizeof(T));
+ }
+
+
+ // Size in bytes of one node of type `ndt`:
+ //   internal nodes = word code + two probability fields + bound,
+ //   leaf nodes     = word code + one probability field,
+ // with the Q variants using QPROBSIZE instead of PROBSIZE.
+ // Any other type triggers MY_ASSERT(0) and yields 0.
+ int nodesize(LMT_TYPE ndt) {
+   if (ndt == INTERNAL)  return LMTCODESIZE + PROBSIZE + PROBSIZE + BOUNDSIZE;
+   if (ndt == QINTERNAL) return LMTCODESIZE + QPROBSIZE + QPROBSIZE + BOUNDSIZE;
+   if (ndt == LEAF)      return LMTCODESIZE + PROBSIZE;
+   if (ndt == QLEAF)     return LMTCODESIZE + QPROBSIZE;
+
+   MY_ASSERT(0);
+   return 0;
+ }
+
+ // Combined getter/setter for the word code stored in the first LMTCODESIZE bytes of `nd`.
+ // With the default value (-1) the code is read from the node and returned; any other
+ // value is written to the node and returned. -1 is therefore never written.
+ inline int word(node nd,int value=-1) {
+   const int offset = 0;
+
+   if (value != -1) {
+     putmem(nd, value, offset, LMTCODESIZE);
+     return value;
+   }
+
+   getmem(nd, &value, offset, LMTCODESIZE);
+   return value;
+ }
+
+
+ // Compares the LMTCODESIZE-byte word codes at the start of nodes `a` and `b`.
+ // Bytes are compared as unsigned values from index LMTCODESIZE-1 down to 0; the result is
+ // the difference of the first pair that differs (negative, positive), or 0 when all match.
+ // The obsolete `register` specifier was dropped: it was removed from the language in C++17.
+ int codecmp(node a,node b) {
+   for (int i=(LMTCODESIZE-1); i>=0; i--) {
+     const int result=(unsigned char)a[i]-(unsigned char)b[i];
+     if(result) return result;
+   }
+   return 0;
+ }
+
+ // Difference between the word codes of two nodes, each read through word().
+ int codediff(node a,node b) {
+   const int code_a = word(a);
+   const int code_b = word(b);
+   return code_a - code_b;
+ }
+
+
+ // Reads the probability field that follows the word code of `nd`.
+ // INTERNAL/LEAF nodes hold a float; QINTERNAL/QLEAF nodes hold a single unsigned byte,
+ // which is returned converted to float. Unknown types assert and return 0.
+ inline float prob(node nd,LMT_TYPE ndt) {
+   const int offs = LMTCODESIZE;
+
+   switch (ndt) {
+   case INTERNAL:
+   case LEAF: {
+     float stored;
+     getmem(nd, &stored, offs);
+     return stored;
+   }
+   case QINTERNAL:
+   case QLEAF: {
+     unsigned char code;
+     getmem(nd, &code, offs);
+     return (float) code;
+   }
+   default:
+     MY_ASSERT(0);
+     return 0;
+   }
+ }
+
+ // Writes the probability field that follows the word code of `nd` and returns `value`.
+ // INTERNAL/LEAF nodes receive the sizeof(T) bytes of `value` as they are;
+ // QINTERNAL/QLEAF nodes receive `value` converted to one unsigned byte.
+ // Unknown types assert and return (T) 0 without writing.
+ template<typename T>
+ inline T prob(node nd, LMT_TYPE ndt, T value) {
+   const int offs = LMTCODESIZE;
+
+   switch (ndt) {
+   case INTERNAL:
+   case LEAF:
+     putmem(nd, value, offs);
+     return value;
+   case QINTERNAL:
+   case QLEAF:
+     putmem(nd, (unsigned char) value, offs);
+     return value;
+   default:
+     MY_ASSERT(0);
+     return (T) 0;
+   }
+ }
+
+ // Reads the back-off weight field, located after the word code and one probability field.
+ // The probability width used for the offset is QPROBSIZE only for QINTERNAL, PROBSIZE otherwise.
+ // INTERNAL/LEAF read a float; QINTERNAL/QLEAF read one unsigned byte returned as float.
+ // Unknown types assert and return 0.
+ inline float bow(node nd,LMT_TYPE ndt) {
+   const int probwidth = (ndt==QINTERNAL) ? QPROBSIZE : PROBSIZE;
+   const int offs = LMTCODESIZE + probwidth;
+
+   switch (ndt) {
+   case INTERNAL:
+   case LEAF: {
+     float stored;
+     getmem(nd, &stored, offs);
+     return stored;
+   }
+   case QINTERNAL:
+   case QLEAF: {
+     unsigned char code;
+     getmem(nd, &code, offs);
+     return (float) code;
+   }
+   default:
+     MY_ASSERT(0);
+     return 0;
+   }
+ }
+
+ // Writes the back-off weight field, located after the word code and one probability field,
+ // and returns `value`. The probability width used for the offset is QPROBSIZE only for
+ // QINTERNAL, PROBSIZE otherwise. INTERNAL/LEAF store the sizeof(T) bytes of `value`;
+ // QINTERNAL/QLEAF store `value` converted to one unsigned byte.
+ // Unknown types assert and return 0 without writing.
+ template<typename T>
+ inline T bow(node nd,LMT_TYPE ndt, T value) {
+   const int probwidth = (ndt==QINTERNAL) ? QPROBSIZE : PROBSIZE;
+   const int offs = LMTCODESIZE + probwidth;
+
+   switch (ndt) {
+   case INTERNAL:
+   case LEAF:
+     putmem(nd, value, offs);
+     return value;
+   case QINTERNAL:
+   case QLEAF:
+     putmem(nd, (unsigned char) value, offs);
+     return value;
+   default:
+     MY_ASSERT(0);
+     return 0;
+   }
+ }
+
+
+ // Getter: the bound stored in `nd`, minus tb_offset[level+1].
+ inline table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, int level)
+ {
+   return bound(nd,ndt) - tb_offset[level+1];
+ }
+
+ // Setter: stores value + tb_offset[level+1] in `nd` and returns what bound() returns,
+ // i.e. the stored (offset-adjusted) value.
+ inline table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, table_entry_pos_t value, int level)
+ {
+   return bound(nd, ndt, value + tb_offset[level+1]);
+ }
+
+ // Reads the bound field of `nd`, located after the word code and two probability fields
+ // (QPROBSIZE wide for QINTERNAL, PROBSIZE wide otherwise). The value is returned exactly
+ // as stored; no tb_offset correction is applied here.
+ table_entry_pos_t bound(node nd,LMT_TYPE ndt) {
+   const int probwidth = (ndt==QINTERNAL) ? QPROBSIZE : PROBSIZE;
+   const int offs = LMTCODESIZE + 2*probwidth;
+
+   table_entry_pos_t stored;
+   getmem(nd, &stored, offs);
+   return stored;
+ }
+
+
+ // Writes `value` into the bound field of `nd`, located after the word code and two
+ // probability fields (QPROBSIZE wide for QINTERNAL, PROBSIZE wide otherwise), and returns it.
+ // The value is stored exactly as given; no tb_offset correction is applied here.
+ table_entry_pos_t bound(node nd,LMT_TYPE ndt, table_entry_pos_t value) {
+   const int probwidth = (ndt==QINTERNAL) ? QPROBSIZE : PROBSIZE;
+   const int offs = LMTCODESIZE + 2*probwidth;
+
+   putmem(nd, value, offs);
+   return value;
+ }
+
+ //template<typename T> T boundwithoffset(node nd,LMT_TYPE ndt, T value, int level);
+
+ /*
+ table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, int level) {
+
+ int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
+
+ table_entry_pos_t value;
+
+ getmem(nd,&value,offs);
+ return value;
+ // return value-tb_offset[level+1];
+ };
+ */
+
+ /*
+ table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, table_entry_pos_t value, int level) {
+
+ int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
+
+ putmem(nd,value,offs);
+
+ return value;
+ // return value+tb_offset[level+1];
+ };
+ */
+
+ /*
+ inline table_entry_pos_t bound(node nd,LMT_TYPE ndt) {
+
+ int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
+
+ table_entry_pos_t value;
+
+ getmem(nd,&value,offs);
+ return value;
+ };
+
+ template<typename T>
+ inline T bound(node nd,LMT_TYPE ndt, T value) {
+
+ int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
+
+ putmem(nd,value,offs);
+
+ return value;
+ };
+ */
+ // Returns the indexes of the successors of a node through the optional outputs
+ // `isucc` and `esucc` (both default to NULL). Declaration only.
+ // NOTE(review): the meaning of the int return value is not visible here -- confirm.
+ int succrange(node ndp,int level,table_entry_pos_t* isucc=NULL,table_entry_pos_t* esucc=NULL);
+
+ // Statistics / table printing helpers; declarations only.
+ void stat(int lev=0);
+ void printTable(int level);
+
+ // Installs `d` as the table's dictionary. If the table owned its previous dictionary
+ // (delete_dict set) that one is freed first. After the call delete_dict is false, so the
+ // table will not free `d` through this flag.
+ virtual inline void setDict(dictionary* d) {
+   // Skip the delete when the caller passes back the dictionary already installed:
+   // freeing it and then storing the same pointer would leave `dict` dangling.
+   if (delete_dict==true && dict && dict!=d) delete dict;
+   dict=d;
+   delete_dict=false;
+ }
+
+ // Accessor for the dictionary pointer currently held by the table.
+ inline dictionary* getDict() const
+ {
+   return dict;
+ }
+
+ // Returns cursize[l], the entry of the cursize array for index `l` (no range check).
+ inline table_entry_pos_t getCurrentSize(int l) const
+ {
+   return cursize[l];
+ }
+
+ // Sets the orderQuery flag.
+ inline void setOrderQuery(bool v)
+ {
+   orderQuery = v;
+ }
+ // Reports the current orderQuery flag.
+ inline bool isOrderQuery() const
+ {
+   return orderQuery;
+ }
+
+ // Returns the load factor stored in ngramcache_load_factor.
+ inline float GetNgramcacheLoadFactor()
+ {
+   return ngramcache_load_factor;
+ }
+ // Returns the load factor configured for the dictionary.
+ // Previously this returned ngramcache_load_factor (copy-paste from the getter above),
+ // so callers always received the n-gram cache factor instead.
+ inline float GetDictionaryLoadFactor() {
+   return dictionary_load_factor;
+ }
+
+ // Deliberately does nothing: the dictionary increment flag must never be changed
+ // through this function, so the argument is only marked as unused.
+ inline virtual void dictionary_incflag(const bool flag)
+ {
+   UNUSED(flag);
+ }
+
+ // Fills `sublmt` with the part of this table selected by a filtering dictionary.
+ //   sfilter       - passed to the dictionary constructor (presumably a word-list file name).
+ //   sublmt        - destination table, handed to cpsublm().
+ //   skeepunigrams - unigrams are kept only when this equals "yes".
+ // Always returns true. Progress messages go to std::cerr.
+ inline virtual bool filter(const string sfilter, lmtable* sublmt, const string skeepunigrams) {
+   std::cerr << "filtering... \n";
+   // Automatic storage instead of new/delete: the dictionary is released even if cpsublm()
+   // throws, and the distinct name no longer shadows the class member `dict`.
+   dictionary filter_dict(const_cast<char *>(sfilter.c_str()));
+
+   cpsublm(sublmt, &filter_dict, (skeepunigrams=="yes"));
+   std::cerr << "...done\n";
+   return true;
+ }
+
+
+ // True when `code` equals the out-of-vocabulary code reported by the dictionary.
+ inline virtual bool is_OOV(int code)
+ {
+   const bool matches_oov = (code == dict->oovcode());
+   return matches_oov;
+ }
+
};
-};
-
}//namespace irstlm
#endif
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list