[irstlm] 14/126: Added draft version of context-dependent Language model
Giulio Paci
giuliopaci-guest at moszumanska.debian.org
Tue May 17 07:46:40 UTC 2016
This is an automated email from the git hooks/post-receive script.
giuliopaci-guest pushed a commit to annotated tag adaptiveLM.v0.1
in repository irstlm.
commit 390a87b720e96f7e24ce943a7ece15cf1a88e759
Author: Nicola Bertoldi <bertoldi at fbk.eu>
Date: Tue Jul 21 11:27:42 2015 +0200
Added draft version of context-dependent Language model
---
src/lmContextDependent.cpp | 152 +++++++++++++++++++++++++++++++++++++++++++++
src/lmContextDependent.h | 141 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 293 insertions(+)
diff --git a/src/lmContextDependent.cpp b/src/lmContextDependent.cpp
new file mode 100644
index 0000000..d4004a4
--- /dev/null
+++ b/src/lmContextDependent.cpp
@@ -0,0 +1,152 @@
+// $Id: lmContextDependent.cpp 3686 2010-10-15 11:55:32Z bertoldi $
+
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ ******************************************************************************/
+#include <stdio.h>
+#include <cstdlib>
+#include <stdlib.h>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include "lmContainer.h"
+#include "lmContextDependent.h"
+#include "util.h"
+
+using namespace std;
+
+inline void error(const char* message)
+{
+ std::cerr << message << "\n";
+ throw std::runtime_error(message);
+}
+
+namespace irstlm {
+// Build an empty context-dependent LM; no model is loaded until load() is called.
+//
+// nlf and dlf are stored as the n-gram cache and dictionary load factors;
+// load() hands them to lmContainer::CreateLanguageModel().
+//
+// Every data member receives a defined value here. In particular `dict`
+// must start as NULL because setDict() tests it before deleting it, and
+// `m_isinverted` must be defined because is_inverted() returns it.
+lmContextDependent::lmContextDependent(float nlf, float dlf)
+{
+  ngramcache_load_factor = nlf;
+  dictionary_load_factor = dlf;
+
+  // nothing is owned or referenced until load() runs
+  m_lm=NULL;
+  m_topicmodel=NULL;
+  dict=NULL;
+
+  m_lm_weight=0.0;
+  m_topicmodel_weight=0.0;
+
+  order=0;
+  memmap=0;
+  dictionary_upperbound=0;
+  logOOVpenalty=0.0;
+  isInverted=false;
+  m_isinverted=false;
+}
+
+// Release the two sub-models held by this wrapper.
+lmContextDependent::~lmContextDependent()
+{
+  // deleting a null pointer is a no-op, so no explicit guard is required
+  delete m_lm;
+  delete m_topicmodel;
+}
+
+// Load the model described by the configuration file `filename`.
+//
+// The file is consumed one line at a time:
+//   line 1: header, whose first token is LMCONTEXTDEPENDENT (or lower case)
+//   line 2: weight_of_ngram_LM filename_of_LM [inverted]
+//   line 3: weight_of_topic_model filename_of_topic_model
+//
+// `mmap` is stored in the `memmap` member and forwarded to the load() of
+// the n-gram LM. The topic model itself is not instantiated yet: only its
+// weight and file name are recorded.
+//
+// An unreadable or malformed configuration is reported through error(),
+// which throws std::runtime_error.
+void lmContextDependent::load(const std::string &filename,int mmap)
+{
+  static const char* const format_error =
+    "ERROR: wrong header format of configuration file\ncorrect format: LMCONTEXTDEPENDENT\nweight_of_ngram_LM filename_of_LM\nweight_of_topic_model filename_of_topic_model";
+
+  VERBOSE(2,"lmContextDependent::load(const std::string &filename,int memmap)" << std::endl);
+  VERBOSE(2," filename:|" << filename << "|" << std::endl);
+
+  dictionary_upperbound=1000000;
+  memmap=mmap; // update the member (a local of the same name used to shadow it)
+
+  //get info from the configuration file
+  fstream inp(filename.c_str(),ios::in|ios::binary);
+  if (!inp.is_open()) {
+    error(("ERROR: cannot open configuration file " + filename).c_str());
+  }
+
+  char line[MAX_LINE];
+  const char* words[LMCONFIGURE_MAX_TOKEN];
+  int tokenN;
+
+  //reading the header
+  inp.getline(line,MAX_LINE,'\n');
+  tokenN = parseWords(line,words,LMCONFIGURE_MAX_TOKEN);
+
+  // NOTE(review): exactly two tokens are required on the header line, while
+  // the error message shows the bare keyword only -- confirm the intended format.
+  if (tokenN != 2 || ((strcmp(words[0],"LMCONTEXTDEPENDENT") != 0) && (strcmp(words[0],"lmcontextdependent")!=0))) {
+    error(format_error);
+  }
+
+  //reading ngram-based LM
+  // the limit passed to getline must match the size of `line`
+  inp.getline(line,MAX_LINE,'\n');
+  tokenN = parseWords(line,words,LMCONFIGURE_MAX_TOKEN);
+
+  if (tokenN < 2 || tokenN > 3) {
+    error(format_error);
+  }
+
+  //check whether the (textual) LM has to be loaded as inverted
+  m_isinverted = (tokenN == 3 && strcmp(words[2],"inverted") == 0);
+  VERBOSE(2,"m_isinverted:" << m_isinverted << endl);
+
+  // the weight member is a double: no intermediate truncation to float
+  m_lm_weight = atof(words[0]);
+  m_lm_file = words[1];
+
+  //checking the language model type
+  m_lm=lmContainer::CreateLanguageModel(words[1],ngramcache_load_factor,dictionary_load_factor);
+  if (m_lm == NULL) {
+    error(("ERROR: cannot create the language model for " + m_lm_file).c_str());
+  }
+
+  //let know that table has inverted n-grams
+  m_lm->is_inverted(m_isinverted);
+
+  m_lm->setMaxLoadedLevel(requiredMaxlev);
+
+  // words[1] still points into `line`, which is not overwritten before this call
+  m_lm->load(words[1], memmap);
+
+  // maxlevel() of this wrapper returns maxlev, so mirror the level of the loaded LM
+  maxlev=m_lm->maxlevel();
+
+  // the dictionary is the one held by the n-gram LM
+  dict=m_lm->getDict();
+  getDict()->genoovcode();
+
+  m_lm->init_caches(m_lm->maxlevel());
+
+  //reading bigram-based topic model
+  inp.getline(line,MAX_LINE,'\n');
+  tokenN = parseWords(line,words,LMCONFIGURE_MAX_TOKEN);
+
+  if (tokenN < 2 || tokenN > 3) {
+    error(format_error);
+  }
+
+  //recording weight and file name of the topic model
+  m_topicmodel_weight = atof(words[0]);
+  m_topicmodel_file = words[1];
+  //m_topic_model = new xxxxxxxxxxxxxxxx
+
+  inp.close();
+}
+
+// Score the n-gram made of the `sz` word codes stored in `codes`.
+//
+// The value returned by m_lm->clprob() for that n-gram is combined with the
+// topic-model score as
+//   m_lm_weight * lm_prob + m_topicmodel_weight * topic_prob
+// and the combination is returned. The topic model is not available yet, so
+// topic_prob is fixed to 0.0 and `topic_weights` is currently not consulted.
+//
+// bow, bol, maxsuffptr, statesize and extendible are forwarded unchanged to
+// m_lm->clprob().
+double lmContextDependent::lprob(int* codes, int sz, topic_map& topic_weights, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible)
+{
+  (void) topic_weights; // unused until the topic model is plugged in
+
+  //create the actual ngram
+  ngram ong(dict);
+  ong.pushc(codes,sz);
+  MY_ASSERT (ong.size == sz);
+
+  const double lm_prob = m_lm->clprob(ong, bow, bol, maxsuffptr, statesize, extendible);
+  const double topic_prob = 0.0; // to_CHECK
+
+  // Return the weighted combination. Calling clprob(ong, ...) of this class
+  // here would select the unsupported ngram overload, which only asserts.
+  return m_lm_weight * lm_prob + m_topicmodel_weight * topic_prob;
+}
+
+// Derive the OOV penalty from the dictionary upper bound `dub`.
+// The bound is handed to the n-gram LM, whose resulting penalty is read
+// back; its logarithm is stored in logOOVpenalty and returned.
+// NOTE(review): the value read back comes from getlogOOVpenalty(), whose
+// name suggests it is already a logarithm -- confirm that taking log() of
+// it again is intended.
+double lmContextDependent::setlogOOVpenalty(int dub)
+{
+  MY_ASSERT(dub > dict->size());
+
+  //set OOV Penalty of the n-gram LM by means of DUB, then fetch it
+  m_lm->setlogOOVpenalty(dub);
+  const double inner_penalty = m_lm->getlogOOVpenalty();
+
+  return logOOVpenalty = log(inner_penalty);
+}
+}//namespace irstlm
diff --git a/src/lmContextDependent.h b/src/lmContextDependent.h
new file mode 100644
index 0000000..418fe70
--- /dev/null
+++ b/src/lmContextDependent.h
@@ -0,0 +1,141 @@
+// $Id: lmContextDependent.h 3686 2010-10-15 11:55:32Z bertoldi $
+
+/******************************************************************************
+IrstLM: IRST Language Model Toolkit
+Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+
+#ifndef MF_LMCONTEXTDEPENDENT_H
+#define MF_LMCONTEXTDEPENDENT_H
+
+#include <stdio.h>
+#include <cstdlib>
+#include <stdlib.h>
+#include <string>
+#include <math.h>
+#include <map>
+#include "util.h"
+#include "dictionary.h"
+#include "n_gram.h"
+#include "lmContainer.h"
+
+typedef std::map< std::string, float > topic_map;
+
+namespace irstlm {
+/*
+Context-dependent LM
+Wrapper LM which combines a standard ngram-based word-based LM
+and a bigram-based topic model
+*/
+
+#define LMCONFIGURE_MAX_TOKEN 3
+
+// Wrapper LM combining one n-gram LM (m_lm) with a topic model
+// (m_topicmodel, still a placeholder). Scores are obtained through the
+// clprob()/lprob() overloads that take a topic_map; the overloads without
+// a topic_map are not supported by this LM type.
+class lmContextDependent: public lmContainer
+{
+  static const bool debug=true;
+  int order;
+  int dictionary_upperbound; //set by user
+  double logOOVpenalty; //penalty for OOV words (default 0)
+  bool isInverted; //written by is_inverted(flag)
+  int memmap; //level from which n-grams are accessed via mmap
+
+  lmContainer* m_lm; //n-gram LM, created by load()
+  std::string m_lm_file;
+  bool m_isinverted; //written by load(), returned by is_inverted()
+
+//  TopicModel* m_topicmodel;
+  lmContainer* m_topicmodel; //to remove when TopicModel is ready
+  double m_lm_weight;
+
+  double m_topicmodel_weight;
+  std::string m_topicmodel_file;
+
+  float ngramcache_load_factor;
+  float dictionary_load_factor;
+
+  dictionary *dict; // dictionary in use; load() takes it from m_lm->getDict()
+
+public:
+
+  // nlf: n-gram cache load factor; dlf: dictionary load factor
+  lmContextDependent(float nlf=0.0, float dlf=0.0);
+  virtual ~lmContextDependent();
+
+  // Read the configuration file `filename` and load the n-gram LM it names.
+  void load(const std::string &filename,int mmap=0);
+
+  // Not supported by this LM type: logs a message and asserts.
+  // The trailing return keeps the function well-defined when assert() is
+  // compiled out (a non-void function must not run off its end).
+  virtual double clprob(ngram ng, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL){
+    VERBOSE(0, "This LM type (lmContextDependent) does not support this function");
+    assert(false);
+    return 0.0;
+  }
+
+  // Not supported by this LM type: logs a message and asserts.
+  // The trailing return keeps the function well-defined when assert() is
+  // compiled out (a non-void function must not run off its end).
+  virtual double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL){
+    VERBOSE(0, "This LM type (lmContextDependent) does not support this function");
+    assert(false);
+    return 0.0;
+  }
+
+  // Score the `ngsize` word codes in `ng`; simply forwards to lprob().
+  virtual double clprob(int* ng, int ngsize, topic_map& topic_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL){
+    return lprob(ng, ngsize, topic_weights, bow, bol, maxsuffptr, statesize, extendible);
+  }
+  virtual double lprob(int* ng, int ngsize, topic_map& topic_weights, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
+
+  int maxlevel() const {
+    return maxlev;
+  }
+
+  // Replace the dictionary pointer, deleting the previous one if any.
+  // NOTE(review): load() stores in `dict` the pointer returned by
+  // m_lm->getDict(); deleting it here may free an object m_lm still
+  // references -- confirm who owns the dictionary.
+  virtual inline void setDict(dictionary* d) {
+    if (dict) delete dict;
+    dict=d;
+  }
+
+  virtual inline dictionary* getDict() const {
+    return dict;
+  }
+
+  //get penalty for OOV words
+  virtual inline double getlogOOVpenalty() const {
+    return logOOVpenalty;
+  }
+
+  //set penalty for OOV words from the dictionary upper bound
+  virtual double setlogOOVpenalty(int dub);
+
+  //set penalty for OOV words directly
+  double inline setlogOOVpenalty(double oovp) {
+    return logOOVpenalty=oovp;
+  }
+
+//set the inverted flag
+//NOTE(review): this writes isInverted, while is_inverted() below reads
+//m_isinverted -- confirm whether the two flags are meant to differ
+  inline bool is_inverted(const bool flag) {
+    return isInverted = flag;
+  }
+
+//returns the inverted flag read by load() from the configuration file
+  inline bool is_inverted() {
+    return m_isinverted;
+  }
+
+  inline virtual void dictionary_incflag(const bool flag) {
+    dict->incflag(flag);
+  }
+
+  inline virtual bool is_OOV(int code) { //returns what the n-gram LM reports for this code
+    return m_lm->is_OOV(code);
+  }
+};
+}//namespace irstlm
+
+#endif
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/irstlm.git
More information about the debian-science-commits
mailing list